From 182d919b84902eece162c63ed3d476c8016b4197 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Feb 2015 23:47:31 +0000
Subject: FS-Cache: Count culled objects and objects rejected due to lack of
 space

Count the number of objects that get culled by the cache backend and the
number of objects that the cache backend declines to instantiate due to lack
of space in the cache.

These numbers are made available through /proc/fs/fscache/stats

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 include/linux/fscache-cache.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 771484993ca7..c9dafdaf3347 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -371,6 +371,7 @@ struct fscache_object {
 #define FSCACHE_OBJECT_IS_LOOKED_UP	4	/* T if object has been looked up */
 #define FSCACHE_OBJECT_IS_AVAILABLE	5	/* T if object has become active */
 #define FSCACHE_OBJECT_RETIRED		6	/* T if object was retired on relinquishment */
+#define FSCACHE_OBJECT_KILLED_BY_CACHE	7	/* T if object was killed by the cache */
 
 	struct list_head	cache_link;	/* link in cache->object_list */
 	struct hlist_node	cookie_link;	/* link in cookie->backing_objects */
@@ -551,4 +552,15 @@ extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
 					       const void *data,
 					       uint16_t datalen);
 
+extern void fscache_object_retrying_stale(struct fscache_object *object);
+
+enum fscache_why_object_killed {
+	FSCACHE_OBJECT_IS_STALE,
+	FSCACHE_OBJECT_NO_SPACE,
+	FSCACHE_OBJECT_WAS_RETIRED,
+	FSCACHE_OBJECT_WAS_CULLED,
+};
+extern void fscache_object_mark_killed(struct fscache_object *object,
+				       enum fscache_why_object_killed why);
+
 #endif /* _LINUX_FSCACHE_CACHE_H */
-- 
cgit v1.2.3


From 30ceec6284129662efc3a1e7675b2bd857a046fe Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Feb 2015 10:05:27 +0000
Subject: FS-Cache: When submitting an op, cancel it if the target object is
 dying

When submitting an operation, prefer to cancel the operation immediately
rather than queuing it for later processing if the object is marked as dying
(ie. the object state machine has reached the KILL_OBJECT state).

Whilst we're at it, change the series of related test_bit() calls into a
READ_ONCE() and bitwise-AND operators to reduce the number of load
instructions (test_bit() has a volatile address).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/operation.c        | 47 +++++++++++++++++++++++++++----------------
 include/linux/fscache-cache.h |  9 +++++++--
 2 files changed, 37 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 68ead8482617..dec6defe3be3 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -120,6 +120,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
 int fscache_submit_exclusive_op(struct fscache_object *object,
 				struct fscache_operation *op)
 {
+	const struct fscache_state *ostate;
+	unsigned long flags;
 	int ret;
 
 	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
@@ -132,8 +134,19 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
 	ASSERT(list_empty(&op->pend_link));
 
+	ostate = object->state;
+	smp_rmb();
+
 	op->state = FSCACHE_OP_ST_PENDING;
-	if (fscache_object_is_active(object)) {
+	flags = READ_ONCE(object->flags);
+	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
+		fscache_stat(&fscache_n_op_rejected);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -EIO;
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
 		op->object = object;
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
@@ -155,7 +168,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		/* need to issue a new write op after this */
 		clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
 		ret = 0;
-	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
 		op->object = object;
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
@@ -164,11 +177,9 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
 	} else {
-		/* If we're in any other state, there must have been an I/O
-		 * error of some nature.
-		 */
-		ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
-		ret = -EIO;
+		fscache_report_unexpected_submission(object, op, ostate);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
 	}
 
 	spin_unlock(&object->lock);
@@ -187,6 +198,7 @@ int fscache_submit_op(struct fscache_object *object,
 		      struct fscache_operation *op)
 {
 	const struct fscache_state *ostate;
+	unsigned long flags;
 	int ret;
 
 	_enter("{OBJ%x OP%x},{%u}",
@@ -204,7 +216,15 @@ int fscache_submit_op(struct fscache_object *object,
 	smp_rmb();
 
 	op->state = FSCACHE_OP_ST_PENDING;
-	if (fscache_object_is_active(object)) {
+	flags = READ_ONCE(object->flags);
+	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
+		fscache_stat(&fscache_n_op_rejected);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -EIO;
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
 		op->object = object;
 		object->n_ops++;
 
@@ -222,25 +242,18 @@ int fscache_submit_op(struct fscache_object *object,
 			fscache_run_op(object, op);
 		}
 		ret = 0;
-	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
 		op->object = object;
 		object->n_ops++;
 		atomic_inc(&op->usage);
 		list_add_tail(&op->pend_link, &object->pending_ops);
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
-	} else if (fscache_object_is_dying(object)) {
-		fscache_stat(&fscache_n_op_rejected);
-		op->state = FSCACHE_OP_ST_CANCELLED;
-		ret = -ENOBUFS;
-	} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+	} else {
 		fscache_report_unexpected_submission(object, op, ostate);
 		ASSERT(!fscache_object_is_active(object));
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
-	} else {
-		op->state = FSCACHE_OP_ST_CANCELLED;
-		ret = -ENOBUFS;
 	}
 
 	spin_unlock(&object->lock);
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index c9dafdaf3347..2e83a141e465 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -411,17 +411,22 @@ static inline bool fscache_object_is_available(struct fscache_object *object)
 	return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
 }
 
+static inline bool fscache_cache_is_broken(struct fscache_object *object)
+{
+	return test_bit(FSCACHE_IOERROR, &object->cache->flags);
+}
+
 static inline bool fscache_object_is_active(struct fscache_object *object)
 {
 	return fscache_object_is_available(object) &&
 		fscache_object_is_live(object) &&
-		!test_bit(FSCACHE_IOERROR, &object->cache->flags);
+		!fscache_cache_is_broken(object);
 }
 
 static inline bool fscache_object_is_dead(struct fscache_object *object)
 {
 	return fscache_object_is_dying(object) &&
-		test_bit(FSCACHE_IOERROR, &object->cache->flags);
+		fscache_cache_is_broken(object);
 }
 
 /**
-- 
cgit v1.2.3


From 87021526300f1a292dd966e141e183630ac95317 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Feb 2015 10:52:51 +0000
Subject: FS-Cache: fscache_object_is_dead() has wrong logic, kill it

fscache_object_is_dead() returns true only if the object is marked dead and
the cache got an I/O error.  This should be a logical OR instead.  Since two
of the callers got split up into handling for separate subcases, expand the
other callers and kill the function.  This is probably the right thing to do
anyway since one of the subcases isn't about the object at all, but rather
about the cache.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/cookie.c           | 3 ++-
 fs/fscache/page.c             | 6 ++++--
 include/linux/fscache-cache.h | 6 ------
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 89acec742e0b..8de22164f5fb 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache,
 
 object_already_extant:
 	ret = -ENOBUFS;
-	if (fscache_object_is_dead(object)) {
+	if (fscache_object_is_dying(object) ||
+	    fscache_cache_is_broken(object)) {
 		spin_unlock(&cookie->lock);
 		goto error;
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index de33b3fccca6..d0805e31361c 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -377,11 +377,13 @@ check_if_dead:
 		_leave(" = -ENOBUFS [cancelled]");
 		return -ENOBUFS;
 	}
-	if (unlikely(fscache_object_is_dead(object))) {
-		pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state);
+	if (unlikely(fscache_object_is_dying(object) ||
+		     fscache_cache_is_broken(object))) {
+		enum fscache_operation_state state = op->state;
 		fscache_cancel_op(op, do_cancel);
 		if (stat_object_dead)
 			fscache_stat(stat_object_dead);
+		_leave(" = -ENOBUFS [obj dead %d]", state);
 		return -ENOBUFS;
 	}
 	return 0;
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 2e83a141e465..ca3d550da11e 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -423,12 +423,6 @@ static inline bool fscache_object_is_active(struct fscache_object *object)
 		!fscache_cache_is_broken(object);
 }
 
-static inline bool fscache_object_is_dead(struct fscache_object *object)
-{
-	return fscache_object_is_dying(object) &&
-		fscache_cache_is_broken(object);
-}
-
 /**
  * fscache_object_destroyed - Note destruction of an object in a cache
  * @cache: The cache from which the object came
-- 
cgit v1.2.3


From 1339ec98e32b4bc8efb6fbb71c006a465130aaba Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 25 Feb 2015 13:26:39 +0000
Subject: FS-Cache: Out of line fscache_operation_init()

Out of line fscache_operation_init() so that it can access internal FS-Cache
features, such as stats, in a later commit.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/operation.c        | 22 ++++++++++++++++++++++
 include/linux/fscache-cache.h | 24 +++---------------------
 2 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 67594a8d791a..61a6e78b85fa 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -20,6 +20,28 @@
 atomic_t fscache_op_debug_id;
 EXPORT_SYMBOL(fscache_op_debug_id);
 
+/**
+ * fscache_operation_init - Do basic initialisation of an operation
+ * @op: The operation to initialise
+ * @release: The release function to assign
+ *
+ * Do basic initialisation of an operation.  The caller must still set flags,
+ * object and processor if needed.
+ */
+void fscache_operation_init(struct fscache_operation *op,
+			    fscache_operation_processor_t processor,
+			    fscache_operation_release_t release)
+{
+	INIT_WORK(&op->work, fscache_op_work_func);
+	atomic_set(&op->usage, 1);
+	op->state = FSCACHE_OP_ST_INITIALISED;
+	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+	op->processor = processor;
+	op->release = release;
+	INIT_LIST_HEAD(&op->pend_link);
+}
+EXPORT_SYMBOL(fscache_operation_init);
+
 /**
  * fscache_enqueue_operation - Enqueue an operation for processing
  * @op: The operation to enqueue
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index ca3d550da11e..0e26d49972e3 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -119,27 +119,9 @@ extern void fscache_op_work_func(struct work_struct *work);
 extern void fscache_enqueue_operation(struct fscache_operation *);
 extern void fscache_op_complete(struct fscache_operation *, bool);
 extern void fscache_put_operation(struct fscache_operation *);
-
-/**
- * fscache_operation_init - Do basic initialisation of an operation
- * @op: The operation to initialise
- * @release: The release function to assign
- *
- * Do basic initialisation of an operation.  The caller must still set flags,
- * object and processor if needed.
- */
-static inline void fscache_operation_init(struct fscache_operation *op,
-					fscache_operation_processor_t processor,
-					fscache_operation_release_t release)
-{
-	INIT_WORK(&op->work, fscache_op_work_func);
-	atomic_set(&op->usage, 1);
-	op->state = FSCACHE_OP_ST_INITIALISED;
-	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
-	op->processor = processor;
-	op->release = release;
-	INIT_LIST_HEAD(&op->pend_link);
-}
+extern void fscache_operation_init(struct fscache_operation *,
+				   fscache_operation_processor_t,
+				   fscache_operation_release_t);
 
 /*
  * data read operation
-- 
cgit v1.2.3


From d3b97ca4a99e4e6c78f5a21c968eadf5c8ba9971 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Feb 2015 10:05:29 +0000
Subject: FS-Cache: The operation cancellation method needs calling in more
 places

Any time an incomplete operation is cancelled, the operation cancellation
function needs to be called to clean up.  This is currently being passed
directly to some of the functions that might want to call it, but not all.

Instead, pass the cancellation method pointer to the fscache_operation_init()
and have that cache it in the operation struct.  Further, plug in a dummy
cancellation handler if the caller declines to set one as this allows us to
call the function unconditionally (the extra overhead isn't worth bothering
about as we don't expect to be calling this typically).

The cancellation method must thence be called everywhere the CANCELLED state
is set.  Note that we call it *before* setting the CANCELLED state such that
the method can use the old state value to guide its operation.

fscache_do_cancel_retrieval() needs moving higher up in the sources so that
the init function can use it now.

Without this, the following oops may be seen:

	FS-Cache: Assertion failed
	FS-Cache: 3 == 0 is false
	------------[ cut here ]------------
	kernel BUG at ../fs/fscache/page.c:261!
	...
	RIP: 0010:[<ffffffffa0089c1b>]  fscache_release_retrieval_op+0x77/0x100
	 [<ffffffffa008853d>] fscache_put_operation+0x114/0x2da
	 [<ffffffffa008b8c2>] __fscache_read_or_alloc_pages+0x358/0x3b3
	 [<ffffffffa00b761f>] __nfs_readpages_from_fscache+0x59/0xbf [nfs]
	 [<ffffffffa00b06c5>] nfs_readpages+0x10c/0x185 [nfs]
	 [<ffffffff81124925>] ? alloc_pages_current+0x119/0x13e
	 [<ffffffff810ee5fd>] ? __page_cache_alloc+0xfb/0x10a
	 [<ffffffff810f87f8>] __do_page_cache_readahead+0x188/0x22c
	 [<ffffffff810f8b3a>] ondemand_readahead+0x29e/0x2af
	 [<ffffffff810f8c92>] page_cache_sync_readahead+0x38/0x3a
	 [<ffffffff810ef337>] generic_file_read_iter+0x1a2/0x55a
	 [<ffffffffa00a9dff>] ? nfs_revalidate_mapping+0xd6/0x288 [nfs]
	 [<ffffffffa00a6a23>] nfs_file_read+0x49/0x70 [nfs]
	 [<ffffffff811363be>] new_sync_read+0x78/0x9c
	 [<ffffffff81137164>] __vfs_read+0x13/0x38
	 [<ffffffff8113721e>] vfs_read+0x95/0x121
	 [<ffffffff811372f6>] SyS_read+0x4c/0x8a
	 [<ffffffff81557a52>] system_call_fastpath+0x12/0x17

The assertion is showing that the remaining number of pages (n_pages) is not 0
when the operation is being released.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/cookie.c           |  5 ++---
 fs/fscache/internal.h         |  7 ++-----
 fs/fscache/object.c           |  3 ++-
 fs/fscache/operation.c        | 31 ++++++++++++++++++++++-------
 fs/fscache/page.c             | 46 +++++++++++++++++++++----------------------
 include/linux/fscache-cache.h |  5 +++++
 6 files changed, 57 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 8de22164f5fb..d403c69bee08 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -672,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	if (!op)
 		return -ENOMEM;
 
-	fscache_operation_init(op, NULL, NULL);
+	fscache_operation_init(op, NULL, NULL, NULL);
 	op->flags = FSCACHE_OP_MYTHREAD |
 		(1 << FSCACHE_OP_WAITING) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -696,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	/* the work queue now carries its own ref on the object */
 	spin_unlock(&cookie->lock);
 
-	ret = fscache_wait_for_operation_activation(object, op,
-						    NULL, NULL, NULL);
+	ret = fscache_wait_for_operation_activation(object, op, NULL, NULL);
 	if (ret == 0) {
 		/* ask the cache to honour the operation */
 		ret = object->cache->ops->check_consistency(op);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index a63225116db6..97ec45110957 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -124,9 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
 				       struct fscache_operation *);
 extern int fscache_submit_op(struct fscache_object *,
 			     struct fscache_operation *);
-extern int fscache_cancel_op(struct fscache_operation *,
-			     void (*)(struct fscache_operation *),
-			     bool);
+extern int fscache_cancel_op(struct fscache_operation *, bool);
 extern void fscache_cancel_all_ops(struct fscache_object *);
 extern void fscache_abort_object(struct fscache_object *);
 extern void fscache_start_operations(struct fscache_object *);
@@ -139,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *);
 extern int fscache_wait_for_operation_activation(struct fscache_object *,
 						 struct fscache_operation *,
 						 atomic_t *,
-						 atomic_t *,
-						 void (*)(struct fscache_operation *));
+						 atomic_t *);
 extern void fscache_invalidate_writes(struct fscache_cookie *);
 
 /*
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 40049f7505f0..9e792e30f4db 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -961,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
+	fscache_operation_init(op, object->cache->ops->invalidate_object,
+			       NULL, NULL);
 	op->flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_EXCLUSIVE) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index c76c09730768..57d4abb68656 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -20,6 +20,10 @@
 atomic_t fscache_op_debug_id;
 EXPORT_SYMBOL(fscache_op_debug_id);
 
+static void fscache_operation_dummy_cancel(struct fscache_operation *op)
+{
+}
+
 /**
  * fscache_operation_init - Do basic initialisation of an operation
  * @op: The operation to initialise
@@ -30,6 +34,7 @@ EXPORT_SYMBOL(fscache_op_debug_id);
  */
 void fscache_operation_init(struct fscache_operation *op,
 			    fscache_operation_processor_t processor,
+			    fscache_operation_cancel_t cancel,
 			    fscache_operation_release_t release)
 {
 	INIT_WORK(&op->work, fscache_op_work_func);
@@ -37,6 +42,7 @@ void fscache_operation_init(struct fscache_operation *op,
 	op->state = FSCACHE_OP_ST_INITIALISED;
 	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
 	op->processor = processor;
+	op->cancel = cancel ?: fscache_operation_dummy_cancel;
 	op->release = release;
 	INIT_LIST_HEAD(&op->pend_link);
 	fscache_stat(&fscache_n_op_initialised);
@@ -164,9 +170,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 	flags = READ_ONCE(object->flags);
 	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
 		fscache_stat(&fscache_n_op_rejected);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -EIO;
 	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
@@ -200,10 +208,12 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
 	} else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	} else {
 		fscache_report_unexpected_submission(object, op, ostate);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	}
@@ -245,9 +255,11 @@ int fscache_submit_op(struct fscache_object *object,
 	flags = READ_ONCE(object->flags);
 	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
 		fscache_stat(&fscache_n_op_rejected);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -EIO;
 	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
@@ -276,11 +288,13 @@ int fscache_submit_op(struct fscache_object *object,
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
 	} else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	} else {
 		fscache_report_unexpected_submission(object, op, ostate);
 		ASSERT(!fscache_object_is_active(object));
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	}
@@ -335,7 +349,6 @@ void fscache_start_operations(struct fscache_object *object)
  * cancel an operation that's pending on an object
  */
 int fscache_cancel_op(struct fscache_operation *op,
-		      void (*do_cancel)(struct fscache_operation *),
 		      bool cancel_in_progress_op)
 {
 	struct fscache_object *object = op->object;
@@ -355,9 +368,9 @@ int fscache_cancel_op(struct fscache_operation *op,
 		ASSERT(!list_empty(&op->pend_link));
 		list_del_init(&op->pend_link);
 		put = true;
+
 		fscache_stat(&fscache_n_op_cancelled);
-		if (do_cancel)
-			do_cancel(op);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
 			object->n_exclusive--;
@@ -373,8 +386,7 @@ int fscache_cancel_op(struct fscache_operation *op,
 			fscache_start_operations(object);
 
 		fscache_stat(&fscache_n_op_cancelled);
-		if (do_cancel)
-			do_cancel(op);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
 			object->n_exclusive--;
@@ -408,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object)
 		list_del_init(&op->pend_link);
 
 		ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 
 		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
@@ -440,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled)
 
 	spin_lock(&object->lock);
 
-	op->state = cancelled ?
-		FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
+	if (!cancelled) {
+		op->state = FSCACHE_OP_ST_COMPLETE;
+	} else {
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+	}
 
 	if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
 		object->n_exclusive--;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 7db9752252ef..d1b23a67c031 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 		return -ENOMEM;
 	}
 
-	fscache_operation_init(op, fscache_attr_changed_op, NULL);
+	fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL);
 	op->flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_EXCLUSIVE) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -248,6 +248,17 @@ nobufs:
 }
 EXPORT_SYMBOL(__fscache_attr_changed);
 
+/*
+ * Handle cancellation of a pending retrieval op
+ */
+static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
+{
+	struct fscache_retrieval *op =
+		container_of(_op, struct fscache_retrieval, op);
+
+	atomic_set(&op->n_pages, 0);
+}
+
 /*
  * release a retrieval op reference
  */
@@ -285,7 +296,9 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 		return NULL;
 	}
 
-	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
+	fscache_operation_init(&op->op, NULL,
+			       fscache_do_cancel_retrieval,
+			       fscache_release_retrieval_op);
 	op->op.flags	= FSCACHE_OP_MYTHREAD |
 		(1UL << FSCACHE_OP_WAITING) |
 		(1UL << FSCACHE_OP_UNUSE_COOKIE);
@@ -329,25 +342,13 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
 	return 0;
 }
 
-/*
- * Handle cancellation of a pending retrieval op
- */
-static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
-{
-	struct fscache_retrieval *op =
-		container_of(_op, struct fscache_retrieval, op);
-
-	atomic_set(&op->n_pages, 0);
-}
-
 /*
  * wait for an object to become active (or dead)
  */
 int fscache_wait_for_operation_activation(struct fscache_object *object,
 					  struct fscache_operation *op,
 					  atomic_t *stat_op_waits,
-					  atomic_t *stat_object_dead,
-					  void (*do_cancel)(struct fscache_operation *))
+					  atomic_t *stat_object_dead)
 {
 	int ret;
 
@@ -359,7 +360,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
 		fscache_stat(stat_op_waits);
 	if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
 			TASK_INTERRUPTIBLE) != 0) {
-		ret = fscache_cancel_op(op, do_cancel, false);
+		ret = fscache_cancel_op(op, false);
 		if (ret == 0)
 			return -ERESTARTSYS;
 
@@ -380,7 +381,7 @@ check_if_dead:
 	if (unlikely(fscache_object_is_dying(object) ||
 		     fscache_cache_is_broken(object))) {
 		enum fscache_operation_state state = op->state;
-		fscache_cancel_op(op, do_cancel, true);
+		fscache_cancel_op(op, true);
 		if (stat_object_dead)
 			fscache_stat(stat_object_dead);
 		_leave(" = -ENOBUFS [obj dead %d]", state);
@@ -464,8 +465,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_retrieval_op_waits),
-		__fscache_stat(&fscache_n_retrievals_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_retrievals_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -595,8 +595,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_retrieval_op_waits),
-		__fscache_stat(&fscache_n_retrievals_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_retrievals_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -702,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_alloc_op_waits),
-		__fscache_stat(&fscache_n_allocs_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_allocs_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -946,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(&op->op, fscache_write_op,
+	fscache_operation_init(&op->op, fscache_write_op, NULL,
 			       fscache_release_write_op);
 	op->op.flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_WAITING) |
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 0e26d49972e3..69c6eb10d858 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -74,6 +74,7 @@ extern wait_queue_head_t fscache_cache_cleared_wq;
  */
 typedef void (*fscache_operation_release_t)(struct fscache_operation *op);
 typedef void (*fscache_operation_processor_t)(struct fscache_operation *op);
+typedef void (*fscache_operation_cancel_t)(struct fscache_operation *op);
 
 enum fscache_operation_state {
 	FSCACHE_OP_ST_BLANK,		/* Op is not yet submitted */
@@ -109,6 +110,9 @@ struct fscache_operation {
 	 *   the op in a non-pool thread */
 	fscache_operation_processor_t processor;
 
+	/* Operation cancellation cleanup (optional) */
+	fscache_operation_cancel_t cancel;
+
 	/* operation releaser */
 	fscache_operation_release_t release;
 };
@@ -121,6 +125,7 @@ extern void fscache_op_complete(struct fscache_operation *, bool);
 extern void fscache_put_operation(struct fscache_operation *);
 extern void fscache_operation_init(struct fscache_operation *,
 				   fscache_operation_processor_t,
+				   fscache_operation_cancel_t,
 				   fscache_operation_release_t);
 
 /*
-- 
cgit v1.2.3


From 4a47132ff472a0c2c5441baeb50cf97f2580bc43 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Feb 2015 10:05:29 +0000
Subject: FS-Cache: Retain the netfs context in the retrieval op earlier

Now that the retrieval operation may be disposed of by fscache_put_operation()
before we actually set the context, the retrieval-specific cleanup operation
can produce a NULL-pointer dereference when it tries to unconditionally clean
up the netfs context.

Given that it is expected that we'll get at least as far as the place where we
currently set the context pointer and it is unlikely we'll go through the
error handling paths prior to that point, retain the context right from the
point that the retrieval op is allocated.

Concomitant to this, we need to retain the cookie pointer in the retrieval op
also so that we can call the netfs to release its context in the release
method.

In addition, we might now get into fscache_release_retrieval_op() with the op
only initialised.  To this end, set the operation to DEAD only after the
release method has been called and skip the n_pages test upon cleanup if the
op is still in the INITIALISED state.

Without these changes, the following oops might be seen:

	BUG: unable to handle kernel NULL pointer dereference at 00000000000000b8
	...
	RIP: 0010:[<ffffffffa0089c98>] fscache_release_retrieval_op+0xae/0x100
	...
	Call Trace:
	 [<ffffffffa0088560>] fscache_put_operation+0x117/0x2e0
	 [<ffffffffa008b8f5>] __fscache_read_or_alloc_pages+0x351/0x3ac
	 [<ffffffffa00b761f>] __nfs_readpages_from_fscache+0x59/0xbf [nfs]
	 [<ffffffffa00b06c5>] nfs_readpages+0x10c/0x185 [nfs]
	 [<ffffffff81124925>] ? alloc_pages_current+0x119/0x13e
	 [<ffffffff810ee5fd>] ? __page_cache_alloc+0xfb/0x10a
	 [<ffffffff810f87f8>] __do_page_cache_readahead+0x188/0x22c
	 [<ffffffff810f8b3a>] ondemand_readahead+0x29e/0x2af
	 [<ffffffff810f8c92>] page_cache_sync_readahead+0x38/0x3a
	 [<ffffffff810ef337>] generic_file_read_iter+0x1a2/0x55a
	 [<ffffffffa00a9dff>] ? nfs_revalidate_mapping+0xd6/0x288 [nfs]
	 [<ffffffffa00a6a23>] nfs_file_read+0x49/0x70 [nfs]
	 [<ffffffff811363be>] new_sync_read+0x78/0x9c
	 [<ffffffff81137164>] __vfs_read+0x13/0x38
	 [<ffffffff8113721e>] vfs_read+0x95/0x121
	 [<ffffffff811372f6>] SyS_read+0x4c/0x8a
	 [<ffffffff81557a52>] system_call_fastpath+0x12/0x17

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/operation.c        |  2 +-
 fs/fscache/page.c             | 20 ++++++++++----------
 include/linux/fscache-cache.h |  1 +
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 57d4abb68656..de67745e1cd7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -492,7 +492,6 @@ void fscache_put_operation(struct fscache_operation *op)
 	ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED &&
 		    op->state != FSCACHE_OP_ST_COMPLETE,
 		    op->state, ==, FSCACHE_OP_ST_CANCELLED);
-	op->state = FSCACHE_OP_ST_DEAD;
 
 	fscache_stat(&fscache_n_op_release);
 
@@ -500,6 +499,7 @@ void fscache_put_operation(struct fscache_operation *op)
 		op->release(op);
 		op->release = NULL;
 	}
+	op->state = FSCACHE_OP_ST_DEAD;
 
 	object = op->object;
 	if (likely(object)) {
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index d1b23a67c031..483bbc613bf0 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -269,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
 
 	_enter("{OP%x}", op->op.debug_id);
 
-	ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
+	ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED,
+		    atomic_read(&op->n_pages), ==, 0);
 
 	fscache_hist(fscache_retrieval_histogram, op->start_time);
 	if (op->context)
-		fscache_put_context(op->op.object->cookie, op->context);
+		fscache_put_context(op->cookie, op->context);
 
 	_leave("");
 }
@@ -302,11 +303,18 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	op->op.flags	= FSCACHE_OP_MYTHREAD |
 		(1UL << FSCACHE_OP_WAITING) |
 		(1UL << FSCACHE_OP_UNUSE_COOKIE);
+	op->cookie	= cookie;
 	op->mapping	= mapping;
 	op->end_io_func	= end_io_func;
 	op->context	= context;
 	op->start_time	= jiffies;
 	INIT_LIST_HEAD(&op->to_do);
+
+	/* Pin the netfs read context in case we need to do the actual netfs
+	 * read because we've encountered a cache read failure.
+	 */
+	if (context)
+		fscache_get_context(op->cookie, context);
 	return op;
 }
 
@@ -456,10 +464,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	fscache_stat(&fscache_n_retrieval_ops);
 
-	/* pin the netfs read context in case we need to do the actual netfs
-	 * read because we've encountered a cache read failure */
-	fscache_get_context(object->cookie, op->context);
-
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
 	ret = fscache_wait_for_operation_activation(
@@ -586,10 +590,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 
 	fscache_stat(&fscache_n_retrieval_ops);
 
-	/* pin the netfs read context in case we need to do the actual netfs
-	 * read because we've encountered a cache read failure */
-	fscache_get_context(object->cookie, op->context);
-
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
 	ret = fscache_wait_for_operation_activation(
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 69c6eb10d858..604e1526cd00 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -133,6 +133,7 @@ extern void fscache_operation_init(struct fscache_operation *,
  */
 struct fscache_retrieval {
 	struct fscache_operation op;
+	struct fscache_cookie	*cookie;	/* The netfs cookie */
 	struct address_space	*mapping;	/* netfs pages */
 	fscache_rw_complete_t	end_io_func;	/* function to call on I/O completion */
 	void			*context;	/* netfs read context (pinned) */
-- 
cgit v1.2.3


From c8a8585431efba0faaf41167f8f7c27c48307ca6 Mon Sep 17 00:00:00 2001
From: Vianney le Clément de Saint-Marcq <vianney.leclement@essensium.com>
Date: Mon, 30 Mar 2015 10:34:58 +0200
Subject: iio: core: Introduce IIO_CHAN_INFO_CALIBEMISSIVITY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Contact-less IR temperature sensors measure the temperature of an object
by using its thermal radiation.  Surfaces with different emissivity
ratios emit different amounts of energy at the same temperature.

IIO_CHAN_INFO_CALIBEMISSIVITY allows the user to inform the sensor of the
emissivity of the object in front of it, in order to effectively measure
its temperature.

A device providing such setting is Melexis's MLX90614:
http://melexis.com/Assets/IR-sensor-thermometer-MLX90614-Datasheet-5152.aspx.

Signed-off-by: Vianney le Clément de Saint-Marcq <vianney.leclement@essensium.com>
Cc: Arnout Vandecappelle (Essensium/Mind) <arnout@mind.be>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 11 +++++++++++
 drivers/iio/industrialio-core.c         |  1 +
 include/linux/iio/iio.h                 |  1 +
 3 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 3befcb19f414..866b4ec4aab6 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1364,3 +1364,14 @@ Description:
 		hwfifo_watermak_min but not equal to any of the values in this
 		list, the driver will chose an appropriate value for the
 		hardware fifo watermark level.
+
+What:		/sys/bus/iio/devices/iio:deviceX/in_temp_calibemissivity
+What:		/sys/bus/iio/devices/iio:deviceX/in_tempX_calibemissivity
+What:		/sys/bus/iio/devices/iio:deviceX/in_temp_object_calibemissivity
+What:		/sys/bus/iio/devices/iio:deviceX/in_tempX_object_calibemissivity
+KernelVersion:	4.1
+Contact:	linux-iio@vger.kernel.org
+Description:
+		The emissivity ratio of the surface in the field of view of the
+		contactless temperature sensor.  Emissivity varies from 0 to 1,
+		with 1 being the emissivity of a black body.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 4df97f650e44..7c98bc1504e6 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -128,6 +128,7 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_CALIBWEIGHT] = "calibweight",
 	[IIO_CHAN_INFO_DEBOUNCE_COUNT] = "debounce_count",
 	[IIO_CHAN_INFO_DEBOUNCE_TIME] = "debounce_time",
+	[IIO_CHAN_INFO_CALIBEMISSIVITY] = "calibemissivity",
 };
 
 /**
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index d86b753e9b30..b1e46ae89aa7 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -43,6 +43,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_CALIBWEIGHT,
 	IIO_CHAN_INFO_DEBOUNCE_COUNT,
 	IIO_CHAN_INFO_DEBOUNCE_TIME,
+	IIO_CHAN_INFO_CALIBEMISSIVITY,
 };
 
 enum iio_shared_by {
-- 
cgit v1.2.3


From d0e83059a6c9b04f00264a74b8f6439948de4613 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 20 Apr 2015 13:39:03 +0800
Subject: crypto: rng - Convert crypto_rng to new style crypto_type

This patch converts the top-level crypto_rng to the "new" style.
It was the last algorithm type added before we switched over
to the new way of doing things exemplified by shash.

All users will automatically switch over to the new interface.

Note that this patch does not touch the low-level interface to
rng implementations.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/rng.c           | 35 +++++++++++++++++++++++------------
 include/crypto/rng.h   | 32 ++++++++++----------------------
 include/linux/crypto.h | 12 ------------
 3 files changed, 33 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/rng.c b/crypto/rng.c
index e0a25c2456de..87fa2f4933b0 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -24,11 +24,18 @@
 #include <linux/cryptouser.h>
 #include <net/netlink.h>
 
+#include "internal.h"
+
 static DEFINE_MUTEX(crypto_default_rng_lock);
 struct crypto_rng *crypto_default_rng;
 EXPORT_SYMBOL_GPL(crypto_default_rng);
 static int crypto_default_rng_refcnt;
 
+static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm)
+{
+	return container_of(tfm, struct crypto_rng, base);
+}
+
 static int rngapi_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen)
 {
 	u8 *buf = NULL;
@@ -49,13 +56,13 @@ static int rngapi_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen)
 	return err;
 }
 
-static int crypto_init_rng_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
+static int crypto_rng_init_tfm(struct crypto_tfm *tfm)
 {
+	struct crypto_rng *rng = __crypto_rng_cast(tfm);
 	struct rng_alg *alg = &tfm->__crt_alg->cra_rng;
-	struct rng_tfm *ops = &tfm->crt_rng;
 
-	ops->rng_gen_random = alg->rng_make_random;
-	ops->rng_reset = rngapi_reset;
+	rng->generate = alg->rng_make_random;
+	rng->seed = rngapi_reset;
 
 	return 0;
 }
@@ -92,22 +99,26 @@ static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "seedsize     : %u\n", alg->cra_rng.seedsize);
 }
 
-static unsigned int crypto_rng_ctxsize(struct crypto_alg *alg, u32 type,
-				       u32 mask)
-{
-	return alg->cra_ctxsize;
-}
-
 const struct crypto_type crypto_rng_type = {
-	.ctxsize = crypto_rng_ctxsize,
-	.init = crypto_init_rng_ops,
+	.extsize = crypto_alg_extsize,
+	.init_tfm = crypto_rng_init_tfm,
 #ifdef CONFIG_PROC_FS
 	.show = crypto_rng_show,
 #endif
 	.report = crypto_rng_report,
+	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
+	.maskset = CRYPTO_ALG_TYPE_MASK,
+	.type = CRYPTO_ALG_TYPE_RNG,
+	.tfmsize = offsetof(struct crypto_rng, base),
 };
 EXPORT_SYMBOL_GPL(crypto_rng_type);
 
+struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask)
+{
+	return crypto_alloc_tfm(alg_name, &crypto_rng_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_alloc_rng);
+
 int crypto_get_default_rng(void)
 {
 	struct crypto_rng *rng;
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index 6e28ea5be9f1..f13f3faca4d7 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -15,6 +15,12 @@
 
 #include <linux/crypto.h>
 
+struct crypto_rng {
+	int (*generate)(struct crypto_rng *tfm, u8 *rdata, unsigned int dlen);
+	int (*seed)(struct crypto_rng *tfm, u8 *seed, unsigned int slen);
+	struct crypto_tfm base;
+};
+
 extern struct crypto_rng *crypto_default_rng;
 
 int crypto_get_default_rng(void);
@@ -27,11 +33,6 @@ void crypto_put_default_rng(void);
  * CRYPTO_ALG_TYPE_RNG (listed as type "rng" in /proc/crypto)
  */
 
-static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm)
-{
-	return (struct crypto_rng *)tfm;
-}
-
 /**
  * crypto_alloc_rng() -- allocate RNG handle
  * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
@@ -52,15 +53,7 @@ static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm)
  * Return: allocated cipher handle in case of success; IS_ERR() is true in case
  *	   of an error, PTR_ERR() returns the error code.
  */
-static inline struct crypto_rng *crypto_alloc_rng(const char *alg_name,
-						  u32 type, u32 mask)
-{
-	type &= ~CRYPTO_ALG_TYPE_MASK;
-	type |= CRYPTO_ALG_TYPE_RNG;
-	mask |= CRYPTO_ALG_TYPE_MASK;
-
-	return __crypto_rng_cast(crypto_alloc_base(alg_name, type, mask));
-}
+struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask);
 
 static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm)
 {
@@ -80,18 +73,13 @@ static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm)
 	return &crypto_rng_tfm(tfm)->__crt_alg->cra_rng;
 }
 
-static inline struct rng_tfm *crypto_rng_crt(struct crypto_rng *tfm)
-{
-	return &crypto_rng_tfm(tfm)->crt_rng;
-}
-
 /**
  * crypto_free_rng() - zeroize and free RNG handle
  * @tfm: cipher handle to be freed
  */
 static inline void crypto_free_rng(struct crypto_rng *tfm)
 {
-	crypto_free_tfm(crypto_rng_tfm(tfm));
+	crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm));
 }
 
 /**
@@ -108,7 +96,7 @@ static inline void crypto_free_rng(struct crypto_rng *tfm)
 static inline int crypto_rng_get_bytes(struct crypto_rng *tfm,
 				       u8 *rdata, unsigned int dlen)
 {
-	return crypto_rng_crt(tfm)->rng_gen_random(tfm, rdata, dlen);
+	return tfm->generate(tfm, rdata, dlen);
 }
 
 /**
@@ -131,7 +119,7 @@ static inline int crypto_rng_get_bytes(struct crypto_rng *tfm,
 static inline int crypto_rng_reset(struct crypto_rng *tfm,
 				   u8 *seed, unsigned int slen)
 {
-	return crypto_rng_crt(tfm)->rng_reset(tfm, seed, slen);
+	return tfm->seed(tfm, seed, slen);
 }
 
 /**
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 10df5d2d093a..781f7d546020 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -655,19 +655,12 @@ struct compress_tfm {
 	                      u8 *dst, unsigned int *dlen);
 };
 
-struct rng_tfm {
-	int (*rng_gen_random)(struct crypto_rng *tfm, u8 *rdata,
-			      unsigned int dlen);
-	int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen);
-};
-
 #define crt_ablkcipher	crt_u.ablkcipher
 #define crt_aead	crt_u.aead
 #define crt_blkcipher	crt_u.blkcipher
 #define crt_cipher	crt_u.cipher
 #define crt_hash	crt_u.hash
 #define crt_compress	crt_u.compress
-#define crt_rng		crt_u.rng
 
 struct crypto_tfm {
 
@@ -680,7 +673,6 @@ struct crypto_tfm {
 		struct cipher_tfm cipher;
 		struct hash_tfm hash;
 		struct compress_tfm compress;
-		struct rng_tfm rng;
 	} crt_u;
 
 	void (*exit)(struct crypto_tfm *tfm);
@@ -714,10 +706,6 @@ struct crypto_hash {
 	struct crypto_tfm base;
 };
 
-struct crypto_rng {
-	struct crypto_tfm base;
-};
-
 enum {
 	CRYPTOA_UNSPEC,
 	CRYPTOA_ALG,
-- 
cgit v1.2.3


From acec27ff35af9caf34d76d16ee17ff3b292e7d83 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 21 Apr 2015 10:46:38 +0800
Subject: crypto: rng - Convert low-level crypto_rng to new style

This patch converts the low-level crypto_rng interface to the
"new" style.

This allows existing implementations to be converted over one-
by-one.  Once that is complete we can then remove the old rng
interface.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/rng.c                  | 56 ++++++++++++++++++++++++++++++++++++++-----
 include/crypto/internal/rng.h |  3 +++
 include/crypto/rng.h          | 42 ++++++++++++++++++++++++++++++--
 include/linux/crypto.h        |  6 ++---
 4 files changed, 96 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/rng.c b/crypto/rng.c
index f1d64948f6fc..5e0425a24657 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -36,10 +36,15 @@ static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm)
 	return container_of(tfm, struct crypto_rng, base);
 }
 
+static inline struct old_rng_alg *crypto_old_rng_alg(struct crypto_rng *tfm)
+{
+	return &crypto_rng_tfm(tfm)->__crt_alg->cra_rng;
+}
+
 static int generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen,
 		    u8 *dst, unsigned int dlen)
 {
-	return crypto_rng_alg(tfm)->rng_make_random(tfm, dst, dlen);
+	return crypto_old_rng_alg(tfm)->rng_make_random(tfm, dst, dlen);
 }
 
 static int rngapi_reset(struct crypto_rng *tfm, const u8 *seed,
@@ -58,7 +63,7 @@ static int rngapi_reset(struct crypto_rng *tfm, const u8 *seed,
 		src = buf;
 	}
 
-	err = crypto_rng_alg(tfm)->rng_reset(tfm, src, slen);
+	err = crypto_old_rng_alg(tfm)->rng_reset(tfm, src, slen);
 
 	kzfree(buf);
 	return err;
@@ -88,13 +93,31 @@ EXPORT_SYMBOL_GPL(crypto_rng_reset);
 static int crypto_rng_init_tfm(struct crypto_tfm *tfm)
 {
 	struct crypto_rng *rng = __crypto_rng_cast(tfm);
+	struct rng_alg *alg = crypto_rng_alg(rng);
+	struct old_rng_alg *oalg = crypto_old_rng_alg(rng);
+
+	if (oalg->rng_make_random) {
+		rng->generate = generate;
+		rng->seed = rngapi_reset;
+		rng->seedsize = oalg->seedsize;
+		return 0;
+	}
 
-	rng->generate = generate;
-	rng->seed = rngapi_reset;
+	rng->generate = alg->generate;
+	rng->seed = alg->seed;
+	rng->seedsize = alg->seedsize;
 
 	return 0;
 }
 
+static unsigned int seedsize(struct crypto_alg *alg)
+{
+	struct rng_alg *ralg = container_of(alg, struct rng_alg, base);
+
+	return alg->cra_rng.rng_make_random ?
+	       alg->cra_rng.seedsize : ralg->seedsize;
+}
+
 #ifdef CONFIG_NET
 static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
@@ -102,7 +125,7 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
 
 	strncpy(rrng.type, "rng", sizeof(rrng.type));
 
-	rrng.seedsize = alg->cra_rng.seedsize;
+	rrng.seedsize = seedsize(alg);
 
 	if (nla_put(skb, CRYPTOCFGA_REPORT_RNG,
 		    sizeof(struct crypto_report_rng), &rrng))
@@ -124,7 +147,7 @@ static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : rng\n");
-	seq_printf(m, "seedsize     : %u\n", alg->cra_rng.seedsize);
+	seq_printf(m, "seedsize     : %u\n", seedsize(alg));
 }
 
 const struct crypto_type crypto_rng_type = {
@@ -189,5 +212,26 @@ void crypto_put_default_rng(void)
 }
 EXPORT_SYMBOL_GPL(crypto_put_default_rng);
 
+int crypto_register_rng(struct rng_alg *alg)
+{
+	struct crypto_alg *base = &alg->base;
+
+	if (alg->seedsize > PAGE_SIZE / 8)
+		return -EINVAL;
+
+	base->cra_type = &crypto_rng_type;
+	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
+	base->cra_flags |= CRYPTO_ALG_TYPE_RNG;
+
+	return crypto_register_alg(base);
+}
+EXPORT_SYMBOL_GPL(crypto_register_rng);
+
+void crypto_unregister_rng(struct rng_alg *alg)
+{
+	crypto_unregister_alg(&alg->base);
+}
+EXPORT_SYMBOL_GPL(crypto_unregister_rng);
+
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Random Number Generator");
diff --git a/include/crypto/internal/rng.h b/include/crypto/internal/rng.h
index 896973369573..76f3c9519ba5 100644
--- a/include/crypto/internal/rng.h
+++ b/include/crypto/internal/rng.h
@@ -18,6 +18,9 @@
 
 extern const struct crypto_type crypto_rng_type;
 
+int crypto_register_rng(struct rng_alg *alg);
+void crypto_unregister_rng(struct rng_alg *alg);
+
 static inline void *crypto_rng_ctx(struct crypto_rng *tfm)
 {
 	return crypto_tfm_ctx(&tfm->base);
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index 7fca37144b59..133f0441ad3e 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -15,11 +15,48 @@
 
 #include <linux/crypto.h>
 
+struct crypto_rng;
+
+/**
+ * struct rng_alg - random number generator definition
+ *
+ * @generate:	The function defined by this variable obtains a
+ *		random number. The random number generator transform
+ *		must generate the random number out of the context
+ *		provided with this call, plus any additional data
+ *		if provided to the call.
+ * @seed:	Seed or reseed the random number generator.  With the
+ *		invocation of this function call, the random number
+ *		generator shall become ready fo generation.  If the
+ *		random number generator requires a seed for setting
+ *		up a new state, the seed must be provided by the
+ *		consumer while invoking this function. The required
+ *		size of the seed is defined with @seedsize .
+ * @seedsize:	The seed size required for a random number generator
+ *		initialization defined with this variable. Some
+ *		random number generators does not require a seed
+ *		as the seeding is implemented internally without
+ *		the need of support by the consumer. In this case,
+ *		the seed size is set to zero.
+ * @base:	Common crypto API algorithm data structure.
+ */
+struct rng_alg {
+	int (*generate)(struct crypto_rng *tfm,
+			const u8 *src, unsigned int slen,
+			u8 *dst, unsigned int dlen);
+	int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen);
+
+	unsigned int seedsize;
+
+	struct crypto_alg base;
+};
+
 struct crypto_rng {
 	int (*generate)(struct crypto_rng *tfm,
 			const u8 *src, unsigned int slen,
 			u8 *dst, unsigned int dlen);
 	int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen);
+	unsigned int seedsize;
 	struct crypto_tfm base;
 };
 
@@ -72,7 +109,8 @@ static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm)
  */
 static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm)
 {
-	return &crypto_rng_tfm(tfm)->__crt_alg->cra_rng;
+	return container_of(crypto_rng_tfm(tfm)->__crt_alg,
+			    struct rng_alg, base);
 }
 
 /**
@@ -156,7 +194,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed,
  */
 static inline int crypto_rng_seedsize(struct crypto_rng *tfm)
 {
-	return crypto_rng_alg(tfm)->seedsize;
+	return tfm->seedsize;
 }
 
 #endif
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 781f7d546020..2fa9b05360f7 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -427,7 +427,7 @@ struct compress_alg {
 };
 
 /**
- * struct rng_alg - random number generator definition
+ * struct old_rng_alg - random number generator definition
  * @rng_make_random: The function defined by this variable obtains a random
  *		     number. The random number generator transform must generate
  *		     the random number out of the context provided with this
@@ -445,7 +445,7 @@ struct compress_alg {
  *	      seeding is implemented internally without the need of support by
  *	      the consumer. In this case, the seed size is set to zero.
  */
-struct rng_alg {
+struct old_rng_alg {
 	int (*rng_make_random)(struct crypto_rng *tfm, u8 *rdata,
 			       unsigned int dlen);
 	int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen);
@@ -559,7 +559,7 @@ struct crypto_alg {
 		struct blkcipher_alg blkcipher;
 		struct cipher_alg cipher;
 		struct compress_alg compress;
-		struct rng_alg rng;
+		struct old_rng_alg rng;
 	} cra_u;
 
 	int (*cra_init)(struct crypto_tfm *tfm);
-- 
cgit v1.2.3


From 94f1bb15bed84ad6c893916b7e7b9db6f1d7eec6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 21 Apr 2015 10:46:46 +0800
Subject: crypto: rng - Remove old low-level rng interface

Now that all rng implementations have switched over to the new
interface, we can remove the old low-level interface.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/rng.c                  | 57 +++----------------------------------------
 include/crypto/internal/rng.h |  3 +--
 include/crypto/rng.h          | 10 +++-----
 include/linux/crypto.h        | 30 -----------------------
 4 files changed, 8 insertions(+), 92 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/rng.c b/crypto/rng.c
index e98ce1ca527d..055e276427b1 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -4,6 +4,7 @@
  * RNG operations.
  *
  * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com>
+ * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -36,39 +37,6 @@ static inline struct crypto_rng *__crypto_rng_cast(struct crypto_tfm *tfm)
 	return container_of(tfm, struct crypto_rng, base);
 }
 
-static inline struct old_rng_alg *crypto_old_rng_alg(struct crypto_rng *tfm)
-{
-	return &crypto_rng_tfm(tfm)->__crt_alg->cra_rng;
-}
-
-static int generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen,
-		    u8 *dst, unsigned int dlen)
-{
-	return crypto_old_rng_alg(tfm)->rng_make_random(tfm, dst, dlen);
-}
-
-static int rngapi_reset(struct crypto_rng *tfm, const u8 *seed,
-			unsigned int slen)
-{
-	u8 *buf = NULL;
-	u8 *src = (u8 *)seed;
-	int err;
-
-	if (slen) {
-		buf = kmalloc(slen, GFP_KERNEL);
-		if (!buf)
-			return -ENOMEM;
-
-		memcpy(buf, seed, slen);
-		src = buf;
-	}
-
-	err = crypto_old_rng_alg(tfm)->rng_reset(tfm, src, slen);
-
-	kzfree(buf);
-	return err;
-}
-
 int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 {
 	u8 *buf = NULL;
@@ -83,7 +51,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 		seed = buf;
 	}
 
-	err = tfm->seed(tfm, seed, slen);
+	err = crypto_rng_alg(tfm)->seed(tfm, seed, slen);
 
 	kfree(buf);
 	return err;
@@ -92,21 +60,6 @@ EXPORT_SYMBOL_GPL(crypto_rng_reset);
 
 static int crypto_rng_init_tfm(struct crypto_tfm *tfm)
 {
-	struct crypto_rng *rng = __crypto_rng_cast(tfm);
-	struct rng_alg *alg = crypto_rng_alg(rng);
-	struct old_rng_alg *oalg = crypto_old_rng_alg(rng);
-
-	if (oalg->rng_make_random) {
-		rng->generate = generate;
-		rng->seed = rngapi_reset;
-		rng->seedsize = oalg->seedsize;
-		return 0;
-	}
-
-	rng->generate = alg->generate;
-	rng->seed = alg->seed;
-	rng->seedsize = alg->seedsize;
-
 	return 0;
 }
 
@@ -114,8 +67,7 @@ static unsigned int seedsize(struct crypto_alg *alg)
 {
 	struct rng_alg *ralg = container_of(alg, struct rng_alg, base);
 
-	return alg->cra_rng.rng_make_random ?
-	       alg->cra_rng.seedsize : ralg->seedsize;
+	return ralg->seedsize;
 }
 
 #ifdef CONFIG_NET
@@ -150,7 +102,7 @@ static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "seedsize     : %u\n", seedsize(alg));
 }
 
-const struct crypto_type crypto_rng_type = {
+static const struct crypto_type crypto_rng_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_rng_init_tfm,
 #ifdef CONFIG_PROC_FS
@@ -162,7 +114,6 @@ const struct crypto_type crypto_rng_type = {
 	.type = CRYPTO_ALG_TYPE_RNG,
 	.tfmsize = offsetof(struct crypto_rng, base),
 };
-EXPORT_SYMBOL_GPL(crypto_rng_type);
 
 struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask)
 {
diff --git a/include/crypto/internal/rng.h b/include/crypto/internal/rng.h
index 2c9a865c66e2..263f1a5eebc7 100644
--- a/include/crypto/internal/rng.h
+++ b/include/crypto/internal/rng.h
@@ -2,6 +2,7 @@
  * RNG: Random Number Generator  algorithms under the crypto API
  *
  * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com>
+ * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -16,8 +17,6 @@
 #include <crypto/algapi.h>
 #include <crypto/rng.h>
 
-extern const struct crypto_type crypto_rng_type;
-
 int crypto_register_rng(struct rng_alg *alg);
 void crypto_unregister_rng(struct rng_alg *alg);
 int crypto_register_rngs(struct rng_alg *algs, int count);
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index cc22e52a129a..c5d4684429f5 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -2,6 +2,7 @@
  * RNG: Random Number Generator  algorithms under the crypto API
  *
  * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com>
+ * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
@@ -56,11 +57,6 @@ struct rng_alg {
 };
 
 struct crypto_rng {
-	int (*generate)(struct crypto_rng *tfm,
-			const u8 *src, unsigned int slen,
-			u8 *dst, unsigned int dlen);
-	int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen);
-	unsigned int seedsize;
 	struct crypto_tfm base;
 };
 
@@ -144,7 +140,7 @@ static inline int crypto_rng_generate(struct crypto_rng *tfm,
 				      const u8 *src, unsigned int slen,
 				      u8 *dst, unsigned int dlen)
 {
-	return tfm->generate(tfm, src, slen, dst, dlen);
+	return crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen);
 }
 
 /**
@@ -198,7 +194,7 @@ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed,
  */
 static inline int crypto_rng_seedsize(struct crypto_rng *tfm)
 {
-	return tfm->seedsize;
+	return crypto_rng_alg(tfm)->seedsize;
 }
 
 #endif
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 2fa9b05360f7..ee14140f8893 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -138,7 +138,6 @@ struct crypto_async_request;
 struct crypto_aead;
 struct crypto_blkcipher;
 struct crypto_hash;
-struct crypto_rng;
 struct crypto_tfm;
 struct crypto_type;
 struct aead_givcrypt_request;
@@ -426,40 +425,12 @@ struct compress_alg {
 			      unsigned int slen, u8 *dst, unsigned int *dlen);
 };
 
-/**
- * struct old_rng_alg - random number generator definition
- * @rng_make_random: The function defined by this variable obtains a random
- *		     number. The random number generator transform must generate
- *		     the random number out of the context provided with this
- *		     call.
- * @rng_reset: Reset of the random number generator by clearing the entire state.
- *	       With the invocation of this function call, the random number
- *             generator shall completely reinitialize its state. If the random
- *	       number generator requires a seed for setting up a new state,
- *	       the seed must be provided by the consumer while invoking this
- *	       function. The required size of the seed is defined with
- *	       @seedsize .
- * @seedsize: The seed size required for a random number generator
- *	      initialization defined with this variable. Some random number
- *	      generators like the SP800-90A DRBG does not require a seed as the
- *	      seeding is implemented internally without the need of support by
- *	      the consumer. In this case, the seed size is set to zero.
- */
-struct old_rng_alg {
-	int (*rng_make_random)(struct crypto_rng *tfm, u8 *rdata,
-			       unsigned int dlen);
-	int (*rng_reset)(struct crypto_rng *tfm, u8 *seed, unsigned int slen);
-
-	unsigned int seedsize;
-};
-
 
 #define cra_ablkcipher	cra_u.ablkcipher
 #define cra_aead	cra_u.aead
 #define cra_blkcipher	cra_u.blkcipher
 #define cra_cipher	cra_u.cipher
 #define cra_compress	cra_u.compress
-#define cra_rng		cra_u.rng
 
 /**
  * struct crypto_alg - definition of a cryptograpic cipher algorithm
@@ -559,7 +530,6 @@ struct crypto_alg {
 		struct blkcipher_alg blkcipher;
 		struct cipher_alg cipher;
 		struct compress_alg compress;
-		struct old_rng_alg rng;
 	} cra_u;
 
 	int (*cra_init)(struct crypto_tfm *tfm);
-- 
cgit v1.2.3


From 4796cf9b02b5bea141632e21d64556a7eb883a65 Mon Sep 17 00:00:00 2001
From: Yingjoe Chen <yingjoe.chen@mediatek.com>
Date: Fri, 10 Apr 2015 21:55:50 +0800
Subject: time: Remove nonexistent function prototype

The function clocksource_get_next() was removed in commit 75c5158f70
(timekeeping: Update clocksource with stop_machine), but the
prototype was not removed with it. Remove the prototype.

Signed-off-by: Yingjoe Chen <yingjoe.chen@mediatek.com>
Cc: <linux-arm-kernel@lists.infradead.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: <srv_heupstream@mediatek.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/1428674150-1780-1-git-send-email-yingjoe.chen@mediatek.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 135509821c39..a25fc6e873b8 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -181,7 +181,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift)
 
 extern int clocksource_unregister(struct clocksource*);
 extern void clocksource_touch_watchdog(void);
-extern struct clocksource* clocksource_get_next(void);
 extern void clocksource_change_rating(struct clocksource *cs, int rating);
 extern void clocksource_suspend(void);
 extern void clocksource_resume(void);
-- 
cgit v1.2.3


From 91e5a2170e795989da9f90c18ba18984f23acc5b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 13 Apr 2015 21:02:22 +0000
Subject: hrtimer: Document hrtimer_forward[_now]() proper

Document the calling context conditions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150413210035.178751779@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 17 ++++++++++++++++-
 kernel/time/hrtimer.c   |  8 ++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 05f6df1fdf5b..7770676c387a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -418,7 +418,22 @@ static inline int hrtimer_callback_running(struct hrtimer *timer)
 extern u64
 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
 
-/* Forward a hrtimer so it expires after the hrtimer's current now */
+/**
+ * hrtimer_forward_now - forward the timer expiry so it expires after now
+ * @timer:	hrtimer to forward
+ * @interval:	the interval to forward
+ *
+ * Forward the timer expiry so it will expire after the current time
+ * of the hrtimer clock base. Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
+ */
 static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 				      ktime_t interval)
 {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 76d4bd962b19..b1a74ee3e99a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -801,6 +801,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  *
  * Forward the timer expiry so it will expire in the future.
  * Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
  */
 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 {
-- 
cgit v1.2.3


From 398ca17fb54b212cdc9da7ff4a17a35c48dd2103 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:27 +0000
Subject: hrtimer: Get rid of the resolution field in hrtimer_clock_base

The field has no value because all clock bases have the same
resolution. The resolution only changes when we switch to high
resolution timer mode. We can evaluate that from a single static
variable as well. In the !HIGHRES case its simply a constant.

Export the variable, so we can simplify the usage sites.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203500.645454122@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h  |  6 ++++--
 kernel/time/hrtimer.c    | 26 +++++++++-----------------
 kernel/time/timer_list.c |  8 ++++----
 3 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 7770676c387a..bc6f91b5443b 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -137,7 +137,6 @@ struct hrtimer_sleeper {
  *			timer to a base on another cpu.
  * @clockid:		clock id for per_cpu support
  * @active:		red black tree root node for the active timers
- * @resolution:		the resolution of the clock, in nanoseconds
  * @get_time:		function to retrieve the current time of the clock
  * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
@@ -147,7 +146,6 @@ struct hrtimer_clock_base {
 	int			index;
 	clockid_t		clockid;
 	struct timerqueue_head	active;
-	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
 	ktime_t			softirq_time;
 	ktime_t			offset;
@@ -295,11 +293,15 @@ extern void hrtimer_peek_ahead_timers(void);
 
 extern void clock_was_set_delayed(void);
 
+extern unsigned int hrtimer_resolution;
+
 #else
 
 # define MONOTONIC_RES_NSEC	LOW_RES_NSEC
 # define KTIME_MONOTONIC_RES	KTIME_LOW_RES
 
+#define hrtimer_resolution	LOW_RES_NSEC
+
 static inline void hrtimer_peek_ahead_timers(void) { }
 
 /*
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9abd50b60e83..965687a1fce6 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -66,7 +66,6 @@
  */
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
-
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
 	.clock_base =
 	{
@@ -74,25 +73,21 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.index = HRTIMER_BASE_MONOTONIC,
 			.clockid = CLOCK_MONOTONIC,
 			.get_time = &ktime_get,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_REALTIME,
 			.clockid = CLOCK_REALTIME,
 			.get_time = &ktime_get_real,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_BOOTTIME,
 			.clockid = CLOCK_BOOTTIME,
 			.get_time = &ktime_get_boottime,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_TAI,
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
-			.resolution = KTIME_LOW_RES,
 		},
 	}
 };
@@ -478,6 +473,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
  * High resolution timer enabled ?
  */
 static int hrtimer_hres_enabled __read_mostly  = 1;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
 
 /*
  * Enable / Disable high resolution mode
@@ -660,7 +657,7 @@ static void retrigger_next_event(void *arg)
  */
 static int hrtimer_switch_to_hres(void)
 {
-	int i, cpu = smp_processor_id();
+	int cpu = smp_processor_id();
 	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
 	unsigned long flags;
 
@@ -676,8 +673,7 @@ static int hrtimer_switch_to_hres(void)
 		return 0;
 	}
 	base->hres_active = 1;
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
-		base->clock_base[i].resolution = KTIME_HIGH_RES;
+	hrtimer_resolution = HIGH_RES_NSEC;
 
 	tick_setup_sched_timer();
 	/* "Retrigger" the interrupt to get things going */
@@ -820,8 +816,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 	if (delta.tv64 < 0)
 		return 0;
 
-	if (interval.tv64 < timer->base->resolution.tv64)
-		interval.tv64 = timer->base->resolution.tv64;
+	if (interval.tv64 < hrtimer_resolution)
+		interval.tv64 = hrtimer_resolution;
 
 	if (unlikely(delta.tv64 >= interval.tv64)) {
 		s64 incr = ktime_to_ns(interval);
@@ -963,7 +959,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * timeouts. This will go away with the GTOD framework.
 		 */
 #ifdef CONFIG_TIME_LOW_RES
-		tim = ktime_add_safe(tim, base->resolution);
+		tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
 #endif
 	}
 
@@ -1193,12 +1189,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
  */
 int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 {
-	struct hrtimer_cpu_base *cpu_base;
-	int base = hrtimer_clockid_to_base(which_clock);
-
-	cpu_base = raw_cpu_ptr(&hrtimer_bases);
-	*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
-
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 5960af2196ac..bdd5e987f115 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -127,10 +127,10 @@ static void
 print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
 {
 	SEQ_printf(m, "  .base:       %pK\n", base);
-	SEQ_printf(m, "  .index:      %d\n",
-			base->index);
-	SEQ_printf(m, "  .resolution: %Lu nsecs\n",
-			(unsigned long long)ktime_to_ns(base->resolution));
+	SEQ_printf(m, "  .index:      %d\n", base->index);
+
+	SEQ_printf(m, "  .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+
 	SEQ_printf(m,   "  .get_time:   ");
 	print_name_offset(m, base->get_time);
 	SEQ_printf(m,   "\n");
-- 
cgit v1.2.3


From 056a3cacbc46e5aca27b350ce4ecb3b33ebb0700 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:32 +0000
Subject: hrtimer: Get rid of hrtimer_get_res()

The resolution is directly accessible now. So its simpler just to fill
in the values of the timespec and be done with it.

Text size reduction (combined with "hrtimer: Get rid of the resolution
field in hrtimer_clock_base"):
       x8664 -61, i386 -221, ARM -60, power64 -48

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203500.879888080@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h    |  1 -
 kernel/time/alarmtimer.c   |  6 +++---
 kernel/time/hrtimer.c      | 16 ----------------
 kernel/time/posix-timers.c | 17 ++++++++++++-----
 4 files changed, 15 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bc6f91b5443b..8025156c8fa1 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -385,7 +385,6 @@ static inline int hrtimer_restart(struct hrtimer *timer)
 
 /* Query timers: */
 extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
-extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
 
 extern ktime_t hrtimer_get_next_event(void);
 
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 1b001ed1edb9..0b55a7570c90 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -495,12 +495,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
  */
 static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
-	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
-
 	if (!alarmtimer_get_rtcdev())
 		return -EINVAL;
 
-	return hrtimer_get_res(baseid, tp);
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
+	return 0;
 }
 
 /**
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 965687a1fce6..73131dab787e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1179,22 +1179,6 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
-/**
- * hrtimer_get_res - get the timer resolution for a clock
- * @which_clock: which clock to query
- * @tp:		 pointer to timespec variable to store the resolution
- *
- * Store the resolution of the clock selected by @which_clock in the
- * variable pointed to by @tp.
- */
-int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
-{
-	tp->tv_sec = 0;
-	tp->tv_nsec = hrtimer_resolution;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(hrtimer_get_res);
-
 static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 {
 	struct hrtimer_clock_base *base = timer->base;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31ea01f42e1f..31d11ac9fa47 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+{
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
+	return 0;
+}
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
 static __init int init_posix_timers(void)
 {
 	struct k_clock clock_realtime = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_clock_realtime_get,
 		.clock_set	= posix_clock_realtime_set,
 		.clock_adj	= posix_clock_realtime_adj,
@@ -290,7 +297,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_ktime_get_ts,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -300,7 +307,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic_raw = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_monotonic_raw,
 	};
 	struct k_clock clock_realtime_coarse = {
@@ -312,7 +319,7 @@ static __init int init_posix_timers(void)
 		.clock_get	= posix_get_monotonic_coarse,
 	};
 	struct k_clock clock_tai = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_tai,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -322,7 +329,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_boottime = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_boottime,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
-- 
cgit v1.2.3


From a6ffebce7f89f6f97cc22838a5d4383b15d6774f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:34 +0000
Subject: hrtimer: Make the statistics fields smaller

No point in having usigned long for /proc/timer_list statistics. Make
them unsigned int.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203500.959773467@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h  | 8 ++++----
 kernel/time/hrtimer.c    | 4 ++--
 kernel/time/timer_list.c | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 8025156c8fa1..d39f2847754c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -187,10 +187,10 @@ struct hrtimer_cpu_base {
 	int				in_hrtirq;
 	int				hres_active;
 	int				hang_detected;
-	unsigned long			nr_events;
-	unsigned long			nr_retries;
-	unsigned long			nr_hangs;
-	ktime_t				max_hang_time;
+	unsigned int			nr_events;
+	unsigned int			nr_retries;
+	unsigned int			nr_hangs;
+	unsigned int			max_hang_time;
 #endif
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 };
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 73131dab787e..874e0914eb3c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1327,8 +1327,8 @@ retry:
 	cpu_base->hang_detected = 1;
 	raw_spin_unlock(&cpu_base->lock);
 	delta = ktime_sub(now, entry_time);
-	if (delta.tv64 > cpu_base->max_hang_time.tv64)
-		cpu_base->max_hang_time = delta;
+	if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+		cpu_base->max_hang_time = (unsigned int) delta.tv64;
 	/*
 	 * Limit it to a sensible value as we enforce a longer
 	 * delay. Give the CPU at least 100ms to catch up.
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdd5e987f115..6232fc536185 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -165,7 +165,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 	P(nr_events);
 	P(nr_retries);
 	P(nr_hangs);
-	P_ns(max_hang_time);
+	P(max_hang_time);
 #endif
 #undef P
 #undef P_ns
-- 
cgit v1.2.3


From 21d6d52a1b7028e6a6840bd82e354aefa9a5e203 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:35 +0000
Subject: hrtimer: Get rid of softirq time

The softirq time field in the clock bases is an optimization from the
early days of hrtimers. It provides a coarse "jiffies" like time
mostly for self rearming timers.

But that comes with a price:
    - Larger code size
    - Extra storage space
    - Duplicated functions with really small differences

The benefit of this is optimization is marginal for contemporary
systems.

Consolidate everything on the high resolution timer
implementation. This makes further optimizations possible.

Text size reduction:
       x8664 -95, i386 -356, ARM -148, ARM64 -40, power64 -16

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.039977424@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h   |  24 ++------
 kernel/time/hrtimer.c     | 148 ++++++++++++++++++----------------------------
 kernel/time/timekeeping.c |  32 ----------
 kernel/time/timekeeping.h |   3 -
 4 files changed, 64 insertions(+), 143 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d39f2847754c..e292830b58f0 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -138,7 +138,6 @@ struct hrtimer_sleeper {
  * @clockid:		clock id for per_cpu support
  * @active:		red black tree root node for the active timers
  * @get_time:		function to retrieve the current time of the clock
- * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
  */
 struct hrtimer_clock_base {
@@ -147,7 +146,6 @@ struct hrtimer_clock_base {
 	clockid_t		clockid;
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
-	ktime_t			softirq_time;
 	ktime_t			offset;
 };
 
@@ -260,19 +258,16 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
 	return ktime_sub(timer->node.expires, timer->base->get_time());
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-struct clock_event_device;
-
-extern void hrtimer_interrupt(struct clock_event_device *dev);
-
-/*
- * In high resolution mode the time reference must be read accurate
- */
 static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
 {
 	return timer->base->get_time();
 }
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+struct clock_event_device;
+
+extern void hrtimer_interrupt(struct clock_event_device *dev);
+
 static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
 	return timer->base->cpu_base->hres_active;
@@ -304,15 +299,6 @@ extern unsigned int hrtimer_resolution;
 
 static inline void hrtimer_peek_ahead_timers(void) { }
 
-/*
- * In non high resolution mode the time reference is taken from
- * the base softirq time variable.
- */
-static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
-{
-	return timer->base->softirq_time;
-}
-
 static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
 	return 0;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 874e0914eb3c..9e111dd83ca3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -104,27 +104,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 	return hrtimer_clock_to_base_table[clock_id];
 }
 
-
-/*
- * Get the coarse grained time at the softirq based on xtime and
- * wall_to_monotonic.
- */
-static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
-{
-	ktime_t xtim, mono, boot, tai;
-	ktime_t off_real, off_boot, off_tai;
-
-	mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
-	boot = ktime_add(mono, off_boot);
-	xtim = ktime_add(mono, off_real);
-	tai = ktime_add(mono, off_tai);
-
-	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
-	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
-	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
-	base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
-}
-
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
@@ -466,6 +445,15 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 }
 #endif
 
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+
+	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
+}
+
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -516,7 +504,12 @@ static inline int hrtimer_hres_active(void)
 static void
 hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 {
-	ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+	ktime_t expires_next;
+
+	if (!cpu_base->hres_active)
+		return;
+
+	expires_next = __hrtimer_get_next_event(cpu_base);
 
 	if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
 		return;
@@ -625,15 +618,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 	base->hres_active = 0;
 }
 
-static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
-{
-	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
-	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
-	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-
-	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
-}
-
 /*
  * Retrigger next event is called after clock was set
  *
@@ -1179,10 +1163,10 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
-static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+			  struct hrtimer_clock_base *base,
+			  struct hrtimer *timer, ktime_t *now)
 {
-	struct hrtimer_clock_base *base = timer->base;
-	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
@@ -1219,34 +1203,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer interrupt
- * Called with interrupts disabled
- */
-void hrtimer_interrupt(struct clock_event_device *dev)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 {
-	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	ktime_t expires_next, now, entry_time, delta;
-	int i, retries = 0;
-
-	BUG_ON(!cpu_base->hres_active);
-	cpu_base->nr_events++;
-	dev->next_event.tv64 = KTIME_MAX;
-
-	raw_spin_lock(&cpu_base->lock);
-	entry_time = now = hrtimer_update_base(cpu_base);
-retry:
-	cpu_base->in_hrtirq = 1;
-	/*
-	 * We set expires_next to KTIME_MAX here with cpu_base->lock
-	 * held to prevent that a timer is enqueued in our queue via
-	 * the migration code. This does not affect enqueueing of
-	 * timers which run their callback and need to be requeued on
-	 * this CPU.
-	 */
-	cpu_base->expires_next.tv64 = KTIME_MAX;
+	int i;
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		struct hrtimer_clock_base *base;
@@ -1279,9 +1238,42 @@ retry:
 			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
 				break;
 
-			__run_hrtimer(timer, &basenow);
+			__run_hrtimer(cpu_base, base, timer, &basenow);
 		}
 	}
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t expires_next, now, entry_time, delta;
+	int retries = 0;
+
+	BUG_ON(!cpu_base->hres_active);
+	cpu_base->nr_events++;
+	dev->next_event.tv64 = KTIME_MAX;
+
+	raw_spin_lock(&cpu_base->lock);
+	entry_time = now = hrtimer_update_base(cpu_base);
+retry:
+	cpu_base->in_hrtirq = 1;
+	/*
+	 * We set expires_next to KTIME_MAX here with cpu_base->lock
+	 * held to prevent that a timer is enqueued in our queue via
+	 * the migration code. This does not affect enqueueing of
+	 * timers which run their callback and need to be requeued on
+	 * this CPU.
+	 */
+	cpu_base->expires_next.tv64 = KTIME_MAX;
+
+	__hrtimer_run_queues(cpu_base, now);
+
 	/* Reevaluate the clock bases for the next expiry */
 	expires_next = __hrtimer_get_next_event(cpu_base);
 	/*
@@ -1416,38 +1408,16 @@ void hrtimer_run_pending(void)
  */
 void hrtimer_run_queues(void)
 {
-	struct timerqueue_node *node;
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	struct hrtimer_clock_base *base;
-	int index, gettime = 1;
+	ktime_t now;
 
 	if (hrtimer_hres_active())
 		return;
 
-	for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
-		base = &cpu_base->clock_base[index];
-		if (!timerqueue_getnext(&base->active))
-			continue;
-
-		if (gettime) {
-			hrtimer_get_softirq_time(cpu_base);
-			gettime = 0;
-		}
-
-		raw_spin_lock(&cpu_base->lock);
-
-		while ((node = timerqueue_getnext(&base->active))) {
-			struct hrtimer *timer;
-
-			timer = container_of(node, struct hrtimer, node);
-			if (base->softirq_time.tv64 <=
-					hrtimer_get_expires_tv64(timer))
-				break;
-
-			__run_hrtimer(timer, &base->softirq_time);
-		}
-		raw_spin_unlock(&cpu_base->lock);
-	}
+	raw_spin_lock(&cpu_base->lock);
+	now = hrtimer_update_base(cpu_base);
+	__hrtimer_run_queues(cpu_base, now);
+	raw_spin_unlock(&cpu_base->lock);
 }
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 946acb72179f..dd1efa6a4ea4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1925,37 +1925,6 @@ void do_timer(unsigned long ticks)
 	calc_global_load(ticks);
 }
 
-/**
- * ktime_get_update_offsets_tick - hrtimer helper
- * @offs_real:	pointer to storage for monotonic -> realtime offset
- * @offs_boot:	pointer to storage for monotonic -> boottime offset
- * @offs_tai:	pointer to storage for monotonic -> clock tai offset
- *
- * Returns monotonic time at last tick and various offsets
- */
-ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
-							ktime_t *offs_tai)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	unsigned int seq;
-	ktime_t base;
-	u64 nsecs;
-
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-
-		base = tk->tkr_mono.base;
-		nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
-
-		*offs_real = tk->offs_real;
-		*offs_boot = tk->offs_boot;
-		*offs_tai = tk->offs_tai;
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	return ktime_add_ns(base, nsecs);
-}
-
-#ifdef CONFIG_HIGH_RES_TIMERS
 /**
  * ktime_get_update_offsets_now - hrtimer helper
  * @offs_real:	pointer to storage for monotonic -> realtime offset
@@ -1986,7 +1955,6 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
 
 	return ktime_add_ns(base, nsecs);
 }
-#endif
 
 /**
  * do_adjtimex() - Accessor function to NTP __do_adjtimex function
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 5b57f6c9ae34..4d177fce37d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -3,9 +3,6 @@
 /*
  * Internal interfaces for kernel/time/
  */
-extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
-						ktime_t *offs_boot,
-						ktime_t *offs_tai);
 extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
 						ktime_t *offs_boot,
 						ktime_t *offs_tai);
-- 
cgit v1.2.3


From 868a3e915f7f5eba8f8cb4f7da2276760807c51c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:37 +0000
Subject: hrtimer: Make offset update smarter

On every tick/hrtimer interrupt we update the offset variables of the
clock bases. That's silly because these offsets change very seldom.

Add a sequence counter to the time keeping code which keeps track of
the offset updates (clock_was_set()). Have a sequence cache in the
hrtimer cpu bases to evaluate whether the offsets must be updated or
not. This allows us later to avoid pointless cacheline pollution.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20150414203501.132820245@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
---
 include/linux/hrtimer.h             |  4 ++--
 include/linux/timekeeper_internal.h |  2 ++
 kernel/time/hrtimer.c               |  3 ++-
 kernel/time/timekeeping.c           | 23 ++++++++++++++++-------
 kernel/time/timekeeping.h           |  7 ++++---
 5 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index e292830b58f0..5e04f8fc26f6 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -163,7 +163,7 @@ enum  hrtimer_base_type {
  *			and timers
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
- * @clock_was_set:	Indicates that clock was set from irq context.
+ * @clock_was_set_seq:	Sequence counter of clock was set events
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
  * @in_hrtirq:		hrtimer_interrupt() is currently executing
@@ -179,7 +179,7 @@ struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
 	unsigned int			cpu;
 	unsigned int			active_bases;
-	unsigned int			clock_was_set;
+	unsigned int			clock_was_set_seq;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t				expires_next;
 	int				in_hrtirq;
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index fb86963859c7..6f8276ae579c 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -49,6 +49,7 @@ struct tk_read_base {
  * @offs_boot:		Offset clock monotonic -> clock boottime
  * @offs_tai:		Offset clock monotonic -> clock tai
  * @tai_offset:		The current UTC to TAI offset in seconds
+ * @clock_was_set_seq:	The sequence number of clock was set events
  * @raw_time:		Monotonic raw base time in timespec64 format
  * @cycle_interval:	Number of clock cycles in one NTP interval
  * @xtime_interval:	Number of clock shifted nano seconds in one NTP
@@ -85,6 +86,7 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	s32			tai_offset;
+	unsigned int		clock_was_set_seq;
 	struct timespec64	raw_time;
 
 	/* The following members are for timekeeping internal use */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9e111dd83ca3..8ce9b3138017 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -451,7 +451,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
 	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
 
-	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
+	return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+					    offs_real, offs_boot, offs_tai);
 }
 
 /* High resolution timer related functions */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dd1efa6a4ea4..3365e32dc208 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -602,6 +602,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 
 	update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
 	update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
+
+	if (action & TK_CLOCK_WAS_SET)
+		tk->clock_was_set_seq++;
 }
 
 /**
@@ -1927,15 +1930,19 @@ void do_timer(unsigned long ticks)
 
 /**
  * ktime_get_update_offsets_now - hrtimer helper
+ * @cwsseq:	pointer to check and store the clock was set sequence number
  * @offs_real:	pointer to storage for monotonic -> realtime offset
  * @offs_boot:	pointer to storage for monotonic -> boottime offset
  * @offs_tai:	pointer to storage for monotonic -> clock tai offset
  *
- * Returns current monotonic time and updates the offsets
+ * Returns current monotonic time and updates the offsets if the
+ * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
+ * different.
+ *
  * Called from hrtimer_interrupt() or retrigger_next_event()
  */
-ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
-							ktime_t *offs_tai)
+ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
+				     ktime_t *offs_boot, ktime_t *offs_tai)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned int seq;
@@ -1947,10 +1954,12 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
 
 		base = tk->tkr_mono.base;
 		nsecs = timekeeping_get_ns(&tk->tkr_mono);
-
-		*offs_real = tk->offs_real;
-		*offs_boot = tk->offs_boot;
-		*offs_tai = tk->offs_tai;
+		if (*cwsseq != tk->clock_was_set_seq) {
+			*cwsseq = tk->clock_was_set_seq;
+			*offs_real = tk->offs_real;
+			*offs_boot = tk->offs_boot;
+			*offs_tai = tk->offs_tai;
+		}
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
 	return ktime_add_ns(base, nsecs);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 4d177fce37d5..704f595ce83f 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -3,9 +3,10 @@
 /*
  * Internal interfaces for kernel/time/
  */
-extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
-						ktime_t *offs_boot,
-						ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
+					    ktime_t *offs_real,
+					    ktime_t *offs_boot,
+					    ktime_t *offs_tai);
 
 extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
-- 
cgit v1.2.3


From e19ffe8be2cd0a1f726b235443eba21e64f6be5e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:39 +0000
Subject: hrtimer: Use bits for various boolean indicators

No point in wasting 12 byte storage space. Generates better code as well.

Text size reduction:
       x8664 -64, i386 -16, ARM -132, ARM64 -0, power64 -48

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.227955358@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  6 +++---
 kernel/time/hrtimer.c   | 24 ++++++++++++++++--------
 2 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 5e04f8fc26f6..17a59ddcc79a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -181,10 +181,10 @@ struct hrtimer_cpu_base {
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
 #ifdef CONFIG_HIGH_RES_TIMERS
+	unsigned int			in_hrtirq	: 1,
+					hres_active	: 1,
+					hang_detected	: 1;
 	ktime_t				expires_next;
-	int				in_hrtirq;
-	int				hres_active;
-	int				hang_detected;
 	unsigned int			nr_events;
 	unsigned int			nr_retries;
 	unsigned int			nr_hangs;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8ce9b3138017..9bbfe33f6575 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -492,9 +492,14 @@ static inline int hrtimer_is_hres_enabled(void)
 /*
  * Is the high resolution mode active ?
  */
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+{
+	return cpu_base->hres_active;
+}
+
 static inline int hrtimer_hres_active(void)
 {
-	return __this_cpu_read(hrtimer_bases.hres_active);
+	return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
 }
 
 /*
@@ -628,7 +633,7 @@ static void retrigger_next_event(void *arg)
 {
 	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
-	if (!hrtimer_hres_active())
+	if (!base->hres_active)
 		return;
 
 	raw_spin_lock(&base->lock);
@@ -685,6 +690,7 @@ void clock_was_set_delayed(void)
 
 #else
 
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
 static inline int hrtimer_hres_active(void) { return 0; }
 static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
@@ -862,25 +868,27 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     struct hrtimer_clock_base *base,
 			     unsigned long newstate, int reprogram)
 {
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
 	struct timerqueue_node *next_timer;
+
 	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
 		goto out;
 
 	next_timer = timerqueue_getnext(&base->active);
 	timerqueue_del(&base->active, &timer->node);
 	if (!timerqueue_getnext(&base->active))
-		base->cpu_base->active_bases &= ~(1 << base->index);
+		cpu_base->active_bases &= ~(1 << base->index);
 
 	if (&timer->node == next_timer) {
 #ifdef CONFIG_HIGH_RES_TIMERS
 		/* Reprogram the clock event device. if enabled */
-		if (reprogram && hrtimer_hres_active()) {
+		if (reprogram && cpu_base->hres_active) {
 			ktime_t expires;
 
 			expires = ktime_sub(hrtimer_get_expires(timer),
 					    base->offset);
-			if (base->cpu_base->expires_next.tv64 == expires.tv64)
-				hrtimer_force_reprogram(base->cpu_base, 1);
+			if (cpu_base->expires_next.tv64 == expires.tv64)
+				hrtimer_force_reprogram(cpu_base, 1);
 		}
 #endif
 	}
@@ -1114,7 +1122,7 @@ ktime_t hrtimer_get_next_event(void)
 
 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 
-	if (!hrtimer_hres_active())
+	if (!__hrtimer_hres_active(cpu_base))
 		mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
 				     ktime_get());
 
@@ -1412,7 +1420,7 @@ void hrtimer_run_queues(void)
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	ktime_t now;
 
-	if (hrtimer_hres_active())
+	if (__hrtimer_hres_active(cpu_base))
 		return;
 
 	raw_spin_lock(&cpu_base->lock);
-- 
cgit v1.2.3


From 6d9a1411393d51f17bee3fe163430b21b2cb2de9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:42 +0000
Subject: hrtimer: Cache line align the hrtimer cpu base

We really want that data structure to start at a cache line boundary.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.417597627@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 17a59ddcc79a..0853f52f8ffb 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -191,7 +191,7 @@ struct hrtimer_cpu_base {
 	unsigned int			max_hang_time;
 #endif
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
-};
+} ____cacheline_aligned;
 
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
-- 
cgit v1.2.3


From b8e38413ac2c33c497e72895fcd5da709fd1b908 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:44 +0000
Subject: hrtimer: Align the hrtimer clock bases as well

We don't use cacheline_align here because that might waste lot of
space on 32bit machine with 64 bytes cachelines and on 64bit machines
with 128 bytes cachelines.

The size of struct hrtimer_clock_base is 64byte on 64bit and 32byte on
32bit machines. So we utilize the cache lines proper.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.498165771@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0853f52f8ffb..e5c22d611850 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -130,6 +130,12 @@ struct hrtimer_sleeper {
 	struct task_struct *task;
 };
 
+#ifdef CONFIG_64BIT
+# define HRTIMER_CLOCK_BASE_ALIGN	64
+#else
+# define HRTIMER_CLOCK_BASE_ALIGN	32
+#endif
+
 /**
  * struct hrtimer_clock_base - the timer base for a specific clock
  * @cpu_base:		per cpu clock base
@@ -147,7 +153,7 @@ struct hrtimer_clock_base {
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
 	ktime_t			offset;
-};
+} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
 
 enum  hrtimer_base_type {
 	HRTIMER_BASE_MONOTONIC,
@@ -195,6 +201,8 @@ struct hrtimer_cpu_base {
 
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
+	BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
+
 	timer->node.expires = time;
 	timer->_softexpires = time;
 }
-- 
cgit v1.2.3


From c320642e1ced3b81592610e374894fea995f475b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:46 +0000
Subject: timerqueue: Let timerqueue_add/del return information

The hrtimer code is interested whether the added timer is the first
one to expire and whether the removed timer was the last one in the
tree. The add/del routines have that information already. So we can
return it right away instead of reevaluating it at the call site.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20150414203501.579063647@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timerqueue.h |  8 ++++----
 lib/timerqueue.c           | 10 +++++++---
 2 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index a520fd70a59f..7eec17ad7fa1 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -16,10 +16,10 @@ struct timerqueue_head {
 };
 
 
-extern void timerqueue_add(struct timerqueue_head *head,
-				struct timerqueue_node *node);
-extern void timerqueue_del(struct timerqueue_head *head,
-				struct timerqueue_node *node);
+extern bool timerqueue_add(struct timerqueue_head *head,
+			   struct timerqueue_node *node);
+extern bool timerqueue_del(struct timerqueue_head *head,
+			   struct timerqueue_node *node);
 extern struct timerqueue_node *timerqueue_iterate_next(
 						struct timerqueue_node *node);
 
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index a382e4a32609..782ae8ca2c06 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -36,7 +36,7 @@
  * Adds the timer node to the timerqueue, sorted by the
  * node's expires value.
  */
-void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 {
 	struct rb_node **p = &head->head.rb_node;
 	struct rb_node *parent = NULL;
@@ -56,8 +56,11 @@ void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 	rb_link_node(&node->node, parent, p);
 	rb_insert_color(&node->node, &head->head);
 
-	if (!head->next || node->expires.tv64 < head->next->expires.tv64)
+	if (!head->next || node->expires.tv64 < head->next->expires.tv64) {
 		head->next = node;
+		return true;
+	}
+	return false;
 }
 EXPORT_SYMBOL_GPL(timerqueue_add);
 
@@ -69,7 +72,7 @@ EXPORT_SYMBOL_GPL(timerqueue_add);
  *
  * Removes the timer node from the timerqueue.
  */
-void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
+bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 {
 	WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
 
@@ -82,6 +85,7 @@ void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 	}
 	rb_erase(&node->node, &head->head);
 	RB_CLEAR_NODE(&node->node);
+	return head->next != NULL;
 }
 EXPORT_SYMBOL_GPL(timerqueue_del);
 
-- 
cgit v1.2.3


From 895bdfa793f6e912d1a58fc445b3dd4d686f7bd3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:49 +0000
Subject: hrtimer: Keep pointer to first timer and simplify __remove_hrtimer()

__remove_hrtimer() needs to evaluate the expiry time to figure out
whether the timer which is removed is eventually the first expiring
timer on the cpu. Keep a pointer to it, which is lazily updated, so we
can avoid the evaluation dance and retrieve the information from there.

Generates slightly better code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.752838019@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  6 ++++++
 kernel/time/hrtimer.c   | 46 ++++++++++++++++++++++++++++------------------
 2 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index e5c22d611850..d194c1dacdaa 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -172,6 +172,7 @@ enum  hrtimer_base_type {
  * @clock_was_set_seq:	Sequence counter of clock was set events
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
+ * @next_timer:		Pointer to the first expiring timer
  * @in_hrtirq:		hrtimer_interrupt() is currently executing
  * @hres_active:	State of high resolution mode
  * @hang_detected:	The last hrtimer interrupt detected a hang
@@ -180,6 +181,10 @@ enum  hrtimer_base_type {
  * @nr_hangs:		Total number of hrtimer interrupt hangs
  * @max_hang_time:	Maximum time spent in hrtimer_interrupt
  * @clock_base:		array of clock bases for this cpu
+ *
+ * Note: next_timer is just an optimization for __remove_hrtimer().
+ *	 Do not dereference the pointer because it is not reliable on
+ *	 cross cpu removals.
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
@@ -191,6 +196,7 @@ struct hrtimer_cpu_base {
 					hres_active	: 1,
 					hang_detected	: 1;
 	ktime_t				expires_next;
+	struct hrtimer			*next_timer;
 	unsigned int			nr_events;
 	unsigned int			nr_retries;
 	unsigned int			nr_hangs;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 0cd1e0b8099d..30178d0656cf 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -415,12 +415,21 @@ static inline void debug_deactivate(struct hrtimer *timer)
 }
 
 #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+					     struct hrtimer *timer)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+	cpu_base->next_timer = timer;
+#endif
+}
+
 static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 {
 	struct hrtimer_clock_base *base = cpu_base->clock_base;
 	ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
 	unsigned int active = cpu_base->active_bases;
 
+	hrtimer_update_next_timer(cpu_base, NULL);
 	for (; active; base++, active >>= 1) {
 		struct timerqueue_node *next;
 		struct hrtimer *timer;
@@ -431,8 +440,10 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 		next = timerqueue_getnext(&base->active);
 		timer = container_of(next, struct hrtimer, node);
 		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-		if (expires.tv64 < expires_next.tv64)
+		if (expires.tv64 < expires_next.tv64) {
 			expires_next = expires;
+			hrtimer_update_next_timer(cpu_base, timer);
+		}
 	}
 	/*
 	 * clock_was_set() might have changed base->offset of any of
@@ -597,6 +608,8 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 	if (cpu_base->in_hrtirq)
 		return 0;
 
+	cpu_base->next_timer = timer;
+
 	/*
 	 * If a hang was detected in the last timer interrupt then we
 	 * do not schedule a timer which is earlier than the expiry
@@ -868,30 +881,27 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     unsigned long newstate, int reprogram)
 {
 	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
-	struct timerqueue_node *next_timer;
+	unsigned int state = timer->state;
 
-	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
-		goto out;
+	timer->state = newstate;
+	if (!(state & HRTIMER_STATE_ENQUEUED))
+		return;
 
-	next_timer = timerqueue_getnext(&base->active);
 	if (!timerqueue_del(&base->active, &timer->node))
 		cpu_base->active_bases &= ~(1 << base->index);
 
-	if (&timer->node == next_timer) {
 #ifdef CONFIG_HIGH_RES_TIMERS
-		/* Reprogram the clock event device. if enabled */
-		if (reprogram && cpu_base->hres_active) {
-			ktime_t expires;
-
-			expires = ktime_sub(hrtimer_get_expires(timer),
-					    base->offset);
-			if (cpu_base->expires_next.tv64 == expires.tv64)
-				hrtimer_force_reprogram(cpu_base, 1);
-		}
+	/*
+	 * Note: If reprogram is false we do not update
+	 * cpu_base->next_timer. This happens when we remove the first
+	 * timer on a remote cpu. No harm as we never dereference
+	 * cpu_base->next_timer. So the worst thing what can happen is
+	 * an superflous call to hrtimer_force_reprogram() on the
+	 * remote cpu later on if the same timer gets enqueued again.
+	 */
+	if (reprogram && timer == cpu_base->next_timer)
+		hrtimer_force_reprogram(cpu_base, 1);
 #endif
-	}
-out:
-	timer->state = newstate;
 }
 
 /*
-- 
cgit v1.2.3


From c6eb3f70d4482806dc2d3e1e3c7736f497b1d418 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:51 +0000
Subject: hrtimer: Get rid of hrtimer softirq

hrtimer softirq is a leftover from the initial implementation and
serves only the purpose to handle the enqueueing of already expired
timers in the high resolution timer mode. We discussed whether we
change the return value and force all start sites to handle that the
timer is already expired, but that would be a Herculean task and I'm
not sure whether its a good idea to enforce that handling on
everyone.

A simpler solution is to enforce a timer interrupt instead of raising
and scheduling a softirq. Just use the existing infrastructure to do
so and remove all the softirq leftovers.

The HRTIMER softirq enum is now unused, but kept around because trace
parsers rely on the existing numbering.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.840834708@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h   |   1 -
 include/linux/interrupt.h |   3 +-
 kernel/time/hrtimer.c     | 163 ++++++++++++----------------------------------
 kernel/time/tick-common.c |  10 +++
 kernel/time/timer.c       |   2 -
 5 files changed, 55 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d194c1dacdaa..048270a27bc5 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -459,7 +459,6 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
 
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
-extern void hrtimer_run_pending(void);
 
 /* Bootup initialization: */
 extern void __init hrtimers_init(void);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 950ae4501826..6bf15a66bce7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -413,7 +413,8 @@ enum
 	BLOCK_IOPOLL_SOFTIRQ,
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
-	HRTIMER_SOFTIRQ,
+	HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the
+			    numbering. Sigh! */
 	RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
 
 	NR_SOFTIRQS
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 30178d0656cf..fc6b6d25f93d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -555,59 +555,48 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 }
 
 /*
- * Shared reprogramming for clock_realtime and clock_monotonic
- *
  * When a timer is enqueued and expires earlier than the already enqueued
  * timers, we have to check, whether it expires earlier than the timer for
  * which the clock event device was armed.
  *
- * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
- * and no expiry check happens. The timer gets enqueued into the rbtree. The
- * reprogramming and expiry check is done in the hrtimer_interrupt or in the
- * softirq.
- *
  * Called with interrupts disabled and base->cpu_base.lock held
  */
-static int hrtimer_reprogram(struct hrtimer *timer,
-			     struct hrtimer_clock_base *base)
+static void hrtimer_reprogram(struct hrtimer *timer,
+			      struct hrtimer_clock_base *base)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-	int res;
 
 	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
 
 	/*
-	 * When the callback is running, we do not reprogram the clock event
-	 * device. The timer callback is either running on a different CPU or
-	 * the callback is executed in the hrtimer_interrupt context. The
-	 * reprogramming is handled either by the softirq, which called the
-	 * callback or at the end of the hrtimer_interrupt.
+	 * If the timer is not on the current cpu, we cannot reprogram
+	 * the other cpus clock event device.
 	 */
-	if (hrtimer_callback_running(timer))
-		return 0;
+	if (base->cpu_base != cpu_base)
+		return;
+
+	/*
+	 * If the hrtimer interrupt is running, then it will
+	 * reevaluate the clock bases and reprogram the clock event
+	 * device. The callbacks are always executed in hard interrupt
+	 * context so we don't need an extra check for a running
+	 * callback.
+	 */
+	if (cpu_base->in_hrtirq)
+		return;
 
 	/*
 	 * CLOCK_REALTIME timer might be requested with an absolute
-	 * expiry time which is less than base->offset. Nothing wrong
-	 * about that, just avoid to call into the tick code, which
-	 * has now objections against negative expiry values.
+	 * expiry time which is less than base->offset. Set it to 0.
 	 */
 	if (expires.tv64 < 0)
-		return -ETIME;
+		expires.tv64 = 0;
 
 	if (expires.tv64 >= cpu_base->expires_next.tv64)
-		return 0;
-
-	/*
-	 * When the target cpu of the timer is currently executing
-	 * hrtimer_interrupt(), then we do not touch the clock event
-	 * device. hrtimer_interrupt() will reevaluate all clock bases
-	 * before reprogramming the device.
-	 */
-	if (cpu_base->in_hrtirq)
-		return 0;
+		return;
 
+	/* Update the pointer to the next expiring timer */
 	cpu_base->next_timer = timer;
 
 	/*
@@ -617,15 +606,14 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 	 * to make progress.
 	 */
 	if (cpu_base->hang_detected)
-		return 0;
+		return;
 
 	/*
-	 * Clockevents returns -ETIME, when the event was in the past.
+	 * Program the timer hardware. We enforce the expiry for
+	 * events which are already in the past.
 	 */
-	res = tick_program_event(expires, 0);
-	if (!IS_ERR_VALUE(res))
-		cpu_base->expires_next = expires;
-	return res;
+	cpu_base->expires_next = expires;
+	tick_program_event(expires, 1);
 }
 
 /*
@@ -660,19 +648,11 @@ static void retrigger_next_event(void *arg)
  */
 static int hrtimer_switch_to_hres(void)
 {
-	int cpu = smp_processor_id();
-	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
-	unsigned long flags;
-
-	if (base->hres_active)
-		return 1;
-
-	local_irq_save(flags);
+	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
 	if (tick_init_highres()) {
-		local_irq_restore(flags);
 		printk(KERN_WARNING "Could not switch to high resolution "
-				    "mode on CPU %d\n", cpu);
+				    "mode on CPU %d\n", base->cpu);
 		return 0;
 	}
 	base->hres_active = 1;
@@ -681,7 +661,6 @@ static int hrtimer_switch_to_hres(void)
 	tick_setup_sched_timer();
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
-	local_irq_restore(flags);
 	return 1;
 }
 
@@ -984,26 +963,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * on dynticks target.
 		 */
 		wake_up_nohz_cpu(new_base->cpu_base->cpu);
-	} else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
-			hrtimer_reprogram(timer, new_base)) {
-		/*
-		 * Only allow reprogramming if the new base is on this CPU.
-		 * (it might still be on another CPU if the timer was pending)
-		 *
-		 * XXX send_remote_softirq() ?
-		 */
-		if (wakeup) {
-			/*
-			 * We need to drop cpu_base->lock to avoid a
-			 * lock ordering issue vs. rq->lock.
-			 */
-			raw_spin_unlock(&new_base->cpu_base->lock);
-			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-			local_irq_restore(flags);
-			return ret;
-		} else {
-			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-		}
+	} else {
+		hrtimer_reprogram(timer, new_base);
 	}
 
 	unlock_hrtimer_base(timer, &flags);
@@ -1354,7 +1315,7 @@ retry:
  * local version of hrtimer_peek_ahead_timers() called with interrupts
  * disabled.
  */
-static void __hrtimer_peek_ahead_timers(void)
+static inline void __hrtimer_peek_ahead_timers(void)
 {
 	struct tick_device *td;
 
@@ -1366,29 +1327,6 @@ static void __hrtimer_peek_ahead_timers(void)
 		hrtimer_interrupt(td->evtdev);
 }
 
-/**
- * hrtimer_peek_ahead_timers -- run soft-expired timers now
- *
- * hrtimer_peek_ahead_timers will peek at the timer queue of
- * the current cpu and check if there are any timers for which
- * the soft expires time has passed. If any such timers exist,
- * they are run immediately and then removed from the timer queue.
- *
- */
-void hrtimer_peek_ahead_timers(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__hrtimer_peek_ahead_timers();
-	local_irq_restore(flags);
-}
-
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
-	hrtimer_peek_ahead_timers();
-}
-
 #else /* CONFIG_HIGH_RES_TIMERS */
 
 static inline void __hrtimer_peek_ahead_timers(void) { }
@@ -1396,31 +1334,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
 #endif	/* !CONFIG_HIGH_RES_TIMERS */
 
 /*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
-void hrtimer_run_pending(void)
-{
-	if (hrtimer_hres_active())
-		return;
-
-	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
-	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
-		hrtimer_switch_to_hres();
-}
-
-/*
- * Called from hardirq context every jiffy
+ * Called from run_local_timers in hardirq context every jiffy
  */
 void hrtimer_run_queues(void)
 {
@@ -1430,6 +1344,18 @@ void hrtimer_run_queues(void)
 	if (__hrtimer_hres_active(cpu_base))
 		return;
 
+	/*
+	 * This _is_ ugly: We have to check periodically, whether we
+	 * can switch to highres and / or nohz mode. The clocksource
+	 * switch happens with xtime_lock held. Notification from
+	 * there only sets the check bit in the tick_oneshot code,
+	 * otherwise we might deadlock vs. xtime_lock.
+	 */
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
+		hrtimer_switch_to_hres();
+		return;
+	}
+
 	raw_spin_lock(&cpu_base->lock);
 	now = hrtimer_update_base(cpu_base);
 	__hrtimer_run_queues(cpu_base, now);
@@ -1700,9 +1626,6 @@ void __init hrtimers_init(void)
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
-	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
 }
 
 /**
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 3ae6afa1eb98..ea5f9eae8f74 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -102,6 +102,16 @@ void tick_handle_periodic(struct clock_event_device *dev)
 
 	tick_periodic(cpu);
 
+#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
+	/*
+	 * The cpu might have transitioned to HIGHRES or NOHZ mode via
+	 * update_process_times() -> run_local_timers() ->
+	 * hrtimer_run_queues().
+	 */
+	if (dev->event_handler != tick_handle_periodic)
+		return;
+#endif
+
 	if (dev->state != CLOCK_EVT_STATE_ONESHOT)
 		return;
 	for (;;) {
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2ece3aa5069c..b31f13f4fe41 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1409,8 +1409,6 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __this_cpu_read(tvec_bases);
 
-	hrtimer_run_pending();
-
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
 }
-- 
cgit v1.2.3


From c1ad348b452aacd784fb97403d03d71723c72ee1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:08:58 +0000
Subject: tick: Nohz: Rework next timer evaluation

The evaluation of the next timer in the nohz code is based on jiffies
while all the tick internals are nano seconds based. We have also to
convert hrtimer nanoseconds to jiffies in the !highres case. That's
just wrong and introduces interesting corner cases.

Turn it around and convert the next timer wheel timer expiry and the
rcu event to clock monotonic and base all calculations on
nanoseconds. That identifies the case where no timer is pending
clearly with an absolute expiry value of KTIME_MAX.

Makes the code more readable and gets rid of the jiffies magic in the
nohz code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Link: http://lkml.kernel.org/r/20150414203502.184198593@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h     |   2 +-
 include/linux/rcupdate.h    |   6 ++-
 include/linux/rcutree.h     |   2 +-
 include/linux/timer.h       |   7 ---
 kernel/rcu/tree_plugin.h    |  14 +++---
 kernel/time/hrtimer.c       |  14 ++----
 kernel/time/tick-internal.h |   2 +
 kernel/time/tick-sched.c    | 109 ++++++++++++++++++++------------------------
 kernel/time/tick-sched.h    |   2 +-
 kernel/time/timer.c         |  71 ++++++++++++++---------------
 kernel/time/timer_list.c    |   4 +-
 11 files changed, 107 insertions(+), 126 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 048270a27bc5..2c68f71ffd24 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -386,7 +386,7 @@ static inline int hrtimer_restart(struct hrtimer *timer)
 /* Query timers: */
 extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 
-extern ktime_t hrtimer_get_next_event(void);
+extern u64 hrtimer_get_next_event(void);
 
 /*
  * A timer is active, when it is enqueued into the rbtree or the
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 573a5afd5ed8..0627a447c589 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -44,6 +44,8 @@
 #include <linux/debugobjects.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
+#include <linux/ktime.h>
+
 #include <asm/barrier.h>
 
 extern int rcu_expedited; /* for sysctl */
@@ -1154,9 +1156,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
 
 #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
-static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
+static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
-	*delta_jiffies = ULONG_MAX;
+	*nextevt = KTIME_MAX;
 	return 0;
 }
 #endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d2e583a6aaca..db2e31beaae7 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -32,7 +32,7 @@
 
 void rcu_note_context_switch(void);
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies);
+int rcu_needs_cpu(u64 basem, u64 *nextevt);
 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 void rcu_cpu_stall_reset(void);
 
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197e1587..fbb80e0030bf 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -187,13 +187,6 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz);
  */
 #define NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)
 
-/*
- * Return when the next timer-wheel timeout occurs (in absolute jiffies),
- * locks the timer base and does the comparison against the given
- * jiffie.
- */
-extern unsigned long get_next_timer_interrupt(unsigned long now);
-
 /*
  * Timer-statistics info:
  */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8c0ec0f5a027..0ef80a0bbabb 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1368,9 +1368,9 @@ static void rcu_prepare_kthreads(int cpu)
  * any flavor of RCU.
  */
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
-	*delta_jiffies = ULONG_MAX;
+	*nextevt = KTIME_MAX;
 	return rcu_cpu_has_callbacks(NULL);
 }
 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
@@ -1481,16 +1481,17 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
  * The caller must have disabled interrupts.
  */
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *dj)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+	unsigned long dj;
 
 	/* Snapshot to detect later posting of non-lazy callback. */
 	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
 
 	/* If no callbacks, RCU doesn't need the CPU. */
 	if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
-		*dj = ULONG_MAX;
+		*nextevt = KTIME_MAX;
 		return 0;
 	}
 
@@ -1504,11 +1505,12 @@ int rcu_needs_cpu(unsigned long *dj)
 
 	/* Request timer delay depending on laziness, and round. */
 	if (!rdtp->all_lazy) {
-		*dj = round_up(rcu_idle_gp_delay + jiffies,
+		dj = round_up(rcu_idle_gp_delay + jiffies,
 			       rcu_idle_gp_delay) - jiffies;
 	} else {
-		*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+		dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
 	}
+	*nextevt = basemono + dj * TICK_NSEC;
 	return 0;
 }
 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fc6b6d25f93d..179b991cfdcb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1080,26 +1080,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 /**
  * hrtimer_get_next_event - get the time until next expiry event
  *
- * Returns the delta to the next expiry event or KTIME_MAX if no timer
- * is pending.
+ * Returns the next expiry time or KTIME_MAX if no timer is pending.
  */
-ktime_t hrtimer_get_next_event(void)
+u64 hrtimer_get_next_event(void)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	ktime_t mindelta = { .tv64 = KTIME_MAX };
+	u64 expires = KTIME_MAX;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 
 	if (!__hrtimer_hres_active(cpu_base))
-		mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
-				     ktime_get());
+		expires = __hrtimer_get_next_event(cpu_base).tv64;
 
 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 
-	if (mindelta.tv64 < 0)
-		mindelta.tv64 = 0;
-	return mindelta;
+	return expires;
 }
 #endif
 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b64fdd8054c5..65273f0a11ed 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -137,3 +137,5 @@ extern void tick_nohz_init(void);
 # else
 static inline void tick_nohz_init(void) { }
 #endif
+
+extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4c5f4a9dcc0a..753c211f6195 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -582,39 +582,46 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 					 ktime_t now, int cpu)
 {
-	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
-	ktime_t last_update, expires, ret = { .tv64 = 0 };
-	unsigned long rcu_delta_jiffies;
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
-	u64 time_delta;
-
-	time_delta = timekeeping_max_deferment();
+	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+	unsigned long seq, basejiff;
+	ktime_t	tick;
 
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
 		seq = read_seqbegin(&jiffies_lock);
-		last_update = last_jiffies_update;
-		last_jiffies = jiffies;
+		basemono = last_jiffies_update.tv64;
+		basejiff = jiffies;
 	} while (read_seqretry(&jiffies_lock, seq));
+	ts->last_jiffies = basejiff;
 
-	if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+	if (rcu_needs_cpu(basemono, &next_rcu) ||
 	    arch_needs_cpu() || irq_work_needs_cpu()) {
-		next_jiffies = last_jiffies + 1;
-		delta_jiffies = 1;
+		next_tick = basemono + TICK_NSEC;
 	} else {
-		/* Get the next timer wheel timer */
-		next_jiffies = get_next_timer_interrupt(last_jiffies);
-		delta_jiffies = next_jiffies - last_jiffies;
-		if (rcu_delta_jiffies < delta_jiffies) {
-			next_jiffies = last_jiffies + rcu_delta_jiffies;
-			delta_jiffies = rcu_delta_jiffies;
-		}
+		/*
+		 * Get the next pending timer. If high resolution
+		 * timers are enabled this only takes the timer wheel
+		 * timers into account. If high resolution timers are
+		 * disabled this also looks at the next expiring
+		 * hrtimer.
+		 */
+		next_tmr = get_next_timer_interrupt(basejiff, basemono);
+		ts->next_timer = next_tmr;
+		/* Take the next rcu event into account */
+		next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
 	}
 
-	if ((long)delta_jiffies <= 1) {
+	/*
+	 * If the tick is due in the next period, keep it ticking or
+	 * restart it proper.
+	 */
+	delta = next_tick - basemono;
+	if (delta <= (u64)TICK_NSEC) {
+		tick.tv64 = 0;
 		if (!ts->tick_stopped)
 			goto out;
-		if (delta_jiffies == 0) {
+		if (delta == 0) {
 			/* Tick is stopped, but required now. Enforce it */
 			tick_nohz_restart(ts, now);
 			goto out;
@@ -629,54 +636,39 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	 * do_timer() never invoked. Keep track of the fact that it
 	 * was the one which had the do_timer() duty last. If this cpu
 	 * is the one which had the do_timer() duty last, we limit the
-	 * sleep time to the timekeeping max_deferement value which we
-	 * retrieved above. Otherwise we can sleep as long as we want.
+	 * sleep time to the timekeeping max_deferement value.
+	 * Otherwise we can sleep as long as we want.
 	 */
+	delta = timekeeping_max_deferment();
 	if (cpu == tick_do_timer_cpu) {
 		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 		ts->do_timer_last = 1;
 	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
-		time_delta = KTIME_MAX;
+		delta = KTIME_MAX;
 		ts->do_timer_last = 0;
 	} else if (!ts->do_timer_last) {
-		time_delta = KTIME_MAX;
+		delta = KTIME_MAX;
 	}
 
 #ifdef CONFIG_NO_HZ_FULL
+	/* Limit the tick delta to the maximum scheduler deferment */
 	if (!ts->inidle)
-		time_delta = min(time_delta, scheduler_tick_max_deferment());
+		delta = min(delta, scheduler_tick_max_deferment());
 #endif
 
-	/*
-	 * calculate the expiry time for the next timer wheel
-	 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
-	 * there is no timer pending or at least extremely far into
-	 * the future (12 days for HZ=1000). In this case we set the
-	 * expiry to the end of time.
-	 */
-	if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
-		/*
-		 * Calculate the time delta for the next timer event.
-		 * If the time delta exceeds the maximum time delta
-		 * permitted by the current clocksource then adjust
-		 * the time delta accordingly to ensure the
-		 * clocksource does not wrap.
-		 */
-		time_delta = min_t(u64, time_delta,
-				   tick_period.tv64 * delta_jiffies);
-	}
-
-	if (time_delta < KTIME_MAX)
-		expires = ktime_add_ns(last_update, time_delta);
+	/* Calculate the next expiry time */
+	if (delta < (KTIME_MAX - basemono))
+		expires = basemono + delta;
 	else
-		expires.tv64 = KTIME_MAX;
+		expires = KTIME_MAX;
+
+	expires = min_t(u64, expires, next_tick);
+	tick.tv64 = expires;
 
 	/* Skip reprogram of event if its not changed */
-	if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+	if (ts->tick_stopped && (expires == dev->next_event.tv64))
 		goto out;
 
-	ret = expires;
-
 	/*
 	 * nohz_stop_sched_tick can be called several times before
 	 * the nohz_restart_sched_tick is called. This happens when
@@ -694,26 +686,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	}
 
 	/*
-	 * If the expiration time == KTIME_MAX, then
-	 * in this case we simply stop the tick timer.
+	 * If the expiration time == KTIME_MAX, then we simply stop
+	 * the tick timer.
 	 */
-	if (unlikely(expires.tv64 == KTIME_MAX)) {
+	if (unlikely(expires == KTIME_MAX)) {
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 			hrtimer_cancel(&ts->sched_timer);
 		goto out;
 	}
 
 	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-		hrtimer_start(&ts->sched_timer, expires,
-			      HRTIMER_MODE_ABS_PINNED);
+		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
 	else
-		tick_program_event(expires, 1);
+		tick_program_event(tick, 1);
 out:
-	ts->next_jiffies = next_jiffies;
-	ts->last_jiffies = last_jiffies;
+	/* Update the estimated sleep length */
 	ts->sleep_length = ktime_sub(dev->next_event, now);
-
-	return ret;
+	return tick;
 }
 
 static void tick_nohz_full_stop_tick(struct tick_sched *ts)
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 28b5da3e1a17..42fdf4958bcc 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -57,7 +57,7 @@ struct tick_sched {
 	ktime_t				iowait_sleeptime;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
-	unsigned long			next_jiffies;
+	u64				next_timer;
 	ktime_t				idle_expires;
 	int				do_timer_last;
 };
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index b31f13f4fe41..172b83cd2f8e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
 #include <asm/timex.h>
 #include <asm/io.h>
 
+#include "tick-internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/timer.h>
 
@@ -1311,54 +1313,48 @@ cascade:
  * Check, if the next hrtimer event is before the next timer wheel
  * event:
  */
-static unsigned long cmp_next_hrtimer_event(unsigned long now,
-					    unsigned long expires)
+static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 {
-	ktime_t hr_delta = hrtimer_get_next_event();
-	struct timespec tsdelta;
-	unsigned long delta;
-
-	if (hr_delta.tv64 == KTIME_MAX)
-		return expires;
+	u64 nextevt = hrtimer_get_next_event();
 
 	/*
-	 * Expired timer available, let it expire in the next tick
+	 * If high resolution timers are enabled
+	 * hrtimer_get_next_event() returns KTIME_MAX.
 	 */
-	if (hr_delta.tv64 <= 0)
-		return now + 1;
-
-	tsdelta = ktime_to_timespec(hr_delta);
-	delta = timespec_to_jiffies(&tsdelta);
+	if (expires <= nextevt)
+		return expires;
 
 	/*
-	 * Limit the delta to the max value, which is checked in
-	 * tick_nohz_stop_sched_tick():
+	 * If the next timer is already expired, return the tick base
+	 * time so the tick is fired immediately.
 	 */
-	if (delta > NEXT_TIMER_MAX_DELTA)
-		delta = NEXT_TIMER_MAX_DELTA;
+	if (nextevt <= basem)
+		return basem;
 
 	/*
-	 * Take rounding errors in to account and make sure, that it
-	 * expires in the next tick. Otherwise we go into an endless
-	 * ping pong due to tick_nohz_stop_sched_tick() retriggering
-	 * the timer softirq
+	 * Round up to the next jiffie. High resolution timers are
+	 * off, so the hrtimers are expired in the tick and we need to
+	 * make sure that this tick really expires the timer to avoid
+	 * a ping pong of the nohz stop code.
+	 *
+	 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
 	 */
-	if (delta < 1)
-		delta = 1;
-	now += delta;
-	if (time_before(now, expires))
-		return now;
-	return expires;
+	return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
 }
 
 /**
- * get_next_timer_interrupt - return the jiffy of the next pending timer
- * @now: current time (in jiffies)
+ * get_next_timer_interrupt - return the time (clock mono) of the next timer
+ * @basej:	base time jiffies
+ * @basem:	base time clock monotonic
+ *
+ * Returns the tick aligned clock monotonic time of the next pending
+ * timer or KTIME_MAX if no timer is pending.
  */
-unsigned long get_next_timer_interrupt(unsigned long now)
+u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
 	struct tvec_base *base = __this_cpu_read(tvec_bases);
-	unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
+	u64 expires = KTIME_MAX;
+	unsigned long nextevt;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1371,14 +1367,15 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 	if (base->active_timers) {
 		if (time_before_eq(base->next_timer, base->timer_jiffies))
 			base->next_timer = __next_timer_interrupt(base);
-		expires = base->next_timer;
+		nextevt = base->next_timer;
+		if (time_before_eq(nextevt, basej))
+			expires = basem;
+		else
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 	}
 	spin_unlock(&base->lock);
 
-	if (time_before_eq(expires, now))
-		return now;
-
-	return cmp_next_hrtimer_event(now, expires);
+	return cmp_next_hrtimer_event(basem, expires);
 }
 #endif
 
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 6232fc536185..66f39bba5353 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -191,7 +191,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 		P_ns(idle_sleeptime);
 		P_ns(iowait_sleeptime);
 		P(last_jiffies);
-		P(next_jiffies);
+		P(next_timer);
 		P_ns(idle_expires);
 		SEQ_printf(m, "jiffies: %Lu\n",
 			   (unsigned long long)jiffies);
@@ -289,7 +289,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
 
 static inline void timer_list_header(struct seq_file *m, u64 now)
 {
-	SEQ_printf(m, "Timer List Version: v0.7\n");
+	SEQ_printf(m, "Timer List Version: v0.8\n");
 	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
 	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
 	SEQ_printf(m, "\n");
-- 
cgit v1.2.3


From 58f1f803f1d6ef9ab280de13246d65970a09cb95 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:09:08 +0000
Subject: hrtimer: Get rid of __hrtimer_start_range_ns()

No more callers. Remove the leftovers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.707871492@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  4 ----
 kernel/time/hrtimer.c   | 38 +++++++++++++++-----------------------
 2 files changed, 15 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2c68f71ffd24..a80baa86bb24 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -359,10 +359,6 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
 			 const enum hrtimer_mode mode);
 extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			unsigned long range_ns, const enum hrtimer_mode mode);
-extern int
-__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-			 unsigned long delta_ns,
-			 const enum hrtimer_mode mode, int wakeup);
 
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 179b991cfdcb..88d6ea25dde4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -916,9 +916,20 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 	return 0;
 }
 
-int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-		unsigned long delta_ns, const enum hrtimer_mode mode,
-		int wakeup)
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
+ * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			   unsigned long delta_ns, const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
@@ -971,25 +982,6 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
-
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @delta_ns:	"slack" range for the timer
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-		unsigned long delta_ns, const enum hrtimer_mode mode)
-{
-	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
-}
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
 /**
@@ -1006,7 +998,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 int
 hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 {
-	return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
+	return hrtimer_start_range_ns(timer, tim, 0, mode);
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
-- 
cgit v1.2.3


From 02a171af1a46966dcdb5b38cdc33e4f43e92c778 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:09:10 +0000
Subject: hrtimer: Make hrtimer_start() a inline wrapper

No point for an extra export just to set the extra argument of
hrtimer_start_range_ns() to 0.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.808544539@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 19 +++++++++++++++++--
 kernel/time/hrtimer.c   | 19 -------------------
 2 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index a80baa86bb24..42074ab3d5c3 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -355,11 +355,26 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 #endif
 
 /* Basic timer operations: */
-extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
-			 const enum hrtimer_mode mode);
 extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			unsigned long range_ns, const enum hrtimer_mode mode);
 
+/**
+ * hrtimer_start - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim,
+				const enum hrtimer_mode mode)
+{
+	return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
+
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 88d6ea25dde4..e5cf71aa6d77 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -984,25 +984,6 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 }
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
-/**
- * hrtimer_start - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
-{
-	return hrtimer_start_range_ns(timer, tim, 0, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_start);
-
-
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
  * @timer:	hrtimer to stop
-- 
cgit v1.2.3


From b193217e6dc3f88b599b573b53e0e0f6671d969a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:09:18 +0000
Subject: alarmtimer: Get rid of unused return value

We want to get rid of the hrtimer_start() return value and the alarm
timer return value is nowhere used. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20150414203503.243910615@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/alarmtimer.h |  4 ++--
 kernel/time/alarmtimer.c   | 11 ++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h
index a899402a5a0e..52f3b7da4f2d 100644
--- a/include/linux/alarmtimer.h
+++ b/include/linux/alarmtimer.h
@@ -43,8 +43,8 @@ struct alarm {
 
 void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
 		enum alarmtimer_restart (*function)(struct alarm *, ktime_t));
-int alarm_start(struct alarm *alarm, ktime_t start);
-int alarm_start_relative(struct alarm *alarm, ktime_t start);
+void alarm_start(struct alarm *alarm, ktime_t start);
+void alarm_start_relative(struct alarm *alarm, ktime_t start);
 void alarm_restart(struct alarm *alarm);
 int alarm_try_to_cancel(struct alarm *alarm);
 int alarm_cancel(struct alarm *alarm);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0b55a7570c90..7fbba635a549 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init);
  * @alarm: ptr to alarm to set
  * @start: time to run the alarm
  */
-int alarm_start(struct alarm *alarm, ktime_t start)
+void alarm_start(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 	unsigned long flags;
-	int ret;
 
 	spin_lock_irqsave(&base->lock, flags);
 	alarm->node.expires = start;
 	alarmtimer_enqueue(base, alarm);
-	ret = hrtimer_start(&alarm->timer, alarm->node.expires,
-				HRTIMER_MODE_ABS);
+	hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
 	spin_unlock_irqrestore(&base->lock, flags);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(alarm_start);
 
@@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start);
  * @alarm: ptr to alarm to set
  * @start: time relative to now to run the alarm
  */
-int alarm_start_relative(struct alarm *alarm, ktime_t start)
+void alarm_start_relative(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 
 	start = ktime_add(start, base->gettime());
-	return alarm_start(alarm, start);
+	alarm_start(alarm, start);
 }
 EXPORT_SYMBOL_GPL(alarm_start_relative);
 
-- 
cgit v1.2.3


From 61699e13072a89880aa584dcc64c6da465fb2ccc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Apr 2015 21:09:23 +0000
Subject: hrtimer: Remove hrtimer_start() return value

No user was ever interested whether the timer was active or not when
it was started. All abusers of the return value are gone, so get rid
of it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203503.483556394@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h   | 22 +++++++++-------------
 include/linux/interrupt.h |  6 +++---
 kernel/time/hrtimer.c     | 23 +++++++----------------
 3 files changed, 19 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 42074ab3d5c3..470d876c2eda 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -355,7 +355,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 #endif
 
 /* Basic timer operations: */
-extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			unsigned long range_ns, const enum hrtimer_mode mode);
 
 /**
@@ -364,34 +364,30 @@ extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
  * @tim:	expiry time
  * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
  *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
  */
-static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim,
-				const enum hrtimer_mode mode)
+static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
+				 const enum hrtimer_mode mode)
 {
-	return hrtimer_start_range_ns(timer, tim, 0, mode);
+	hrtimer_start_range_ns(timer, tim, 0, mode);
 }
 
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
-static inline int hrtimer_start_expires(struct hrtimer *timer,
-						enum hrtimer_mode mode)
+static inline void hrtimer_start_expires(struct hrtimer *timer,
+					 enum hrtimer_mode mode)
 {
 	unsigned long delta;
 	ktime_t soft, hard;
 	soft = hrtimer_get_softexpires(timer);
 	hard = hrtimer_get_expires(timer);
 	delta = ktime_to_ns(ktime_sub(hard, soft));
-	return hrtimer_start_range_ns(timer, soft, delta, mode);
+	hrtimer_start_range_ns(timer, soft, delta, mode);
 }
 
-static inline int hrtimer_restart(struct hrtimer *timer)
+static inline void hrtimer_restart(struct hrtimer *timer)
 {
-	return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
 /* Query timers: */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 6bf15a66bce7..be7e75c945e9 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -593,10 +593,10 @@ tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
 		     clockid_t which_clock, enum hrtimer_mode mode);
 
 static inline
-int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
-			  const enum hrtimer_mode mode)
+void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
+			   const enum hrtimer_mode mode)
 {
-	return hrtimer_start(&ttimer->timer, time, mode);
+	hrtimer_start(&ttimer->timer, time, mode);
 }
 
 static inline
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index c38f0b6024b4..beab02d3ff1e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -923,22 +923,18 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
  * @delta_ns:	"slack" range for the timer
  * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
  *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
  */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-			   unsigned long delta_ns, const enum hrtimer_mode mode)
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			    unsigned long delta_ns, const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
-	int ret, leftmost;
+	int leftmost;
 
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	ret = remove_hrtimer(timer, base);
+	remove_hrtimer(timer, base);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -962,11 +958,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	timer_stats_hrtimer_set_start_info(timer);
 
 	leftmost = enqueue_hrtimer(timer, new_base);
-
-	if (!leftmost) {
-		unlock_hrtimer_base(timer, &flags);
-		return ret;
-	}
+	if (!leftmost)
+		goto unlock;
 
 	if (!hrtimer_is_hres_active(timer)) {
 		/*
@@ -977,10 +970,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	} else {
 		hrtimer_reprogram(timer, new_base);
 	}
-
+unlock:
 	unlock_hrtimer_base(timer, &flags);
-
-	return ret;
 }
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
-- 
cgit v1.2.3


From 59afdc7b32143528524455039e7557a46b60e4c8 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 22 Apr 2015 11:28:46 +0800
Subject: crypto: api - Move module sig ifdef into accessor function

Currently we're hiding mod->sig_ok under an ifdef in open code.
This patch adds a module_sig_ok accessor function and removes that
ifdef.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
---
 crypto/algapi.c        |  5 +----
 include/linux/module.h | 12 ++++++++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/algapi.c b/crypto/algapi.c
index a60f62625b5f..f835f439bb23 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -43,12 +43,9 @@ static inline int crypto_set_driver_name(struct crypto_alg *alg)
 
 static inline void crypto_check_module_sig(struct module *mod)
 {
-#ifdef CONFIG_CRYPTO_FIPS
-	if (fips_enabled && mod && !mod->sig_ok)
+	if (fips_enabled && mod && !module_sig_ok(mod))
 		panic("Module %s signature verification failed in FIPS mode\n",
 		      mod->name);
-#endif
-	return;
 }
 
 static int crypto_check_alg(struct crypto_alg *alg)
diff --git a/include/linux/module.h b/include/linux/module.h
index c883b86ea964..1e5436042eb0 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -655,4 +655,16 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
 static inline void module_bug_cleanup(struct module *mod) {}
 #endif	/* CONFIG_GENERIC_BUG */
 
+#ifdef CONFIG_MODULE_SIG
+static inline bool module_sig_ok(struct module *module)
+{
+	return module->sig_ok;
+}
+#else	/* !CONFIG_MODULE_SIG */
+static inline bool module_sig_ok(struct module *module)
+{
+	return true;
+}
+#endif	/* CONFIG_MODULE_SIG */
+
 #endif /* _LINUX_MODULE_H */
-- 
cgit v1.2.3


From af87baedf2c23b1181f51323339210a26a64f7fc Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 13 Apr 2015 14:11:28 +0800
Subject: x86/htirq: Use new irqdomain interfaces to allocate/free IRQ

Use new irqdomain interfaces to allocate/free IRQ for HTIRQ, so we can
remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ later.

This patch changes the interfaces between arch independent PCI driver
and arch specific code. Currently HT_IRQ is only enabled on x86, so it
does not affect other architectures.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Cohen <david.a.cohen@linux.intel.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Link: http://lkml.kernel.org/r/1428905519-23704-7-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/htirq.c | 26 +++++++++++++-------------
 drivers/pci/htirq.c          |  7 +++----
 include/linux/htirq.h        |  2 ++
 3 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index 816f36e979ad..b307ee7a7148 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -14,6 +14,7 @@
 #include <linux/device.h>
 #include <linux/pci.h>
 #include <linux/htirq.h>
+#include <linux/irqdomain.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/hypertransport.h>
@@ -61,31 +62,30 @@ static struct irq_chip ht_irq_chip = {
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
+int arch_alloc_ht_irq(struct pci_dev *dev)
+{
+	return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL);
+}
+
+void arch_free_ht_irq(int irq)
+{
+	irq_domain_free_irqs(irq, 1);
+}
+
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 {
 	struct irq_cfg *cfg;
 	struct ht_irq_msg msg;
-	unsigned dest;
-	int err;
 
 	if (disable_apic)
 		return -ENXIO;
 
 	cfg = irq_cfg(irq);
-	err = assign_irq_vector(irq, cfg, apic->target_cpus());
-	if (err)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(cfg->domain,
-					   apic->target_cpus(), &dest);
-	if (err)
-		return err;
-
-	msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+	msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
 
 	msg.address_lo =
 		HT_IRQ_LOW_BASE |
-		HT_IRQ_LOW_DEST_ID(dest) |
+		HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) |
 		HT_IRQ_LOW_VECTOR(cfg->vector) |
 		((apic->irq_dest_mode == 0) ?
 			HT_IRQ_LOW_DM_PHYSICAL :
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index a94dd2c4183a..ceb0ebeb7b5f 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -117,8 +117,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
-	irq = irq_alloc_hwirq(dev_to_node(&dev->dev));
-	if (!irq) {
+	irq = arch_alloc_ht_irq(dev);
+	if (irq <= 0) {
 		kfree(cfg);
 		return -EBUSY;
 	}
@@ -163,8 +163,7 @@ void ht_destroy_irq(unsigned int irq)
 	cfg = irq_get_handler_data(irq);
 	irq_set_chip(irq, NULL);
 	irq_set_handler_data(irq, NULL);
-	irq_free_hwirq(irq);
-
+	arch_free_ht_irq(irq);
 	kfree(cfg);
 }
 EXPORT_SYMBOL(ht_destroy_irq);
diff --git a/include/linux/htirq.h b/include/linux/htirq.h
index 70a1dbbf2093..5caa51b7b95c 100644
--- a/include/linux/htirq.h
+++ b/include/linux/htirq.h
@@ -15,6 +15,8 @@ void unmask_ht_irq(struct irq_data *data);
 
 /* The arch hook for getting things started */
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
+int arch_alloc_ht_irq(struct pci_dev *dev);
+void arch_free_ht_irq(int irq);
 
 /* For drivers of buggy hardware */
 typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq,
-- 
cgit v1.2.3


From b106ee63abccbba5f5a52d6e43168a6a30c6d98a Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 13 Apr 2015 14:11:32 +0800
Subject: irq_remapping/vt-d: Enhance Intel IR driver to support hierarchical
 irqdomains

Enhance Intel interrupt remapping driver to support hierarchical
irqdomains. Implement intel_ir_chip to support stacked irq_chip.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Acked-by: Joerg Roedel <jroedel@suse.de>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Cohen <david.a.cohen@linux.intel.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: iommu@lists.linux-foundation.org
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Link: http://lkml.kernel.org/r/1428905519-23704-11-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/iommu/intel_irq_remapping.c | 337 +++++++++++++++++++++++++++++++++++-
 include/linux/intel-iommu.h         |   4 +
 2 files changed, 333 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 0b89ef1e274a..97ea4209fc8d 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -8,6 +8,7 @@
 #include <linux/irq.h>
 #include <linux/intel-iommu.h>
 #include <linux/acpi.h>
+#include <linux/irqdomain.h>
 #include <asm/io_apic.h>
 #include <asm/smp.h>
 #include <asm/cpu.h>
@@ -31,6 +32,14 @@ struct hpet_scope {
 	unsigned int devfn;
 };
 
+struct intel_ir_data {
+	struct irq_2_iommu			irq_2_iommu;
+	struct irte				irte_entry;
+	union {
+		struct msi_msg			msi_entry;
+	};
+};
+
 #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0)
 #define IRTE_DEST(dest) ((eim_mode) ? dest : dest << 8)
 
@@ -50,6 +59,7 @@ static struct hpet_scope ir_hpet[MAX_HPET_TBS];
  * the dmar_global_lock.
  */
 static DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
+static struct irq_domain_ops intel_ir_domain_ops;
 
 static int __init parse_ioapics_under_ir(void);
 
@@ -263,7 +273,7 @@ static int free_irte(int irq)
 	unsigned long flags;
 	int rc;
 
-	if (!irq_iommu)
+	if (!irq_iommu || irq_iommu->iommu == NULL)
 		return -1;
 
 	raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
@@ -488,7 +498,6 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
 
 	pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
 				 INTR_REMAP_PAGE_ORDER);
-
 	if (!pages) {
 		pr_err("IR%d: failed to allocate pages of order %d\n",
 		       iommu->seq_id, INTR_REMAP_PAGE_ORDER);
@@ -502,11 +511,23 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
 		goto out_free_pages;
 	}
 
+	iommu->ir_domain = irq_domain_add_hierarchy(arch_get_ir_parent_domain(),
+						    0, INTR_REMAP_TABLE_ENTRIES,
+						    NULL, &intel_ir_domain_ops,
+						    iommu);
+	if (!iommu->ir_domain) {
+		pr_err("IR%d: failed to allocate irqdomain\n", iommu->seq_id);
+		goto out_free_bitmap;
+	}
+	iommu->ir_msi_domain = arch_create_msi_irq_domain(iommu->ir_domain);
+
 	ir_table->base = page_address(pages);
 	ir_table->bitmap = bitmap;
 	iommu->ir_table = ir_table;
 	return 0;
 
+out_free_bitmap:
+	kfree(bitmap);
 out_free_pages:
 	__free_pages(pages, INTR_REMAP_PAGE_ORDER);
 out_free_table:
@@ -517,6 +538,14 @@ out_free_table:
 static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
 {
 	if (iommu && iommu->ir_table) {
+		if (iommu->ir_msi_domain) {
+			irq_domain_remove(iommu->ir_msi_domain);
+			iommu->ir_msi_domain = NULL;
+		}
+		if (iommu->ir_domain) {
+			irq_domain_remove(iommu->ir_domain);
+			iommu->ir_domain = NULL;
+		}
 		free_pages((unsigned long)iommu->ir_table->base,
 			   INTR_REMAP_PAGE_ORDER);
 		kfree(iommu->ir_table->bitmap);
@@ -1062,12 +1091,6 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	struct irte irte;
 	int err;
 
-	if (!config_enabled(CONFIG_SMP))
-		return -EINVAL;
-
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return -EINVAL;
-
 	if (get_irte(irq, &irte))
 		return -EBUSY;
 
@@ -1100,6 +1123,7 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
 		send_cleanup_vector(cfg);
 
 	cpumask_copy(data->affinity, mask);
+
 	return 0;
 }
 
@@ -1205,6 +1229,53 @@ static int intel_alloc_hpet_msi(unsigned int irq, unsigned int id)
 	return ret;
 }
 
+static struct irq_domain *intel_get_ir_irq_domain(struct irq_alloc_info *info)
+{
+	struct intel_iommu *iommu = NULL;
+
+	if (!info)
+		return NULL;
+
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		iommu = map_ioapic_to_ir(info->ioapic_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_HPET:
+		iommu = map_hpet_to_ir(info->hpet_id);
+		break;
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		iommu = map_dev_to_ir(info->msi_dev);
+		break;
+	default:
+		BUG_ON(1);
+		break;
+	}
+
+	return iommu ? iommu->ir_domain : NULL;
+}
+
+static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info)
+{
+	struct intel_iommu *iommu;
+
+	if (!info)
+		return NULL;
+
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		iommu = map_dev_to_ir(info->msi_dev);
+		if (iommu)
+			return iommu->ir_msi_domain;
+		break;
+	default:
+		break;
+	}
+
+	return NULL;
+}
+
 struct irq_remap_ops intel_irq_remap_ops = {
 	.prepare		= intel_prepare_irq_remapping,
 	.enable			= intel_enable_irq_remapping,
@@ -1218,6 +1289,256 @@ struct irq_remap_ops intel_irq_remap_ops = {
 	.msi_alloc_irq		= intel_msi_alloc_irq,
 	.msi_setup_irq		= intel_msi_setup_irq,
 	.alloc_hpet_msi		= intel_alloc_hpet_msi,
+	.get_ir_irq_domain	= intel_get_ir_irq_domain,
+	.get_irq_domain		= intel_get_irq_domain,
+};
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update(of vector and cpu destination) of IRTE and flush the hardware cache.
+ *
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * Real vector that is used for interrupting cpu will be coming from
+ * the interrupt-remapping table entry.
+ *
+ * As the migration is a simple atomic update of IRTE, the same mechanism
+ * is used to migrate MSI irq's in the presence of interrupt-remapping.
+ */
+static int
+intel_ir_set_affinity(struct irq_data *data, const struct cpumask *mask,
+		      bool force)
+{
+	struct intel_ir_data *ir_data = data->chip_data;
+	struct irte *irte = &ir_data->irte_entry;
+	struct irq_cfg *cfg = irqd_cfg(data);
+	struct irq_data *parent = data->parent_data;
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+		return ret;
+
+	/*
+	 * Atomically updates the IRTE with the new destination, vector
+	 * and flushes the interrupt entry cache.
+	 */
+	irte->vector = cfg->vector;
+	irte->dest_id = IRTE_DEST(cfg->dest_apicid);
+	modify_irte(&ir_data->irq_2_iommu, irte);
+
+	/*
+	 * After this point, all the interrupts will start arriving
+	 * at the new destination. So, time to cleanup the previous
+	 * vector allocation.
+	 */
+	if (cfg->move_in_progress)
+		send_cleanup_vector(cfg);
+
+	return IRQ_SET_MASK_OK_DONE;
+}
+
+static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
+				     struct msi_msg *msg)
+{
+	struct intel_ir_data *ir_data = irq_data->chip_data;
+
+	*msg = ir_data->msi_entry;
+}
+
+static struct irq_chip intel_ir_chip = {
+	.irq_ack = ir_ack_apic_edge,
+	.irq_set_affinity = intel_ir_set_affinity,
+	.irq_compose_msi_msg = intel_ir_compose_msi_msg,
+};
+
+static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
+					     struct irq_cfg *irq_cfg,
+					     struct irq_alloc_info *info,
+					     int index, int sub_handle)
+{
+	struct IR_IO_APIC_route_entry *entry;
+	struct irte *irte = &data->irte_entry;
+	struct msi_msg *msg = &data->msi_entry;
+
+	prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid);
+	switch (info->type) {
+	case X86_IRQ_ALLOC_TYPE_IOAPIC:
+		/* Set source-id of interrupt request */
+		set_ioapic_sid(irte, info->ioapic_id);
+		apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n",
+			info->ioapic_id, irte->present, irte->fpd,
+			irte->dst_mode, irte->redir_hint,
+			irte->trigger_mode, irte->dlvry_mode,
+			irte->avail, irte->vector, irte->dest_id,
+			irte->sid, irte->sq, irte->svt);
+
+		entry = (struct IR_IO_APIC_route_entry *)info->ioapic_entry;
+		info->ioapic_entry = NULL;
+		memset(entry, 0, sizeof(*entry));
+		entry->index2	= (index >> 15) & 0x1;
+		entry->zero	= 0;
+		entry->format	= 1;
+		entry->index	= (index & 0x7fff);
+		/*
+		 * IO-APIC RTE will be configured with virtual vector.
+		 * irq handler will do the explicit EOI to the io-apic.
+		 */
+		entry->vector	= info->ioapic_pin;
+		entry->mask	= 0;			/* enable IRQ */
+		entry->trigger	= info->ioapic_trigger;
+		entry->polarity	= info->ioapic_polarity;
+		if (info->ioapic_trigger)
+			entry->mask = 1; /* Mask level triggered irqs. */
+		break;
+
+	case X86_IRQ_ALLOC_TYPE_HPET:
+	case X86_IRQ_ALLOC_TYPE_MSI:
+	case X86_IRQ_ALLOC_TYPE_MSIX:
+		if (info->type == X86_IRQ_ALLOC_TYPE_HPET)
+			set_hpet_sid(irte, info->hpet_id);
+		else
+			set_msi_sid(irte, info->msi_dev);
+
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->data = sub_handle;
+		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+				  MSI_ADDR_IR_SHV |
+				  MSI_ADDR_IR_INDEX1(index) |
+				  MSI_ADDR_IR_INDEX2(index);
+		break;
+
+	default:
+		BUG_ON(1);
+		break;
+	}
+}
+
+static void intel_free_irq_resources(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs)
+{
+	struct irq_data *irq_data;
+	struct intel_ir_data *data;
+	struct irq_2_iommu *irq_iommu;
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq  + i);
+		if (irq_data && irq_data->chip_data) {
+			data = irq_data->chip_data;
+			irq_iommu = &data->irq_2_iommu;
+			raw_spin_lock_irqsave(&irq_2_ir_lock, flags);
+			clear_entries(irq_iommu);
+			raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+			irq_domain_reset_irq_data(irq_data);
+			kfree(data);
+		}
+	}
+}
+
+static int intel_irq_remapping_alloc(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs,
+				     void *arg)
+{
+	struct intel_iommu *iommu = domain->host_data;
+	struct irq_alloc_info *info = arg;
+	struct intel_ir_data *data;
+	struct irq_data *irq_data;
+	struct irq_cfg *irq_cfg;
+	int i, ret, index;
+
+	if (!info || !iommu)
+		return -EINVAL;
+	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_MSI &&
+	    info->type != X86_IRQ_ALLOC_TYPE_MSIX)
+		return -EINVAL;
+
+	/*
+	 * With IRQ remapping enabled, don't need contiguous CPU vectors
+	 * to support multiple MSI interrupts.
+	 */
+	if (info->type == X86_IRQ_ALLOC_TYPE_MSI)
+		info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret < 0)
+		return ret;
+
+	ret = -ENOMEM;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto out_free_parent;
+
+	down_read(&dmar_global_lock);
+	index = alloc_irte(iommu, virq, &data->irq_2_iommu, nr_irqs);
+	up_read(&dmar_global_lock);
+	if (index < 0) {
+		pr_warn("Failed to allocate IRTE\n");
+		kfree(data);
+		goto out_free_parent;
+	}
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		irq_cfg = irqd_cfg(irq_data);
+		if (!irq_data || !irq_cfg) {
+			ret = -EINVAL;
+			goto out_free_data;
+		}
+
+		if (i > 0) {
+			data = kzalloc(sizeof(*data), GFP_KERNEL);
+			if (!data)
+				goto out_free_data;
+		}
+		irq_data->hwirq = (index << 16) + i;
+		irq_data->chip_data = data;
+		irq_data->chip = &intel_ir_chip;
+		intel_irq_remapping_prepare_irte(data, irq_cfg, info, index, i);
+		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
+	}
+	return 0;
+
+out_free_data:
+	intel_free_irq_resources(domain, virq, i);
+out_free_parent:
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+	return ret;
+}
+
+static void intel_irq_remapping_free(struct irq_domain *domain,
+				     unsigned int virq, unsigned int nr_irqs)
+{
+	intel_free_irq_resources(domain, virq, nr_irqs);
+	irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static void intel_irq_remapping_activate(struct irq_domain *domain,
+					 struct irq_data *irq_data)
+{
+	struct intel_ir_data *data = irq_data->chip_data;
+
+	modify_irte(&data->irq_2_iommu, &data->irte_entry);
+}
+
+static void intel_irq_remapping_deactivate(struct irq_domain *domain,
+					   struct irq_data *irq_data)
+{
+	struct intel_ir_data *data = irq_data->chip_data;
+	struct irte entry;
+
+	memset(&entry, 0, sizeof(entry));
+	modify_irte(&data->irq_2_iommu, &entry);
+}
+
+static struct irq_domain_ops intel_ir_domain_ops = {
+	.alloc = intel_irq_remapping_alloc,
+	.free = intel_irq_remapping_free,
+	.activate = intel_irq_remapping_activate,
+	.deactivate = intel_irq_remapping_deactivate,
 };
 
 /*
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index a65208a8fe18..ecaf3a937845 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -286,6 +286,8 @@ struct q_inval {
 
 #define INTR_REMAP_TABLE_ENTRIES	65536
 
+struct irq_domain;
+
 struct ir_table {
 	struct irte *base;
 	unsigned long *bitmap;
@@ -335,6 +337,8 @@ struct intel_iommu {
 
 #ifdef CONFIG_IRQ_REMAP
 	struct ir_table *ir_table;	/* Interrupt remapping info */
+	struct irq_domain *ir_domain;
+	struct irq_domain *ir_msi_domain;
 #endif
 	struct device	*iommu_dev; /* IOMMU-sysfs device */
 	int		node;
-- 
cgit v1.2.3


From 34742db8eaf9ff364034f214ee5827701e131d4b Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 13 Apr 2015 14:11:41 +0800
Subject: iommu/vt-d: Refine the interfaces to create IRQ for DMAR unit

Refine the interfaces to create IRQ for DMAR unit. It's a preparation
for converting DMAR IRQ to hierarchical irqdomain on x86.

It also moves dmar_alloc_hwirq()/dmar_free_hwirq() from irq_remapping.h
to dmar.h. They are not irq_remapping specific.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Cohen <david.a.cohen@linux.intel.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: iommu@lists.linux-foundation.org
Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Link: http://lkml.kernel.org/r/1428905519-23704-20-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/ia64/include/asm/irq_remapping.h |  2 --
 arch/ia64/kernel/msi_ia64.c           | 30 +++++++++++++++++++-----------
 arch/x86/include/asm/irq_remapping.h  |  4 ----
 arch/x86/kernel/apic/msi.c            | 24 +++++++++++++-----------
 drivers/iommu/dmar.c                  | 19 +++++--------------
 include/linux/dmar.h                  |  3 ++-
 6 files changed, 39 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h
index e3b3556e2e1b..a8687b1d8906 100644
--- a/arch/ia64/include/asm/irq_remapping.h
+++ b/arch/ia64/include/asm/irq_remapping.h
@@ -1,6 +1,4 @@
 #ifndef __IA64_INTR_REMAPPING_H
 #define __IA64_INTR_REMAPPING_H
 #define irq_remapping_enabled 0
-#define dmar_alloc_hwirq	create_irq
-#define dmar_free_hwirq		destroy_irq
 #endif
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 9dd7464f8c17..d70bf15c690a 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -165,7 +165,7 @@ static struct irq_chip dmar_msi_type = {
 	.irq_retrigger = ia64_msi_retrigger_irq,
 };
 
-static int
+static void
 msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
@@ -186,21 +186,29 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 		MSI_DATA_LEVEL_ASSERT |
 		MSI_DATA_DELIVERY_FIXED |
 		MSI_DATA_VECTOR(cfg->vector);
-	return 0;
 }
 
-int arch_setup_dmar_msi(unsigned int irq)
+int dmar_alloc_hwirq(int id, int node, void *arg)
 {
-	int ret;
+	int irq;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg);
-	if (ret < 0)
-		return ret;
-	dmar_msi_write(irq, &msg);
-	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-				      "edge");
-	return 0;
+	irq = create_irq();
+	if (irq > 0) {
+		irq_set_handler_data(irq, arg);
+		irq_set_chip_and_handler_name(irq, &dmar_msi_type,
+					      handle_edge_irq, "edge");
+		msi_compose_msg(NULL, irq, &msg);
+		dmar_msi_write(irq, &msg);
+	}
+
+	return irq;
+}
+
+void dmar_free_hwirq(int irq)
+{
+	irq_set_handler_data(irq, NULL);
+	destroy_irq(irq);
 }
 #endif /* CONFIG_INTEL_IOMMU */
 
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index b978c68f5d29..bacac1052262 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -117,8 +117,4 @@ irq_remapping_get_irq_domain(struct irq_alloc_info *info)
 
 #define	irq_remapping_print_chip	NULL
 #endif /* CONFIG_IRQ_REMAP */
-
-extern int dmar_alloc_hwirq(void);
-extern void dmar_free_hwirq(int irq);
-
 #endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 9fe7a08479fa..ca6250439acc 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -228,25 +228,27 @@ static struct irq_chip dmar_msi_type = {
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
-int arch_setup_dmar_msi(unsigned int irq)
+int dmar_alloc_hwirq(int id, int node, void *arg)
 {
+	int irq;
 	struct msi_msg msg;
-	struct irq_cfg *cfg = irq_cfg(irq);
 
-	native_compose_msi_msg(cfg, &msg);
-	dmar_msi_write(irq, &msg);
-	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-				      "edge");
-	return 0;
-}
+	irq = irq_domain_alloc_irqs(NULL, 1, node, NULL);
+	if (irq > 0) {
+		irq_set_handler_data(irq, arg);
+		irq_set_chip_and_handler_name(irq, &dmar_msi_type,
+					      handle_edge_irq, "edge");
+		native_compose_msi_msg(irq_cfg(irq), &msg);
+		dmar_msi_write(irq, &msg);
+	}
 
-int dmar_alloc_hwirq(void)
-{
-	return irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL);
+	return irq;
 }
 
 void dmar_free_hwirq(int irq)
 {
+	irq_set_handler_data(irq, NULL);
+	irq_set_handler(irq, NULL);
 	irq_domain_free_irqs(irq, 1);
 }
 #endif
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 9847613085e1..536f2d8ea41a 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1087,8 +1087,8 @@ static void free_iommu(struct intel_iommu *iommu)
 
 	if (iommu->irq) {
 		free_irq(iommu->irq, iommu);
-		irq_set_handler_data(iommu->irq, NULL);
 		dmar_free_hwirq(iommu->irq);
+		iommu->irq = 0;
 	}
 
 	if (iommu->qi) {
@@ -1642,23 +1642,14 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
 	if (iommu->irq)
 		return 0;
 
-	irq = dmar_alloc_hwirq();
-	if (irq <= 0) {
+	irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
+	if (irq > 0) {
+		iommu->irq = irq;
+	} else {
 		pr_err("IOMMU: no free vectors\n");
 		return -EINVAL;
 	}
 
-	irq_set_handler_data(irq, iommu);
-	iommu->irq = irq;
-
-	ret = arch_setup_dmar_msi(irq);
-	if (ret) {
-		irq_set_handler_data(irq, NULL);
-		iommu->irq = 0;
-		dmar_free_hwirq(irq);
-		return ret;
-	}
-
 	ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
 	if (ret)
 		pr_err("IOMMU: can't request irq\n");
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 30624954dec5..84737565c1fd 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -227,6 +227,7 @@ extern void dmar_msi_read(int irq, struct msi_msg *msg);
 extern void dmar_msi_write(int irq, struct msi_msg *msg);
 extern int dmar_set_interrupt(struct intel_iommu *iommu);
 extern irqreturn_t dmar_fault(int irq, void *dev_id);
-extern int arch_setup_dmar_msi(unsigned int irq);
+extern int dmar_alloc_hwirq(int id, int node, void *arg);
+extern void dmar_free_hwirq(int irq);
 
 #endif /* __DMAR_H__ */
-- 
cgit v1.2.3


From 49e07d8f28c05347f237146a9ec66f6d958db83e Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 13 Apr 2015 14:11:43 +0800
Subject: x86/htirq: Use hierarchical irqdomain to manage Hypertransport
 interrupts

We have slightly changed the architecture interfaces to support htirq
PCI driver. It's safe because currently Hypertransport interrupt is
only enabled on x86 platforms.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Cohen <david.a.cohen@linux.intel.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Link: http://lkml.kernel.org/r/1428905519-23704-22-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/hw_irq.h |  13 ++++
 arch/x86/kernel/apic/htirq.c  | 161 +++++++++++++++++++++++++++++++-----------
 arch/x86/kernel/apic/vector.c |   1 +
 drivers/pci/htirq.c           |  47 ++----------
 include/linux/htirq.h         |  24 +++++--
 5 files changed, 158 insertions(+), 88 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index bf1725047c27..5e0b031d1a7d 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -160,6 +160,14 @@ struct irq_alloc_info {
 			int		dmar_id;
 			void		*dmar_data;
 		};
+#endif
+#ifdef	CONFIG_HT_IRQ
+		struct {
+			int		ht_pos;
+			int		ht_idx;
+			struct pci_dev	*ht_dev;
+			void		*ht_update;
+		};
 #endif
 	};
 };
@@ -227,6 +235,11 @@ extern void arch_init_msi_domain(struct irq_domain *domain);
 #else
 static inline void arch_init_msi_domain(struct irq_domain *domain) { }
 #endif
+#ifdef	CONFIG_HT_IRQ
+extern void arch_init_htirq_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_htirq_domain(struct irq_domain *domain) { }
+#endif
 
 /* Statistics */
 extern atomic_t irq_err_count;
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index b307ee7a7148..1cae104415ea 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Add support of hierarchical irqdomain
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -19,70 +21,104 @@
 #include <asm/apic.h>
 #include <asm/hypertransport.h>
 
+static struct irq_domain *htirq_domain;
+
 /*
  * Hypertransport interrupt support
  */
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
-	struct ht_irq_msg msg;
-
-	fetch_ht_irq_msg(irq, &msg);
-
-	msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
-	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
-	msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
-	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
-	write_ht_irq_msg(irq, &msg);
-}
-
 static int
 ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest;
+	struct irq_data *parent = data->parent_data;
 	int ret;
 
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
-
-	target_ht_irq(data->irq, dest, cfg->vector);
-	return IRQ_SET_MASK_OK_NOCOPY;
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret >= 0) {
+		struct ht_irq_msg msg;
+		struct irq_cfg *cfg = irqd_cfg(data);
+
+		fetch_ht_irq_msg(data->irq, &msg);
+		msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK |
+				    HT_IRQ_LOW_DEST_ID_MASK);
+		msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) |
+				  HT_IRQ_LOW_DEST_ID(cfg->dest_apicid);
+		msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+		msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
+		write_ht_irq_msg(data->irq, &msg);
+	}
+
+	return ret;
 }
 
 static struct irq_chip ht_irq_chip = {
 	.name			= "PCI-HT",
 	.irq_mask		= mask_ht_irq,
 	.irq_unmask		= unmask_ht_irq,
-	.irq_ack		= apic_ack_edge,
+	.irq_ack		= irq_chip_ack_parent,
 	.irq_set_affinity	= ht_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
-int arch_alloc_ht_irq(struct pci_dev *dev)
+static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg)
 {
-	return irq_domain_alloc_irqs(NULL, 1, dev_to_node(&dev->dev), NULL);
+	struct ht_irq_cfg *ht_cfg;
+	struct irq_alloc_info *info = arg;
+	struct pci_dev *dev;
+	irq_hw_number_t hwirq;
+	int ret;
+
+	if (nr_irqs > 1 || !info)
+		return -EINVAL;
+
+	dev = info->ht_dev;
+	hwirq = (info->ht_idx & 0xFF) |
+		PCI_DEVID(dev->bus->number, dev->devfn) << 8 |
+		(pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24;
+	if (irq_find_mapping(domain, hwirq) > 0)
+		return -EEXIST;
+
+	ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL);
+	if (!ht_cfg)
+		return -ENOMEM;
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0) {
+		kfree(ht_cfg);
+		return ret;
+	}
+
+	/* Initialize msg to a value that will never match the first write. */
+	ht_cfg->msg.address_lo = 0xffffffff;
+	ht_cfg->msg.address_hi = 0xffffffff;
+	ht_cfg->dev = info->ht_dev;
+	ht_cfg->update = info->ht_update;
+	ht_cfg->pos = info->ht_pos;
+	ht_cfg->idx = 0x10 + (info->ht_idx * 2);
+	irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg,
+			    handle_edge_irq, ht_cfg, "edge");
+
+	return 0;
 }
 
-void arch_free_ht_irq(int irq)
+static void htirq_domain_free(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs)
 {
-	irq_domain_free_irqs(irq, 1);
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+	BUG_ON(nr_irqs != 1);
+	kfree(irq_data->chip_data);
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+static void htirq_domain_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg;
 	struct ht_irq_msg msg;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 
-	if (disable_apic)
-		return -ENXIO;
-
-	cfg = irq_cfg(irq);
 	msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
-
 	msg.address_lo =
 		HT_IRQ_LOW_BASE |
 		HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) |
@@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 			HT_IRQ_LOW_MT_FIXED :
 			HT_IRQ_LOW_MT_ARBITRATED) |
 		HT_IRQ_LOW_IRQ_MASKED;
+	write_ht_irq_msg(irq_data->irq, &msg);
+}
 
-	write_ht_irq_msg(irq, &msg);
+static void htirq_domain_deactivate(struct irq_domain *domain,
+				    struct irq_data *irq_data)
+{
+	struct ht_irq_msg msg;
 
-	irq_set_chip_and_handler_name(irq, &ht_irq_chip,
-				      handle_edge_irq, "edge");
+	memset(&msg, 0, sizeof(msg));
+	write_ht_irq_msg(irq_data->irq, &msg);
+}
 
-	dev_dbg(&dev->dev, "irq %d for HT\n", irq);
+static struct irq_domain_ops htirq_domain_ops = {
+	.alloc = htirq_domain_alloc,
+	.free = htirq_domain_free,
+	.activate = htirq_domain_activate,
+	.deactivate = htirq_domain_deactivate,
+};
 
-	return 0;
+void arch_init_htirq_domain(struct irq_domain *parent)
+{
+	if (disable_apic)
+		return;
+
+	htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL);
+	if (!htirq_domain)
+		pr_warn("failed to initialize irqdomain for HTIRQ.\n");
+	else
+		htirq_domain->parent = parent;
+}
+
+int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
+		      ht_irq_update_t *update)
+{
+	struct irq_alloc_info info;
+
+	if (!htirq_domain)
+		return -ENOSYS;
+
+	init_irq_alloc_info(&info, NULL);
+	info.ht_idx = idx;
+	info.ht_pos = pos;
+	info.ht_dev = dev;
+	info.ht_update = update;
+
+	return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev),
+				     &info);
+}
+
+void arch_teardown_ht_irq(unsigned int irq)
+{
+	irq_domain_free_irqs(irq, 1);
 }
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index a8d82896be75..b4b6b5a13440 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -365,6 +365,7 @@ int __init arch_early_irq_init(void)
 	irq_set_default_host(x86_vector_domain);
 
 	arch_init_msi_domain(x86_vector_domain);
+	arch_init_htirq_domain(x86_vector_domain);
 
 	return arch_early_ioapic_init();
 }
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index ceb0ebeb7b5f..7eb4109a3df4 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -23,20 +23,11 @@
  */
 static DEFINE_SPINLOCK(ht_irq_lock);
 
-struct ht_irq_cfg {
-	struct pci_dev *dev;
-	 /* Update callback used to cope with buggy hardware */
-	ht_irq_update_t *update;
-	unsigned pos;
-	unsigned idx;
-	struct ht_irq_msg msg;
-};
-
-
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
 {
 	struct ht_irq_cfg *cfg = irq_get_handler_data(irq);
 	unsigned long flags;
+
 	spin_lock_irqsave(&ht_irq_lock, flags);
 	if (cfg->msg.address_lo != msg->address_lo) {
 		pci_write_config_byte(cfg->dev, cfg->pos + 2, cfg->idx);
@@ -55,6 +46,7 @@ void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
 {
 	struct ht_irq_cfg *cfg = irq_get_handler_data(irq);
+
 	*msg = cfg->msg;
 }
 
@@ -86,7 +78,6 @@ void unmask_ht_irq(struct irq_data *data)
  */
 int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 {
-	struct ht_irq_cfg *cfg;
 	int max_irq, pos, irq;
 	unsigned long flags;
 	u32 data;
@@ -105,29 +96,9 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	if (idx > max_irq)
 		return -EINVAL;
 
-	cfg = kmalloc(sizeof(*cfg), GFP_KERNEL);
-	if (!cfg)
-		return -ENOMEM;
-
-	cfg->dev = dev;
-	cfg->update = update;
-	cfg->pos = pos;
-	cfg->idx = 0x10 + (idx * 2);
-	/* Initialize msg to a value that will never match the first write. */
-	cfg->msg.address_lo = 0xffffffff;
-	cfg->msg.address_hi = 0xffffffff;
-
-	irq = arch_alloc_ht_irq(dev);
-	if (irq <= 0) {
-		kfree(cfg);
-		return -EBUSY;
-	}
-	irq_set_handler_data(irq, cfg);
-
-	if (arch_setup_ht_irq(irq, dev) < 0) {
-		ht_destroy_irq(irq);
-		return -EBUSY;
-	}
+	irq = arch_setup_ht_irq(idx, pos, dev, update);
+	if (irq > 0)
+		dev_dbg(&dev->dev, "irq %d for HT\n", irq);
 
 	return irq;
 }
@@ -158,12 +129,6 @@ EXPORT_SYMBOL(ht_create_irq);
  */
 void ht_destroy_irq(unsigned int irq)
 {
-	struct ht_irq_cfg *cfg;
-
-	cfg = irq_get_handler_data(irq);
-	irq_set_chip(irq, NULL);
-	irq_set_handler_data(irq, NULL);
-	arch_free_ht_irq(irq);
-	kfree(cfg);
+	arch_teardown_ht_irq(irq);
 }
 EXPORT_SYMBOL(ht_destroy_irq);
diff --git a/include/linux/htirq.h b/include/linux/htirq.h
index 5caa51b7b95c..d4a527e58434 100644
--- a/include/linux/htirq.h
+++ b/include/linux/htirq.h
@@ -1,26 +1,38 @@
 #ifndef LINUX_HTIRQ_H
 #define LINUX_HTIRQ_H
 
+struct pci_dev;
+struct irq_data;
+
 struct ht_irq_msg {
 	u32	address_lo;	/* low 32 bits of the ht irq message */
 	u32	address_hi;	/* high 32 bits of the it irq message */
 };
 
+typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq,
+			       struct ht_irq_msg *msg);
+
+struct ht_irq_cfg {
+	struct pci_dev *dev;
+	 /* Update callback used to cope with buggy hardware */
+	ht_irq_update_t *update;
+	unsigned pos;
+	unsigned idx;
+	struct ht_irq_msg msg;
+};
+
 /* Helper functions.. */
 void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
 void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
-struct irq_data;
 void mask_ht_irq(struct irq_data *data);
 void unmask_ht_irq(struct irq_data *data);
 
 /* The arch hook for getting things started */
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
-int arch_alloc_ht_irq(struct pci_dev *dev);
-void arch_free_ht_irq(int irq);
+int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
+		      ht_irq_update_t *update);
+void arch_teardown_ht_irq(unsigned int irq);
 
 /* For drivers of buggy hardware */
-typedef void (ht_irq_update_t)(struct pci_dev *dev, int irq,
-			       struct ht_irq_msg *msg);
 int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update);
 
 #endif /* LINUX_HTIRQ_H */
-- 
cgit v1.2.3


From 591fc116f3302da915bb57d4474a61a5e8884cec Mon Sep 17 00:00:00 2001
From: "Ivan T. Ivanov" <ivan.ivanov@linaro.org>
Date: Thu, 9 Apr 2015 11:34:22 +0300
Subject: usb: phy: msm: Use extcon framework for VBUS and ID detection

On recent Qualcomm platforms VBUS and ID lines are not routed to
USB PHY LINK controller. Use extcon framework to receive connect
and disconnect ID and VBUS notification.

Signed-off-by: Ivan T. Ivanov <ivan.ivanov@linaro.org>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 .../devicetree/bindings/usb/msm-hsusb.txt          |  7 ++
 drivers/usb/phy/Kconfig                            |  1 +
 drivers/usb/phy/phy-msm-usb.c                      | 84 ++++++++++++++++++++++
 include/linux/usb/msm_hsusb.h                      | 17 +++++
 4 files changed, 109 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/usb/msm-hsusb.txt b/Documentation/devicetree/bindings/usb/msm-hsusb.txt
index 2826f2af503a..f26bcfac3d8f 100644
--- a/Documentation/devicetree/bindings/usb/msm-hsusb.txt
+++ b/Documentation/devicetree/bindings/usb/msm-hsusb.txt
@@ -69,6 +69,13 @@ Optional properties:
                 (no, min, max) where each value represents either a voltage
                 in microvolts or a value corresponding to voltage corner.
 
+- extcon:       phandles to external connector devices. First phandle
+                should point to external connector, which provide "USB"
+                cable events, the second should point to external connector
+                device, which provide "USB-HOST" cable events. If one of
+                the external connector devices is not required empty <0>
+                phandle should be specified.
+
 Example HSUSB OTG controller device node:
 
     usb@f9a55000 {
diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig
index 2175678e674e..811f331892d7 100644
--- a/drivers/usb/phy/Kconfig
+++ b/drivers/usb/phy/Kconfig
@@ -141,6 +141,7 @@ config USB_MSM_OTG
 	tristate "Qualcomm on-chip USB OTG controller support"
 	depends on (USB || USB_GADGET) && (ARCH_QCOM || COMPILE_TEST)
 	depends on RESET_CONTROLLER
+	depends on EXTCON
 	select USB_PHY
 	help
 	  Enable this to support the USB OTG transceiver on Qualcomm chips. It
diff --git a/drivers/usb/phy/phy-msm-usb.c b/drivers/usb/phy/phy-msm-usb.c
index c9156beeadef..ad66c67ce9a5 100644
--- a/drivers/usb/phy/phy-msm-usb.c
+++ b/drivers/usb/phy/phy-msm-usb.c
@@ -1436,9 +1436,42 @@ static const struct of_device_id msm_otg_dt_match[] = {
 };
 MODULE_DEVICE_TABLE(of, msm_otg_dt_match);
 
+static int msm_otg_vbus_notifier(struct notifier_block *nb, unsigned long event,
+				void *ptr)
+{
+	struct msm_usb_cable *vbus = container_of(nb, struct msm_usb_cable, nb);
+	struct msm_otg *motg = container_of(vbus, struct msm_otg, vbus);
+
+	if (event)
+		set_bit(B_SESS_VLD, &motg->inputs);
+	else
+		clear_bit(B_SESS_VLD, &motg->inputs);
+
+	schedule_work(&motg->sm_work);
+
+	return NOTIFY_DONE;
+}
+
+static int msm_otg_id_notifier(struct notifier_block *nb, unsigned long event,
+				void *ptr)
+{
+	struct msm_usb_cable *id = container_of(nb, struct msm_usb_cable, nb);
+	struct msm_otg *motg = container_of(id, struct msm_otg, id);
+
+	if (event)
+		clear_bit(ID, &motg->inputs);
+	else
+		set_bit(ID, &motg->inputs);
+
+	schedule_work(&motg->sm_work);
+
+	return NOTIFY_DONE;
+}
+
 static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg)
 {
 	struct msm_otg_platform_data *pdata;
+	struct extcon_dev *ext_id, *ext_vbus;
 	const struct of_device_id *id;
 	struct device_node *node = pdev->dev.of_node;
 	struct property *prop;
@@ -1487,6 +1520,52 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg)
 		motg->vdd_levels[VDD_LEVEL_MAX] = tmp[VDD_LEVEL_MAX];
 	}
 
+	ext_id = ERR_PTR(-ENODEV);
+	ext_vbus = ERR_PTR(-ENODEV);
+	if (of_property_read_bool(node, "extcon")) {
+
+		/* Each one of them is not mandatory */
+		ext_vbus = extcon_get_edev_by_phandle(&pdev->dev, 0);
+		if (IS_ERR(ext_vbus) && PTR_ERR(ext_vbus) != -ENODEV)
+			return PTR_ERR(ext_vbus);
+
+		ext_id = extcon_get_edev_by_phandle(&pdev->dev, 1);
+		if (IS_ERR(ext_id) && PTR_ERR(ext_id) != -ENODEV)
+			return PTR_ERR(ext_id);
+	}
+
+	if (!IS_ERR(ext_vbus)) {
+		motg->vbus.nb.notifier_call = msm_otg_vbus_notifier;
+		ret = extcon_register_interest(&motg->vbus.conn, ext_vbus->name,
+					       "USB", &motg->vbus.nb);
+		if (ret < 0) {
+			dev_err(&pdev->dev, "register VBUS notifier failed\n");
+			return ret;
+		}
+
+		ret = extcon_get_cable_state(ext_vbus, "USB");
+		if (ret)
+			set_bit(B_SESS_VLD, &motg->inputs);
+		else
+			clear_bit(B_SESS_VLD, &motg->inputs);
+	}
+
+	if (!IS_ERR(ext_id)) {
+		motg->id.nb.notifier_call = msm_otg_id_notifier;
+		ret = extcon_register_interest(&motg->id.conn, ext_id->name,
+					       "USB-HOST", &motg->id.nb);
+		if (ret < 0) {
+			dev_err(&pdev->dev, "register ID notifier failed\n");
+			return ret;
+		}
+
+		ret = extcon_get_cable_state(ext_id, "USB-HOST");
+		if (ret)
+			clear_bit(ID, &motg->inputs);
+		else
+			set_bit(ID, &motg->inputs);
+	}
+
 	prop = of_find_property(node, "qcom,phy-init-sequence", &len);
 	if (!prop || !len)
 		return 0;
@@ -1700,6 +1779,11 @@ static int msm_otg_remove(struct platform_device *pdev)
 	if (phy->otg->host || phy->otg->gadget)
 		return -EBUSY;
 
+	if (motg->id.conn.edev)
+		extcon_unregister_interest(&motg->id.conn);
+	if (motg->vbus.conn.edev)
+		extcon_unregister_interest(&motg->vbus.conn);
+
 	msm_otg_debugfs_cleanup();
 	cancel_delayed_work_sync(&motg->chg_work);
 	cancel_work_sync(&motg->sm_work);
diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h
index 7dbecf9a4656..c4d956e50d09 100644
--- a/include/linux/usb/msm_hsusb.h
+++ b/include/linux/usb/msm_hsusb.h
@@ -18,6 +18,7 @@
 #ifndef __ASM_ARCH_MSM_HSUSB_H
 #define __ASM_ARCH_MSM_HSUSB_H
 
+#include <linux/extcon.h>
 #include <linux/types.h>
 #include <linux/usb/otg.h>
 #include <linux/clk.h>
@@ -119,6 +120,17 @@ struct msm_otg_platform_data {
 	void (*setup_gpio)(enum usb_otg_state state);
 };
 
+/**
+ * struct msm_usb_cable - structure for exteternal connector cable
+ *			  state tracking
+ * @nb: hold event notification callback
+ * @conn: used for notification registration
+ */
+struct msm_usb_cable {
+	struct notifier_block		nb;
+	struct extcon_specific_cable_nb conn;
+};
+
 /**
  * struct msm_otg: OTG driver data. Shared by HCD and DCD.
  * @otg: USB OTG Transceiver structure.
@@ -138,6 +150,8 @@ struct msm_otg_platform_data {
  * @chg_type: The type of charger attached.
  * @dcd_retires: The retry count used to track Data contact
  *               detection process.
+ * @vbus: VBUS signal state trakining, using extcon framework
+ * @id: ID signal state trakining, using extcon framework
  */
 struct msm_otg {
 	struct usb_phy phy;
@@ -166,6 +180,9 @@ struct msm_otg {
 	struct reset_control *phy_rst;
 	struct reset_control *link_rst;
 	int vdd_levels[3];
+
+	struct msm_usb_cable vbus;
+	struct msm_usb_cable id;
 };
 
 #endif
-- 
cgit v1.2.3


From 44e42ae3a398b559c768b9b3c324d72b0b0b4479 Mon Sep 17 00:00:00 2001
From: "Ivan T. Ivanov" <ivan.ivanov@linaro.org>
Date: Thu, 9 Apr 2015 11:34:33 +0300
Subject: usb: phy: msm: Manual PHY and LINK controller VBUS change
 notification

VBUS is not routed to USB PHY on recent Qualcomm platforms. USB controller
must see VBUS in order to pull-up DP when setting RS bit. Henc configure
USB PHY and LINK registers sense VBUS and enable manual pullup on D+ line.

Cc: Vamsi Krishna <vskrishn@codeaurora.org>
Cc: Mayank Rana <mrana@codeaurora.org>
Signed-off-by: Ivan T. Ivanov <ivan.ivanov@linaro.org>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 .../devicetree/bindings/usb/msm-hsusb.txt          |  4 ++++
 drivers/usb/phy/phy-msm-usb.c                      | 26 ++++++++++++++++++++++
 include/linux/usb/msm_hsusb.h                      |  5 +++++
 include/linux/usb/msm_hsusb_hw.h                   |  9 ++++++++
 4 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/usb/msm-hsusb.txt b/Documentation/devicetree/bindings/usb/msm-hsusb.txt
index f26bcfac3d8f..bd8d9e753029 100644
--- a/Documentation/devicetree/bindings/usb/msm-hsusb.txt
+++ b/Documentation/devicetree/bindings/usb/msm-hsusb.txt
@@ -69,6 +69,10 @@ Optional properties:
                 (no, min, max) where each value represents either a voltage
                 in microvolts or a value corresponding to voltage corner.
 
+- qcom,manual-pullup: If present, vbus is not routed to USB controller/phy
+                and controller driver therefore enables pull-up explicitly
+                before starting controller using usbcmd run/stop bit.
+
 - extcon:       phandles to external connector devices. First phandle
                 should point to external connector, which provide "USB"
                 cable events, the second should point to external connector
diff --git a/drivers/usb/phy/phy-msm-usb.c b/drivers/usb/phy/phy-msm-usb.c
index ad66c67ce9a5..00c49bb1bd29 100644
--- a/drivers/usb/phy/phy-msm-usb.c
+++ b/drivers/usb/phy/phy-msm-usb.c
@@ -240,8 +240,14 @@ static void ulpi_init(struct msm_otg *motg)
 static int msm_phy_notify_disconnect(struct usb_phy *phy,
 				   enum usb_device_speed speed)
 {
+	struct msm_otg *motg = container_of(phy, struct msm_otg, phy);
 	int val;
 
+	if (motg->manual_pullup) {
+		val = ULPI_MISC_A_VBUSVLDEXT | ULPI_MISC_A_VBUSVLDEXTSEL;
+		usb_phy_io_write(phy, val, ULPI_CLR(ULPI_MISC_A));
+	}
+
 	/*
 	 * Put the transceiver in non-driving mode. Otherwise host
 	 * may not detect soft-disconnection.
@@ -422,6 +428,24 @@ static int msm_phy_init(struct usb_phy *phy)
 		ulpi_write(phy, ulpi_val, ULPI_USB_INT_EN_FALL);
 	}
 
+	if (motg->manual_pullup) {
+		val = ULPI_MISC_A_VBUSVLDEXTSEL | ULPI_MISC_A_VBUSVLDEXT;
+		ulpi_write(phy, val, ULPI_SET(ULPI_MISC_A));
+
+		val = readl(USB_GENCONFIG_2);
+		val |= GENCONFIG_2_SESS_VLD_CTRL_EN;
+		writel(val, USB_GENCONFIG_2);
+
+		val = readl(USB_USBCMD);
+		val |= USBCMD_SESS_VLD_CTRL;
+		writel(val, USB_USBCMD);
+
+		val = ulpi_read(phy, ULPI_FUNC_CTRL);
+		val &= ~ULPI_FUNC_CTRL_OPMODE_MASK;
+		val |= ULPI_FUNC_CTRL_OPMODE_NORMAL;
+		ulpi_write(phy, val, ULPI_FUNC_CTRL);
+	}
+
 	if (motg->phy_number)
 		writel(readl(USB_PHY_CTRL2) | BIT(16), USB_PHY_CTRL2);
 
@@ -1520,6 +1544,8 @@ static int msm_otg_read_dt(struct platform_device *pdev, struct msm_otg *motg)
 		motg->vdd_levels[VDD_LEVEL_MAX] = tmp[VDD_LEVEL_MAX];
 	}
 
+	motg->manual_pullup = of_property_read_bool(node, "qcom,manual-pullup");
+
 	ext_id = ERR_PTR(-ENODEV);
 	ext_vbus = ERR_PTR(-ENODEV);
 	if (of_property_read_bool(node, "extcon")) {
diff --git a/include/linux/usb/msm_hsusb.h b/include/linux/usb/msm_hsusb.h
index c4d956e50d09..e55a1504266e 100644
--- a/include/linux/usb/msm_hsusb.h
+++ b/include/linux/usb/msm_hsusb.h
@@ -150,6 +150,9 @@ struct msm_usb_cable {
  * @chg_type: The type of charger attached.
  * @dcd_retires: The retry count used to track Data contact
  *               detection process.
+ * @manual_pullup: true if VBUS is not routed to USB controller/phy
+ *	and controller driver therefore enables pull-up explicitly before
+ *	starting controller using usbcmd run/stop bit.
  * @vbus: VBUS signal state trakining, using extcon framework
  * @id: ID signal state trakining, using extcon framework
  */
@@ -181,6 +184,8 @@ struct msm_otg {
 	struct reset_control *link_rst;
 	int vdd_levels[3];
 
+	bool manual_pullup;
+
 	struct msm_usb_cable vbus;
 	struct msm_usb_cable id;
 };
diff --git a/include/linux/usb/msm_hsusb_hw.h b/include/linux/usb/msm_hsusb_hw.h
index a29f6030afb1..e159b39f67a2 100644
--- a/include/linux/usb/msm_hsusb_hw.h
+++ b/include/linux/usb/msm_hsusb_hw.h
@@ -21,6 +21,8 @@
 
 #define USB_AHBBURST         (MSM_USB_BASE + 0x0090)
 #define USB_AHBMODE          (MSM_USB_BASE + 0x0098)
+#define USB_GENCONFIG_2      (MSM_USB_BASE + 0x00a0)
+
 #define USB_CAPLENGTH        (MSM_USB_BASE + 0x0100) /* 8 bit */
 
 #define USB_USBCMD           (MSM_USB_BASE + 0x0140)
@@ -30,6 +32,9 @@
 #define USB_PHY_CTRL         (MSM_USB_BASE + 0x0240)
 #define USB_PHY_CTRL2        (MSM_USB_BASE + 0x0278)
 
+#define GENCONFIG_2_SESS_VLD_CTRL_EN	BIT(7)
+#define USBCMD_SESS_VLD_CTRL		BIT(25)
+
 #define USBCMD_RESET   2
 #define USB_USBINTR          (MSM_USB_BASE + 0x0148)
 
@@ -50,6 +55,10 @@
 #define ULPI_PWR_CLK_MNG_REG	0x88
 #define OTG_COMP_DISABLE	BIT(0)
 
+#define ULPI_MISC_A			0x96
+#define ULPI_MISC_A_VBUSVLDEXTSEL	BIT(1)
+#define ULPI_MISC_A_VBUSVLDEXT		BIT(0)
+
 #define ASYNC_INTR_CTRL         (1 << 29) /* Enable async interrupt */
 #define ULPI_STP_CTRL           (1 << 30) /* Block communication with PHY */
 #define PHY_RETEN               (1 << 1) /* PHY retention enable/disable */
-- 
cgit v1.2.3


From b189a2117223edbe40e0a187ae5c606cbdd6447c Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 28 Apr 2015 14:04:07 +0200
Subject: usb: phy: Remove the phy-rcar-gen2-usb driver

The phy-rcar-gen2-usb driver, which supports legacy platform data only,
is no longer used since commit a483dcbfa21f919c ("ARM: shmobile: lager:
Remove legacy board support").

This driver was superseded by the DT-only phy-rcar-gen2 driver, which
was introduced in commit 1233f59f745b237d ("phy: Renesas R-Car Gen2 PHY
driver").

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/phy/Kconfig                         |  13 --
 drivers/usb/phy/Makefile                        |   1 -
 drivers/usb/phy/phy-rcar-gen2-usb.c             | 246 ------------------------
 include/linux/platform_data/usb-rcar-gen2-phy.h |  22 ---
 4 files changed, 282 deletions(-)
 delete mode 100644 drivers/usb/phy/phy-rcar-gen2-usb.c
 delete mode 100644 include/linux/platform_data/usb-rcar-gen2-phy.h

(limited to 'include/linux')

diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig
index 811f331892d7..173a5b5d8bc1 100644
--- a/drivers/usb/phy/Kconfig
+++ b/drivers/usb/phy/Kconfig
@@ -187,19 +187,6 @@ config USB_RCAR_PHY
 	  To compile this driver as a module, choose M here: the
 	  module will be called phy-rcar-usb.
 
-config USB_RCAR_GEN2_PHY
-	tristate "Renesas R-Car Gen2 USB PHY support"
-	depends on ARCH_R8A7790 || ARCH_R8A7791 || COMPILE_TEST
-	select USB_PHY
-	help
-	  Say Y here to add support for the Renesas R-Car Gen2 USB PHY driver.
-	  It is typically used to control internal USB PHY for USBHS,
-	  and to configure shared USB channels 0 and 2.
-	  This driver supports R8A7790 and R8A7791.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called phy-rcar-gen2-usb.
-
 config USB_ULPI
 	bool "Generic ULPI Transceiver Driver"
 	depends on ARM || ARM64
diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile
index 75f2bba58c84..e36ab1d46d8b 100644
--- a/drivers/usb/phy/Makefile
+++ b/drivers/usb/phy/Makefile
@@ -23,7 +23,6 @@ obj-$(CONFIG_USB_MSM_OTG)		+= phy-msm-usb.o
 obj-$(CONFIG_USB_MV_OTG)		+= phy-mv-usb.o
 obj-$(CONFIG_USB_MXS_PHY)		+= phy-mxs-usb.o
 obj-$(CONFIG_USB_RCAR_PHY)		+= phy-rcar-usb.o
-obj-$(CONFIG_USB_RCAR_GEN2_PHY)		+= phy-rcar-gen2-usb.o
 obj-$(CONFIG_USB_ULPI)			+= phy-ulpi.o
 obj-$(CONFIG_USB_ULPI_VIEWPORT)		+= phy-ulpi-viewport.o
 obj-$(CONFIG_KEYSTONE_USB_PHY)		+= phy-keystone.o
diff --git a/drivers/usb/phy/phy-rcar-gen2-usb.c b/drivers/usb/phy/phy-rcar-gen2-usb.c
deleted file mode 100644
index f81800b6562a..000000000000
--- a/drivers/usb/phy/phy-rcar-gen2-usb.c
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Renesas R-Car Gen2 USB phy driver
- *
- * Copyright (C) 2013 Renesas Solutions Corp.
- * Copyright (C) 2013 Cogent Embedded, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/platform_data/usb-rcar-gen2-phy.h>
-#include <linux/platform_device.h>
-#include <linux/spinlock.h>
-#include <linux/usb/otg.h>
-
-struct rcar_gen2_usb_phy_priv {
-	struct usb_phy phy;
-	void __iomem *base;
-	struct clk *clk;
-	spinlock_t lock;
-	int usecount;
-	u32 ugctrl2;
-};
-
-#define usb_phy_to_priv(p) container_of(p, struct rcar_gen2_usb_phy_priv, phy)
-
-/* Low Power Status register */
-#define USBHS_LPSTS_REG			0x02
-#define USBHS_LPSTS_SUSPM		(1 << 14)
-
-/* USB General control register */
-#define USBHS_UGCTRL_REG		0x80
-#define USBHS_UGCTRL_CONNECT		(1 << 2)
-#define USBHS_UGCTRL_PLLRESET		(1 << 0)
-
-/* USB General control register 2 */
-#define USBHS_UGCTRL2_REG		0x84
-#define USBHS_UGCTRL2_USB0_PCI		(1 << 4)
-#define USBHS_UGCTRL2_USB0_HS		(3 << 4)
-#define USBHS_UGCTRL2_USB2_PCI		(0 << 31)
-#define USBHS_UGCTRL2_USB2_SS		(1 << 31)
-
-/* USB General status register */
-#define USBHS_UGSTS_REG			0x88
-#define USBHS_UGSTS_LOCK		(1 << 8)
-
-/* Enable USBHS internal phy */
-static int __rcar_gen2_usbhs_phy_enable(void __iomem *base)
-{
-	u32 val;
-	int i;
-
-	/* USBHS PHY power on */
-	val = ioread32(base + USBHS_UGCTRL_REG);
-	val &= ~USBHS_UGCTRL_PLLRESET;
-	iowrite32(val, base + USBHS_UGCTRL_REG);
-
-	val = ioread16(base + USBHS_LPSTS_REG);
-	val |= USBHS_LPSTS_SUSPM;
-	iowrite16(val, base + USBHS_LPSTS_REG);
-
-	for (i = 0; i < 20; i++) {
-		val = ioread32(base + USBHS_UGSTS_REG);
-		if ((val & USBHS_UGSTS_LOCK) == USBHS_UGSTS_LOCK) {
-			val = ioread32(base + USBHS_UGCTRL_REG);
-			val |= USBHS_UGCTRL_CONNECT;
-			iowrite32(val, base + USBHS_UGCTRL_REG);
-			return 0;
-		}
-		udelay(1);
-	}
-
-	/* Timed out waiting for the PLL lock */
-	return -ETIMEDOUT;
-}
-
-/* Disable USBHS internal phy */
-static int __rcar_gen2_usbhs_phy_disable(void __iomem *base)
-{
-	u32 val;
-
-	/* USBHS PHY power off */
-	val = ioread32(base + USBHS_UGCTRL_REG);
-	val &= ~USBHS_UGCTRL_CONNECT;
-	iowrite32(val, base + USBHS_UGCTRL_REG);
-
-	val = ioread16(base + USBHS_LPSTS_REG);
-	val &= ~USBHS_LPSTS_SUSPM;
-	iowrite16(val, base + USBHS_LPSTS_REG);
-
-	val = ioread32(base + USBHS_UGCTRL_REG);
-	val |= USBHS_UGCTRL_PLLRESET;
-	iowrite32(val, base + USBHS_UGCTRL_REG);
-	return 0;
-}
-
-/* Setup USB channels */
-static void __rcar_gen2_usb_phy_init(struct rcar_gen2_usb_phy_priv *priv)
-{
-	u32 val;
-
-	clk_prepare_enable(priv->clk);
-
-	/* Set USB channels in the USBHS UGCTRL2 register */
-	val = ioread32(priv->base + USBHS_UGCTRL2_REG);
-	val &= ~(USBHS_UGCTRL2_USB0_HS | USBHS_UGCTRL2_USB2_SS);
-	val |= priv->ugctrl2;
-	iowrite32(val, priv->base + USBHS_UGCTRL2_REG);
-}
-
-/* Shutdown USB channels */
-static void __rcar_gen2_usb_phy_shutdown(struct rcar_gen2_usb_phy_priv *priv)
-{
-	__rcar_gen2_usbhs_phy_disable(priv->base);
-	clk_disable_unprepare(priv->clk);
-}
-
-static int rcar_gen2_usb_phy_set_suspend(struct usb_phy *phy, int suspend)
-{
-	struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy);
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&priv->lock, flags);
-	retval = suspend ? __rcar_gen2_usbhs_phy_disable(priv->base) :
-			   __rcar_gen2_usbhs_phy_enable(priv->base);
-	spin_unlock_irqrestore(&priv->lock, flags);
-	return retval;
-}
-
-static int rcar_gen2_usb_phy_init(struct usb_phy *phy)
-{
-	struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy);
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->lock, flags);
-	/*
-	 * Enable the clock and setup USB channels
-	 * if it's the first user
-	 */
-	if (!priv->usecount++)
-		__rcar_gen2_usb_phy_init(priv);
-	spin_unlock_irqrestore(&priv->lock, flags);
-	return 0;
-}
-
-static void rcar_gen2_usb_phy_shutdown(struct usb_phy *phy)
-{
-	struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy);
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->lock, flags);
-	if (!priv->usecount) {
-		dev_warn(phy->dev, "Trying to disable phy with 0 usecount\n");
-		goto out;
-	}
-
-	/* Disable everything if it's the last user */
-	if (!--priv->usecount)
-		__rcar_gen2_usb_phy_shutdown(priv);
-out:
-	spin_unlock_irqrestore(&priv->lock, flags);
-}
-
-static int rcar_gen2_usb_phy_probe(struct platform_device *pdev)
-{
-	struct device *dev = &pdev->dev;
-	struct rcar_gen2_phy_platform_data *pdata;
-	struct rcar_gen2_usb_phy_priv *priv;
-	struct resource *res;
-	void __iomem *base;
-	struct clk *clk;
-	int retval;
-
-	pdata = dev_get_platdata(dev);
-	if (!pdata) {
-		dev_err(dev, "No platform data\n");
-		return -EINVAL;
-	}
-
-	clk = devm_clk_get(dev, "usbhs");
-	if (IS_ERR(clk)) {
-		dev_err(dev, "Can't get the clock\n");
-		return PTR_ERR(clk);
-	}
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	base = devm_ioremap_resource(dev, res);
-	if (IS_ERR(base))
-		return PTR_ERR(base);
-
-	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
-	if (!priv)
-		return -ENOMEM;
-
-	spin_lock_init(&priv->lock);
-	priv->clk = clk;
-	priv->base = base;
-	priv->ugctrl2 = pdata->chan0_pci ?
-			USBHS_UGCTRL2_USB0_PCI : USBHS_UGCTRL2_USB0_HS;
-	priv->ugctrl2 |= pdata->chan2_pci ?
-			USBHS_UGCTRL2_USB2_PCI : USBHS_UGCTRL2_USB2_SS;
-	priv->phy.dev = dev;
-	priv->phy.label = dev_name(dev);
-	priv->phy.init = rcar_gen2_usb_phy_init;
-	priv->phy.shutdown = rcar_gen2_usb_phy_shutdown;
-	priv->phy.set_suspend = rcar_gen2_usb_phy_set_suspend;
-
-	retval = usb_add_phy_dev(&priv->phy);
-	if (retval < 0) {
-		dev_err(dev, "Failed to add USB phy\n");
-		return retval;
-	}
-
-	platform_set_drvdata(pdev, priv);
-
-	return retval;
-}
-
-static int rcar_gen2_usb_phy_remove(struct platform_device *pdev)
-{
-	struct rcar_gen2_usb_phy_priv *priv = platform_get_drvdata(pdev);
-
-	usb_remove_phy(&priv->phy);
-
-	return 0;
-}
-
-static struct platform_driver rcar_gen2_usb_phy_driver = {
-	.driver = {
-		.name = "usb_phy_rcar_gen2",
-	},
-	.probe = rcar_gen2_usb_phy_probe,
-	.remove = rcar_gen2_usb_phy_remove,
-};
-
-module_platform_driver(rcar_gen2_usb_phy_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Renesas R-Car Gen2 USB phy");
-MODULE_AUTHOR("Valentine Barshak <valentine.barshak@cogentembedded.com>");
diff --git a/include/linux/platform_data/usb-rcar-gen2-phy.h b/include/linux/platform_data/usb-rcar-gen2-phy.h
deleted file mode 100644
index dd3ba46c0d90..000000000000
--- a/include/linux/platform_data/usb-rcar-gen2-phy.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2013 Renesas Solutions Corp.
- * Copyright (C) 2013 Cogent Embedded, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __USB_RCAR_GEN2_PHY_H
-#define __USB_RCAR_GEN2_PHY_H
-
-#include <linux/types.h>
-
-struct rcar_gen2_phy_platform_data {
-	/* USB channel 0 configuration */
-	bool chan0_pci:1;	/* true: PCI USB host 0, false: USBHS */
-	/* USB channel 2 configuration */
-	bool chan2_pci:1;	/* true: PCI USB host 2, false: USBSS */
-};
-
-#endif
-- 
cgit v1.2.3


From f1ec7187167ce225d2744b20a90afef5f10fd6cd Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 29 Apr 2015 16:02:30 -0500
Subject: libfdt: add fdt type definitions

In preparation for libfdt/dtc update, add the new fdt specific types.

Signed-off-by: Rob Herring <robh@kernel.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/arm/boot/compressed/libfdt_env.h | 4 ++++
 arch/powerpc/boot/libfdt_env.h        | 4 ++++
 arch/powerpc/boot/of.h                | 2 ++
 include/linux/libfdt_env.h            | 4 ++++
 4 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/boot/compressed/libfdt_env.h b/arch/arm/boot/compressed/libfdt_env.h
index 1f4e71876b00..17ae0f3efac8 100644
--- a/arch/arm/boot/compressed/libfdt_env.h
+++ b/arch/arm/boot/compressed/libfdt_env.h
@@ -5,6 +5,10 @@
 #include <linux/string.h>
 #include <asm/byteorder.h>
 
+typedef __be16 fdt16_t;
+typedef __be32 fdt32_t;
+typedef __be64 fdt64_t;
+
 #define fdt16_to_cpu(x)		be16_to_cpu(x)
 #define cpu_to_fdt16(x)		cpu_to_be16(x)
 #define fdt32_to_cpu(x)		be32_to_cpu(x)
diff --git a/arch/powerpc/boot/libfdt_env.h b/arch/powerpc/boot/libfdt_env.h
index 8dcd744e5728..7e3789ea396b 100644
--- a/arch/powerpc/boot/libfdt_env.h
+++ b/arch/powerpc/boot/libfdt_env.h
@@ -10,6 +10,10 @@ typedef u32 uint32_t;
 typedef u64 uint64_t;
 typedef unsigned long uintptr_t;
 
+typedef __be16 fdt16_t;
+typedef __be32 fdt32_t;
+typedef __be64 fdt64_t;
+
 #define fdt16_to_cpu(x)		be16_to_cpu(x)
 #define cpu_to_fdt16(x)		cpu_to_be16(x)
 #define fdt32_to_cpu(x)		be32_to_cpu(x)
diff --git a/arch/powerpc/boot/of.h b/arch/powerpc/boot/of.h
index 5603320dce07..53f8f27f94e4 100644
--- a/arch/powerpc/boot/of.h
+++ b/arch/powerpc/boot/of.h
@@ -21,7 +21,9 @@ int of_setprop(const void *phandle, const char *name, const void *buf,
 /* Console functions */
 void of_console_init(void);
 
+typedef u16			__be16;
 typedef u32			__be32;
+typedef u64			__be64;
 
 #ifdef __LITTLE_ENDIAN__
 #define cpu_to_be16(x) swab16(x)
diff --git a/include/linux/libfdt_env.h b/include/linux/libfdt_env.h
index 01508c7b8c81..2a663c6bb428 100644
--- a/include/linux/libfdt_env.h
+++ b/include/linux/libfdt_env.h
@@ -5,6 +5,10 @@
 
 #include <asm/byteorder.h>
 
+typedef __be16 fdt16_t;
+typedef __be32 fdt32_t;
+typedef __be64 fdt64_t;
+
 #define fdt32_to_cpu(x) be32_to_cpu(x)
 #define cpu_to_fdt32(x) cpu_to_be32(x)
 #define fdt64_to_cpu(x) be64_to_cpu(x)
-- 
cgit v1.2.3


From 042f7df15a4fff8eec42873f755aea848dcdedd1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 30 Apr 2015 17:16:12 +0800
Subject: workqueue: Allow modifying low level unbound workqueue cpumask

Allow to modify the low-level unbound workqueues cpumask through
sysfs. This is performed by traversing the entire workqueue list
and calling apply_wqattrs_prepare() on the unbound workqueues
with the new low level mask. Only after all the preparation are done,
we commit them all together.

Ordered workqueues are ignored from the low level unbound workqueue
cpumask, it will be handled in near future.

All the (default & per-node) pwqs are mandatorily controlled by
the low level cpumask. If the user configured cpumask doesn't overlap
with the low level cpumask, the low level cpumask will be used for the
wq instead.

The comment of wq_calc_node_cpumask() is updated and explicitly
requires that its first argument should be the attrs of the default
pwq.

The default wq_unbound_cpumask is cpu_possible_mask.  The workqueue
subsystem doesn't know its best default value, let the system manager
or the other subsystem set it when needed.

Changed from V8:
  merge the calculating code for the attrs of the default pwq together.
  minor change the code&comments for saving the user configured attrs.
  remove unnecessary list_del().
  minor update the comment of wq_calc_node_cpumask().
  update the comment of workqueue_set_unbound_cpumask();

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Original-patch-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |   1 +
 kernel/workqueue.c        | 127 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 119 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index deee212af8e0..4618dd672d1b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -424,6 +424,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
+int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9be75e2a4da6..a3915abc1983 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_cpumask;
+static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -3429,7 +3429,7 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
 
 /**
  * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
- * @attrs: the wq_attrs of interest
+ * @attrs: the wq_attrs of the default pwq of the target workqueue
  * @node: the target NUMA node
  * @cpu_going_down: if >= 0, the CPU to consider as offline
  * @cpumask: outarg, the resulting cpumask
@@ -3493,6 +3493,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 struct apply_wqattrs_ctx {
 	struct workqueue_struct	*wq;		/* target workqueue */
 	struct workqueue_attrs	*attrs;		/* attrs to apply */
+	struct list_head	list;		/* queued for batching commit */
 	struct pool_workqueue	*dfl_pwq;
 	struct pool_workqueue	*pwq_tbl[];
 };
@@ -3532,9 +3533,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	if (!ctx || !new_attrs || !tmp_attrs)
 		goto out_free;
 
-	/* make a copy of @attrs and sanitize it */
+	/*
+	 * Calculate the attrs of the default pwq.
+	 * If the user configured cpumask doesn't overlap with the
+	 * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
+	 */
 	copy_workqueue_attrs(new_attrs, attrs);
 	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+	if (unlikely(cpumask_empty(new_attrs->cpumask)))
+		cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3553,7 +3560,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 		goto out_free;
 
 	for_each_node(node) {
-		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+		if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
 			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
 			if (!ctx->pwq_tbl[node])
 				goto out_free;
@@ -3563,7 +3570,11 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 		}
 	}
 
+	/* save the user configured attrs and sanitize it. */
+	copy_workqueue_attrs(new_attrs, attrs);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
 	ctx->attrs = new_attrs;
+
 	ctx->wq = wq;
 	free_workqueue_attrs(tmp_attrs);
 	return ctx;
@@ -3704,11 +3715,11 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 
 	/*
 	 * Let's determine what needs to be done.  If the target cpumask is
-	 * different from wq's, we need to compare it to @pwq's and create
-	 * a new one if they don't match.  If the target cpumask equals
-	 * wq's, the default pwq should be used.
+	 * different from the default pwq's, we need to compare it to @pwq's
+	 * and create a new one if they don't match.  If the target cpumask
+	 * equals the default pwq's, the default pwq should be used.
 	 */
-	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+	if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
 		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
 			goto out_unlock;
 	} else {
@@ -4731,6 +4742,84 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
+static int workqueue_apply_unbound_cpumask(void)
+{
+	LIST_HEAD(ctxs);
+	int ret = 0;
+	struct workqueue_struct *wq;
+	struct apply_wqattrs_ctx *ctx, *n;
+
+	lockdep_assert_held(&wq_pool_mutex);
+
+	list_for_each_entry(wq, &workqueues, list) {
+		if (!(wq->flags & WQ_UNBOUND))
+			continue;
+		/* creating multiple pwqs breaks ordering guarantee */
+		if (wq->flags & __WQ_ORDERED)
+			continue;
+
+		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+		if (!ctx) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		list_add_tail(&ctx->list, &ctxs);
+	}
+
+	list_for_each_entry_safe(ctx, n, &ctxs, list) {
+		if (!ret)
+			apply_wqattrs_commit(ctx);
+		apply_wqattrs_cleanup(ctx);
+	}
+
+	return ret;
+}
+
+/**
+ *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ *  @cpumask: the cpumask to set
+ *
+ *  The low-level workqueues cpumask is a global cpumask that limits
+ *  the affinity of all unbound workqueues.  This function check the @cpumask
+ *  and apply it to all unbound workqueues and updates all pwqs of them.
+ *
+ *  Retun:	0	- Success
+ *  		-EINVAL	- Invalid @cpumask
+ *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+	int ret = -EINVAL;
+	cpumask_var_t saved_cpumask;
+
+	if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	get_online_cpus();
+	cpumask_and(cpumask, cpumask, cpu_possible_mask);
+	if (!cpumask_empty(cpumask)) {
+		mutex_lock(&wq_pool_mutex);
+
+		/* save the old wq_unbound_cpumask. */
+		cpumask_copy(saved_cpumask, wq_unbound_cpumask);
+
+		/* update wq_unbound_cpumask at first and apply it to wqs. */
+		cpumask_copy(wq_unbound_cpumask, cpumask);
+		ret = workqueue_apply_unbound_cpumask();
+
+		/* restore the wq_unbound_cpumask when failed. */
+		if (ret < 0)
+			cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+
+		mutex_unlock(&wq_pool_mutex);
+	}
+	put_online_cpus();
+
+	free_cpumask_var(saved_cpumask);
+	return ret;
+}
+
 #ifdef CONFIG_SYSFS
 /*
  * Workqueues with WQ_SYSFS flag set is visible to userland via
@@ -4952,14 +5041,34 @@ static ssize_t wq_unbound_cpumask_show(struct device *dev,
 {
 	int written;
 
+	mutex_lock(&wq_pool_mutex);
 	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
 			    cpumask_pr_args(wq_unbound_cpumask));
+	mutex_unlock(&wq_pool_mutex);
 
 	return written;
 }
 
+static ssize_t wq_unbound_cpumask_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	cpumask_var_t cpumask;
+	int ret;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, cpumask);
+	if (!ret)
+		ret = workqueue_set_unbound_cpumask(cpumask);
+
+	free_cpumask_var(cpumask);
+	return ret ? ret : count;
+}
+
 static struct device_attribute wq_sysfs_cpumask_attr =
-	__ATTR(cpumask, 0444, wq_unbound_cpumask_show, NULL);
+	__ATTR(cpumask, 0644, wq_unbound_cpumask_show,
+	       wq_unbound_cpumask_store);
 
 static int __init wq_sysfs_init(void)
 {
-- 
cgit v1.2.3


From 0bb549052d33f8992544764a6cf1299d06ba7e2f Mon Sep 17 00:00:00 2001
From: Peter Jones <pjones@redhat.com>
Date: Tue, 28 Apr 2015 18:44:31 -0400
Subject: efi: Add esrt support

Add sysfs files for the EFI System Resource Table (ESRT) under
/sys/firmware/efi/esrt and for each EFI System Resource Entry under
entries/ as a subdir.

The EFI System Resource Table (ESRT) provides a read-only catalog of
system components for which the system accepts firmware upgrades via
UEFI's "Capsule Update" feature.  This module allows userland utilities
to evaluate what firmware updates can be applied to this system, and
potentially arrange for those updates to occur.

The ESRT is described as part of the UEFI specification, in version 2.5
which should be available from http://uefi.org/specifications in early
2015.  If you're a member of the UEFI Forum, information about its
addition to the standard is available as UEFI Mantis 1090.

For some hardware platforms, additional restrictions may be found at
http://msdn.microsoft.com/en-us/library/windows/hardware/jj128256.aspx ,
and additional documentation may be found at
http://download.microsoft.com/download/5/F/5/5F5D16CD-2530-4289-8019-94C6A20BED3C/windows-uefi-firmware-update-platform.docx
.

Signed-off-by: Peter Jones <pjones@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 Documentation/ABI/testing/sysfs-firmware-efi-esrt |  81 ++++
 arch/x86/platform/efi/efi.c                       |   2 +
 drivers/firmware/efi/Makefile                     |   2 +-
 drivers/firmware/efi/efi.c                        |  82 +++-
 drivers/firmware/efi/esrt.c                       | 464 ++++++++++++++++++++++
 include/linux/efi.h                               |   8 +
 6 files changed, 637 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-firmware-efi-esrt
 create mode 100644 drivers/firmware/efi/esrt.c

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-firmware-efi-esrt b/Documentation/ABI/testing/sysfs-firmware-efi-esrt
new file mode 100644
index 000000000000..6e431d1a4e79
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-efi-esrt
@@ -0,0 +1,81 @@
+What:		/sys/firmware/efi/esrt/
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	Provides userland access to read the EFI System Resource Table
+		(ESRT), a catalog of firmware for which can be updated with
+		the UEFI UpdateCapsule mechanism described in section 7.5 of
+		the UEFI Standard.
+Users:		fwupdate - https://github.com/rhinstaller/fwupdate
+
+What:		/sys/firmware/efi/esrt/fw_resource_count
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The number of entries in the ESRT
+
+What:		/sys/firmware/efi/esrt/fw_resource_count_max
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The maximum number of entries that /could/ be registered
+		in the allocation the table is currently in.  This is
+		really only useful to the system firmware itself.
+
+What:		/sys/firmware/efi/esrt/fw_resource_version
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The version of the ESRT structure provided by the firmware.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	Each ESRT entry is identified by a GUID, and each gets a
+		subdirectory under entries/ .
+		example: /sys/firmware/efi/esrt/entries/entry0/
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/fw_type
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	What kind of firmware entry this is:
+		0 - Unknown
+		1 - System Firmware
+		2 - Device Firmware
+		3 - UEFI Driver
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/fw_class
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	This is the entry's guid, and will match the directory name.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/fw_version
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The version of the firmware currently installed.  This is a
+		32-bit unsigned integer.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/lowest_supported_fw_version
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The lowest version of the firmware that can be installed.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/capsule_flags
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	Flags that must be passed to UpdateCapsule()
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/last_attempt_version
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The last firmware version for which an update was attempted.
+
+What:		/sys/firmware/efi/esrt/entries/entry$N/last_attempt_status
+Date:		February 2015
+Contact:	Peter Jones <pjones@redhat.com>
+Description:	The result of the last firmware update attempt for the
+		firmware resource entry.
+		0 - Success
+		1 - Insufficient resources
+		2 - Incorrect version
+		3 - Invalid format
+		4 - Authentication error
+		5 - AC power event
+		6 - Battery power event
+
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 02744df576d5..3b984c3aa1b0 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -501,6 +501,8 @@ void __init efi_init(void)
 
 	if (efi_enabled(EFI_DBG))
 		print_efi_memmap();
+
+	efi_esrt_init();
 }
 
 void __init efi_late_init(void)
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index d8be608a9f3b..26eabbc55341 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for linux kernel
 #
-obj-$(CONFIG_EFI)			+= efi.o vars.o reboot.o
+obj-$(CONFIG_EFI)			+= efi.o esrt.o vars.o reboot.o
 obj-$(CONFIG_EFI_VARS)			+= efivars.o
 obj-$(CONFIG_EFI_VARS_PSTORE)		+= efi-pstore.o
 obj-$(CONFIG_UEFI_CPER)			+= cper.o
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 3061bb8629dc..48b4c356740f 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -39,6 +39,7 @@ struct efi __read_mostly efi = {
 	.fw_vendor  = EFI_INVALID_TABLE_ADDR,
 	.runtime    = EFI_INVALID_TABLE_ADDR,
 	.config_table  = EFI_INVALID_TABLE_ADDR,
+	.esrt       = EFI_INVALID_TABLE_ADDR,
 };
 EXPORT_SYMBOL(efi);
 
@@ -64,7 +65,7 @@ static int __init parse_efi_cmdline(char *str)
 }
 early_param("efi", parse_efi_cmdline);
 
-static struct kobject *efi_kobj;
+struct kobject *efi_kobj;
 static struct kobject *efivars_kobj;
 
 /*
@@ -232,6 +233,84 @@ err_put:
 
 subsys_initcall(efisubsys_init);
 
+/*
+ * Find the efi memory descriptor for a given physical address.  Given a
+ * physicall address, determine if it exists within an EFI Memory Map entry,
+ * and if so, populate the supplied memory descriptor with the appropriate
+ * data.
+ */
+int __init efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md)
+{
+	struct efi_memory_map *map = efi.memmap;
+	void *p, *e;
+
+	if (!efi_enabled(EFI_MEMMAP)) {
+		pr_err_once("EFI_MEMMAP is not enabled.\n");
+		return -EINVAL;
+	}
+
+	if (!map) {
+		pr_err_once("efi.memmap is not set.\n");
+		return -EINVAL;
+	}
+	if (!out_md) {
+		pr_err_once("out_md is null.\n");
+		return -EINVAL;
+        }
+	if (WARN_ON_ONCE(!map->phys_map))
+		return -EINVAL;
+	if (WARN_ON_ONCE(map->nr_map == 0) || WARN_ON_ONCE(map->desc_size == 0))
+		return -EINVAL;
+
+	e = map->phys_map + map->nr_map * map->desc_size;
+	for (p = map->phys_map; p < e; p += map->desc_size) {
+		efi_memory_desc_t *md;
+		u64 size;
+		u64 end;
+
+		/*
+		 * If a driver calls this after efi_free_boot_services,
+		 * ->map will be NULL, and the target may also not be mapped.
+		 * So just always get our own virtual map on the CPU.
+		 *
+		 */
+		md = early_memremap((phys_addr_t)p, sizeof (*md));
+		if (!md) {
+			pr_err_once("early_memremap(%p, %zu) failed.\n",
+				    p, sizeof (*md));
+			return -ENOMEM;
+		}
+
+		if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+		    md->type != EFI_BOOT_SERVICES_DATA &&
+		    md->type != EFI_RUNTIME_SERVICES_DATA) {
+			early_memunmap(md, sizeof (*md));
+			continue;
+		}
+
+		size = md->num_pages << EFI_PAGE_SHIFT;
+		end = md->phys_addr + size;
+		if (phys_addr >= md->phys_addr && phys_addr < end) {
+			memcpy(out_md, md, sizeof(*out_md));
+			early_memunmap(md, sizeof (*md));
+			return 0;
+		}
+
+		early_memunmap(md, sizeof (*md));
+	}
+	pr_err_once("requested map not found.\n");
+	return -ENOENT;
+}
+
+/*
+ * Calculate the highest address of an efi memory descriptor.
+ */
+u64 __init efi_mem_desc_end(efi_memory_desc_t *md)
+{
+	u64 size = md->num_pages << EFI_PAGE_SHIFT;
+	u64 end = md->phys_addr + size;
+	return end;
+}
 
 /*
  * We can't ioremap data in EFI boot services RAM, because we've already mapped
@@ -274,6 +353,7 @@ static __initdata efi_config_table_type_t common_tables[] = {
 	{SMBIOS_TABLE_GUID, "SMBIOS", &efi.smbios},
 	{SMBIOS3_TABLE_GUID, "SMBIOS 3.0", &efi.smbios3},
 	{UGA_IO_PROTOCOL_GUID, "UGA", &efi.uga},
+	{EFI_SYSTEM_RESOURCE_TABLE_GUID, "ESRT", &efi.esrt},
 	{NULL_GUID, NULL, NULL},
 };
 
diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c
new file mode 100644
index 000000000000..20c0cbdb1c60
--- /dev/null
+++ b/drivers/firmware/efi/esrt.c
@@ -0,0 +1,464 @@
+/*
+ * esrt.c
+ *
+ * This module exports EFI System Resource Table (ESRT) entries into userspace
+ * through the sysfs file system. The ESRT provides a read-only catalog of
+ * system components for which the system accepts firmware upgrades via UEFI's
+ * "Capsule Update" feature. This module allows userland utilities to evaluate
+ * what firmware updates can be applied to this system, and potentially arrange
+ * for those updates to occur.
+ *
+ * Data is currently found below /sys/firmware/efi/esrt/...
+ */
+#define pr_fmt(fmt) "esrt: " fmt
+
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/io.h>
+#include <asm/early_ioremap.h>
+
+struct efi_system_resource_entry_v1 {
+	efi_guid_t	fw_class;
+	u32		fw_type;
+	u32		fw_version;
+	u32		lowest_supported_fw_version;
+	u32		capsule_flags;
+	u32		last_attempt_version;
+	u32		last_attempt_status;
+};
+
+/*
+ * _count and _version are what they seem like.  _max is actually just
+ * accounting info for the firmware when creating the table; it should never
+ * have been exposed to us.  To wit, the spec says:
+ * The maximum number of resource array entries that can be within the
+ * table without reallocating the table, must not be zero.
+ * Since there's no guidance about what that means in terms of memory layout,
+ * it means nothing to us.
+ */
+struct efi_system_resource_table {
+	u32	fw_resource_count;
+	u32	fw_resource_count_max;
+	u64	fw_resource_version;
+	u8	entries[];
+};
+
+static phys_addr_t esrt_data;
+static size_t esrt_data_size;
+
+static struct efi_system_resource_table *esrt;
+
+struct esre_entry {
+	union {
+		struct efi_system_resource_entry_v1 *esre1;
+	} esre;
+
+	struct kobject kobj;
+	struct list_head list;
+};
+
+/* global list of esre_entry. */
+static LIST_HEAD(entry_list);
+
+/* entry attribute */
+struct esre_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct esre_entry *entry, char *buf);
+	ssize_t (*store)(struct esre_entry *entry,
+			 const char *buf, size_t count);
+};
+
+static struct esre_entry *to_entry(struct kobject *kobj)
+{
+	return container_of(kobj, struct esre_entry, kobj);
+}
+
+static struct esre_attribute *to_attr(struct attribute *attr)
+{
+	return container_of(attr, struct esre_attribute, attr);
+}
+
+static ssize_t esre_attr_show(struct kobject *kobj,
+			      struct attribute *_attr, char *buf)
+{
+	struct esre_entry *entry = to_entry(kobj);
+	struct esre_attribute *attr = to_attr(_attr);
+
+	/* Don't tell normal users what firmware versions we've got... */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	return attr->show(entry, buf);
+}
+
+static const struct sysfs_ops esre_attr_ops = {
+	.show = esre_attr_show,
+};
+
+/* Generic ESRT Entry ("ESRE") support. */
+static ssize_t esre_fw_class_show(struct esre_entry *entry, char *buf)
+{
+	char *str = buf;
+
+	efi_guid_to_str(&entry->esre.esre1->fw_class, str);
+	str += strlen(str);
+	str += sprintf(str, "\n");
+
+	return str - buf;
+}
+
+static struct esre_attribute esre_fw_class = __ATTR(fw_class, 0400,
+	esre_fw_class_show, NULL);
+
+#define esre_attr_decl(name, size, fmt) \
+static ssize_t esre_##name##_show(struct esre_entry *entry, char *buf) \
+{ \
+	return sprintf(buf, fmt "\n", \
+		       le##size##_to_cpu(entry->esre.esre1->name)); \
+} \
+\
+static struct esre_attribute esre_##name = __ATTR(name, 0400, \
+	esre_##name##_show, NULL)
+
+esre_attr_decl(fw_type, 32, "%u");
+esre_attr_decl(fw_version, 32, "%u");
+esre_attr_decl(lowest_supported_fw_version, 32, "%u");
+esre_attr_decl(capsule_flags, 32, "0x%x");
+esre_attr_decl(last_attempt_version, 32, "%u");
+esre_attr_decl(last_attempt_status, 32, "%u");
+
+static struct attribute *esre1_attrs[] = {
+	&esre_fw_class.attr,
+	&esre_fw_type.attr,
+	&esre_fw_version.attr,
+	&esre_lowest_supported_fw_version.attr,
+	&esre_capsule_flags.attr,
+	&esre_last_attempt_version.attr,
+	&esre_last_attempt_status.attr,
+	NULL
+};
+static void esre_release(struct kobject *kobj)
+{
+	struct esre_entry *entry = to_entry(kobj);
+
+	list_del(&entry->list);
+	kfree(entry);
+}
+
+static struct kobj_type esre1_ktype = {
+	.release = esre_release,
+	.sysfs_ops = &esre_attr_ops,
+	.default_attrs = esre1_attrs,
+};
+
+
+static struct kobject *esrt_kobj;
+static struct kset *esrt_kset;
+
+static int esre_create_sysfs_entry(void *esre, int entry_num)
+{
+	int rc = 0;
+	struct esre_entry *entry;
+	char name[20];
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	sprintf(name, "entry%d", entry_num);
+
+	entry->kobj.kset = esrt_kset;
+
+	if (esrt->fw_resource_version == 1) {
+		entry->esre.esre1 = esre;
+		rc = kobject_init_and_add(&entry->kobj, &esre1_ktype, NULL,
+					  "%s", name);
+	}
+	if (rc) {
+		kfree(entry);
+		return rc;
+	}
+
+	list_add_tail(&entry->list, &entry_list);
+	return 0;
+}
+
+/* support for displaying ESRT fields at the top level */
+#define esrt_attr_decl(name, size, fmt) \
+static ssize_t esrt_##name##_show(struct kobject *kobj, \
+				  struct kobj_attribute *attr, char *buf)\
+{ \
+	return sprintf(buf, fmt "\n", le##size##_to_cpu(esrt->name)); \
+} \
+\
+static struct kobj_attribute esrt_##name = __ATTR(name, 0400, \
+	esrt_##name##_show, NULL)
+
+esrt_attr_decl(fw_resource_count, 32, "%u");
+esrt_attr_decl(fw_resource_count_max, 32, "%u");
+esrt_attr_decl(fw_resource_version, 64, "%llu");
+
+static struct attribute *esrt_attrs[] = {
+	&esrt_fw_resource_count.attr,
+	&esrt_fw_resource_count_max.attr,
+	&esrt_fw_resource_version.attr,
+	NULL,
+};
+
+static inline int esrt_table_exists(void)
+{
+	if (!efi_enabled(EFI_CONFIG_TABLES))
+		return 0;
+	if (efi.esrt == EFI_INVALID_TABLE_ADDR)
+		return 0;
+	return 1;
+}
+
+static umode_t esrt_attr_is_visible(struct kobject *kobj,
+				    struct attribute *attr, int n)
+{
+	if (!esrt_table_exists())
+		return 0;
+	return attr->mode;
+}
+
+static struct attribute_group esrt_attr_group = {
+	.attrs = esrt_attrs,
+	.is_visible = esrt_attr_is_visible,
+};
+
+/*
+ * remap the table, copy it to kmalloced pages, and unmap it.
+ */
+void __init efi_esrt_init(void)
+{
+	void *va;
+	struct efi_system_resource_table tmpesrt;
+	struct efi_system_resource_entry_v1 *v1_entries;
+	size_t size, max, entry_size, entries_size;
+	efi_memory_desc_t md;
+	int rc;
+
+	pr_debug("esrt-init: loading.\n");
+	if (!esrt_table_exists())
+		return;
+
+	rc = efi_mem_desc_lookup(efi.esrt, &md);
+	if (rc < 0) {
+		pr_err("ESRT header is not in the memory map.\n");
+		return;
+	}
+
+	max = efi_mem_desc_end(&md);
+	if (max < efi.esrt) {
+		pr_err("EFI memory descriptor is invalid. (esrt: %p max: %p)\n",
+		       (void *)efi.esrt, (void *)max);
+		return;
+	}
+
+	size = sizeof(*esrt);
+	max -= efi.esrt;
+
+	if (max < size) {
+		pr_err("ESRT header doen't fit on single memory map entry. (size: %zu max: %zu)\n",
+		       size, max);
+		return;
+	}
+
+	va = early_memremap(efi.esrt, size);
+	if (!va) {
+		pr_err("early_memremap(%p, %zu) failed.\n", (void *)efi.esrt,
+		       size);
+		return;
+	}
+
+	memcpy(&tmpesrt, va, sizeof(tmpesrt));
+
+	if (tmpesrt.fw_resource_version == 1) {
+		entry_size = sizeof (*v1_entries);
+	} else {
+		pr_err("Unsupported ESRT version %lld.\n",
+		       tmpesrt.fw_resource_version);
+		return;
+	}
+
+	if (tmpesrt.fw_resource_count > 0 && max - size < entry_size) {
+		pr_err("ESRT memory map entry can only hold the header. (max: %zu size: %zu)\n",
+		       max - size, entry_size);
+		goto err_memunmap;
+	}
+
+	/*
+	 * The format doesn't really give us any boundary to test here,
+	 * so I'm making up 128 as the max number of individually updatable
+	 * components we support.
+	 * 128 should be pretty excessive, but there's still some chance
+	 * somebody will do that someday and we'll need to raise this.
+	 */
+	if (tmpesrt.fw_resource_count > 128) {
+		pr_err("ESRT says fw_resource_count has very large value %d.\n",
+		       tmpesrt.fw_resource_count);
+		goto err_memunmap;
+	}
+
+	/*
+	 * We know it can't be larger than N * sizeof() here, and N is limited
+	 * by the previous test to a small number, so there's no overflow.
+	 */
+	entries_size = tmpesrt.fw_resource_count * entry_size;
+	if (max < size + entries_size) {
+		pr_err("ESRT does not fit on single memory map entry (size: %zu max: %zu)\n",
+		       size, max);
+		goto err_memunmap;
+	}
+
+	/* remap it with our (plausible) new pages */
+	early_memunmap(va, size);
+	size += entries_size;
+	va = early_memremap(efi.esrt, size);
+	if (!va) {
+		pr_err("early_memremap(%p, %zu) failed.\n", (void *)efi.esrt,
+		       size);
+		return;
+	}
+
+	esrt_data = (phys_addr_t)efi.esrt;
+	esrt_data_size = size;
+
+	pr_info("Reserving ESRT space from %p to %p.\n", (void *)esrt_data,
+		(char *)esrt_data + size);
+	memblock_reserve(esrt_data, esrt_data_size);
+
+	pr_debug("esrt-init: loaded.\n");
+err_memunmap:
+	early_memunmap(va, size);
+}
+
+static int __init register_entries(void)
+{
+	struct efi_system_resource_entry_v1 *v1_entries = (void *)esrt->entries;
+	int i, rc;
+
+	if (!esrt_table_exists())
+		return 0;
+
+	for (i = 0; i < le32_to_cpu(esrt->fw_resource_count); i++) {
+		void *entry;
+		if (esrt->fw_resource_version == 1) {
+			entry = &v1_entries[i];
+		}
+		rc = esre_create_sysfs_entry(entry, i);
+		if (rc < 0) {
+			pr_err("ESRT entry creation failed with error %d.\n",
+			       rc);
+			return rc;
+		}
+	}
+	return 0;
+}
+
+static void cleanup_entry_list(void)
+{
+	struct esre_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &entry_list, list) {
+		kobject_put(&entry->kobj);
+	}
+}
+
+static int __init esrt_sysfs_init(void)
+{
+	int error;
+	struct efi_system_resource_table __iomem *ioesrt;
+
+	pr_debug("esrt-sysfs: loading.\n");
+	if (!esrt_data || !esrt_data_size)
+		return -ENOSYS;
+
+	ioesrt = ioremap(esrt_data, esrt_data_size);
+	if (!ioesrt) {
+		pr_err("ioremap(%p, %zu) failed.\n", (void *)esrt_data,
+		       esrt_data_size);
+		return -ENOMEM;
+	}
+
+	esrt = kmalloc(esrt_data_size, GFP_KERNEL);
+	if (!esrt) {
+		pr_err("kmalloc failed. (wanted %zu bytes)\n", esrt_data_size);
+		iounmap(ioesrt);
+		return -ENOMEM;
+	}
+
+	memcpy_fromio(esrt, ioesrt, esrt_data_size);
+
+	esrt_kobj = kobject_create_and_add("esrt", efi_kobj);
+	if (!esrt_kobj) {
+		pr_err("Firmware table registration failed.\n");
+		error = -ENOMEM;
+		goto err;
+	}
+
+	error = sysfs_create_group(esrt_kobj, &esrt_attr_group);
+	if (error) {
+		pr_err("Sysfs attribute export failed with error %d.\n",
+		       error);
+		goto err_remove_esrt;
+	}
+
+	esrt_kset = kset_create_and_add("entries", NULL, esrt_kobj);
+	if (!esrt_kset) {
+		pr_err("kset creation failed.\n");
+		error = -ENOMEM;
+		goto err_remove_group;
+	}
+
+	error = register_entries();
+	if (error)
+		goto err_cleanup_list;
+
+	memblock_remove(esrt_data, esrt_data_size);
+
+	pr_debug("esrt-sysfs: loaded.\n");
+
+	return 0;
+err_cleanup_list:
+	cleanup_entry_list();
+	kset_unregister(esrt_kset);
+err_remove_group:
+	sysfs_remove_group(esrt_kobj, &esrt_attr_group);
+err_remove_esrt:
+	kobject_put(esrt_kobj);
+err:
+	kfree(esrt);
+	esrt = NULL;
+	return error;
+}
+
+static void __exit esrt_sysfs_exit(void)
+{
+	pr_debug("esrt-sysfs: unloading.\n");
+	cleanup_entry_list();
+	kset_unregister(esrt_kset);
+	sysfs_remove_group(esrt_kobj, &esrt_attr_group);
+	kfree(esrt);
+	esrt = NULL;
+	kobject_del(esrt_kobj);
+	kobject_put(esrt_kobj);
+}
+
+module_init(esrt_sysfs_init);
+module_exit(esrt_sysfs_exit);
+
+MODULE_AUTHOR("Peter Jones <pjones@redhat.com>");
+MODULE_DESCRIPTION("EFI System Resource Table support");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/efi.h b/include/linux/efi.h
index af5be0368dec..024c27e7c0fa 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -583,6 +583,9 @@ void efi_native_runtime_setup(void);
 #define EFI_FILE_INFO_ID \
     EFI_GUID(  0x9576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b )
 
+#define EFI_SYSTEM_RESOURCE_TABLE_GUID \
+    EFI_GUID(  0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80 )
+
 #define EFI_FILE_SYSTEM_GUID \
     EFI_GUID(  0x964e5b22, 0x6459, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b )
 
@@ -823,6 +826,7 @@ extern struct efi {
 	unsigned long fw_vendor;	/* fw_vendor */
 	unsigned long runtime;		/* runtime table */
 	unsigned long config_table;	/* config tables */
+	unsigned long esrt;		/* ESRT table */
 	efi_get_time_t *get_time;
 	efi_set_time_t *set_time;
 	efi_get_wakeup_time_t *get_wakeup_time;
@@ -875,6 +879,7 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon
 #endif
 extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
 extern int efi_config_init(efi_config_table_type_t *arch_tables);
+extern void __init efi_esrt_init(void);
 extern int efi_config_parse_tables(void *config_tables, int count, int sz,
 				   efi_config_table_type_t *arch_tables);
 extern u64 efi_get_iobase (void);
@@ -882,12 +887,15 @@ extern u32 efi_mem_type (unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
 extern int __init efi_uart_console_only (void);
+extern u64 efi_mem_desc_end(efi_memory_desc_t *md);
+extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md);
 extern void efi_initialize_iomem_resources(struct resource *code_resource,
 		struct resource *data_resource, struct resource *bss_resource);
 extern void efi_get_time(struct timespec *now);
 extern void efi_reserve_boot_services(void);
 extern int efi_get_fdt_params(struct efi_fdt_params *params, int verbose);
 extern struct efi_memory_map memmap;
+extern struct kobject *efi_kobj;
 
 extern int efi_reboot_quirk_mode;
 extern bool efi_poweroff_required(void);
-- 
cgit v1.2.3


From fb7737e949e31d8a71acee6bbb670f32dbd2a2c0 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 4 Mar 2015 20:01:14 -0600
Subject: hwspinlock/core: add device tree support

This patch adds a new OF-friendly API of_hwspin_lock_get_id()
for hwspinlock clients to use/request locks from a hwspinlock
device instantiated through a device-tree blob. This new API
can be used by hwspinlock clients to get the id for a specific
lock using the phandle + args specifier, so that it can be
requested using the available hwspin_lock_request_specific()
API.

Signed-off-by: Suman Anna <s-anna@ti.com>
Reviewed-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
[small comment clarification]
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 Documentation/hwspinlock.txt         | 10 +++++
 drivers/hwspinlock/hwspinlock_core.c | 79 ++++++++++++++++++++++++++++++++++++
 include/linux/hwspinlock.h           |  7 ++++
 3 files changed, 96 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/hwspinlock.txt b/Documentation/hwspinlock.txt
index 62f7d4ea6e26..61c1ee98e59f 100644
--- a/Documentation/hwspinlock.txt
+++ b/Documentation/hwspinlock.txt
@@ -48,6 +48,16 @@ independent, drivers.
      ids for predefined purposes.
      Should be called from a process context (might sleep).
 
+  int of_hwspin_lock_get_id(struct device_node *np, int index);
+   - retrieve the global lock id for an OF phandle-based specific lock.
+     This function provides a means for DT users of a hwspinlock module
+     to get the global lock id of a specific hwspinlock, so that it can
+     be requested using the normal hwspin_lock_request_specific() API.
+     The function returns a lock id number on success, -EPROBE_DEFER if
+     the hwspinlock device is not yet registered with the core, or other
+     error values.
+     Should be called from a process context (might sleep).
+
   int hwspin_lock_free(struct hwspinlock *hwlock);
    - free a previously-assigned hwspinlock; returns 0 on success, or an
      appropriate error code on failure (e.g. -EINVAL if the hwspinlock
diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c
index 461a0d739d75..52f708bcf77f 100644
--- a/drivers/hwspinlock/hwspinlock_core.c
+++ b/drivers/hwspinlock/hwspinlock_core.c
@@ -27,6 +27,7 @@
 #include <linux/hwspinlock.h>
 #include <linux/pm_runtime.h>
 #include <linux/mutex.h>
+#include <linux/of.h>
 
 #include "hwspinlock_internal.h"
 
@@ -257,6 +258,84 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 }
 EXPORT_SYMBOL_GPL(__hwspin_unlock);
 
+/**
+ * of_hwspin_lock_simple_xlate - translate hwlock_spec to return a lock id
+ * @bank: the hwspinlock device bank
+ * @hwlock_spec: hwlock specifier as found in the device tree
+ *
+ * This is a simple translation function, suitable for hwspinlock platform
+ * drivers that only has a lock specifier length of 1.
+ *
+ * Returns a relative index of the lock within a specified bank on success,
+ * or -EINVAL on invalid specifier cell count.
+ */
+static inline int
+of_hwspin_lock_simple_xlate(const struct of_phandle_args *hwlock_spec)
+{
+	if (WARN_ON(hwlock_spec->args_count != 1))
+		return -EINVAL;
+
+	return hwlock_spec->args[0];
+}
+
+/**
+ * of_hwspin_lock_get_id() - get lock id for an OF phandle-based specific lock
+ * @np: device node from which to request the specific hwlock
+ * @index: index of the hwlock in the list of values
+ *
+ * This function provides a means for DT users of the hwspinlock module to
+ * get the global lock id of a specific hwspinlock using the phandle of the
+ * hwspinlock device, so that it can be requested using the normal
+ * hwspin_lock_request_specific() API.
+ *
+ * Returns the global lock id number on success, -EPROBE_DEFER if the hwspinlock
+ * device is not yet registered, -EINVAL on invalid args specifier value or an
+ * appropriate error as returned from the OF parsing of the DT client node.
+ */
+int of_hwspin_lock_get_id(struct device_node *np, int index)
+{
+	struct of_phandle_args args;
+	struct hwspinlock *hwlock;
+	struct radix_tree_iter iter;
+	void **slot;
+	int id;
+	int ret;
+
+	ret = of_parse_phandle_with_args(np, "hwlocks", "#hwlock-cells", index,
+					 &args);
+	if (ret)
+		return ret;
+
+	/* Find the hwspinlock device: we need its base_id */
+	ret = -EPROBE_DEFER;
+	rcu_read_lock();
+	radix_tree_for_each_slot(slot, &hwspinlock_tree, &iter, 0) {
+		hwlock = radix_tree_deref_slot(slot);
+		if (unlikely(!hwlock))
+			continue;
+
+		if (hwlock->bank->dev->of_node == args.np) {
+			ret = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (ret < 0)
+		goto out;
+
+	id = of_hwspin_lock_simple_xlate(&args);
+	if (id < 0 || id >= hwlock->bank->num_locks) {
+		ret = -EINVAL;
+		goto out;
+	}
+	id += hwlock->bank->base_id;
+
+out:
+	of_node_put(args.np);
+	return ret ? ret : id;
+}
+EXPORT_SYMBOL_GPL(of_hwspin_lock_get_id);
+
 static int hwspin_lock_register_single(struct hwspinlock *hwlock, int id)
 {
 	struct hwspinlock *tmp;
diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h
index 3343298e40e8..859d673d98c8 100644
--- a/include/linux/hwspinlock.h
+++ b/include/linux/hwspinlock.h
@@ -26,6 +26,7 @@
 #define HWLOCK_IRQ	0x02	/* Disable interrupts, don't save state */
 
 struct device;
+struct device_node;
 struct hwspinlock;
 struct hwspinlock_device;
 struct hwspinlock_ops;
@@ -66,6 +67,7 @@ int hwspin_lock_unregister(struct hwspinlock_device *bank);
 struct hwspinlock *hwspin_lock_request(void);
 struct hwspinlock *hwspin_lock_request_specific(unsigned int id);
 int hwspin_lock_free(struct hwspinlock *hwlock);
+int of_hwspin_lock_get_id(struct device_node *np, int index);
 int hwspin_lock_get_id(struct hwspinlock *hwlock);
 int __hwspin_lock_timeout(struct hwspinlock *, unsigned int, int,
 							unsigned long *);
@@ -120,6 +122,11 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 {
 }
 
+static inline int of_hwspin_lock_get_id(struct device_node *np, int index)
+{
+	return 0;
+}
+
 static inline int hwspin_lock_get_id(struct hwspinlock *hwlock)
 {
 	return 0;
-- 
cgit v1.2.3


From d54385ce68cd18ab002b46f61246ad197cec92de Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 30 Apr 2015 14:53:54 -0700
Subject: etherdev: Process is_multicast_ether_addr at same size as other
 operations

This change makes it so that we process the address in
is_multicast_ether_addr at the same size as the other calls.  This allows
us to avoid duplicate reads when used with other calls such as
is_zero_ether_addr or eth_addr_copy.  In addition I have added a 64 bit
version of the function so in eth_type_trans we can process the destination
address as a 64 bit value throughout.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 24 +++++++++++++++++++++++-
 net/ethernet/eth.c          |  2 +-
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 606563ef8a72..c4a10f991fe0 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -110,7 +110,29 @@ static inline bool is_zero_ether_addr(const u8 *addr)
  */
 static inline bool is_multicast_ether_addr(const u8 *addr)
 {
-	return 0x01 & addr[0];
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+	u32 a = *(const u32 *)addr;
+#else
+	u16 a = *(const u16 *)addr;
+#endif
+#ifdef __BIG_ENDIAN
+	return 0x01 & (a >> ((sizeof(a) * 8) - 8));
+#else
+	return 0x01 & a;
+#endif
+}
+
+static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2])
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+#ifdef __BIG_ENDIAN
+	return 0x01 & ((*(const u64 *)addr) >> 56);
+#else
+	return 0x01 & (*(const u64 *)addr);
+#endif
+#else
+	return is_multicast_ether_addr(addr);
+#endif
 }
 
 /**
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 60069318d5d1..21c211e9fd5a 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -159,7 +159,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	skb_pull_inline(skb, ETH_HLEN);
 	eth = eth_hdr(skb);
 
-	if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
+	if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
 		if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
 			skb->pkt_type = PACKET_BROADCAST;
 		else
-- 
cgit v1.2.3


From 50fb799289501c2eab9f43fc9af513027e1e994f Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 1 May 2015 11:30:12 -0700
Subject: net: Add skb_get_hash_perturb

This calls flow_disect and __skb_get_hash to procure a hash for a
packet. Input includes a key to initialize jhash. This function
does not set skb->hash.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  2 ++
 net/core/flow_dissector.c | 38 ++++++++++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 66e374d62f64..acb83e249e3f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -927,6 +927,8 @@ static inline __u32 skb_get_hash(struct sk_buff *skb)
 	return skb->hash;
 }
 
+__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb);
+
 static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
 {
 	return skb->hash;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2c35c02a931e..b1df995ef06c 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -267,13 +267,12 @@ static __always_inline void __flow_hash_secret_init(void)
 	net_get_random_once(&hashrnd, sizeof(hashrnd));
 }
 
-static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
+static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval)
 {
-	__flow_hash_secret_init();
-	return jhash_3words(a, b, c, hashrnd);
+	return jhash_3words(a, b, c, keyval);
 }
 
-static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 {
 	u32 hash;
 
@@ -287,7 +286,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
 
 	hash = __flow_hash_3words((__force u32)keys->dst,
 				  (__force u32)keys->src,
-				  (__force u32)keys->ports);
+				  (__force u32)keys->ports,
+				  keyval);
 	if (!hash)
 		hash = 1;
 
@@ -296,10 +296,20 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
 
 u32 flow_hash_from_keys(struct flow_keys *keys)
 {
-	return __flow_hash_from_keys(keys);
+	__flow_hash_secret_init();
+	return __flow_hash_from_keys(keys, hashrnd);
 }
 EXPORT_SYMBOL(flow_hash_from_keys);
 
+static inline u32 ___skb_get_hash(const struct sk_buff *skb,
+				  struct flow_keys *keys, u32 keyval)
+{
+	if (!skb_flow_dissect(skb, keys))
+		return 0;
+
+	return __flow_hash_from_keys(keys, keyval);
+}
+
 /*
  * __skb_get_hash: calculate a flow hash based on src/dst addresses
  * and src/dst port numbers.  Sets hash in skb to non-zero hash value
@@ -309,8 +319,12 @@ EXPORT_SYMBOL(flow_hash_from_keys);
 void __skb_get_hash(struct sk_buff *skb)
 {
 	struct flow_keys keys;
+	u32 hash;
 
-	if (!skb_flow_dissect(skb, &keys))
+	__flow_hash_secret_init();
+
+	hash = ___skb_get_hash(skb, &keys, hashrnd);
+	if (!hash)
 		return;
 
 	if (keys.ports)
@@ -318,10 +332,18 @@ void __skb_get_hash(struct sk_buff *skb)
 
 	skb->sw_hash = 1;
 
-	skb->hash = __flow_hash_from_keys(&keys);
+	skb->hash = hash;
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
+__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+{
+	struct flow_keys keys;
+
+	return ___skb_get_hash(skb, &keys, perturb);
+}
+EXPORT_SYMBOL(skb_get_hash_perturb);
+
 /*
  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
  * to be used as a distribution range.
-- 
cgit v1.2.3


From 20f56758b04364c3c139edbffde8cfaf0307edee Mon Sep 17 00:00:00 2001
From: Jacek Anaszewski <j.anaszewski@samsung.com>
Date: Tue, 28 Apr 2015 00:18:41 -0700
Subject: leds: unify the location of led-trigger API

Part of led-trigger API was in the private drivers/leds/leds.h header.
Move it to the include/linux/leds.h header to unify the API location
and announce it as public. It has been already exported from
led-triggers.c with EXPORT_SYMBOL_GPL macro. The no-op definitions are
changed from macros to inline to match the style of the surrounding code.

Signed-off-by: Jacek Anaszewski <j.anaszewski@samsung.com>
Cc: Richard Purdie <rpurdie@rpsys.net>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Bryan Wu <cooloney@gmail.com>
---
 drivers/leds/leds.h  | 24 ------------------------
 include/linux/leds.h | 25 +++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/leds.h b/drivers/leds/leds.h
index 79efe57c7405..bc89d7ace2c4 100644
--- a/drivers/leds/leds.h
+++ b/drivers/leds/leds.h
@@ -13,7 +13,6 @@
 #ifndef __LEDS_H_INCLUDED
 #define __LEDS_H_INCLUDED
 
-#include <linux/device.h>
 #include <linux/rwsem.h>
 #include <linux/leds.h>
 
@@ -50,27 +49,4 @@ void led_stop_software_blink(struct led_classdev *led_cdev);
 extern struct rw_semaphore leds_list_lock;
 extern struct list_head leds_list;
 
-#ifdef CONFIG_LEDS_TRIGGERS
-void led_trigger_set_default(struct led_classdev *led_cdev);
-void led_trigger_set(struct led_classdev *led_cdev,
-			struct led_trigger *trigger);
-void led_trigger_remove(struct led_classdev *led_cdev);
-
-static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
-{
-	return led_cdev->trigger_data;
-}
-
-#else
-#define led_trigger_set_default(x) do {} while (0)
-#define led_trigger_set(x, y) do {} while (0)
-#define led_trigger_remove(x) do {} while (0)
-#define led_get_trigger_data(x) (NULL)
-#endif
-
-ssize_t led_trigger_store(struct device *dev, struct device_attribute *attr,
-			const char *buf, size_t count);
-ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr,
-			char *buf);
-
 #endif	/* __LEDS_H_INCLUDED */
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 9a2b000094cf..b122eeafb5dc 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -12,6 +12,7 @@
 #ifndef __LINUX_LEDS_H_INCLUDED
 #define __LINUX_LEDS_H_INCLUDED
 
+#include <linux/device.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/rwsem.h>
@@ -222,6 +223,11 @@ struct led_trigger {
 	struct list_head  next_trig;
 };
 
+ssize_t led_trigger_store(struct device *dev, struct device_attribute *attr,
+			const char *buf, size_t count);
+ssize_t led_trigger_show(struct device *dev, struct device_attribute *attr,
+			char *buf);
+
 /* Registration functions for complex triggers */
 extern int led_trigger_register(struct led_trigger *trigger);
 extern void led_trigger_unregister(struct led_trigger *trigger);
@@ -238,6 +244,16 @@ extern void led_trigger_blink_oneshot(struct led_trigger *trigger,
 				      unsigned long *delay_on,
 				      unsigned long *delay_off,
 				      int invert);
+extern void led_trigger_set_default(struct led_classdev *led_cdev);
+extern void led_trigger_set(struct led_classdev *led_cdev,
+			struct led_trigger *trigger);
+extern void led_trigger_remove(struct led_classdev *led_cdev);
+
+static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
+{
+	return led_cdev->trigger_data;
+}
+
 /**
  * led_trigger_rename_static - rename a trigger
  * @name: the new trigger name
@@ -267,6 +283,15 @@ static inline void led_trigger_register_simple(const char *name,
 static inline void led_trigger_unregister_simple(struct led_trigger *trigger) {}
 static inline void led_trigger_event(struct led_trigger *trigger,
 				enum led_brightness event) {}
+static inline void led_trigger_set_default(struct led_classdev *led_cdev) {}
+static inline void led_trigger_set(struct led_classdev *led_cdev,
+				struct led_trigger *trigger) {}
+static inline void led_trigger_remove(struct led_classdev *led_cdev) {}
+static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
+{
+	return NULL;
+}
+
 #endif /* CONFIG_LEDS_TRIGGERS */
 
 /* Trigger specific functions */
-- 
cgit v1.2.3


From 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Sat, 2 May 2015 14:01:07 +0200
Subject: net: Export IGMP/MLD message validation code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this patch, the IGMP and MLD message validation functions are moved
from the bridge code to IPv4/IPv6 multicast files. Some small
refactoring was done to enhance readibility and to iron out some
differences in behaviour between the IGMP and MLD parsing code (e.g. the
skb-cloning of MLD messages is now only done if necessary, just like the
IGMP part always did).

Finally, these IGMP and MLD message validation functions are exported so
that not only the bridge can use it but batman-adv later, too.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h      |   1 +
 include/linux/skbuff.h    |   3 +
 include/net/addrconf.h    |   1 +
 net/bridge/br_multicast.c | 218 +++++++---------------------------------------
 net/core/skbuff.c         |  87 ++++++++++++++++++
 net/ipv4/igmp.c           | 162 ++++++++++++++++++++++++++++++++++
 net/ipv6/Makefile         |   1 +
 net/ipv6/mcast_snoop.c    | 213 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 498 insertions(+), 188 deletions(-)
 create mode 100644 net/ipv6/mcast_snoop.c

(limited to 'include/linux')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 2c677afeea47..193ad488d3e2 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -130,5 +130,6 @@ extern void ip_mc_unmap(struct in_device *);
 extern void ip_mc_remap(struct in_device *);
 extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr);
 extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr);
+int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed);
 
 #endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index acb83e249e3f..9c2f793573fa 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3419,6 +3419,9 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb)
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 
 int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+				     unsigned int transport_len,
+				     __sum16(*skb_chkf)(struct sk_buff *skb));
 
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 80456f72d70a..def59d3a34d5 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -142,6 +142,7 @@ void ipv6_mc_unmap(struct inet6_dev *idev);
 void ipv6_mc_remap(struct inet6_dev *idev);
 void ipv6_mc_init_dev(struct inet6_dev *idev);
 void ipv6_mc_destroy_dev(struct inet6_dev *idev);
+int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed);
 void addrconf_dad_failure(struct inet6_ifaddr *ifp);
 
 bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index b52f4cb8aee9..2d69d5cab52f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -975,9 +975,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 	int err = 0;
 	__be32 group;
 
-	if (!pskb_may_pull(skb, sizeof(*ih)))
-		return -EINVAL;
-
 	ih = igmpv3_report_hdr(skb);
 	num = ntohs(ih->ngrec);
 	len = sizeof(*ih);
@@ -1248,25 +1245,14 @@ static int br_ip4_multicast_query(struct net_bridge *br,
 			max_delay = 10 * HZ;
 			group = 0;
 		}
-	} else {
-		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) {
-			err = -EINVAL;
-			goto out;
-		}
-
+	} else if (skb->len >= sizeof(*ih3)) {
 		ih3 = igmpv3_query_hdr(skb);
 		if (ih3->nsrcs)
 			goto out;
 
 		max_delay = ih3->code ?
 			    IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1;
-	}
-
-	/* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
-	 * all-systems destination addresses (224.0.0.1) for general queries
-	 */
-	if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) {
-		err = -EINVAL;
+	} else {
 		goto out;
 	}
 
@@ -1329,12 +1315,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 	    (port && port->state == BR_STATE_DISABLED))
 		goto out;
 
-	/* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
-	if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) {
-		err = -EINVAL;
-		goto out;
-	}
-
 	if (skb->len == sizeof(*mld)) {
 		if (!pskb_may_pull(skb, sizeof(*mld))) {
 			err = -EINVAL;
@@ -1358,14 +1338,6 @@ static int br_ip6_multicast_query(struct net_bridge *br,
 
 	is_general_query = group && ipv6_addr_any(group);
 
-	/* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
-	 * all-nodes destination address (ff02::1) for general queries
-	 */
-	if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) {
-		err = -EINVAL;
-		goto out;
-	}
-
 	if (is_general_query) {
 		saddr.proto = htons(ETH_P_IPV6);
 		saddr.u.ip6 = ip6h->saddr;
@@ -1557,66 +1529,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 				 struct sk_buff *skb,
 				 u16 vid)
 {
-	struct sk_buff *skb2 = skb;
-	const struct iphdr *iph;
+	struct sk_buff *skb_trimmed = NULL;
 	struct igmphdr *ih;
-	unsigned int len;
-	unsigned int offset;
 	int err;
 
-	/* We treat OOM as packet loss for now. */
-	if (!pskb_may_pull(skb, sizeof(*iph)))
-		return -EINVAL;
-
-	iph = ip_hdr(skb);
-
-	if (iph->ihl < 5 || iph->version != 4)
-		return -EINVAL;
-
-	if (!pskb_may_pull(skb, ip_hdrlen(skb)))
-		return -EINVAL;
-
-	iph = ip_hdr(skb);
+	err = ip_mc_check_igmp(skb, &skb_trimmed);
 
-	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
-		return -EINVAL;
-
-	if (iph->protocol != IPPROTO_IGMP) {
-		if (!ipv4_is_local_multicast(iph->daddr))
+	if (err == -ENOMSG) {
+		if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr))
 			BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
 		return 0;
+	} else if (err < 0) {
+		return err;
 	}
 
-	len = ntohs(iph->tot_len);
-	if (skb->len < len || len < ip_hdrlen(skb))
-		return -EINVAL;
-
-	if (skb->len > len) {
-		skb2 = skb_clone(skb, GFP_ATOMIC);
-		if (!skb2)
-			return -ENOMEM;
-
-		err = pskb_trim_rcsum(skb2, len);
-		if (err)
-			goto err_out;
-	}
-
-	len -= ip_hdrlen(skb2);
-	offset = skb_network_offset(skb2) + ip_hdrlen(skb2);
-	__skb_pull(skb2, offset);
-	skb_reset_transport_header(skb2);
-
-	err = -EINVAL;
-	if (!pskb_may_pull(skb2, sizeof(*ih)))
-		goto out;
-
-	if (skb_checksum_simple_validate(skb2))
-		goto out;
-
-	err = 0;
-
 	BR_INPUT_SKB_CB(skb)->igmp = 1;
-	ih = igmp_hdr(skb2);
+	ih = igmp_hdr(skb);
 
 	switch (ih->type) {
 	case IGMP_HOST_MEMBERSHIP_REPORT:
@@ -1625,21 +1553,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
 		err = br_ip4_multicast_add_group(br, port, ih->group, vid);
 		break;
 	case IGMPV3_HOST_MEMBERSHIP_REPORT:
-		err = br_ip4_multicast_igmp3_report(br, port, skb2, vid);
+		err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
 		break;
 	case IGMP_HOST_MEMBERSHIP_QUERY:
-		err = br_ip4_multicast_query(br, port, skb2, vid);
+		err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case IGMP_HOST_LEAVE_MESSAGE:
 		br_ip4_multicast_leave_group(br, port, ih->group, vid);
 		break;
 	}
 
-out:
-	__skb_push(skb2, offset);
-err_out:
-	if (skb2 != skb)
-		kfree_skb(skb2);
+	if (skb_trimmed)
+		kfree_skb(skb_trimmed);
+
 	return err;
 }
 
@@ -1649,126 +1575,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
 				 struct sk_buff *skb,
 				 u16 vid)
 {
-	struct sk_buff *skb2;
-	const struct ipv6hdr *ip6h;
-	u8 icmp6_type;
-	u8 nexthdr;
-	__be16 frag_off;
-	unsigned int len;
-	int offset;
+	struct sk_buff *skb_trimmed = NULL;
+	struct mld_msg *mld;
 	int err;
 
-	if (!pskb_may_pull(skb, sizeof(*ip6h)))
-		return -EINVAL;
+	err = ipv6_mc_check_mld(skb, &skb_trimmed);
 
-	ip6h = ipv6_hdr(skb);
-
-	/*
-	 * We're interested in MLD messages only.
-	 *  - Version is 6
-	 *  - MLD has always Router Alert hop-by-hop option
-	 *  - But we do not support jumbrograms.
-	 */
-	if (ip6h->version != 6)
-		return 0;
-
-	/* Prevent flooding this packet if there is no listener present */
-	if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr))
-		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
-
-	if (ip6h->nexthdr != IPPROTO_HOPOPTS ||
-	    ip6h->payload_len == 0)
-		return 0;
-
-	len = ntohs(ip6h->payload_len) + sizeof(*ip6h);
-	if (skb->len < len)
-		return -EINVAL;
-
-	nexthdr = ip6h->nexthdr;
-	offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off);
-
-	if (offset < 0 || nexthdr != IPPROTO_ICMPV6)
+	if (err == -ENOMSG) {
+		if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
+			BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
 		return 0;
-
-	/* Okay, we found ICMPv6 header */
-	skb2 = skb_clone(skb, GFP_ATOMIC);
-	if (!skb2)
-		return -ENOMEM;
-
-	err = -EINVAL;
-	if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr)))
-		goto out;
-
-	len -= offset - skb_network_offset(skb2);
-
-	__skb_pull(skb2, offset);
-	skb_reset_transport_header(skb2);
-	skb_postpull_rcsum(skb2, skb_network_header(skb2),
-			   skb_network_header_len(skb2));
-
-	icmp6_type = icmp6_hdr(skb2)->icmp6_type;
-
-	switch (icmp6_type) {
-	case ICMPV6_MGM_QUERY:
-	case ICMPV6_MGM_REPORT:
-	case ICMPV6_MGM_REDUCTION:
-	case ICMPV6_MLD2_REPORT:
-		break;
-	default:
-		err = 0;
-		goto out;
-	}
-
-	/* Okay, we found MLD message. Check further. */
-	if (skb2->len > len) {
-		err = pskb_trim_rcsum(skb2, len);
-		if (err)
-			goto out;
-		err = -EINVAL;
+	} else if (err < 0) {
+		return err;
 	}
 
-	ip6h = ipv6_hdr(skb2);
-
-	if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo))
-		goto out;
-
-	err = 0;
-
 	BR_INPUT_SKB_CB(skb)->igmp = 1;
+	mld = (struct mld_msg *)skb_transport_header(skb);
 
-	switch (icmp6_type) {
+	switch (mld->mld_type) {
 	case ICMPV6_MGM_REPORT:
-	    {
-		struct mld_msg *mld;
-		if (!pskb_may_pull(skb2, sizeof(*mld))) {
-			err = -EINVAL;
-			goto out;
-		}
-		mld = (struct mld_msg *)skb_transport_header(skb2);
 		BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
 		err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
 		break;
-	    }
 	case ICMPV6_MLD2_REPORT:
-		err = br_ip6_multicast_mld2_report(br, port, skb2, vid);
+		err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
 		break;
 	case ICMPV6_MGM_QUERY:
-		err = br_ip6_multicast_query(br, port, skb2, vid);
+		err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
 		break;
 	case ICMPV6_MGM_REDUCTION:
-	    {
-		struct mld_msg *mld;
-		if (!pskb_may_pull(skb2, sizeof(*mld))) {
-			err = -EINVAL;
-			goto out;
-		}
-		mld = (struct mld_msg *)skb_transport_header(skb2);
 		br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
-	    }
+		break;
 	}
 
-out:
-	kfree_skb(skb2);
+	if (skb_trimmed)
+		kfree_skb(skb_trimmed);
+
 	return err;
 }
 #endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3cfff2a3d651..1e4278a4dd7e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4030,6 +4030,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
 }
 EXPORT_SYMBOL(skb_checksum_setup);
 
+/**
+ * skb_checksum_maybe_trim - maybe trims the given skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ *
+ * Checks whether the given skb has data beyond the given transport length.
+ * If so, returns a cloned skb trimmed to this transport length.
+ * Otherwise returns the provided skb. Returns NULL in error cases
+ * (e.g. transport_len exceeds skb length or out-of-memory).
+ *
+ * Caller needs to set the skb transport header and release the returned skb.
+ * Provided skb is consumed.
+ */
+static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
+					       unsigned int transport_len)
+{
+	struct sk_buff *skb_chk;
+	unsigned int len = skb_transport_offset(skb) + transport_len;
+	int ret;
+
+	if (skb->len < len) {
+		kfree_skb(skb);
+		return NULL;
+	} else if (skb->len == len) {
+		return skb;
+	}
+
+	skb_chk = skb_clone(skb, GFP_ATOMIC);
+	kfree_skb(skb);
+
+	if (!skb_chk)
+		return NULL;
+
+	ret = pskb_trim_rcsum(skb_chk, len);
+	if (ret) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	return skb_chk;
+}
+
+/**
+ * skb_checksum_trimmed - validate checksum of an skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ * @skb_chkf: checksum function to use
+ *
+ * Applies the given checksum function skb_chkf to the provided skb.
+ * Returns a checked and maybe trimmed skb. Returns NULL on error.
+ *
+ * If the skb has data beyond the given transport length, then a
+ * trimmed & cloned skb is checked and returned.
+ *
+ * Caller needs to set the skb transport header and release the returned skb.
+ * Provided skb is consumed.
+ */
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+				     unsigned int transport_len,
+				     __sum16(*skb_chkf)(struct sk_buff *skb))
+{
+	struct sk_buff *skb_chk;
+	unsigned int offset = skb_transport_offset(skb);
+	int ret;
+
+	skb_chk = skb_checksum_maybe_trim(skb, transport_len);
+	if (!skb_chk)
+		return NULL;
+
+	if (!pskb_may_pull(skb_chk, offset)) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	__skb_pull(skb_chk, offset);
+	ret = skb_chkf(skb_chk);
+	__skb_push(skb_chk, offset);
+
+	if (ret) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	return skb_chk;
+}
+EXPORT_SYMBOL(skb_checksum_trimmed);
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb)
 {
 	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a3a697f5ffba..651cdf648ec4 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1339,6 +1339,168 @@ out:
 }
 EXPORT_SYMBOL(ip_mc_inc_group);
 
+static int ip_mc_check_iphdr(struct sk_buff *skb)
+{
+	const struct iphdr *iph;
+	unsigned int len;
+	unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
+
+	if (!pskb_may_pull(skb, offset))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);
+
+	if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
+		return -EINVAL;
+
+	offset += ip_hdrlen(skb) - sizeof(*iph);
+
+	if (!pskb_may_pull(skb, offset))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		return -EINVAL;
+
+	len = skb_network_offset(skb) + ntohs(iph->tot_len);
+	if (skb->len < len || len < offset)
+		return -EINVAL;
+
+	skb_set_transport_header(skb, offset);
+
+	return 0;
+}
+
+static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
+{
+	unsigned int len = skb_transport_offset(skb);
+
+	len += sizeof(struct igmpv3_report);
+
+	return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ip_mc_check_igmp_query(struct sk_buff *skb)
+{
+	unsigned int len = skb_transport_offset(skb);
+
+	len += sizeof(struct igmphdr);
+	if (skb->len < len)
+		return -EINVAL;
+
+	/* IGMPv{1,2}? */
+	if (skb->len != len) {
+		/* or IGMPv3? */
+		len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr);
+		if (skb->len < len || !pskb_may_pull(skb, len))
+			return -EINVAL;
+	}
+
+	/* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
+	 * all-systems destination addresses (224.0.0.1) for general queries
+	 */
+	if (!igmp_hdr(skb)->group &&
+	    ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ip_mc_check_igmp_msg(struct sk_buff *skb)
+{
+	switch (igmp_hdr(skb)->type) {
+	case IGMP_HOST_LEAVE_MESSAGE:
+	case IGMP_HOST_MEMBERSHIP_REPORT:
+	case IGMPV2_HOST_MEMBERSHIP_REPORT:
+		/* fall through */
+		return 0;
+	case IGMPV3_HOST_MEMBERSHIP_REPORT:
+		return ip_mc_check_igmp_reportv3(skb);
+	case IGMP_HOST_MEMBERSHIP_QUERY:
+		return ip_mc_check_igmp_query(skb);
+	default:
+		return -ENOMSG;
+	}
+}
+
+static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+{
+	return skb_checksum_simple_validate(skb);
+}
+
+static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+
+{
+	struct sk_buff *skb_chk;
+	unsigned int transport_len;
+	unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
+	int ret;
+
+	transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+
+	skb_get(skb);
+	skb_chk = skb_checksum_trimmed(skb, transport_len,
+				       ip_mc_validate_checksum);
+	if (!skb_chk)
+		return -EINVAL;
+
+	if (!pskb_may_pull(skb_chk, len)) {
+		kfree_skb(skb_chk);
+		return -EINVAL;
+	}
+
+	ret = ip_mc_check_igmp_msg(skb_chk);
+	if (ret) {
+		kfree_skb(skb_chk);
+		return ret;
+	}
+
+	if (skb_trimmed)
+		*skb_trimmed = skb_chk;
+	else
+		kfree_skb(skb_chk);
+
+	return 0;
+}
+
+/**
+ * ip_mc_check_igmp - checks whether this is a sane IGMP packet
+ * @skb: the skb to validate
+ * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional)
+ *
+ * Checks whether an IPv4 packet is a valid IGMP packet. If so sets
+ * skb network and transport headers accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ *  standard
+ * -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Optionally, an skb pointer might be provided via skb_trimmed (or set it
+ * to NULL): After parsing an IGMP packet successfully it will point to
+ * an skb which has its tail aligned to the IP packet end. This might
+ * either be the originally provided skb or a trimmed, cloned version if
+ * the skb frame had data beyond the IP packet. A cloned skb allows us
+ * to leave the original skb and its full frame unchanged (which might be
+ * desirable for layer 2 frame jugglers).
+ *
+ * The caller needs to release a reference count from any returned skb_trimmed.
+ */
+int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+{
+	int ret = ip_mc_check_iphdr(skb);
+
+	if (ret < 0)
+		return ret;
+
+	if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
+		return -ENOMSG;
+
+	return __ip_mc_check_igmp(skb, skb_trimmed);
+}
+EXPORT_SYMBOL(ip_mc_check_igmp);
+
 /*
  *	Resend IGMP JOIN report; used by netdev notifier.
  */
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2e8c06108ab9..0f3f1999719a 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -48,4 +48,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
 
 ifneq ($(CONFIG_IPV6),)
 obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o
+obj-y += mcast_snoop.o
 endif
diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c
new file mode 100644
index 000000000000..1a2cbc13a7d3
--- /dev/null
+++ b/net/ipv6/mcast_snoop.c
@@ -0,0 +1,213 @@
+/* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki.
+ */
+
+#include <linux/skbuff.h>
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/addrconf.h>
+#include <net/ip6_checksum.h>
+
+static int ipv6_mc_check_ip6hdr(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h;
+	unsigned int len;
+	unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h);
+
+	if (!pskb_may_pull(skb, offset))
+		return -EINVAL;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->version != 6)
+		return -EINVAL;
+
+	len = offset + ntohs(ip6h->payload_len);
+	if (skb->len < len || len <= offset)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ipv6_mc_check_exthdrs(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h;
+	unsigned int offset;
+	u8 nexthdr;
+	__be16 frag_off;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->nexthdr != IPPROTO_HOPOPTS)
+		return -ENOMSG;
+
+	nexthdr = ip6h->nexthdr;
+	offset = skb_network_offset(skb) + sizeof(*ip6h);
+	offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+
+	if (offset < 0)
+		return -EINVAL;
+
+	if (nexthdr != IPPROTO_ICMPV6)
+		return -ENOMSG;
+
+	skb_set_transport_header(skb, offset);
+
+	return 0;
+}
+
+static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb)
+{
+	unsigned int len = skb_transport_offset(skb);
+
+	len += sizeof(struct mld2_report);
+
+	return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+}
+
+static int ipv6_mc_check_mld_query(struct sk_buff *skb)
+{
+	struct mld_msg *mld;
+	unsigned int len = skb_transport_offset(skb);
+
+	/* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
+	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
+		return -EINVAL;
+
+	len += sizeof(struct mld_msg);
+	if (skb->len < len)
+		return -EINVAL;
+
+	/* MLDv1? */
+	if (skb->len != len) {
+		/* or MLDv2? */
+		len += sizeof(struct mld2_query) - sizeof(struct mld_msg);
+		if (skb->len < len || !pskb_may_pull(skb, len))
+			return -EINVAL;
+	}
+
+	mld = (struct mld_msg *)skb_transport_header(skb);
+
+	/* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
+	 * all-nodes destination address (ff02::1) for general queries
+	 */
+	if (ipv6_addr_any(&mld->mld_mca) &&
+	    !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
+{
+	struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb);
+
+	switch (mld->mld_type) {
+	case ICMPV6_MGM_REDUCTION:
+	case ICMPV6_MGM_REPORT:
+		/* fall through */
+		return 0;
+	case ICMPV6_MLD2_REPORT:
+		return ipv6_mc_check_mld_reportv2(skb);
+	case ICMPV6_MGM_QUERY:
+		return ipv6_mc_check_mld_query(skb);
+	default:
+		return -ENOMSG;
+	}
+}
+
+static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb)
+{
+	return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo);
+}
+
+static int __ipv6_mc_check_mld(struct sk_buff *skb,
+			       struct sk_buff **skb_trimmed)
+
+{
+	struct sk_buff *skb_chk = NULL;
+	unsigned int transport_len;
+	unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg);
+	int ret;
+
+	transport_len = ntohs(ipv6_hdr(skb)->payload_len);
+	transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr);
+
+	skb_get(skb);
+	skb_chk = skb_checksum_trimmed(skb, transport_len,
+				       ipv6_mc_validate_checksum);
+	if (!skb_chk)
+		return -EINVAL;
+
+	if (!pskb_may_pull(skb_chk, len)) {
+		kfree_skb(skb_chk);
+		return -EINVAL;
+	}
+
+	ret = ipv6_mc_check_mld_msg(skb_chk);
+	if (ret) {
+		kfree_skb(skb_chk);
+		return ret;
+	}
+
+	if (skb_trimmed)
+		*skb_trimmed = skb_chk;
+	else
+		kfree_skb(skb_chk);
+
+	return 0;
+}
+
+/**
+ * ipv6_mc_check_mld - checks whether this is a sane MLD packet
+ * @skb: the skb to validate
+ * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional)
+ *
+ * Checks whether an IPv6 packet is a valid MLD packet. If so sets
+ * skb network and transport headers accordingly and returns zero.
+ *
+ * -EINVAL: A broken packet was detected, i.e. it violates some internet
+ *  standard
+ * -ENOMSG: IP header validation succeeded but it is not an MLD packet.
+ * -ENOMEM: A memory allocation failure happened.
+ *
+ * Optionally, an skb pointer might be provided via skb_trimmed (or set it
+ * to NULL): After parsing an MLD packet successfully it will point to
+ * an skb which has its tail aligned to the IP packet end. This might
+ * either be the originally provided skb or a trimmed, cloned version if
+ * the skb frame had data beyond the IP packet. A cloned skb allows us
+ * to leave the original skb and its full frame unchanged (which might be
+ * desirable for layer 2 frame jugglers).
+ *
+ * The caller needs to release a reference count from any returned skb_trimmed.
+ */
+int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+{
+	int ret;
+
+	ret = ipv6_mc_check_ip6hdr(skb);
+	if (ret < 0)
+		return ret;
+
+	ret = ipv6_mc_check_exthdrs(skb);
+	if (ret < 0)
+		return ret;
+
+	return __ipv6_mc_check_mld(skb, skb_trimmed);
+}
+EXPORT_SYMBOL(ipv6_mc_check_mld);
-- 
cgit v1.2.3


From 6cd9e9f629f11b9412d4e9aa294c029dbb36b3cf Mon Sep 17 00:00:00 2001
From: Kapileshwar Singh <kapileshwar.singh@arm.com>
Date: Wed, 18 Feb 2015 16:04:21 +0000
Subject: thermal: of: fix cooling device weights in device tree

Currently you can specify the weight of the cooling device in the device
tree but that information is not populated to the
thermal_bind_params where the fair share governor expects it to
be.  The of thermal zone device doesn't have a thermal_bind_params
structure and arguably it's better to pass the weight inside the
thermal_instance as it is specific to the bind of a cooling device to a
thermal zone parameter.

Core thermal code is fixed to populate the weight in the instance from
the thermal_bind_params, so platform code that was passing the weight
inside the thermal_bind_params continue to work seamlessly.

While we are at it, create a default value for the weight parameter for
those thermal zones that currently don't define it and remove the
hardcoded default in of-thermal.

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Peter Feuerer <peter@piie.net>
Cc: Darren Hart <dvhart@infradead.org>
Cc: Eduardo Valentin <edubezval@gmail.com>
Cc: Kukjin Kim <kgene@kernel.org>
Cc: Durgadoss R <durgadoss.r@intel.com>
Signed-off-by: Kapileshwar Singh <kapileshwar.singh@arm.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 Documentation/thermal/sysfs-api.txt                |  4 +++-
 drivers/acpi/thermal.c                             |  9 ++++++---
 drivers/platform/x86/acerhdf.c                     |  3 ++-
 drivers/thermal/db8500_thermal.c                   |  2 +-
 drivers/thermal/fair_share.c                       |  2 +-
 drivers/thermal/imx_thermal.c                      |  3 ++-
 drivers/thermal/of-thermal.c                       |  5 +++--
 drivers/thermal/thermal_core.c                     | 22 ++++++++++++++++------
 drivers/thermal/thermal_core.h                     |  1 +
 drivers/thermal/ti-soc-thermal/ti-thermal-common.c |  3 ++-
 include/linux/thermal.h                            |  6 +++++-
 11 files changed, 42 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index 87519cb379ee..7ec632ed9769 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -95,7 +95,7 @@ temperature) and throttle appropriate devices.
 1.3 interface for binding a thermal zone device with a thermal cooling device
 1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	int trip, struct thermal_cooling_device *cdev,
-	unsigned long upper, unsigned long lower);
+	unsigned long upper, unsigned long lower, unsigned int weight);
 
     This interface function bind a thermal cooling device to the certain trip
     point of a thermal zone device.
@@ -110,6 +110,8 @@ temperature) and throttle appropriate devices.
     lower:the Minimum cooling state can be used for this trip point.
           THERMAL_NO_LIMIT means no lower limit,
 	  and the cooling device can be in cooling state 0.
+    weight: the influence of this cooling device in this thermal
+            zone.  See 1.4.1 below for more information.
 
 1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
 		int trip, struct thermal_cooling_device *cdev);
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index d24fa1964eb8..6d4e44ea74ac 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -800,7 +800,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal,
 				result =
 					thermal_zone_bind_cooling_device
 					(thermal, trip, cdev,
-					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT);
+					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+					 THERMAL_WEIGHT_DEFAULT);
 			else
 				result =
 					thermal_zone_unbind_cooling_device
@@ -824,7 +825,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal,
 			if (bind)
 				result = thermal_zone_bind_cooling_device
 					(thermal, trip, cdev,
-					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT);
+					 THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+					 THERMAL_WEIGHT_DEFAULT);
 			else
 				result = thermal_zone_unbind_cooling_device
 					(thermal, trip, cdev);
@@ -841,7 +843,8 @@ static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal,
 				result = thermal_zone_bind_cooling_device
 						(thermal, THERMAL_TRIPS_NONE,
 						 cdev, THERMAL_NO_LIMIT,
-						 THERMAL_NO_LIMIT);
+						 THERMAL_NO_LIMIT,
+						 THERMAL_WEIGHT_DEFAULT);
 			else
 				result = thermal_zone_unbind_cooling_device
 						(thermal, THERMAL_TRIPS_NONE,
diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c
index 594c918b553d..1ef02daddb60 100644
--- a/drivers/platform/x86/acerhdf.c
+++ b/drivers/platform/x86/acerhdf.c
@@ -372,7 +372,8 @@ static int acerhdf_bind(struct thermal_zone_device *thermal,
 		return 0;
 
 	if (thermal_zone_bind_cooling_device(thermal, 0, cdev,
-			THERMAL_NO_LIMIT, THERMAL_NO_LIMIT)) {
+			THERMAL_NO_LIMIT, THERMAL_NO_LIMIT,
+			THERMAL_WEIGHT_DEFAULT)) {
 		pr_err("error binding cooling dev\n");
 		return -EINVAL;
 	}
diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index 20adfbe27df1..2fb273c4baa9 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -76,7 +76,7 @@ static int db8500_cdev_bind(struct thermal_zone_device *thermal,
 		upper = lower = i > max_state ? max_state : i;
 
 		ret = thermal_zone_bind_cooling_device(thermal, i, cdev,
-			upper, lower);
+			upper, lower, THERMAL_WEIGHT_DEFAULT);
 
 		dev_info(&cdev->device, "%s bind to %d: %d-%s\n", cdev->type,
 			i, ret, ret ? "fail" : "succeed");
diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c
index 6e0a3fbfae86..c3b25187b467 100644
--- a/drivers/thermal/fair_share.c
+++ b/drivers/thermal/fair_share.c
@@ -109,7 +109,7 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 			continue;
 
 		instance->target = get_target_state(tz, cdev,
-					tzp->tbp[i].weight, cur_trip_level);
+					instance->weight, cur_trip_level);
 
 		instance->cdev->updated = false;
 		thermal_cdev_update(cdev);
diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c
index 2ccbc0788353..fde4c2876d14 100644
--- a/drivers/thermal/imx_thermal.c
+++ b/drivers/thermal/imx_thermal.c
@@ -306,7 +306,8 @@ static int imx_bind(struct thermal_zone_device *tz,
 
 	ret = thermal_zone_bind_cooling_device(tz, IMX_TRIP_PASSIVE, cdev,
 					       THERMAL_NO_LIMIT,
-					       THERMAL_NO_LIMIT);
+					       THERMAL_NO_LIMIT,
+					       THERMAL_WEIGHT_DEFAULT);
 	if (ret) {
 		dev_err(&tz->device,
 			"binding zone %s with cdev %s failed:%d\n",
diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index 668fb1bdea9e..c606b85ea9f4 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -227,7 +227,8 @@ static int of_thermal_bind(struct thermal_zone_device *thermal,
 			ret = thermal_zone_bind_cooling_device(thermal,
 						tbp->trip_id, cdev,
 						tbp->max,
-						tbp->min);
+						tbp->min,
+						tbp->usage);
 			if (ret)
 				return ret;
 		}
@@ -581,7 +582,7 @@ static int thermal_of_populate_bind_params(struct device_node *np,
 	u32 prop;
 
 	/* Default weight. Usage is optional */
-	__tbp->usage = 0;
+	__tbp->usage = THERMAL_WEIGHT_DEFAULT;
 	ret = of_property_read_u32(np, "contribution", &prop);
 	if (ret == 0)
 		__tbp->usage = prop;
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 4108db7e10c1..a6cb9b78b629 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -218,7 +218,8 @@ static void print_bind_err_msg(struct thermal_zone_device *tz,
 
 static void __bind(struct thermal_zone_device *tz, int mask,
 			struct thermal_cooling_device *cdev,
-			unsigned long *limits)
+			unsigned long *limits,
+			unsigned int weight)
 {
 	int i, ret;
 
@@ -233,7 +234,8 @@ static void __bind(struct thermal_zone_device *tz, int mask,
 				upper = limits[i * 2 + 1];
 			}
 			ret = thermal_zone_bind_cooling_device(tz, i, cdev,
-							       upper, lower);
+							       upper, lower,
+							       weight);
 			if (ret)
 				print_bind_err_msg(tz, cdev, ret);
 		}
@@ -280,7 +282,8 @@ static void bind_cdev(struct thermal_cooling_device *cdev)
 				continue;
 			tzp->tbp[i].cdev = cdev;
 			__bind(pos, tzp->tbp[i].trip_mask, cdev,
-			       tzp->tbp[i].binding_limits);
+			       tzp->tbp[i].binding_limits,
+			       tzp->tbp[i].weight);
 		}
 	}
 
@@ -319,7 +322,8 @@ static void bind_tz(struct thermal_zone_device *tz)
 				continue;
 			tzp->tbp[i].cdev = pos;
 			__bind(tz, tzp->tbp[i].trip_mask, pos,
-			       tzp->tbp[i].binding_limits);
+			       tzp->tbp[i].binding_limits,
+			       tzp->tbp[i].weight);
 		}
 	}
 exit:
@@ -713,7 +717,8 @@ passive_store(struct device *dev, struct device_attribute *attr,
 				thermal_zone_bind_cooling_device(tz,
 						THERMAL_TRIPS_NONE, cdev,
 						THERMAL_NO_LIMIT,
-						THERMAL_NO_LIMIT);
+						THERMAL_NO_LIMIT,
+						THERMAL_WEIGHT_DEFAULT);
 		}
 		mutex_unlock(&thermal_list_lock);
 		if (!tz->passive_delay)
@@ -931,6 +936,9 @@ static const struct attribute_group *cooling_device_attr_groups[] = {
  * @lower:	the Minimum cooling state can be used for this trip point.
  *		THERMAL_NO_LIMIT means no lower limit,
  *		and the cooling device can be in cooling state 0.
+ * @weight:	The weight of the cooling device to be bound to the
+ *		thermal zone. Use THERMAL_WEIGHT_DEFAULT for the
+ *		default value
  *
  * This interface function bind a thermal cooling device to the certain trip
  * point of a thermal zone device.
@@ -941,7 +949,8 @@ static const struct attribute_group *cooling_device_attr_groups[] = {
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 				     int trip,
 				     struct thermal_cooling_device *cdev,
-				     unsigned long upper, unsigned long lower)
+				     unsigned long upper, unsigned long lower,
+				     unsigned int weight)
 {
 	struct thermal_instance *dev;
 	struct thermal_instance *pos;
@@ -986,6 +995,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	dev->upper = upper;
 	dev->lower = lower;
 	dev->target = THERMAL_NO_TARGET;
+	dev->weight = weight;
 
 	result = get_idr(&tz->idr, &tz->lock, &dev->id);
 	if (result)
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 0531c752fbbb..7a465e9d456c 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -48,6 +48,7 @@ struct thermal_instance {
 	struct device_attribute attr;
 	struct list_head tz_node; /* node in tz->thermal_instances */
 	struct list_head cdev_node; /* node in cdev->thermal_instances */
+	unsigned int weight; /* The weight of the cooling device */
 };
 
 int thermal_register_governor(struct thermal_governor *);
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index a38c1756442a..cb45e729adb5 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -146,7 +146,8 @@ static int ti_thermal_bind(struct thermal_zone_device *thermal,
 	return thermal_zone_bind_cooling_device(thermal, 0, cdev,
 	/* bind with min and max states defined by cpu_cooling */
 						THERMAL_NO_LIMIT,
-						THERMAL_NO_LIMIT);
+						THERMAL_NO_LIMIT,
+						THERMAL_WEIGHT_DEFAULT);
 }
 
 /* Unbind callback functions for thermal zone */
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 5eac316490ea..00dacd4dfdce 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -40,6 +40,9 @@
 /* No upper/lower limit requirement */
 #define THERMAL_NO_LIMIT	((u32)~0)
 
+/* Default weight of a bound cooling device */
+#define THERMAL_WEIGHT_DEFAULT 0
+
 /* Unit conversion macros */
 #define KELVIN_TO_CELSIUS(t)	(long)(((long)t-2732 >= 0) ?	\
 				((long)t-2732+5)/10 : ((long)t-2732-5)/10)
@@ -323,7 +326,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *);
 
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     struct thermal_cooling_device *,
-				     unsigned long, unsigned long);
+				     unsigned long, unsigned long,
+				     unsigned int);
 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
 void thermal_zone_device_update(struct thermal_zone_device *);
-- 
cgit v1.2.3


From bcdcbbc71125c37195f97314f453ca9a3a4eb758 Mon Sep 17 00:00:00 2001
From: Javi Merino <javi.merino@arm.com>
Date: Wed, 18 Feb 2015 16:04:25 +0000
Subject: thermal: fair_share: generalize the weight concept

The fair share governor has the concept of weights, which is the
influence of each cooling device in a thermal zone.  The current
implementation forces the weights of all cooling devices in a thermal
zone to add up to a 100.  This complicates setups, as you need to know
in advance how many cooling devices you are going to have.  If you bind a
new cooling device, you have to modify all the other cooling devices
weights, which is error prone.  Furthermore, you can't specify a
"default" weight for platforms since that default value depends on the
number of cooling devices in the platform.

This patch generalizes the concept of weight by allowing any number to
be a "weight".  Weights are now relative to each other.  Platforms that
don't specify weights get the same default value for all their cooling
devices, so all their cdevs are considered to be equally influential.

It's important to note that previous users of the weights don't need to
alter the code: percentages continue to work as they used to.  This
patch just removes the constraint of all the weights in a thermal zone
having to add up to a 100.  If they do, you get the same behavior as
before.  If they don't, fair share now works for that platform.

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Cc: Durgadoss R <durgadoss.r@intel.com>
Acked-by: Durgadoss R <durgadoss.r@intel.com>
Signed-off-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 Documentation/thermal/sysfs-api.txt | 12 +++++++++---
 drivers/thermal/fair_share.c        | 26 +++++++++++++++++++++-----
 include/linux/thermal.h             |  9 ++++++---
 3 files changed, 36 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index 3625453ceef6..fc7dfe10778b 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -129,9 +129,15 @@ temperature) and throttle appropriate devices.
     This structure defines the following parameters that are used to bind
     a zone with a cooling device for a particular trip point.
     .cdev: The cooling device pointer
-    .weight: The 'influence' of a particular cooling device on this zone.
-             This is on a percentage scale. The sum of all these weights
-             (for a particular zone) cannot exceed 100.
+    .weight: The 'influence' of a particular cooling device on this
+             zone. This is relative to the rest of the cooling
+             devices. For example, if all cooling devices have a
+             weight of 1, then they all contribute the same. You can
+             use percentages if you want, but it's not mandatory. A
+             weight of 0 means that this cooling device doesn't
+             contribute to the cooling of this zone unless all cooling
+             devices have a weight of 0. If all weights are 0, then
+             they all contribute the same.
     .trip_mask:This is a bit mask that gives the binding relation between
                this thermal zone and cdev, for a particular trip point.
                If nth bit is set, then the cdev and thermal zone are bound
diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c
index 692f4053f08b..c2c10bbe24d6 100644
--- a/drivers/thermal/fair_share.c
+++ b/drivers/thermal/fair_share.c
@@ -59,13 +59,13 @@ static int get_trip_level(struct thermal_zone_device *tz)
 }
 
 static long get_target_state(struct thermal_zone_device *tz,
-		struct thermal_cooling_device *cdev, int weight, int level)
+		struct thermal_cooling_device *cdev, int percentage, int level)
 {
 	unsigned long max_state;
 
 	cdev->ops->get_max_state(cdev, &max_state);
 
-	return (long)(weight * level * max_state) / (100 * tz->trips);
+	return (long)(percentage * level * max_state) / (100 * tz->trips);
 }
 
 /**
@@ -77,7 +77,7 @@ static long get_target_state(struct thermal_zone_device *tz,
  *
  * Parameters used for Throttling:
  * P1. max_state: Maximum throttle state exposed by the cooling device.
- * P2. weight[i]/100:
+ * P2. percentage[i]/100:
  *	How 'effective' the 'i'th device is, in cooling the given zone.
  * P3. cur_trip_level/max_no_of_trips:
  *	This describes the extent to which the devices should be throttled.
@@ -89,16 +89,32 @@ static long get_target_state(struct thermal_zone_device *tz,
 static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 {
 	struct thermal_instance *instance;
+	int total_weight = 0;
+	int total_instance = 0;
 	int cur_trip_level = get_trip_level(tz);
 
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if (instance->trip != trip)
+			continue;
+
+		total_weight += instance->weight;
+		total_instance++;
+	}
+
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		int percentage;
 		struct thermal_cooling_device *cdev = instance->cdev;
 
 		if (instance->trip != trip)
 			continue;
 
-		instance->target = get_target_state(tz, cdev,
-					instance->weight, cur_trip_level);
+		if (!total_weight)
+			percentage = 100 / total_instance;
+		else
+			percentage = (instance->weight * 100) / total_weight;
+
+		instance->target = get_target_state(tz, cdev, percentage,
+						    cur_trip_level);
 
 		instance->cdev->updated = false;
 		thermal_cdev_update(cdev);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 00dacd4dfdce..bac0f52c7a1e 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -217,9 +217,12 @@ struct thermal_bind_params {
 
 	/*
 	 * This is a measure of 'how effectively these devices can
-	 * cool 'this' thermal zone. The shall be determined by platform
-	 * characterization. This is on a 'percentage' scale.
-	 * See Documentation/thermal/sysfs-api.txt for more information.
+	 * cool 'this' thermal zone. It shall be determined by
+	 * platform characterization. This value is relative to the
+	 * rest of the weights so a cooling device whose weight is
+	 * double that of another cooling device is twice as
+	 * effective. See Documentation/thermal/sysfs-api.txt for more
+	 * information.
 	 */
 	int weight;
 
-- 
cgit v1.2.3


From e33df1d2f3a0141cd79e770f31999ba0dd7ebfa8 Mon Sep 17 00:00:00 2001
From: Javi Merino <javi.merino@arm.com>
Date: Thu, 26 Feb 2015 19:00:27 +0000
Subject: thermal: let governors have private data for each thermal zone

A governor may need to store its current state between calls to
throttle().  That state depends on the thermal zone, so store it as
private data in struct thermal_zone_device.

The governors may have two new ops: bind_to_tz() and unbind_from_tz().
When provided, these functions let governors do some initialization
and teardown when they are bound/unbound to a tz and possibly store that
information in the governor_data field of the struct
thermal_zone_device.

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 drivers/thermal/thermal_core.c | 83 ++++++++++++++++++++++++++++++++++++++----
 include/linux/thermal.h        |  9 +++++
 2 files changed, 84 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 605d6919c1b6..be62b1622ed3 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -75,6 +75,58 @@ static struct thermal_governor *__find_governor(const char *name)
 	return NULL;
 }
 
+/**
+ * bind_previous_governor() - bind the previous governor of the thermal zone
+ * @tz:		a valid pointer to a struct thermal_zone_device
+ * @failed_gov_name:	the name of the governor that failed to register
+ *
+ * Register the previous governor of the thermal zone after a new
+ * governor has failed to be bound.
+ */
+static void bind_previous_governor(struct thermal_zone_device *tz,
+				   const char *failed_gov_name)
+{
+	if (tz->governor && tz->governor->bind_to_tz) {
+		if (tz->governor->bind_to_tz(tz)) {
+			dev_err(&tz->device,
+				"governor %s failed to bind and the previous one (%s) failed to bind again, thermal zone %s has no governor\n",
+				failed_gov_name, tz->governor->name, tz->type);
+			tz->governor = NULL;
+		}
+	}
+}
+
+/**
+ * thermal_set_governor() - Switch to another governor
+ * @tz:		a valid pointer to a struct thermal_zone_device
+ * @new_gov:	pointer to the new governor
+ *
+ * Change the governor of thermal zone @tz.
+ *
+ * Return: 0 on success, an error if the new governor's bind_to_tz() failed.
+ */
+static int thermal_set_governor(struct thermal_zone_device *tz,
+				struct thermal_governor *new_gov)
+{
+	int ret = 0;
+
+	if (tz->governor && tz->governor->unbind_from_tz)
+		tz->governor->unbind_from_tz(tz);
+
+	if (new_gov && new_gov->bind_to_tz) {
+		ret = new_gov->bind_to_tz(tz);
+		if (ret) {
+			bind_previous_governor(tz, new_gov->name);
+
+			return ret;
+		}
+	}
+
+	tz->governor = new_gov;
+
+	return ret;
+}
+
 int thermal_register_governor(struct thermal_governor *governor)
 {
 	int err;
@@ -107,8 +159,15 @@ int thermal_register_governor(struct thermal_governor *governor)
 
 		name = pos->tzp->governor_name;
 
-		if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH))
-			pos->governor = governor;
+		if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) {
+			int ret;
+
+			ret = thermal_set_governor(pos, governor);
+			if (ret)
+				dev_err(&pos->device,
+					"Failed to set governor %s for thermal zone %s: %d\n",
+					governor->name, pos->type, ret);
+		}
 	}
 
 	mutex_unlock(&thermal_list_lock);
@@ -134,7 +193,7 @@ void thermal_unregister_governor(struct thermal_governor *governor)
 	list_for_each_entry(pos, &thermal_tz_list, node) {
 		if (!strncasecmp(pos->governor->name, governor->name,
 						THERMAL_NAME_LENGTH))
-			pos->governor = NULL;
+			thermal_set_governor(pos, NULL);
 	}
 
 	mutex_unlock(&thermal_list_lock);
@@ -770,8 +829,9 @@ policy_store(struct device *dev, struct device_attribute *attr,
 	if (!gov)
 		goto exit;
 
-	tz->governor = gov;
-	ret = count;
+	ret = thermal_set_governor(tz, gov);
+	if (!ret)
+		ret = count;
 
 exit:
 	mutex_unlock(&tz->lock);
@@ -1512,6 +1572,7 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	int result;
 	int count;
 	int passive = 0;
+	struct thermal_governor *governor;
 
 	if (type && strlen(type) >= THERMAL_NAME_LENGTH)
 		return ERR_PTR(-EINVAL);
@@ -1602,9 +1663,15 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	mutex_lock(&thermal_governor_lock);
 
 	if (tz->tzp)
-		tz->governor = __find_governor(tz->tzp->governor_name);
+		governor = __find_governor(tz->tzp->governor_name);
 	else
-		tz->governor = def_governor;
+		governor = def_governor;
+
+	result = thermal_set_governor(tz, governor);
+	if (result) {
+		mutex_unlock(&thermal_governor_lock);
+		goto unregister;
+	}
 
 	mutex_unlock(&thermal_governor_lock);
 
@@ -1693,7 +1760,7 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 		device_remove_file(&tz->device, &dev_attr_mode);
 	device_remove_file(&tz->device, &dev_attr_policy);
 	remove_trip_attrs(tz);
-	tz->governor = NULL;
+	thermal_set_governor(tz, NULL);
 
 	thermal_remove_hwmon_sysfs(tz);
 	release_idr(&thermal_tz_idr, &thermal_idr_lock, tz->id);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index bac0f52c7a1e..edf9d53c67e6 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -165,6 +165,7 @@ struct thermal_attr {
  * @ops:	operations this &thermal_zone_device supports
  * @tzp:	thermal zone parameters
  * @governor:	pointer to the governor for this thermal zone
+ * @governor_data:	private pointer for governor data
  * @thermal_instances:	list of &struct thermal_instance of this thermal zone
  * @idr:	&struct idr to generate unique id for this zone's cooling
  *		devices
@@ -191,6 +192,7 @@ struct thermal_zone_device {
 	struct thermal_zone_device_ops *ops;
 	const struct thermal_zone_params *tzp;
 	struct thermal_governor *governor;
+	void *governor_data;
 	struct list_head thermal_instances;
 	struct idr idr;
 	struct mutex lock;
@@ -201,12 +203,19 @@ struct thermal_zone_device {
 /**
  * struct thermal_governor - structure that holds thermal governor information
  * @name:	name of the governor
+ * @bind_to_tz: callback called when binding to a thermal zone.  If it
+ *		returns 0, the governor is bound to the thermal zone,
+ *		otherwise it fails.
+ * @unbind_from_tz:	callback called when a governor is unbound from a
+ *			thermal zone.
  * @throttle:	callback called for every trip point even if temperature is
  *		below the trip point temperature
  * @governor_list:	node in thermal_governor_list (in thermal_core.c)
  */
 struct thermal_governor {
 	char name[THERMAL_NAME_LENGTH];
+	int (*bind_to_tz)(struct thermal_zone_device *tz);
+	void (*unbind_from_tz)(struct thermal_zone_device *tz);
 	int (*throttle)(struct thermal_zone_device *tz, int trip);
 	struct list_head	governor_list;
 };
-- 
cgit v1.2.3


From 35b11d2e3a66279a477e36cefb2603806295b8ce Mon Sep 17 00:00:00 2001
From: Javi Merino <javi.merino@arm.com>
Date: Thu, 26 Feb 2015 19:00:28 +0000
Subject: thermal: extend the cooling device API to include power information

Add three optional callbacks to the cooling device interface to allow
them to express power.  In addition to the callbacks, add helpers to
identify cooling devices that implement the power cooling device API.

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 drivers/thermal/thermal_core.c | 52 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/thermal.h        | 25 ++++++++++++++++++++
 2 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index be62b1622ed3..263628b0e862 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -875,6 +875,58 @@ emul_temp_store(struct device *dev, struct device_attribute *attr,
 static DEVICE_ATTR(emul_temp, S_IWUSR, NULL, emul_temp_store);
 #endif/*CONFIG_THERMAL_EMULATION*/
 
+/**
+ * power_actor_get_max_power() - get the maximum power that a cdev can consume
+ * @cdev:	pointer to &thermal_cooling_device
+ * @tz:		a valid thermal zone device pointer
+ * @max_power:	pointer in which to store the maximum power
+ *
+ * Calculate the maximum power consumption in milliwats that the
+ * cooling device can currently consume and store it in @max_power.
+ *
+ * Return: 0 on success, -EINVAL if @cdev doesn't support the
+ * power_actor API or -E* on other error.
+ */
+int power_actor_get_max_power(struct thermal_cooling_device *cdev,
+			      struct thermal_zone_device *tz, u32 *max_power)
+{
+	if (!cdev_is_power_actor(cdev))
+		return -EINVAL;
+
+	return cdev->ops->state2power(cdev, tz, 0, max_power);
+}
+
+/**
+ * power_actor_set_power() - limit the maximum power that a cooling device can consume
+ * @cdev:	pointer to &thermal_cooling_device
+ * @instance:	thermal instance to update
+ * @power:	the power in milliwatts
+ *
+ * Set the cooling device to consume at most @power milliwatts.
+ *
+ * Return: 0 on success, -EINVAL if the cooling device does not
+ * implement the power actor API or -E* for other failures.
+ */
+int power_actor_set_power(struct thermal_cooling_device *cdev,
+			  struct thermal_instance *instance, u32 power)
+{
+	unsigned long state;
+	int ret;
+
+	if (!cdev_is_power_actor(cdev))
+		return -EINVAL;
+
+	ret = cdev->ops->power2state(cdev, instance->tz, power, &state);
+	if (ret)
+		return ret;
+
+	instance->target = state;
+	cdev->updated = false;
+	thermal_cdev_update(cdev);
+
+	return 0;
+}
+
 static DEVICE_ATTR(type, 0444, type_show, NULL);
 static DEVICE_ATTR(temp, 0444, temp_show, NULL);
 static DEVICE_ATTR(mode, 0644, mode_show, mode_store);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index edf9d53c67e6..bf3c55f405c2 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -63,6 +63,7 @@
 
 struct thermal_zone_device;
 struct thermal_cooling_device;
+struct thermal_instance;
 
 enum thermal_device_mode {
 	THERMAL_DEVICE_DISABLED = 0,
@@ -116,6 +117,12 @@ struct thermal_cooling_device_ops {
 	int (*get_max_state) (struct thermal_cooling_device *, unsigned long *);
 	int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *);
 	int (*set_cur_state) (struct thermal_cooling_device *, unsigned long);
+	int (*get_requested_power)(struct thermal_cooling_device *,
+				   struct thermal_zone_device *, u32 *);
+	int (*state2power)(struct thermal_cooling_device *,
+			   struct thermal_zone_device *, unsigned long, u32 *);
+	int (*power2state)(struct thermal_cooling_device *,
+			   struct thermal_zone_device *, u32, unsigned long *);
 };
 
 struct thermal_cooling_device {
@@ -331,6 +338,16 @@ void thermal_zone_of_sensor_unregister(struct device *dev,
 #endif
 
 #if IS_ENABLED(CONFIG_THERMAL)
+static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
+{
+	return cdev->ops->get_requested_power && cdev->ops->state2power &&
+		cdev->ops->power2state;
+}
+
+int power_actor_get_max_power(struct thermal_cooling_device *,
+			      struct thermal_zone_device *tz, u32 *max_power);
+int power_actor_set_power(struct thermal_cooling_device *,
+			  struct thermal_instance *, u32);
 struct thermal_zone_device *thermal_zone_device_register(const char *, int, int,
 		void *, struct thermal_zone_device_ops *,
 		const struct thermal_zone_params *, int, int);
@@ -359,6 +376,14 @@ struct thermal_instance *get_thermal_instance(struct thermal_zone_device *,
 void thermal_cdev_update(struct thermal_cooling_device *);
 void thermal_notify_framework(struct thermal_zone_device *, int);
 #else
+static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
+{ return false; }
+static inline int power_actor_get_max_power(struct thermal_cooling_device *cdev,
+			      struct thermal_zone_device *tz, u32 *max_power)
+{ return 0; }
+static inline int power_actor_set_power(struct thermal_cooling_device *cdev,
+			  struct thermal_instance *tz, u32 power)
+{ return 0; }
 static inline struct thermal_zone_device *thermal_zone_device_register(
 	const char *type, int trips, int mask, void *devdata,
 	struct thermal_zone_device_ops *ops,
-- 
cgit v1.2.3


From c36cf07176316fbe6a4bdbc23afcb0cbf7822bf2 Mon Sep 17 00:00:00 2001
From: Javi Merino <javi.merino@arm.com>
Date: Thu, 26 Feb 2015 19:00:29 +0000
Subject: thermal: cpu_cooling: implement the power cooling device API

Add a basic power model to the cpu cooling device to implement the
power cooling device API.  The power model uses the current frequency,
current load and OPPs for the power calculations.  The cpus must have
registered their OPPs using the OPP library.

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Kapileshwar Singh <kapileshwar.singh@arm.com>
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Signed-off-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 Documentation/thermal/cpu-cooling-api.txt | 156 +++++++-
 drivers/thermal/cpu_cooling.c             | 583 +++++++++++++++++++++++++++++-
 include/linux/cpu_cooling.h               |  39 ++
 3 files changed, 760 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
index 753e47cc2e20..71653584cd03 100644
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ b/Documentation/thermal/cpu-cooling-api.txt
@@ -36,8 +36,162 @@ the user. The registration APIs returns the cooling device pointer.
     np: pointer to the cooling device device tree node
     clip_cpus: cpumask of cpus where the frequency constraints will happen.
 
-1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
+1.1.3 struct thermal_cooling_device *cpufreq_power_cooling_register(
+    const struct cpumask *clip_cpus, u32 capacitance,
+    get_static_t plat_static_func)
+
+Similar to cpufreq_cooling_register, this function registers a cpufreq
+cooling device.  Using this function, the cooling device will
+implement the power extensions by using a simple cpu power model.  The
+cpus must have registered their OPPs using the OPP library.
+
+The additional parameters are needed for the power model (See 2. Power
+models).  "capacitance" is the dynamic power coefficient (See 2.1
+Dynamic power).  "plat_static_func" is a function to calculate the
+static power consumed by these cpus (See 2.2 Static power).
+
+1.1.4 struct thermal_cooling_device *of_cpufreq_power_cooling_register(
+    struct device_node *np, const struct cpumask *clip_cpus, u32 capacitance,
+    get_static_t plat_static_func)
+
+Similar to cpufreq_power_cooling_register, this function register a
+cpufreq cooling device with power extensions using the device tree
+information supplied by the np parameter.
+
+1.1.5 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
     This interface function unregisters the "thermal-cpufreq-%x" cooling device.
 
     cdev: Cooling device pointer which has to be unregistered.
+
+2. Power models
+
+The power API registration functions provide a simple power model for
+CPUs.  The current power is calculated as dynamic + (optionally)
+static power.  This power model requires that the operating-points of
+the CPUs are registered using the kernel's opp library and the
+`cpufreq_frequency_table` is assigned to the `struct device` of the
+cpu.  If you are using CONFIG_CPUFREQ_DT then the
+`cpufreq_frequency_table` should already be assigned to the cpu
+device.
+
+The `plat_static_func` parameter of `cpufreq_power_cooling_register()`
+and `of_cpufreq_power_cooling_register()` is optional.  If you don't
+provide it, only dynamic power will be considered.
+
+2.1 Dynamic power
+
+The dynamic power consumption of a processor depends on many factors.
+For a given processor implementation the primary factors are:
+
+- The time the processor spends running, consuming dynamic power, as
+  compared to the time in idle states where dynamic consumption is
+  negligible.  Herein we refer to this as 'utilisation'.
+- The voltage and frequency levels as a result of DVFS.  The DVFS
+  level is a dominant factor governing power consumption.
+- In running time the 'execution' behaviour (instruction types, memory
+  access patterns and so forth) causes, in most cases, a second order
+  variation.  In pathological cases this variation can be significant,
+  but typically it is of a much lesser impact than the factors above.
+
+A high level dynamic power consumption model may then be represented as:
+
+Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
+
+f(run) here represents the described execution behaviour and its
+result has a units of Watts/Hz/Volt^2 (this often expressed in
+mW/MHz/uVolt^2)
+
+The detailed behaviour for f(run) could be modelled on-line.  However,
+in practice, such an on-line model has dependencies on a number of
+implementation specific processor support and characterisation
+factors.  Therefore, in initial implementation that contribution is
+represented as a constant coefficient.  This is a simplification
+consistent with the relative contribution to overall power variation.
+
+In this simplified representation our model becomes:
+
+Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
+
+Where `capacitance` is a constant that represents an indicative
+running time dynamic power coefficient in fundamental units of
+mW/MHz/uVolt^2.  Typical values for mobile CPUs might lie in range
+from 100 to 500.  For reference, the approximate values for the SoC in
+ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
+140 for the Cortex-A53 cluster.
+
+
+2.2 Static power
+
+Static leakage power consumption depends on a number of factors.  For a
+given circuit implementation the primary factors are:
+
+- Time the circuit spends in each 'power state'
+- Temperature
+- Operating voltage
+- Process grade
+
+The time the circuit spends in each 'power state' for a given
+evaluation period at first order means OFF or ON.  However,
+'retention' states can also be supported that reduce power during
+inactive periods without loss of context.
+
+Note: The visibility of state entries to the OS can vary, according to
+platform specifics, and this can then impact the accuracy of a model
+based on OS state information alone.  It might be possible in some
+cases to extract more accurate information from system resources.
+
+The temperature, operating voltage and process 'grade' (slow to fast)
+of the circuit are all significant factors in static leakage power
+consumption.  All of these have complex relationships to static power.
+
+Circuit implementation specific factors include the chosen silicon
+process as well as the type, number and size of transistors in both
+the logic gates and any RAM elements included.
+
+The static power consumption modelling must take into account the
+power managed regions that are implemented.  Taking the example of an
+ARM processor cluster, the modelling would take into account whether
+each CPU can be powered OFF separately or if only a single power
+region is implemented for the complete cluster.
+
+In one view, there are others, a static power consumption model can
+then start from a set of reference values for each power managed
+region (e.g. CPU, Cluster/L2) in each state (e.g. ON, OFF) at an
+arbitrary process grade, voltage and temperature point.  These values
+are then scaled for all of the following: the time in each state, the
+process grade, the current temperature and the operating voltage.
+However, since both implementation specific and complex relationships
+dominate the estimate, the appropriate interface to the model from the
+cpu cooling device is to provide a function callback that calculates
+the static power in this platform.  When registering the cpu cooling
+device pass a function pointer that follows the `get_static_t`
+prototype:
+
+    int plat_get_static(cpumask_t *cpumask, int interval,
+                        unsigned long voltage, u32 &power);
+
+`cpumask` is the cpumask of the cpus involved in the calculation.
+`voltage` is the voltage at which they are operating.  The function
+should calculate the average static power for the last `interval`
+milliseconds.  It returns 0 on success, -E* on error.  If it
+succeeds, it should store the static power in `power`.  Reading the
+temperature of the cpus described by `cpumask` is left for
+plat_get_static() to do as the platform knows best which thermal
+sensor is closest to the cpu.
+
+If `plat_static_func` is NULL, static power is considered to be
+negligible for this platform and only dynamic power is considered.
+
+The platform specific callback can then use any combination of tables
+and/or equations to permute the estimated value.  Process grade
+information is not passed to the model since access to such data, from
+on-chip measurement capability or manufacture time data, is platform
+specific.
+
+Note: the significance of static power for CPUs in comparison to
+dynamic power is highly dependent on implementation.  Given the
+potential complexity in implementation, the importance and accuracy of
+its inclusion when using cpu cooling devices should be assessed on a
+case by case basis.
+
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index f65f0d109fc8..ba23150c7bde 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -26,6 +26,7 @@
 #include <linux/thermal.h>
 #include <linux/cpufreq.h>
 #include <linux/err.h>
+#include <linux/pm_opp.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/cpu_cooling.h>
@@ -44,6 +45,19 @@
  *	...
  */
 
+/**
+ * struct power_table - frequency to power conversion
+ * @frequency:	frequency in KHz
+ * @power:	power in mW
+ *
+ * This structure is built when the cooling device registers and helps
+ * in translating frequency to power and viceversa.
+ */
+struct power_table {
+	u32 frequency;
+	u32 power;
+};
+
 /**
  * struct cpufreq_cooling_device - data for cooling device with cpufreq
  * @id: unique integer value corresponding to each cpufreq_cooling_device
@@ -58,6 +72,15 @@
  *	cpufreq frequencies.
  * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device.
  * @node: list_head to link all cpufreq_cooling_device together.
+ * @last_load: load measured by the latest call to cpufreq_get_actual_power()
+ * @time_in_idle: previous reading of the absolute time that this cpu was idle
+ * @time_in_idle_timestamp: wall time of the last invocation of
+ *	get_cpu_idle_time_us()
+ * @dyn_power_table: array of struct power_table for frequency to power
+ *	conversion, sorted in ascending order.
+ * @dyn_power_table_entries: number of entries in the @dyn_power_table array
+ * @cpu_dev: the first cpu_device from @allowed_cpus that has OPPs registered
+ * @plat_get_static_power: callback to calculate the static power
  *
  * This structure is required for keeping information of each registered
  * cpufreq_cooling_device.
@@ -71,6 +94,13 @@ struct cpufreq_cooling_device {
 	unsigned int *freq_table;	/* In descending order */
 	struct cpumask allowed_cpus;
 	struct list_head node;
+	u32 last_load;
+	u64 *time_in_idle;
+	u64 *time_in_idle_timestamp;
+	struct power_table *dyn_power_table;
+	int dyn_power_table_entries;
+	struct device *cpu_dev;
+	get_static_t plat_get_static_power;
 };
 static DEFINE_IDR(cpufreq_idr);
 static DEFINE_MUTEX(cooling_cpufreq_lock);
@@ -167,6 +197,39 @@ unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq)
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_get_level);
 
+static void update_cpu_device(int cpu)
+{
+	struct cpufreq_cooling_device *cpufreq_dev;
+
+	mutex_lock(&cooling_cpufreq_lock);
+	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
+		if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) {
+			cpufreq_dev->cpu_dev = get_cpu_device(cpu);
+			if (!cpufreq_dev->cpu_dev) {
+				dev_warn(&cpufreq_dev->cool_dev->device,
+					"No cpu device for new policy cpu %d\n",
+					 cpu);
+			}
+			break;
+		}
+	}
+	mutex_unlock(&cooling_cpufreq_lock);
+}
+
+static void remove_cpu_device(int cpu)
+{
+	struct cpufreq_cooling_device *cpufreq_dev;
+
+	mutex_lock(&cooling_cpufreq_lock);
+	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
+		if (cpumask_test_cpu(cpu, &cpufreq_dev->allowed_cpus)) {
+			cpufreq_dev->cpu_dev = NULL;
+			break;
+		}
+	}
+	mutex_unlock(&cooling_cpufreq_lock);
+}
+
 /**
  * cpufreq_thermal_notifier - notifier callback for cpufreq policy change.
  * @nb:	struct notifier_block * with callback info.
@@ -186,23 +249,240 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
 	unsigned long max_freq = 0;
 	struct cpufreq_cooling_device *cpufreq_dev;
 
-	if (event != CPUFREQ_ADJUST)
-		return 0;
+	switch (event) {
 
-	mutex_lock(&cooling_cpufreq_lock);
-	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
-		if (!cpumask_test_cpu(policy->cpu,
-					&cpufreq_dev->allowed_cpus))
+	case CPUFREQ_ADJUST:
+		mutex_lock(&cooling_cpufreq_lock);
+		list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
+			if (!cpumask_test_cpu(policy->cpu,
+					      &cpufreq_dev->allowed_cpus))
+				continue;
+
+			max_freq = cpufreq_dev->cpufreq_val;
+
+			if (policy->max != max_freq)
+				cpufreq_verify_within_limits(policy, 0,
+							     max_freq);
+		}
+		mutex_unlock(&cooling_cpufreq_lock);
+		break;
+
+	case CPUFREQ_CREATE_POLICY:
+		update_cpu_device(policy->cpu);
+		break;
+	case CPUFREQ_REMOVE_POLICY:
+		remove_cpu_device(policy->cpu);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	return NOTIFY_OK;
+}
+
+/**
+ * build_dyn_power_table() - create a dynamic power to frequency table
+ * @cpufreq_device:	the cpufreq cooling device in which to store the table
+ * @capacitance: dynamic power coefficient for these cpus
+ *
+ * Build a dynamic power to frequency table for this cpu and store it
+ * in @cpufreq_device.  This table will be used in cpu_power_to_freq() and
+ * cpu_freq_to_power() to convert between power and frequency
+ * efficiently.  Power is stored in mW, frequency in KHz.  The
+ * resulting table is in ascending order.
+ *
+ * Return: 0 on success, -E* on error.
+ */
+static int build_dyn_power_table(struct cpufreq_cooling_device *cpufreq_device,
+				 u32 capacitance)
+{
+	struct power_table *power_table;
+	struct dev_pm_opp *opp;
+	struct device *dev = NULL;
+	int num_opps = 0, cpu, i, ret = 0;
+	unsigned long freq;
+
+	rcu_read_lock();
+
+	for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
+		dev = get_cpu_device(cpu);
+		if (!dev) {
+			dev_warn(&cpufreq_device->cool_dev->device,
+				 "No cpu device for cpu %d\n", cpu);
 			continue;
+		}
 
-		max_freq = cpufreq_dev->cpufreq_val;
+		num_opps = dev_pm_opp_get_opp_count(dev);
+		if (num_opps > 0) {
+			break;
+		} else if (num_opps < 0) {
+			ret = num_opps;
+			goto unlock;
+		}
+	}
 
-		if (policy->max != max_freq)
-			cpufreq_verify_within_limits(policy, 0, max_freq);
+	if (num_opps == 0) {
+		ret = -EINVAL;
+		goto unlock;
 	}
-	mutex_unlock(&cooling_cpufreq_lock);
 
-	return 0;
+	power_table = kcalloc(num_opps, sizeof(*power_table), GFP_KERNEL);
+
+	for (freq = 0, i = 0;
+	     opp = dev_pm_opp_find_freq_ceil(dev, &freq), !IS_ERR(opp);
+	     freq++, i++) {
+		u32 freq_mhz, voltage_mv;
+		u64 power;
+
+		freq_mhz = freq / 1000000;
+		voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
+
+		/*
+		 * Do the multiplication with MHz and millivolt so as
+		 * to not overflow.
+		 */
+		power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
+		do_div(power, 1000000000);
+
+		/* frequency is stored in power_table in KHz */
+		power_table[i].frequency = freq / 1000;
+
+		/* power is stored in mW */
+		power_table[i].power = power;
+	}
+
+	if (i == 0) {
+		ret = PTR_ERR(opp);
+		goto unlock;
+	}
+
+	cpufreq_device->cpu_dev = dev;
+	cpufreq_device->dyn_power_table = power_table;
+	cpufreq_device->dyn_power_table_entries = i;
+
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+
+static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_device,
+			     u32 freq)
+{
+	int i;
+	struct power_table *pt = cpufreq_device->dyn_power_table;
+
+	for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
+		if (freq < pt[i].frequency)
+			break;
+
+	return pt[i - 1].power;
+}
+
+static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_device,
+			     u32 power)
+{
+	int i;
+	struct power_table *pt = cpufreq_device->dyn_power_table;
+
+	for (i = 1; i < cpufreq_device->dyn_power_table_entries; i++)
+		if (power < pt[i].power)
+			break;
+
+	return pt[i - 1].frequency;
+}
+
+/**
+ * get_load() - get load for a cpu since last updated
+ * @cpufreq_device:	&struct cpufreq_cooling_device for this cpu
+ * @cpu:	cpu number
+ *
+ * Return: The average load of cpu @cpu in percentage since this
+ * function was last called.
+ */
+static u32 get_load(struct cpufreq_cooling_device *cpufreq_device, int cpu)
+{
+	u32 load;
+	u64 now, now_idle, delta_time, delta_idle;
+
+	now_idle = get_cpu_idle_time(cpu, &now, 0);
+	delta_idle = now_idle - cpufreq_device->time_in_idle[cpu];
+	delta_time = now - cpufreq_device->time_in_idle_timestamp[cpu];
+
+	if (delta_time <= delta_idle)
+		load = 0;
+	else
+		load = div64_u64(100 * (delta_time - delta_idle), delta_time);
+
+	cpufreq_device->time_in_idle[cpu] = now_idle;
+	cpufreq_device->time_in_idle_timestamp[cpu] = now;
+
+	return load;
+}
+
+/**
+ * get_static_power() - calculate the static power consumed by the cpus
+ * @cpufreq_device:	struct &cpufreq_cooling_device for this cpu cdev
+ * @tz:		thermal zone device in which we're operating
+ * @freq:	frequency in KHz
+ * @power:	pointer in which to store the calculated static power
+ *
+ * Calculate the static power consumed by the cpus described by
+ * @cpu_actor running at frequency @freq.  This function relies on a
+ * platform specific function that should have been provided when the
+ * actor was registered.  If it wasn't, the static power is assumed to
+ * be negligible.  The calculated static power is stored in @power.
+ *
+ * Return: 0 on success, -E* on failure.
+ */
+static int get_static_power(struct cpufreq_cooling_device *cpufreq_device,
+			    struct thermal_zone_device *tz, unsigned long freq,
+			    u32 *power)
+{
+	struct dev_pm_opp *opp;
+	unsigned long voltage;
+	struct cpumask *cpumask = &cpufreq_device->allowed_cpus;
+	unsigned long freq_hz = freq * 1000;
+
+	if (!cpufreq_device->plat_get_static_power ||
+	    !cpufreq_device->cpu_dev) {
+		*power = 0;
+		return 0;
+	}
+
+	rcu_read_lock();
+
+	opp = dev_pm_opp_find_freq_exact(cpufreq_device->cpu_dev, freq_hz,
+					 true);
+	voltage = dev_pm_opp_get_voltage(opp);
+
+	rcu_read_unlock();
+
+	if (voltage == 0) {
+		dev_warn_ratelimited(cpufreq_device->cpu_dev,
+				     "Failed to get voltage for frequency %lu: %ld\n",
+				     freq_hz, IS_ERR(opp) ? PTR_ERR(opp) : 0);
+		return -EINVAL;
+	}
+
+	return cpufreq_device->plat_get_static_power(cpumask, tz->passive_delay,
+						     voltage, power);
+}
+
+/**
+ * get_dynamic_power() - calculate the dynamic power
+ * @cpufreq_device:	&cpufreq_cooling_device for this cdev
+ * @freq:	current frequency
+ *
+ * Return: the dynamic power consumed by the cpus described by
+ * @cpufreq_device.
+ */
+static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_device,
+			     unsigned long freq)
+{
+	u32 raw_cpu_power;
+
+	raw_cpu_power = cpu_freq_to_power(cpufreq_device, freq);
+	return (raw_cpu_power * cpufreq_device->last_load) / 100;
 }
 
 /* cpufreq cooling device callback functions are defined below */
@@ -280,8 +560,169 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
 	return 0;
 }
 
+/**
+ * cpufreq_get_requested_power() - get the current power
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @power:	pointer in which to store the resulting power
+ *
+ * Calculate the current power consumption of the cpus in milliwatts
+ * and store it in @power.  This function should actually calculate
+ * the requested power, but it's hard to get the frequency that
+ * cpufreq would have assigned if there were no thermal limits.
+ * Instead, we calculate the current power on the assumption that the
+ * immediate future will look like the immediate past.
+ *
+ * We use the current frequency and the average load since this
+ * function was last called.  In reality, there could have been
+ * multiple opps since this function was last called and that affects
+ * the load calculation.  While it's not perfectly accurate, this
+ * simplification is good enough and works.  REVISIT this, as more
+ * complex code may be needed if experiments show that it's not
+ * accurate enough.
+ *
+ * Return: 0 on success, -E* if getting the static power failed.
+ */
+static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev,
+				       struct thermal_zone_device *tz,
+				       u32 *power)
+{
+	unsigned long freq;
+	int cpu, ret;
+	u32 static_power, dynamic_power, total_load = 0;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+
+	freq = cpufreq_quick_get(cpumask_any(&cpufreq_device->allowed_cpus));
+
+	for_each_cpu(cpu, &cpufreq_device->allowed_cpus) {
+		u32 load;
+
+		if (cpu_online(cpu))
+			load = get_load(cpufreq_device, cpu);
+		else
+			load = 0;
+
+		total_load += load;
+	}
+
+	cpufreq_device->last_load = total_load;
+
+	dynamic_power = get_dynamic_power(cpufreq_device, freq);
+	ret = get_static_power(cpufreq_device, tz, freq, &static_power);
+	if (ret)
+		return ret;
+
+	*power = static_power + dynamic_power;
+	return 0;
+}
+
+/**
+ * cpufreq_state2power() - convert a cpu cdev state to power consumed
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @state:	cooling device state to be converted
+ * @power:	pointer in which to store the resulting power
+ *
+ * Convert cooling device state @state into power consumption in
+ * milliwatts assuming 100% load.  Store the calculated power in
+ * @power.
+ *
+ * Return: 0 on success, -EINVAL if the cooling device state could not
+ * be converted into a frequency or other -E* if there was an error
+ * when calculating the static power.
+ */
+static int cpufreq_state2power(struct thermal_cooling_device *cdev,
+			       struct thermal_zone_device *tz,
+			       unsigned long state, u32 *power)
+{
+	unsigned int freq, num_cpus;
+	cpumask_t cpumask;
+	u32 static_power, dynamic_power;
+	int ret;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+
+	cpumask_and(&cpumask, &cpufreq_device->allowed_cpus, cpu_online_mask);
+	num_cpus = cpumask_weight(&cpumask);
+
+	/* None of our cpus are online, so no power */
+	if (num_cpus == 0) {
+		*power = 0;
+		return 0;
+	}
+
+	freq = cpufreq_device->freq_table[state];
+	if (!freq)
+		return -EINVAL;
+
+	dynamic_power = cpu_freq_to_power(cpufreq_device, freq) * num_cpus;
+	ret = get_static_power(cpufreq_device, tz, freq, &static_power);
+	if (ret)
+		return ret;
+
+	*power = static_power + dynamic_power;
+	return 0;
+}
+
+/**
+ * cpufreq_power2state() - convert power to a cooling device state
+ * @cdev:	&thermal_cooling_device pointer
+ * @tz:		a valid thermal zone device pointer
+ * @power:	power in milliwatts to be converted
+ * @state:	pointer in which to store the resulting state
+ *
+ * Calculate a cooling device state for the cpus described by @cdev
+ * that would allow them to consume at most @power mW and store it in
+ * @state.  Note that this calculation depends on external factors
+ * such as the cpu load or the current static power.  Calling this
+ * function with the same power as input can yield different cooling
+ * device states depending on those external factors.
+ *
+ * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if
+ * the calculated frequency could not be converted to a valid state.
+ * The latter should not happen unless the frequencies available to
+ * cpufreq have changed since the initialization of the cpu cooling
+ * device.
+ */
+static int cpufreq_power2state(struct thermal_cooling_device *cdev,
+			       struct thermal_zone_device *tz, u32 power,
+			       unsigned long *state)
+{
+	unsigned int cpu, cur_freq, target_freq;
+	int ret;
+	s32 dyn_power;
+	u32 last_load, normalised_power, static_power;
+	struct cpufreq_cooling_device *cpufreq_device = cdev->devdata;
+
+	cpu = cpumask_any_and(&cpufreq_device->allowed_cpus, cpu_online_mask);
+
+	/* None of our cpus are online */
+	if (cpu >= nr_cpu_ids)
+		return -ENODEV;
+
+	cur_freq = cpufreq_quick_get(cpu);
+	ret = get_static_power(cpufreq_device, tz, cur_freq, &static_power);
+	if (ret)
+		return ret;
+
+	dyn_power = power - static_power;
+	dyn_power = dyn_power > 0 ? dyn_power : 0;
+	last_load = cpufreq_device->last_load ?: 1;
+	normalised_power = (dyn_power * 100) / last_load;
+	target_freq = cpu_power_to_freq(cpufreq_device, normalised_power);
+
+	*state = cpufreq_cooling_get_level(cpu, target_freq);
+	if (*state == THERMAL_CSTATE_INVALID) {
+		dev_warn_ratelimited(&cdev->device,
+				     "Failed to convert %dKHz for cpu %d into a cdev state\n",
+				     target_freq, cpu);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 /* Bind cpufreq callbacks to thermal cooling device ops */
-static struct thermal_cooling_device_ops const cpufreq_cooling_ops = {
+static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
 	.get_max_state = cpufreq_get_max_state,
 	.get_cur_state = cpufreq_get_cur_state,
 	.set_cur_state = cpufreq_set_cur_state,
@@ -311,6 +752,9 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
  * @np: a valid struct device_node to the cooling device device tree node
  * @clip_cpus: cpumask of cpus where the frequency constraints will happen.
  * Normally this should be same as cpufreq policy->related_cpus.
+ * @capacitance: dynamic power coefficient for these cpus
+ * @plat_static_func: function to calculate the static power consumed by these
+ *                    cpus (optional)
  *
  * This interface function registers the cpufreq cooling device with the name
  * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
@@ -322,13 +766,14 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
  */
 static struct thermal_cooling_device *
 __cpufreq_cooling_register(struct device_node *np,
-			   const struct cpumask *clip_cpus)
+			const struct cpumask *clip_cpus, u32 capacitance,
+			get_static_t plat_static_func)
 {
 	struct thermal_cooling_device *cool_dev;
 	struct cpufreq_cooling_device *cpufreq_dev;
 	char dev_name[THERMAL_NAME_LENGTH];
 	struct cpufreq_frequency_table *pos, *table;
-	unsigned int freq, i;
+	unsigned int freq, i, num_cpus;
 	int ret;
 
 	table = cpufreq_frequency_get_table(cpumask_first(clip_cpus));
@@ -341,6 +786,23 @@ __cpufreq_cooling_register(struct device_node *np,
 	if (!cpufreq_dev)
 		return ERR_PTR(-ENOMEM);
 
+	num_cpus = cpumask_weight(clip_cpus);
+	cpufreq_dev->time_in_idle = kcalloc(num_cpus,
+					    sizeof(*cpufreq_dev->time_in_idle),
+					    GFP_KERNEL);
+	if (!cpufreq_dev->time_in_idle) {
+		cool_dev = ERR_PTR(-ENOMEM);
+		goto free_cdev;
+	}
+
+	cpufreq_dev->time_in_idle_timestamp =
+		kcalloc(num_cpus, sizeof(*cpufreq_dev->time_in_idle_timestamp),
+			GFP_KERNEL);
+	if (!cpufreq_dev->time_in_idle_timestamp) {
+		cool_dev = ERR_PTR(-ENOMEM);
+		goto free_time_in_idle;
+	}
+
 	/* Find max levels */
 	cpufreq_for_each_valid_entry(pos, table)
 		cpufreq_dev->max_level++;
@@ -349,7 +811,7 @@ __cpufreq_cooling_register(struct device_node *np,
 					  cpufreq_dev->max_level, GFP_KERNEL);
 	if (!cpufreq_dev->freq_table) {
 		cool_dev = ERR_PTR(-ENOMEM);
-		goto free_cdev;
+		goto free_time_in_idle_timestamp;
 	}
 
 	/* max_level is an index, not a counter */
@@ -357,6 +819,20 @@ __cpufreq_cooling_register(struct device_node *np,
 
 	cpumask_copy(&cpufreq_dev->allowed_cpus, clip_cpus);
 
+	if (capacitance) {
+		cpufreq_cooling_ops.get_requested_power =
+			cpufreq_get_requested_power;
+		cpufreq_cooling_ops.state2power = cpufreq_state2power;
+		cpufreq_cooling_ops.power2state = cpufreq_power2state;
+		cpufreq_dev->plat_get_static_power = plat_static_func;
+
+		ret = build_dyn_power_table(cpufreq_dev, capacitance);
+		if (ret) {
+			cool_dev = ERR_PTR(ret);
+			goto free_table;
+		}
+	}
+
 	ret = get_idr(&cpufreq_idr, &cpufreq_dev->id);
 	if (ret) {
 		cool_dev = ERR_PTR(ret);
@@ -402,6 +878,10 @@ remove_idr:
 	release_idr(&cpufreq_idr, cpufreq_dev->id);
 free_table:
 	kfree(cpufreq_dev->freq_table);
+free_time_in_idle_timestamp:
+	kfree(cpufreq_dev->time_in_idle_timestamp);
+free_time_in_idle:
+	kfree(cpufreq_dev->time_in_idle);
 free_cdev:
 	kfree(cpufreq_dev);
 
@@ -422,7 +902,7 @@ free_cdev:
 struct thermal_cooling_device *
 cpufreq_cooling_register(const struct cpumask *clip_cpus)
 {
-	return __cpufreq_cooling_register(NULL, clip_cpus);
+	return __cpufreq_cooling_register(NULL, clip_cpus, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
 
@@ -446,10 +926,77 @@ of_cpufreq_cooling_register(struct device_node *np,
 	if (!np)
 		return ERR_PTR(-EINVAL);
 
-	return __cpufreq_cooling_register(np, clip_cpus);
+	return __cpufreq_cooling_register(np, clip_cpus, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register);
 
+/**
+ * cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
+ * @clip_cpus:	cpumask of cpus where the frequency constraints will happen
+ * @capacitance:	dynamic power coefficient for these cpus
+ * @plat_static_func:	function to calculate the static power consumed by these
+ *			cpus (optional)
+ *
+ * This interface function registers the cpufreq cooling device with
+ * the name "thermal-cpufreq-%x".  This api can support multiple
+ * instances of cpufreq cooling devices.  Using this function, the
+ * cooling device will implement the power extensions by using a
+ * simple cpu power model.  The cpus must have registered their OPPs
+ * using the OPP library.
+ *
+ * An optional @plat_static_func may be provided to calculate the
+ * static power consumed by these cpus.  If the platform's static
+ * power consumption is unknown or negligible, make it NULL.
+ *
+ * Return: a valid struct thermal_cooling_device pointer on success,
+ * on failure, it returns a corresponding ERR_PTR().
+ */
+struct thermal_cooling_device *
+cpufreq_power_cooling_register(const struct cpumask *clip_cpus, u32 capacitance,
+			       get_static_t plat_static_func)
+{
+	return __cpufreq_cooling_register(NULL, clip_cpus, capacitance,
+				plat_static_func);
+}
+EXPORT_SYMBOL(cpufreq_power_cooling_register);
+
+/**
+ * of_cpufreq_power_cooling_register() - create cpufreq cooling device with power extensions
+ * @np:	a valid struct device_node to the cooling device device tree node
+ * @clip_cpus:	cpumask of cpus where the frequency constraints will happen
+ * @capacitance:	dynamic power coefficient for these cpus
+ * @plat_static_func:	function to calculate the static power consumed by these
+ *			cpus (optional)
+ *
+ * This interface function registers the cpufreq cooling device with
+ * the name "thermal-cpufreq-%x".  This api can support multiple
+ * instances of cpufreq cooling devices.  Using this API, the cpufreq
+ * cooling device will be linked to the device tree node provided.
+ * Using this function, the cooling device will implement the power
+ * extensions by using a simple cpu power model.  The cpus must have
+ * registered their OPPs using the OPP library.
+ *
+ * An optional @plat_static_func may be provided to calculate the
+ * static power consumed by these cpus.  If the platform's static
+ * power consumption is unknown or negligible, make it NULL.
+ *
+ * Return: a valid struct thermal_cooling_device pointer on success,
+ * on failure, it returns a corresponding ERR_PTR().
+ */
+struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func)
+{
+	if (!np)
+		return ERR_PTR(-EINVAL);
+
+	return __cpufreq_cooling_register(np, clip_cpus, capacitance,
+				plat_static_func);
+}
+EXPORT_SYMBOL(of_cpufreq_power_cooling_register);
+
 /**
  * cpufreq_cooling_unregister - function to remove cpufreq cooling device.
  * @cdev: thermal cooling device pointer.
@@ -475,6 +1022,8 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
 	thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
 	release_idr(&cpufreq_idr, cpufreq_dev->id);
+	kfree(cpufreq_dev->time_in_idle_timestamp);
+	kfree(cpufreq_dev->time_in_idle);
 	kfree(cpufreq_dev->freq_table);
 	kfree(cpufreq_dev);
 }
diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h
index bd955270d5aa..c156f5082758 100644
--- a/include/linux/cpu_cooling.h
+++ b/include/linux/cpu_cooling.h
@@ -28,6 +28,9 @@
 #include <linux/thermal.h>
 #include <linux/cpumask.h>
 
+typedef int (*get_static_t)(cpumask_t *cpumask, int interval,
+			    unsigned long voltage, u32 *power);
+
 #ifdef CONFIG_CPU_THERMAL
 /**
  * cpufreq_cooling_register - function to create cpufreq cooling device.
@@ -36,6 +39,10 @@
 struct thermal_cooling_device *
 cpufreq_cooling_register(const struct cpumask *clip_cpus);
 
+struct thermal_cooling_device *
+cpufreq_power_cooling_register(const struct cpumask *clip_cpus,
+			       u32 capacitance, get_static_t plat_static_func);
+
 /**
  * of_cpufreq_cooling_register - create cpufreq cooling device based on DT.
  * @np: a valid struct device_node to the cooling device device tree node.
@@ -45,6 +52,12 @@ cpufreq_cooling_register(const struct cpumask *clip_cpus);
 struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct device_node *np,
 			    const struct cpumask *clip_cpus);
+
+struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func);
 #else
 static inline struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct device_node *np,
@@ -52,6 +65,15 @@ of_cpufreq_cooling_register(struct device_node *np,
 {
 	return ERR_PTR(-ENOSYS);
 }
+
+static inline struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func)
+{
+	return NULL;
+}
 #endif
 
 /**
@@ -67,12 +89,29 @@ cpufreq_cooling_register(const struct cpumask *clip_cpus)
 {
 	return ERR_PTR(-ENOSYS);
 }
+static inline struct thermal_cooling_device *
+cpufreq_power_cooling_register(const struct cpumask *clip_cpus,
+			       u32 capacitance, get_static_t plat_static_func)
+{
+	return NULL;
+}
+
 static inline struct thermal_cooling_device *
 of_cpufreq_cooling_register(struct device_node *np,
 			    const struct cpumask *clip_cpus)
 {
 	return ERR_PTR(-ENOSYS);
 }
+
+static inline struct thermal_cooling_device *
+of_cpufreq_power_cooling_register(struct device_node *np,
+				  const struct cpumask *clip_cpus,
+				  u32 capacitance,
+				  get_static_t plat_static_func)
+{
+	return NULL;
+}
+
 static inline
 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 {
-- 
cgit v1.2.3


From 6b775e870c56c59c3e16531ea2307b797395f9f7 Mon Sep 17 00:00:00 2001
From: Javi Merino <javi.merino@arm.com>
Date: Mon, 2 Mar 2015 17:17:19 +0000
Subject: thermal: introduce the Power Allocator governor

The power allocator governor is a thermal governor that controls system
and device power allocation to control temperature.  Conceptually, the
implementation divides the sustainable power of a thermal zone among
all the heat sources in that zone.

This governor relies on "power actors", entities that represent heat
sources.  They can report current and maximum power consumption and
can set a given maximum power consumption, usually via a cooling
device.

The governor uses a Proportional Integral Derivative (PID) controller
driven by the temperature of the thermal zone.  The output of the
controller is a power budget that is then allocated to each power
actor that can have bearing on the temperature we are trying to
control.  It decides how much power to give each cooling device based
on the performance they are requesting.  The PID controller ensures
that the total power budget does not exceed the control temperature.

Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Signed-off-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 Documentation/thermal/power_allocator.txt | 247 ++++++++++++++
 drivers/thermal/Kconfig                   |  15 +
 drivers/thermal/Makefile                  |   1 +
 drivers/thermal/power_allocator.c         | 520 ++++++++++++++++++++++++++++++
 drivers/thermal/thermal_core.c            |   9 +-
 drivers/thermal/thermal_core.h            |   8 +
 include/linux/thermal.h                   |  37 ++-
 7 files changed, 830 insertions(+), 7 deletions(-)
 create mode 100644 Documentation/thermal/power_allocator.txt
 create mode 100644 drivers/thermal/power_allocator.c

(limited to 'include/linux')

diff --git a/Documentation/thermal/power_allocator.txt b/Documentation/thermal/power_allocator.txt
new file mode 100644
index 000000000000..c3797b529991
--- /dev/null
+++ b/Documentation/thermal/power_allocator.txt
@@ -0,0 +1,247 @@
+Power allocator governor tunables
+=================================
+
+Trip points
+-----------
+
+The governor requires the following two passive trip points:
+
+1.  "switch on" trip point: temperature above which the governor
+    control loop starts operating.  This is the first passive trip
+    point of the thermal zone.
+
+2.  "desired temperature" trip point: it should be higher than the
+    "switch on" trip point.  This the target temperature the governor
+    is controlling for.  This is the last passive trip point of the
+    thermal zone.
+
+PID Controller
+--------------
+
+The power allocator governor implements a
+Proportional-Integral-Derivative controller (PID controller) with
+temperature as the control input and power as the controlled output:
+
+    P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power
+
+where
+    e = desired_temperature - current_temperature
+    err_integral is the sum of previous errors
+    diff_err = e - previous_error
+
+It is similar to the one depicted below:
+
+                                      k_d
+                                       |
+current_temp                           |
+     |                                 v
+     |                +----------+   +---+
+     |         +----->| diff_err |-->| X |------+
+     |         |      +----------+   +---+      |
+     |         |                                |      tdp        actor
+     |         |                      k_i       |       |  get_requested_power()
+     |         |                       |        |       |        |     |
+     |         |                       |        |       |        |     | ...
+     v         |                       v        v       v        v     v
+   +---+       |      +-------+      +---+    +---+   +---+   +----------+
+   | S |-------+----->| sum e |----->| X |--->| S |-->| S |-->|power     |
+   +---+       |      +-------+      +---+    +---+   +---+   |allocation|
+     ^         |                                ^             +----------+
+     |         |                                |                |     |
+     |         |        +---+                   |                |     |
+     |         +------->| X |-------------------+                v     v
+     |                  +---+                               granted performance
+desired_temperature       ^
+                          |
+                          |
+                      k_po/k_pu
+
+Sustainable power
+-----------------
+
+An estimate of the sustainable dissipatable power (in mW) should be
+provided while registering the thermal zone.  This estimates the
+sustained power that can be dissipated at the desired control
+temperature.  This is the maximum sustained power for allocation at
+the desired maximum temperature.  The actual sustained power can vary
+for a number of reasons.  The closed loop controller will take care of
+variations such as environmental conditions, and some factors related
+to the speed-grade of the silicon.  `sustainable_power` is therefore
+simply an estimate, and may be tuned to affect the aggressiveness of
+the thermal ramp. For reference, the sustainable power of a 4" phone
+is typically 2000mW, while on a 10" tablet is around 4500mW (may vary
+depending on screen size).
+
+If you are using device tree, do add it as a property of the
+thermal-zone.  For example:
+
+	thermal-zones {
+		soc_thermal {
+			polling-delay = <1000>;
+			polling-delay-passive = <100>;
+			sustainable-power = <2500>;
+			...
+
+Instead, if the thermal zone is registered from the platform code, pass a
+`thermal_zone_params` that has a `sustainable_power`.  If no
+`thermal_zone_params` were being passed, then something like below
+will suffice:
+
+	static const struct thermal_zone_params tz_params = {
+		.sustainable_power = 3500,
+	};
+
+and then pass `tz_params` as the 5th parameter to
+`thermal_zone_device_register()`
+
+k_po and k_pu
+-------------
+
+The implementation of the PID controller in the power allocator
+thermal governor allows the configuration of two proportional term
+constants: `k_po` and `k_pu`.  `k_po` is the proportional term
+constant during temperature overshoot periods (current temperature is
+above "desired temperature" trip point).  Conversely, `k_pu` is the
+proportional term constant during temperature undershoot periods
+(current temperature below "desired temperature" trip point).
+
+These controls are intended as the primary mechanism for configuring
+the permitted thermal "ramp" of the system.  For instance, a lower
+`k_pu` value will provide a slower ramp, at the cost of capping
+available capacity at a low temperature.  On the other hand, a high
+value of `k_pu` will result in the governor granting very high power
+whilst temperature is low, and may lead to temperature overshooting.
+
+The default value for `k_pu` is:
+
+    2 * sustainable_power / (desired_temperature - switch_on_temp)
+
+This means that at `switch_on_temp` the output of the controller's
+proportional term will be 2 * `sustainable_power`.  The default value
+for `k_po` is:
+
+    sustainable_power / (desired_temperature - switch_on_temp)
+
+Focusing on the proportional and feed forward values of the PID
+controller equation we have:
+
+    P_max = k_p * e + sustainable_power
+
+The proportional term is proportional to the difference between the
+desired temperature and the current one.  When the current temperature
+is the desired one, then the proportional component is zero and
+`P_max` = `sustainable_power`.  That is, the system should operate in
+thermal equilibrium under constant load.  `sustainable_power` is only
+an estimate, which is the reason for closed-loop control such as this.
+
+Expanding `k_pu` we get:
+    P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) +
+        sustainable_power
+
+where
+    T_set is the desired temperature
+    T is the current temperature
+    T_on is the switch on temperature
+
+When the current temperature is the switch_on temperature, the above
+formula becomes:
+
+    P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) +
+        sustainable_power = 2 * sustainable_power + sustainable_power =
+        3 * sustainable_power
+
+Therefore, the proportional term alone linearly decreases power from
+3 * `sustainable_power` to `sustainable_power` as the temperature
+rises from the switch on temperature to the desired temperature.
+
+k_i and integral_cutoff
+-----------------------
+
+`k_i` configures the PID loop's integral term constant.  This term
+allows the PID controller to compensate for long term drift and for
+the quantized nature of the output control: cooling devices can't set
+the exact power that the governor requests.  When the temperature
+error is below `integral_cutoff`, errors are accumulated in the
+integral term.  This term is then multiplied by `k_i` and the result
+added to the output of the controller.  Typically `k_i` is set low (1
+or 2) and `integral_cutoff` is 0.
+
+k_d
+---
+
+`k_d` configures the PID loop's derivative term constant.  It's
+recommended to leave it as the default: 0.
+
+Cooling device power API
+========================
+
+Cooling devices controlled by this governor must supply the additional
+"power" API in their `cooling_device_ops`.  It consists on three ops:
+
+1. int get_requested_power(struct thermal_cooling_device *cdev,
+	struct thermal_zone_device *tz, u32 *power);
+@cdev: The `struct thermal_cooling_device` pointer
+@tz: thermal zone in which we are currently operating
+@power: pointer in which to store the calculated power
+
+`get_requested_power()` calculates the power requested by the device
+in milliwatts and stores it in @power .  It should return 0 on
+success, -E* on failure.  This is currently used by the power
+allocator governor to calculate how much power to give to each cooling
+device.
+
+2. int state2power(struct thermal_cooling_device *cdev, struct
+        thermal_zone_device *tz, unsigned long state, u32 *power);
+@cdev: The `struct thermal_cooling_device` pointer
+@tz: thermal zone in which we are currently operating
+@state: A cooling device state
+@power: pointer in which to store the equivalent power
+
+Convert cooling device state @state into power consumption in
+milliwatts and store it in @power.  It should return 0 on success, -E*
+on failure.  This is currently used by thermal core to calculate the
+maximum power that an actor can consume.
+
+3. int power2state(struct thermal_cooling_device *cdev, u32 power,
+	unsigned long *state);
+@cdev: The `struct thermal_cooling_device` pointer
+@power: power in milliwatts
+@state: pointer in which to store the resulting state
+
+Calculate a cooling device state that would make the device consume at
+most @power mW and store it in @state.  It should return 0 on success,
+-E* on failure.  This is currently used by the thermal core to convert
+a given power set by the power allocator governor to a state that the
+cooling device can set.  It is a function because this conversion may
+depend on external factors that may change so this function should the
+best conversion given "current circumstances".
+
+Cooling device weights
+----------------------
+
+Weights are a mechanism to bias the allocation among cooling
+devices.  They express the relative power efficiency of different
+cooling devices.  Higher weight can be used to express higher power
+efficiency.  Weighting is relative such that if each cooling device
+has a weight of one they are considered equal.  This is particularly
+useful in heterogeneous systems where two cooling devices may perform
+the same kind of compute, but with different efficiency.  For example,
+a system with two different types of processors.
+
+If the thermal zone is registered using
+`thermal_zone_device_register()` (i.e., platform code), then weights
+are passed as part of the thermal zone's `thermal_bind_parameters`.
+If the platform is registered using device tree, then they are passed
+as the `contribution` property of each map in the `cooling-maps` node.
+
+Limitations of the power allocator governor
+===========================================
+
+The power allocator governor's PID controller works best if there is a
+periodic tick.  If you have a driver that calls
+`thermal_zone_device_update()` (or anything that ends up calling the
+governor's `throttle()` function) repetitively, the governor response
+won't be very good.  Note that this is not particular to this
+governor, step-wise will also misbehave if you call its throttle()
+faster than the normal thermal framework tick (due to interrupts for
+example) as it will overreact.
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 30aee81e9f5b..a1b43eab0a70 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -71,6 +71,14 @@ config THERMAL_DEFAULT_GOV_USER_SPACE
 	  Select this if you want to let the user space manage the
 	  platform thermals.
 
+config THERMAL_DEFAULT_GOV_POWER_ALLOCATOR
+	bool "power_allocator"
+	select THERMAL_GOV_POWER_ALLOCATOR
+	help
+	  Select this if you want to control temperature based on
+	  system and device power allocation. This governor can only
+	  operate on cooling devices that implement the power API.
+
 endchoice
 
 config THERMAL_GOV_FAIR_SHARE
@@ -99,6 +107,13 @@ config THERMAL_GOV_USER_SPACE
 	help
 	  Enable this to let the user space manage the platform thermals.
 
+config THERMAL_GOV_POWER_ALLOCATOR
+	bool "Power allocator thermal governor"
+	select THERMAL_POWER_ACTOR
+	help
+	  Enable this to manage platform thermals by dynamically
+	  allocating and limiting power to devices.
+
 config CPU_THERMAL
 	bool "generic cpu cooling support"
 	depends on CPU_FREQ
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 1fe86652cfb6..b1783cf37ed2 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -14,6 +14,7 @@ thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE)	+= fair_share.o
 thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG)	+= gov_bang_bang.o
 thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE)	+= step_wise.o
 thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE)	+= user_space.o
+thermal_sys-$(CONFIG_THERMAL_GOV_POWER_ALLOCATOR)	+= power_allocator.o
 
 # cpufreq cooling
 thermal_sys-$(CONFIG_CPU_THERMAL)	+= cpu_cooling.o
diff --git a/drivers/thermal/power_allocator.c b/drivers/thermal/power_allocator.c
new file mode 100644
index 000000000000..67982d79b76c
--- /dev/null
+++ b/drivers/thermal/power_allocator.c
@@ -0,0 +1,520 @@
+/*
+ * A power allocator to manage temperature
+ *
+ * Copyright (C) 2014 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "Power allocator: " fmt
+
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#include "thermal_core.h"
+
+#define FRAC_BITS 10
+#define int_to_frac(x) ((x) << FRAC_BITS)
+#define frac_to_int(x) ((x) >> FRAC_BITS)
+
+/**
+ * mul_frac() - multiply two fixed-point numbers
+ * @x:	first multiplicand
+ * @y:	second multiplicand
+ *
+ * Return: the result of multiplying two fixed-point numbers.  The
+ * result is also a fixed-point number.
+ */
+static inline s64 mul_frac(s64 x, s64 y)
+{
+	return (x * y) >> FRAC_BITS;
+}
+
+/**
+ * div_frac() - divide two fixed-point numbers
+ * @x:	the dividend
+ * @y:	the divisor
+ *
+ * Return: the result of dividing two fixed-point numbers.  The
+ * result is also a fixed-point number.
+ */
+static inline s64 div_frac(s64 x, s64 y)
+{
+	return div_s64(x << FRAC_BITS, y);
+}
+
+/**
+ * struct power_allocator_params - parameters for the power allocator governor
+ * @err_integral:	accumulated error in the PID controller.
+ * @prev_err:	error in the previous iteration of the PID controller.
+ *		Used to calculate the derivative term.
+ * @trip_switch_on:	first passive trip point of the thermal zone.  The
+ *			governor switches on when this trip point is crossed.
+ * @trip_max_desired_temperature:	last passive trip point of the thermal
+ *					zone.  The temperature we are
+ *					controlling for.
+ */
+struct power_allocator_params {
+	s64 err_integral;
+	s32 prev_err;
+	int trip_switch_on;
+	int trip_max_desired_temperature;
+};
+
+/**
+ * pid_controller() - PID controller
+ * @tz:	thermal zone we are operating in
+ * @current_temp:	the current temperature in millicelsius
+ * @control_temp:	the target temperature in millicelsius
+ * @max_allocatable_power:	maximum allocatable power for this thermal zone
+ *
+ * This PID controller increases the available power budget so that the
+ * temperature of the thermal zone gets as close as possible to
+ * @control_temp and limits the power if it exceeds it.  k_po is the
+ * proportional term when we are overshooting, k_pu is the
+ * proportional term when we are undershooting.  integral_cutoff is a
+ * threshold below which we stop accumulating the error.  The
+ * accumulated error is only valid if the requested power will make
+ * the system warmer.  If the system is mostly idle, there's no point
+ * in accumulating positive error.
+ *
+ * Return: The power budget for the next period.
+ */
+static u32 pid_controller(struct thermal_zone_device *tz,
+			  unsigned long current_temp,
+			  unsigned long control_temp,
+			  u32 max_allocatable_power)
+{
+	s64 p, i, d, power_range;
+	s32 err, max_power_frac;
+	struct power_allocator_params *params = tz->governor_data;
+
+	max_power_frac = int_to_frac(max_allocatable_power);
+
+	err = ((s32)control_temp - (s32)current_temp);
+	err = int_to_frac(err);
+
+	/* Calculate the proportional term */
+	p = mul_frac(err < 0 ? tz->tzp->k_po : tz->tzp->k_pu, err);
+
+	/*
+	 * Calculate the integral term
+	 *
+	 * if the error is less than cut off allow integration (but
+	 * the integral is limited to max power)
+	 */
+	i = mul_frac(tz->tzp->k_i, params->err_integral);
+
+	if (err < int_to_frac(tz->tzp->integral_cutoff)) {
+		s64 i_next = i + mul_frac(tz->tzp->k_i, err);
+
+		if (abs64(i_next) < max_power_frac) {
+			i = i_next;
+			params->err_integral += err;
+		}
+	}
+
+	/*
+	 * Calculate the derivative term
+	 *
+	 * We do err - prev_err, so with a positive k_d, a decreasing
+	 * error (i.e. driving closer to the line) results in less
+	 * power being applied, slowing down the controller)
+	 */
+	d = mul_frac(tz->tzp->k_d, err - params->prev_err);
+	d = div_frac(d, tz->passive_delay);
+	params->prev_err = err;
+
+	power_range = p + i + d;
+
+	/* feed-forward the known sustainable dissipatable power */
+	power_range = tz->tzp->sustainable_power + frac_to_int(power_range);
+
+	return clamp(power_range, (s64)0, (s64)max_allocatable_power);
+}
+
+/**
+ * divvy_up_power() - divvy the allocated power between the actors
+ * @req_power:	each actor's requested power
+ * @max_power:	each actor's maximum available power
+ * @num_actors:	size of the @req_power, @max_power and @granted_power's array
+ * @total_req_power: sum of @req_power
+ * @power_range:	total allocated power
+ * @granted_power:	output array: each actor's granted power
+ * @extra_actor_power:	an appropriately sized array to be used in the
+ *			function as temporary storage of the extra power given
+ *			to the actors
+ *
+ * This function divides the total allocated power (@power_range)
+ * fairly between the actors.  It first tries to give each actor a
+ * share of the @power_range according to how much power it requested
+ * compared to the rest of the actors.  For example, if only one actor
+ * requests power, then it receives all the @power_range.  If
+ * three actors each requests 1mW, each receives a third of the
+ * @power_range.
+ *
+ * If any actor received more than their maximum power, then that
+ * surplus is re-divvied among the actors based on how far they are
+ * from their respective maximums.
+ *
+ * Granted power for each actor is written to @granted_power, which
+ * should've been allocated by the calling function.
+ */
+static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors,
+			   u32 total_req_power, u32 power_range,
+			   u32 *granted_power, u32 *extra_actor_power)
+{
+	u32 extra_power, capped_extra_power;
+	int i;
+
+	/*
+	 * Prevent division by 0 if none of the actors request power.
+	 */
+	if (!total_req_power)
+		total_req_power = 1;
+
+	capped_extra_power = 0;
+	extra_power = 0;
+	for (i = 0; i < num_actors; i++) {
+		u64 req_range = req_power[i] * power_range;
+
+		granted_power[i] = div_u64(req_range, total_req_power);
+
+		if (granted_power[i] > max_power[i]) {
+			extra_power += granted_power[i] - max_power[i];
+			granted_power[i] = max_power[i];
+		}
+
+		extra_actor_power[i] = max_power[i] - granted_power[i];
+		capped_extra_power += extra_actor_power[i];
+	}
+
+	if (!extra_power)
+		return;
+
+	/*
+	 * Re-divvy the reclaimed extra among actors based on
+	 * how far they are from the max
+	 */
+	extra_power = min(extra_power, capped_extra_power);
+	if (capped_extra_power > 0)
+		for (i = 0; i < num_actors; i++)
+			granted_power[i] += (extra_actor_power[i] *
+					extra_power) / capped_extra_power;
+}
+
+static int allocate_power(struct thermal_zone_device *tz,
+			  unsigned long current_temp,
+			  unsigned long control_temp)
+{
+	struct thermal_instance *instance;
+	struct power_allocator_params *params = tz->governor_data;
+	u32 *req_power, *max_power, *granted_power, *extra_actor_power;
+	u32 total_req_power, max_allocatable_power;
+	u32 power_range;
+	int i, num_actors, total_weight, ret = 0;
+	int trip_max_desired_temperature = params->trip_max_desired_temperature;
+
+	mutex_lock(&tz->lock);
+
+	num_actors = 0;
+	total_weight = 0;
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if ((instance->trip == trip_max_desired_temperature) &&
+		    cdev_is_power_actor(instance->cdev)) {
+			num_actors++;
+			total_weight += instance->weight;
+		}
+	}
+
+	/*
+	 * We need to allocate three arrays of the same size:
+	 * req_power, max_power and granted_power.  They are going to
+	 * be needed until this function returns.  Allocate them all
+	 * in one go to simplify the allocation and deallocation
+	 * logic.
+	 */
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*max_power));
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*granted_power));
+	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*extra_actor_power));
+	req_power = devm_kcalloc(&tz->device, num_actors * 4,
+				 sizeof(*req_power), GFP_KERNEL);
+	if (!req_power) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	max_power = &req_power[num_actors];
+	granted_power = &req_power[2 * num_actors];
+	extra_actor_power = &req_power[3 * num_actors];
+
+	i = 0;
+	total_req_power = 0;
+	max_allocatable_power = 0;
+
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		int weight;
+		struct thermal_cooling_device *cdev = instance->cdev;
+
+		if (instance->trip != trip_max_desired_temperature)
+			continue;
+
+		if (!cdev_is_power_actor(cdev))
+			continue;
+
+		if (cdev->ops->get_requested_power(cdev, tz, &req_power[i]))
+			continue;
+
+		if (!total_weight)
+			weight = 1 << FRAC_BITS;
+		else
+			weight = instance->weight;
+
+		req_power[i] = frac_to_int(weight * req_power[i]);
+
+		if (power_actor_get_max_power(cdev, tz, &max_power[i]))
+			continue;
+
+		total_req_power += req_power[i];
+		max_allocatable_power += max_power[i];
+
+		i++;
+	}
+
+	power_range = pid_controller(tz, current_temp, control_temp,
+				     max_allocatable_power);
+
+	divvy_up_power(req_power, max_power, num_actors, total_req_power,
+		       power_range, granted_power, extra_actor_power);
+
+	i = 0;
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if (instance->trip != trip_max_desired_temperature)
+			continue;
+
+		if (!cdev_is_power_actor(instance->cdev))
+			continue;
+
+		power_actor_set_power(instance->cdev, instance,
+				      granted_power[i]);
+
+		i++;
+	}
+
+	devm_kfree(&tz->device, req_power);
+unlock:
+	mutex_unlock(&tz->lock);
+
+	return ret;
+}
+
+static int get_governor_trips(struct thermal_zone_device *tz,
+			      struct power_allocator_params *params)
+{
+	int i, ret, last_passive;
+	bool found_first_passive;
+
+	found_first_passive = false;
+	last_passive = -1;
+	ret = -EINVAL;
+
+	for (i = 0; i < tz->trips; i++) {
+		enum thermal_trip_type type;
+
+		ret = tz->ops->get_trip_type(tz, i, &type);
+		if (ret)
+			return ret;
+
+		if (!found_first_passive) {
+			if (type == THERMAL_TRIP_PASSIVE) {
+				params->trip_switch_on = i;
+				found_first_passive = true;
+			}
+		} else if (type == THERMAL_TRIP_PASSIVE) {
+			last_passive = i;
+		} else {
+			break;
+		}
+	}
+
+	if (last_passive != -1) {
+		params->trip_max_desired_temperature = last_passive;
+		ret = 0;
+	} else {
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static void reset_pid_controller(struct power_allocator_params *params)
+{
+	params->err_integral = 0;
+	params->prev_err = 0;
+}
+
+static void allow_maximum_power(struct thermal_zone_device *tz)
+{
+	struct thermal_instance *instance;
+	struct power_allocator_params *params = tz->governor_data;
+
+	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+		if ((instance->trip != params->trip_max_desired_temperature) ||
+		    (!cdev_is_power_actor(instance->cdev)))
+			continue;
+
+		instance->target = 0;
+		instance->cdev->updated = false;
+		thermal_cdev_update(instance->cdev);
+	}
+}
+
+/**
+ * power_allocator_bind() - bind the power_allocator governor to a thermal zone
+ * @tz:	thermal zone to bind it to
+ *
+ * Check that the thermal zone is valid for this governor, that is, it
+ * has two thermal trips.  If so, initialize the PID controller
+ * parameters and bind it to the thermal zone.
+ *
+ * Return: 0 on success, -EINVAL if the trips were invalid or -ENOMEM
+ * if we ran out of memory.
+ */
+static int power_allocator_bind(struct thermal_zone_device *tz)
+{
+	int ret;
+	struct power_allocator_params *params;
+	unsigned long switch_on_temp, control_temp;
+	u32 temperature_threshold;
+
+	if (!tz->tzp || !tz->tzp->sustainable_power) {
+		dev_err(&tz->device,
+			"power_allocator: missing sustainable_power\n");
+		return -EINVAL;
+	}
+
+	params = devm_kzalloc(&tz->device, sizeof(*params), GFP_KERNEL);
+	if (!params)
+		return -ENOMEM;
+
+	ret = get_governor_trips(tz, params);
+	if (ret) {
+		dev_err(&tz->device,
+			"thermal zone %s has wrong trip setup for power allocator\n",
+			tz->type);
+		goto free;
+	}
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
+				     &switch_on_temp);
+	if (ret)
+		goto free;
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature,
+				     &control_temp);
+	if (ret)
+		goto free;
+
+	temperature_threshold = control_temp - switch_on_temp;
+
+	tz->tzp->k_po = tz->tzp->k_po ?:
+		int_to_frac(tz->tzp->sustainable_power) / temperature_threshold;
+	tz->tzp->k_pu = tz->tzp->k_pu ?:
+		int_to_frac(2 * tz->tzp->sustainable_power) /
+		temperature_threshold;
+	tz->tzp->k_i = tz->tzp->k_i ?: int_to_frac(10) / 1000;
+	/*
+	 * The default for k_d and integral_cutoff is 0, so we can
+	 * leave them as they are.
+	 */
+
+	reset_pid_controller(params);
+
+	tz->governor_data = params;
+
+	return 0;
+
+free:
+	devm_kfree(&tz->device, params);
+	return ret;
+}
+
+static void power_allocator_unbind(struct thermal_zone_device *tz)
+{
+	dev_dbg(&tz->device, "Unbinding from thermal zone %d\n", tz->id);
+	devm_kfree(&tz->device, tz->governor_data);
+	tz->governor_data = NULL;
+}
+
+static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
+{
+	int ret;
+	unsigned long switch_on_temp, control_temp, current_temp;
+	struct power_allocator_params *params = tz->governor_data;
+
+	/*
+	 * We get called for every trip point but we only need to do
+	 * our calculations once
+	 */
+	if (trip != params->trip_max_desired_temperature)
+		return 0;
+
+	ret = thermal_zone_get_temp(tz, &current_temp);
+	if (ret) {
+		dev_warn(&tz->device, "Failed to get temperature: %d\n", ret);
+		return ret;
+	}
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
+				     &switch_on_temp);
+	if (ret) {
+		dev_warn(&tz->device,
+			 "Failed to get switch on temperature: %d\n", ret);
+		return ret;
+	}
+
+	if (current_temp < switch_on_temp) {
+		tz->passive = 0;
+		reset_pid_controller(params);
+		allow_maximum_power(tz);
+		return 0;
+	}
+
+	tz->passive = 1;
+
+	ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature,
+				&control_temp);
+	if (ret) {
+		dev_warn(&tz->device,
+			 "Failed to get the maximum desired temperature: %d\n",
+			 ret);
+		return ret;
+	}
+
+	return allocate_power(tz, current_temp, control_temp);
+}
+
+static struct thermal_governor thermal_gov_power_allocator = {
+	.name		= "power_allocator",
+	.bind_to_tz	= power_allocator_bind,
+	.unbind_from_tz	= power_allocator_unbind,
+	.throttle	= power_allocator_throttle,
+};
+
+int thermal_gov_power_allocator_register(void)
+{
+	return thermal_register_governor(&thermal_gov_power_allocator);
+}
+
+void thermal_gov_power_allocator_unregister(void)
+{
+	thermal_unregister_governor(&thermal_gov_power_allocator);
+}
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 263628b0e862..b389bc2ec0fa 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1616,7 +1616,7 @@ static void remove_trip_attrs(struct thermal_zone_device *tz)
 struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	int trips, int mask, void *devdata,
 	struct thermal_zone_device_ops *ops,
-	const struct thermal_zone_params *tzp,
+	struct thermal_zone_params *tzp,
 	int passive_delay, int polling_delay)
 {
 	struct thermal_zone_device *tz;
@@ -1968,7 +1968,11 @@ static int __init thermal_register_governors(void)
 	if (result)
 		return result;
 
-	return thermal_gov_user_space_register();
+	result = thermal_gov_user_space_register();
+	if (result)
+		return result;
+
+	return thermal_gov_power_allocator_register();
 }
 
 static void thermal_unregister_governors(void)
@@ -1977,6 +1981,7 @@ static void thermal_unregister_governors(void)
 	thermal_gov_fair_share_unregister();
 	thermal_gov_bang_bang_unregister();
 	thermal_gov_user_space_unregister();
+	thermal_gov_power_allocator_unregister();
 }
 
 static int __init thermal_init(void)
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index faebe881f062..8a6624488cc5 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -88,6 +88,14 @@ static inline int thermal_gov_user_space_register(void) { return 0; }
 static inline void thermal_gov_user_space_unregister(void) {}
 #endif /* CONFIG_THERMAL_GOV_USER_SPACE */
 
+#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
+int thermal_gov_power_allocator_register(void);
+void thermal_gov_power_allocator_unregister(void);
+#else
+static inline int thermal_gov_power_allocator_register(void) { return 0; }
+static inline void thermal_gov_power_allocator_unregister(void) {}
+#endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
+
 /* device tree support */
 #ifdef CONFIG_THERMAL_OF
 int of_parse_thermal_zones(void);
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index bf3c55f405c2..6bbe11c97cea 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -59,6 +59,8 @@
 #define DEFAULT_THERMAL_GOVERNOR       "fair_share"
 #elif defined(CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE)
 #define DEFAULT_THERMAL_GOVERNOR       "user_space"
+#elif defined(CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR)
+#define DEFAULT_THERMAL_GOVERNOR       "power_allocator"
 #endif
 
 struct thermal_zone_device;
@@ -154,8 +156,7 @@ struct thermal_attr {
  * @devdata:	private pointer for device private data
  * @trips:	number of trip points the thermal zone supports
  * @passive_delay:	number of milliseconds to wait between polls when
- *			performing passive cooling.  Currenty only used by the
- *			step-wise governor
+ *			performing passive cooling.
  * @polling_delay:	number of milliseconds to wait between polls when
  *			checking whether trip points have been crossed (0 for
  *			interrupt driven systems)
@@ -165,7 +166,6 @@ struct thermal_attr {
  * @last_temperature:	previous temperature read
  * @emul_temperature:	emulated temperature when using CONFIG_THERMAL_EMULATION
  * @passive:		1 if you've crossed a passive trip point, 0 otherwise.
- *			Currenty only used by the step-wise governor.
  * @forced_passive:	If > 0, temperature at which to switch on all ACPI
  *			processor cooling devices.  Currently only used by the
  *			step-wise governor.
@@ -197,7 +197,7 @@ struct thermal_zone_device {
 	int passive;
 	unsigned int forced_passive;
 	struct thermal_zone_device_ops *ops;
-	const struct thermal_zone_params *tzp;
+	struct thermal_zone_params *tzp;
 	struct thermal_governor *governor;
 	void *governor_data;
 	struct list_head thermal_instances;
@@ -275,6 +275,33 @@ struct thermal_zone_params {
 
 	int num_tbps;	/* Number of tbp entries */
 	struct thermal_bind_params *tbp;
+
+	/*
+	 * Sustainable power (heat) that this thermal zone can dissipate in
+	 * mW
+	 */
+	u32 sustainable_power;
+
+	/*
+	 * Proportional parameter of the PID controller when
+	 * overshooting (i.e., when temperature is below the target)
+	 */
+	s32 k_po;
+
+	/*
+	 * Proportional parameter of the PID controller when
+	 * undershooting
+	 */
+	s32 k_pu;
+
+	/* Integral parameter of the PID controller */
+	s32 k_i;
+
+	/* Derivative parameter of the PID controller */
+	s32 k_d;
+
+	/* threshold below which the error is no longer accumulated */
+	s32 integral_cutoff;
 };
 
 struct thermal_genl_event {
@@ -350,7 +377,7 @@ int power_actor_set_power(struct thermal_cooling_device *,
 			  struct thermal_instance *, u32);
 struct thermal_zone_device *thermal_zone_device_register(const char *, int, int,
 		void *, struct thermal_zone_device_ops *,
-		const struct thermal_zone_params *, int, int);
+		struct thermal_zone_params *, int, int);
 void thermal_zone_device_unregister(struct thermal_zone_device *);
 
 int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
-- 
cgit v1.2.3


From f31105347cc56c13d552b844ada04418769d875d Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 28 Apr 2015 12:17:50 +0200
Subject: irqchip: irqc: Remove platform data support

As of commit 914d7d148411997c ("ARM: shmobile: r8a73a4: Remove legacy
code"), the Renesas R-Mobile/R-Car interrupt controller is used with DT
only, and interrupt numbers are thus always assigned automatically.

Drop the platform data declaration and all related support code.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1430216270-31929-1-git-send-email-geert%2Brenesas@glider.be
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/irq-renesas-irqc.c             | 17 +---------------
 include/linux/platform_data/irq-renesas-irqc.h | 27 --------------------------
 2 files changed, 1 insertion(+), 43 deletions(-)
 delete mode 100644 include/linux/platform_data/irq-renesas-irqc.h

(limited to 'include/linux')

diff --git a/drivers/irqchip/irq-renesas-irqc.c b/drivers/irqchip/irq-renesas-irqc.c
index df5bf21a701f..778bd076aeea 100644
--- a/drivers/irqchip/irq-renesas-irqc.c
+++ b/drivers/irqchip/irq-renesas-irqc.c
@@ -29,7 +29,6 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/module.h>
-#include <linux/platform_data/irq-renesas-irqc.h>
 #include <linux/pm_runtime.h>
 
 #define IRQC_IRQ_MAX	32	/* maximum 32 interrupts per driver instance */
@@ -62,7 +61,6 @@ struct irqc_priv {
 	void __iomem *iomem;
 	void __iomem *cpu_int_base;
 	struct irqc_irq irq[IRQC_IRQ_MAX];
-	struct renesas_irqc_config config;
 	unsigned int number_of_irqs;
 	struct platform_device *pdev;
 	struct irq_chip irq_chip;
@@ -175,7 +173,6 @@ static const struct irq_domain_ops irqc_irq_domain_ops = {
 
 static int irqc_probe(struct platform_device *pdev)
 {
-	struct renesas_irqc_config *pdata = pdev->dev.platform_data;
 	struct irqc_priv *p;
 	struct resource *io;
 	struct resource *irq;
@@ -191,10 +188,6 @@ static int irqc_probe(struct platform_device *pdev)
 		goto err0;
 	}
 
-	/* deal with driver instance configuration */
-	if (pdata)
-		memcpy(&p->config, pdata, sizeof(*pdata));
-
 	p->pdev = pdev;
 	platform_set_drvdata(pdev, p);
 
@@ -251,8 +244,7 @@ static int irqc_probe(struct platform_device *pdev)
 	irq_chip->flags	= IRQCHIP_MASK_ON_SUSPEND;
 
 	p->irq_domain = irq_domain_add_simple(pdev->dev.of_node,
-					      p->number_of_irqs,
-					      p->config.irq_base,
+					      p->number_of_irqs, 0,
 					      &irqc_irq_domain_ops, p);
 	if (!p->irq_domain) {
 		ret = -ENXIO;
@@ -272,13 +264,6 @@ static int irqc_probe(struct platform_device *pdev)
 
 	dev_info(&pdev->dev, "driving %d irqs\n", p->number_of_irqs);
 
-	/* warn in case of mismatch if irq base is specified */
-	if (p->config.irq_base) {
-		if (p->config.irq_base != p->irq[0].domain_irq)
-			dev_warn(&pdev->dev, "irq base mismatch (%d/%d)\n",
-				 p->config.irq_base, p->irq[0].domain_irq);
-	}
-
 	return 0;
 err3:
 	while (--k >= 0)
diff --git a/include/linux/platform_data/irq-renesas-irqc.h b/include/linux/platform_data/irq-renesas-irqc.h
deleted file mode 100644
index 3ae17b3e00ed..000000000000
--- a/include/linux/platform_data/irq-renesas-irqc.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Renesas IRQC Driver
- *
- *  Copyright (C) 2013 Magnus Damm
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#ifndef __IRQ_RENESAS_IRQC_H__
-#define __IRQ_RENESAS_IRQC_H__
-
-struct renesas_irqc_config {
-	unsigned int irq_base;
-};
-
-#endif /* __IRQ_RENESAS_IRQC_H__ */
-- 
cgit v1.2.3


From 406c057c4e00744453d5b0731eb23629ec14dcdf Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Mon, 4 May 2015 21:54:20 -0400
Subject: libata: READ LOG DMA EXT support can be in either page 119 or 120

Support for the READ/WRITE LOG DMA EXT commands can be signaled either
in page 119 or page 120. We should check both pages.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ata.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ata.h b/include/linux/ata.h
index b666b773e111..fed36418dd1c 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -704,9 +704,19 @@ static inline bool ata_id_wcache_enabled(const u16 *id)
 
 static inline bool ata_id_has_read_log_dma_ext(const u16 *id)
 {
+	/* Word 86 must have bit 15 set */
 	if (!(id[ATA_ID_CFS_ENABLE_2] & (1 << 15)))
 		return false;
-	return id[ATA_ID_COMMAND_SET_3] & (1 << 3);
+
+	/* READ LOG DMA EXT support can be signaled either from word 119
+	 * or from word 120. The format is the same for both words: Bit
+	 * 15 must be cleared, bit 14 set and bit 3 set.
+	 */
+	if ((id[ATA_ID_COMMAND_SET_3] & 0xC008) == 0x4008 ||
+	    (id[ATA_ID_COMMAND_SET_4] & 0xC008) == 0x4008)
+		return true;
+
+	return false;
 }
 
 static inline bool ata_id_has_sense_reporting(const u16 *id)
-- 
cgit v1.2.3


From 5d3abf8ff67f49271a42c0f7fa4f20f9e046bf0e Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Mon, 4 May 2015 21:54:21 -0400
Subject: libata: Fall back to unqueued READ LOG EXT if the DMA variant fails

Some devices advertise support for the READ/WRITE LOG DMA EXT commands
but fail when we try to issue them. This can lead to queued TRIM being
unintentionally disabled since the relevant feature flag is located in a
general purpose log page.

Fall back to unqueued READ LOG EXT if the DMA variant fails while
reading a log page.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-eh.c | 12 +++++++++++-
 include/linux/libata.h  |  1 +
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 07f41be38fbe..2893563d0537 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -1507,13 +1507,17 @@ unsigned int ata_read_log_page(struct ata_device *dev, u8 log,
 {
 	struct ata_taskfile tf;
 	unsigned int err_mask;
+	bool dma = false;
 
 	DPRINTK("read log page - log 0x%x, page 0x%x\n", log, page);
 
+retry:
 	ata_tf_init(dev, &tf);
-	if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id)) {
+	if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id) &&
+	    !(dev->horkage & ATA_HORKAGE_NO_NCQ_LOG)) {
 		tf.command = ATA_CMD_READ_LOG_DMA_EXT;
 		tf.protocol = ATA_PROT_DMA;
+		dma = true;
 	} else {
 		tf.command = ATA_CMD_READ_LOG_EXT;
 		tf.protocol = ATA_PROT_PIO;
@@ -1527,6 +1531,12 @@ unsigned int ata_read_log_page(struct ata_device *dev, u8 log,
 	err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE,
 				     buf, sectors * ATA_SECT_SIZE, 0);
 
+	if (err_mask && dma) {
+		dev->horkage |= ATA_HORKAGE_NO_NCQ_LOG;
+		ata_dev_warn(dev, "READ LOG DMA EXT failed, trying unqueued\n");
+		goto retry;
+	}
+
 	DPRINTK("EXIT, err_mask=%x\n", err_mask);
 	return err_mask;
 }
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 8dad4a307bb8..c3ef58014b33 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -424,6 +424,7 @@ enum {
 	ATA_HORKAGE_NOLPM	= (1 << 20),	/* don't use LPM */
 	ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21),	/* some WDs have broken LPM */
 	ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */
+	ATA_HORKAGE_NO_NCQ_LOG	= (1 << 23),	/* don't use NCQ for log read */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3


From c4cf5261f8bffd9de132b50660a69148e7575bd6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 17 Apr 2015 16:15:18 -0600
Subject: bio: skip atomic inc/dec of ->bi_remaining for non-chains

Struct bio has an atomic ref count for chained bio's, and we use this
to know when to end IO on the bio. However, most bio's are not chained,
so we don't need to always introduce this atomic operation as part of
ending IO.

Add a helper to elevate the bi_remaining count, and flag the bio as
now actually needing the decrement at end_io time. Rename the field
to __bi_remaining to catch any current users of this doing the
incrementing manually.

For high IOPS workloads, this reduces the overhead of bio_endio()
substantially.

Tested-by: Robert Elliott <elliott@hp.com>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c                  | 38 +++++++++++++++++++++++++++++---------
 drivers/md/dm-cache-target.c |  2 +-
 drivers/md/dm-raid1.c        |  2 +-
 drivers/md/dm-snap.c         |  2 +-
 drivers/md/dm-thin.c         |  4 ++--
 include/linux/bio.h          | 11 +++++++++++
 include/linux/blk_types.h    |  3 ++-
 7 files changed, 47 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index f66a4eae16ee..117da319afb6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,7 +270,7 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	atomic_set(&bio->bi_remaining, 1);
+	atomic_set(&bio->__bi_remaining, 1);
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
 	__bio_free(bio);
 
 	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags|(1 << BIO_UPTODATE);
-	atomic_set(&bio->bi_remaining, 1);
+	bio->bi_flags = flags | (1 << BIO_UPTODATE);
+	atomic_set(&bio->__bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);
 
@@ -320,7 +320,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
 
 	bio->bi_private = parent;
 	bio->bi_end_io	= bio_chain_endio;
-	atomic_inc(&parent->bi_remaining);
+	bio_inc_remaining(parent);
 }
 EXPORT_SYMBOL(bio_chain);
 
@@ -1741,6 +1741,23 @@ void bio_flush_dcache_pages(struct bio *bi)
 EXPORT_SYMBOL(bio_flush_dcache_pages);
 #endif
 
+static inline bool bio_remaining_done(struct bio *bio)
+{
+	/*
+	 * If we're not chaining, then ->__bi_remaining is always 1 and
+	 * we always end io on the first invocation.
+	 */
+	if (!bio_flagged(bio, BIO_CHAIN))
+		return true;
+
+	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
+
+	if (atomic_dec_and_test(&bio->__bi_remaining))
+		return true;
+
+	return false;
+}
+
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
@@ -1758,15 +1775,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
 void bio_endio(struct bio *bio, int error)
 {
 	while (bio) {
-		BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
-
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 			error = -EIO;
 
-		if (!atomic_dec_and_test(&bio->bi_remaining))
-			return;
+		if (unlikely(!bio_remaining_done(bio)))
+			break;
 
 		/*
 		 * Need to have a real endio function for chained bios,
@@ -1799,7 +1814,12 @@ EXPORT_SYMBOL(bio_endio);
  **/
 void bio_endio_nodec(struct bio *bio, int error)
 {
-	atomic_inc(&bio->bi_remaining);
+	/*
+	 * If it's not flagged as a chain, we are not going to dec the count
+	 */
+	if (bio_flagged(bio, BIO_CHAIN))
+		bio_inc_remaining(bio);
+
 	bio_endio(bio, error);
 }
 EXPORT_SYMBOL(bio_endio_nodec);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7755af351867..705eb7b99d69 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -91,7 +91,7 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 	 * Must bump bi_remaining to allow bio to complete with
 	 * restored bi_end_io.
 	 */
-	atomic_inc(&bio->bi_remaining);
+	bio_inc_remaining(bio);
 }
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 089d62751f7f..d6a1c096b777 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1254,7 +1254,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 			dm_bio_restore(bd, bio);
 			bio_record->details.bi_bdev = NULL;
 
-			atomic_inc(&bio->bi_remaining);
+			bio_inc_remaining(bio);
 
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f83a0f3fc365..8bfeae218531 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1478,7 +1478,7 @@ out:
 	if (full_bio) {
 		full_bio->bi_end_io = pe->full_bio_end_io;
 		full_bio->bi_private = pe->full_bio_private;
-		atomic_inc(&full_bio->bi_remaining);
+		bio_inc_remaining(full_bio);
 	}
 	increment_pending_exceptions_done_count();
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 921aafd12aee..342dbdad6131 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -795,7 +795,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
 	if (m->bio) {
 		m->bio->bi_end_io = m->saved_bi_end_io;
-		atomic_inc(&m->bio->bi_remaining);
+		bio_inc_remaining(m->bio);
 	}
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
@@ -812,7 +812,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	bio = m->bio;
 	if (bio) {
 		bio->bi_end_io = m->saved_bi_end_io;
-		atomic_inc(&bio->bi_remaining);
+		bio_inc_remaining(bio);
 	}
 
 	if (m->err) {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index da3a127c9958..8bfe9eee6d1a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -644,6 +644,17 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 	return bio;
 }
 
+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
 /*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a1b25e35ea5f..8b07e0603887 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -65,7 +65,7 @@ struct bio {
 	unsigned int		bi_seg_front_size;
 	unsigned int		bi_seg_back_size;
 
-	atomic_t		bi_remaining;
+	atomic_t		__bi_remaining;
 
 	bio_end_io_t		*bi_end_io;
 
@@ -122,6 +122,7 @@ struct bio {
 #define BIO_NULL_MAPPED 8	/* contains invalid user pages */
 #define BIO_QUIET	9	/* Make BIO Quiet */
 #define BIO_SNAP_STABLE	10	/* bio data must be snapshotted during write */
+#define BIO_CHAIN	11	/* chained bio, ->bi_remaining in effect */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
-- 
cgit v1.2.3


From dac56212e8127dbc0bff7be35c508bc280213309 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 17 Apr 2015 16:23:59 -0600
Subject: bio: skip atomic inc/dec of ->bi_cnt for most use cases

Struct bio has a reference count that controls when it can be freed.
Most uses cases is allocating the bio, which then returns with a
single reference to it, doing IO, and then dropping that single
reference. We can remove this atomic_dec_and_test() in the completion
path, if nobody else is holding a reference to the bio.

If someone does call bio_get() on the bio, then we flag the bio as
now having valid count and that we must properly honor the reference
count when it's being put.

Tested-by: Robert Elliott <elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c                 | 18 +++++++++++-------
 drivers/md/bcache/request.c |  2 +-
 fs/btrfs/volumes.c          |  2 +-
 fs/xfs/xfs_aops.c           |  1 -
 include/linux/bio.h         | 16 +++++++++++++++-
 include/linux/blk_types.h   |  3 ++-
 6 files changed, 30 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 117da319afb6..c2ff8a88aef1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -271,7 +271,7 @@ void bio_init(struct bio *bio)
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
 	atomic_set(&bio->__bi_remaining, 1);
-	atomic_set(&bio->bi_cnt, 1);
+	atomic_set(&bio->__bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
 
@@ -524,13 +524,17 @@ EXPORT_SYMBOL(zero_fill_bio);
  **/
 void bio_put(struct bio *bio)
 {
-	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
-	/*
-	 * last put frees it
-	 */
-	if (atomic_dec_and_test(&bio->bi_cnt))
+	if (!bio_flagged(bio, BIO_REFFED))
 		bio_free(bio);
+	else {
+		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+
+		/*
+		 * last put frees it
+		 */
+		if (atomic_dec_and_test(&bio->__bi_cnt))
+			bio_free(bio);
+	}
 }
 EXPORT_SYMBOL(bio_put);
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab43faddb447..1616f668a4cb 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -619,7 +619,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio)
 	bio->bi_end_io		= request_endio;
 	bio->bi_private		= &s->cl;
 
-	atomic_set(&bio->bi_cnt, 3);
+	bio_cnt_set(bio, 3);
 }
 
 static void search_free(struct closure *cl)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 96aebf3bcd5b..8e8d1d1e28a5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -345,7 +345,7 @@ loop_lock:
 		    waitqueue_active(&fs_info->async_submit_wait))
 			wake_up(&fs_info->async_submit_wait);
 
-		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+		BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
 
 		/*
 		 * if we're doing the sync list, record that our
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a56960dd1684..095f94c2d8b5 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -356,7 +356,6 @@ xfs_end_bio(
 {
 	xfs_ioend_t		*ioend = bio->bi_private;
 
-	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
 	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 
 	/* Toss bio and pass work off to an xfsdatad thread */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8bfe9eee6d1a..7486ea103f6e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -290,7 +290,21 @@ static inline unsigned bio_segments(struct bio *bio)
  * returns. and then bio would be freed memory when if (bio->bi_flags ...)
  * runs
  */
-#define bio_get(bio)	atomic_inc(&(bio)->bi_cnt)
+static inline void bio_get(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_REFFED);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_cnt);
+}
+
+static inline void bio_cnt_set(struct bio *bio, unsigned int count)
+{
+	if (count != 1) {
+		bio->bi_flags |= (1 << BIO_REFFED);
+		smp_mb__before_atomic();
+	}
+	atomic_set(&bio->__bi_cnt, count);
+}
 
 enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8b07e0603887..93d2e7153816 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -92,7 +92,7 @@ struct bio {
 
 	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */
 
-	atomic_t		bi_cnt;		/* pin count */
+	atomic_t		__bi_cnt;	/* pin count */
 
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
 
@@ -123,6 +123,7 @@ struct bio {
 #define BIO_QUIET	9	/* Make BIO Quiet */
 #define BIO_SNAP_STABLE	10	/* bio data must be snapshotted during write */
 #define BIO_CHAIN	11	/* chained bio, ->bi_remaining in effect */
+#define BIO_REFFED	12	/* bio has elevated ->bi_cnt */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
-- 
cgit v1.2.3


From 84be456f883c4685680fba8e5154b5f72e92957e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 1 May 2015 12:46:15 +0200
Subject: remove <asm/scatterlist.h>

We don't have any arch specific scatterlist now that parisc switched over
to the generic one.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 arch/alpha/include/asm/pci.h          |  2 +-
 arch/arm/include/asm/dma.h            |  2 +-
 arch/arm/mach-footbridge/dma.c        |  2 +-
 arch/blackfin/include/asm/pci.h       |  2 +-
 arch/cris/include/asm/dma-mapping.h   |  2 +-
 arch/cris/include/asm/pci.h           |  2 +-
 arch/frv/include/asm/dma-mapping.h    |  2 +-
 arch/frv/include/asm/pci.h            |  2 +-
 arch/ia64/include/asm/pci.h           |  2 +-
 arch/microblaze/include/asm/pci.h     |  2 +-
 arch/mips/include/asm/dma-mapping.h   |  2 +-
 arch/mips/include/asm/pci.h           |  2 +-
 arch/mn10300/include/asm/pci.h        |  2 +-
 arch/parisc/include/asm/dma-mapping.h |  2 +-
 arch/parisc/include/asm/pci.h         |  2 +-
 arch/powerpc/include/asm/pci.h        |  2 +-
 arch/powerpc/include/asm/vio.h        |  2 +-
 arch/sparc/kernel/iommu_common.h      |  2 +-
 arch/x86/include/asm/pci.h            |  2 +-
 arch/xtensa/include/asm/pci.h         |  2 +-
 drivers/mmc/host/android-goldfish.c   |  2 +-
 include/asm-generic/scatterlist.h     | 34 ------------------------------
 include/linux/blkdev.h                |  3 +--
 include/linux/dmapool.h               |  2 +-
 include/linux/scatterlist.h           | 39 ++++++++++++++++++++++++++++-------
 lib/swiotlb.c                         |  2 +-
 26 files changed, 56 insertions(+), 66 deletions(-)
 delete mode 100644 include/asm-generic/scatterlist.h

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h
index f7f680f7457d..bf7d97dce9e8 100644
--- a/arch/alpha/include/asm/pci.h
+++ b/arch/alpha/include/asm/pci.h
@@ -5,7 +5,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/dma-mapping.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/machvec.h>
 #include <asm-generic/pci-bridge.h>
 
diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h
index 99084431d6ae..bb4fa67da541 100644
--- a/arch/arm/include/asm/dma.h
+++ b/arch/arm/include/asm/dma.h
@@ -19,7 +19,7 @@
  * It should not be re-used except for that purpose.
  */
 #include <linux/spinlock.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 #include <mach/isa-dma.h>
 
diff --git a/arch/arm/mach-footbridge/dma.c b/arch/arm/mach-footbridge/dma.c
index e2e0df8bcee2..22536b85a81d 100644
--- a/arch/arm/mach-footbridge/dma.c
+++ b/arch/arm/mach-footbridge/dma.c
@@ -13,9 +13,9 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/spinlock.h>
+#include <linux/scatterlist.h>
 
 #include <asm/dma.h>
-#include <asm/scatterlist.h>
 
 #include <asm/mach/dma.h>
 #include <asm/hardware/dec21285.h>
diff --git a/arch/blackfin/include/asm/pci.h b/arch/blackfin/include/asm/pci.h
index c737909fba47..14efc0db1ade 100644
--- a/arch/blackfin/include/asm/pci.h
+++ b/arch/blackfin/include/asm/pci.h
@@ -3,7 +3,7 @@
 #ifndef _ASM_BFIN_PCI_H
 #define _ASM_BFIN_PCI_H
 
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm-generic/pci-dma-compat.h>
 #include <asm-generic/pci.h>
 
diff --git a/arch/cris/include/asm/dma-mapping.h b/arch/cris/include/asm/dma-mapping.h
index 2f0f654f1b44..57f794ee6039 100644
--- a/arch/cris/include/asm/dma-mapping.h
+++ b/arch/cris/include/asm/dma-mapping.h
@@ -5,10 +5,10 @@
 
 #include <linux/mm.h>
 #include <linux/kernel.h>
+#include <linux/scatterlist.h>
 
 #include <asm/cache.h>
 #include <asm/io.h>
-#include <asm/scatterlist.h>
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
diff --git a/arch/cris/include/asm/pci.h b/arch/cris/include/asm/pci.h
index cc2399c175e9..c15b4b4baafa 100644
--- a/arch/cris/include/asm/pci.h
+++ b/arch/cris/include/asm/pci.h
@@ -29,7 +29,7 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
 
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <asm/io.h>
 
diff --git a/arch/frv/include/asm/dma-mapping.h b/arch/frv/include/asm/dma-mapping.h
index 1746a2b8e6e7..2840adcd6d92 100644
--- a/arch/frv/include/asm/dma-mapping.h
+++ b/arch/frv/include/asm/dma-mapping.h
@@ -2,9 +2,9 @@
 #define _ASM_DMA_MAPPING_H
 
 #include <linux/device.h>
+#include <linux/scatterlist.h>
 #include <asm/cache.h>
 #include <asm/cacheflush.h>
-#include <asm/scatterlist.h>
 #include <asm/io.h>
 
 /*
diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h
index 2035a4d3f9b9..43d224685144 100644
--- a/arch/frv/include/asm/pci.h
+++ b/arch/frv/include/asm/pci.h
@@ -14,7 +14,7 @@
 #define _ASM_FRV_PCI_H
 
 #include <linux/mm.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm-generic/pci-dma-compat.h>
 #include <asm-generic/pci.h>
 
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index 52af5ed9f60b..0c0e25d6427a 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -6,9 +6,9 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/scatterlist.h>
 
 #include <asm/io.h>
-#include <asm/scatterlist.h>
 #include <asm/hw_irq.h>
 
 struct pci_vector_struct {
diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index 468aca8cec0d..81f27aef979c 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -16,8 +16,8 @@
 #include <linux/string.h>
 #include <linux/dma-mapping.h>
 #include <linux/pci.h>
+#include <linux/scatterlist.h>
 
-#include <asm/scatterlist.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
diff --git a/arch/mips/include/asm/dma-mapping.h b/arch/mips/include/asm/dma-mapping.h
index fd1b4a150759..360b3387182a 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_DMA_MAPPING_H
 #define _ASM_DMA_MAPPING_H
 
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/dma-coherence.h>
 #include <asm/cache.h>
 #include <asm-generic/dma-coherent.h>
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index d9692993fc83..3413b10d833b 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -99,7 +99,7 @@ static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
 
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <asm/io.h>
 #include <asm-generic/pci-bridge.h>
diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h
index 5f70af25c7d0..393b51d38f36 100644
--- a/arch/mn10300/include/asm/pci.h
+++ b/arch/mn10300/include/asm/pci.h
@@ -55,7 +55,7 @@ void pcibios_set_master(struct pci_dev *dev);
 
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <asm/io.h>
 
diff --git a/arch/parisc/include/asm/dma-mapping.h b/arch/parisc/include/asm/dma-mapping.h
index d0eae5f2bd87..d8d60a57183f 100644
--- a/arch/parisc/include/asm/dma-mapping.h
+++ b/arch/parisc/include/asm/dma-mapping.h
@@ -2,8 +2,8 @@
 #define _PARISC_DMA_MAPPING_H
 
 #include <linux/mm.h>
+#include <linux/scatterlist.h>
 #include <asm/cacheflush.h>
-#include <asm/scatterlist.h>
 
 /* See Documentation/DMA-API-HOWTO.txt */
 struct hppa_dma_ops {
diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h
index 20df2b04fc09..794d8820db5d 100644
--- a/arch/parisc/include/asm/pci.h
+++ b/arch/parisc/include/asm/pci.h
@@ -1,7 +1,7 @@
 #ifndef __ASM_PARISC_PCI_H
 #define __ASM_PARISC_PCI_H
 
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 4aef8d660999..3be43ab63181 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -13,9 +13,9 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
 
 #include <asm/machdep.h>
-#include <asm/scatterlist.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h
index 4f9b7ca0710f..84286ec77b12 100644
--- a/arch/powerpc/include/asm/vio.h
+++ b/arch/powerpc/include/asm/vio.h
@@ -19,9 +19,9 @@
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
 #include <linux/mod_devicetable.h>
+#include <linux/scatterlist.h>
 
 #include <asm/hvcall.h>
-#include <asm/scatterlist.h>
 
 /*
  * Architecture-specific constants for drivers to
diff --git a/arch/sparc/kernel/iommu_common.h b/arch/sparc/kernel/iommu_common.h
index f4be0d724fc6..b40cec252905 100644
--- a/arch/sparc/kernel/iommu_common.h
+++ b/arch/sparc/kernel/iommu_common.h
@@ -13,9 +13,9 @@
 #include <linux/scatterlist.h>
 #include <linux/device.h>
 #include <linux/iommu-helper.h>
+#include <linux/scatterlist.h>
 
 #include <asm/iommu.h>
-#include <asm/scatterlist.h>
 
 /*
  * These give mapping size of each iommu pte/tlb.
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 4e370a5d8117..81da003df21b 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -5,7 +5,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <asm/io.h>
 #include <asm/x86_init.h>
 
diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h
index 5d52dc43dfe7..e438a00fbd63 100644
--- a/arch/xtensa/include/asm/pci.h
+++ b/arch/xtensa/include/asm/pci.h
@@ -33,7 +33,7 @@ extern struct pci_controller* pcibios_alloc_controller(void);
 
 #include <linux/types.h>
 #include <linux/slab.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 #include <linux/string.h>
 #include <asm/io.h>
 
diff --git a/drivers/mmc/host/android-goldfish.c b/drivers/mmc/host/android-goldfish.c
index 8b4e20a3f16c..b1eac719a4cc 100644
--- a/drivers/mmc/host/android-goldfish.c
+++ b/drivers/mmc/host/android-goldfish.c
@@ -42,10 +42,10 @@
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 #include <linux/clk.h>
+#include <linux/scatterlist.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
-#include <asm/scatterlist.h>
 
 #include <asm/types.h>
 #include <asm/io.h>
diff --git a/include/asm-generic/scatterlist.h b/include/asm-generic/scatterlist.h
deleted file mode 100644
index 5de07355fad4..000000000000
--- a/include/asm-generic/scatterlist.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __ASM_GENERIC_SCATTERLIST_H
-#define __ASM_GENERIC_SCATTERLIST_H
-
-#include <linux/types.h>
-
-struct scatterlist {
-#ifdef CONFIG_DEBUG_SG
-	unsigned long	sg_magic;
-#endif
-	unsigned long	page_link;
-	unsigned int	offset;
-	unsigned int	length;
-	dma_addr_t	dma_address;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-	unsigned int	dma_length;
-#endif
-};
-
-/*
- * These macros should be used after a dma_map_sg call has been done
- * to get bus addresses of each of the SG entries and their lengths.
- * You should only work with the number of sg entries pci_map_sg
- * returns, or alternatively stop on the first sg_dma_len(sg) which
- * is 0.
- */
-#define sg_dma_address(sg)	((sg)->dma_address)
-
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
-#define sg_dma_len(sg)		((sg)->dma_length)
-#else
-#define sg_dma_len(sg)		((sg)->length)
-#endif
-
-#endif /* __ASM_GENERIC_SCATTERLIST_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7f9a516f24de..504af1e65ce1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -22,8 +22,7 @@
 #include <linux/smp.h>
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
-
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
 
 struct module;
 struct scsi_ioctl_command;
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index 52456aa566a0..e1043f79122f 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -11,8 +11,8 @@
 #ifndef LINUX_DMAPOOL_H
 #define	LINUX_DMAPOOL_H
 
+#include <linux/scatterlist.h>
 #include <asm/io.h>
-#include <asm/scatterlist.h>
 
 struct device;
 
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index ed8f9e70df9b..eca1ec93775c 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -2,13 +2,39 @@
 #define _LINUX_SCATTERLIST_H
 
 #include <linux/string.h>
+#include <linux/types.h>
 #include <linux/bug.h>
 #include <linux/mm.h>
-
-#include <asm/types.h>
-#include <asm/scatterlist.h>
 #include <asm/io.h>
 
+struct scatterlist {
+#ifdef CONFIG_DEBUG_SG
+	unsigned long	sg_magic;
+#endif
+	unsigned long	page_link;
+	unsigned int	offset;
+	unsigned int	length;
+	dma_addr_t	dma_address;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+	unsigned int	dma_length;
+#endif
+};
+
+/*
+ * These macros should be used after a dma_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries dma_map_sg
+ * returns, or alternatively stop on the first sg_dma_len(sg) which
+ * is 0.
+ */
+#define sg_dma_address(sg)	((sg)->dma_address)
+
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+#define sg_dma_len(sg)		((sg)->dma_length)
+#else
+#define sg_dma_len(sg)		((sg)->length)
+#endif
+
 struct sg_table {
 	struct scatterlist *sgl;	/* the list */
 	unsigned int nents;		/* number of mapped entries */
@@ -18,10 +44,9 @@ struct sg_table {
 /*
  * Notes on SG table design.
  *
- * Architectures must provide an unsigned long page_link field in the
- * scatterlist struct. We use that to place the page pointer AND encode
- * information about the sg table as well. The two lower bits are reserved
- * for this information.
+ * We use the unsigned long page_link field in the scatterlist struct to place
+ * the page pointer AND encode information about the sg table as well. The two
+ * lower bits are reserved for this information.
  *
  * If bit 0 is set, then the page_link contains a pointer to the next sg
  * table list. Otherwise the next entry is at sg + 1.
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 4abda074ea45..341268841b31 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -29,10 +29,10 @@
 #include <linux/ctype.h>
 #include <linux/highmem.h>
 #include <linux/gfp.h>
+#include <linux/scatterlist.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
-#include <asm/scatterlist.h>
 
 #include <linux/init.h>
 #include <linux/bootmem.h>
-- 
cgit v1.2.3


From 4f8c9510ba71bb54477841bebb90154ef140860f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Apr 2015 22:37:16 +0200
Subject: block: rename REQ_TYPE_SPECIAL to REQ_TYPE_DRV_PRIV

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c        | 2 +-
 drivers/block/paride/pd.c  | 4 ++--
 drivers/block/sx8.c        | 4 ++--
 drivers/block/virtio_blk.c | 6 +++---
 drivers/ide/ide-atapi.c    | 4 ++--
 drivers/ide/ide-cd.c       | 2 +-
 drivers/ide/ide-cd_ioctl.c | 2 +-
 drivers/ide/ide-devsets.c  | 2 +-
 drivers/ide/ide-eh.c       | 2 +-
 drivers/ide/ide-floppy.c   | 6 +++---
 drivers/ide/ide-io.c       | 4 ++--
 drivers/ide/ide-ioctls.c   | 2 +-
 drivers/ide/ide-park.c     | 4 ++--
 drivers/ide/ide-tape.c     | 4 ++--
 include/linux/blkdev.h     | 4 ++--
 15 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 39e5f7fae3ef..9cf52ac328fe 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -592,7 +592,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		fsync_bdev(bdev);
 		mutex_lock(&nbd->tx_lock);
 		blk_rq_init(NULL, &sreq);
-		sreq.cmd_type = REQ_TYPE_SPECIAL;
+		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
 		nbd_cmd(&sreq) = NBD_CMD_DISC;
 
 		/* Check again after getting mutex back.  */
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index d48715b287e6..dbb4da1cdca8 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -442,7 +442,7 @@ static char *pd_buf;		/* buffer for request in progress */
 
 static enum action do_pd_io_start(void)
 {
-	if (pd_req->cmd_type == REQ_TYPE_SPECIAL) {
+	if (pd_req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		phase = pd_special;
 		return pd_special();
 	}
@@ -725,7 +725,7 @@ static int pd_special_command(struct pd_unit *disk,
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = func;
 
 	err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 5d552857de41..59c91d49b14b 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -620,7 +620,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	spin_unlock_irq(&host->lock);
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	crq->rq->special = crq;
 	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
@@ -661,7 +661,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	crq->msg_bucket = (u32) rc;
 
 	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+	crq->rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	crq->rq->special = crq;
 	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
 
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 5ea2f0bbbc7c..d4d05f064d39 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -124,7 +124,7 @@ static inline void virtblk_request_done(struct request *req)
 		req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
 		req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
 		req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
-	} else if (req->cmd_type == REQ_TYPE_SPECIAL) {
+	} else if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
 		req->errors = (error != 0);
 	}
 
@@ -188,7 +188,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 			vbr->out_hdr.sector = 0;
 			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
 			break;
-		case REQ_TYPE_SPECIAL:
+		case REQ_TYPE_DRV_PRIV:
 			vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
 			vbr->out_hdr.sector = 0;
 			vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
@@ -251,7 +251,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 		return PTR_ERR(req);
 	}
 
-	req->cmd_type = REQ_TYPE_SPECIAL;
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
 	err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
 	blk_put_request(req);
 
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index fac3d9da2e07..b367300ce479 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -93,7 +93,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	int error;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = (char *)pc;
 
 	if (buf && bufflen) {
@@ -477,7 +477,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if (uptodate == 0)
 			drive->failed_pc = NULL;
 
-		if (rq->cmd_type == REQ_TYPE_SPECIAL) {
+		if (rq->cmd_type == REQ_TYPE_DRV_PRIV) {
 			rq->errors = 0;
 			error = 0;
 		} else {
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 0b510bafd90e..9a32603c6c74 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -799,7 +799,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 		cdrom_do_block_pc(drive, rq);
 		break;
-	case REQ_TYPE_SPECIAL:
+	case REQ_TYPE_DRV_PRIV:
 		/* right now this can only be a reset... */
 		uptodate = 1;
 		goto out_end;
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 02caa7dd51c8..066e39036518 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -304,7 +304,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	int ret;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->cmd_flags = REQ_QUIET;
 	ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
 	blk_put_request(rq);
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index 9e98122f646e..b05a74d78ef5 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -166,7 +166,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 		return setting->set(drive, arg);
 
 	rq = blk_get_request(q, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->cmd_len = 5;
 	rq->cmd[0] = REQ_DEVSET_EXEC;
 	*(int *)&rq->cmd[1] = arg;
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 32970664c275..19d809c48a8d 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -147,7 +147,7 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
 {
 	struct request *rq = drive->hwif->rq;
 
-	if (rq && rq->cmd_type == REQ_TYPE_SPECIAL &&
+	if (rq && rq->cmd_type == REQ_TYPE_DRV_PRIV &&
 	    rq->cmd[0] == REQ_DRIVE_RESET) {
 		if (err <= 0 && rq->errors == 0)
 			rq->errors = -EIO;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 8c6363cdd208..3dbfa5b6db6a 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -97,7 +97,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
 			       "Aborting request!\n");
 	}
 
-	if (rq->cmd_type == REQ_TYPE_SPECIAL)
+	if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
 		rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
 
 	return uptodate;
@@ -246,7 +246,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		} else
 			printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
 
-		if (rq->cmd_type == REQ_TYPE_SPECIAL) {
+		if (rq->cmd_type == REQ_TYPE_DRV_PRIV) {
 			rq->errors = 0;
 			ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 			return ide_stopped;
@@ -265,7 +265,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 		break;
-	case REQ_TYPE_SPECIAL:
+	case REQ_TYPE_DRV_PRIV:
 	case REQ_TYPE_SENSE:
 		pc = (struct ide_atapi_pc *)rq->special;
 		break;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 177db6d5b2f5..8e55abd03a24 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(ide_complete_rq);
 
 void ide_kill_rq(ide_drive_t *drive, struct request *rq)
 {
-	u8 drv_req = (rq->cmd_type == REQ_TYPE_SPECIAL) && rq->rq_disk;
+	u8 drv_req = (rq->cmd_type == REQ_TYPE_DRV_PRIV) && rq->rq_disk;
 	u8 media = drive->media;
 
 	drive->failed_pc = NULL;
@@ -353,7 +353,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 			    pm->pm_step == IDE_PM_COMPLETED)
 				ide_complete_pm_rq(drive, rq);
 			return startstop;
-		} else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_SPECIAL)
+		} else if (!rq->rq_disk && rq->cmd_type == REQ_TYPE_DRV_PRIV)
 			/*
 			 * TODO: Once all ULDs have been modified to
 			 * check for specific op codes rather than
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 6233fa2cb8a9..aa2e9b77b20d 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -222,7 +222,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	int ret = 0;
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->cmd_len = 1;
 	rq->cmd[0] = REQ_DRIVE_RESET;
 	if (blk_execute_rq(drive->queue, NULL, rq, 1))
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index ca958604cda2..c80868520488 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -34,7 +34,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	rq = blk_get_request(q, READ, __GFP_WAIT);
 	rq->cmd[0] = REQ_PARK_HEADS;
 	rq->cmd_len = 1;
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->special = &timeout;
 	rc = blk_execute_rq(q, NULL, rq, 1);
 	blk_put_request(rq);
@@ -51,7 +51,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 
 	rq->cmd[0] = REQ_UNPARK_HEADS;
 	rq->cmd_len = 1;
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
 
 out:
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 6eb738ca6d2f..2a5d543db9f5 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -576,7 +576,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		      rq->cmd[0], (unsigned long long)blk_rq_pos(rq),
 		      blk_rq_sectors(rq));
 
-	BUG_ON(!(rq->cmd_type == REQ_TYPE_SPECIAL ||
+	BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV ||
 		 rq->cmd_type == REQ_TYPE_SENSE));
 
 	/* Retry a failed packet command */
@@ -853,7 +853,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	BUG_ON(size < 0 || size % tape->blk_size);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
 	rq->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
 	rq->__sector = tape->first_frame;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7f9a516f24de..98c90272443b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -79,10 +79,10 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_SUSPEND,		/* suspend request */
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
-	REQ_TYPE_SPECIAL,		/* driver defined type */
+	REQ_TYPE_DRV_PRIV,		/* driver defined type */
 	/*
 	 * for ATA/ATAPI devices. this really doesn't belong here, ide should
-	 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver
+	 * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver
 	 * private REQ_LB opcodes to differentiate what type of request this is
 	 */
 	REQ_TYPE_ATA_TASKFILE,
-- 
cgit v1.2.3


From b42171ef7d938a66fa52e66a3d911ed63770b5ca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Apr 2015 22:37:17 +0200
Subject: block: move REQ_TYPE_ATA_TASKFILE and REQ_TYPE_ATA_PC to ide.h

These values are only used by the IDE driver, so move them into it
by allowing drivers to take cmd_type values after the first private
one.  Note that we have to turn cmd_type into a plain unsigned integer
so that gcc doesn't complain about mismatching enum types.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 11 ++---------
 include/linux/ide.h    |  7 +++++++
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 98c90272443b..9cb4d80a4987 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -79,14 +79,7 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_SUSPEND,		/* suspend request */
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
-	REQ_TYPE_DRV_PRIV,		/* driver defined type */
-	/*
-	 * for ATA/ATAPI devices. this really doesn't belong here, ide should
-	 * use REQ_TYPE_DRV_PRIV and use rq->cmd[0] with the range of driver
-	 * private REQ_LB opcodes to differentiate what type of request this is
-	 */
-	REQ_TYPE_ATA_TASKFILE,
-	REQ_TYPE_ATA_PC,
+	REQ_TYPE_DRV_PRIV,		/* driver defined types from here */
 };
 
 #define BLK_MAX_CDB	16
@@ -108,7 +101,7 @@ struct request {
 	struct blk_mq_ctx *mq_ctx;
 
 	u64 cmd_flags;
-	enum rq_cmd_type_bits cmd_type;
+	unsigned cmd_type;
 	unsigned long atomic_flags;
 
 	int cpu;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 93b5ca754b5b..62ac399144a6 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -39,6 +39,12 @@
 
 struct device;
 
+/* IDE-specific values for req->cmd_type */
+enum ata_cmd_type_bits {
+	REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1,
+	REQ_TYPE_ATA_PC,
+};
+
 /* Error codes returned in rq->errors to the higher part of the driver. */
 enum {
 	IDE_DRV_ERROR_GENERAL	= 101,
@@ -1551,4 +1557,5 @@ static inline void ide_set_drivedata(ide_drive_t *drive, void *data)
 #define ide_host_for_each_port(i, port, host) \
 	for ((i) = 0; ((port) = (host)->ports[i]) || (i) < MAX_HOST_PORTS; (i)++)
 
+
 #endif /* _IDE_H */
-- 
cgit v1.2.3


From b0b93b48a30e809240ddd7449a6ad60a5ddf7b4d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Apr 2015 22:37:18 +0200
Subject: block: move REQ_TYPE_SENSE to the ide driver

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/ide/ide-atapi.c  | 6 +++---
 drivers/ide/ide-cd.c     | 8 ++++----
 drivers/ide/ide-floppy.c | 2 +-
 drivers/ide/ide-tape.c   | 2 +-
 include/linux/blkdev.h   | 1 -
 include/linux/ide.h      | 1 +
 6 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index b367300ce479..1362ad80a76c 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -191,7 +191,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 
 	BUG_ON(sense_len > sizeof(*sense));
 
-	if (rq->cmd_type == REQ_TYPE_SENSE || drive->sense_rq_armed)
+	if (rq->cmd_type == REQ_TYPE_ATA_SENSE || drive->sense_rq_armed)
 		return;
 
 	memset(sense, 0, sizeof(*sense));
@@ -210,7 +210,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	sense_rq->rq_disk = rq->rq_disk;
 	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
 	sense_rq->cmd[4] = cmd_len;
-	sense_rq->cmd_type = REQ_TYPE_SENSE;
+	sense_rq->cmd_type = REQ_TYPE_ATA_SENSE;
 	sense_rq->cmd_flags |= REQ_PREEMPT;
 
 	if (drive->media == ide_tape)
@@ -310,7 +310,7 @@ int ide_cd_get_xferlen(struct request *rq)
 	switch (rq->cmd_type) {
 	case REQ_TYPE_FS:
 		return 32768;
-	case REQ_TYPE_SENSE:
+	case REQ_TYPE_ATA_SENSE:
 	case REQ_TYPE_BLOCK_PC:
 	case REQ_TYPE_ATA_PC:
 		return blk_rq_bytes(rq);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 9a32603c6c74..64a6b827b3dd 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -210,7 +210,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
-	 * For REQ_TYPE_SENSE, "rq->special" points to the original
+	 * For REQ_TYPE_ATA_SENSE, "rq->special" points to the original
 	 * failed request.  Also, the sense data should be read
 	 * directly from rq which might be different from the original
 	 * sense buffer if it got copied during mapping.
@@ -285,7 +285,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 				  "stat 0x%x",
 				  rq->cmd[0], rq->cmd_type, err, stat);
 
-	if (rq->cmd_type == REQ_TYPE_SENSE) {
+	if (rq->cmd_type == REQ_TYPE_ATA_SENSE) {
 		/*
 		 * We got an error trying to get sense info from the drive
 		 * (probably while trying to recover from a former error).
@@ -526,7 +526,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, thislen, uptodate = 0;
 	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0;
-	int sense = (rq->cmd_type == REQ_TYPE_SENSE);
+	int sense = (rq->cmd_type == REQ_TYPE_ATA_SENSE);
 	unsigned int timeout;
 	u16 len;
 	u8 ireason, stat;
@@ -791,7 +791,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		if (cdrom_start_rw(drive, rq) == ide_stopped)
 			goto out_end;
 		break;
-	case REQ_TYPE_SENSE:
+	case REQ_TYPE_ATA_SENSE:
 	case REQ_TYPE_BLOCK_PC:
 	case REQ_TYPE_ATA_PC:
 		if (!rq->timeout)
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 3dbfa5b6db6a..2fb5350c5410 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -266,7 +266,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 		break;
 	case REQ_TYPE_DRV_PRIV:
-	case REQ_TYPE_SENSE:
+	case REQ_TYPE_ATA_SENSE:
 		pc = (struct ide_atapi_pc *)rq->special;
 		break;
 	case REQ_TYPE_BLOCK_PC:
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2a5d543db9f5..f5d51d1d09ee 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -577,7 +577,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		      blk_rq_sectors(rq));
 
 	BUG_ON(!(rq->cmd_type == REQ_TYPE_DRV_PRIV ||
-		 rq->cmd_type == REQ_TYPE_SENSE));
+		 rq->cmd_type == REQ_TYPE_ATA_SENSE));
 
 	/* Retry a failed packet command */
 	if (drive->failed_pc && drive->pc->c[0] == REQUEST_SENSE) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9cb4d80a4987..6076b9e18dcb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -75,7 +75,6 @@ struct request_list {
 enum rq_cmd_type_bits {
 	REQ_TYPE_FS		= 1,	/* fs request */
 	REQ_TYPE_BLOCK_PC,		/* scsi command */
-	REQ_TYPE_SENSE,			/* sense request */
 	REQ_TYPE_PM_SUSPEND,		/* suspend request */
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 62ac399144a6..9856b7d455d9 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -43,6 +43,7 @@ struct device;
 enum ata_cmd_type_bits {
 	REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1,
 	REQ_TYPE_ATA_PC,
+	REQ_TYPE_ATA_SENSE,	/* sense request */
 };
 
 /* Error codes returned in rq->errors to the higher part of the driver. */
-- 
cgit v1.2.3


From ac7cdff00a33d48d27217560fa3b16d802e5f535 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Apr 2015 22:37:19 +0200
Subject: block: remove REQ_TYPE_PM_SHUTDOWN

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6076b9e18dcb..c2829ba5e738 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -77,7 +77,6 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_BLOCK_PC,		/* scsi command */
 	REQ_TYPE_PM_SUSPEND,		/* suspend request */
 	REQ_TYPE_PM_RESUME,		/* resume request */
-	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
 	REQ_TYPE_DRV_PRIV,		/* driver defined types from here */
 };
 
-- 
cgit v1.2.3


From a7928c1578c550bd6f4dec62d65132e6db226c57 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Apr 2015 22:37:20 +0200
Subject: block: move PM request support to IDE

This removes the request types and hacks from the block code and into the
old IDE driver.  There is a small amunt of code duplication due to this,
but it's not too bad.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c           |  1 +
 block/blk-exec.c           | 10 ---------
 block/blk.h                |  2 --
 drivers/ide/ide-eh.c       |  2 +-
 drivers/ide/ide-io.c       |  8 +++----
 drivers/ide/ide-pm.c       | 56 +++++++++++++++++++++++++++++++++++-----------
 drivers/ide/ide-taskfile.c |  2 +-
 include/linux/blkdev.h     | 21 +----------------
 include/linux/ide.h        | 19 ++++++++++++++++
 9 files changed, 70 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index fd154b94447a..2e5020f37d55 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -285,6 +285,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q)
 	q->request_fn(q);
 	q->request_fn_active--;
 }
+EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
 
 /**
  * __blk_run_queue - run a single device queue
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 9924725fa50d..3fec8a29d0fa 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 			   rq_end_io_fn *done)
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-	bool is_pm_resume;
 
 	WARN_ON(irqs_disabled());
 	WARN_ON(rq->cmd_type == REQ_TYPE_FS);
@@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 		return;
 	}
 
-	/*
-	 * need to check this before __blk_run_queue(), because rq can
-	 * be freed before that returns.
-	 */
-	is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
-
 	spin_lock_irq(q->queue_lock);
 
 	if (unlikely(blk_queue_dying(q))) {
@@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	__elv_add_request(q, rq, where);
 	__blk_run_queue(q);
-	/* the queue is stopped so it won't be run */
-	if (is_pm_resume)
-		__blk_run_queue_uncond(q);
 	spin_unlock_irq(q->queue_lock);
 }
 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
diff --git a/block/blk.h b/block/blk.h
index 43b036185712..4b48d55e588e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -193,8 +193,6 @@ int blk_try_merge(struct request *rq, struct bio *bio);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
-void __blk_run_queue_uncond(struct request_queue *q);
-
 int blk_dev_init(void);
 
 
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 19d809c48a8d..d6da011299f5 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -129,7 +129,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
 
 			if (cmd)
 				ide_complete_cmd(drive, cmd, stat, err);
-		} else if (blk_pm_request(rq)) {
+		} else if (ata_pm_request(rq)) {
 			rq->errors = 1;
 			ide_complete_pm_rq(drive, rq);
 			return ide_stopped;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 8e55abd03a24..669ea1e45795 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -320,7 +320,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		goto kill_rq;
 	}
 
-	if (blk_pm_request(rq))
+	if (ata_pm_request(rq))
 		ide_check_pm_state(drive, rq);
 
 	drive->hwif->tp_ops->dev_select(drive);
@@ -342,8 +342,8 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 
 		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
 			return execute_drive_cmd(drive, rq);
-		else if (blk_pm_request(rq)) {
-			struct request_pm_state *pm = rq->special;
+		else if (ata_pm_request(rq)) {
+			struct ide_pm_state *pm = rq->special;
 #ifdef DEBUG_PM
 			printk("%s: start_power_step(step: %d)\n",
 				drive->name, pm->pm_step);
@@ -538,7 +538,7 @@ repeat:
 		 * state machine.
 		 */
 		if ((drive->dev_flags & IDE_DFLAG_BLOCKED) &&
-		    blk_pm_request(rq) == 0 &&
+		    ata_pm_request(rq) == 0 &&
 		    (rq->cmd_flags & REQ_PREEMPT) == 0) {
 			/* there should be no pending command at this point */
 			ide_unlock_port(hwif);
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 8d1e32d7cd97..081e43458d50 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -8,7 +8,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	ide_drive_t *pair = ide_get_pair_dev(drive);
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
-	struct request_pm_state rqpm;
+	struct ide_pm_state rqpm;
 	int ret;
 
 	if (ide_port_acpi(hwif)) {
@@ -19,7 +19,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_PM_SUSPEND;
+	rq->cmd_type = REQ_TYPE_ATA_PM_SUSPEND;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
 	if (mesg.event == PM_EVENT_PRETHAW)
@@ -38,13 +38,43 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	return ret;
 }
 
+static void ide_end_sync_rq(struct request *rq, int error)
+{
+	complete(rq->end_io_data);
+}
+
+static int ide_pm_execute_rq(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	rq->end_io_data = &wait;
+	rq->end_io = ide_end_sync_rq;
+
+	spin_lock_irq(q->queue_lock);
+	if (unlikely(blk_queue_dying(q))) {
+		rq->cmd_flags |= REQ_QUIET;
+		rq->errors = -ENXIO;
+		__blk_end_request_all(rq, rq->errors);
+		spin_unlock_irq(q->queue_lock);
+		return -ENXIO;
+	}
+	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
+	__blk_run_queue_uncond(q);
+	spin_unlock_irq(q->queue_lock);
+
+	wait_for_completion_io(&wait);
+
+	return rq->errors ? -EIO : 0;
+}
+
 int generic_ide_resume(struct device *dev)
 {
 	ide_drive_t *drive = to_ide_device(dev);
 	ide_drive_t *pair = ide_get_pair_dev(drive);
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
-	struct request_pm_state rqpm;
+	struct ide_pm_state rqpm;
 	int err;
 
 	if (ide_port_acpi(hwif)) {
@@ -59,13 +89,13 @@ int generic_ide_resume(struct device *dev)
 
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
-	rq->cmd_type = REQ_TYPE_PM_RESUME;
+	rq->cmd_type = REQ_TYPE_ATA_PM_RESUME;
 	rq->cmd_flags |= REQ_PREEMPT;
 	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
 	rqpm.pm_state = PM_EVENT_ON;
 
-	err = blk_execute_rq(drive->queue, NULL, rq, 1);
+	err = ide_pm_execute_rq(rq);
 	blk_put_request(rq);
 
 	if (err == 0 && dev->driver) {
@@ -80,7 +110,7 @@ int generic_ide_resume(struct device *dev)
 
 void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = rq->special;
 
 #ifdef DEBUG_PM
 	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -110,7 +140,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 
 ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = rq->special;
 	struct ide_cmd cmd = { };
 
 	switch (pm->pm_step) {
@@ -182,7 +212,7 @@ out_do_tf:
 void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
-	struct request_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = rq->special;
 	unsigned long flags;
 
 	ide_complete_power_step(drive, rq);
@@ -191,10 +221,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 #ifdef DEBUG_PM
 	printk("%s: completing PM request, %s\n", drive->name,
-	       (rq->cmd_type == REQ_TYPE_PM_SUSPEND) ? "suspend" : "resume");
+	       (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND) ? "suspend" : "resume");
 #endif
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (rq->cmd_type == REQ_TYPE_PM_SUSPEND)
+	if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND)
 		blk_stop_queue(q);
 	else
 		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
@@ -208,13 +238,13 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = rq->special;
 
-	if (rq->cmd_type == REQ_TYPE_PM_SUSPEND &&
+	if (rq->cmd_type == REQ_TYPE_ATA_PM_SUSPEND &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
 		/* Mark drive blocked when starting the suspend sequence. */
 		drive->dev_flags |= IDE_DFLAG_BLOCKED;
-	else if (rq->cmd_type == REQ_TYPE_PM_RESUME &&
+	else if (rq->cmd_type == REQ_TYPE_ATA_PM_RESUME &&
 		 pm->pm_step == IDE_PM_START_RESUME) {
 		/*
 		 * The first thing we do on wakeup is to wait for BSY bit to
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index dabb88b1cbec..0979e126fff1 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -186,7 +186,7 @@ static ide_startstop_t task_no_data_intr(ide_drive_t *drive)
 	    tf->command == ATA_CMD_CHK_POWER) {
 		struct request *rq = hwif->rq;
 
-		if (blk_pm_request(rq))
+		if (ata_pm_request(rq))
 			ide_complete_pm_rq(drive, rq);
 		else
 			ide_finish_cmd(drive, cmd, stat);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c2829ba5e738..2da818a48097 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -30,7 +30,6 @@ struct scsi_ioctl_command;
 
 struct request_queue;
 struct elevator_queue;
-struct request_pm_state;
 struct blk_trace;
 struct request;
 struct sg_io_hdr;
@@ -75,8 +74,6 @@ struct request_list {
 enum rq_cmd_type_bits {
 	REQ_TYPE_FS		= 1,	/* fs request */
 	REQ_TYPE_BLOCK_PC,		/* scsi command */
-	REQ_TYPE_PM_SUSPEND,		/* suspend request */
-	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_DRV_PRIV,		/* driver defined types from here */
 };
 
@@ -207,19 +204,6 @@ static inline unsigned short req_get_ioprio(struct request *req)
 	return req->ioprio;
 }
 
-/*
- * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
- * requests. Some step values could eventually be made generic.
- */
-struct request_pm_state
-{
-	/* PM state machine step value, currently driver specific */
-	int	pm_step;
-	/* requested PM state value (S1, S2, S3, S4, ...) */
-	u32	pm_state;
-	void*	data;		/* for driver use */
-};
-
 #include <linux/elevator.h>
 
 struct blk_queue_ctx;
@@ -601,10 +585,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 	(((rq)->cmd_flags & REQ_STARTED) && \
 	 ((rq)->cmd_type == REQ_TYPE_FS))
 
-#define blk_pm_request(rq)	\
-	((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \
-	 (rq)->cmd_type == REQ_TYPE_PM_RESUME)
-
 #define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 /* rq->queuelist of dequeued request must be list_empty() */
@@ -838,6 +818,7 @@ extern void blk_stop_queue(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
 extern void __blk_stop_queue(struct request_queue *q);
 extern void __blk_run_queue(struct request_queue *q);
+extern void __blk_run_queue_uncond(struct request_queue *q);
 extern void blk_run_queue(struct request_queue *);
 extern void blk_run_queue_async(struct request_queue *q);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 9856b7d455d9..a633898f36ac 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -44,8 +44,14 @@ enum ata_cmd_type_bits {
 	REQ_TYPE_ATA_TASKFILE = REQ_TYPE_DRV_PRIV + 1,
 	REQ_TYPE_ATA_PC,
 	REQ_TYPE_ATA_SENSE,	/* sense request */
+	REQ_TYPE_ATA_PM_SUSPEND,/* suspend request */
+	REQ_TYPE_ATA_PM_RESUME,	/* resume request */
 };
 
+#define ata_pm_request(rq)	\
+	((rq)->cmd_type == REQ_TYPE_ATA_PM_SUSPEND || \
+	 (rq)->cmd_type == REQ_TYPE_ATA_PM_RESUME)
+
 /* Error codes returned in rq->errors to the higher part of the driver. */
 enum {
 	IDE_DRV_ERROR_GENERAL	= 101,
@@ -1321,6 +1327,19 @@ struct ide_port_info {
 	u8			udma_mask;
 };
 
+/*
+ * State information carried for REQ_TYPE_ATA_PM_SUSPEND and REQ_TYPE_ATA_PM_RESUME
+ * requests.
+ */
+struct ide_pm_state {
+	/* PM state machine step value, currently driver specific */
+	int	pm_step;
+	/* requested PM state value (S1, S2, S3, S4, ...) */
+	u32	pm_state;
+	void*	data;		/* for driver use */
+};
+
+
 int ide_pci_init_one(struct pci_dev *, const struct ide_port_info *, void *);
 int ide_pci_init_two(struct pci_dev *, struct pci_dev *,
 		     const struct ide_port_info *, void *);
-- 
cgit v1.2.3


From cd8ae85299d54155702a56811b2e035e63064d3d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 3 May 2015 21:34:46 -0700
Subject: tcp: provide SYN headers for passive connections

This patch allows a server application to get the TCP SYN headers for
its passive connections.  This is useful if the server is doing
fingerprinting of clients based on SYN packet contents.

Two socket options are added: TCP_SAVE_SYN and TCP_SAVED_SYN.

The first is used on a socket to enable saving the SYN headers
for child connections. This can be set before or after the listen()
call.

The latter is used to retrieve the SYN headers for passive connections,
if the parent listener has enabled TCP_SAVE_SYN.

TCP_SAVED_SYN is read once, it frees the saved SYN headers.

The data returned in TCP_SAVED_SYN are network (IPv4/IPv6) and TCP
headers.

Original patch was written by Tom Herbert, I changed it to not hold
a full skb (and associated dst and conntracking reference).

We have used such patch for about 3 years at Google.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h        |  8 ++++++++
 include/net/request_sock.h |  4 +++-
 include/uapi/linux/tcp.h   |  2 ++
 net/ipv4/tcp.c             | 35 +++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       | 18 ++++++++++++++++++
 net/ipv4/tcp_ipv4.c        |  1 +
 net/ipv4/tcp_minisocks.c   |  3 +++
 7 files changed, 70 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3b2911502a8c..e6fb5df22db1 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -199,6 +199,7 @@ struct tcp_sock {
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
+		save_syn:1,	/* Save headers of SYN packet */
 		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
@@ -326,6 +327,7 @@ struct tcp_sock {
 	 * socket. Used to retransmit SYNACKs etc.
 	 */
 	struct request_sock *fastopen_rsk;
+	u32	*saved_syn;
 };
 
 enum tsq_flags {
@@ -393,4 +395,10 @@ static inline int fastopen_init_queue(struct sock *sk, int backlog)
 	return 0;
 }
 
+static inline void tcp_saved_syn_free(struct tcp_sock *tp)
+{
+	kfree(tp->saved_syn);
+	tp->saved_syn = NULL;
+}
+
 #endif	/* _LINUX_TCP_H */
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 9f4265ce8892..87935cad2f7b 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -64,6 +64,7 @@ struct request_sock {
 	struct timer_list		rsk_timer;
 	const struct request_sock_ops	*rsk_ops;
 	struct sock			*sk;
+	u32				*saved_syn;
 	u32				secid;
 	u32				peer_secid;
 };
@@ -77,7 +78,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
 		req->rsk_ops = ops;
 		sock_hold(sk_listener);
 		req->rsk_listener = sk_listener;
-
+		req->saved_syn = NULL;
 		/* Following is temporary. It is coupled with debugging
 		 * helpers in reqsk_put() & reqsk_free()
 		 */
@@ -104,6 +105,7 @@ static inline void reqsk_free(struct request_sock *req)
 	req->rsk_ops->destructor(req);
 	if (req->rsk_listener)
 		sock_put(req->rsk_listener);
+	kfree(req->saved_syn);
 	kmem_cache_free(req->rsk_ops->slab, req);
 }
 
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index faa72f4fa547..51ebedba577f 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -113,6 +113,8 @@ enum {
 #define TCP_TIMESTAMP		24
 #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
 #define TCP_CC_INFO		26	/* Get Congestion Control (optional) info */
+#define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
+#define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 46efa03d2b11..ecccfdc50d76 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2482,6 +2482,13 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			icsk->icsk_syn_retries = val;
 		break;
 
+	case TCP_SAVE_SYN:
+		if (val < 0 || val > 1)
+			err = -EINVAL;
+		else
+			tp->save_syn = val;
+		break;
+
 	case TCP_LINGER2:
 		if (val < 0)
 			tp->linger2 = -1;
@@ -2818,6 +2825,34 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_NOTSENT_LOWAT:
 		val = tp->notsent_lowat;
 		break;
+	case TCP_SAVE_SYN:
+		val = tp->save_syn;
+		break;
+	case TCP_SAVED_SYN: {
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		lock_sock(sk);
+		if (tp->saved_syn) {
+			len = min_t(unsigned int, tp->saved_syn[0], len);
+			if (put_user(len, optlen)) {
+				release_sock(sk);
+				return -EFAULT;
+			}
+			if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+				release_sock(sk);
+				return -EFAULT;
+			}
+			tcp_saved_syn_free(tp);
+			release_sock(sk);
+		} else {
+			release_sock(sk);
+			len = 0;
+			if (put_user(len, optlen))
+				return -EFAULT;
+		}
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 09bdc4abfcbb..df2ca615cd0c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6060,6 +6060,23 @@ static bool tcp_syn_flood_action(struct sock *sk,
 	return want_cookie;
 }
 
+static void tcp_reqsk_record_syn(const struct sock *sk,
+				 struct request_sock *req,
+				 const struct sk_buff *skb)
+{
+	if (tcp_sk(sk)->save_syn) {
+		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
+		u32 *copy;
+
+		copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
+		if (copy) {
+			copy[0] = len;
+			memcpy(&copy[1], skb_network_header(skb), len);
+			req->saved_syn = copy;
+		}
+	}
+}
+
 int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		     const struct tcp_request_sock_ops *af_ops,
 		     struct sock *sk, struct sk_buff *skb)
@@ -6192,6 +6209,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		tcp_rsk(req)->tfo_listener = false;
 		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
+	tcp_reqsk_record_syn(sk, req, skb);
 
 	return 0;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fc1c658ec6c1..91cb4768a860 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1802,6 +1802,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	/* If socket is aborted during connect operation */
 	tcp_free_fastopen_req(tp);
+	tcp_saved_syn_free(tp);
 
 	sk_sockets_allocated_dec(sk);
 	sock_release_memcg(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e5d7649136fc..ebe2ab2596ed 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -536,6 +536,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->fastopen_rsk = NULL;
 		newtp->syn_data_acked = 0;
 
+		newtp->saved_syn = req->saved_syn;
+		req->saved_syn = NULL;
+
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
 	return newsk;
-- 
cgit v1.2.3


From 2c7a88c252bf3381958cf716f31b6b2e0f2f3fa7 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Mon, 4 May 2015 14:33:48 -0700
Subject: etherdev: Fix sparse error, make test usable by other functions

This change does two things.  First it fixes a sparse error for the fact
that the __be16 degrades to an integer.  Since that is actually what I am
kind of doing I am simply working around that by forcing both sides of the
comparison to u16.

Also I realized on some compilers I was generating another instruction for
big endian systems such as PowerPC since it was masking the value before
doing the comparison.  So to resolve that I have simply pulled the mask out
and wrapped it in an #ifndef __BIG_ENDIAN.

Lastly I pulled this all out into its own function.  I notices there are
similar checks in a number of other places so this function can be reused
there to help reduce overhead in these paths as well.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 18 ++++++++++++++++++
 net/ethernet/eth.c          |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index c4a10f991fe0..9012f8775208 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -190,6 +190,24 @@ static inline bool is_valid_ether_addr(const u8 *addr)
 	return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr);
 }
 
+/**
+ * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol
+ * @proto: Ethertype/length value to be tested
+ *
+ * Check that the value from the Ethertype/length field is a valid Ethertype.
+ *
+ * Return true if the valid is an 802.3 supported Ethertype.
+ */
+static inline bool eth_proto_is_802_3(__be16 proto)
+{
+#ifndef __BIG_ENDIAN
+	/* if CPU is little endian mask off bits representing LSB */
+	proto &= htons(0xFF00);
+#endif
+	/* cast both to u16 and compare since LSB can be ignored */
+	return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN);
+}
+
 /**
  * eth_random_addr - Generate software assigned random Ethernet address
  * @addr: Pointer to a six-byte array containing the Ethernet address
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 314e4c5a5a5e..9045e2a1108f 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -179,7 +179,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(netdev_uses_dsa(dev)))
 		return htons(ETH_P_XDSA);
 
-	if (likely((eth->h_proto & htons(0xFF00)) >= htons(ETH_P_802_3_MIN)))
+	if (likely(eth_proto_is_802_3(eth->h_proto)))
 		return eth->h_proto;
 
 	/*
-- 
cgit v1.2.3


From 9545b22da647cf6fbbac9c5a48c50fd72d892b11 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Mon, 4 May 2015 14:34:10 -0700
Subject: vlan: Use eth_proto_is_802_3

Replace "ntohs(proto) >= ETH_P_802_3_MIN" w/ eth_proto_is_802_3(proto).

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 920e4457ce6e..b9ab677c0c0a 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -539,7 +539,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb,
 	 */
 
 	proto = vhdr->h_vlan_encapsulated_proto;
-	if (ntohs(proto) >= ETH_P_802_3_MIN) {
+	if (eth_proto_is_802_3(proto)) {
 		skb->protocol = proto;
 		return;
 	}
-- 
cgit v1.2.3


From 2893c379461a208b3059f55dfe4dafa06b4aa46a Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Tue, 31 Mar 2015 20:16:52 +0200
Subject: clk: make strings in parent name arrays const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The clk functions and structs declare the parent_name arrays as
'const char **parent_names' which means the parent name strings
are const, but the array itself is not. Use
'const char * const * parent_names' instead which also makes
the array const. This allows us to put the parent_name arrays into
the __initconst section.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Reviewed-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Tested-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
[sboyd@codeaurora.org: Squelch 80-character checkpatch warnings]
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-composite.c  |  2 +-
 drivers/clk/clk-mux.c        |  6 ++++--
 include/linux/clk-provider.h | 10 ++++++----
 3 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c
index 956b7e54fa1c..077f4c7148f1 100644
--- a/drivers/clk/clk-composite.c
+++ b/drivers/clk/clk-composite.c
@@ -188,7 +188,7 @@ static void clk_composite_disable(struct clk_hw *hw)
 }
 
 struct clk *clk_register_composite(struct device *dev, const char *name,
-			const char **parent_names, int num_parents,
+			const char * const *parent_names, int num_parents,
 			struct clk_hw *mux_hw, const struct clk_ops *mux_ops,
 			struct clk_hw *rate_hw, const struct clk_ops *rate_ops,
 			struct clk_hw *gate_hw, const struct clk_ops *gate_ops,
diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c
index 69a094c3783d..6066a01b20ea 100644
--- a/drivers/clk/clk-mux.c
+++ b/drivers/clk/clk-mux.c
@@ -114,7 +114,8 @@ const struct clk_ops clk_mux_ro_ops = {
 EXPORT_SYMBOL_GPL(clk_mux_ro_ops);
 
 struct clk *clk_register_mux_table(struct device *dev, const char *name,
-		const char **parent_names, u8 num_parents, unsigned long flags,
+		const char * const *parent_names, u8 num_parents,
+		unsigned long flags,
 		void __iomem *reg, u8 shift, u32 mask,
 		u8 clk_mux_flags, u32 *table, spinlock_t *lock)
 {
@@ -166,7 +167,8 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name,
 EXPORT_SYMBOL_GPL(clk_register_mux_table);
 
 struct clk *clk_register_mux(struct device *dev, const char *name,
-		const char **parent_names, u8 num_parents, unsigned long flags,
+		const char * const *parent_names, u8 num_parents,
+		unsigned long flags,
 		void __iomem *reg, u8 shift, u8 width,
 		u8 clk_mux_flags, spinlock_t *lock)
 {
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index df695313f975..5378c2aba4d2 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -209,7 +209,7 @@ struct clk_ops {
 struct clk_init_data {
 	const char		*name;
 	const struct clk_ops	*ops;
-	const char		**parent_names;
+	const char		* const *parent_names;
 	u8			num_parents;
 	unsigned long		flags;
 };
@@ -426,12 +426,14 @@ extern const struct clk_ops clk_mux_ops;
 extern const struct clk_ops clk_mux_ro_ops;
 
 struct clk *clk_register_mux(struct device *dev, const char *name,
-		const char **parent_names, u8 num_parents, unsigned long flags,
+		const char * const *parent_names, u8 num_parents,
+		unsigned long flags,
 		void __iomem *reg, u8 shift, u8 width,
 		u8 clk_mux_flags, spinlock_t *lock);
 
 struct clk *clk_register_mux_table(struct device *dev, const char *name,
-		const char **parent_names, u8 num_parents, unsigned long flags,
+		const char * const *parent_names, u8 num_parents,
+		unsigned long flags,
 		void __iomem *reg, u8 shift, u32 mask,
 		u8 clk_mux_flags, u32 *table, spinlock_t *lock);
 
@@ -518,7 +520,7 @@ struct clk_composite {
 };
 
 struct clk *clk_register_composite(struct device *dev, const char *name,
-		const char **parent_names, int num_parents,
+		const char * const *parent_names, int num_parents,
 		struct clk_hw *mux_hw, const struct clk_ops *mux_ops,
 		struct clk_hw *rate_hw, const struct clk_ops *rate_ops,
 		struct clk_hw *gate_hw, const struct clk_ops *gate_ops,
-- 
cgit v1.2.3


From d5622a9c13752be46e6fcde9d31391ce0bb0598b Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Mon, 2 Mar 2015 15:45:41 +0000
Subject: clkdev: use clk_hw internally

clk_add_alias() calls clk_get() followed by clk_put() but in between
those two calls it saves away the struct clk pointer to a clk_lookup
structure. This leaves the 'clk' member of the clk_lookup pointing at
freed memory on configurations where CONFIG_COMMON_CLK=y. This is a
problem because clk_get_sys() will eventually try to dereference the
freed pointer by calling __clk_get_hw() on it. Fix this by saving away
the struct clk_hw pointer instead of the struct clk pointer so that when
we try to create a per-user struct clk in clk_get_sys() we don't
dereference a junk pointer.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/clk/clkdev.c   | 24 ++++++++++++++++--------
 include/linux/clkdev.h |  1 +
 2 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c
index 1fcb6ef2cdac..1bb120a3855f 100644
--- a/drivers/clk/clkdev.c
+++ b/drivers/clk/clkdev.c
@@ -177,7 +177,7 @@ struct clk *clk_get_sys(const char *dev_id, const char *con_id)
 	if (!cl)
 		goto out;
 
-	clk = __clk_create_clk(__clk_get_hw(cl->clk), dev_id, con_id);
+	clk = __clk_create_clk(cl->clk_hw, dev_id, con_id);
 	if (IS_ERR(clk))
 		goto out;
 
@@ -215,18 +215,26 @@ void clk_put(struct clk *clk)
 }
 EXPORT_SYMBOL(clk_put);
 
-void clkdev_add(struct clk_lookup *cl)
+static void __clkdev_add(struct clk_lookup *cl)
 {
 	mutex_lock(&clocks_mutex);
 	list_add_tail(&cl->node, &clocks);
 	mutex_unlock(&clocks_mutex);
 }
+
+void clkdev_add(struct clk_lookup *cl)
+{
+	if (!cl->clk_hw)
+		cl->clk_hw = __clk_get_hw(cl->clk);
+	__clkdev_add(cl);
+}
 EXPORT_SYMBOL(clkdev_add);
 
 void __init clkdev_add_table(struct clk_lookup *cl, size_t num)
 {
 	mutex_lock(&clocks_mutex);
 	while (num--) {
+		cl->clk_hw = __clk_get_hw(cl->clk);
 		list_add_tail(&cl->node, &clocks);
 		cl++;
 	}
@@ -243,7 +251,7 @@ struct clk_lookup_alloc {
 };
 
 static struct clk_lookup * __init_refok
-vclkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt,
+vclkdev_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt,
 	va_list ap)
 {
 	struct clk_lookup_alloc *cla;
@@ -252,7 +260,7 @@ vclkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt,
 	if (!cla)
 		return NULL;
 
-	cla->cl.clk = clk;
+	cla->cl.clk_hw = hw;
 	if (con_id) {
 		strlcpy(cla->con_id, con_id, sizeof(cla->con_id));
 		cla->cl.con_id = cla->con_id;
@@ -273,7 +281,7 @@ clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...)
 	va_list ap;
 
 	va_start(ap, dev_fmt);
-	cl = vclkdev_alloc(clk, con_id, dev_fmt, ap);
+	cl = vclkdev_alloc(__clk_get_hw(clk), con_id, dev_fmt, ap);
 	va_end(ap);
 
 	return cl;
@@ -334,7 +342,7 @@ int clk_register_clkdev(struct clk *clk, const char *con_id,
 		return PTR_ERR(clk);
 
 	va_start(ap, dev_fmt);
-	cl = vclkdev_alloc(clk, con_id, dev_fmt, ap);
+	cl = vclkdev_alloc(__clk_get_hw(clk), con_id, dev_fmt, ap);
 	va_end(ap);
 
 	if (!cl)
@@ -365,8 +373,8 @@ int clk_register_clkdevs(struct clk *clk, struct clk_lookup *cl, size_t num)
 		return PTR_ERR(clk);
 
 	for (i = 0; i < num; i++, cl++) {
-		cl->clk = clk;
-		clkdev_add(cl);
+		cl->clk_hw = __clk_get_hw(clk);
+		__clkdev_add(cl);
 	}
 
 	return 0;
diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index 94bad77eeb4a..3003afad46c9 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -22,6 +22,7 @@ struct clk_lookup {
 	const char		*dev_id;
 	const char		*con_id;
 	struct clk		*clk;
+	struct clk_hw		*clk_hw;
 };
 
 #define CLKDEV_INIT(d, n, c)	\
-- 
cgit v1.2.3


From d2d14a77886485310ec66e575f00ea5232ac7a14 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Sat, 14 Mar 2015 15:12:35 +0000
Subject: clk: update clk API documentation to clarify clk_round_rate()

The idea is that rate = clk_round_rate(clk, r) is equivalent to:

	clk_set_rate(clk, r);
	rate = clk_get_rate(clk);

except that clk_round_rate() does not change the hardware in any way.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 include/linux/clk.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 68c16a6bedb3..cafb22df8d00 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -306,6 +306,20 @@ void devm_clk_put(struct device *dev, struct clk *clk);
  * @clk: clock source
  * @rate: desired clock rate in Hz
  *
+ * This answers the question "if I were to pass @rate to clk_set_rate(),
+ * what clock rate would I end up with?" without changing the hardware
+ * in any way.  In other words:
+ *
+ *   rate = clk_round_rate(clk, r);
+ *
+ * and:
+ *
+ *   clk_set_rate(clk, r);
+ *   rate = clk_get_rate(clk);
+ *
+ * are equivalent except the former does not modify the clock hardware
+ * in any way.
+ *
  * Returns rounded clock rate in Hz, or negative errno.
  */
 long clk_round_rate(struct clk *clk, unsigned long rate);
-- 
cgit v1.2.3


From 2d34e507293102f29ee94d9a9c5b890696d42452 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Mon, 9 Mar 2015 11:03:00 +0000
Subject: clkdev: get rid of redundant clk_add_alias() prototype in linux/clk.h

clk_add_alias() is provided by clkdev, and is not part of the clk API.
Howver, it is prototyped in two locations: linux/clkdev.h and
linux/clk.h.  This is a mess.  Get rid of the redundant and unnecessary
version in linux/clk.h.

Acked-by: Tony Lindgren <tony@atomide.com>
Tested-by: Robert Jarzmik <robert.jarzmik@free.fr>
Acked-by: Sekhar Nori <nsekhar@ti.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mach-davinci/da850.c        |  1 +
 arch/arm/mach-omap1/board-nokia770.c |  2 +-
 arch/arm/mach-pxa/eseries.c          |  1 +
 arch/arm/mach-pxa/lubbock.c          |  1 +
 arch/arm/mach-pxa/tosa.c             |  1 +
 include/linux/clk.h                  | 13 -------------
 6 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-davinci/da850.c b/arch/arm/mach-davinci/da850.c
index 45ce065e7170..3b8740c083c4 100644
--- a/arch/arm/mach-davinci/da850.c
+++ b/arch/arm/mach-davinci/da850.c
@@ -11,6 +11,7 @@
  * is licensed "as is" without any warranty of any kind, whether express
  * or implied.
  */
+#include <linux/clkdev.h>
 #include <linux/gpio.h>
 #include <linux/init.h>
 #include <linux/clk.h>
diff --git a/arch/arm/mach-omap1/board-nokia770.c b/arch/arm/mach-omap1/board-nokia770.c
index 85089d821982..3bc59390a943 100644
--- a/arch/arm/mach-omap1/board-nokia770.c
+++ b/arch/arm/mach-omap1/board-nokia770.c
@@ -7,6 +7,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <linux/clkdev.h>
 #include <linux/irq.h>
 #include <linux/gpio.h>
 #include <linux/kernel.h>
@@ -14,7 +15,6 @@
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
 #include <linux/input.h>
-#include <linux/clk.h>
 #include <linux/omapfb.h>
 
 #include <linux/spi/spi.h>
diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c
index cfb864173ce3..4427bf26ea47 100644
--- a/arch/arm/mach-pxa/eseries.c
+++ b/arch/arm/mach-pxa/eseries.c
@@ -10,6 +10,7 @@
  *
  */
 
+#include <linux/clkdev.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/gpio.h>
diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c
index d8a1be619f21..235f2d9318c1 100644
--- a/arch/arm/mach-pxa/lubbock.c
+++ b/arch/arm/mach-pxa/lubbock.c
@@ -11,6 +11,7 @@
  *  it under the terms of the GNU General Public License version 2 as
  *  published by the Free Software Foundation.
  */
+#include <linux/clkdev.h>
 #include <linux/gpio.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index 7780d1faa06f..92e56d8a24d8 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -12,6 +12,7 @@
  *
  */
 
+#include <linux/clkdev.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
diff --git a/include/linux/clk.h b/include/linux/clk.h
index cafb22df8d00..0df4a51e1a78 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -485,19 +485,6 @@ static inline void clk_disable_unprepare(struct clk *clk)
 	clk_unprepare(clk);
 }
 
-/**
- * clk_add_alias - add a new clock alias
- * @alias: name for clock alias
- * @alias_dev_name: device name
- * @id: platform specific clock name
- * @dev: device
- *
- * Allows using generic clock names for drivers by adding a new alias.
- * Assumes clkdev, see clkdev.h for more info.
- */
-int clk_add_alias(const char *alias, const char *alias_dev_name, char *id,
-			struct device *dev);
-
 struct device_node;
 struct of_phandle_args;
 
-- 
cgit v1.2.3


From b3d8d7e89fab374d731dfb46fe048f09766ca9c8 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Mon, 9 Mar 2015 10:43:04 +0000
Subject: clkdev: const-ify connection id to clk_add_alias()

The connection id is only passed to clk_get() which is already const.
Const-ify this argument too.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/clk/clkdev.c   | 6 +++---
 include/linux/clkdev.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c
index 04b59ad6d852..e112e13af760 100644
--- a/drivers/clk/clkdev.c
+++ b/drivers/clk/clkdev.c
@@ -288,10 +288,10 @@ clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...)
 }
 EXPORT_SYMBOL(clkdev_alloc);
 
-int clk_add_alias(const char *alias, const char *alias_dev_name, char *id,
-	struct device *dev)
+int clk_add_alias(const char *alias, const char *alias_dev_name,
+	const char *con_id, struct device *dev)
 {
-	struct clk *r = clk_get(dev, id);
+	struct clk *r = clk_get(dev, con_id);
 	struct clk_lookup *l;
 
 	if (IS_ERR(r))
diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index 3003afad46c9..cd93b215e3af 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -39,7 +39,7 @@ void clkdev_add(struct clk_lookup *cl);
 void clkdev_drop(struct clk_lookup *cl);
 
 void clkdev_add_table(struct clk_lookup *, size_t);
-int clk_add_alias(const char *, const char *, char *, struct device *);
+int clk_add_alias(const char *, const char *, const char *, struct device *);
 
 int clk_register_clkdev(struct clk *, const char *, const char *, ...);
 int clk_register_clkdevs(struct clk *, struct clk_lookup *, size_t);
-- 
cgit v1.2.3


From 2568999835d7797afce3dcc3a3f368051ffcaf1f Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Mon, 2 Mar 2015 15:40:29 +0000
Subject: clkdev: add clkdev_create() helper

Add a helper to allocate and add a clk_lookup structure.  This can not
only be used in several places in clkdev.c to simplify the code, but
more importantly, can be used by callers of the clkdev code to simplify
their clkdev creation and registration.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/clk/clkdev.c   | 53 ++++++++++++++++++++++++++++++++++++++------------
 include/linux/clkdev.h |  3 +++
 2 files changed, 44 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clkdev.c b/drivers/clk/clkdev.c
index e112e13af760..c0eaf0973bd2 100644
--- a/drivers/clk/clkdev.c
+++ b/drivers/clk/clkdev.c
@@ -274,6 +274,19 @@ vclkdev_alloc(struct clk_hw *hw, const char *con_id, const char *dev_fmt,
 	return &cla->cl;
 }
 
+static struct clk_lookup *
+vclkdev_create(struct clk_hw *hw, const char *con_id, const char *dev_fmt,
+	va_list ap)
+{
+	struct clk_lookup *cl;
+
+	cl = vclkdev_alloc(hw, con_id, dev_fmt, ap);
+	if (cl)
+		__clkdev_add(cl);
+
+	return cl;
+}
+
 struct clk_lookup * __init_refok
 clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...)
 {
@@ -288,6 +301,29 @@ clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...)
 }
 EXPORT_SYMBOL(clkdev_alloc);
 
+/**
+ * clkdev_create - allocate and add a clkdev lookup structure
+ * @clk: struct clk to associate with all clk_lookups
+ * @con_id: connection ID string on device
+ * @dev_fmt: format string describing device name
+ *
+ * Returns a clk_lookup structure, which can be later unregistered and
+ * freed.
+ */
+struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id,
+	const char *dev_fmt, ...)
+{
+	struct clk_lookup *cl;
+	va_list ap;
+
+	va_start(ap, dev_fmt);
+	cl = vclkdev_create(__clk_get_hw(clk), con_id, dev_fmt, ap);
+	va_end(ap);
+
+	return cl;
+}
+EXPORT_SYMBOL_GPL(clkdev_create);
+
 int clk_add_alias(const char *alias, const char *alias_dev_name,
 	const char *con_id, struct device *dev)
 {
@@ -297,12 +333,10 @@ int clk_add_alias(const char *alias, const char *alias_dev_name,
 	if (IS_ERR(r))
 		return PTR_ERR(r);
 
-	l = clkdev_alloc(r, alias, alias_dev_name);
+	l = clkdev_create(r, alias, "%s", alias_dev_name);
 	clk_put(r);
-	if (!l)
-		return -ENODEV;
-	clkdev_add(l);
-	return 0;
+
+	return l ? 0 : -ENODEV;
 }
 EXPORT_SYMBOL(clk_add_alias);
 
@@ -342,15 +376,10 @@ int clk_register_clkdev(struct clk *clk, const char *con_id,
 		return PTR_ERR(clk);
 
 	va_start(ap, dev_fmt);
-	cl = vclkdev_alloc(__clk_get_hw(clk), con_id, dev_fmt, ap);
+	cl = vclkdev_create(__clk_get_hw(clk), con_id, dev_fmt, ap);
 	va_end(ap);
 
-	if (!cl)
-		return -ENOMEM;
-
-	clkdev_add(cl);
-
-	return 0;
+	return cl ? 0 : -ENOMEM;
 }
 EXPORT_SYMBOL(clk_register_clkdev);
 
diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index cd93b215e3af..a240b18e86fa 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -38,6 +38,9 @@ struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id,
 void clkdev_add(struct clk_lookup *cl);
 void clkdev_drop(struct clk_lookup *cl);
 
+struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id,
+	const char *dev_fmt, ...);
+
 void clkdev_add_table(struct clk_lookup *, size_t);
 int clk_add_alias(const char *, const char *, const char *, struct device *);
 
-- 
cgit v1.2.3


From efb0de55b6a2ec15fc424e660601f22ae2fa487a Mon Sep 17 00:00:00 2001
From: Shobhit Kumar <shobhit.kumar@intel.com>
Date: Tue, 5 May 2015 15:04:18 +0530
Subject: pwm: Add support to remove registered consumer lookup tables

In case some drivers are unloading, they can remove lookup tables which
they had registered during their load time to avoid redundant entries if
loaded again.

CC: Samuel Ortiz <sameo@linux.intel.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Alexandre Courbot <gnurou@gmail.com>
Cc: Thierry Reding <thierry.reding@gmail.com>
Signed-off-by: Shobhit Kumar <shobhit.kumar@intel.com>
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 17 +++++++++++++++++
 include/linux/pwm.h |  5 +++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index ba34c7d89042..27cd58d16881 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -585,6 +585,23 @@ void pwm_add_table(struct pwm_lookup *table, size_t num)
 	mutex_unlock(&pwm_lookup_lock);
 }
 
+/**
+ * pwm_remove_table() - unregister PWM device consumers
+ * @table: array of consumers to unregister
+ * @num: number of consumers in table
+ */
+void pwm_remove_table(struct pwm_lookup *table, size_t num)
+{
+	mutex_lock(&pwm_lookup_lock);
+
+	while (num--) {
+		list_del(&table->list);
+		table++;
+	}
+
+	mutex_unlock(&pwm_lookup_lock);
+}
+
 /**
  * pwm_get() - look up and request a PWM device
  * @dev: device for PWM consumer
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index e90628cac8fa..cfe2d8df5be0 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -290,10 +290,15 @@ struct pwm_lookup {
 
 #if IS_ENABLED(CONFIG_PWM)
 void pwm_add_table(struct pwm_lookup *table, size_t num);
+void pwm_remove_table(struct pwm_lookup *table, size_t num);
 #else
 static inline void pwm_add_table(struct pwm_lookup *table, size_t num)
 {
 }
+
+static inline void pwm_remove_table(struct pwm_lookup *table, size_t num)
+{
+}
 #endif
 
 #ifdef CONFIG_PWM_SYSFS
-- 
cgit v1.2.3


From fa76a3db7093a527333c380df82a0f158d9b8299 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 9 Apr 2015 11:13:07 +0800
Subject: pinctrl: allow exlusive GPIO/mux pin allocation

Disallow simultaneous use of the the GPIO and peripheral mux
functions by setting a flag "strict" in struct pinctrl_desc.

The blackfin pinmux and gpio controller doesn't allow user to
set up a pin for both GPIO and peripheral function. So, add flag
strict in struct pinctrl_desc to check both gpio_owner and
mux_owner before approving the pin request.

v2-changes:
- if strict flag is set, check gpio_owner and mux_onwer in if and
  else clause

v3-changes:
- add kerneldoc for this struct
- augment Documentation/pinctrl.txt

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/pinctrl.txt       | 11 +++++++++++
 drivers/pinctrl/pinctrl-adi2.c  |  1 +
 drivers/pinctrl/pinmux.c        | 13 +++++++++++++
 include/linux/pinctrl/pinctrl.h |  3 +++
 4 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt
index a9b47163bb5d..d6b2bed94c43 100644
--- a/Documentation/pinctrl.txt
+++ b/Documentation/pinctrl.txt
@@ -73,6 +73,7 @@ static struct pinctrl_desc foo_desc = {
 	.pins = foo_pins,
 	.npins = ARRAY_SIZE(foo_pins),
 	.owner = THIS_MODULE,
+	.strict = true,
 };
 
 int __init foo_probe(void)
@@ -830,6 +831,11 @@ separate memory range only intended for GPIO driving, and the register
 range dealing with pin config and pin multiplexing get placed into a
 different memory range and a separate section of the data sheet.
 
+A flag "strict" in struct pinctrl_desc is available to check and deny
+simultaneous access to the same pin from GPIO and pin multiplexing
+consumers on hardware of this type. The pinctrl driver should set this flag
+accordingly.
+
 (B)
 
                        pin config
@@ -850,6 +856,11 @@ possible that the GPIO, pin config and pin multiplex registers are placed into
 the same memory range and the same section of the data sheet, although that
 need not be the case.
 
+In some pin controllers, although the physical pins are designed in the same
+way as (B), the GPIO function still can't be enabled at the same time as the
+peripheral functions. So again the "strict" flag should be set, denying
+simultaneous activation by GPIO and other muxed in devices.
+
 From a kernel point of view, however, these are different aspects of the
 hardware and shall be put into different subsystems:
 
diff --git a/drivers/pinctrl/pinctrl-adi2.c b/drivers/pinctrl/pinctrl-adi2.c
index 8434439c5017..fbd492668da1 100644
--- a/drivers/pinctrl/pinctrl-adi2.c
+++ b/drivers/pinctrl/pinctrl-adi2.c
@@ -710,6 +710,7 @@ static struct pinctrl_desc adi_pinmux_desc = {
 	.name = DRIVER_NAME,
 	.pctlops = &adi_pctrl_ops,
 	.pmxops = &adi_pinmux_ops,
+	.strict = true,
 	.owner = THIS_MODULE,
 };
 
diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c
index b874458dcb88..2546fa783464 100644
--- a/drivers/pinctrl/pinmux.c
+++ b/drivers/pinctrl/pinmux.c
@@ -107,6 +107,13 @@ static int pin_request(struct pinctrl_dev *pctldev,
 				desc->name, desc->gpio_owner, owner);
 			goto out;
 		}
+		if (pctldev->desc->strict && desc->mux_usecount &&
+		    strcmp(desc->mux_owner, owner)) {
+			dev_err(pctldev->dev,
+				"pin %s already requested by %s; cannot claim for %s\n",
+				desc->name, desc->mux_owner, owner);
+			goto out;
+		}
 
 		desc->gpio_owner = owner;
 	} else {
@@ -116,6 +123,12 @@ static int pin_request(struct pinctrl_dev *pctldev,
 				desc->name, desc->mux_owner, owner);
 			goto out;
 		}
+		if (pctldev->desc->strict && desc->gpio_owner) {
+			dev_err(pctldev->dev,
+				"pin %s already requested by %s; cannot claim for %s\n",
+				desc->name, desc->gpio_owner, owner);
+			goto out;
+		}
 
 		desc->mux_usecount++;
 		if (desc->mux_usecount > 1)
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 66e4697516de..fc6b0348c375 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -114,6 +114,8 @@ struct pinctrl_ops {
  *	of the pins field above
  * @pctlops: pin control operation vtable, to support global concepts like
  *	grouping of pins, this is optional.
+ * @strict: check both gpio_owner and mux_owner strictly before approving
+	the pin request
  * @pmxops: pinmux operations vtable, if you support pinmuxing in your driver
  * @confops: pin config operations vtable, if you support pin configuration in
  *	your driver
@@ -132,6 +134,7 @@ struct pinctrl_desc {
 	const struct pinctrl_ops *pctlops;
 	const struct pinmux_ops *pmxops;
 	const struct pinconf_ops *confops;
+	bool strict;
 	struct module *owner;
 #ifdef CONFIG_GENERIC_PINCONF
 	unsigned int num_custom_params;
-- 
cgit v1.2.3


From 8c4c2016345feefcd289ce2479eb70286d30825a Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 6 May 2015 14:19:13 +0200
Subject: pinctrl: move strict option to pinmux_ops

While the pinmux_ops are ideally just a vtable for pin mux
calls, the "strict" setting belongs so intuitively with the
pin multiplexing that we should move it here anyway. Putting
it in the top pinctrl_desc makes no sense.

Cc: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/pinctrl.txt       | 2 +-
 drivers/pinctrl/pinctrl-adi2.c  | 2 +-
 drivers/pinctrl/pinmux.c        | 4 ++--
 include/linux/pinctrl/pinctrl.h | 3 ---
 include/linux/pinctrl/pinmux.h  | 4 ++++
 5 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt
index d6b2bed94c43..4976389e432d 100644
--- a/Documentation/pinctrl.txt
+++ b/Documentation/pinctrl.txt
@@ -73,7 +73,6 @@ static struct pinctrl_desc foo_desc = {
 	.pins = foo_pins,
 	.npins = ARRAY_SIZE(foo_pins),
 	.owner = THIS_MODULE,
-	.strict = true,
 };
 
 int __init foo_probe(void)
@@ -715,6 +714,7 @@ static struct pinmux_ops foo_pmxops = {
 	.get_function_name = foo_get_fname,
 	.get_function_groups = foo_get_groups,
 	.set_mux = foo_set_mux,
+	.strict = true,
 };
 
 /* Pinmux operations are handled by some pin controller */
diff --git a/drivers/pinctrl/pinctrl-adi2.c b/drivers/pinctrl/pinctrl-adi2.c
index fbd492668da1..49df9037b41e 100644
--- a/drivers/pinctrl/pinctrl-adi2.c
+++ b/drivers/pinctrl/pinctrl-adi2.c
@@ -703,6 +703,7 @@ static struct pinmux_ops adi_pinmux_ops = {
 	.get_function_name = adi_pinmux_get_func_name,
 	.get_function_groups = adi_pinmux_get_groups,
 	.gpio_request_enable = adi_pinmux_request_gpio,
+	.strict = true,
 };
 
 
@@ -710,7 +711,6 @@ static struct pinctrl_desc adi_pinmux_desc = {
 	.name = DRIVER_NAME,
 	.pctlops = &adi_pctrl_ops,
 	.pmxops = &adi_pinmux_ops,
-	.strict = true,
 	.owner = THIS_MODULE,
 };
 
diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c
index 2546fa783464..c58c168b06c2 100644
--- a/drivers/pinctrl/pinmux.c
+++ b/drivers/pinctrl/pinmux.c
@@ -107,7 +107,7 @@ static int pin_request(struct pinctrl_dev *pctldev,
 				desc->name, desc->gpio_owner, owner);
 			goto out;
 		}
-		if (pctldev->desc->strict && desc->mux_usecount &&
+		if (ops->strict && desc->mux_usecount &&
 		    strcmp(desc->mux_owner, owner)) {
 			dev_err(pctldev->dev,
 				"pin %s already requested by %s; cannot claim for %s\n",
@@ -123,7 +123,7 @@ static int pin_request(struct pinctrl_dev *pctldev,
 				desc->name, desc->mux_owner, owner);
 			goto out;
 		}
-		if (pctldev->desc->strict && desc->gpio_owner) {
+		if (ops->strict && desc->gpio_owner) {
 			dev_err(pctldev->dev,
 				"pin %s already requested by %s; cannot claim for %s\n",
 				desc->name, desc->gpio_owner, owner);
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index fc6b0348c375..66e4697516de 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -114,8 +114,6 @@ struct pinctrl_ops {
  *	of the pins field above
  * @pctlops: pin control operation vtable, to support global concepts like
  *	grouping of pins, this is optional.
- * @strict: check both gpio_owner and mux_owner strictly before approving
-	the pin request
  * @pmxops: pinmux operations vtable, if you support pinmuxing in your driver
  * @confops: pin config operations vtable, if you support pin configuration in
  *	your driver
@@ -134,7 +132,6 @@ struct pinctrl_desc {
 	const struct pinctrl_ops *pctlops;
 	const struct pinmux_ops *pmxops;
 	const struct pinconf_ops *confops;
-	bool strict;
 	struct module *owner;
 #ifdef CONFIG_GENERIC_PINCONF
 	unsigned int num_custom_params;
diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h
index 511bda9ed4bf..d3740fa7073f 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -56,6 +56,9 @@ struct pinctrl_dev;
  *	depending on whether the GPIO is configured as input or output,
  *	a direction selector function may be implemented as a backing
  *	to the GPIO controllers that need pin muxing.
+ * @strict: do not allow simultaneous use of the same pin for GPIO and another
+ *	function. Check both gpio_owner and mux_owner strictly before approving
+ *	the pin request.
  */
 struct pinmux_ops {
 	int (*request) (struct pinctrl_dev *pctldev, unsigned offset);
@@ -79,6 +82,7 @@ struct pinmux_ops {
 				   struct pinctrl_gpio_range *range,
 				   unsigned offset,
 				   bool input);
+	bool strict;
 };
 
 #endif /* CONFIG_PINMUX */
-- 
cgit v1.2.3


From cac089f9026e9ddb3481daf08f0fc4e5949fa1af Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 23 Apr 2015 16:56:22 -0700
Subject: gpio: omap: Allow building as a loadable module

We currently get all kinds of errors building the omap gpio driver
as a module starting with:

undefined reference to `omap2_gpio_resume_after_idle'
undefined reference to `omap2_gpio_prepare_for_idle'
...

Let's fix the issue by adding inline functions to the header.
Note that we can now also remove the two unused functions for
omap_set_gpio_debounce and omap_set_gpio_debounce_time.

Then doing rmmod on the module produces further warnings
because of missing exit related functions. Let's add those.

And finally, we can make the Kconfig entry just a tristate
option that's selected for omaps.

Cc: Javier Martinez Canillas <javier@dowhile0.org>
Cc: Kevin Hilman <khilman@deeprootsystems.com>
Cc: Nishanth Menon <nm@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@linaro.org>
Acked-by: Santosh Shilimkar <ssantosh@kernel.org>
Reviewed-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/Kconfig                    |  2 +-
 drivers/gpio/gpio-omap.c                | 24 ++++++++++++++++++++++++
 include/linux/platform_data/gpio-omap.h | 12 ++++++++++--
 3 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index c2211258231b..4c4dba075277 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -316,7 +316,7 @@ config GPIO_OCTEON
 	  family of SOCs.
 
 config GPIO_OMAP
-	bool "TI OMAP GPIO support" if COMPILE_TEST && !ARCH_OMAP2PLUS
+	tristate "TI OMAP GPIO support" if ARCH_OMAP2PLUS || COMPILE_TEST
 	default y if ARCH_OMAP
 	depends on ARM
 	select GENERIC_IRQ_CHIP
diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c
index cd1d5bf48f36..38c268fb7ba8 100644
--- a/drivers/gpio/gpio-omap.c
+++ b/drivers/gpio/gpio-omap.c
@@ -1263,6 +1263,17 @@ static int omap_gpio_probe(struct platform_device *pdev)
 	return 0;
 }
 
+static int omap_gpio_remove(struct platform_device *pdev)
+{
+	struct gpio_bank *bank = platform_get_drvdata(pdev);
+
+	list_del(&bank->node);
+	gpiochip_remove(&bank->chip);
+	pm_runtime_disable(bank->dev);
+
+	return 0;
+}
+
 #ifdef CONFIG_ARCH_OMAP2PLUS
 
 #if defined(CONFIG_PM)
@@ -1448,6 +1459,7 @@ static int omap_gpio_runtime_resume(struct device *dev)
 }
 #endif /* CONFIG_PM */
 
+#if IS_BUILTIN(CONFIG_GPIO_OMAP)
 void omap2_gpio_prepare_for_idle(int pwr_mode)
 {
 	struct gpio_bank *bank;
@@ -1473,6 +1485,7 @@ void omap2_gpio_resume_after_idle(void)
 		pm_runtime_get_sync(bank->dev);
 	}
 }
+#endif
 
 #if defined(CONFIG_PM)
 static void omap_gpio_init_context(struct gpio_bank *p)
@@ -1628,6 +1641,7 @@ MODULE_DEVICE_TABLE(of, omap_gpio_match);
 
 static struct platform_driver omap_gpio_driver = {
 	.probe		= omap_gpio_probe,
+	.remove		= omap_gpio_remove,
 	.driver		= {
 		.name	= "omap_gpio",
 		.pm	= &gpio_pm_ops,
@@ -1645,3 +1659,13 @@ static int __init omap_gpio_drv_reg(void)
 	return platform_driver_register(&omap_gpio_driver);
 }
 postcore_initcall(omap_gpio_drv_reg);
+
+static void __exit omap_gpio_exit(void)
+{
+	platform_driver_unregister(&omap_gpio_driver);
+}
+module_exit(omap_gpio_exit);
+
+MODULE_DESCRIPTION("omap gpio driver");
+MODULE_ALIAS("platform:gpio-omap");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/platform_data/gpio-omap.h b/include/linux/platform_data/gpio-omap.h
index 5d50b25a73d7..cb2618147c34 100644
--- a/include/linux/platform_data/gpio-omap.h
+++ b/include/linux/platform_data/gpio-omap.h
@@ -208,9 +208,17 @@ struct omap_gpio_platform_data {
 	int (*get_context_loss_count)(struct device *dev);
 };
 
+#if IS_BUILTIN(CONFIG_GPIO_OMAP)
 extern void omap2_gpio_prepare_for_idle(int off_mode);
 extern void omap2_gpio_resume_after_idle(void);
-extern void omap_set_gpio_debounce(int gpio, int enable);
-extern void omap_set_gpio_debounce_time(int gpio, int enable);
+#else
+static inline void omap2_gpio_prepare_for_idle(int off_mode)
+{
+}
+
+static inline void omap2_gpio_resume_after_idle(void)
+{
+}
+#endif
 
 #endif
-- 
cgit v1.2.3


From 66eb3bd857f5311f72c7c371f78ddc9c472befba Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Mon, 27 Apr 2015 18:04:05 +0200
Subject: pinctrl: use ERR_CAST instead of ERR_PTR/PTR_ERR

Inspired by scripts/coccinelle/api/err_cast.cocci

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 18eccefea06e..d7e5d608faa7 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -142,7 +142,7 @@ static inline struct pinctrl * __must_check pinctrl_get_select(
 	s = pinctrl_lookup_state(p, name);
 	if (IS_ERR(s)) {
 		pinctrl_put(p);
-		return ERR_PTR(PTR_ERR(s));
+		return ERR_CAST(s);
 	}
 
 	ret = pinctrl_select_state(p, s);
-- 
cgit v1.2.3


From 1d6b98774cff82860a3f044610e956bcbff556c1 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Tue, 31 Mar 2015 15:55:57 +0200
Subject: tty: constify return type of tty_name

All users of tty_name pass the result directly to a printf-like
function. This means we can actually let tty_name return the literal
"NULL tty" or tty->name directly, avoiding the strcpy and a lot of
medium-sized stack buffers. In preparation for that, make the return
type const char*.

While at it, we can also constify the tty parameter.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_io.c | 2 +-
 include/linux/tty.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index e5695467598f..5a49a4a80ebd 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -243,7 +243,7 @@ static void tty_del_file(struct file *file)
  *	Locking: none
  */
 
-char *tty_name(struct tty_struct *tty, char *buf)
+const char *tty_name(const struct tty_struct *tty, char *buf)
 {
 	if (!tty) /* Hmm.  NULL pointer.  That's fun. */
 		strcpy(buf, "NULL tty");
diff --git a/include/linux/tty.h b/include/linux/tty.h
index fe5623c9af71..4cbecfc7b3c9 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -421,7 +421,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 
 extern int tty_paranoia_check(struct tty_struct *tty, struct inode *inode,
 			      const char *routine);
-extern char *tty_name(struct tty_struct *tty, char *buf);
+extern const char *tty_name(const struct tty_struct *tty, char *buf);
 extern void tty_wait_until_sent(struct tty_struct *tty, long timeout);
 extern int tty_check_change(struct tty_struct *tty);
 extern void __stop_tty(struct tty_struct *tty);
-- 
cgit v1.2.3


From 429b474990cb4e5e8cfe2352daf649d0599cccb6 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Tue, 31 Mar 2015 15:55:59 +0200
Subject: tty: remove buf parameter from tty_name()

tty_name no longer uses the buf parameter, so remove it along with all
the 64 byte stack buffers that used to be passed in.

Mostly generated by the coccinelle script

@depends on patch@
identifier buf;
constant C;
expression tty;
@@
- char buf[C];
  <+...
- tty_name(tty, buf)
+ tty_name(tty)
  ...+>

allmodconfig compiles, so I'm fairly confident the stack buffers
weren't used for other purposes as well.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Acked-by: Jesper Nilsson <jesper.nilsson@axis.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/input/serio/serport.c    |  5 ++---
 drivers/tty/amiserial.c          |  8 ++------
 drivers/tty/cyclades.c           |  8 ++------
 drivers/tty/n_gsm.c              |  3 +--
 drivers/tty/n_tty.c              |  7 ++-----
 drivers/tty/serial/crisv10.c     |  8 ++------
 drivers/tty/serial/serial_core.c |  4 +---
 drivers/tty/tty_io.c             | 28 +++++++++++-----------------
 drivers/tty/tty_ioctl.c          |  4 +---
 drivers/tty/tty_ldisc.c          |  8 +++-----
 include/linux/tty.h              |  2 +-
 11 files changed, 28 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index 69175b825346..9c927d35c1f5 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -167,7 +167,6 @@ static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, u
 {
 	struct serport *serport = (struct serport*) tty->disc_data;
 	struct serio *serio;
-	char name[64];
 
 	if (test_and_set_bit(SERPORT_BUSY, &serport->flags))
 		return -EBUSY;
@@ -177,7 +176,7 @@ static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, u
 		return -ENOMEM;
 
 	strlcpy(serio->name, "Serial port", sizeof(serio->name));
-	snprintf(serio->phys, sizeof(serio->phys), "%s/serio0", tty_name(tty, name));
+	snprintf(serio->phys, sizeof(serio->phys), "%s/serio0", tty_name(tty));
 	serio->id = serport->id;
 	serio->id.type = SERIO_RS232;
 	serio->write = serport_serio_write;
@@ -187,7 +186,7 @@ static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, u
 	serio->dev.parent = tty->dev;
 
 	serio_register_port(serport->serio);
-	printk(KERN_INFO "serio: Serial port %s\n", tty_name(tty, name));
+	printk(KERN_INFO "serio: Serial port %s\n", tty_name(tty));
 
 	wait_event_interruptible(serport->wait, test_bit(SERPORT_DEAD, &serport->flags));
 	serio_unregister_port(serport->serio);
diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c
index b2d760055952..894d3a84e285 100644
--- a/drivers/tty/amiserial.c
+++ b/drivers/tty/amiserial.c
@@ -966,9 +966,7 @@ static void rs_throttle(struct tty_struct * tty)
 	struct serial_state *info = tty->driver_data;
 	unsigned long flags;
 #ifdef SERIAL_DEBUG_THROTTLE
-	char	buf[64];
-
-	printk("throttle %s: %d....\n", tty_name(tty, buf),
+	printk("throttle %s: %d....\n", tty_name(tty),
 	       tty->ldisc.chars_in_buffer(tty));
 #endif
 
@@ -991,9 +989,7 @@ static void rs_unthrottle(struct tty_struct * tty)
 	struct serial_state *info = tty->driver_data;
 	unsigned long flags;
 #ifdef SERIAL_DEBUG_THROTTLE
-	char	buf[64];
-
-	printk("unthrottle %s: %d....\n", tty_name(tty, buf),
+	printk("unthrottle %s: %d....\n", tty_name(tty),
 	       tty->ldisc.chars_in_buffer(tty));
 #endif
 
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index fd66f57390d0..87f6578c6f4a 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -2861,9 +2861,7 @@ static void cy_throttle(struct tty_struct *tty)
 	unsigned long flags;
 
 #ifdef CY_DEBUG_THROTTLE
-	char buf[64];
-
-	printk(KERN_DEBUG "cyc:throttle %s: %ld...ttyC%d\n", tty_name(tty, buf),
+	printk(KERN_DEBUG "cyc:throttle %s: %ld...ttyC%d\n", tty_name(tty),
 			tty->ldisc.chars_in_buffer(tty), info->line);
 #endif
 
@@ -2902,10 +2900,8 @@ static void cy_unthrottle(struct tty_struct *tty)
 	unsigned long flags;
 
 #ifdef CY_DEBUG_THROTTLE
-	char buf[64];
-
 	printk(KERN_DEBUG "cyc:unthrottle %s: %ld...ttyC%d\n",
-		tty_name(tty, buf), tty_chars_in_buffer(tty), info->line);
+		tty_name(tty), tty_chars_in_buffer(tty), info->line);
 #endif
 
 	if (serial_paranoia_check(info, tty->name, "cy_unthrottle"))
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 91abc00aa833..7e039669fc82 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2274,7 +2274,6 @@ static void gsmld_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 	const unsigned char *dp;
 	char *f;
 	int i;
-	char buf[64];
 	char flags = TTY_NORMAL;
 
 	if (debug & 4)
@@ -2296,7 +2295,7 @@ static void gsmld_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 			break;
 		default:
 			WARN_ONCE(1, "%s: unknown flag %d\n",
-			       tty_name(tty, buf), flags);
+			       tty_name(tty), flags);
 			break;
 		}
 	}
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index cf6e0f2e1331..54da8f49394d 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1179,13 +1179,12 @@ static void n_tty_receive_break(struct tty_struct *tty)
 static void n_tty_receive_overrun(struct tty_struct *tty)
 {
 	struct n_tty_data *ldata = tty->disc_data;
-	char buf[64];
 
 	ldata->num_overrun++;
 	if (time_after(jiffies, ldata->overrun_time + HZ) ||
 			time_after(ldata->overrun_time, jiffies)) {
 		printk(KERN_WARNING "%s: %d input overrun(s)\n",
-			tty_name(tty, buf),
+			tty_name(tty),
 			ldata->num_overrun);
 		ldata->overrun_time = jiffies;
 		ldata->num_overrun = 0;
@@ -1460,8 +1459,6 @@ static void n_tty_receive_char_closing(struct tty_struct *tty, unsigned char c)
 static void
 n_tty_receive_char_flagged(struct tty_struct *tty, unsigned char c, char flag)
 {
-	char buf[64];
-
 	switch (flag) {
 	case TTY_BREAK:
 		n_tty_receive_break(tty);
@@ -1475,7 +1472,7 @@ n_tty_receive_char_flagged(struct tty_struct *tty, unsigned char c, char flag)
 		break;
 	default:
 		printk(KERN_ERR "%s: unknown flag %d\n",
-		       tty_name(tty, buf), flag);
+		       tty_name(tty), flag);
 		break;
 	}
 }
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index 0c1825b0b41d..568ea0d2d699 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -3216,9 +3216,7 @@ rs_throttle(struct tty_struct * tty)
 {
 	struct e100_serial *info = (struct e100_serial *)tty->driver_data;
 #ifdef SERIAL_DEBUG_THROTTLE
-	char	buf[64];
-
-	printk("throttle %s: %lu....\n", tty_name(tty, buf),
+	printk("throttle %s: %lu....\n", tty_name(tty),
 	       (unsigned long)tty->ldisc.chars_in_buffer(tty));
 #endif
 	DFLOW(DEBUG_LOG(info->line,"rs_throttle %lu\n", tty->ldisc.chars_in_buffer(tty)));
@@ -3238,9 +3236,7 @@ rs_unthrottle(struct tty_struct * tty)
 {
 	struct e100_serial *info = (struct e100_serial *)tty->driver_data;
 #ifdef SERIAL_DEBUG_THROTTLE
-	char	buf[64];
-
-	printk("unthrottle %s: %lu....\n", tty_name(tty, buf),
+	printk("unthrottle %s: %lu....\n", tty_name(tty),
 	       (unsigned long)tty->ldisc.chars_in_buffer(tty));
 #endif
 	DFLOW(DEBUG_LOG(info->line,"rs_unthrottle ldisc %d\n", tty->ldisc.chars_in_buffer(tty)));
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 0b7bb12dfc68..eec067d8eedb 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -894,12 +894,10 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
 			 * need to rate-limit; it's CAP_SYS_ADMIN only.
 			 */
 			if (uport->flags & UPF_SPD_MASK) {
-				char buf[64];
-
 				dev_notice(uport->dev,
 				       "%s sets custom speed on %s. This is deprecated.\n",
 				      current->comm,
-				      tty_name(port->tty, buf));
+				      tty_name(port->tty));
 			}
 			uart_change_speed(tty, state, NULL);
 		}
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 1eaf0fbd99e4..57fc6ee12332 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -235,7 +235,6 @@ static void tty_del_file(struct file *file)
 /**
  *	tty_name	-	return tty naming
  *	@tty: tty structure
- *	@buf: unused
  *
  *	Convert a tty structure into a name. The name reflects the kernel
  *	naming policy and if udev is in use may not reflect user space
@@ -243,7 +242,7 @@ static void tty_del_file(struct file *file)
  *	Locking: none
  */
 
-const char *tty_name(const struct tty_struct *tty, char *buf)
+const char *tty_name(const struct tty_struct *tty)
 {
 	if (!tty) /* Hmm.  NULL pointer.  That's fun. */
 		return "NULL tty";
@@ -768,8 +767,7 @@ static void do_tty_hangup(struct work_struct *work)
 void tty_hangup(struct tty_struct *tty)
 {
 #ifdef TTY_DEBUG_HANGUP
-	char	buf[64];
-	printk(KERN_DEBUG "%s hangup...\n", tty_name(tty, buf));
+	printk(KERN_DEBUG "%s hangup...\n", tty_name(tty));
 #endif
 	schedule_work(&tty->hangup_work);
 }
@@ -788,9 +786,7 @@ EXPORT_SYMBOL(tty_hangup);
 void tty_vhangup(struct tty_struct *tty)
 {
 #ifdef TTY_DEBUG_HANGUP
-	char	buf[64];
-
-	printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty, buf));
+	printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty));
 #endif
 	__tty_hangup(tty, 0);
 }
@@ -829,9 +825,7 @@ void tty_vhangup_self(void)
 static void tty_vhangup_session(struct tty_struct *tty)
 {
 #ifdef TTY_DEBUG_HANGUP
-	char	buf[64];
-
-	printk(KERN_DEBUG "%s vhangup session...\n", tty_name(tty, buf));
+	printk(KERN_DEBUG "%s vhangup session...\n", tty_name(tty));
 #endif
 	__tty_hangup(tty, 1);
 }
@@ -1767,7 +1761,6 @@ int tty_release(struct inode *inode, struct file *filp)
 	struct tty_struct *o_tty = NULL;
 	int	do_sleep, final;
 	int	idx;
-	char	buf[64];
 	long	timeout = 0;
 	int	once = 1;
 
@@ -1791,7 +1784,7 @@ int tty_release(struct inode *inode, struct file *filp)
 
 #ifdef TTY_DEBUG_HANGUP
 	printk(KERN_DEBUG "%s: %s (tty count=%d)...\n", __func__,
-			tty_name(tty, buf), tty->count);
+			tty_name(tty), tty->count);
 #endif
 
 	if (tty->ops->close)
@@ -1842,7 +1835,7 @@ int tty_release(struct inode *inode, struct file *filp)
 		if (once) {
 			once = 0;
 			printk(KERN_WARNING "%s: %s: read/write wait queue active!\n",
-			       __func__, tty_name(tty, buf));
+			       __func__, tty_name(tty));
 		}
 		schedule_timeout_killable(timeout);
 		if (timeout < 120 * HZ)
@@ -1854,13 +1847,13 @@ int tty_release(struct inode *inode, struct file *filp)
 	if (o_tty) {
 		if (--o_tty->count < 0) {
 			printk(KERN_WARNING "%s: bad pty slave count (%d) for %s\n",
-				__func__, o_tty->count, tty_name(o_tty, buf));
+				__func__, o_tty->count, tty_name(o_tty));
 			o_tty->count = 0;
 		}
 	}
 	if (--tty->count < 0) {
 		printk(KERN_WARNING "%s: bad tty->count (%d) for %s\n",
-				__func__, tty->count, tty_name(tty, buf));
+				__func__, tty->count, tty_name(tty));
 		tty->count = 0;
 	}
 
@@ -1903,7 +1896,7 @@ int tty_release(struct inode *inode, struct file *filp)
 		return 0;
 
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s: %s: final close\n", __func__, tty_name(tty, buf));
+	printk(KERN_DEBUG "%s: %s: final close\n", __func__, tty_name(tty));
 #endif
 	/*
 	 * Ask the line discipline code to release its structures
@@ -1914,7 +1907,8 @@ int tty_release(struct inode *inode, struct file *filp)
 	tty_flush_works(tty);
 
 #ifdef TTY_DEBUG_HANGUP
-	printk(KERN_DEBUG "%s: %s: freeing structure...\n", __func__, tty_name(tty, buf));
+	printk(KERN_DEBUG "%s: %s: freeing structure...\n", __func__,
+	       tty_name(tty));
 #endif
 	/*
 	 * The release_tty function takes care of the details of clearing
diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c
index 8e53fe469664..5232fb60b0b1 100644
--- a/drivers/tty/tty_ioctl.c
+++ b/drivers/tty/tty_ioctl.c
@@ -211,9 +211,7 @@ int tty_unthrottle_safe(struct tty_struct *tty)
 void tty_wait_until_sent(struct tty_struct *tty, long timeout)
 {
 #ifdef TTY_DEBUG_WAIT_UNTIL_SENT
-	char buf[64];
-
-	printk(KERN_DEBUG "%s wait until sent...\n", tty_name(tty, buf));
+	printk(KERN_DEBUG "%s wait until sent...\n", tty_name(tty));
 #endif
 	if (!timeout)
 		timeout = MAX_SCHEDULE_TIMEOUT;
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index 3737f55272d2..c07fb5d9bcf9 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -22,9 +22,8 @@
 #undef LDISC_DEBUG_HANGUP
 
 #ifdef LDISC_DEBUG_HANGUP
-#define tty_ldisc_debug(tty, f, args...) ({				       \
-	char __b[64];							       \
-	printk(KERN_DEBUG "%s: %s: " f, __func__, tty_name(tty, __b), ##args); \
+#define tty_ldisc_debug(tty, f, args...) ({				  \
+	printk(KERN_DEBUG "%s: %s: " f, __func__, tty_name(tty), ##args); \
 })
 #else
 #define tty_ldisc_debug(tty, f, args...)
@@ -483,7 +482,6 @@ static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld)
 
 static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 {
-	char buf[64];
 	struct tty_ldisc *new_ldisc;
 	int r;
 
@@ -504,7 +502,7 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 		if (r < 0)
 			panic("Couldn't open N_TTY ldisc for "
 			      "%s --- error %d.",
-			      tty_name(tty, buf), r);
+			      tty_name(tty), r);
 	}
 }
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 4cbecfc7b3c9..9a72c9144d8a 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -421,7 +421,7 @@ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 
 extern int tty_paranoia_check(struct tty_struct *tty, struct inode *inode,
 			      const char *routine);
-extern const char *tty_name(const struct tty_struct *tty, char *buf);
+extern const char *tty_name(const struct tty_struct *tty);
 extern void tty_wait_until_sent(struct tty_struct *tty, long timeout);
 extern int tty_check_change(struct tty_struct *tty);
 extern void __stop_tty(struct tty_struct *tty);
-- 
cgit v1.2.3


From 6b3cddccf4eec0883feb065aea28dd9770bb17d0 Mon Sep 17 00:00:00 2001
From: Peter Hurley <peter@hurleysoftware.com>
Date: Sat, 11 Apr 2015 11:02:36 -0400
Subject: serial: core: Fix unused variable warnings from uart_console()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If CONFIG_SERIAL_CORE_CONSOLE=n, build warnings are generated by
uart_console() macro expansion:

drivers/tty/serial/of_serial.c: In function ‘of_serial_suspend_8250’:
drivers/tty/serial/of_serial.c:262:20: warning: unused variable ‘port’ [-Wunused-variable]
  struct uart_port *port = &port8250->port;
                    ^
drivers/tty/serial/of_serial.c: In function ‘of_serial_resume_8250’:
drivers/tty/serial/of_serial.c:272:20: warning: unused variable ‘port’ [-Wunused-variable]
  struct uart_port *port = &port8250->port;

Signed-off-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 025dad9dcde4..297d4fa1cfe5 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -35,7 +35,7 @@
 #define uart_console(port) \
 	((port)->cons && (port)->cons->index == (port)->line)
 #else
-#define uart_console(port)      (0)
+#define uart_console(port)      ({ (void)port; 0; })
 #endif
 
 struct uart_port;
-- 
cgit v1.2.3


From 17799359e7b3fa6ef4f2bf926cd6821cf7903ecf Mon Sep 17 00:00:00 2001
From: Brian Norris <computersforpeace@gmail.com>
Date: Sat, 28 Feb 2015 02:13:11 -0800
Subject: mtd: nand_bbt: make nand_scan_bbt() static

This implementation detail is no longer needed outside of nand_bbt.c.

Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 drivers/mtd/nand/nand_bbt.c | 2 +-
 include/linux/mtd/nand.h    | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c
index 307a285afb78..53e17586fbed 100644
--- a/drivers/mtd/nand/nand_bbt.c
+++ b/drivers/mtd/nand/nand_bbt.c
@@ -1074,7 +1074,7 @@ static void verify_bbt_descr(struct mtd_info *mtd, struct nand_bbt_descr *bd)
  * The bad block table memory is allocated here. It must be freed by calling
  * the nand_free_bbt function.
  */
-int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd)
+static int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd)
 {
 	struct nand_chip *this = mtd->priv;
 	int len, res = 0;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 3d4ea7eb2b68..6c51876941f3 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -833,7 +833,6 @@ struct nand_manufacturers {
 extern struct nand_flash_dev nand_flash_ids[];
 extern struct nand_manufacturers nand_manuf_ids[];
 
-extern int nand_scan_bbt(struct mtd_info *mtd, struct nand_bbt_descr *bd);
 extern int nand_default_bbt(struct mtd_info *mtd);
 extern int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs);
 extern int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs);
-- 
cgit v1.2.3


From 0097d12e504b3ce57b68810737ad6a5a64a98c68 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 30 Apr 2015 13:43:30 +0200
Subject: KVM: provide irq_unsafe kvm_guest_{enter|exit}

Several kvm architectures disable interrupts before kvm_guest_enter.
kvm_guest_enter then uses local_irq_save/restore to disable interrupts
again or for the first time. Lets provide underscore versions of
kvm_guest_{enter|exit} that assume being called locked.
kvm_guest_enter now disables interrupts for the full function and
thus we can remove the check for preemptible.

This patch then adopts s390/kvm to use local_irq_disable/enable calls
which are slighty cheaper that local_irq_save/restore and call these
new functions.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 10 ++++++----
 include/linux/kvm_host.h | 27 ++++++++++++++++++---------
 2 files changed, 24 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8cd8e7b288c5..2be391bb8557 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1993,12 +1993,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 		 * As PF_VCPU will be used in fault handler, between
 		 * guest_enter and guest_exit should be no uaccess.
 		 */
-		preempt_disable();
-		kvm_guest_enter();
-		preempt_enable();
+		local_irq_disable();
+		__kvm_guest_enter();
+		local_irq_enable();
 		exit_reason = sie64a(vcpu->arch.sie_block,
 				     vcpu->run->s.regs.gprs);
-		kvm_guest_exit();
+		local_irq_disable();
+		__kvm_guest_exit();
+		local_irq_enable();
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
 		rc = vcpu_post_run(vcpu, exit_reason);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ad45054309a0..efc16df1fc5d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -762,16 +762,10 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 }
 #endif
 
-static inline void kvm_guest_enter(void)
+/* must be called with irqs disabled */
+static inline void __kvm_guest_enter(void)
 {
-	unsigned long flags;
-
-	BUG_ON(preemptible());
-
-	local_irq_save(flags);
 	guest_enter();
-	local_irq_restore(flags);
-
 	/* KVM does not hold any references to rcu protected data when it
 	 * switches CPU into a guest mode. In fact switching to a guest mode
 	 * is very similar to exiting to userspace from rcu point of view. In
@@ -783,12 +777,27 @@ static inline void kvm_guest_enter(void)
 		rcu_virt_note_context_switch(smp_processor_id());
 }
 
+/* must be called with irqs disabled */
+static inline void __kvm_guest_exit(void)
+{
+	guest_exit();
+}
+
+static inline void kvm_guest_enter(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__kvm_guest_enter();
+	local_irq_restore(flags);
+}
+
 static inline void kvm_guest_exit(void)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	guest_exit();
+	__kvm_guest_exit();
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From 653f52c316a49c5ee2701bc13b15879f20790662 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Thu, 23 Apr 2015 11:52:37 -0400
Subject: kvm,x86: load guest FPU context more eagerly

Currently KVM will clear the FPU bits in CR0.TS in the VMCS, and trap to
re-load them every time the guest accesses the FPU after a switch back into
the guest from the host.

This patch copies the x86 task switch semantics for FPU loading, with the
FPU loaded eagerly after first use if the system uses eager fpu mode,
or if the guest uses the FPU frequently.

In the latter case, after loading the FPU for 255 times, the fpu_counter
will roll over, and we will revert to loading the FPU on demand, until
it has been established that the guest is still actively using the FPU.

This mirrors the x86 task switch policy, which seems to work.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c       | 15 +++++++++++++--
 include/linux/kvm_host.h |  1 +
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bf80ce7656b7..c42134e98e31 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7086,14 +7086,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
 	kvm_put_guest_xcr0(vcpu);
 
-	if (!vcpu->guest_fpu_loaded)
+	if (!vcpu->guest_fpu_loaded) {
+		vcpu->fpu_counter = 0;
 		return;
+	}
 
 	vcpu->guest_fpu_loaded = 0;
 	fpu_save_init(&vcpu->arch.guest_fpu);
 	__kernel_fpu_end();
 	++vcpu->stat.fpu_reload;
-	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+	/*
+	 * If using eager FPU mode, or if the guest is a frequent user
+	 * of the FPU, just leave the FPU active for next time.
+	 * Every 255 times fpu_counter rolls over to 0; a guest that uses
+	 * the FPU in bursts will revert to loading it on demand.
+	 */
+	if (!use_eager_fpu()) {
+		if (++vcpu->fpu_counter < 5)
+			kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+	}
 	trace_kvm_fpu(0);
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index efc16df1fc5d..b7a08cd6f4a8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -230,6 +230,7 @@ struct kvm_vcpu {
 
 	int fpu_active;
 	int guest_fpu_loaded, guest_xcr0_loaded;
+	unsigned char fpu_counter;
 	wait_queue_head_t wq;
 	struct pid *pid;
 	int sigset_active;
-- 
cgit v1.2.3


From aed5ed47724f6a7453fa62e3c90f3cee93edbfe3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 6 May 2015 18:04:23 +0200
Subject: context_tracking: Protect against recursion

Context tracking recursion can happen when an exception triggers
in the middle of a call to a context tracking probe.

This special case can be caused by vmalloc faults. If an access
to a memory area allocated by vmalloc happens in the middle of
context_tracking_enter(), we may run into an endless fault loop
because the exception in turn calls context_tracking_enter()
which faults on the same vmalloc'ed memory, triggering an
exception again, etc...

Some rare crashes have been reported so lets protect against
this with a recursion counter.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1430928266-24888-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/context_tracking_state.h |  1 +
 kernel/context_tracking.c              | 29 +++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 6b7b96a32b75..678ecdf90cf6 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -12,6 +12,7 @@ struct context_tracking {
 	 * may be further optimized using static keys.
 	 */
 	bool active;
+	int recursion;
 	enum ctx_state {
 		CONTEXT_KERNEL = 0,
 		CONTEXT_USER,
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 72d59a1a6eb6..5b11a10e196a 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -38,6 +38,25 @@ void context_tracking_cpu_set(int cpu)
 	}
 }
 
+static bool context_tracking_recursion_enter(void)
+{
+	int recursion;
+
+	recursion = __this_cpu_inc_return(context_tracking.recursion);
+	if (recursion == 1)
+		return true;
+
+	WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
+	__this_cpu_dec(context_tracking.recursion);
+
+	return false;
+}
+
+static void context_tracking_recursion_exit(void)
+{
+	__this_cpu_dec(context_tracking.recursion);
+}
+
 /**
  * context_tracking_enter - Inform the context tracking that the CPU is going
  *                          enter user or guest space mode.
@@ -75,6 +94,9 @@ void context_tracking_enter(enum ctx_state state)
 	WARN_ON_ONCE(!current->mm);
 
 	local_irq_save(flags);
+	if (!context_tracking_recursion_enter())
+		goto out_irq_restore;
+
 	if ( __this_cpu_read(context_tracking.state) != state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
@@ -105,6 +127,8 @@ void context_tracking_enter(enum ctx_state state)
 		 */
 		__this_cpu_write(context_tracking.state, state);
 	}
+	context_tracking_recursion_exit();
+out_irq_restore:
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_enter);
@@ -139,6 +163,9 @@ void context_tracking_exit(enum ctx_state state)
 		return;
 
 	local_irq_save(flags);
+	if (!context_tracking_recursion_enter())
+		goto out_irq_restore;
+
 	if (__this_cpu_read(context_tracking.state) == state) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
@@ -153,6 +180,8 @@ void context_tracking_exit(enum ctx_state state)
 		}
 		__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
 	}
+	context_tracking_recursion_exit();
+out_irq_restore:
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(context_tracking_exit);
-- 
cgit v1.2.3


From fafe870f31212a72f3c2d74e7b90e4ef39e83ee1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 6 May 2015 18:04:24 +0200
Subject: context_tracking: Inherit TIF_NOHZ through forks instead of context
 switches

TIF_NOHZ is used by context_tracking to force syscall slow-path
on every task in order to track userspace roundtrips. As such,
it must be set on all running tasks.

It's currently explicitly inherited through context switches.
There is no need to do it in this fast-path though. The flag
could simply be set once for all on all tasks, whether they are
running or not.

Lets do this by setting the flag for the init task on early boot,
and let it propagate through fork inheritance.

While at it, mark context_tracking_cpu_set() as init code, we
only need it at early boot time.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Dave Jones <davej@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1430928266-24888-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/context_tracking.h | 10 ---------
 include/linux/sched.h            |  3 +++
 kernel/context_tracking.c        | 44 +++++++++++++++++-----------------------
 kernel/sched/core.c              |  1 -
 4 files changed, 22 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 2821838256b4..b96bd299966f 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -14,8 +14,6 @@ extern void context_tracking_enter(enum ctx_state state);
 extern void context_tracking_exit(enum ctx_state state);
 extern void context_tracking_user_enter(void);
 extern void context_tracking_user_exit(void);
-extern void __context_tracking_task_switch(struct task_struct *prev,
-					   struct task_struct *next);
 
 static inline void user_enter(void)
 {
@@ -51,19 +49,11 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	}
 }
 
-static inline void context_tracking_task_switch(struct task_struct *prev,
-						struct task_struct *next)
-{
-	if (context_tracking_is_enabled())
-		__context_tracking_task_switch(prev, next);
-}
 #else
 static inline void user_enter(void) { }
 static inline void user_exit(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
-static inline void context_tracking_task_switch(struct task_struct *prev,
-						struct task_struct *next) { }
 #endif /* !CONFIG_CONTEXT_TRACKING */
 
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734..185a750e4ed4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2532,6 +2532,9 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
+#define tasklist_empty() \
+	list_empty(&init_task.tasks)
+
 #define next_task(p) \
 	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
 
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5b11a10e196a..0a495ab35bc7 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -30,14 +30,6 @@ EXPORT_SYMBOL_GPL(context_tracking_enabled);
 DEFINE_PER_CPU(struct context_tracking, context_tracking);
 EXPORT_SYMBOL_GPL(context_tracking);
 
-void context_tracking_cpu_set(int cpu)
-{
-	if (!per_cpu(context_tracking.active, cpu)) {
-		per_cpu(context_tracking.active, cpu) = true;
-		static_key_slow_inc(&context_tracking_enabled);
-	}
-}
-
 static bool context_tracking_recursion_enter(void)
 {
 	int recursion;
@@ -193,24 +185,26 @@ void context_tracking_user_exit(void)
 }
 NOKPROBE_SYMBOL(context_tracking_user_exit);
 
-/**
- * __context_tracking_task_switch - context switch the syscall callbacks
- * @prev: the task that is being switched out
- * @next: the task that is being switched in
- *
- * The context tracking uses the syscall slow path to implement its user-kernel
- * boundaries probes on syscalls. This way it doesn't impact the syscall fast
- * path on CPUs that don't do context tracking.
- *
- * But we need to clear the flag on the previous task because it may later
- * migrate to some CPU that doesn't do the context tracking. As such the TIF
- * flag may not be desired there.
- */
-void __context_tracking_task_switch(struct task_struct *prev,
-				    struct task_struct *next)
+void __init context_tracking_cpu_set(int cpu)
 {
-	clear_tsk_thread_flag(prev, TIF_NOHZ);
-	set_tsk_thread_flag(next, TIF_NOHZ);
+	static __initdata bool initialized = false;
+
+	if (!per_cpu(context_tracking.active, cpu)) {
+		per_cpu(context_tracking.active, cpu) = true;
+		static_key_slow_inc(&context_tracking_enabled);
+	}
+
+	if (initialized)
+		return;
+
+	/*
+	 * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork
+	 * This assumes that init is the only task at this early boot stage.
+	 */
+	set_tsk_thread_flag(&init_task, TIF_NOHZ);
+	WARN_ON_ONCE(!tasklist_empty());
+
+	initialized = true;
 }
 
 #ifdef CONFIG_CONTEXT_TRACKING_FORCE
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe22f7510bce..54f032cea040 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2332,7 +2332,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 */
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
-	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 	barrier();
-- 
cgit v1.2.3


From 83dedea8a07fb4bf91863764b15c1c4ec00330f9 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@ezchip.com>
Date: Wed, 6 May 2015 18:04:25 +0200
Subject: nohz: Add tick_nohz_full_add_cpus_to() API

This API is useful to modify a cpumask indicating some special
nohz-type functionality so that the nohz cores are automatically
added to that set.

Signed-off-by: Chris Metcalf <cmetcalf@ezchip.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Jones <davej@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1429024675-18938-1-git-send-email-cmetcalf@ezchip.com
Link: http://lkml.kernel.org/r/1430928266-24888-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/tick.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index f8492da57ad3..4191b5623a28 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -134,6 +134,12 @@ static inline bool tick_nohz_full_cpu(int cpu)
 	return cpumask_test_cpu(cpu, tick_nohz_full_mask);
 }
 
+static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
+{
+	if (tick_nohz_full_enabled())
+		cpumask_or(mask, mask, tick_nohz_full_mask);
+}
+
 extern void __tick_nohz_full_check(void);
 extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
@@ -142,6 +148,7 @@ extern void __tick_nohz_task_switch(struct task_struct *tsk);
 #else
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
+static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
 static inline void __tick_nohz_full_check(void) { }
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void tick_nohz_full_kick(void) { }
-- 
cgit v1.2.3


From c6201cd8513db2db54b248a862672849ed9ccb82 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 7 May 2015 09:52:22 -0500
Subject: PCI/MSI: Remove unused pci_msi_off()

pci_msi_off() is unused, so remove it.

Removes the exported symbol pci_msi_off().

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c   | 33 ---------------------------------
 include/linux/pci.h |  1 -
 2 files changed, 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index acc4b6ef78c4..687af7264406 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3101,39 +3101,6 @@ bool pci_check_and_unmask_intx(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_check_and_unmask_intx);
 
-/**
- * pci_msi_off - disables any MSI or MSI-X capabilities
- * @dev: the PCI device to operate on
- *
- * If you want to use MSI, see pci_enable_msi() and friends.
- * This is a lower-level primitive that allows us to disable
- * MSI operation at the device level.
- */
-void pci_msi_off(struct pci_dev *dev)
-{
-	int pos;
-	u16 control;
-
-	/*
-	 * This looks like it could go in msi.c, but we need it even when
-	 * CONFIG_PCI_MSI=n.  For the same reason, we can't use
-	 * dev->msi_cap or dev->msix_cap here.
-	 */
-	pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
-	if (pos) {
-		pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
-		control &= ~PCI_MSI_FLAGS_ENABLE;
-		pci_write_config_word(dev, pos + PCI_MSI_FLAGS, control);
-	}
-	pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
-	if (pos) {
-		pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control);
-		control &= ~PCI_MSIX_FLAGS_ENABLE;
-		pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
-	}
-}
-EXPORT_SYMBOL_GPL(pci_msi_off);
-
 int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size)
 {
 	return dma_set_max_seg_size(&dev->dev, size);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 353db8dc4c6e..50b7c7d0206f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -974,7 +974,6 @@ void pci_intx(struct pci_dev *dev, int enable);
 bool pci_intx_mask_supported(struct pci_dev *dev);
 bool pci_check_and_mask_intx(struct pci_dev *dev);
 bool pci_check_and_unmask_intx(struct pci_dev *dev);
-void pci_msi_off(struct pci_dev *dev);
 int pci_set_dma_max_seg_size(struct pci_dev *dev, unsigned int size);
 int pci_set_dma_seg_boundary(struct pci_dev *dev, unsigned long mask);
 int pci_wait_for_pending(struct pci_dev *dev, int pos, u16 mask);
-- 
cgit v1.2.3


From 385f83f85cd9428db82cae5e6f6f786be113b24c Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 28 Apr 2015 12:21:40 +0200
Subject: dmaengine: Remove Renesas Audio DMAC peri peri platform data

Commit 3cd44dcd35a6 ("dmaengine: remove Renesas Audio DMAC peri peri")
forgot to remove the header file with the platform data definitions.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/platform_data/dma-rcar-audmapp.h | 34 --------------------------
 1 file changed, 34 deletions(-)
 delete mode 100644 include/linux/platform_data/dma-rcar-audmapp.h

(limited to 'include/linux')

diff --git a/include/linux/platform_data/dma-rcar-audmapp.h b/include/linux/platform_data/dma-rcar-audmapp.h
deleted file mode 100644
index 471fffebbeb4..000000000000
--- a/include/linux/platform_data/dma-rcar-audmapp.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * This is for Renesas R-Car Audio-DMAC-peri-peri.
- *
- * Copyright (C) 2014 Renesas Electronics Corporation
- * Copyright (C) 2014 Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
- *
- * This file is based on the include/linux/sh_dma.h
- *
- * Header for the new SH dmaengine driver
- *
- * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef SH_AUDMAPP_H
-#define SH_AUDMAPP_H
-
-#include <linux/dmaengine.h>
-
-struct audmapp_slave_config {
-	int		slave_id;
-	dma_addr_t	src;
-	dma_addr_t	dst;
-	u32		chcr;
-};
-
-struct audmapp_pdata {
-	struct audmapp_slave_config *slave;
-	int slave_num;
-};
-
-#endif /* SH_AUDMAPP_H */
-- 
cgit v1.2.3


From 3289bdb429884c0279bf9ab72dff7b934f19dfc6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 14 Apr 2015 13:19:42 +0200
Subject: sched: Move the loadavg code to a more obvious location

I could not find the loadavg code.. turns out it was hidden in a file
called proc.c. It further got mingled up with the cruft per rq load
indexes (which we really want to get rid of).

Move the per rq load indexes into the fair.c load-balance code (that's
the only thing that uses them) and rename proc.c to loadavg.c so we
can find it again.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
[ Did minor cleanups to the code. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h  |   5 +
 kernel/sched/Makefile  |   2 +-
 kernel/sched/core.c    |   7 +-
 kernel/sched/fair.c    | 183 ++++++++++++++++
 kernel/sched/loadavg.c | 394 +++++++++++++++++++++++++++++++++
 kernel/sched/proc.c    | 584 -------------------------------------------------
 kernel/sched/sched.h   |   8 +-
 7 files changed, 593 insertions(+), 590 deletions(-)
 create mode 100644 kernel/sched/loadavg.c
 delete mode 100644 kernel/sched/proc.c

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734..85cf253bc366 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -173,7 +173,12 @@ extern unsigned long nr_iowait_cpu(int cpu);
 extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 
 extern void calc_global_load(unsigned long ticks);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void update_cpu_load_nohz(void);
+#else
+static inline void update_cpu_load_nohz(void) { }
+#endif
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 46be87024875..67687973ce80 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o proc.o clock.o cputime.o
+obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fdf972d56f65..527fc28a737a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2397,9 +2397,9 @@ unsigned long nr_iowait_cpu(int cpu)
 
 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
 {
-	struct rq *this = this_rq();
-	*nr_waiters = atomic_read(&this->nr_iowait);
-	*load = this->cpu_load[0];
+	struct rq *rq = this_rq();
+	*nr_waiters = atomic_read(&rq->nr_iowait);
+	*load = rq->load.weight;
 }
 
 #ifdef CONFIG_SMP
@@ -2497,6 +2497,7 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	update_cpu_load_active(rq);
+	calc_global_load_tick(rq);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..4bc6013886ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4323,6 +4323,189 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/*
+ * per rq 'load' arrray crap; XXX kill this.
+ */
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT		7
+static const unsigned char
+		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+					{0, 0, 0, 0, 0, 0, 0, 0},
+					{64, 32, 8, 0, 0, 0, 0, 0},
+					{96, 72, 40, 12, 1, 0, 0},
+					{112, 98, 75, 43, 15, 1, 0},
+					{120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int j = 0;
+
+	if (!missed_updates)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	while (missed_updates) {
+		if (missed_updates % 2)
+			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+		missed_updates >>= 1;
+		j++;
+	}
+	return load;
+}
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+			      unsigned long pending_updates)
+{
+	int i, scale;
+
+	this_rq->nr_load_updates++;
+
+	/* Update our load: */
+	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+		unsigned long old_load, new_load;
+
+		/* scale is effectively 1 << i now, and >> i divides by scale */
+
+		old_load = this_rq->cpu_load[i];
+		old_load = decay_load_missed(old_load, pending_updates - 1, i);
+		new_load = this_load;
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale - 1;
+
+		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+	}
+
+	sched_avg_update(this_rq);
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we cannot use the delta approach from the regular tick since that
+ * would seriously skew the load calculation. However we'll make do for those
+ * updates happening while idle (nohz_idle_balance) or coming out of idle
+ * (tick_nohz_idle_exit).
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+/*
+ * Called from nohz_idle_balance() to update the load ratings before doing the
+ * idle balance.
+ */
+static void update_idle_cpu_load(struct rq *this_rq)
+{
+	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	unsigned long pending_updates;
+
+	/*
+	 * bail if there's load or we're actually up-to-date.
+	 */
+	if (load || curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
+	__update_cpu_load(this_rq, load, pending_updates);
+}
+
+/*
+ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ */
+void update_cpu_load_nohz(void)
+{
+	struct rq *this_rq = this_rq();
+	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long pending_updates;
+
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	raw_spin_lock(&this_rq->lock);
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	if (pending_updates) {
+		this_rq->last_load_update_tick = curr_jiffies;
+		/*
+		 * We were idle, this means load 0, the current load might be
+		 * !0 due to remote wakeups and the sort.
+		 */
+		__update_cpu_load(this_rq, 0, pending_updates);
+	}
+	raw_spin_unlock(&this_rq->lock);
+}
+#endif /* CONFIG_NO_HZ */
+
+/*
+ * Called from scheduler_tick()
+ */
+void update_cpu_load_active(struct rq *this_rq)
+{
+	unsigned long load = this_rq->cfs.runnable_load_avg;
+	/*
+	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+	 */
+	this_rq->last_load_update_tick = jiffies;
+	__update_cpu_load(this_rq, load, 1);
+}
+
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
new file mode 100644
index 000000000000..ef7159012cf3
--- /dev/null
+++ b/kernel/sched/loadavg.c
@@ -0,0 +1,394 @@
+/*
+ * kernel/sched/loadavg.c
+ *
+ * This file contains the magic bits required to compute the global loadavg
+ * figure. Its a silly number but people think its important. We go through
+ * great pains to make it work on big machines and tickless kernels.
+ */
+
+#include <linux/export.h>
+
+#include "sched.h"
+
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ *   nr_active = 0;
+ *   for_each_possible_cpu(cpu)
+ *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns in the mess below:
+ *
+ *  - for_each_possible_cpu() is prohibitively expensive on machines with
+ *    serious number of cpus, therefore we need to take a distributed approach
+ *    to calculating nr_active.
+ *
+ *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ *    So assuming nr_active := 0 when we start out -- true per definition, we
+ *    can simply take per-cpu deltas and fold those into a global accumulate
+ *    to obtain the same result. See calc_load_fold_active().
+ *
+ *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ *    across the machine, we assume 10 ticks is sufficient time for every
+ *    cpu to have completed this task.
+ *
+ *    This places an upper-bound on the IRQ-off latency of the machine. Then
+ *    again, being late doesn't loose the delta, just wrecks the sample.
+ *
+ *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ *    this would add another cross-cpu cacheline miss and atomic operation
+ *    to the wakeup path. Instead we increment on whatever cpu the task ran
+ *    when it went into uninterruptible state and decrement on whatever cpu
+ *    did the wakeup. This means that only the sum of nr_uninterruptible over
+ *    all cpus yields the correct result.
+ *
+ *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
+ */
+
+/* Variables and functions for calc_load */
+atomic_long_t calc_load_tasks;
+unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+long calc_load_fold_active(struct rq *this_rq)
+{
+	long nr_active, delta = 0;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long)this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+	}
+
+	return delta;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	load += 1UL << (FSHIFT - 1);
+	return load >> FSHIFT;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ *  - When we go NO_HZ idle during the window, we can negate our sample
+ *    contribution, causing under-accounting.
+ *
+ *    We avoid this by keeping two idle-delta counters and flipping them
+ *    when the window starts, thus separating old and new NO_HZ load.
+ *
+ *    The only trick is the slight shift in index flip for read vs write.
+ *
+ *        0s            5s            10s           15s
+ *          +10           +10           +10           +10
+ *        |-|-----------|-|-----------|-|-----------|-|
+ *    r:0 0 1           1 0           0 1           1 0
+ *    w:0 1 1           0 0           1 1           0 0
+ *
+ *    This ensures we'll fold the old idle contribution in this window while
+ *    accumlating the new one.
+ *
+ *  - When we wake up from NO_HZ idle during the window, we push up our
+ *    contribution, since we effectively move our sample point to a known
+ *    busy state.
+ *
+ *    This is solved by pushing the window forward, and thus skipping the
+ *    sample, for this cpu (effectively using the idle-delta for this cpu which
+ *    was in effect at the time the window opened). This also solves the issue
+ *    of having to deal with a cpu having been in NOHZ idle for multiple
+ *    LOAD_FREQ intervals.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
+
+static inline int calc_load_write_idx(void)
+{
+	int idx = calc_load_idx;
+
+	/*
+	 * See calc_global_nohz(), if we observe the new index, we also
+	 * need to observe the new update time.
+	 */
+	smp_rmb();
+
+	/*
+	 * If the folding window started, make sure we start writing in the
+	 * next idle-delta.
+	 */
+	if (!time_before(jiffies, calc_load_update))
+		idx++;
+
+	return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+	return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+	struct rq *this_rq = this_rq();
+	long delta;
+
+	/*
+	 * We're going into NOHZ mode, if there's any pending delta, fold it
+	 * into the pending idle delta.
+	 */
+	delta = calc_load_fold_active(this_rq);
+	if (delta) {
+		int idx = calc_load_write_idx();
+
+		atomic_long_add(delta, &calc_load_idle[idx]);
+	}
+}
+
+void calc_load_exit_idle(void)
+{
+	struct rq *this_rq = this_rq();
+
+	/*
+	 * If we're still before the sample window, we're done.
+	 */
+	if (time_before(jiffies, this_rq->calc_load_update))
+		return;
+
+	/*
+	 * We woke inside or after the sample window, this means we're already
+	 * accounted through the nohz accounting, so skip the entire deal and
+	 * sync up for the next window.
+	 */
+	this_rq->calc_load_update = calc_load_update;
+	if (time_before(jiffies, this_rq->calc_load_update + 10))
+		this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+	int idx = calc_load_read_idx();
+	long delta = 0;
+
+	if (atomic_long_read(&calc_load_idle[idx]))
+		delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+
+	return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+	unsigned long result = 1UL << frac_bits;
+
+	if (n) {
+		for (;;) {
+			if (n & 1) {
+				result *= x;
+				result += 1UL << (frac_bits - 1);
+				result >>= frac_bits;
+			}
+			n >>= 1;
+			if (!n)
+				break;
+			x *= x;
+			x += 1UL << (frac_bits - 1);
+			x >>= frac_bits;
+		}
+	}
+
+	return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+	    unsigned long active, unsigned int n)
+{
+	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(void)
+{
+	long delta, active, n;
+
+	if (!time_before(jiffies, calc_load_update + 10)) {
+		/*
+		 * Catch-up, fold however many we are behind still
+		 */
+		delta = jiffies - calc_load_update - 10;
+		n = 1 + (delta / LOAD_FREQ);
+
+		active = atomic_long_read(&calc_load_tasks);
+		active = active > 0 ? active * FIXED_1 : 0;
+
+		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+		calc_load_update += n * LOAD_FREQ;
+	}
+
+	/*
+	 * Flip the idle index...
+	 *
+	 * Make sure we first write the new time then flip the index, so that
+	 * calc_load_write_idx() will see the new time when it reads the new
+	 * index, this avoids a double flip messing things up.
+	 */
+	smp_wmb();
+	calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ *
+ * Called from the global timer code.
+ */
+void calc_global_load(unsigned long ticks)
+{
+	long active, delta;
+
+	if (time_before(jiffies, calc_load_update + 10))
+		return;
+
+	/*
+	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
+	 */
+	delta = calc_load_fold_idle();
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+
+	/*
+	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+	 */
+	calc_global_nohz();
+}
+
+/*
+ * Called from scheduler_tick() to periodically update this CPU's
+ * active count.
+ */
+void calc_global_load_tick(struct rq *this_rq)
+{
+	long delta;
+
+	if (time_before(jiffies, this_rq->calc_load_update))
+		return;
+
+	delta  = calc_load_fold_active(this_rq);
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
+	this_rq->calc_load_update += LOAD_FREQ;
+}
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
deleted file mode 100644
index 8ecd552fe4f2..000000000000
--- a/kernel/sched/proc.c
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- *  kernel/sched/proc.c
- *
- *  Kernel load calculations, forked from sched/core.c
- */
-
-#include <linux/export.h>
-
-#include "sched.h"
-
-/*
- * Global load-average calculations
- *
- * We take a distributed and async approach to calculating the global load-avg
- * in order to minimize overhead.
- *
- * The global load average is an exponentially decaying average of nr_running +
- * nr_uninterruptible.
- *
- * Once every LOAD_FREQ:
- *
- *   nr_active = 0;
- *   for_each_possible_cpu(cpu)
- *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
- *
- *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
- *
- * Due to a number of reasons the above turns in the mess below:
- *
- *  - for_each_possible_cpu() is prohibitively expensive on machines with
- *    serious number of cpus, therefore we need to take a distributed approach
- *    to calculating nr_active.
- *
- *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
- *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
- *
- *    So assuming nr_active := 0 when we start out -- true per definition, we
- *    can simply take per-cpu deltas and fold those into a global accumulate
- *    to obtain the same result. See calc_load_fold_active().
- *
- *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
- *    across the machine, we assume 10 ticks is sufficient time for every
- *    cpu to have completed this task.
- *
- *    This places an upper-bound on the IRQ-off latency of the machine. Then
- *    again, being late doesn't loose the delta, just wrecks the sample.
- *
- *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
- *    this would add another cross-cpu cacheline miss and atomic operation
- *    to the wakeup path. Instead we increment on whatever cpu the task ran
- *    when it went into uninterruptible state and decrement on whatever cpu
- *    did the wakeup. This means that only the sum of nr_uninterruptible over
- *    all cpus yields the correct result.
- *
- *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
- */
-
-/* Variables and functions for calc_load */
-atomic_long_t calc_load_tasks;
-unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun); /* should be removed */
-
-/**
- * get_avenrun - get the load average array
- * @loads:	pointer to dest load array
- * @offset:	offset to add
- * @shift:	shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-	loads[0] = (avenrun[0] + offset) << shift;
-	loads[1] = (avenrun[1] + offset) << shift;
-	loads[2] = (avenrun[2] + offset) << shift;
-}
-
-long calc_load_fold_active(struct rq *this_rq)
-{
-	long nr_active, delta = 0;
-
-	nr_active = this_rq->nr_running;
-	nr_active += (long) this_rq->nr_uninterruptible;
-
-	if (nr_active != this_rq->calc_load_active) {
-		delta = nr_active - this_rq->calc_load_active;
-		this_rq->calc_load_active = nr_active;
-	}
-
-	return delta;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- */
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	load += 1UL << (FSHIFT - 1);
-	return load >> FSHIFT;
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * Handle NO_HZ for the global load-average.
- *
- * Since the above described distributed algorithm to compute the global
- * load-average relies on per-cpu sampling from the tick, it is affected by
- * NO_HZ.
- *
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
- * entering NO_HZ state such that we can include this as an 'extra' cpu delta
- * when we read the global state.
- *
- * Obviously reality has to ruin such a delightfully simple scheme:
- *
- *  - When we go NO_HZ idle during the window, we can negate our sample
- *    contribution, causing under-accounting.
- *
- *    We avoid this by keeping two idle-delta counters and flipping them
- *    when the window starts, thus separating old and new NO_HZ load.
- *
- *    The only trick is the slight shift in index flip for read vs write.
- *
- *        0s            5s            10s           15s
- *          +10           +10           +10           +10
- *        |-|-----------|-|-----------|-|-----------|-|
- *    r:0 0 1           1 0           0 1           1 0
- *    w:0 1 1           0 0           1 1           0 0
- *
- *    This ensures we'll fold the old idle contribution in this window while
- *    accumlating the new one.
- *
- *  - When we wake up from NO_HZ idle during the window, we push up our
- *    contribution, since we effectively move our sample point to a known
- *    busy state.
- *
- *    This is solved by pushing the window forward, and thus skipping the
- *    sample, for this cpu (effectively using the idle-delta for this cpu which
- *    was in effect at the time the window opened). This also solves the issue
- *    of having to deal with a cpu having been in NOHZ idle for multiple
- *    LOAD_FREQ intervals.
- *
- * When making the ILB scale, we should try to pull this in as well.
- */
-static atomic_long_t calc_load_idle[2];
-static int calc_load_idx;
-
-static inline int calc_load_write_idx(void)
-{
-	int idx = calc_load_idx;
-
-	/*
-	 * See calc_global_nohz(), if we observe the new index, we also
-	 * need to observe the new update time.
-	 */
-	smp_rmb();
-
-	/*
-	 * If the folding window started, make sure we start writing in the
-	 * next idle-delta.
-	 */
-	if (!time_before(jiffies, calc_load_update))
-		idx++;
-
-	return idx & 1;
-}
-
-static inline int calc_load_read_idx(void)
-{
-	return calc_load_idx & 1;
-}
-
-void calc_load_enter_idle(void)
-{
-	struct rq *this_rq = this_rq();
-	long delta;
-
-	/*
-	 * We're going into NOHZ mode, if there's any pending delta, fold it
-	 * into the pending idle delta.
-	 */
-	delta = calc_load_fold_active(this_rq);
-	if (delta) {
-		int idx = calc_load_write_idx();
-		atomic_long_add(delta, &calc_load_idle[idx]);
-	}
-}
-
-void calc_load_exit_idle(void)
-{
-	struct rq *this_rq = this_rq();
-
-	/*
-	 * If we're still before the sample window, we're done.
-	 */
-	if (time_before(jiffies, this_rq->calc_load_update))
-		return;
-
-	/*
-	 * We woke inside or after the sample window, this means we're already
-	 * accounted through the nohz accounting, so skip the entire deal and
-	 * sync up for the next window.
-	 */
-	this_rq->calc_load_update = calc_load_update;
-	if (time_before(jiffies, this_rq->calc_load_update + 10))
-		this_rq->calc_load_update += LOAD_FREQ;
-}
-
-static long calc_load_fold_idle(void)
-{
-	int idx = calc_load_read_idx();
-	long delta = 0;
-
-	if (atomic_long_read(&calc_load_idle[idx]))
-		delta = atomic_long_xchg(&calc_load_idle[idx], 0);
-
-	return delta;
-}
-
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x:         base of the power
- * @frac_bits: fractional bits of @x
- * @n:         power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
-	unsigned long result = 1UL << frac_bits;
-
-	if (n) for (;;) {
-		if (n & 1) {
-			result *= x;
-			result += 1UL << (frac_bits - 1);
-			result >>= frac_bits;
-		}
-		n >>= 1;
-		if (!n)
-			break;
-		x *= x;
-		x += 1UL << (frac_bits - 1);
-		x >>= frac_bits;
-	}
-
-	return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- *    = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- *  ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- *    = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- *              n         1 - x^(n+1)
- *     S_n := \Sum x^i = -------------
- *             i=0          1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
-	    unsigned long active, unsigned int n)
-{
-
-	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(void)
-{
-	long delta, active, n;
-
-	if (!time_before(jiffies, calc_load_update + 10)) {
-		/*
-		 * Catch-up, fold however many we are behind still
-		 */
-		delta = jiffies - calc_load_update - 10;
-		n = 1 + (delta / LOAD_FREQ);
-
-		active = atomic_long_read(&calc_load_tasks);
-		active = active > 0 ? active * FIXED_1 : 0;
-
-		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
-		calc_load_update += n * LOAD_FREQ;
-	}
-
-	/*
-	 * Flip the idle index...
-	 *
-	 * Make sure we first write the new time then flip the index, so that
-	 * calc_load_write_idx() will see the new time when it reads the new
-	 * index, this avoids a double flip messing things up.
-	 */
-	smp_wmb();
-	calc_load_idx++;
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-
-static inline long calc_load_fold_idle(void) { return 0; }
-static inline void calc_global_nohz(void) { }
-
-#endif /* CONFIG_NO_HZ_COMMON */
-
-/*
- * calc_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
-	long active, delta;
-
-	if (time_before(jiffies, calc_load_update + 10))
-		return;
-
-	/*
-	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
-	 */
-	delta = calc_load_fold_idle();
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-
-	active = atomic_long_read(&calc_load_tasks);
-	active = active > 0 ? active * FIXED_1 : 0;
-
-	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
-	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
-	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
-	calc_load_update += LOAD_FREQ;
-
-	/*
-	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
-	 */
-	calc_global_nohz();
-}
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
-	long delta;
-
-	if (time_before(jiffies, this_rq->calc_load_update))
-		return;
-
-	delta  = calc_load_fold_active(this_rq);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-
-	this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * End of global load-average stuff
- */
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT		7
-static const unsigned char
-		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-					{0, 0, 0, 0, 0, 0, 0, 0},
-					{64, 32, 8, 0, 0, 0, 0, 0},
-					{96, 72, 40, 12, 1, 0, 0},
-					{112, 98, 75, 43, 15, 1, 0},
-					{120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-	int j = 0;
-
-	if (!missed_updates)
-		return load;
-
-	if (missed_updates >= degrade_zero_ticks[idx])
-		return 0;
-
-	if (idx == 1)
-		return load >> missed_updates;
-
-	while (missed_updates) {
-		if (missed_updates % 2)
-			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-		missed_updates >>= 1;
-		j++;
-	}
-	return load;
-}
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
-			      unsigned long pending_updates)
-{
-	int i, scale;
-
-	this_rq->nr_load_updates++;
-
-	/* Update our load: */
-	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-		unsigned long old_load, new_load;
-
-		/* scale is effectively 1 << i now, and >> i divides by scale */
-
-		old_load = this_rq->cpu_load[i];
-		old_load = decay_load_missed(old_load, pending_updates - 1, i);
-		new_load = this_load;
-		/*
-		 * Round up the averaging division if load is increasing. This
-		 * prevents us from getting stuck on 9 if the load is 10, for
-		 * example.
-		 */
-		if (new_load > old_load)
-			new_load += scale - 1;
-
-		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-	}
-
-	sched_avg_update(this_rq);
-}
-
-#ifdef CONFIG_SMP
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-	return rq->cfs.runnable_load_avg;
-}
-#else
-static inline unsigned long get_rq_runnable_load(struct rq *rq)
-{
-	return rq->load.weight;
-}
-#endif
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-void update_idle_cpu_load(struct rq *this_rq)
-{
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long load = get_rq_runnable_load(this_rq);
-	unsigned long pending_updates;
-
-	/*
-	 * bail if there's load or we're actually up-to-date.
-	 */
-	if (load || curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	this_rq->last_load_update_tick = curr_jiffies;
-
-	__update_cpu_load(this_rq, load, pending_updates);
-}
-
-/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
- */
-void update_cpu_load_nohz(void)
-{
-	struct rq *this_rq = this_rq();
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long pending_updates;
-
-	if (curr_jiffies == this_rq->last_load_update_tick)
-		return;
-
-	raw_spin_lock(&this_rq->lock);
-	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-	if (pending_updates) {
-		this_rq->last_load_update_tick = curr_jiffies;
-		/*
-		 * We were idle, this means load 0, the current load might be
-		 * !0 due to remote wakeups and the sort.
-		 */
-		__update_cpu_load(this_rq, 0, pending_updates);
-	}
-	raw_spin_unlock(&this_rq->lock);
-}
-#endif /* CONFIG_NO_HZ */
-
-/*
- * Called from scheduler_tick()
- */
-void update_cpu_load_active(struct rq *this_rq)
-{
-	unsigned long load = get_rq_runnable_load(this_rq);
-	/*
-	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
-	 */
-	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, load, 1);
-
-	calc_load_account_active(this_rq);
-}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993958..09ed26a89f31 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -26,8 +26,14 @@ extern __read_mostly int scheduler_running;
 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
 
+extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq);
+
+#ifdef CONFIG_SMP
 extern void update_cpu_load_active(struct rq *this_rq);
+#else
+static inline void update_cpu_load_active(struct rq *this_rq) { }
+#endif
 
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
@@ -1298,8 +1304,6 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
 unsigned long to_ratio(u64 period, u64 runtime);
 
-extern void update_idle_cpu_load(struct rq *this_rq);
-
 extern void init_task_runnable_average(struct task_struct *p);
 
 static inline void add_nr_running(struct rq *rq, unsigned count)
-- 
cgit v1.2.3


From b76808e6808e34e7e78131d2b8cb0535622b8e9f Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Thu, 30 Apr 2015 21:19:57 -0700
Subject: signals, sched: Change all uses of JOBCTL_* from 'int' to 'long'

c56fb6564dcd ("Fix a misaligned load inside ptrace_attach()") makes
jobctl an "unsigned long".  It makes sense to have the masks applied
to it match that type.  This is currently just a cosmetic change, but
it will prevent the mask from being unexpectedly truncated if we ever
end up with masks with more bits.

One instance of "signr" is an int, but I left this alone because the
mask ensures that it will never overflow.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bobby.prani@gmail.com
Cc: oleg@redhat.com
Cc: paulmck@linux.vnet.ibm.com
Cc: richard@nod.at
Cc: vdavydov@parallels.com
Link: http://lkml.kernel.org/r/1430453997-32459-4-git-send-email-palmer@dabbelt.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 18 +++++++++---------
 kernel/signal.c       |  6 +++---
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 85cf253bc366..4f066cb625ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2082,22 +2082,22 @@ TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
 #define JOBCTL_TRAPPING_BIT	21	/* switching to TRACED */
 #define JOBCTL_LISTENING_BIT	22	/* ptracer is listening for events */
 
-#define JOBCTL_STOP_DEQUEUED	(1 << JOBCTL_STOP_DEQUEUED_BIT)
-#define JOBCTL_STOP_PENDING	(1 << JOBCTL_STOP_PENDING_BIT)
-#define JOBCTL_STOP_CONSUME	(1 << JOBCTL_STOP_CONSUME_BIT)
-#define JOBCTL_TRAP_STOP	(1 << JOBCTL_TRAP_STOP_BIT)
-#define JOBCTL_TRAP_NOTIFY	(1 << JOBCTL_TRAP_NOTIFY_BIT)
-#define JOBCTL_TRAPPING		(1 << JOBCTL_TRAPPING_BIT)
-#define JOBCTL_LISTENING	(1 << JOBCTL_LISTENING_BIT)
+#define JOBCTL_STOP_DEQUEUED	(1UL << JOBCTL_STOP_DEQUEUED_BIT)
+#define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)
+#define JOBCTL_STOP_CONSUME	(1UL << JOBCTL_STOP_CONSUME_BIT)
+#define JOBCTL_TRAP_STOP	(1UL << JOBCTL_TRAP_STOP_BIT)
+#define JOBCTL_TRAP_NOTIFY	(1UL << JOBCTL_TRAP_NOTIFY_BIT)
+#define JOBCTL_TRAPPING		(1UL << JOBCTL_TRAPPING_BIT)
+#define JOBCTL_LISTENING	(1UL << JOBCTL_LISTENING_BIT)
 
 #define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
 #define JOBCTL_PENDING_MASK	(JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
 
 extern bool task_set_jobctl_pending(struct task_struct *task,
-				    unsigned int mask);
+				    unsigned long mask);
 extern void task_clear_jobctl_trapping(struct task_struct *task);
 extern void task_clear_jobctl_pending(struct task_struct *task,
-				      unsigned int mask);
+				      unsigned long mask);
 
 static inline void rcu_copy_process(struct task_struct *p)
 {
diff --git a/kernel/signal.c b/kernel/signal.c
index d51c5ddd855c..f19833b5db3c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -245,7 +245,7 @@ static inline void print_dropped_signal(int sig)
  * RETURNS:
  * %true if @mask is set, %false if made noop because @task was dying.
  */
-bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask)
+bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
 			JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
@@ -297,7 +297,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
  * CONTEXT:
  * Must be called with @task->sighand->siglock held.
  */
-void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask)
+void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
 {
 	BUG_ON(mask & ~JOBCTL_PENDING_MASK);
 
@@ -2000,7 +2000,7 @@ static bool do_signal_stop(int signr)
 	struct signal_struct *sig = current->signal;
 
 	if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
-		unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
+		unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
 		struct task_struct *t;
 
 		/* signr will be recorded in task->jobctl for retries */
-- 
cgit v1.2.3


From 7e60598785f30cf3dc9e476cc0fc3feeb37a0c63 Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Thu, 30 Apr 2015 21:19:56 -0700
Subject: sched/wait: Change wait_on_bit*() to take an unsigned long *, not a
 void *

The implementations of wait_on_bit*() will only work with long-aligned
memory on systems that don't support misaligned loads and stores.

This patch changes the function prototypes to ensure that the compiler
will enforce alignment.

Running

  make defconfig
  make KFLAGS="-Werror"

seems to indicate that, as of c56fb6564dcd ("Fix a misaligned load
inside ptrace_attach()"), there are now no users of non-long-aligned
calls to wait_on_bit*().  I additionally tried a few "make randconfig"
attempts, none of which failed to compile for this reason.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bobby.prani@gmail.com
Cc: oleg@redhat.com
Cc: paulmck@linux.vnet.ibm.com
Cc: richard@nod.at
Cc: vdavydov@parallels.com
Link: http://lkml.kernel.org/r/1430453997-32459-3-git-send-email-palmer@dabbelt.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/wait.h | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db83349865b..d69ac4ecc88b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -969,7 +969,7 @@ extern int bit_wait_io_timeout(struct wait_bit_key *);
  * on that signal.
  */
 static inline int
-wait_on_bit(void *word, int bit, unsigned mode)
+wait_on_bit(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
 	if (!test_bit(bit, word))
@@ -994,7 +994,7 @@ wait_on_bit(void *word, int bit, unsigned mode)
  * on that signal.
  */
 static inline int
-wait_on_bit_io(void *word, int bit, unsigned mode)
+wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
 	if (!test_bit(bit, word))
@@ -1020,7 +1020,8 @@ wait_on_bit_io(void *word, int bit, unsigned mode)
  * received a signal and the mode permitted wakeup on that signal.
  */
 static inline int
-wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
+wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
+		    unsigned long timeout)
 {
 	might_sleep();
 	if (!test_bit(bit, word))
@@ -1047,7 +1048,8 @@ wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
  * on that signal.
  */
 static inline int
-wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
+wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
+		   unsigned mode)
 {
 	might_sleep();
 	if (!test_bit(bit, word))
@@ -1075,7 +1077,7 @@ wait_on_bit_action(void *word, int bit, wait_bit_action_f *action, unsigned mode
  * the @mode allows that signal to wake the process.
  */
 static inline int
-wait_on_bit_lock(void *word, int bit, unsigned mode)
+wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
 	if (!test_and_set_bit(bit, word))
@@ -1099,7 +1101,7 @@ wait_on_bit_lock(void *word, int bit, unsigned mode)
  * the @mode allows that signal to wake the process.
  */
 static inline int
-wait_on_bit_lock_io(void *word, int bit, unsigned mode)
+wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
 {
 	might_sleep();
 	if (!test_and_set_bit(bit, word))
@@ -1125,7 +1127,8 @@ wait_on_bit_lock_io(void *word, int bit, unsigned mode)
  * the @mode allows that signal to wake the process.
  */
 static inline int
-wait_on_bit_lock_action(void *word, int bit, wait_bit_action_f *action, unsigned mode)
+wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
+			unsigned mode)
 {
 	might_sleep();
 	if (!test_and_set_bit(bit, word))
-- 
cgit v1.2.3


From e7cc4173115347bcdaa5de2824dd46ef2c58425f Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Thu, 30 Apr 2015 21:19:55 -0700
Subject: signals, ptrace, sched: Fix a misaligned load inside ptrace_attach()

The misaligned load exception arises when running ptrace_attach() on
the RISC-V (which hasn't been upstreamed yet).  The problem is that
wait_on_bit() takes a void* but then proceeds to call test_bit(),
which takes a long*.  This allows an int-aligned pointer to be passed
to test_bit(), which promptly fails.  This will manifest on any other
asm-generic port where unaligned loads trap, where sizeof(long) >
sizeof(int), and where task_struct.jobctl ends up not being
long-aligned.

This patch changes task_struct.jobctl to be a long, which ensures it
has the correct alignment.

Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bobby.prani@gmail.com
Cc: oleg@redhat.com
Cc: paulmck@linux.vnet.ibm.com
Cc: richard@nod.at
Cc: vdavydov@parallels.com
Link: http://lkml.kernel.org/r/1430453997-32459-2-git-send-email-palmer@dabbelt.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f066cb625ad..fb650a2f4a73 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1374,7 +1374,7 @@ struct task_struct {
 	int exit_state;
 	int exit_code, exit_signal;
 	int pdeath_signal;  /*  The signal sent when the parent dies  */
-	unsigned int jobctl;	/* JOBCTL_*, siglock protected */
+	unsigned long jobctl;	/* JOBCTL_*, siglock protected */
 
 	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
-- 
cgit v1.2.3


From 316c1608d15c736439d4065ed12f306db554b3da Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Tue, 28 Apr 2015 13:00:20 -0700
Subject: sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to
 READ_ONCE()/WRITE_ONCE()

ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes
the rest of the existing usages of ACCESS_ONCE() in the scheduler, and use
the new READ_ONCE() and WRITE_ONCE() APIs as appropriate.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Waiman Long <Waiman.Long@hp.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1430251224-5764-2-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h          |  4 ++--
 kernel/fork.c                  |  2 +-
 kernel/sched/auto_group.c      |  2 +-
 kernel/sched/auto_group.h      |  2 +-
 kernel/sched/core.c            |  4 ++--
 kernel/sched/cputime.c         |  2 +-
 kernel/sched/deadline.c        |  2 +-
 kernel/sched/fair.c            | 18 +++++++++---------
 kernel/sched/rt.c              |  2 +-
 kernel/sched/sched.h           |  2 +-
 kernel/sched/wait.c            |  4 ++--
 kernel/time/posix-cpu-timers.c |  8 ++++----
 12 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fb650a2f4a73..d70910355b20 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3085,13 +3085,13 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 static inline unsigned long task_rlimit(const struct task_struct *tsk,
 		unsigned int limit)
 {
-	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
+	return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
 }
 
 static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
 		unsigned int limit)
 {
-	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
+	return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
 }
 
 static inline unsigned long rlimit(unsigned int limit)
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..47c37a411a62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1094,7 +1094,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 	/* Thread group counters. */
 	thread_group_cputime_init(sig);
 
-	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
 		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
 		sig->cputimer.running = 1;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 1a3b58d531b2..750ed601ddf7 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -139,7 +139,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 
 	p->signal->autogroup = autogroup_kref_get(ag);
 
-	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+	if (!READ_ONCE(sysctl_sched_autogroup_enabled))
 		goto out;
 
 	for_each_thread(p, t)
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
index 8bd047142816..890c95f2587a 100644
--- a/kernel/sched/auto_group.h
+++ b/kernel/sched/auto_group.h
@@ -29,7 +29,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
 {
-	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+	int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
 
 	if (enabled && task_wants_autogroup(p, tg))
 		return p->signal->autogroup->tg;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 46a5d6f05208..22b53c863ef3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -511,7 +511,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
 static bool set_nr_if_polling(struct task_struct *p)
 {
 	struct thread_info *ti = task_thread_info(p);
-	typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 
 	for (;;) {
 		if (!(val & _TIF_POLLING_NRFLAG))
@@ -2526,7 +2526,7 @@ void scheduler_tick(void)
 u64 scheduler_tick_max_deferment(void)
 {
 	struct rq *rq = this_rq();
-	unsigned long next, now = ACCESS_ONCE(jiffies);
+	unsigned long next, now = READ_ONCE(jiffies);
 
 	next = rq->last_sched_tick + HZ;
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1ee600c..f5a64ffad176 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,7 +567,7 @@ static void cputime_advance(cputime_t *counter, cputime_t new)
 {
 	cputime_t old;
 
-	while (new > (old = ACCESS_ONCE(*counter)))
+	while (new > (old = READ_ONCE(*counter)))
 		cmpxchg_cputime(counter, old, new);
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..890ce951c717 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -995,7 +995,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
 	 * If we are dealing with a -deadline task, we must
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4bc6013886ec..d6915a038d8a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -834,7 +834,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
 
 static unsigned int task_scan_min(struct task_struct *p)
 {
-	unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
+	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 	unsigned int scan, floor;
 	unsigned int windows = 1;
 
@@ -1794,7 +1794,7 @@ static void task_numa_placement(struct task_struct *p)
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
-	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+	seq = READ_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
 		return;
 	p->numa_scan_seq = seq;
@@ -1938,7 +1938,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 	}
 
 	rcu_read_lock();
-	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
+	tsk = READ_ONCE(cpu_rq(cpu)->curr);
 
 	if (!cpupid_match_pid(tsk, cpupid))
 		goto no_join;
@@ -2107,7 +2107,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 static void reset_ptenuma_scan(struct task_struct *p)
 {
-	ACCESS_ONCE(p->mm->numa_scan_seq)++;
+	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
 	p->mm->numa_scan_offset = 0;
 }
 
@@ -4451,7 +4451,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
  */
 static void update_idle_cpu_load(struct rq *this_rq)
 {
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
 	unsigned long load = this_rq->cfs.runnable_load_avg;
 	unsigned long pending_updates;
 
@@ -4473,7 +4473,7 @@ static void update_idle_cpu_load(struct rq *this_rq)
 void update_cpu_load_nohz(void)
 {
 	struct rq *this_rq = this_rq();
-	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+	unsigned long curr_jiffies = READ_ONCE(jiffies);
 	unsigned long pending_updates;
 
 	if (curr_jiffies == this_rq->last_load_update_tick)
@@ -4558,7 +4558,7 @@ static unsigned long capacity_orig_of(int cpu)
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
+	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
 	unsigned long load_avg = rq->cfs.runnable_load_avg;
 
 	if (nr_running)
@@ -6220,8 +6220,8 @@ static unsigned long scale_rt_capacity(int cpu)
 	 * Since we're reading these variables without serialization make sure
 	 * we read them once before doing sanity checks on them.
 	 */
-	age_stamp = ACCESS_ONCE(rq->age_stamp);
-	avg = ACCESS_ONCE(rq->rt_avg);
+	age_stamp = READ_ONCE(rq->age_stamp);
+	avg = READ_ONCE(rq->rt_avg);
 	delta = __rq_clock_broken(rq) - age_stamp;
 
 	if (unlikely(delta < 0))
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874..560d2fa623c3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1323,7 +1323,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 	rq = cpu_rq(cpu);
 
 	rcu_read_lock();
-	curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
 	 * If the current task on @p's runqueue is an RT task, then
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 09ed26a89f31..d85455539d5c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -713,7 +713,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 static inline u64 __rq_clock_broken(struct rq *rq)
 {
-	return ACCESS_ONCE(rq->clock);
+	return READ_ONCE(rq->clock);
 }
 
 static inline u64 rq_clock(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..2ccec988d6b7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bit_wait_io);
 
 __sched int bit_wait_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
@@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
 
 __sched int bit_wait_io_timeout(struct wait_bit_key *word)
 {
-	unsigned long now = ACCESS_ONCE(jiffies);
+	unsigned long now = READ_ONCE(jiffies);
 	if (signal_pending_state(current->state, current))
 		return 1;
 	if (time_after_eq(now, word->timeout))
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0..e072d982f64c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -852,10 +852,10 @@ static void check_thread_timers(struct task_struct *tsk,
 	/*
 	 * Check for the special case thread timers.
 	 */
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
 		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
@@ -958,11 +958,11 @@ static void check_process_timers(struct task_struct *tsk,
 			 SIGPROF);
 	check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
 			 SIGVTALRM);
-	soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (soft != RLIM_INFINITY) {
 		unsigned long psecs = cputime_to_secs(ptime);
 		unsigned long hard =
-			ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
+			READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max);
 		cputime_t x;
 		if (psecs >= hard) {
 			/*
-- 
cgit v1.2.3


From 1018016c706f7ff9f56fde3a649789c47085a293 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Tue, 28 Apr 2015 13:00:22 -0700
Subject: sched, timer: Replace spinlocks with atomics in
 thread_group_cputimer(), to improve scalability

While running a database workload, we found a scalability issue with itimers.

Much of the problem was caused by the thread_group_cputimer spinlock.
Each time we account for group system/user time, we need to obtain a
thread_group_cputimer's spinlock to update the timers. On larger systems
(such as a 16 socket machine), this caused more than 30% of total time
spent trying to obtain this kernel lock to update these group timer stats.

This patch converts the timers to 64-bit atomic variables and use
atomic add to update them without a lock. With this patch, the percent
of total time spent updating thread group cputimer timers was reduced
from 30% down to less than 1%.

Note: On 32-bit systems using the generic 64-bit atomics, this causes
sample_group_cputimer() to take locks 3 times instead of just 1 time.
However, we tested this patch on a 32-bit system ARM system using the
generic atomics and did not find the overhead to be much of an issue.
An explanation for why this isn't an issue is that 32-bit systems usually
have small numbers of CPUs, and cacheline contention from extra spinlocks
called periodically is not really apparent on smaller systems.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Waiman Long <Waiman.Long@hp.com>
Link: http://lkml.kernel.org/r/1430251224-5764-4-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h      |  7 ++--
 include/linux/sched.h          | 10 ++----
 kernel/fork.c                  |  3 --
 kernel/sched/stats.h           | 15 +++-----
 kernel/time/posix-cpu-timers.c | 79 ++++++++++++++++++++++++++----------------
 5 files changed, 62 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d22312b31..7b9d8b59e7bf 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,9 +50,10 @@ extern struct fs_struct init_fs;
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
 	.cputimer	= { 						\
-		.cputime = INIT_CPUTIME,				\
-		.running = 0,						\
-		.lock = __RAW_SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
+		.utime 		  = ATOMIC64_INIT(0),			\
+		.stime		  = ATOMIC64_INIT(0),			\
+		.sum_exec_runtime = ATOMIC64_INIT(0),			\
+		.running 	  = 0					\
 	},								\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d70910355b20..a45874c3fab6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -598,9 +598,10 @@ struct task_cputime {
  * used for thread group CPU timer calculations.
  */
 struct thread_group_cputimer {
-	struct task_cputime cputime;
+	atomic64_t utime;
+	atomic64_t stime;
+	atomic64_t sum_exec_runtime;
 	int running;
-	raw_spinlock_t lock;
 };
 
 #include <linux/rwsem.h>
@@ -2967,11 +2968,6 @@ static __always_inline bool need_resched(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
-static inline void thread_group_cputime_init(struct signal_struct *sig)
-{
-	raw_spin_lock_init(&sig->cputimer.lock);
-}
-
 /*
  * Reevaluate whether the task has signals pending delivery.
  * Wake the task if so.
diff --git a/kernel/fork.c b/kernel/fork.c
index 47c37a411a62..2e670864174f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1091,9 +1091,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
 {
 	unsigned long cpu_limit;
 
-	/* Thread group counters. */
-	thread_group_cputime_init(sig);
-
 	cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
 	if (cpu_limit != RLIM_INFINITY) {
 		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4ab704339656..c6d1c7da3ea5 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -174,7 +174,8 @@ static inline bool cputimer_running(struct task_struct *tsk)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 
-	if (!cputimer->running)
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running))
 		return false;
 
 	/*
@@ -215,9 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.utime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->utime);
 }
 
 /**
@@ -238,9 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.stime += cputime;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(cputime, &cputimer->stime);
 }
 
 /**
@@ -261,7 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	raw_spin_lock(&cputimer->lock);
-	cputimer->cputime.sum_exec_runtime += ns;
-	raw_spin_unlock(&cputimer->lock);
+	atomic64_add(ns, &cputimer->sum_exec_runtime);
 }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index e072d982f64c..d85730669410 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -196,39 +196,62 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
-static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+/*
+ * Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
+ * to avoid race conditions with concurrent updates to cputime.
+ */
+static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)
 {
-	if (b->utime > a->utime)
-		a->utime = b->utime;
+	u64 curr_cputime;
+retry:
+	curr_cputime = atomic64_read(cputime);
+	if (sum_cputime > curr_cputime) {
+		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime)
+			goto retry;
+	}
+}
 
-	if (b->stime > a->stime)
-		a->stime = b->stime;
+static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+{
+	__update_gt_cputime(&cputimer->utime, sum->utime);
+	__update_gt_cputime(&cputimer->stime, sum->stime);
+	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+}
 
-	if (b->sum_exec_runtime > a->sum_exec_runtime)
-		a->sum_exec_runtime = b->sum_exec_runtime;
+/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
+static inline void sample_group_cputimer(struct task_cputime *times,
+					  struct thread_group_cputimer *cputimer)
+{
+	times->utime = atomic64_read(&cputimer->utime);
+	times->stime = atomic64_read(&cputimer->stime);
+	times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	struct task_cputime sum;
-	unsigned long flags;
 
-	if (!cputimer->running) {
+	/* Check if cputimer isn't running. This is accessed without locking. */
+	if (!READ_ONCE(cputimer->running)) {
 		/*
 		 * The POSIX timer interface allows for absolute time expiry
 		 * values through the TIMER_ABSTIME flag, therefore we have
-		 * to synchronize the timer to the clock every time we start
-		 * it.
+		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-		cputimer->running = 1;
-		update_gt_cputime(&cputimer->cputime, &sum);
-	} else
-		raw_spin_lock_irqsave(&cputimer->lock, flags);
-	*times = cputimer->cputime;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+		update_gt_cputime(cputimer, &sum);
+
+		/*
+		 * We're setting cputimer->running without a lock. Ensure
+		 * this only gets written to in one operation. We set
+		 * running after update_gt_cputime() as a small optimization,
+		 * but barriers are not required because update_gt_cputime()
+		 * can handle concurrent updates.
+		 */
+		WRITE_ONCE(cputimer->running, 1);
+	}
+	sample_group_cputimer(times, cputimer);
 }
 
 /*
@@ -582,7 +605,8 @@ bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
 	if (!task_cputime_zero(&tsk->cputime_expires))
 		return false;
 
-	if (tsk->signal->cputimer.running)
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		return false;
 
 	return true;
@@ -882,14 +906,12 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
-static void stop_process_timers(struct signal_struct *sig)
+static inline void stop_process_timers(struct signal_struct *sig)
 {
 	struct thread_group_cputimer *cputimer = &sig->cputimer;
-	unsigned long flags;
 
-	raw_spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 0;
-	raw_spin_unlock_irqrestore(&cputimer->lock, flags);
+	/* Turn off cputimer->running. This is done without locking. */
+	WRITE_ONCE(cputimer->running, 0);
 }
 
 static u32 onecputick;
@@ -1111,12 +1133,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	}
 
 	sig = tsk->signal;
-	if (sig->cputimer.running) {
+	/* Check if cputimer is running. This is accessed without locking. */
+	if (READ_ONCE(sig->cputimer.running)) {
 		struct task_cputime group_sample;
 
-		raw_spin_lock(&sig->cputimer.lock);
-		group_sample = sig->cputimer.cputime;
-		raw_spin_unlock(&sig->cputimer.lock);
+		sample_group_cputimer(&group_sample, &sig->cputimer);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
@@ -1157,7 +1178,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * If there are any active process wide timers (POSIX 1.b, itimers,
 	 * RLIMIT_CPU) cputimer must be running.
 	 */
-	if (tsk->signal->cputimer.running)
+	if (READ_ONCE(tsk->signal->cputimer.running))
 		check_process_timers(tsk, &firing);
 
 	/*
-- 
cgit v1.2.3


From 971e8a985482c76487edb5a49811e99b96e846e1 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Tue, 28 Apr 2015 13:00:23 -0700
Subject: sched, timer: Provide an atomic 'struct task_cputime' data structure

This patch adds an atomic variant of the 'struct task_cputime' data structure,
which can be used to store and update task_cputime statistics without
needing to do locking.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Waiman Long <Waiman.Long@hp.com>
Link: http://lkml.kernel.org/r/1430251224-5764-5-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a45874c3fab6..6eb78cd45da7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -572,6 +572,23 @@ struct task_cputime {
 		.sum_exec_runtime = 0,				\
 	}
 
+/*
+ * This is the atomic variant of task_cputime, which can be used for
+ * storing and updating task_cputime statistics without locking.
+ */
+struct task_cputime_atomic {
+	atomic64_t utime;
+	atomic64_t stime;
+	atomic64_t sum_exec_runtime;
+};
+
+#define INIT_CPUTIME_ATOMIC \
+	(struct task_cputime_atomic) {				\
+		.utime = ATOMIC64_INIT(0),			\
+		.stime = ATOMIC64_INIT(0),			\
+		.sum_exec_runtime = ATOMIC64_INIT(0),		\
+	}
+
 #ifdef CONFIG_PREEMPT_COUNT
 #define PREEMPT_DISABLED	(1 + PREEMPT_ENABLED)
 #else
-- 
cgit v1.2.3


From 7110744516276e906f9197e2857d026eb2343393 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Tue, 28 Apr 2015 13:00:24 -0700
Subject: sched, timer: Use the atomic task_cputime in thread_group_cputimer

Recent optimizations were made to thread_group_cputimer to improve its
scalability by keeping track of cputime stats without a lock. However,
the values were open coded to the structure, causing them to be at
a different abstraction level from the regular task_cputime structure.
Furthermore, any subsequent similar optimizations would not be able to
share the new code, since they are specific to thread_group_cputimer.

This patch adds the new task_cputime_atomic data structure (introduced in
the previous patch in the series) to thread_group_cputimer for keeping
track of the cputime atomically, which also helps generalize the code.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Waiman Long <Waiman.Long@hp.com>
Link: http://lkml.kernel.org/r/1430251224-5764-6-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h      |  6 ++----
 include/linux/sched.h          |  4 +---
 kernel/sched/stats.h           |  6 +++---
 kernel/time/posix-cpu-timers.c | 26 +++++++++++++-------------
 4 files changed, 19 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 7b9d8b59e7bf..bb9b075f0eb0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -50,10 +50,8 @@ extern struct fs_struct init_fs;
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
 	.cputimer	= { 						\
-		.utime 		  = ATOMIC64_INIT(0),			\
-		.stime		  = ATOMIC64_INIT(0),			\
-		.sum_exec_runtime = ATOMIC64_INIT(0),			\
-		.running 	  = 0					\
+		.cputime_atomic	= INIT_CPUTIME_ATOMIC,			\
+		.running	= 0,					\
 	},								\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6eb78cd45da7..4adc536a3b03 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -615,9 +615,7 @@ struct task_cputime_atomic {
  * used for thread group CPU timer calculations.
  */
 struct thread_group_cputimer {
-	atomic64_t utime;
-	atomic64_t stime;
-	atomic64_t sum_exec_runtime;
+	struct task_cputime_atomic cputime_atomic;
 	int running;
 };
 
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c6d1c7da3ea5..077ebbd5e10f 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -216,7 +216,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	atomic64_add(cputime, &cputimer->utime);
+	atomic64_add(cputime, &cputimer->cputime_atomic.utime);
 }
 
 /**
@@ -237,7 +237,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	atomic64_add(cputime, &cputimer->stime);
+	atomic64_add(cputime, &cputimer->cputime_atomic.stime);
 }
 
 /**
@@ -258,5 +258,5 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (!cputimer_running(tsk))
 		return;
 
-	atomic64_add(ns, &cputimer->sum_exec_runtime);
+	atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
 }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d85730669410..892e3dae0aac 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -211,20 +211,20 @@ retry:
 	}
 }
 
-static void update_gt_cputime(struct thread_group_cputimer *cputimer, struct task_cputime *sum)
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
 {
-	__update_gt_cputime(&cputimer->utime, sum->utime);
-	__update_gt_cputime(&cputimer->stime, sum->stime);
-	__update_gt_cputime(&cputimer->sum_exec_runtime, sum->sum_exec_runtime);
+	__update_gt_cputime(&cputime_atomic->utime, sum->utime);
+	__update_gt_cputime(&cputime_atomic->stime, sum->stime);
+	__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
 }
 
-/* Sample thread_group_cputimer values in "cputimer", store results in "times". */
-static inline void sample_group_cputimer(struct task_cputime *times,
-					  struct thread_group_cputimer *cputimer)
+/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
+static inline void sample_cputime_atomic(struct task_cputime *times,
+					 struct task_cputime_atomic *atomic_times)
 {
-	times->utime = atomic64_read(&cputimer->utime);
-	times->stime = atomic64_read(&cputimer->stime);
-	times->sum_exec_runtime = atomic64_read(&cputimer->sum_exec_runtime);
+	times->utime = atomic64_read(&atomic_times->utime);
+	times->stime = atomic64_read(&atomic_times->stime);
+	times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
 }
 
 void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
@@ -240,7 +240,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 		 * to synchronize the timer to the clock every time we start it.
 		 */
 		thread_group_cputime(tsk, &sum);
-		update_gt_cputime(cputimer, &sum);
+		update_gt_cputime(&cputimer->cputime_atomic, &sum);
 
 		/*
 		 * We're setting cputimer->running without a lock. Ensure
@@ -251,7 +251,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 		 */
 		WRITE_ONCE(cputimer->running, 1);
 	}
-	sample_group_cputimer(times, cputimer);
+	sample_cputime_atomic(times, &cputimer->cputime_atomic);
 }
 
 /*
@@ -1137,7 +1137,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (READ_ONCE(sig->cputimer.running)) {
 		struct task_cputime group_sample;
 
-		sample_group_cputimer(&group_sample, &sig->cputimer);
+		sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
 
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
-- 
cgit v1.2.3


From 7675104990ed255b9315a82ae827ff312a2a88a2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 1 May 2015 08:27:50 -0700
Subject: sched: Implement lockless wake-queues

This is useful for locking primitives that can effect multiple
wakeups per operation and want to avoid lock internal lock contention
by delaying the wakeups until we've released the lock internal locks.

Alternatively it can be used to avoid issuing multiple wakeups, and
thus save a few cycles, in packet processing. Queue all target tasks
and wakeup once you've processed all packets. That way you avoid
waking the target task multiple times if there were multiple packets
for the same task.

Properties of a wake_q are:
- Lockless, as queue head must reside on the stack.
- Being a queue, maintains wakeup order passed by the callers. This can
  be important for otherwise, in scenarios where highly contended locks
  could affect any reliance on lock fairness.
- A queued task cannot be added again until it is woken up.

This patch adds the needed infrastructure into the scheduler code
and uses the new wake_list to delay the futex wakeups until
after we've released the hash bucket locks.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[tweaks, adjustments, comments, etc.]
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Mason <clm@fb.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: George Spelvin <linux@horizon.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1430494072-30283-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/core.c   | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4adc536a3b03..254d88e80f65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -920,6 +920,50 @@ enum cpu_idle_type {
 #define SCHED_CAPACITY_SHIFT	10
 #define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
 
+/*
+ * Wake-queues are lists of tasks with a pending wakeup, whose
+ * callers have already marked the task as woken internally,
+ * and can thus carry on. A common use case is being able to
+ * do the wakeups once the corresponding user lock as been
+ * released.
+ *
+ * We hold reference to each task in the list across the wakeup,
+ * thus guaranteeing that the memory is still valid by the time
+ * the actual wakeups are performed in wake_up_q().
+ *
+ * One per task suffices, because there's never a need for a task to be
+ * in two wake queues simultaneously; it is forbidden to abandon a task
+ * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
+ * already in a wake queue, the wakeup will happen soon and the second
+ * waker can just skip it.
+ *
+ * The WAKE_Q macro declares and initializes the list head.
+ * wake_up_q() does NOT reinitialize the list; it's expected to be
+ * called near the end of a function, where the fact that the queue is
+ * not used again will be easy to see by inspection.
+ *
+ * Note that this can cause spurious wakeups. schedule() callers
+ * must ensure the call is done inside a loop, confirming that the
+ * wakeup condition has in fact occurred.
+ */
+struct wake_q_node {
+	struct wake_q_node *next;
+};
+
+struct wake_q_head {
+	struct wake_q_node *first;
+	struct wake_q_node **lastp;
+};
+
+#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
+
+#define WAKE_Q(name)					\
+	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+
+extern void wake_q_add(struct wake_q_head *head,
+		       struct task_struct *task);
+extern void wake_up_q(struct wake_q_head *head);
+
 /*
  * sched-domains (multiprocessor balancing) declarations:
  */
@@ -1532,6 +1576,8 @@ struct task_struct {
 	/* Protection of the PI data structures: */
 	raw_spinlock_t pi_lock;
 
+	struct wake_q_node wake_q;
+
 #ifdef CONFIG_RT_MUTEXES
 	/* PI waiters blocked on a rt_mutex held by this task */
 	struct rb_root pi_waiters;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 22b53c863ef3..355f9538ca33 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 #endif
 
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	struct wake_q_node *node = &task->wake_q;
+
+	/*
+	 * Atomically grab the task, if ->wake_q is !nil already it means
+	 * its already queued (either by us or someone else) and will get the
+	 * wakeup due to that.
+	 *
+	 * This cmpxchg() implies a full barrier, which pairs with the write
+	 * barrier implied by the wakeup in wake_up_list().
+	 */
+	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+		return;
+
+	get_task_struct(task);
+
+	/*
+	 * The head is context local, there can be no concurrency.
+	 */
+	*head->lastp = node;
+	head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+	struct wake_q_node *node = head->first;
+
+	while (node != WAKE_Q_TAIL) {
+		struct task_struct *task;
+
+		task = container_of(node, struct task_struct, wake_q);
+		BUG_ON(!task);
+		/* task can safely be re-inserted now */
+		node = node->next;
+		task->wake_q.next = NULL;
+
+		/*
+		 * wake_up_process() implies a wmb() to pair with the queueing
+		 * in wake_q_add() so as not to miss wakeups.
+		 */
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+}
+
 /*
  * resched_curr - mark rq's current task 'to be rescheduled now'.
  *
-- 
cgit v1.2.3


From ff303e66c240ba6269e31817a386995440a18c99 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 17 Apr 2015 20:05:30 +0200
Subject: perf: Fix software migrate events

Stephane asked about PERF_COUNT_SW_CPU_MIGRATIONS and I realized it
was borken:

 > The problem is that the task isn't actually scheduled while its being
 > migrated (obviously), and if its not scheduled, the counters aren't
 > scheduled either, so there's no observing of the fact.
 >
 > A further problem with migrations is that many migrations happen from
 > softirq context, which is nested inside the 'random' task context of
 > whoemever happens to run at that time, similarly for the wakeup
 > migrations triggered from (soft)irq context. All those end up being
 > accounted in the task that's currently running, eg. your 'ls'.

The below cures this by marking a task as migrated and accounting it
on the subsequent sched_in().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 24 ++++++++++++++++++++++++
 include/linux/sched.h      |  7 ++++---
 kernel/sched/core.c        |  2 +-
 3 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61992cf2e977..e86f85abeda7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -798,11 +798,33 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
 
 extern struct static_key_deferred perf_sched_events;
 
+static __always_inline bool
+perf_sw_migrate_enabled(void)
+{
+	if (static_key_false(&perf_swevent_enabled[PERF_COUNT_SW_CPU_MIGRATIONS]))
+		return true;
+	return false;
+}
+
+static inline void perf_event_task_migrate(struct task_struct *task)
+{
+	if (perf_sw_migrate_enabled())
+		task->sched_migrated = 1;
+}
+
 static inline void perf_event_task_sched_in(struct task_struct *prev,
 					    struct task_struct *task)
 {
 	if (static_key_false(&perf_sched_events.key))
 		__perf_event_task_sched_in(prev, task);
+
+	if (perf_sw_migrate_enabled() && task->sched_migrated) {
+		struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
+
+		perf_fetch_caller_regs(regs);
+		___perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, regs, 0);
+		task->sched_migrated = 0;
+	}
 }
 
 static inline void perf_event_task_sched_out(struct task_struct *prev,
@@ -925,6 +947,8 @@ perf_aux_output_skip(struct perf_output_handle *handle,
 static inline void *
 perf_get_aux(struct perf_output_handle *handle)				{ return NULL; }
 static inline void
+perf_event_task_migrate(struct task_struct *task)			{ }
+static inline void
 perf_event_task_sched_in(struct task_struct *prev,
 			 struct task_struct *task)			{ }
 static inline void
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734..2c5e6c3db654 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1356,9 +1356,6 @@ struct task_struct {
 #endif
 
 	struct mm_struct *mm, *active_mm;
-#ifdef CONFIG_COMPAT_BRK
-	unsigned brk_randomized:1;
-#endif
 	/* per-thread vma caching */
 	u32 vmacache_seqnum;
 	struct vm_area_struct *vmacache[VMACACHE_SIZE];
@@ -1381,10 +1378,14 @@ struct task_struct {
 	/* Revert to default priority/policy when forking */
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
+	unsigned sched_migrated:1;
 
 #ifdef CONFIG_MEMCG_KMEM
 	unsigned memcg_kmem_skip_account:1;
 #endif
+#ifdef CONFIG_COMPAT_BRK
+	unsigned brk_randomized:1;
+#endif
 
 	unsigned long atomic_flags; /* Flags needing atomic access. */
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe22f7510bce..8652fd540780 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1049,7 +1049,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+		perf_event_task_migrate(p);
 	}
 
 	__set_task_cpu(p, new_cpu);
-- 
cgit v1.2.3


From 59aabfc7e959f5f213e4e5cc7567ab4934da2adf Mon Sep 17 00:00:00 2001
From: Waiman Long <Waiman.Long@hp.com>
Date: Thu, 30 Apr 2015 17:12:16 -0400
Subject: locking/rwsem: Reduce spinlock contention in wakeup after
 up_read()/up_write()

In up_write()/up_read(), rwsem_wake() will be called whenever it
detects that some writers/readers are waiting. The rwsem_wake()
function will take the wait_lock and call __rwsem_do_wake() to do the
real wakeup.  For a heavily contended rwsem, doing a spin_lock() on
wait_lock will cause further contention on the heavily contended rwsem
cacheline resulting in delay in the completion of the up_read/up_write
operations.

This patch makes the wait_lock taking and the call to __rwsem_do_wake()
optional if at least one spinning writer is present. The spinning
writer will be able to take the rwsem and call rwsem_wake() later
when it calls up_write(). With the presence of a spinning writer,
rwsem_wake() will now try to acquire the lock using trylock. If that
fails, it will just quit.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Acked-by: Jason Low <jason.low2@hp.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Douglas Hatch <doug.hatch@hp.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Scott J Norton <scott.norton@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1430428337-16802-2-git-send-email-Waiman.Long@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/osq_lock.h    |  5 +++++
 kernel/locking/rwsem-xadd.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index 3a6490e81b28..703ea5c30a33 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -32,4 +32,9 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock)
 extern bool osq_lock(struct optimistic_spin_queue *lock);
 extern void osq_unlock(struct optimistic_spin_queue *lock);
 
+static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
+{
+	return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
+}
+
 #endif
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 3417d0172a5d..0f189714e457 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -409,11 +409,24 @@ done:
 	return taken;
 }
 
+/*
+ * Return true if the rwsem has active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+	return osq_is_locked(&sem->osq);
+}
+
 #else
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
 {
 	return false;
 }
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+	return false;
+}
 #endif
 
 /*
@@ -496,7 +509,38 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 {
 	unsigned long flags;
 
+	/*
+	 * If a spinner is present, it is not necessary to do the wakeup.
+	 * Try to do wakeup only if the trylock succeeds to minimize
+	 * spinlock contention which may introduce too much delay in the
+	 * unlock operation.
+	 *
+	 *    spinning writer		up_write/up_read caller
+	 *    ---------------		-----------------------
+	 * [S]   osq_unlock()		[L]   osq
+	 *	 MB			      RMB
+	 * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+	 *
+	 * Here, it is important to make sure that there won't be a missed
+	 * wakeup while the rwsem is free and the only spinning writer goes
+	 * to sleep without taking the rwsem. Even when the spinning writer
+	 * is just going to break out of the waiting loop, it will still do
+	 * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
+	 * rwsem_has_spinner() is true, it will guarantee at least one
+	 * trylock attempt on the rwsem later on.
+	 */
+	if (rwsem_has_spinner(sem)) {
+		/*
+		 * The smp_rmb() here is to make sure that the spinner
+		 * state is consulted before reading the wait_lock.
+		 */
+		smp_rmb();
+		if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+			return sem;
+		goto locked;
+	}
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
+locked:
 
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
-- 
cgit v1.2.3


From 663fdcbee0a656cdaef934e7f50e6c2670373bc9 Mon Sep 17 00:00:00 2001
From: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Date: Thu, 30 Apr 2015 17:27:21 +0530
Subject: kernel: Replace reference to ASSIGN_ONCE() with WRITE_ONCE() in
 comment

Looks like commit :

 43239cbe79fc ("kernel: Change ASSIGN_ONCE(val, x) to WRITE_ONCE(x, val)")

left behind a reference to ASSIGN_ONCE(). Update this to WRITE_ONCE().

Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: borntraeger@de.ibm.com
Cc: dave@stgolabs.net
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/20150430115721.22278.94082.stgit@preeti.in.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 867722591be2..a7c0941d10da 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -450,7 +450,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * with an explicit memory barrier or atomic instruction that provides the
  * required ordering.
  *
- * If possible use READ_ONCE/ASSIGN_ONCE instead.
+ * If possible use READ_ONCE()/WRITE_ONCE() instead.
  */
 #define __ACCESS_ONCE(x) ({ \
 	 __maybe_unused typeof(x) __var = (__force typeof(x)) 0; \
-- 
cgit v1.2.3


From 56f13c0d9524c5816f5dc9c91b9d766d6b1064ca Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Thu, 9 Apr 2015 12:35:47 +0300
Subject: dmaengine: of_dma: Support for DMA routers

DMA routers are transparent devices used to mux DMA requests from
peripherals to DMA controllers. They are used when the SoC integrates more
devices with DMA requests then their controller can handle.
DRA7x is one example of such SoC, where the sDMA can hanlde 128 DMA request
lines, but in SoC level it has 205 DMA requests.

The of_dma_router will be registered as of_dma_controller with special
xlate function and additional parameters. The driver for the router is
responsible to craft the dma_spec (in the of_dma_route_allocate callback)
which can be used to requests a DMA channel from the real DMA controller.
This way the router can be transparent for the system while remaining generic
enough to be used in different environments.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 Documentation/devicetree/bindings/dma/dma.txt | 28 +++++++++
 drivers/dma/dmaengine.c                       |  7 +++
 drivers/dma/of-dma.c                          | 89 +++++++++++++++++++++++++++
 include/linux/dmaengine.h                     | 17 +++++
 include/linux/of_dma.h                        | 21 +++++++
 5 files changed, 162 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/dma/dma.txt b/Documentation/devicetree/bindings/dma/dma.txt
index 82104271e754..6312fb00ce8d 100644
--- a/Documentation/devicetree/bindings/dma/dma.txt
+++ b/Documentation/devicetree/bindings/dma/dma.txt
@@ -31,6 +31,34 @@ Example:
 		dma-requests = <127>;
 	};
 
+* DMA router
+
+DMA routers are transparent IP blocks used to route DMA request lines from
+devices to the DMA controller. Some SoCs (like TI DRA7x) have more peripherals
+integrated with DMA requests than what the DMA controller can handle directly.
+
+Required property:
+- dma-masters:		phandle of the DMA controller or list of phandles for
+			the DMA controllers the router can direct the signal to.
+- #dma-cells: 		Must be at least 1. Used to provide DMA router specific
+			information. See DMA client binding below for more
+			details.
+
+Optional properties:
+- dma-requests: 	Number of incoming request lines the router can handle.
+- In the node pointed by the dma-masters:
+	- dma-requests:	The router driver might need to look for this in order
+			to configure the routing.
+
+Example:
+	sdma_xbar: dma-router@4a002b78 {
+		compatible = "ti,dra7-dma-crossbar";
+		reg = <0x4a002b78 0xfc>;
+		#dma-cells = <1>;
+		dma-requests = <205>;
+		ti,dma-safe-map = <0>;
+		dma-masters = <&sdma>;
+	};
 
 * DMA client
 
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 0e035a8cf401..9e5949696b1b 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -267,6 +267,13 @@ static void dma_chan_put(struct dma_chan *chan)
 	/* This channel is not in use anymore, free it */
 	if (!chan->client_count && chan->device->device_free_chan_resources)
 		chan->device->device_free_chan_resources(chan);
+
+	/* If the channel is used via a DMA request router, free the mapping */
+	if (chan->router && chan->router->route_free) {
+		chan->router->route_free(chan->router->dev, chan->route_data);
+		chan->router = NULL;
+		chan->route_data = NULL;
+	}
 }
 
 enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie)
diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c
index cbd4a8aff120..1e1f2986eba8 100644
--- a/drivers/dma/of-dma.c
+++ b/drivers/dma/of-dma.c
@@ -44,6 +44,50 @@ static struct of_dma *of_dma_find_controller(struct of_phandle_args *dma_spec)
 	return NULL;
 }
 
+/**
+ * of_dma_router_xlate - translation function for router devices
+ * @dma_spec:	pointer to DMA specifier as found in the device tree
+ * @of_dma:	pointer to DMA controller data (router information)
+ *
+ * The function creates new dma_spec to be passed to the router driver's
+ * of_dma_route_allocate() function to prepare a dma_spec which will be used
+ * to request channel from the real DMA controller.
+ */
+static struct dma_chan *of_dma_router_xlate(struct of_phandle_args *dma_spec,
+					    struct of_dma *ofdma)
+{
+	struct dma_chan		*chan;
+	struct of_dma		*ofdma_target;
+	struct of_phandle_args	dma_spec_target;
+	void			*route_data;
+
+	/* translate the request for the real DMA controller */
+	memcpy(&dma_spec_target, dma_spec, sizeof(dma_spec_target));
+	route_data = ofdma->of_dma_route_allocate(&dma_spec_target, ofdma);
+	if (IS_ERR(route_data))
+		return NULL;
+
+	ofdma_target = of_dma_find_controller(&dma_spec_target);
+	if (!ofdma_target)
+		return NULL;
+
+	chan = ofdma_target->of_dma_xlate(&dma_spec_target, ofdma_target);
+	if (chan) {
+		chan->router = ofdma->dma_router;
+		chan->route_data = route_data;
+	} else {
+		ofdma->dma_router->route_free(ofdma->dma_router->dev,
+					      route_data);
+	}
+
+	/*
+	 * Need to put the node back since the ofdma->of_dma_route_allocate
+	 * has taken it for generating the new, translated dma_spec
+	 */
+	of_node_put(dma_spec_target.np);
+	return chan;
+}
+
 /**
  * of_dma_controller_register - Register a DMA controller to DT DMA helpers
  * @np:			device node of DMA controller
@@ -109,6 +153,51 @@ void of_dma_controller_free(struct device_node *np)
 }
 EXPORT_SYMBOL_GPL(of_dma_controller_free);
 
+/**
+ * of_dma_router_register - Register a DMA router to DT DMA helpers as a
+ *			    controller
+ * @np:				device node of DMA router
+ * @of_dma_route_allocate:	setup function for the router which need to
+ *				modify the dma_spec for the DMA controller to
+ *				use and to set up the requested route.
+ * @dma_router:			pointer to dma_router structure to be used when
+ *				the route need to be free up.
+ *
+ * Returns 0 on success or appropriate errno value on error.
+ *
+ * Allocated memory should be freed with appropriate of_dma_controller_free()
+ * call.
+ */
+int of_dma_router_register(struct device_node *np,
+			   void *(*of_dma_route_allocate)
+			   (struct of_phandle_args *, struct of_dma *),
+			   struct dma_router *dma_router)
+{
+	struct of_dma	*ofdma;
+
+	if (!np || !of_dma_route_allocate || !dma_router) {
+		pr_err("%s: not enough information provided\n", __func__);
+		return -EINVAL;
+	}
+
+	ofdma = kzalloc(sizeof(*ofdma), GFP_KERNEL);
+	if (!ofdma)
+		return -ENOMEM;
+
+	ofdma->of_node = np;
+	ofdma->of_dma_xlate = of_dma_router_xlate;
+	ofdma->of_dma_route_allocate = of_dma_route_allocate;
+	ofdma->dma_router = dma_router;
+
+	/* Now queue of_dma controller structure in list */
+	mutex_lock(&of_dma_lock);
+	list_add_tail(&ofdma->of_dma_controllers, &of_dma_list);
+	mutex_unlock(&of_dma_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(of_dma_router_register);
+
 /**
  * of_dma_match_channel - Check if a DMA specifier matches name
  * @np:		device node to look for DMA channels
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index ad419757241f..abf63ceabef9 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -221,6 +221,16 @@ struct dma_chan_percpu {
 	unsigned long bytes_transferred;
 };
 
+/**
+ * struct dma_router - DMA router structure
+ * @dev: pointer to the DMA router device
+ * @route_free: function to be called when the route can be disconnected
+ */
+struct dma_router {
+	struct device *dev;
+	void (*route_free)(struct device *dev, void *route_data);
+};
+
 /**
  * struct dma_chan - devices supply DMA channels, clients use them
  * @device: ptr to the dma device who supplies this channel, always !%NULL
@@ -232,6 +242,8 @@ struct dma_chan_percpu {
  * @local: per-cpu pointer to a struct dma_chan_percpu
  * @client_count: how many clients are using this channel
  * @table_count: number of appearances in the mem-to-mem allocation table
+ * @router: pointer to the DMA router structure
+ * @route_data: channel specific data for the router
  * @private: private data for certain client-channel associations
  */
 struct dma_chan {
@@ -247,6 +259,11 @@ struct dma_chan {
 	struct dma_chan_percpu __percpu *local;
 	int client_count;
 	int table_count;
+
+	/* DMA router */
+	struct dma_router *router;
+	void *route_data;
+
 	void *private;
 };
 
diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h
index 56bc026c143f..98ba7525929e 100644
--- a/include/linux/of_dma.h
+++ b/include/linux/of_dma.h
@@ -23,6 +23,9 @@ struct of_dma {
 	struct device_node	*of_node;
 	struct dma_chan		*(*of_dma_xlate)
 				(struct of_phandle_args *, struct of_dma *);
+	void			*(*of_dma_route_allocate)
+				(struct of_phandle_args *, struct of_dma *);
+	struct dma_router	*dma_router;
 	void			*of_dma_data;
 };
 
@@ -37,12 +40,20 @@ extern int of_dma_controller_register(struct device_node *np,
 		(struct of_phandle_args *, struct of_dma *),
 		void *data);
 extern void of_dma_controller_free(struct device_node *np);
+
+extern int of_dma_router_register(struct device_node *np,
+		void *(*of_dma_route_allocate)
+		(struct of_phandle_args *, struct of_dma *),
+		struct dma_router *dma_router);
+#define of_dma_router_free of_dma_controller_free
+
 extern struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
 						     const char *name);
 extern struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec,
 		struct of_dma *ofdma);
 extern struct dma_chan *of_dma_xlate_by_chan_id(struct of_phandle_args *dma_spec,
 		struct of_dma *ofdma);
+
 #else
 static inline int of_dma_controller_register(struct device_node *np,
 		struct dma_chan *(*of_dma_xlate)
@@ -56,6 +67,16 @@ static inline void of_dma_controller_free(struct device_node *np)
 {
 }
 
+static inline int of_dma_router_register(struct device_node *np,
+		void *(*of_dma_route_allocate)
+		(struct of_phandle_args *, struct of_dma *),
+		struct dma_router *dma_router)
+{
+	return -ENODEV;
+}
+
+#define of_dma_router_free of_dma_controller_free
+
 static inline struct dma_chan *of_dma_request_slave_channel(struct device_node *np,
 						     const char *name)
 {
-- 
cgit v1.2.3


From 4ae92bc77ac8e620f7c8d59b5882a4cb0d1c4ef1 Mon Sep 17 00:00:00 2001
From: Nicolas Schichan <nschichan@freebox.fr>
Date: Wed, 6 May 2015 16:12:27 +0200
Subject: net: filter: add a callback to allow classic post-verifier
 transformations

This is in preparation for use by the seccomp code, the rationale is
not to duplicate additional code within the seccomp layer, but instead,
have it abstracted and hidden within the classic BPF API.

As an interim step, this now also makes bpf_prepare_filter() visible
(not as exported symbol though), so that seccomp can reuse that code
path instead of reimplementing it.

Joint work with Daniel Borkmann.

Signed-off-by: Nicolas Schichan <nschichan@freebox.fr>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Kees Cook <keescook@chromium.org>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  6 ++++++
 net/core/filter.c      | 18 +++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index fa11b3a367be..91996247cb55 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -384,7 +384,13 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_detach_filter(struct sock *sk);
 
+typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
+				       unsigned int flen);
+
 int bpf_check_classic(const struct sock_filter *filter, unsigned int flen);
+struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
+				    bpf_aux_classic_check_t trans);
+
 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
 		  unsigned int len);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index bf831a85c315..e670494f1d83 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -988,7 +988,8 @@ out_err:
 	return ERR_PTR(err);
 }
 
-static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
+struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
+				    bpf_aux_classic_check_t trans)
 {
 	int err;
 
@@ -1001,6 +1002,17 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
 		return ERR_PTR(err);
 	}
 
+	/* There might be additional checks and transformations
+	 * needed on classic filters, f.e. in case of seccomp.
+	 */
+	if (trans) {
+		err = trans(fp->insns, fp->len);
+		if (err) {
+			__bpf_prog_release(fp);
+			return ERR_PTR(err);
+		}
+	}
+
 	/* Probe if we can JIT compile the filter and if so, do
 	 * the compilation of the filter.
 	 */
@@ -1050,7 +1062,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
 	/* bpf_prepare_filter() already takes care of freeing
 	 * memory in case something goes wrong.
 	 */
-	fp = bpf_prepare_filter(fp);
+	fp = bpf_prepare_filter(fp, NULL);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
@@ -1135,7 +1147,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	/* bpf_prepare_filter() already takes care of freeing
 	 * memory in case something goes wrong.
 	 */
-	prog = bpf_prepare_filter(prog);
+	prog = bpf_prepare_filter(prog, NULL);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-- 
cgit v1.2.3


From d9e12f42e58da475379b9080708b94f2095904af Mon Sep 17 00:00:00 2001
From: Nicolas Schichan <nschichan@freebox.fr>
Date: Wed, 6 May 2015 16:12:28 +0200
Subject: seccomp: simplify seccomp_prepare_filter and reuse bpf_prepare_filter

Remove the calls to bpf_check_classic(), bpf_convert_filter() and
bpf_migrate_runtime() and let bpf_prepare_filter() take care of that
instead.

seccomp_check_filter() is passed to bpf_prepare_filter() so that it
gets called from there, after bpf_check_classic().

We can now remove exposure of two internal classic BPF functions
previously used by seccomp. The export of bpf_check_classic() symbol,
previously known as sk_chk_filter(), was there since pre git times,
and no in-tree module was using it, therefore remove it.

Joint work with Daniel Borkmann.

Signed-off-by: Nicolas Schichan <nschichan@freebox.fr>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Kees Cook <keescook@chromium.org>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  4 ---
 kernel/seccomp.c       | 68 ++++++++++++++++----------------------------------
 net/core/filter.c      |  8 +++---
 3 files changed, 26 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 91996247cb55..0dcb44bcfc5f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -363,9 +363,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb);
 void bpf_prog_select_runtime(struct bpf_prog *fp);
 void bpf_prog_free(struct bpf_prog *fp);
 
-int bpf_convert_filter(struct sock_filter *prog, int len,
-		       struct bpf_insn *new_prog, int *new_len);
-
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
 struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags);
@@ -387,7 +384,6 @@ int sk_detach_filter(struct sock *sk);
 typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
 				       unsigned int flen);
 
-int bpf_check_classic(const struct sock_filter *filter, unsigned int flen);
 struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
 				    bpf_aux_classic_check_t trans);
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4f44028943e6..93d40f7f3683 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,15 +347,14 @@ static inline void seccomp_sync_threads(void)
 static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
 	struct seccomp_filter *filter;
-	unsigned long fp_size;
-	struct sock_filter *fp;
-	int new_len;
-	long ret;
+	struct bpf_prog *prog;
+	unsigned long fsize;
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
+
 	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
-	fp_size = fprog->len * sizeof(struct sock_filter);
+	fsize = bpf_classic_proglen(fprog);
 
 	/*
 	 * Installing a seccomp filter requires that the task has
@@ -368,60 +367,37 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 				     CAP_SYS_ADMIN) != 0)
 		return ERR_PTR(-EACCES);
 
-	fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
-	if (!fp)
+	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+	if (!prog)
 		return ERR_PTR(-ENOMEM);
 
 	/* Copy the instructions from fprog. */
-	ret = -EFAULT;
-	if (copy_from_user(fp, fprog->filter, fp_size))
-		goto free_prog;
-
-	/* Check and rewrite the fprog via the skb checker */
-	ret = bpf_check_classic(fp, fprog->len);
-	if (ret)
-		goto free_prog;
+	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
+		__bpf_prog_free(prog);
+		return ERR_PTR(-EFAULT);
+	}
 
-	/* Check and rewrite the fprog for seccomp use */
-	ret = seccomp_check_filter(fp, fprog->len);
-	if (ret)
-		goto free_prog;
+	prog->len = fprog->len;
 
-	/* Convert 'sock_filter' insns to 'bpf_insn' insns */
-	ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
-	if (ret)
-		goto free_prog;
+	/* bpf_prepare_filter() already takes care of freeing
+	 * memory in case something goes wrong.
+	 */
+	prog = bpf_prepare_filter(prog, seccomp_check_filter);
+	if (IS_ERR(prog))
+		return ERR_CAST(prog);
 
 	/* Allocate a new seccomp_filter */
-	ret = -ENOMEM;
 	filter = kzalloc(sizeof(struct seccomp_filter),
 			 GFP_KERNEL|__GFP_NOWARN);
-	if (!filter)
-		goto free_prog;
-
-	filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
-	if (!filter->prog)
-		goto free_filter;
-
-	ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
-	if (ret)
-		goto free_filter_prog;
+	if (!filter) {
+		bpf_prog_destroy(prog);
+		return ERR_PTR(-ENOMEM);
+	}
 
-	kfree(fp);
+	filter->prog = prog;
 	atomic_set(&filter->usage, 1);
-	filter->prog->len = new_len;
-
-	bpf_prog_select_runtime(filter->prog);
 
 	return filter;
-
-free_filter_prog:
-	__bpf_prog_free(filter->prog);
-free_filter:
-	kfree(filter);
-free_prog:
-	kfree(fp);
-	return ERR_PTR(ret);
 }
 
 /**
diff --git a/net/core/filter.c b/net/core/filter.c
index e670494f1d83..f887084740cd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -355,8 +355,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
  * for socket filters: ctx == 'struct sk_buff *', for seccomp:
  * ctx == 'struct seccomp_data *'.
  */
-int bpf_convert_filter(struct sock_filter *prog, int len,
-		       struct bpf_insn *new_prog, int *new_len)
+static int bpf_convert_filter(struct sock_filter *prog, int len,
+			      struct bpf_insn *new_prog, int *new_len)
 {
 	int new_flen = 0, pass = 0, target, i;
 	struct bpf_insn *new_insn;
@@ -751,7 +751,8 @@ static bool chk_code_allowed(u16 code_to_probe)
  *
  * Returns 0 if the rule set is legal or -EINVAL if not.
  */
-int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
+static int bpf_check_classic(const struct sock_filter *filter,
+			     unsigned int flen)
 {
 	bool anc_found;
 	int pc;
@@ -825,7 +826,6 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
 
 	return -EINVAL;
 }
-EXPORT_SYMBOL(bpf_check_classic);
 
 static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
 				      const struct sock_fprog *fprog)
-- 
cgit v1.2.3


From ac67eb2c5347bd9976308c0e0cf1d9e7ca690342 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 6 May 2015 16:12:30 +0200
Subject: seccomp, filter: add and use bpf_prog_create_from_user from seccomp

Seccomp has always been a special candidate when it comes to preparation
of its filters in seccomp_prepare_filter(). Due to the extra checks and
filter rewrite it partially duplicates code and has BPF internals exposed.

This patch adds a generic API inside the BPF code code that seccomp can use
and thus keep it's filter preparation code minimal and better maintainable.
The other side-effect is that now classic JITs can add seccomp support as
well by only providing a BPF_LDX | BPF_W | BPF_ABS translation.

Tested with seccomp and BPF test suites.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Nicolas Schichan <nschichan@freebox.fr>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Kees Cook <keescook@chromium.org>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 12 +++++-------
 kernel/seccomp.c       | 42 ++++++++++++-----------------------------
 net/core/filter.c      | 51 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 66 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0dcb44bcfc5f..3c03a6085b82 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -374,19 +374,17 @@ static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
 	__bpf_prog_free(fp);
 }
 
+typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
+				       unsigned int flen);
+
 int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
+int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
+			      bpf_aux_classic_check_t trans);
 void bpf_prog_destroy(struct bpf_prog *fp);
 
 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 int sk_attach_bpf(u32 ufd, struct sock *sk);
 int sk_detach_filter(struct sock *sk);
-
-typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
-				       unsigned int flen);
-
-struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
-				    bpf_aux_classic_check_t trans);
-
 int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
 		  unsigned int len);
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 93d40f7f3683..245df6b32b81 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -346,15 +346,13 @@ static inline void seccomp_sync_threads(void)
  */
 static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
-	struct seccomp_filter *filter;
-	struct bpf_prog *prog;
-	unsigned long fsize;
+	struct seccomp_filter *sfilter;
+	int ret;
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
 
 	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
-	fsize = bpf_classic_proglen(fprog);
 
 	/*
 	 * Installing a seccomp filter requires that the task has
@@ -367,37 +365,21 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 				     CAP_SYS_ADMIN) != 0)
 		return ERR_PTR(-EACCES);
 
-	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
-	if (!prog)
-		return ERR_PTR(-ENOMEM);
-
-	/* Copy the instructions from fprog. */
-	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
-		__bpf_prog_free(prog);
-		return ERR_PTR(-EFAULT);
-	}
-
-	prog->len = fprog->len;
-
-	/* bpf_prepare_filter() already takes care of freeing
-	 * memory in case something goes wrong.
-	 */
-	prog = bpf_prepare_filter(prog, seccomp_check_filter);
-	if (IS_ERR(prog))
-		return ERR_CAST(prog);
-
 	/* Allocate a new seccomp_filter */
-	filter = kzalloc(sizeof(struct seccomp_filter),
-			 GFP_KERNEL|__GFP_NOWARN);
-	if (!filter) {
-		bpf_prog_destroy(prog);
+	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
+	if (!sfilter)
 		return ERR_PTR(-ENOMEM);
+
+	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
+					seccomp_check_filter);
+	if (ret < 0) {
+		kfree(sfilter);
+		return ERR_PTR(ret);
 	}
 
-	filter->prog = prog;
-	atomic_set(&filter->usage, 1);
+	atomic_set(&sfilter->usage, 1);
 
-	return filter;
+	return sfilter;
 }
 
 /**
diff --git a/net/core/filter.c b/net/core/filter.c
index 45c015d6b974..a831f193e2c7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -991,8 +991,8 @@ out_err:
 	return ERR_PTR(err);
 }
 
-struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
-				    bpf_aux_classic_check_t trans)
+static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
+					   bpf_aux_classic_check_t trans)
 {
 	int err;
 
@@ -1074,6 +1074,53 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_create);
 
+/**
+ *	bpf_prog_create_from_user - create an unattached filter from user buffer
+ *	@pfp: the unattached filter that is created
+ *	@fprog: the filter program
+ *	@trans: post-classic verifier transformation handler
+ *
+ * This function effectively does the same as bpf_prog_create(), only
+ * that it builds up its insns buffer from user space provided buffer.
+ * It also allows for passing a bpf_aux_classic_check_t handler.
+ */
+int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
+			      bpf_aux_classic_check_t trans)
+{
+	unsigned int fsize = bpf_classic_proglen(fprog);
+	struct bpf_prog *fp;
+
+	/* Make sure new filter is there and in the right amounts. */
+	if (fprog->filter == NULL)
+		return -EINVAL;
+
+	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
+	if (!fp)
+		return -ENOMEM;
+
+	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+		__bpf_prog_free(fp);
+		return -EFAULT;
+	}
+
+	fp->len = fprog->len;
+	/* Since unattached filters are not copied back to user
+	 * space through sk_get_filter(), we do not need to hold
+	 * a copy here, and can spare us the work.
+	 */
+	fp->orig_prog = NULL;
+
+	/* bpf_prepare_filter() already takes care of freeing
+	 * memory in case something goes wrong.
+	 */
+	fp = bpf_prepare_filter(fp, trans);
+	if (IS_ERR(fp))
+		return PTR_ERR(fp);
+
+	*pfp = fp;
+	return 0;
+}
+
 void bpf_prog_destroy(struct bpf_prog *fp)
 {
 	__bpf_prog_release(fp);
-- 
cgit v1.2.3


From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 7 May 2015 11:02:53 +0200
Subject: netlink: allow to listen "all" netns

More accurately, listen all netns that have a nsid assigned into the netns
where the netlink socket is opened.
For this purpose, a netlink socket option is added:
NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this
socket will receive netlink notifications from all netns that have a nsid
assigned into the netns where the socket has been opened. The nsid is sent
to userland via an anscillary data.

With this patch, a daemon needs only one socket to listen many netns. This
is useful when the number of netns is high.

Because 0 is a valid value for a nsid, the field nsid_is_set indicates if
the field nsid is valid or not. skb->cb is initialized to 0 on skb
allocation, thus we are sure that we will never send a nsid 0 by error to
the userland.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h      |  2 ++
 include/net/net_namespace.h  |  2 ++
 include/uapi/linux/netlink.h |  1 +
 net/core/net_namespace.c     | 10 ++++++++-
 net/netlink/af_netlink.c     | 52 +++++++++++++++++++++++++++++++++++++++-----
 5 files changed, 61 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 6835c1279df7..9120edb650a0 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -28,6 +28,8 @@ struct netlink_skb_parms {
 	__u32			dst_group;
 	__u32			flags;
 	struct sock		*sk;
+	bool			nsid_is_set;
+	int			nsid;
 };
 
 #define NETLINK_CB(skb)		(*(struct netlink_skb_parms*)&((skb)->cb))
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 6d1e2eae32fb..3f850acc844e 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -272,6 +272,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
 #endif
 
 int peernet2id_alloc(struct net *net, struct net *peer);
+int peernet2id(struct net *net, struct net *peer);
+bool peernet_has_id(struct net *net, struct net *peer);
 struct net *get_net_ns_by_id(struct net *net, int id);
 
 struct pernet_operations {
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 1a85940f8ab7..3e34b7d702f8 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -108,6 +108,7 @@ struct nlmsgerr {
 #define NETLINK_NO_ENOBUFS	5
 #define NETLINK_RX_RING		6
 #define NETLINK_TX_RING		7
+#define NETLINK_LISTEN_ALL_NSID	8
 
 struct nl_pktinfo {
 	__u32	group;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index ae5008b097de..a665bf490c88 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -229,7 +229,7 @@ int peernet2id_alloc(struct net *net, struct net *peer)
 EXPORT_SYMBOL(peernet2id_alloc);
 
 /* This function returns, if assigned, the id of a peer netns. */
-static int peernet2id(struct net *net, struct net *peer)
+int peernet2id(struct net *net, struct net *peer)
 {
 	unsigned long flags;
 	int id;
@@ -240,6 +240,14 @@ static int peernet2id(struct net *net, struct net *peer)
 	return id;
 }
 
+/* This function returns true is the peer netns has an id assigned into the
+ * current netns.
+ */
+bool peernet_has_id(struct net *net, struct net *peer)
+{
+	return peernet2id(net, peer) >= 0;
+}
+
 struct net *get_net_ns_by_id(struct net *net, int id)
 {
 	unsigned long flags;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index bf7f56d7a9aa..a5fff75accf8 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -83,6 +83,7 @@ struct listeners {
 #define NETLINK_F_RECV_PKTINFO		0x2
 #define NETLINK_F_BROADCAST_SEND_ERROR	0x4
 #define NETLINK_F_RECV_NO_ENOBUFS	0x8
+#define NETLINK_F_LISTEN_ALL_NSID	0x10
 
 static inline int netlink_is_kernel(struct sock *sk)
 {
@@ -1932,8 +1933,17 @@ static void do_one_broadcast(struct sock *sk,
 	    !test_bit(p->group - 1, nlk->groups))
 		return;
 
-	if (!net_eq(sock_net(sk), p->net))
-		return;
+	if (!net_eq(sock_net(sk), p->net)) {
+		if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID))
+			return;
+
+		if (!peernet_has_id(sock_net(sk), p->net))
+			return;
+
+		if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
+				     CAP_NET_BROADCAST))
+			return;
+	}
 
 	if (p->failure) {
 		netlink_overrun(sk);
@@ -1959,13 +1969,22 @@ static void do_one_broadcast(struct sock *sk,
 		p->failure = 1;
 		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
 			p->delivery_failure = 1;
-	} else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
+		goto out;
+	}
+	if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
 		kfree_skb(p->skb2);
 		p->skb2 = NULL;
-	} else if (sk_filter(sk, p->skb2)) {
+		goto out;
+	}
+	if (sk_filter(sk, p->skb2)) {
 		kfree_skb(p->skb2);
 		p->skb2 = NULL;
-	} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
+		goto out;
+	}
+	NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
+	NETLINK_CB(p->skb2).nsid_is_set = true;
+	val = netlink_broadcast_deliver(sk, p->skb2);
+	if (val < 0) {
 		netlink_overrun(sk);
 		if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
 			p->delivery_failure = 1;
@@ -1974,6 +1993,7 @@ static void do_one_broadcast(struct sock *sk,
 		p->delivered = 1;
 		p->skb2 = NULL;
 	}
+out:
 	sock_put(sk);
 }
 
@@ -2202,6 +2222,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		break;
 	}
 #endif /* CONFIG_NETLINK_MMAP */
+	case NETLINK_LISTEN_ALL_NSID:
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
+			return -EPERM;
+
+		if (val)
+			nlk->flags |= NETLINK_F_LISTEN_ALL_NSID;
+		else
+			nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID;
+		err = 0;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -2268,6 +2298,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
 	put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
 }
 
+static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
+					 struct sk_buff *skb)
+{
+	if (!NETLINK_CB(skb).nsid_is_set)
+		return;
+
+	put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int),
+		 &NETLINK_CB(skb).nsid);
+}
+
 static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 {
 	struct sock *sk = sock->sk;
@@ -2421,6 +2461,8 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	if (nlk->flags & NETLINK_F_RECV_PKTINFO)
 		netlink_cmsg_recv_pktinfo(msg, skb);
+	if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
+		netlink_cmsg_listen_all_nsid(sk, msg, skb);
 
 	memset(&scm, 0, sizeof(scm));
 	scm.creds = *NETLINK_CREDS(skb);
-- 
cgit v1.2.3


From 920ce39f6c204d4ce4d8acebe7522f0dfa95f662 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Fri, 8 May 2015 14:31:50 -0700
Subject: sched, timer: Fix documentation for 'struct thread_group_cputimer'

Fix the docbook build bug reported by Fengguang Wu.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Jason Low <jason.low2@hp.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman.Long@hp.com
Cc: aswin@hp.com
Cc: bp@alien8.de
Cc: dave@stgolabs.net
Cc: fweisbec@gmail.com
Cc: mgorman@suse.de
Cc: oleg@redhat.com
Cc: paulmck@linux.vnet.ibm.com
Cc: preeti@linux.vnet.ibm.com
Cc: riel@redhat.com
Cc: rostedt@goodmis.org
Cc: scott.norton@hp.com
Cc: torvalds@linux-foundation.org
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/1431120710.5136.12.camel@j-VirtualBox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 254d88e80f65..0eceeec5a01a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -606,10 +606,9 @@ struct task_cputime_atomic {
 
 /**
  * struct thread_group_cputimer - thread group interval timer counts
- * @cputime:		thread group interval timers.
+ * @cputime_atomic:	atomic thread group interval timers.
  * @running:		non-zero when there are timers running and
  * 			@cputime receives updates.
- * @lock:		lock for fields in this struct.
  *
  * This structure contains the version of task_cputime, above, that is
  * used for thread group CPU timer calculations.
-- 
cgit v1.2.3


From 34ef33f7da6b00900d3a896d33522a035a930245 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 28 Apr 2015 14:04:07 +0200
Subject: usb: phy: Remove the phy-rcar-gen2-usb driver

The phy-rcar-gen2-usb driver, which supports legacy platform data only,
is no longer used since commit a483dcbfa21f919c ("ARM: shmobile: lager:
Remove legacy board support").

This driver was superseded by the DT-only phy-rcar-gen2 driver, which
was introduced in commit 1233f59f745b237d ("phy: Renesas R-Car Gen2 PHY
driver").

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/phy/Kconfig                         |  13 --
 drivers/usb/phy/Makefile                        |   1 -
 drivers/usb/phy/phy-rcar-gen2-usb.c             | 246 ------------------------
 include/linux/platform_data/usb-rcar-gen2-phy.h |  22 ---
 4 files changed, 282 deletions(-)
 delete mode 100644 drivers/usb/phy/phy-rcar-gen2-usb.c
 delete mode 100644 include/linux/platform_data/usb-rcar-gen2-phy.h

(limited to 'include/linux')

diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig
index 2175678e674e..3cd3bee54ca6 100644
--- a/drivers/usb/phy/Kconfig
+++ b/drivers/usb/phy/Kconfig
@@ -186,19 +186,6 @@ config USB_RCAR_PHY
 	  To compile this driver as a module, choose M here: the
 	  module will be called phy-rcar-usb.
 
-config USB_RCAR_GEN2_PHY
-	tristate "Renesas R-Car Gen2 USB PHY support"
-	depends on ARCH_R8A7790 || ARCH_R8A7791 || COMPILE_TEST
-	select USB_PHY
-	help
-	  Say Y here to add support for the Renesas R-Car Gen2 USB PHY driver.
-	  It is typically used to control internal USB PHY for USBHS,
-	  and to configure shared USB channels 0 and 2.
-	  This driver supports R8A7790 and R8A7791.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called phy-rcar-gen2-usb.
-
 config USB_ULPI
 	bool "Generic ULPI Transceiver Driver"
 	depends on ARM || ARM64
diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile
index 75f2bba58c84..e36ab1d46d8b 100644
--- a/drivers/usb/phy/Makefile
+++ b/drivers/usb/phy/Makefile
@@ -23,7 +23,6 @@ obj-$(CONFIG_USB_MSM_OTG)		+= phy-msm-usb.o
 obj-$(CONFIG_USB_MV_OTG)		+= phy-mv-usb.o
 obj-$(CONFIG_USB_MXS_PHY)		+= phy-mxs-usb.o
 obj-$(CONFIG_USB_RCAR_PHY)		+= phy-rcar-usb.o
-obj-$(CONFIG_USB_RCAR_GEN2_PHY)		+= phy-rcar-gen2-usb.o
 obj-$(CONFIG_USB_ULPI)			+= phy-ulpi.o
 obj-$(CONFIG_USB_ULPI_VIEWPORT)		+= phy-ulpi-viewport.o
 obj-$(CONFIG_KEYSTONE_USB_PHY)		+= phy-keystone.o
diff --git a/drivers/usb/phy/phy-rcar-gen2-usb.c b/drivers/usb/phy/phy-rcar-gen2-usb.c
deleted file mode 100644
index f81800b6562a..000000000000
--- a/drivers/usb/phy/phy-rcar-gen2-usb.c
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Renesas R-Car Gen2 USB phy driver
- *
- * Copyright (C) 2013 Renesas Solutions Corp.
- * Copyright (C) 2013 Cogent Embedded, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/clk.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <linux/module.h>
-#include <linux/platform_data/usb-rcar-gen2-phy.h>
-#include <linux/platform_device.h>
-#include <linux/spinlock.h>
-#include <linux/usb/otg.h>
-
-struct rcar_gen2_usb_phy_priv {
-	struct usb_phy phy;
-	void __iomem *base;
-	struct clk *clk;
-	spinlock_t lock;
-	int usecount;
-	u32 ugctrl2;
-};
-
-#define usb_phy_to_priv(p) container_of(p, struct rcar_gen2_usb_phy_priv, phy)
-
-/* Low Power Status register */
-#define USBHS_LPSTS_REG			0x02
-#define USBHS_LPSTS_SUSPM		(1 << 14)
-
-/* USB General control register */
-#define USBHS_UGCTRL_REG		0x80
-#define USBHS_UGCTRL_CONNECT		(1 << 2)
-#define USBHS_UGCTRL_PLLRESET		(1 << 0)
-
-/* USB General control register 2 */
-#define USBHS_UGCTRL2_REG		0x84
-#define USBHS_UGCTRL2_USB0_PCI		(1 << 4)
-#define USBHS_UGCTRL2_USB0_HS		(3 << 4)
-#define USBHS_UGCTRL2_USB2_PCI		(0 << 31)
-#define USBHS_UGCTRL2_USB2_SS		(1 << 31)
-
-/* USB General status register */
-#define USBHS_UGSTS_REG			0x88
-#define USBHS_UGSTS_LOCK		(1 << 8)
-
-/* Enable USBHS internal phy */
-static int __rcar_gen2_usbhs_phy_enable(void __iomem *base)
-{
-	u32 val;
-	int i;
-
-	/* USBHS PHY power on */
-	val = ioread32(base + USBHS_UGCTRL_REG);
-	val &= ~USBHS_UGCTRL_PLLRESET;
-	iowrite32(val, base + USBHS_UGCTRL_REG);
-
-	val = ioread16(base + USBHS_LPSTS_REG);
-	val |= USBHS_LPSTS_SUSPM;
-	iowrite16(val, base + USBHS_LPSTS_REG);
-
-	for (i = 0; i < 20; i++) {
-		val = ioread32(base + USBHS_UGSTS_REG);
-		if ((val & USBHS_UGSTS_LOCK) == USBHS_UGSTS_LOCK) {
-			val = ioread32(base + USBHS_UGCTRL_REG);
-			val |= USBHS_UGCTRL_CONNECT;
-			iowrite32(val, base + USBHS_UGCTRL_REG);
-			return 0;
-		}
-		udelay(1);
-	}
-
-	/* Timed out waiting for the PLL lock */
-	return -ETIMEDOUT;
-}
-
-/* Disable USBHS internal phy */
-static int __rcar_gen2_usbhs_phy_disable(void __iomem *base)
-{
-	u32 val;
-
-	/* USBHS PHY power off */
-	val = ioread32(base + USBHS_UGCTRL_REG);
-	val &= ~USBHS_UGCTRL_CONNECT;
-	iowrite32(val, base + USBHS_UGCTRL_REG);
-
-	val = ioread16(base + USBHS_LPSTS_REG);
-	val &= ~USBHS_LPSTS_SUSPM;
-	iowrite16(val, base + USBHS_LPSTS_REG);
-
-	val = ioread32(base + USBHS_UGCTRL_REG);
-	val |= USBHS_UGCTRL_PLLRESET;
-	iowrite32(val, base + USBHS_UGCTRL_REG);
-	return 0;
-}
-
-/* Setup USB channels */
-static void __rcar_gen2_usb_phy_init(struct rcar_gen2_usb_phy_priv *priv)
-{
-	u32 val;
-
-	clk_prepare_enable(priv->clk);
-
-	/* Set USB channels in the USBHS UGCTRL2 register */
-	val = ioread32(priv->base + USBHS_UGCTRL2_REG);
-	val &= ~(USBHS_UGCTRL2_USB0_HS | USBHS_UGCTRL2_USB2_SS);
-	val |= priv->ugctrl2;
-	iowrite32(val, priv->base + USBHS_UGCTRL2_REG);
-}
-
-/* Shutdown USB channels */
-static void __rcar_gen2_usb_phy_shutdown(struct rcar_gen2_usb_phy_priv *priv)
-{
-	__rcar_gen2_usbhs_phy_disable(priv->base);
-	clk_disable_unprepare(priv->clk);
-}
-
-static int rcar_gen2_usb_phy_set_suspend(struct usb_phy *phy, int suspend)
-{
-	struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy);
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&priv->lock, flags);
-	retval = suspend ? __rcar_gen2_usbhs_phy_disable(priv->base) :
-			   __rcar_gen2_usbhs_phy_enable(priv->base);
-	spin_unlock_irqrestore(&priv->lock, flags);
-	return retval;
-}
-
-static int rcar_gen2_usb_phy_init(struct usb_phy *phy)
-{
-	struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy);
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->lock, flags);
-	/*
-	 * Enable the clock and setup USB channels
-	 * if it's the first user
-	 */
-	if (!priv->usecount++)
-		__rcar_gen2_usb_phy_init(priv);
-	spin_unlock_irqrestore(&priv->lock, flags);
-	return 0;
-}
-
-static void rcar_gen2_usb_phy_shutdown(struct usb_phy *phy)
-{
-	struct rcar_gen2_usb_phy_priv *priv = usb_phy_to_priv(phy);
-	unsigned long flags;
-
-	spin_lock_irqsave(&priv->lock, flags);
-	if (!priv->usecount) {
-		dev_warn(phy->dev, "Trying to disable phy with 0 usecount\n");
-		goto out;
-	}
-
-	/* Disable everything if it's the last user */
-	if (!--priv->usecount)
-		__rcar_gen2_usb_phy_shutdown(priv);
-out:
-	spin_unlock_irqrestore(&priv->lock, flags);
-}
-
-static int rcar_gen2_usb_phy_probe(struct platform_device *pdev)
-{
-	struct device *dev = &pdev->dev;
-	struct rcar_gen2_phy_platform_data *pdata;
-	struct rcar_gen2_usb_phy_priv *priv;
-	struct resource *res;
-	void __iomem *base;
-	struct clk *clk;
-	int retval;
-
-	pdata = dev_get_platdata(dev);
-	if (!pdata) {
-		dev_err(dev, "No platform data\n");
-		return -EINVAL;
-	}
-
-	clk = devm_clk_get(dev, "usbhs");
-	if (IS_ERR(clk)) {
-		dev_err(dev, "Can't get the clock\n");
-		return PTR_ERR(clk);
-	}
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	base = devm_ioremap_resource(dev, res);
-	if (IS_ERR(base))
-		return PTR_ERR(base);
-
-	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
-	if (!priv)
-		return -ENOMEM;
-
-	spin_lock_init(&priv->lock);
-	priv->clk = clk;
-	priv->base = base;
-	priv->ugctrl2 = pdata->chan0_pci ?
-			USBHS_UGCTRL2_USB0_PCI : USBHS_UGCTRL2_USB0_HS;
-	priv->ugctrl2 |= pdata->chan2_pci ?
-			USBHS_UGCTRL2_USB2_PCI : USBHS_UGCTRL2_USB2_SS;
-	priv->phy.dev = dev;
-	priv->phy.label = dev_name(dev);
-	priv->phy.init = rcar_gen2_usb_phy_init;
-	priv->phy.shutdown = rcar_gen2_usb_phy_shutdown;
-	priv->phy.set_suspend = rcar_gen2_usb_phy_set_suspend;
-
-	retval = usb_add_phy_dev(&priv->phy);
-	if (retval < 0) {
-		dev_err(dev, "Failed to add USB phy\n");
-		return retval;
-	}
-
-	platform_set_drvdata(pdev, priv);
-
-	return retval;
-}
-
-static int rcar_gen2_usb_phy_remove(struct platform_device *pdev)
-{
-	struct rcar_gen2_usb_phy_priv *priv = platform_get_drvdata(pdev);
-
-	usb_remove_phy(&priv->phy);
-
-	return 0;
-}
-
-static struct platform_driver rcar_gen2_usb_phy_driver = {
-	.driver = {
-		.name = "usb_phy_rcar_gen2",
-	},
-	.probe = rcar_gen2_usb_phy_probe,
-	.remove = rcar_gen2_usb_phy_remove,
-};
-
-module_platform_driver(rcar_gen2_usb_phy_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Renesas R-Car Gen2 USB phy");
-MODULE_AUTHOR("Valentine Barshak <valentine.barshak@cogentembedded.com>");
diff --git a/include/linux/platform_data/usb-rcar-gen2-phy.h b/include/linux/platform_data/usb-rcar-gen2-phy.h
deleted file mode 100644
index dd3ba46c0d90..000000000000
--- a/include/linux/platform_data/usb-rcar-gen2-phy.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2013 Renesas Solutions Corp.
- * Copyright (C) 2013 Cogent Embedded, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __USB_RCAR_GEN2_PHY_H
-#define __USB_RCAR_GEN2_PHY_H
-
-#include <linux/types.h>
-
-struct rcar_gen2_phy_platform_data {
-	/* USB channel 0 configuration */
-	bool chan0_pci:1;	/* true: PCI USB host 0, false: USBHS */
-	/* USB channel 2 configuration */
-	bool chan2_pci:1;	/* true: PCI USB host 2, false: USBSS */
-};
-
-#endif
-- 
cgit v1.2.3


From 1c5841e832e2d7563c31de4946118e78baf573a3 Mon Sep 17 00:00:00 2001
From: Eddie Huang <eddie.huang@mediatek.com>
Date: Tue, 28 Apr 2015 21:40:32 +0800
Subject: tty: serial: 8250: export early_serial8250_setup function

8250-like uart driver may call early_serial8250_setup to
reuse 8250_early.c character output function.

Signed-off-by: Eddie Huang <eddie.huang@mediatek.com>
Tested-by: Sascha Hauer <s.hauer@pengutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_early.c | 2 +-
 include/linux/serial_8250.h          | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c
index 6c0fd8b9d1c3..771dda29a0f8 100644
--- a/drivers/tty/serial/8250/8250_early.c
+++ b/drivers/tty/serial/8250/8250_early.c
@@ -131,7 +131,7 @@ static void __init init_port(struct earlycon_device *device)
 	serial8250_early_out(port, UART_LCR, c & ~UART_LCR_DLAB);
 }
 
-static int __init early_serial8250_setup(struct earlycon_device *device,
+int __init early_serial8250_setup(struct earlycon_device *device,
 					 const char *options)
 {
 	if (!(device->port.membase || device->port.iobase))
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 78097e7a330a..f0c68d88b6f4 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -137,6 +137,8 @@ extern int early_serial_setup(struct uart_port *port);
 
 extern unsigned int serial8250_early_in(struct uart_port *port, int offset);
 extern void serial8250_early_out(struct uart_port *port, int offset, int value);
+extern int early_serial8250_setup(struct earlycon_device *device,
+					 const char *options);
 extern void serial8250_do_set_termios(struct uart_port *port,
 		struct ktermios *termios, struct ktermios *old);
 extern int serial8250_do_startup(struct uart_port *port);
-- 
cgit v1.2.3


From c27ffc1080179c3f3b85e1e194fa61f1c9923b62 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 30 Apr 2015 18:21:25 +0200
Subject: serial: sh-sci: Move private definitions to private header file

Move private register definitions and enums from the public
<linux/serial_sci.h> header file to the driver private "sh-sci.h" header
file.

The common Serial Control Register definitions are left in the public
header file, as they're needed to fill in plat_sci_port.scscr on legacy
systems not using DT.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/sh-sci.h | 83 +++++++++++++++++++++++++++++++++++++++++----
 include/linux/serial_sci.h  | 67 +-----------------------------------
 2 files changed, 77 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/sh-sci.h b/drivers/tty/serial/sh-sci.h
index 350717404507..238ad0c5584b 100644
--- a/drivers/tty/serial/sh-sci.h
+++ b/drivers/tty/serial/sh-sci.h
@@ -2,6 +2,82 @@
 #include <linux/io.h>
 #include <linux/gpio.h>
 
+#define SCI_MAJOR		204
+#define SCI_MINOR_START		8
+
+
+/*
+ * SCI register subset common for all port types.
+ * Not all registers will exist on all parts.
+ */
+enum {
+	SCSMR,				/* Serial Mode Register */
+	SCBRR,				/* Bit Rate Register */
+	SCSCR,				/* Serial Control Register */
+	SCxSR,				/* Serial Status Register */
+	SCFCR,				/* FIFO Control Register */
+	SCFDR,				/* FIFO Data Count Register */
+	SCxTDR,				/* Transmit (FIFO) Data Register */
+	SCxRDR,				/* Receive (FIFO) Data Register */
+	SCLSR,				/* Line Status Register */
+	SCTFDR,				/* Transmit FIFO Data Count Register */
+	SCRFDR,				/* Receive FIFO Data Count Register */
+	SCSPTR,				/* Serial Port Register */
+	HSSRR,				/* Sampling Rate Register */
+
+	SCIx_NR_REGS,
+};
+
+
+/* SCSMR (Serial Mode Register) */
+#define SCSMR_CHR	(1 << 6)	/* 7-bit Character Length */
+#define SCSMR_PE	(1 << 5)	/* Parity Enable */
+#define SCSMR_ODD	(1 << 4)	/* Odd Parity */
+#define SCSMR_STOP	(1 << 3)	/* Stop Bit Length */
+#define SCSMR_CKS	0x0003		/* Clock Select */
+
+/* Serial Control Register, SCIFA/SCIFB only bits */
+#define SCSCR_TDRQE	(1 << 15)	/* Tx Data Transfer Request Enable */
+#define SCSCR_RDRQE	(1 << 14)	/* Rx Data Transfer Request Enable */
+
+/* SCxSR (Serial Status Register) on SCI */
+#define SCI_TDRE  0x80			/* Transmit Data Register Empty */
+#define SCI_RDRF  0x40			/* Receive Data Register Full */
+#define SCI_ORER  0x20			/* Overrun Error */
+#define SCI_FER   0x10			/* Framing Error */
+#define SCI_PER   0x08			/* Parity Error */
+#define SCI_TEND  0x04			/* Transmit End */
+
+#define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER)
+
+/* SCxSR (Serial Status Register) on SCIF, HSCIF */
+#define SCIF_ER    0x0080		/* Receive Error */
+#define SCIF_TEND  0x0040		/* Transmission End */
+#define SCIF_TDFE  0x0020		/* Transmit FIFO Data Empty */
+#define SCIF_BRK   0x0010		/* Break Detect */
+#define SCIF_FER   0x0008		/* Framing Error */
+#define SCIF_PER   0x0004		/* Parity Error */
+#define SCIF_RDF   0x0002		/* Receive FIFO Data Full */
+#define SCIF_DR    0x0001		/* Receive Data Ready */
+
+#define SCIF_DEFAULT_ERROR_MASK (SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK)
+
+/* SCFCR (FIFO Control Register) */
+#define SCFCR_MCE	0x0008
+#define SCFCR_TFRST	0x0004
+#define SCFCR_RFRST	0x0002
+#define SCFCR_LOOP	(1 << 0)	/* Loopback Test */
+
+/* SCSPTR (Serial Port Register), optional */
+#define SCSPTR_RTSIO	(1 << 7)	/* Serial Port RTS Pin Input/Output */
+#define SCSPTR_CTSIO	(1 << 5)	/* Serial Port CTS Pin Input/Output */
+#define SCSPTR_SPB2IO	(1 << 1)	/* Serial Port Break Input/Output */
+#define SCSPTR_SPB2DT	(1 << 0)	/* Serial Port Break Data */
+
+/* HSSRR HSCIF */
+#define HSCIF_SRE	0x8000		/* Sampling Rate Register Enable */
+
+
 #define SCxSR_TEND(port)	(((port)->type == PORT_SCI) ? SCI_TEND   : SCIF_TEND)
 #define SCxSR_RDxF(port)	(((port)->type == PORT_SCI) ? SCI_RDRF   : SCIF_RDF)
 #define SCxSR_TDxE(port)	(((port)->type == PORT_SCI) ? SCI_TDRE   : SCIF_TDFE)
@@ -28,10 +104,3 @@
 # define SCxSR_BREAK_CLEAR(port) (((port)->type == PORT_SCI) ? 0xc4 : 0x00e3)
 #endif
 
-/* SCFCR */
-#define SCFCR_RFRST 0x0002
-#define SCFCR_TFRST 0x0004
-#define SCFCR_MCE   0x0008
-
-#define SCI_MAJOR		204
-#define SCI_MINOR_START		8
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 6c5e3bb282b0..395fceb8c060 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -10,13 +10,6 @@
 
 #define SCIx_NOT_SUPPORTED	(-1)
 
-/* SCSMR (Serial Mode Register) */
-#define SCSMR_CHR	(1 << 6)	/* 7-bit Character Length */
-#define SCSMR_PE	(1 << 5)	/* Parity Enable */
-#define SCSMR_ODD	(1 << 4)	/* Odd Parity */
-#define SCSMR_STOP	(1 << 3)	/* Stop Bit Length */
-#define SCSMR_CKS	0x0003		/* Clock Select */
-
 /* Serial Control Register (@ = not supported by all parts) */
 #define SCSCR_TIE	(1 << 7)	/* Transmit Interrupt Enable */
 #define SCSCR_RIE	(1 << 6)	/* Receive Interrupt Enable */
@@ -26,43 +19,7 @@
 #define SCSCR_TOIE	(1 << 2)	/* Timeout Interrupt Enable @ */
 #define SCSCR_CKE1	(1 << 1)	/* Clock Enable 1 */
 #define SCSCR_CKE0	(1 << 0)	/* Clock Enable 0 */
-/* SCIFA/SCIFB only */
-#define SCSCR_TDRQE	(1 << 15)	/* Tx Data Transfer Request Enable */
-#define SCSCR_RDRQE	(1 << 14)	/* Rx Data Transfer Request Enable */
-
-/* SCxSR (Serial Status Register) on SCI */
-#define SCI_TDRE  0x80			/* Transmit Data Register Empty */
-#define SCI_RDRF  0x40			/* Receive Data Register Full */
-#define SCI_ORER  0x20			/* Overrun Error */
-#define SCI_FER   0x10			/* Framing Error */
-#define SCI_PER   0x08			/* Parity Error */
-#define SCI_TEND  0x04			/* Transmit End */
-
-#define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER)
-
-/* SCxSR (Serial Status Register) on SCIF, HSCIF */
-#define SCIF_ER    0x0080		/* Receive Error */
-#define SCIF_TEND  0x0040		/* Transmission End */
-#define SCIF_TDFE  0x0020		/* Transmit FIFO Data Empty */
-#define SCIF_BRK   0x0010		/* Break Detect */
-#define SCIF_FER   0x0008		/* Framing Error */
-#define SCIF_PER   0x0004		/* Parity Error */
-#define SCIF_RDF   0x0002		/* Receive FIFO Data Full */
-#define SCIF_DR    0x0001		/* Receive Data Ready */
-
-#define SCIF_DEFAULT_ERROR_MASK (SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK)
-
-/* SCFCR (FIFO Control Register) */
-#define SCFCR_LOOP	(1 << 0)	/* Loopback Test */
-
-/* SCSPTR (Serial Port Register), optional */
-#define SCSPTR_RTSIO	(1 << 7)	/* Serial Port RTS Pin Input/Output */
-#define SCSPTR_CTSIO	(1 << 5)	/* Serial Port CTS Pin Input/Output */
-#define SCSPTR_SPB2IO	(1 << 1)	/* Serial Port Break Input/Output */
-#define SCSPTR_SPB2DT	(1 << 0)	/* Serial Port Break Data */
-
-/* HSSRR HSCIF */
-#define HSCIF_SRE	0x8000		/* Sampling Rate Register Enable */
+
 
 enum {
 	SCIx_PROBE_REGTYPE,
@@ -82,28 +39,6 @@ enum {
 	SCIx_NR_REGTYPES,
 };
 
-/*
- * SCI register subset common for all port types.
- * Not all registers will exist on all parts.
- */
-enum {
-	SCSMR,				/* Serial Mode Register */
-	SCBRR,				/* Bit Rate Register */
-	SCSCR,				/* Serial Control Register */
-	SCxSR,				/* Serial Status Register */
-	SCFCR,				/* FIFO Control Register */
-	SCFDR,				/* FIFO Data Count Register */
-	SCxTDR,				/* Transmit (FIFO) Data Register */
-	SCxRDR,				/* Receive (FIFO) Data Register */
-	SCLSR,				/* Line Status Register */
-	SCTFDR,				/* Transmit FIFO Data Count Register */
-	SCRFDR,				/* Receive FIFO Data Count Register */
-	SCSPTR,				/* Serial Port Register */
-	HSSRR,				/* Sampling Rate Register */
-
-	SCIx_NR_REGS,
-};
-
 struct device;
 
 struct plat_sci_port_ops {
-- 
cgit v1.2.3


From d94a0a3857987c76c37a8095977fe554799ab69d Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 30 Apr 2015 18:21:29 +0200
Subject: serial: sh-sci: Standardize on using the BIT() macro to define
 register bits

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/sh-sci.h | 73 +++++++++++++++++++++++----------------------
 include/linux/serial_sci.h  | 19 ++++++------
 2 files changed, 47 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/serial/sh-sci.h b/drivers/tty/serial/sh-sci.h
index 1586d68470b6..5282738375ae 100644
--- a/drivers/tty/serial/sh-sci.h
+++ b/drivers/tty/serial/sh-sci.h
@@ -1,3 +1,4 @@
+#include <linux/bitops.h>
 #include <linux/serial_core.h>
 #include <linux/io.h>
 #include <linux/gpio.h>
@@ -32,62 +33,62 @@ enum {
 
 
 /* SCSMR (Serial Mode Register) */
-#define SCSMR_CHR	(1 << 6)	/* 7-bit Character Length */
-#define SCSMR_PE	(1 << 5)	/* Parity Enable */
-#define SCSMR_ODD	(1 << 4)	/* Odd Parity */
-#define SCSMR_STOP	(1 << 3)	/* Stop Bit Length */
-#define SCSMR_CKS	0x0003		/* Clock Select */
+#define SCSMR_CHR	BIT(6)	/* 7-bit Character Length */
+#define SCSMR_PE	BIT(5)	/* Parity Enable */
+#define SCSMR_ODD	BIT(4)	/* Odd Parity */
+#define SCSMR_STOP	BIT(3)	/* Stop Bit Length */
+#define SCSMR_CKS	0x0003	/* Clock Select */
 
 /* Serial Control Register, SCIFA/SCIFB only bits */
-#define SCSCR_TDRQE	(1 << 15)	/* Tx Data Transfer Request Enable */
-#define SCSCR_RDRQE	(1 << 14)	/* Rx Data Transfer Request Enable */
+#define SCSCR_TDRQE	BIT(15)	/* Tx Data Transfer Request Enable */
+#define SCSCR_RDRQE	BIT(14)	/* Rx Data Transfer Request Enable */
 
 /* SCxSR (Serial Status Register) on SCI */
-#define SCI_TDRE  0x80			/* Transmit Data Register Empty */
-#define SCI_RDRF  0x40			/* Receive Data Register Full */
-#define SCI_ORER  0x20			/* Overrun Error */
-#define SCI_FER   0x10			/* Framing Error */
-#define SCI_PER   0x08			/* Parity Error */
-#define SCI_TEND  0x04			/* Transmit End */
+#define SCI_TDRE	BIT(7)	/* Transmit Data Register Empty */
+#define SCI_RDRF	BIT(6)	/* Receive Data Register Full */
+#define SCI_ORER	BIT(5)	/* Overrun Error */
+#define SCI_FER		BIT(4)	/* Framing Error */
+#define SCI_PER		BIT(3)	/* Parity Error */
+#define SCI_TEND	BIT(2)	/* Transmit End */
 
 #define SCI_DEFAULT_ERROR_MASK (SCI_PER | SCI_FER)
 
 /* SCxSR (Serial Status Register) on SCIF, HSCIF */
-#define SCIF_ER    0x0080		/* Receive Error */
-#define SCIF_TEND  0x0040		/* Transmission End */
-#define SCIF_TDFE  0x0020		/* Transmit FIFO Data Empty */
-#define SCIF_BRK   0x0010		/* Break Detect */
-#define SCIF_FER   0x0008		/* Framing Error */
-#define SCIF_PER   0x0004		/* Parity Error */
-#define SCIF_RDF   0x0002		/* Receive FIFO Data Full */
-#define SCIF_DR    0x0001		/* Receive Data Ready */
+#define SCIF_ER		BIT(7)	/* Receive Error */
+#define SCIF_TEND	BIT(6)	/* Transmission End */
+#define SCIF_TDFE	BIT(5)	/* Transmit FIFO Data Empty */
+#define SCIF_BRK	BIT(4)	/* Break Detect */
+#define SCIF_FER	BIT(3)	/* Framing Error */
+#define SCIF_PER	BIT(2)	/* Parity Error */
+#define SCIF_RDF	BIT(1)	/* Receive FIFO Data Full */
+#define SCIF_DR		BIT(0)	/* Receive Data Ready */
 
 #define SCIF_DEFAULT_ERROR_MASK (SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK)
 
 /* SCFCR (FIFO Control Register) */
-#define SCFCR_MCE	0x0008		/* Modem Control Enable */
-#define SCFCR_TFRST	0x0004		/* Transmit FIFO Data Register Reset */
-#define SCFCR_RFRST	0x0002		/* Receive FIFO Data Register Reset */
-#define SCFCR_LOOP	(1 << 0)	/* Loopback Test */
+#define SCFCR_MCE	BIT(3)	/* Modem Control Enable */
+#define SCFCR_TFRST	BIT(2)	/* Transmit FIFO Data Register Reset */
+#define SCFCR_RFRST	BIT(1)	/* Receive FIFO Data Register Reset */
+#define SCFCR_LOOP	BIT(0)	/* Loopback Test */
 
 /* SCSPTR (Serial Port Register), optional */
-#define SCSPTR_RTSIO	(1 << 7)	/* Serial Port RTS Pin Input/Output */
-#define SCSPTR_RTSDT	(1 << 6)	/* Serial Port RTS Pin Data */
-#define SCSPTR_CTSIO	(1 << 5)	/* Serial Port CTS Pin Input/Output */
-#define SCSPTR_CTSDT	(1 << 4)	/* Serial Port CTS Pin Data */
-#define SCSPTR_SPB2IO	(1 << 1)	/* Serial Port Break Input/Output */
-#define SCSPTR_SPB2DT	(1 << 0)	/* Serial Port Break Data */
+#define SCSPTR_RTSIO	BIT(7)	/* Serial Port RTS Pin Input/Output */
+#define SCSPTR_RTSDT	BIT(6)	/* Serial Port RTS Pin Data */
+#define SCSPTR_CTSIO	BIT(5)	/* Serial Port CTS Pin Input/Output */
+#define SCSPTR_CTSDT	BIT(4)	/* Serial Port CTS Pin Data */
+#define SCSPTR_SPB2IO	BIT(1)	/* Serial Port Break Input/Output */
+#define SCSPTR_SPB2DT	BIT(0)	/* Serial Port Break Data */
 
 /* HSSRR HSCIF */
-#define HSCIF_SRE	0x8000		/* Sampling Rate Register Enable */
+#define HSCIF_SRE	BIT(15)	/* Sampling Rate Register Enable */
 
 /* SCPCR (Serial Port Control Register), SCIFA/SCIFB only */
-#define SCPCR_RTSC	(1 << 4)	/* Serial Port RTS Pin / Output Pin */
-#define SCPCR_CTSC	(1 << 3)	/* Serial Port CTS Pin / Input Pin */
+#define SCPCR_RTSC	BIT(4)	/* Serial Port RTS Pin / Output Pin */
+#define SCPCR_CTSC	BIT(3)	/* Serial Port CTS Pin / Input Pin */
 
 /* SCPDR (Serial Port Data Register), SCIFA/SCIFB only */
-#define SCPDR_RTSD	(1 << 4)	/* Serial Port RTS Output Pin Data */
-#define SCPDR_CTSD	(1 << 3)	/* Serial Port CTS Input Pin Data */
+#define SCPDR_RTSD	BIT(4)	/* Serial Port RTS Output Pin Data */
+#define SCPDR_CTSD	BIT(3)	/* Serial Port CTS Input Pin Data */
 
 
 #define SCxSR_TEND(port)	(((port)->type == PORT_SCI) ? SCI_TEND   : SCIF_TEND)
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 395fceb8c060..7c536ac5be05 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -1,6 +1,7 @@
 #ifndef __LINUX_SERIAL_SCI_H
 #define __LINUX_SERIAL_SCI_H
 
+#include <linux/bitops.h>
 #include <linux/serial_core.h>
 #include <linux/sh_dma.h>
 
@@ -11,14 +12,14 @@
 #define SCIx_NOT_SUPPORTED	(-1)
 
 /* Serial Control Register (@ = not supported by all parts) */
-#define SCSCR_TIE	(1 << 7)	/* Transmit Interrupt Enable */
-#define SCSCR_RIE	(1 << 6)	/* Receive Interrupt Enable */
-#define SCSCR_TE	(1 << 5)	/* Transmit Enable */
-#define SCSCR_RE	(1 << 4)	/* Receive Enable */
-#define SCSCR_REIE	(1 << 3)	/* Receive Error Interrupt Enable @ */
-#define SCSCR_TOIE	(1 << 2)	/* Timeout Interrupt Enable @ */
-#define SCSCR_CKE1	(1 << 1)	/* Clock Enable 1 */
-#define SCSCR_CKE0	(1 << 0)	/* Clock Enable 0 */
+#define SCSCR_TIE	BIT(7)	/* Transmit Interrupt Enable */
+#define SCSCR_RIE	BIT(6)	/* Receive Interrupt Enable */
+#define SCSCR_TE	BIT(5)	/* Transmit Enable */
+#define SCSCR_RE	BIT(4)	/* Receive Enable */
+#define SCSCR_REIE	BIT(3)	/* Receive Error Interrupt Enable @ */
+#define SCSCR_TOIE	BIT(2)	/* Timeout Interrupt Enable @ */
+#define SCSCR_CKE1	BIT(1)	/* Clock Enable 1 */
+#define SCSCR_CKE0	BIT(0)	/* Clock Enable 0 */
 
 
 enum {
@@ -48,7 +49,7 @@ struct plat_sci_port_ops {
 /*
  * Port-specific capabilities
  */
-#define SCIx_HAVE_RTSCTS	(1 << 0)
+#define SCIx_HAVE_RTSCTS	BIT(0)
 
 /*
  * Platform device specific platform_data struct
-- 
cgit v1.2.3


From bd63364caa8df38bad2b25b11b2a1b849475cce5 Mon Sep 17 00:00:00 2001
From: Scot Doyle <lkml14@scotdoyle.com>
Date: Thu, 26 Mar 2015 13:54:39 +0000
Subject: vt: add cursor blink interval escape sequence

Add an escape sequence to specify the current console's cursor blink
interval. The interval is specified as a number of milliseconds until
the next cursor display state toggle, from 50 to 65535. /proc/loadavg
did not show a difference with a one msec interval, but the lower
bound is set to 50 msecs since slower hardware wasn't tested.

Store the interval in the vc_data structure for later access by fbcon,
initializing the value to fbcon's current hardcoded value of 200 msecs.

Signed-off-by: Scot Doyle <lkml14@scotdoyle.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/vt/vt.c            | 9 +++++++++
 include/linux/console_struct.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 4a24eb2b0ede..b075489f314e 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -135,6 +135,7 @@ const struct consw *conswitchp;
  */
 #define DEFAULT_BELL_PITCH	750
 #define DEFAULT_BELL_DURATION	(HZ/8)
+#define DEFAULT_CURSOR_BLINK_MS	200
 
 struct vc vc_cons [MAX_NR_CONSOLES];
 
@@ -1590,6 +1591,13 @@ static void setterm_command(struct vc_data *vc)
 		case 15: /* activate the previous console */
 			set_console(last_console);
 			break;
+		case 16: /* set cursor blink duration in msec */
+			if (vc->vc_npar >= 1 && vc->vc_par[1] >= 50 &&
+					vc->vc_par[1] <= USHRT_MAX)
+				vc->vc_cur_blink_ms = vc->vc_par[1];
+			else
+				vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS;
+			break;
 	}
 }
 
@@ -1717,6 +1725,7 @@ static void reset_terminal(struct vc_data *vc, int do_clear)
 
 	vc->vc_bell_pitch = DEFAULT_BELL_PITCH;
 	vc->vc_bell_duration = DEFAULT_BELL_DURATION;
+	vc->vc_cur_blink_ms = DEFAULT_CURSOR_BLINK_MS;
 
 	gotoxy(vc, 0, 0);
 	save_cur(vc);
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h
index e859c98d1767..e329ee2667e1 100644
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -104,6 +104,7 @@ struct vc_data {
 	unsigned int    vc_resize_user;         /* resize request from user */
 	unsigned int	vc_bell_pitch;		/* Console bell pitch */
 	unsigned int	vc_bell_duration;	/* Console bell duration */
+	unsigned short	vc_cur_blink_ms;	/* Cursor blink duration */
 	struct vc_data **vc_display_fg;		/* [!] Ptr to var holding fg console for this display */
 	struct uni_pagedir *vc_uni_pagedir;
 	struct uni_pagedir **vc_uni_pagedir_loc; /* [!] Location of uni_pagedir variable for this console */
-- 
cgit v1.2.3


From faaa44955dedc661f083636d816af90975a359ee Mon Sep 17 00:00:00 2001
From: Irina Tirdea <irina.tirdea@intel.com>
Date: Wed, 29 Apr 2015 21:16:39 +0300
Subject: iio: core: Introduce IIO_CHAN_INFO_OVERSAMPLING_RATIO

Some magnetometers can perform a number of repetitions in HW
for each measurement to increase accuracy. One example is
Bosch BMC150:
http://ae-bst.resource.bosch.com/media/products/dokumente/bmc150/BST-BMC150-DS000-04.pdf.

Introduce an interface to set the oversampling ratio
for these devices.

Signed-off-by: Irina Tirdea <irina.tirdea@intel.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 12 ++++++++++++
 drivers/iio/industrialio-core.c         |  1 +
 include/linux/iio/iio.h                 |  1 +
 3 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 866b4ec4aab6..e46c71fbd047 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1375,3 +1375,15 @@ Description:
 		The emissivity ratio of the surface in the field of view of the
 		contactless temperature sensor.  Emissivity varies from 0 to 1,
 		with 1 being the emissivity of a black body.
+
+What:		/sys/bus/iio/devices/iio:deviceX/in_magn_x_oversampling_ratio
+What:		/sys/bus/iio/devices/iio:deviceX/in_magn_y_oversampling_ratio
+What:		/sys/bus/iio/devices/iio:deviceX/in_magn_z_oversampling_ratio
+KernelVersion:	4.2
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Hardware applied number of measurements for acquiring one
+		data point. The HW will do <type>[_name]_oversampling_ratio
+		measurements and return the average value as output data. Each
+		value resulted from <type>[_name]_oversampling_ratio measurements
+		is considered as one sample for <type>[_name]_sampling_frequency.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 7c98bc1504e6..dfa81db3b910 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -129,6 +129,7 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_DEBOUNCE_COUNT] = "debounce_count",
 	[IIO_CHAN_INFO_DEBOUNCE_TIME] = "debounce_time",
 	[IIO_CHAN_INFO_CALIBEMISSIVITY] = "calibemissivity",
+	[IIO_CHAN_INFO_OVERSAMPLING_RATIO] = "oversampling_ratio",
 };
 
 /**
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index b1e46ae89aa7..058441da4984 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -44,6 +44,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_DEBOUNCE_COUNT,
 	IIO_CHAN_INFO_DEBOUNCE_TIME,
 	IIO_CHAN_INFO_CALIBEMISSIVITY,
+	IIO_CHAN_INFO_OVERSAMPLING_RATIO,
 };
 
 enum iio_shared_by {
-- 
cgit v1.2.3


From 61ba64fc0768879a300599b011c176203bdf27d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 May 2015 09:54:06 -0400
Subject: libfs: simple_follow_link()

let "fast" symlinks store the pointer to the body into ->i_link and
use simple_follow_link for ->follow_link()

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         |  1 +
 fs/libfs.c         | 13 +++++++++++++
 include/linux/fs.h |  3 +++
 3 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index ea37cd17b53f..952fb4852e38 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -152,6 +152,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_pipe = NULL;
 	inode->i_bdev = NULL;
 	inode->i_cdev = NULL;
+	inode->i_link = NULL;
 	inode->i_rdev = 0;
 	inode->dirtied_when = 0;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index cb1fb4b9b637..72e4e015455f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1093,3 +1093,16 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
 	return -EINVAL;
 }
 EXPORT_SYMBOL(simple_nosetlease);
+
+void *simple_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	nd_set_link(nd, d_inode(dentry)->i_link);
+	return NULL;
+}
+EXPORT_SYMBOL(simple_follow_link);
+
+const struct inode_operations simple_symlink_inode_operations = {
+	.follow_link = simple_follow_link,
+	.readlink = generic_readlink
+};
+EXPORT_SYMBOL(simple_symlink_inode_operations);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 35ec87e490b1..0ac758fcff00 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -656,6 +656,7 @@ struct inode {
 		struct pipe_inode_info	*i_pipe;
 		struct block_device	*i_bdev;
 		struct cdev		*i_cdev;
+		char			*i_link;
 	};
 
 	__u32			i_generation;
@@ -2721,6 +2722,8 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes);
 void inode_sub_bytes(struct inode *inode, loff_t bytes);
 loff_t inode_get_bytes(struct inode *inode);
 void inode_set_bytes(struct inode *inode, loff_t bytes);
+void *simple_follow_link(struct dentry *, struct nameidata *);
+extern const struct inode_operations simple_symlink_inode_operations;
 
 extern int iterate_dir(struct file *, struct dir_context *);
 
-- 
cgit v1.2.3


From 5723cb01f0295ace2b029b0737dd6525a2de337f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 May 2015 10:27:18 -0400
Subject: debugfs: switch to simple_follow_link()

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/debugfs/file.c       | 12 ------------
 fs/debugfs/inode.c      |  6 +++---
 include/linux/debugfs.h |  1 -
 3 files changed, 3 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 830a7e76f5c6..284f9aa0028b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/pagemap.h>
-#include <linux/namei.h>
 #include <linux/debugfs.h>
 #include <linux/io.h>
 #include <linux/slab.h>
@@ -43,17 +42,6 @@ const struct file_operations debugfs_file_operations = {
 	.llseek =	noop_llseek,
 };
 
-static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	nd_set_link(nd, d_inode(dentry)->i_private);
-	return NULL;
-}
-
-const struct inode_operations debugfs_link_operations = {
-	.readlink       = generic_readlink,
-	.follow_link    = debugfs_follow_link,
-};
-
 static int debugfs_u8_set(void *data, u64 val)
 {
 	*(u8 *)data = val;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c1e7ffb0dab6..7eaec88ea970 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -174,7 +174,7 @@ static void debugfs_evict_inode(struct inode *inode)
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	if (S_ISLNK(inode->i_mode))
-		kfree(inode->i_private);
+		kfree(inode->i_link);
 }
 
 static const struct super_operations debugfs_super_operations = {
@@ -511,8 +511,8 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 		return failed_creating(dentry);
 	}
 	inode->i_mode = S_IFLNK | S_IRWXUGO;
-	inode->i_op = &debugfs_link_operations;
-	inode->i_private = link;
+	inode->i_op = &simple_symlink_inode_operations;
+	inode->i_link = link;
 	d_instantiate(dentry, inode);
 	return end_creating(dentry);
 }
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index cb25af461054..420311bcee38 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -45,7 +45,6 @@ extern struct dentry *arch_debugfs_dir;
 
 /* declared over in file.c */
 extern const struct file_operations debugfs_file_operations;
-extern const struct inode_operations debugfs_link_operations;
 
 struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
-- 
cgit v1.2.3


From 37882db0546c759ff75b561c188539ac96fd0bfe Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 23 Mar 2015 13:37:39 +1100
Subject: SECURITY: remove nameidata arg from inode_follow_link.

No ->inode_follow_link() methods use the nameidata arg, and
it is about to become private to namei.c.
So remove from all inode_follow_link() functions.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c               | 2 +-
 include/linux/security.h | 9 +++------
 security/capability.c    | 3 +--
 security/security.c      | 4 ++--
 security/selinux/hooks.c | 2 +-
 5 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index fe30d3be43a8..7f20b40426dc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -871,7 +871,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 	touch_atime(link);
 	nd_set_link(nd, NULL);
 
-	error = security_inode_follow_link(link->dentry, nd);
+	error = security_inode_follow_link(dentry);
 	if (error)
 		goto out_put_nd_path;
 
diff --git a/include/linux/security.h b/include/linux/security.h
index 18264ea9e314..62a66202ecf1 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -43,7 +43,6 @@ struct file;
 struct vfsmount;
 struct path;
 struct qstr;
-struct nameidata;
 struct iattr;
 struct fown_struct;
 struct file_operations;
@@ -477,7 +476,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  * @inode_follow_link:
  *	Check permission to follow a symbolic link when looking up a pathname.
  *	@dentry contains the dentry structure for the link.
- *	@nd contains the nameidata structure for the parent directory.
  *	Return 0 if permission is granted.
  * @inode_permission:
  *	Check permission before accessing an inode.  This hook is called by the
@@ -1553,7 +1551,7 @@ struct security_operations {
 	int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry,
 			     struct inode *new_dir, struct dentry *new_dentry);
 	int (*inode_readlink) (struct dentry *dentry);
-	int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd);
+	int (*inode_follow_link) (struct dentry *dentry);
 	int (*inode_permission) (struct inode *inode, int mask);
 	int (*inode_setattr)	(struct dentry *dentry, struct iattr *attr);
 	int (*inode_getattr) (const struct path *path);
@@ -1839,7 +1837,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry,
 			  unsigned int flags);
 int security_inode_readlink(struct dentry *dentry);
-int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd);
+int security_inode_follow_link(struct dentry *dentry);
 int security_inode_permission(struct inode *inode, int mask);
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
 int security_inode_getattr(const struct path *path);
@@ -2241,8 +2239,7 @@ static inline int security_inode_readlink(struct dentry *dentry)
 	return 0;
 }
 
-static inline int security_inode_follow_link(struct dentry *dentry,
-					      struct nameidata *nd)
+static inline int security_inode_follow_link(struct dentry *dentry)
 {
 	return 0;
 }
diff --git a/security/capability.c b/security/capability.c
index 0d03fcc489a4..fae99db180ca 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -209,8 +209,7 @@ static int cap_inode_readlink(struct dentry *dentry)
 	return 0;
 }
 
-static int cap_inode_follow_link(struct dentry *dentry,
-				 struct nameidata *nameidata)
+static int cap_inode_follow_link(struct dentry *dentry)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 8e9b1f4b9b45..d7c30b03e144 100644
--- a/security/security.c
+++ b/security/security.c
@@ -581,11 +581,11 @@ int security_inode_readlink(struct dentry *dentry)
 	return security_ops->inode_readlink(dentry);
 }
 
-int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd)
+int security_inode_follow_link(struct dentry *dentry)
 {
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
 		return 0;
-	return security_ops->inode_follow_link(dentry, nd);
+	return security_ops->inode_follow_link(dentry);
 }
 
 int security_inode_permission(struct inode *inode, int mask)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28affba..801622947282 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2861,7 +2861,7 @@ static int selinux_inode_readlink(struct dentry *dentry)
 	return dentry_has_perm(cred, dentry, FILE__READ);
 }
 
-static int selinux_inode_follow_link(struct dentry *dentry, struct nameidata *nameidata)
+static int selinux_inode_follow_link(struct dentry *dentry)
 {
 	const struct cred *cred = current_cred();
 
-- 
cgit v1.2.3


From 680baacbca69d18a6d7315374ad83d05ac9c0977 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 May 2015 13:32:22 -0400
Subject: new ->follow_link() and ->put_link() calling conventions

a) instead of storing the symlink body (via nd_set_link()) and returning
an opaque pointer later passed to ->put_link(), ->follow_link() _stores_
that opaque pointer (into void * passed by address by caller) and returns
the symlink body.  Returning ERR_PTR() on error, NULL on jump (procfs magic
symlinks) and pointer to symlink body for normal symlinks.  Stored pointer
is ignored in all cases except the last one.

Storing NULL for opaque pointer (or not storing it at all) means no call
of ->put_link().

b) the body used to be passed to ->put_link() implicitly (via nameidata).
Now only the opaque pointer is.  In the cases when we used the symlink body
to free stuff, ->follow_link() now should store it as opaque pointer in addition
to returning it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking             |  4 +-
 Documentation/filesystems/vfs.txt             |  4 +-
 drivers/staging/lustre/lustre/llite/symlink.c | 11 ++---
 fs/9p/vfs_inode.c                             | 13 +++---
 fs/9p/vfs_inode_dotl.c                        |  7 ++-
 fs/autofs4/symlink.c                          |  5 +-
 fs/befs/linuxvfs.c                            | 35 +++++++-------
 fs/cifs/cifsfs.h                              |  2 +-
 fs/cifs/link.c                                | 28 ++++++------
 fs/configfs/symlink.c                         | 28 +++++-------
 fs/ecryptfs/inode.c                           |  8 ++--
 fs/ext4/symlink.c                             |  9 ++--
 fs/f2fs/namei.c                               | 18 +++-----
 fs/fuse/dir.c                                 | 19 ++------
 fs/gfs2/inode.c                               | 10 ++--
 fs/hostfs/hostfs_kern.c                       | 15 +++---
 fs/hppfs/hppfs.c                              |  9 ++--
 fs/kernfs/symlink.c                           | 22 ++++-----
 fs/libfs.c                                    | 12 ++---
 fs/namei.c                                    | 66 +++++++++------------------
 fs/nfs/symlink.c                              | 19 +++-----
 fs/overlayfs/inode.c                          | 18 ++++----
 fs/proc/base.c                                |  2 +-
 fs/proc/inode.c                               |  9 ++--
 fs/proc/namespaces.c                          |  2 +-
 fs/proc/self.c                                | 24 +++++-----
 fs/proc/thread_self.c                         | 22 ++++-----
 fs/xfs/xfs_iops.c                             | 10 ++--
 include/linux/fs.h                            | 12 ++---
 include/linux/namei.h                         |  2 -
 mm/shmem.c                                    | 23 +++++-----
 31 files changed, 195 insertions(+), 273 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 0a926e2ba3ab..7fa6c4ac858c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -50,8 +50,8 @@ prototypes:
 	int (*rename2) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *, unsigned int);
 	int (*readlink) (struct dentry *, char __user *,int);
-	void * (*follow_link) (struct dentry *, struct nameidata *);
-	void (*put_link) (struct dentry *, struct nameidata *, void *);
+	const char *(*follow_link) (struct dentry *, void **, struct nameidata *);
+	void (*put_link) (struct dentry *, void *);
 	void (*truncate) (struct inode *);
 	int (*permission) (struct inode *, int, unsigned int);
 	int (*get_acl)(struct inode *, int);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5d833b32bbcd..1c6b03ac2e5a 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -350,8 +350,8 @@ struct inode_operations {
 	int (*rename2) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *, unsigned int);
 	int (*readlink) (struct dentry *, char __user *,int);
-        void * (*follow_link) (struct dentry *, struct nameidata *);
-        void (*put_link) (struct dentry *, struct nameidata *, void *);
+	const char *(*follow_link) (struct dentry *, void **, struct nameidata *);
+	void (*put_link) (struct dentry *, void *);
 	int (*permission) (struct inode *, int);
 	int (*get_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
index 3711e671a4df..e488cb3bb25d 100644
--- a/drivers/staging/lustre/lustre/llite/symlink.c
+++ b/drivers/staging/lustre/lustre/llite/symlink.c
@@ -118,7 +118,7 @@ failed:
 	return rc;
 }
 
-static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ll_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct inode *inode = d_inode(dentry);
 	struct ptlrpc_request *request = NULL;
@@ -140,18 +140,17 @@ static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd)
 	}
 	if (rc) {
 		ptlrpc_req_finished(request);
-		request = NULL;
-		symname = ERR_PTR(rc);
+		return ERR_PTR(rc);
 	}
 
-	nd_set_link(nd, symname);
 	/* symname may contain a pointer to the request message buffer,
 	 * we delay request releasing until ll_put_link then.
 	 */
-	return request;
+	*cookie = request;
+	return symname;
 }
 
-static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void ll_put_link(struct dentry *dentry, void *cookie)
 {
 	ptlrpc_req_finished(cookie);
 }
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0ba11712b388..7cc70a39a1d8 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1230,11 +1230,12 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
  *
  */
 
-static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
 	struct p9_fid *fid = v9fs_fid_lookup(dentry);
 	struct p9_wstat *st;
+	char *res;
 
 	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
 
@@ -1253,14 +1254,14 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		kfree(st);
 		return ERR_PTR(-EINVAL);
 	}
-	if (strlen(st->extension) >= PATH_MAX)
-		st->extension[PATH_MAX - 1] = '\0';
-
-	nd_set_link(nd, st->extension);
+	res = st->extension;
 	st->extension = NULL;
+	if (strlen(res) >= PATH_MAX)
+		res[PATH_MAX - 1] = '\0';
+
 	p9stat_free(st);
 	kfree(st);
-	return NULL;
+	return *cookie = res;
 }
 
 /**
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index bc2a91f2b910..ae062ffa0f1f 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -909,8 +909,8 @@ error:
  *
  */
 
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+static const char *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct p9_fid *fid = v9fs_fid_lookup(dentry);
 	char *target;
@@ -923,8 +923,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
 	retval = p9_client_readlink(fid, &target);
 	if (retval)
 		return ERR_PTR(retval);
-	nd_set_link(nd, target);
-	return NULL;
+	return *cookie = target;
 }
 
 int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index de58cc7b8076..9c6a07739c9b 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -12,14 +12,13 @@
 
 #include "autofs_i.h"
 
-static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *autofs4_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	if (ino && !autofs4_oz_mode(sbi))
 		ino->last_used = jiffies;
-	nd_set_link(nd, d_inode(dentry)->i_private);
-	return NULL;
+	return d_inode(dentry)->i_private;
 }
 
 const struct inode_operations autofs4_symlink_inode_operations = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 172e306d68a7..3a1aefb86a11 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
 static struct inode *befs_alloc_inode(struct super_block *sb);
 static void befs_destroy_inode(struct inode *inode);
 static void befs_destroy_inodecache(void);
-static void *befs_follow_link(struct dentry *, struct nameidata *);
+static const char *befs_follow_link(struct dentry *, void **, struct nameidata *nd);
 static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -463,8 +463,8 @@ befs_destroy_inodecache(void)
  * The data stream become link name. Unless the LONG_SYMLINK
  * flag is set.
  */
-static void *
-befs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *
+befs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
@@ -474,23 +474,20 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 
 	if (len == 0) {
 		befs_error(sb, "Long symlink with illegal length");
-		link = ERR_PTR(-EIO);
-	} else {
-		befs_debug(sb, "Follow long symlink");
-
-		link = kmalloc(len, GFP_NOFS);
-		if (!link) {
-			link = ERR_PTR(-ENOMEM);
-		} else if (befs_read_lsymlink(sb, data, link, len) != len) {
-			kfree(link);
-			befs_error(sb, "Failed to read entire long symlink");
-			link = ERR_PTR(-EIO);
-		} else {
-			link[len - 1] = '\0';
-		}
+		return ERR_PTR(-EIO);
 	}
-	nd_set_link(nd, link);
-	return NULL;
+	befs_debug(sb, "Follow long symlink");
+
+	link = kmalloc(len, GFP_NOFS);
+	if (!link)
+		return ERR_PTR(-ENOMEM);
+	if (befs_read_lsymlink(sb, data, link, len) != len) {
+		kfree(link);
+		befs_error(sb, "Failed to read entire long symlink");
+		return ERR_PTR(-EIO);
+	}
+	link[len - 1] = '\0';
+	return *cookie = link;
 }
 
 /*
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 252f5c15806b..61012da7e9d8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
 #endif
 
 /* Functions related to symlinks */
-extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
+extern const char *cifs_follow_link(struct dentry *direntry, void **cookie, struct nameidata *nd);
 extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
 			 int buflen);
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 252e672d5604..4a439c2c0c7f 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -626,8 +626,8 @@ cifs_hl_exit:
 	return rc;
 }
 
-void *
-cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
+const char *
+cifs_follow_link(struct dentry *direntry, void **cookie, struct nameidata *nd)
 {
 	struct inode *inode = d_inode(direntry);
 	int rc = -ENOMEM;
@@ -643,16 +643,18 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink)) {
-		rc = PTR_ERR(tlink);
-		tlink = NULL;
-		goto out;
+		free_xid(xid);
+		return ERR_CAST(tlink);
 	}
 	tcon = tlink_tcon(tlink);
 	server = tcon->ses->server;
 
 	full_path = build_path_from_dentry(direntry);
-	if (!full_path)
-		goto out;
+	if (!full_path) {
+		free_xid(xid);
+		cifs_put_tlink(tlink);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
 
@@ -670,17 +672,13 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 						&target_path, cifs_sb);
 
 	kfree(full_path);
-out:
+	free_xid(xid);
+	cifs_put_tlink(tlink);
 	if (rc != 0) {
 		kfree(target_path);
-		target_path = ERR_PTR(rc);
+		return ERR_PTR(rc);
 	}
-
-	free_xid(xid);
-	if (tlink)
-		cifs_put_tlink(tlink);
-	nd_set_link(nd, target_path);
-	return NULL;
+	return *cookie = target_path;
 }
 
 int
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index cc9f2546ea4a..fac8e8517f33 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,30 +279,26 @@ static int configfs_getlink(struct dentry *dentry, char * path)
 
 }
 
-static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *configfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
-	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	int error;
 
-	if (page) {
-		error = configfs_getlink(dentry, (char *)page);
-		if (!error) {
-			nd_set_link(nd, (char *)page);
-			return (void *)page;
-		}
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+
+	error = configfs_getlink(dentry, (char *)page);
+	if (!error) {
+		return *cookie = (void *)page;
 	}
 
-	nd_set_link(nd, ERR_PTR(error));
-	return NULL;
+	free_page(page);
+	return ERR_PTR(error);
 }
 
-static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
-			      void *cookie)
+static void configfs_put_link(struct dentry *dentry, void *cookie)
 {
-	if (cookie) {
-		unsigned long page = (unsigned long)cookie;
-		free_page(page);
-	}
+	free_page((unsigned long)cookie);
 }
 
 const struct inode_operations configfs_symlink_inode_operations = {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index fc850b55db67..cdb9d6c4532d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -675,18 +675,16 @@ out:
 	return rc ? ERR_PTR(rc) : buf;
 }
 
-static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	size_t len;
 	char *buf = ecryptfs_readlink_lower(dentry, &len);
 	if (IS_ERR(buf))
-		goto out;
+		return buf;
 	fsstack_copy_attr_atime(d_inode(dentry),
 				d_inode(ecryptfs_dentry_to_lower(dentry)));
 	buf[len] = '\0';
-out:
-	nd_set_link(nd, buf);
-	return NULL;
+	return *cookie = buf;
 }
 
 /**
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 4264fb1e341a..afec475aaf5c 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,7 +23,7 @@
 #include "xattr.h"
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ext4_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct page *cpage = NULL;
 	char *caddr, *paddr = NULL;
@@ -37,7 +37,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
 
 	ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize);
 	if (IS_ERR(ctx))
-		return ctx;
+		return ERR_CAST(ctx);
 
 	if (ext4_inode_is_fast_symlink(inode)) {
 		caddr = (char *) EXT4_I(inode)->i_data;
@@ -46,7 +46,7 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		cpage = read_mapping_page(inode->i_mapping, 0, NULL);
 		if (IS_ERR(cpage)) {
 			ext4_put_fname_crypto_ctx(&ctx);
-			return cpage;
+			return ERR_CAST(cpage);
 		}
 		caddr = kmap(cpage);
 		caddr[size] = 0;
@@ -77,13 +77,12 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	/* Null-terminate the name */
 	if (res <= plen)
 		paddr[res] = '\0';
-	nd_set_link(nd, paddr);
 	ext4_put_fname_crypto_ctx(&ctx);
 	if (cpage) {
 		kunmap(cpage);
 		page_cache_release(cpage);
 	}
-	return NULL;
+	return *cookie = paddr;
 errout:
 	ext4_put_fname_crypto_ctx(&ctx);
 	if (cpage) {
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 658e8079aaf9..d2947937515e 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -296,19 +296,15 @@ fail:
 	return err;
 }
 
-static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *f2fs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
-	struct page *page = page_follow_link_light(dentry, nd);
-
-	if (IS_ERR_OR_NULL(page))
-		return page;
-
-	/* this is broken symlink case */
-	if (*nd_get_link(nd) == 0) {
-		page_put_link(dentry, nd, page);
-		return ERR_PTR(-ENOENT);
+	const char *link = page_follow_link_light(dentry, cookie, nd);
+	if (!IS_ERR(link) && !*link) {
+		/* this is broken symlink case */
+		page_put_link(dentry, *cookie);
+		link = ERR_PTR(-ENOENT);
 	}
-	return page;
+	return link;
 }
 
 static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0572bca49f15..f9cb260375cf 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1365,7 +1365,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
 	return err;
 }
 
-static char *read_link(struct dentry *dentry)
+static const char *fuse_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct inode *inode = d_inode(dentry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1389,26 +1389,15 @@ static char *read_link(struct dentry *dentry)
 		link = ERR_PTR(ret);
 	} else {
 		link[ret] = '\0';
+		*cookie = link;
 	}
 	fuse_invalidate_atime(inode);
 	return link;
 }
 
-static void free_link(char *link)
+static void fuse_put_link(struct dentry *dentry, void *cookie)
 {
-	if (!IS_ERR(link))
-		free_page((unsigned long) link);
-}
-
-static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	nd_set_link(nd, read_link(dentry));
-	return NULL;
-}
-
-static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
-{
-	free_link(nd_get_link(nd));
+	free_page((unsigned long) cookie);
 }
 
 static int fuse_dir_open(struct inode *inode, struct file *file)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 1b3ca7a2e3fc..f59390aebffb 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1548,7 +1548,7 @@ out:
  * Returns: 0 on success or error code
  */
 
-static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *gfs2_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
 	struct gfs2_holder i_gh;
@@ -1561,8 +1561,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 	error = gfs2_glock_nq(&i_gh);
 	if (error) {
 		gfs2_holder_uninit(&i_gh);
-		nd_set_link(nd, ERR_PTR(error));
-		return NULL;
+		return ERR_PTR(error);
 	}
 
 	size = (unsigned int)i_size_read(&ip->i_inode);
@@ -1586,8 +1585,9 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 	brelse(dibh);
 out:
 	gfs2_glock_dq_uninit(&i_gh);
-	nd_set_link(nd, buf);
-	return NULL;
+	if (!IS_ERR(buf))
+		*cookie = buf;
+	return buf;
 }
 
 /**
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index ef263174acd2..f650ed661fab 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -892,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = {
 	.setattr	= hostfs_setattr,
 };
 
-static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *hostfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	char *link = __getname();
 	if (link) {
@@ -906,21 +906,18 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		}
 		if (err < 0) {
 			__putname(link);
-			link = ERR_PTR(err);
+			return ERR_PTR(err);
 		}
 	} else {
-		link = ERR_PTR(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
 	}
 
-	nd_set_link(nd, link);
-	return NULL;
+	return *cookie = link;
 }
 
-static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void hostfs_put_link(struct dentry *dentry, void *cookie)
 {
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		__putname(s);
+	__putname(cookie);
 }
 
 static const struct inode_operations hostfs_link_iops = {
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index fa2bd5366ecf..b8f24d3b04ee 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -642,20 +642,19 @@ static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
 						    buflen);
 }
 
-static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *hppfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
 
-	return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd);
+	return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie, nd);
 }
 
-static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
-			   void *cookie)
+static void hppfs_put_link(struct dentry *dentry, void *cookie)
 {
 	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
 
 	if (d_inode(proc_dentry)->i_op->put_link)
-		d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie);
+		d_inode(proc_dentry)->i_op->put_link(proc_dentry, cookie);
 }
 
 static const struct inode_operations hppfs_dir_iops = {
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 8a198898e39a..3c7e799974a2 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,25 +112,23 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
 	return error;
 }
 
-static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	if (page) {
-		error = kernfs_getlink(dentry, (char *) page);
-		if (error < 0)
-			free_page((unsigned long)page);
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+	error = kernfs_getlink(dentry, (char *)page);
+	if (unlikely(error < 0)) {
+		free_page((unsigned long)page);
+		return ERR_PTR(error);
 	}
-	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
-	return NULL;
+	return *cookie = (char *)page;
 }
 
-static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
-				void *cookie)
+static void kernfs_iop_put_link(struct dentry *dentry, void *cookie)
 {
-	char *page = nd_get_link(nd);
-	if (!IS_ERR(page))
-		free_page((unsigned long)page);
+	free_page((unsigned long)cookie);
 }
 
 const struct inode_operations kernfs_symlink_iops = {
diff --git a/fs/libfs.c b/fs/libfs.c
index 72e4e015455f..0c83fde20dbd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1024,12 +1024,9 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 }
 EXPORT_SYMBOL(noop_fsync);
 
-void kfree_put_link(struct dentry *dentry, struct nameidata *nd,
-				void *cookie)
+void kfree_put_link(struct dentry *dentry, void *cookie)
 {
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s))
-		kfree(s);
+	kfree(cookie);
 }
 EXPORT_SYMBOL(kfree_put_link);
 
@@ -1094,10 +1091,9 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
 }
 EXPORT_SYMBOL(simple_nosetlease);
 
-void *simple_follow_link(struct dentry *dentry, struct nameidata *nd)
+const char *simple_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
-	nd_set_link(nd, d_inode(dentry)->i_link);
-	return NULL;
+	return d_inode(dentry)->i_link;
 }
 EXPORT_SYMBOL(simple_follow_link);
 
diff --git a/fs/namei.c b/fs/namei.c
index ab2bcbdbd683..aeca44877371 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -502,7 +502,6 @@ struct nameidata {
 	int		last_type;
 	unsigned	depth;
 	struct file	*base;
-	char *saved_names[MAX_NESTED_LINKS + 1];
 };
 
 /*
@@ -713,23 +712,11 @@ void nd_jump_link(struct nameidata *nd, struct path *path)
 	nd->flags |= LOOKUP_JUMPED;
 }
 
-void nd_set_link(struct nameidata *nd, char *path)
-{
-	nd->saved_names[nd->depth] = path;
-}
-EXPORT_SYMBOL(nd_set_link);
-
-char *nd_get_link(struct nameidata *nd)
-{
-	return nd->saved_names[nd->depth];
-}
-EXPORT_SYMBOL(nd_get_link);
-
 static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
 {
 	struct inode *inode = link->dentry->d_inode;
-	if (inode->i_op->put_link)
-		inode->i_op->put_link(link->dentry, nd, cookie);
+	if (cookie && inode->i_op->put_link)
+		inode->i_op->put_link(link->dentry, cookie);
 	path_put(link);
 }
 
@@ -854,7 +841,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 {
 	struct dentry *dentry = link->dentry;
 	int error;
-	char *s;
+	const char *s;
 
 	BUG_ON(nd->flags & LOOKUP_RCU);
 
@@ -869,26 +856,20 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 	current->total_link_count++;
 
 	touch_atime(link);
-	nd_set_link(nd, NULL);
 
 	error = security_inode_follow_link(dentry);
 	if (error)
 		goto out_put_nd_path;
 
 	nd->last_type = LAST_BIND;
-	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
-	error = PTR_ERR(*p);
-	if (IS_ERR(*p))
+	*p = NULL;
+	s = dentry->d_inode->i_op->follow_link(dentry, p, nd);
+	error = PTR_ERR(s);
+	if (IS_ERR(s))
 		goto out_put_nd_path;
 
 	error = 0;
-	s = nd_get_link(nd);
 	if (s) {
-		if (unlikely(IS_ERR(s))) {
-			path_put(&nd->path);
-			put_link(nd, link, *p);
-			return PTR_ERR(s);
-		}
 		if (*s == '/') {
 			if (!nd->root.mnt)
 				set_root(nd);
@@ -906,7 +887,6 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 	return error;
 
 out_put_nd_path:
-	*p = NULL;
 	path_put(&nd->path);
 	path_put(link);
 	return error;
@@ -4430,18 +4410,15 @@ EXPORT_SYMBOL(readlink_copy);
  */
 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
-	struct nameidata nd;
 	void *cookie;
+	const char *link = dentry->d_inode->i_op->follow_link(dentry, &cookie, NULL);
 	int res;
 
-	nd.depth = 0;
-	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
-	if (IS_ERR(cookie))
-		return PTR_ERR(cookie);
-
-	res = readlink_copy(buffer, buflen, nd_get_link(&nd));
-	if (dentry->d_inode->i_op->put_link)
-		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+	if (IS_ERR(link))
+		return PTR_ERR(link);
+	res = readlink_copy(buffer, buflen, link);
+	if (cookie && dentry->d_inode->i_op->put_link)
+		dentry->d_inode->i_op->put_link(dentry, cookie);
 	return res;
 }
 EXPORT_SYMBOL(generic_readlink);
@@ -4473,22 +4450,21 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 }
 EXPORT_SYMBOL(page_readlink);
 
-void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
+const char *page_follow_link_light(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct page *page = NULL;
-	nd_set_link(nd, page_getlink(dentry, &page));
-	return page;
+	char *res = page_getlink(dentry, &page);
+	if (!IS_ERR(res))
+		*cookie = page;
+	return res;
 }
 EXPORT_SYMBOL(page_follow_link_light);
 
-void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+void page_put_link(struct dentry *dentry, void *cookie)
 {
 	struct page *page = cookie;
-
-	if (page) {
-		kunmap(page);
-		page_cache_release(page);
-	}
+	kunmap(page);
+	page_cache_release(page);
 }
 EXPORT_SYMBOL(page_put_link);
 
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2d56200655fe..c992b200ae7e 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -20,7 +20,6 @@
 #include <linux/stat.h>
 #include <linux/mm.h>
 #include <linux/string.h>
-#include <linux/namei.h>
 
 /* Symlink caching in the page cache is even more simplistic
  * and straight-forward than readdir caching.
@@ -43,7 +42,7 @@ error:
 	return -EIO;
 }
 
-static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *nfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct inode *inode = d_inode(dentry);
 	struct page *page;
@@ -51,19 +50,13 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 
 	err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 	if (err)
-		goto read_failed;
+		return err;
 	page = read_cache_page(&inode->i_data, 0,
 				(filler_t *)nfs_symlink_filler, inode);
-	if (IS_ERR(page)) {
-		err = page;
-		goto read_failed;
-	}
-	nd_set_link(nd, kmap(page));
-	return page;
-
-read_failed:
-	nd_set_link(nd, err);
-	return NULL;
+	if (IS_ERR(page))
+		return ERR_CAST(page);
+	*cookie = page;
+	return kmap(page);
 }
 
 /*
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 1b4b9c5e51b7..235ad42afb57 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -140,12 +140,12 @@ struct ovl_link_data {
 	void *cookie;
 };
 
-static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ovl_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
-	void *ret;
 	struct dentry *realdentry;
 	struct inode *realinode;
 	struct ovl_link_data *data = NULL;
+	const char *ret;
 
 	realdentry = ovl_dentry_real(dentry);
 	realinode = realdentry->d_inode;
@@ -160,19 +160,21 @@ static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
 		data->realdentry = realdentry;
 	}
 
-	ret = realinode->i_op->follow_link(realdentry, nd);
-	if (IS_ERR(ret)) {
+	ret = realinode->i_op->follow_link(realdentry, cookie, nd);
+	if (IS_ERR_OR_NULL(ret)) {
 		kfree(data);
 		return ret;
 	}
 
 	if (data)
-		data->cookie = ret;
+		data->cookie = *cookie;
 
-	return data;
+	*cookie = data;
+
+	return ret;
 }
 
-static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+static void ovl_put_link(struct dentry *dentry, void *c)
 {
 	struct inode *realinode;
 	struct ovl_link_data *data = c;
@@ -181,7 +183,7 @@ static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
 		return;
 
 	realinode = data->realdentry->d_inode;
-	realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+	realinode->i_op->put_link(data->realdentry, data->cookie);
 	kfree(data);
 }
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 093ca14f5701..52652f86b187 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1380,7 +1380,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 		return -ENOENT;
 }
 
-static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct inode *inode = d_inode(dentry);
 	struct path path;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8272aaba1bb0..acd51d75387d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -23,7 +23,6 @@
 #include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/magic.h>
-#include <linux/namei.h>
 
 #include <asm/uaccess.h>
 
@@ -394,16 +393,16 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
 };
 #endif
 
-static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct proc_dir_entry *pde = PDE(d_inode(dentry));
 	if (unlikely(!use_pde(pde)))
 		return ERR_PTR(-EINVAL);
-	nd_set_link(nd, pde->data);
-	return pde;
+	*cookie = pde;
+	return pde->data;
 }
 
-static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+static void proc_put_link(struct dentry *dentry, void *p)
 {
 	unuse_pde(p);
 }
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index e512642dbbdc..10d24dd096e8 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -30,7 +30,7 @@ static const struct proc_ns_operations *ns_entries[] = {
 	&mntns_operations,
 };
 
-static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct inode *inode = d_inode(dentry);
 	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 6195b4a7c3b1..ad333946b53a 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,5 +1,4 @@
 #include <linux/sched.h>
-#include <linux/namei.h>
 #include <linux/slab.h>
 #include <linux/pid_namespace.h>
 #include "internal.h"
@@ -19,21 +18,20 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
 	return readlink_copy(buffer, buflen, tmp);
 }
 
-static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_self_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
-	char *name = ERR_PTR(-ENOENT);
-	if (tgid) {
-		/* 11 for max length of signed int in decimal + NULL term */
-		name = kmalloc(12, GFP_KERNEL);
-		if (!name)
-			name = ERR_PTR(-ENOMEM);
-		else
-			sprintf(name, "%d", tgid);
-	}
-	nd_set_link(nd, name);
-	return NULL;
+	char *name;
+
+	if (!tgid)
+		return ERR_PTR(-ENOENT);
+	/* 11 for max length of signed int in decimal + NULL term */
+	name = kmalloc(12, GFP_KERNEL);
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+	sprintf(name, "%d", tgid);
+	return *cookie = name;
 }
 
 static const struct inode_operations proc_self_inode_operations = {
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a8371993b4fb..85c96e0d7aaa 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -1,5 +1,4 @@
 #include <linux/sched.h>
-#include <linux/namei.h>
 #include <linux/slab.h>
 #include <linux/pid_namespace.h>
 #include "internal.h"
@@ -20,21 +19,20 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
 	return readlink_copy(buffer, buflen, tmp);
 }
 
-static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	pid_t pid = task_pid_nr_ns(current, ns);
-	char *name = ERR_PTR(-ENOENT);
-	if (pid) {
-		name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
-		if (!name)
-			name = ERR_PTR(-ENOMEM);
-		else
-			sprintf(name, "%d/task/%d", tgid, pid);
-	}
-	nd_set_link(nd, name);
-	return NULL;
+	char *name;
+
+	if (!pid)
+		return ERR_PTR(-ENOENT);
+	name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+	sprintf(name, "%d/task/%d", tgid, pid);
+	return *cookie = name;
 }
 
 static const struct inode_operations proc_thread_self_inode_operations = {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f4cd7204e236..26c4dcb1ef56 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -41,7 +41,6 @@
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
-#include <linux/namei.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
 #include <linux/fiemap.h>
@@ -414,9 +413,10 @@ xfs_vn_rename(
  * we need to be very careful about how much stack we use.
  * uio is kmalloced for this reason...
  */
-STATIC void *
+STATIC const char *
 xfs_vn_follow_link(
 	struct dentry		*dentry,
+	void			**cookie,
 	struct nameidata	*nd)
 {
 	char			*link;
@@ -430,14 +430,12 @@ xfs_vn_follow_link(
 	if (unlikely(error))
 		goto out_kfree;
 
-	nd_set_link(nd, link);
-	return NULL;
+	return *cookie = link;
 
  out_kfree:
 	kfree(link);
  out_err:
-	nd_set_link(nd, ERR_PTR(error));
-	return NULL;
+	return ERR_PTR(error);
 }
 
 STATIC int
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0ac758fcff00..9ab934113a28 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1608,12 +1608,12 @@ struct file_operations {
 
 struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
-	void * (*follow_link) (struct dentry *, struct nameidata *);
+	const char * (*follow_link) (struct dentry *, void **, struct nameidata *);
 	int (*permission) (struct inode *, int);
 	struct posix_acl * (*get_acl)(struct inode *, int);
 
 	int (*readlink) (struct dentry *, char __user *,int);
-	void (*put_link) (struct dentry *, struct nameidata *, void *);
+	void (*put_link) (struct dentry *, void *);
 
 	int (*create) (struct inode *,struct dentry *, umode_t, bool);
 	int (*link) (struct dentry *,struct inode *,struct dentry *);
@@ -2705,13 +2705,13 @@ extern const struct file_operations generic_ro_fops;
 
 extern int readlink_copy(char __user *, int, const char *);
 extern int page_readlink(struct dentry *, char __user *, int);
-extern void *page_follow_link_light(struct dentry *, struct nameidata *);
-extern void page_put_link(struct dentry *, struct nameidata *, void *);
+extern const char *page_follow_link_light(struct dentry *, void **, struct nameidata *);
+extern void page_put_link(struct dentry *, void *);
 extern int __page_symlink(struct inode *inode, const char *symname, int len,
 		int nofs);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
-extern void kfree_put_link(struct dentry *, struct nameidata *, void *);
+extern void kfree_put_link(struct dentry *, void *);
 extern int generic_readlink(struct dentry *, char __user *, int);
 extern void generic_fillattr(struct inode *, struct kstat *);
 int vfs_getattr_nosec(struct path *path, struct kstat *stat);
@@ -2722,7 +2722,7 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes);
 void inode_sub_bytes(struct inode *inode, loff_t bytes);
 loff_t inode_get_bytes(struct inode *inode);
 void inode_set_bytes(struct inode *inode, loff_t bytes);
-void *simple_follow_link(struct dentry *, struct nameidata *);
+const char *simple_follow_link(struct dentry *, void **, struct nameidata *);
 extern const struct inode_operations simple_symlink_inode_operations;
 
 extern int iterate_dir(struct file *, struct dir_context *);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..a5d5bed2c0e1 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -71,8 +71,6 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 
 extern void nd_jump_link(struct nameidata *nd, struct path *path);
-extern void nd_set_link(struct nameidata *nd, char *path);
-extern char *nd_get_link(struct nameidata *nd);
 
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index 7f6e2f889122..d1693dcb4285 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2475,24 +2475,23 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 	return 0;
 }
 
-static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *shmem_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
 {
 	struct page *page = NULL;
 	int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
-	nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
-	if (page)
-		unlock_page(page);
-	return page;
+	if (error)
+		return ERR_PTR(error);
+	unlock_page(page);
+	*cookie = page;
+	return kmap(page);
 }
 
-static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void shmem_put_link(struct dentry *dentry, void *cookie)
 {
-	if (!IS_ERR(nd_get_link(nd))) {
-		struct page *page = cookie;
-		kunmap(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-	}
+	struct page *page = cookie;
+	kunmap(page);
+	mark_page_accessed(page);
+	page_cache_release(page);
 }
 
 #ifdef CONFIG_TMPFS_XATTR
-- 
cgit v1.2.3


From 894bc8c4662ba9daceafe943a5ba0dd407da5cd3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 May 2015 07:16:16 -0400
Subject: namei: remove restrictions on nesting depth

The only restriction is that on the total amount of symlinks
crossed; how they are nested does not matter

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 66 ++++++++++++++++++++++++++++++++++++++++-----------
 include/linux/namei.h |  2 ++
 2 files changed, 54 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index e5715a567860..1ae34cd0d590 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -492,6 +492,7 @@ void path_put(const struct path *path)
 }
 EXPORT_SYMBOL(path_put);
 
+#define EMBEDDED_LEVELS 2
 struct nameidata {
 	struct path	path;
 	union {
@@ -509,9 +510,42 @@ struct nameidata {
 		struct path link;
 		void *cookie;
 		const char *name;
-	} stack[MAX_NESTED_LINKS + 1];
+	} *stack, internal[EMBEDDED_LEVELS];
 };
 
+static void set_nameidata(struct nameidata *nd)
+{
+	nd->stack = nd->internal;
+}
+
+static void restore_nameidata(struct nameidata *nd)
+{
+	if (nd->stack != nd->internal) {
+		kfree(nd->stack);
+		nd->stack = nd->internal;
+	}
+}
+
+static int __nd_alloc_stack(struct nameidata *nd)
+{
+	struct saved *p = kmalloc((MAXSYMLINKS + 1) * sizeof(struct saved),
+				  GFP_KERNEL);
+	if (unlikely(!p))
+		return -ENOMEM;
+	memcpy(p, nd->internal, sizeof(nd->internal));
+	nd->stack = p;
+	return 0;
+}
+
+static inline int nd_alloc_stack(struct nameidata *nd)
+{
+	if (likely(nd->depth != EMBEDDED_LEVELS - 1))
+		return 0;
+	if (likely(nd->stack != nd->internal))
+		return 0;
+	return __nd_alloc_stack(nd);
+}
+
 /*
  * Path walking has 2 modes, rcu-walk and ref-walk (see
  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
@@ -857,7 +891,7 @@ const char *get_link(struct nameidata *nd)
 	if (nd->link.mnt == nd->path.mnt)
 		mntget(nd->link.mnt);
 
-	if (unlikely(current->total_link_count >= 40)) {
+	if (unlikely(current->total_link_count >= MAXSYMLINKS)) {
 		path_put(&nd->path);
 		path_put(&nd->link);
 		return ERR_PTR(-ELOOP);
@@ -1789,22 +1823,18 @@ Walked:
 		if (err) {
 			const char *s;
 
-			if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
-				path_put_conditional(&nd->link, nd);
-				path_put(&nd->path);
-				err = -ELOOP;
-				goto Err;
+			err = nd_alloc_stack(nd);
+			if (unlikely(err)) {
+				path_to_nameidata(&nd->link, nd);
+				break;
 			}
-			BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 
 			nd->depth++;
-			current->link_count++;
 
 			s = get_link(nd);
 
 			if (unlikely(IS_ERR(s))) {
 				err = PTR_ERR(s);
-				current->link_count--;
 				nd->depth--;
 				goto Err;
 			}
@@ -1812,7 +1842,6 @@ Walked:
 			if (unlikely(!s)) {
 				/* jumped */
 				put_link(nd);
-				current->link_count--;
 				nd->depth--;
 			} else {
 				if (*s == '/') {
@@ -1842,7 +1871,6 @@ Walked:
 Err:
 	while (unlikely(nd->depth)) {
 		put_link(nd);
-		current->link_count--;
 		nd->depth--;
 	}
 	return err;
@@ -1851,7 +1879,6 @@ OK:
 		name = nd->stack[nd->depth].name;
 		err = walk_component(nd, LOOKUP_FOLLOW);
 		put_link(nd);
-		current->link_count--;
 		nd->depth--;
 		goto Walked;
 	}
@@ -2055,7 +2082,11 @@ static int path_lookupat(int dfd, const struct filename *name,
 static int filename_lookup(int dfd, struct filename *name,
 				unsigned int flags, struct nameidata *nd)
 {
-	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+	int retval;
+
+	set_nameidata(nd);
+	retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+
 	if (unlikely(retval == -ECHILD))
 		retval = path_lookupat(dfd, name, flags, nd);
 	if (unlikely(retval == -ESTALE))
@@ -2063,6 +2094,7 @@ static int filename_lookup(int dfd, struct filename *name,
 
 	if (likely(!retval))
 		audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
+	restore_nameidata(nd);
 	return retval;
 }
 
@@ -2393,6 +2425,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
 	int error;
 	if (IS_ERR(name))
 		return PTR_ERR(name);
+	set_nameidata(&nd);
 	error = path_mountpoint(dfd, name, path, &nd, flags | LOOKUP_RCU);
 	if (unlikely(error == -ECHILD))
 		error = path_mountpoint(dfd, name, path, &nd, flags);
@@ -2400,6 +2433,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
 		error = path_mountpoint(dfd, name, path, &nd, flags | LOOKUP_REVAL);
 	if (likely(!error))
 		audit_inode(name, path->dentry, 0);
+	restore_nameidata(&nd);
 	putname(name);
 	return error;
 }
@@ -3288,11 +3322,13 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
 	int flags = op->lookup_flags;
 	struct file *filp;
 
+	set_nameidata(&nd);
 	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
 	if (unlikely(filp == ERR_PTR(-ECHILD)))
 		filp = path_openat(dfd, pathname, &nd, op, flags);
 	if (unlikely(filp == ERR_PTR(-ESTALE)))
 		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+	restore_nameidata(&nd);
 	return filp;
 }
 
@@ -3306,6 +3342,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 
 	nd.root.mnt = mnt;
 	nd.root.dentry = dentry;
+	set_nameidata(&nd);
 
 	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
@@ -3319,6 +3356,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 		file = path_openat(-1, filename, &nd, op, flags);
 	if (unlikely(file == ERR_PTR(-ESTALE)))
 		file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
+	restore_nameidata(&nd);
 	putname(filename);
 	return file;
 }
diff --git a/include/linux/namei.h b/include/linux/namei.h
index a5d5bed2c0e1..3a6cc9651712 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -11,6 +11,8 @@ struct nameidata;
 
 enum { MAX_NESTED_LINKS = 8 };
 
+#define MAXSYMLINKS 40
+
 /*
  * Type of the last component on LOOKUP_PARENT
  */
-- 
cgit v1.2.3


From 756daf263ea53a8bfc89db26cb92e963953253a1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 23 Mar 2015 13:37:38 +1100
Subject: VFS: replace {, total_}link_count in task_struct with pointer to
 nameidata

task_struct currently contains two ad-hoc members for use by the VFS:
link_count and total_link_count.  These are only interesting to fs/namei.c,
so exposing them explicitly is poor layering.  Incidentally, link_count
isn't used anymore, so it can just die.

This patches replaces those with a single pointer to 'struct nameidata'.
This structure represents the current filename lookup of which
there can only be one per process, and is a natural place to
store total_link_count.

This will allow the current "nameidata" argument to all
follow_link operations to be removed as current->nameidata
can be used instead in the _very_ few instances that care about
it at all.

As there are occasional circumstances where pathname lookup can
recurse, such as through kern_path_locked, we always save and old
current->nameidata (if there is one) when setting a new value, and
make sure any active link_counts are preserved.

follow_mount and follow_automount now get a 'struct nameidata *'
rather than 'int flags' so that they can directly access
total_link_count, rather than going through 'current'.

Suggested-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 70 ++++++++++++++++++++++++++++-----------------------
 include/linux/sched.h |  2 +-
 2 files changed, 40 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 046a703c081d..699e093f52b5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -505,6 +505,7 @@ struct nameidata {
 	unsigned	seq, m_seq;
 	int		last_type;
 	unsigned	depth;
+	int		total_link_count;
 	struct file	*base;
 	struct saved {
 		struct path link;
@@ -513,16 +514,25 @@ struct nameidata {
 	} *stack, internal[EMBEDDED_LEVELS];
 };
 
-static void set_nameidata(struct nameidata *nd)
+static struct nameidata *set_nameidata(struct nameidata *p)
 {
-	nd->stack = nd->internal;
+	struct nameidata *old = current->nameidata;
+	p->stack = p->internal;
+	p->total_link_count = old ? old->total_link_count : 0;
+	current->nameidata = p;
+	return old;
 }
 
-static void restore_nameidata(struct nameidata *nd)
+static void restore_nameidata(struct nameidata *old)
 {
-	if (nd->stack != nd->internal) {
-		kfree(nd->stack);
-		nd->stack = nd->internal;
+	struct nameidata *now = current->nameidata;
+
+	current->nameidata = old;
+	if (old)
+		old->total_link_count = now->total_link_count;
+	if (now->stack != now->internal) {
+		kfree(now->stack);
+		now->stack = now->internal;
 	}
 }
 
@@ -970,7 +980,7 @@ EXPORT_SYMBOL(follow_up);
  * - return -EISDIR to tell follow_managed() to stop and return the path we
  *   were called with.
  */
-static int follow_automount(struct path *path, unsigned flags,
+static int follow_automount(struct path *path, struct nameidata *nd,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
@@ -990,13 +1000,13 @@ static int follow_automount(struct path *path, unsigned flags,
 	 * as being automount points.  These will need the attentions
 	 * of the daemon to instantiate them before they can be used.
 	 */
-	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-		     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
 	    path->dentry->d_inode)
 		return -EISDIR;
 
-	current->total_link_count++;
-	if (current->total_link_count >= 40)
+	nd->total_link_count++;
+	if (nd->total_link_count >= 40)
 		return -ELOOP;
 
 	mnt = path->dentry->d_op->d_automount(path);
@@ -1010,7 +1020,7 @@ static int follow_automount(struct path *path, unsigned flags,
 		 * the path being looked up; if it wasn't then the remainder of
 		 * the path is inaccessible and we should say so.
 		 */
-		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
+		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
 			return -EREMOTE;
 		return PTR_ERR(mnt);
 	}
@@ -1050,7 +1060,7 @@ static int follow_automount(struct path *path, unsigned flags,
  *
  * Serialization is taken care of in namespace.c
  */
-static int follow_managed(struct path *path, unsigned flags)
+static int follow_managed(struct path *path, struct nameidata *nd)
 {
 	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
 	unsigned managed;
@@ -1094,7 +1104,7 @@ static int follow_managed(struct path *path, unsigned flags)
 
 		/* Handle an automount point */
 		if (managed & DCACHE_NEED_AUTOMOUNT) {
-			ret = follow_automount(path, flags, &need_mntput);
+			ret = follow_automount(path, nd, &need_mntput);
 			if (ret < 0)
 				break;
 			continue;
@@ -1483,7 +1493,7 @@ unlazy:
 	}
 	path->mnt = mnt;
 	path->dentry = dentry;
-	err = follow_managed(path, nd->flags);
+	err = follow_managed(path, nd);
 	if (unlikely(err < 0)) {
 		path_put_conditional(path, nd);
 		return err;
@@ -1513,7 +1523,7 @@ static int lookup_slow(struct nameidata *nd, struct path *path)
 		return PTR_ERR(dentry);
 	path->mnt = nd->path.mnt;
 	path->dentry = dentry;
-	err = follow_managed(path, nd->flags);
+	err = follow_managed(path, nd);
 	if (unlikely(err < 0)) {
 		path_put_conditional(path, nd);
 		return err;
@@ -1563,7 +1573,7 @@ static void terminate_walk(struct nameidata *nd)
 static int pick_link(struct nameidata *nd, struct path *link)
 {
 	int error;
-	if (unlikely(current->total_link_count++ >= MAXSYMLINKS)) {
+	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
 		path_to_nameidata(link, nd);
 		return -ELOOP;
 	}
@@ -1981,7 +1991,7 @@ static int path_init(int dfd, const struct filename *name, unsigned int flags,
 	rcu_read_unlock();
 	return -ECHILD;
 done:
-	current->total_link_count = 0;
+	nd->total_link_count = 0;
 	return link_path_walk(s, nd);
 }
 
@@ -2087,10 +2097,9 @@ static int filename_lookup(int dfd, struct filename *name,
 				unsigned int flags, struct nameidata *nd)
 {
 	int retval;
+	struct nameidata *saved_nd = set_nameidata(nd);
 
-	set_nameidata(nd);
 	retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
-
 	if (unlikely(retval == -ECHILD))
 		retval = path_lookupat(dfd, name, flags, nd);
 	if (unlikely(retval == -ESTALE))
@@ -2098,7 +2107,7 @@ static int filename_lookup(int dfd, struct filename *name,
 
 	if (likely(!retval))
 		audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
-	restore_nameidata(nd);
+	restore_nameidata(saved_nd);
 	return retval;
 }
 
@@ -2426,11 +2435,11 @@ static int
 filename_mountpoint(int dfd, struct filename *name, struct path *path,
 			unsigned int flags)
 {
-	struct nameidata nd;
+	struct nameidata nd, *saved;
 	int error;
 	if (IS_ERR(name))
 		return PTR_ERR(name);
-	set_nameidata(&nd);
+	saved = set_nameidata(&nd);
 	error = path_mountpoint(dfd, name, path, &nd, flags | LOOKUP_RCU);
 	if (unlikely(error == -ECHILD))
 		error = path_mountpoint(dfd, name, path, &nd, flags);
@@ -2438,7 +2447,7 @@ filename_mountpoint(int dfd, struct filename *name, struct path *path,
 		error = path_mountpoint(dfd, name, path, &nd, flags | LOOKUP_REVAL);
 	if (likely(!error))
 		audit_inode(name, path->dentry, 0);
-	restore_nameidata(&nd);
+	restore_nameidata(saved);
 	putname(name);
 	return error;
 }
@@ -3087,7 +3096,7 @@ retry_lookup:
 	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
 		goto exit_dput;
 
-	error = follow_managed(&path, nd->flags);
+	error = follow_managed(&path, nd);
 	if (error < 0)
 		goto exit_dput;
 
@@ -3323,31 +3332,29 @@ out2:
 struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op)
 {
-	struct nameidata nd;
+	struct nameidata nd, *saved_nd = set_nameidata(&nd);
 	int flags = op->lookup_flags;
 	struct file *filp;
 
-	set_nameidata(&nd);
 	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
 	if (unlikely(filp == ERR_PTR(-ECHILD)))
 		filp = path_openat(dfd, pathname, &nd, op, flags);
 	if (unlikely(filp == ERR_PTR(-ESTALE)))
 		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
-	restore_nameidata(&nd);
+	restore_nameidata(saved_nd);
 	return filp;
 }
 
 struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 		const char *name, const struct open_flags *op)
 {
-	struct nameidata nd;
+	struct nameidata nd, *saved_nd;
 	struct file *file;
 	struct filename *filename;
 	int flags = op->lookup_flags | LOOKUP_ROOT;
 
 	nd.root.mnt = mnt;
 	nd.root.dentry = dentry;
-	set_nameidata(&nd);
 
 	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
@@ -3356,12 +3363,13 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 	if (unlikely(IS_ERR(filename)))
 		return ERR_CAST(filename);
 
+	saved_nd = set_nameidata(&nd);
 	file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
 	if (unlikely(file == ERR_PTR(-ECHILD)))
 		file = path_openat(-1, filename, &nd, op, flags);
 	if (unlikely(file == ERR_PTR(-ESTALE)))
 		file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
-	restore_nameidata(&nd);
+	restore_nameidata(saved_nd);
 	putname(filename);
 	return file;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734..f6c9b69d66f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1461,7 +1461,7 @@ struct task_struct {
 				       it with task_lock())
 				     - initialized normally by setup_new_exec */
 /* file system info */
-	int link_count, total_link_count;
+	struct nameidata *nameidata;
 #ifdef CONFIG_SYSVIPC
 /* ipc stuff */
 	struct sysv_sem sysvsem;
-- 
cgit v1.2.3


From 6e77137b363b8d866ac29c5a0c95e953614fb2d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 2 May 2015 13:37:52 -0400
Subject: don't pass nameidata to ->follow_link()

its only use is getting passed to nd_jump_link(), which can obtain
it from current->nameidata

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking             |  2 +-
 Documentation/filesystems/vfs.txt             |  2 +-
 drivers/staging/lustre/lustre/llite/symlink.c |  2 +-
 fs/9p/vfs_inode.c                             |  2 +-
 fs/9p/vfs_inode_dotl.c                        |  2 +-
 fs/autofs4/symlink.c                          |  2 +-
 fs/befs/linuxvfs.c                            |  4 ++--
 fs/cifs/cifsfs.h                              |  2 +-
 fs/cifs/link.c                                |  2 +-
 fs/configfs/symlink.c                         |  2 +-
 fs/ecryptfs/inode.c                           |  2 +-
 fs/ext4/symlink.c                             |  2 +-
 fs/f2fs/namei.c                               |  4 ++--
 fs/fuse/dir.c                                 |  2 +-
 fs/gfs2/inode.c                               |  2 +-
 fs/hostfs/hostfs_kern.c                       |  2 +-
 fs/hppfs/hppfs.c                              |  4 ++--
 fs/kernfs/symlink.c                           |  2 +-
 fs/libfs.c                                    |  2 +-
 fs/namei.c                                    | 11 ++++++-----
 fs/nfs/symlink.c                              |  2 +-
 fs/overlayfs/inode.c                          |  4 ++--
 fs/proc/base.c                                |  4 ++--
 fs/proc/inode.c                               |  2 +-
 fs/proc/namespaces.c                          |  4 ++--
 fs/proc/self.c                                |  2 +-
 fs/proc/thread_self.c                         |  2 +-
 fs/xfs/xfs_iops.c                             |  3 +--
 include/linux/fs.h                            |  6 +++---
 include/linux/namei.h                         |  2 +-
 mm/shmem.c                                    |  2 +-
 31 files changed, 44 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 7fa6c4ac858c..5b5b4f54c033 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -50,7 +50,7 @@ prototypes:
 	int (*rename2) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *, unsigned int);
 	int (*readlink) (struct dentry *, char __user *,int);
-	const char *(*follow_link) (struct dentry *, void **, struct nameidata *);
+	const char *(*follow_link) (struct dentry *, void **);
 	void (*put_link) (struct dentry *, void *);
 	void (*truncate) (struct inode *);
 	int (*permission) (struct inode *, int, unsigned int);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 1c6b03ac2e5a..0dec8c880be6 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -350,7 +350,7 @@ struct inode_operations {
 	int (*rename2) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *, unsigned int);
 	int (*readlink) (struct dentry *, char __user *,int);
-	const char *(*follow_link) (struct dentry *, void **, struct nameidata *);
+	const char *(*follow_link) (struct dentry *, void **);
 	void (*put_link) (struct dentry *, void *);
 	int (*permission) (struct inode *, int);
 	int (*get_acl)(struct inode *, int);
diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
index da6d9d17c50d..f3be3bf0f66f 100644
--- a/drivers/staging/lustre/lustre/llite/symlink.c
+++ b/drivers/staging/lustre/lustre/llite/symlink.c
@@ -118,7 +118,7 @@ failed:
 	return rc;
 }
 
-static const char *ll_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *ll_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	struct ptlrpc_request *request = NULL;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7cc70a39a1d8..271f51af2f75 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1230,7 +1230,7 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
  *
  */
 
-static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
 	struct p9_fid *fid = v9fs_fid_lookup(dentry);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index ae062ffa0f1f..16658ed677c9 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -910,7 +910,7 @@ error:
  */
 
 static const char *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie, struct nameidata *nd)
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
 {
 	struct p9_fid *fid = v9fs_fid_lookup(dentry);
 	char *target;
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 9c6a07739c9b..da0c33481bc0 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -12,7 +12,7 @@
 
 #include "autofs_i.h"
 
-static const char *autofs4_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 3a1aefb86a11..46aedacfa6a8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
 static struct inode *befs_alloc_inode(struct super_block *sb);
 static void befs_destroy_inode(struct inode *inode);
 static void befs_destroy_inodecache(void);
-static const char *befs_follow_link(struct dentry *, void **, struct nameidata *nd);
+static const char *befs_follow_link(struct dentry *, void **);
 static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -464,7 +464,7 @@ befs_destroy_inodecache(void)
  * flag is set.
  */
 static const char *
-befs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+befs_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 61012da7e9d8..a782b22904e4 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
 #endif
 
 /* Functions related to symlinks */
-extern const char *cifs_follow_link(struct dentry *direntry, void **cookie, struct nameidata *nd);
+extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
 extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
 			 int buflen);
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 4a439c2c0c7f..546f86ab09aa 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -627,7 +627,7 @@ cifs_hl_exit:
 }
 
 const char *
-cifs_follow_link(struct dentry *direntry, void **cookie, struct nameidata *nd)
+cifs_follow_link(struct dentry *direntry, void **cookie)
 {
 	struct inode *inode = d_inode(direntry);
 	int rc = -ENOMEM;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index fac8e8517f33..0ace75649009 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,7 +279,7 @@ static int configfs_getlink(struct dentry *dentry, char * path)
 
 }
 
-static const char *configfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
 	int error;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index cdb9d6c4532d..73d20ae92478 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -675,7 +675,7 @@ out:
 	return rc ? ERR_PTR(rc) : buf;
 }
 
-static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	size_t len;
 	char *buf = ecryptfs_readlink_lower(dentry, &len);
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index afec475aaf5c..ba5bd18a9825 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,7 +23,7 @@
 #include "xattr.h"
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-static const char *ext4_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *ext4_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct page *cpage = NULL;
 	char *caddr, *paddr = NULL;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index d2947937515e..cd05a7c91533 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -296,9 +296,9 @@ fail:
 	return err;
 }
 
-static const char *f2fs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
 {
-	const char *link = page_follow_link_light(dentry, cookie, nd);
+	const char *link = page_follow_link_light(dentry, cookie);
 	if (!IS_ERR(link) && !*link) {
 		/* this is broken symlink case */
 		page_put_link(dentry, *cookie);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f9cb260375cf..d5cdef8b7f3a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1365,7 +1365,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
 	return err;
 }
 
-static const char *fuse_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f59390aebffb..3a1461de1551 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1548,7 +1548,7 @@ out:
  * Returns: 0 on success or error code
  */
 
-static const char *gfs2_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
 	struct gfs2_holder i_gh;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index f650ed661fab..7b6ed7a908f6 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -892,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = {
 	.setattr	= hostfs_setattr,
 };
 
-static const char *hostfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	char *link = __getname();
 	if (link) {
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index b8f24d3b04ee..15a774eb5bbf 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -642,11 +642,11 @@ static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
 						    buflen);
 }
 
-static const char *hppfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *hppfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
 
-	return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie, nd);
+	return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie);
 }
 
 static void hppfs_put_link(struct dentry *dentry, void *cookie)
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 3c7e799974a2..366c5a17475e 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,7 +112,7 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
 	return error;
 }
 
-static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
 {
 	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
diff --git a/fs/libfs.c b/fs/libfs.c
index 0c83fde20dbd..c5f3373e326b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1091,7 +1091,7 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
 }
 EXPORT_SYMBOL(simple_nosetlease);
 
-const char *simple_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+const char *simple_follow_link(struct dentry *dentry, void **cookie)
 {
 	return d_inode(dentry)->i_link;
 }
diff --git a/fs/namei.c b/fs/namei.c
index b57400ca6a0f..f311f0369e3c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -753,8 +753,9 @@ static inline void path_to_nameidata(const struct path *path,
  * Helper to directly jump to a known parsed path from ->follow_link,
  * caller must have taken a reference to path beforehand.
  */
-void nd_jump_link(struct nameidata *nd, struct path *path)
+void nd_jump_link(struct path *path)
 {
+	struct nameidata *nd = current->nameidata;
 	path_put(&nd->path);
 
 	nd->path = *path;
@@ -916,7 +917,7 @@ const char *get_link(struct nameidata *nd)
 	nd->last_type = LAST_BIND;
 	res = inode->i_link;
 	if (!res) {
-		res = inode->i_op->follow_link(dentry, &last->cookie, nd);
+		res = inode->i_op->follow_link(dentry, &last->cookie);
 		if (IS_ERR(res)) {
 out:
 			path_put(&last->link);
@@ -4485,12 +4486,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	int res;
 
 	if (!link) {
-		link = dentry->d_inode->i_op->follow_link(dentry, &cookie, NULL);
+		link = dentry->d_inode->i_op->follow_link(dentry, &cookie);
 		if (IS_ERR(link))
 			return PTR_ERR(link);
 	}
 	res = readlink_copy(buffer, buflen, link);
-	if (cookie && dentry->d_inode->i_op->put_link)
+	if (dentry->d_inode->i_op->put_link)
 		dentry->d_inode->i_op->put_link(dentry, cookie);
 	return res;
 }
@@ -4523,7 +4524,7 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 }
 EXPORT_SYMBOL(page_readlink);
 
-const char *page_follow_link_light(struct dentry *dentry, void **cookie, struct nameidata *nd)
+const char *page_follow_link_light(struct dentry *dentry, void **cookie)
 {
 	struct page *page = NULL;
 	char *res = page_getlink(dentry, &page);
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index c992b200ae7e..b6de433da5db 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -42,7 +42,7 @@ error:
 	return -EIO;
 }
 
-static const char *nfs_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	struct page *page;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 235ad42afb57..9986833c9fcc 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -140,7 +140,7 @@ struct ovl_link_data {
 	void *cookie;
 };
 
-static const char *ovl_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct dentry *realdentry;
 	struct inode *realinode;
@@ -160,7 +160,7 @@ static const char *ovl_follow_link(struct dentry *dentry, void **cookie, struct
 		data->realdentry = realdentry;
 	}
 
-	ret = realinode->i_op->follow_link(realdentry, cookie, nd);
+	ret = realinode->i_op->follow_link(realdentry, cookie);
 	if (IS_ERR_OR_NULL(ret)) {
 		kfree(data);
 		return ret;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 52652f86b187..286a422f440e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1380,7 +1380,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 		return -ENOENT;
 }
 
-static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	struct path path;
@@ -1394,7 +1394,7 @@ static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie, st
 	if (error)
 		goto out;
 
-	nd_jump_link(nd, &path);
+	nd_jump_link(&path);
 	return NULL;
 out:
 	return ERR_PTR(error);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index acd51d75387d..eb35874fe09c 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -393,7 +393,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
 };
 #endif
 
-static const char *proc_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *proc_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct proc_dir_entry *pde = PDE(d_inode(dentry));
 	if (unlikely(!use_pde(pde)))
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 10d24dd096e8..f6e8354b8cea 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -30,7 +30,7 @@ static const struct proc_ns_operations *ns_entries[] = {
 	&mntns_operations,
 };
 
-static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct inode *inode = d_inode(dentry);
 	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
@@ -45,7 +45,7 @@ static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie, str
 	if (ptrace_may_access(task, PTRACE_MODE_READ)) {
 		error = ns_get_path(&ns_path, task, ns_ops);
 		if (!error)
-			nd_jump_link(nd, &ns_path);
+			nd_jump_link(&ns_path);
 	}
 	put_task_struct(task);
 	return error;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ad333946b53a..113b8d061fc0 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -18,7 +18,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
 	return readlink_copy(buffer, buflen, tmp);
 }
 
-static const char *proc_self_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 85c96e0d7aaa..947b0f4fd0a1 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -19,7 +19,7 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
 	return readlink_copy(buffer, buflen, tmp);
 }
 
-static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 26c4dcb1ef56..7f51f39f8acc 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -416,8 +416,7 @@ xfs_vn_rename(
 STATIC const char *
 xfs_vn_follow_link(
 	struct dentry		*dentry,
-	void			**cookie,
-	struct nameidata	*nd)
+	void			**cookie)
 {
 	char			*link;
 	int			error = -ENOMEM;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab934113a28..ed7c9f298759 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1608,7 +1608,7 @@ struct file_operations {
 
 struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
-	const char * (*follow_link) (struct dentry *, void **, struct nameidata *);
+	const char * (*follow_link) (struct dentry *, void **);
 	int (*permission) (struct inode *, int);
 	struct posix_acl * (*get_acl)(struct inode *, int);
 
@@ -2705,7 +2705,7 @@ extern const struct file_operations generic_ro_fops;
 
 extern int readlink_copy(char __user *, int, const char *);
 extern int page_readlink(struct dentry *, char __user *, int);
-extern const char *page_follow_link_light(struct dentry *, void **, struct nameidata *);
+extern const char *page_follow_link_light(struct dentry *, void **);
 extern void page_put_link(struct dentry *, void *);
 extern int __page_symlink(struct inode *inode, const char *symname, int len,
 		int nofs);
@@ -2722,7 +2722,7 @@ void __inode_sub_bytes(struct inode *inode, loff_t bytes);
 void inode_sub_bytes(struct inode *inode, loff_t bytes);
 loff_t inode_get_bytes(struct inode *inode);
 void inode_set_bytes(struct inode *inode, loff_t bytes);
-const char *simple_follow_link(struct dentry *, void **, struct nameidata *);
+const char *simple_follow_link(struct dentry *, void **);
 extern const struct inode_operations simple_symlink_inode_operations;
 
 extern int iterate_dir(struct file *, struct dir_context *);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 3a6cc9651712..d756304aa09b 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -72,7 +72,7 @@ extern int follow_up(struct path *);
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 
-extern void nd_jump_link(struct nameidata *nd, struct path *path);
+extern void nd_jump_link(struct path *path);
 
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index d1693dcb4285..e02682267046 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2475,7 +2475,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 	return 0;
 }
 
-static const char *shmem_follow_link(struct dentry *dentry, void **cookie, struct nameidata *nd)
+static const char *shmem_follow_link(struct dentry *dentry, void **cookie)
 {
 	struct page *page = NULL;
 	int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
-- 
cgit v1.2.3


From 2da572c959dd5815aef153cf62010b16a498a0d3 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Thu, 7 May 2015 13:49:14 -0400
Subject: lib: add software 842 compression/decompression

Add 842-format software compression and decompression functions.
Update the MAINTAINERS 842 section to include the new files.

The 842 compression function can compress any input data into the 842
compression format.  The 842 decompression function can decompress any
standard-format 842 compressed data - specifically, either a compressed
data buffer created by the 842 software compression function, or a
compressed data buffer created by the 842 hardware compressor (located
in PowerPC coprocessors).

The 842 compressed data format is explained in the header comments.

This is used in a later patch to provide a full software 842 compression
and decompression crypto interface.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 MAINTAINERS              |   2 +
 include/linux/sw842.h    |  12 +
 lib/842/842.h            | 127 ++++++++++
 lib/842/842_compress.c   | 626 +++++++++++++++++++++++++++++++++++++++++++++++
 lib/842/842_debugfs.h    |  52 ++++
 lib/842/842_decompress.c | 405 ++++++++++++++++++++++++++++++
 lib/842/Makefile         |   2 +
 lib/Kconfig              |   6 +
 lib/Makefile             |   2 +
 9 files changed, 1234 insertions(+)
 create mode 100644 include/linux/sw842.h
 create mode 100644 lib/842/842.h
 create mode 100644 lib/842/842_compress.c
 create mode 100644 lib/842/842_debugfs.h
 create mode 100644 lib/842/842_decompress.c
 create mode 100644 lib/842/Makefile

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 56a432d51119..96684fd49abc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4863,6 +4863,8 @@ M:	Dan Streetman <ddstreet@us.ibm.com>
 S:	Supported
 F:	drivers/crypto/nx/nx-842.c
 F:	include/linux/nx842.h
+F:	include/linux/sw842.h
+F:	lib/842/
 
 IBM Power Linux RAID adapter
 M:	Brian King <brking@us.ibm.com>
diff --git a/include/linux/sw842.h b/include/linux/sw842.h
new file mode 100644
index 000000000000..109ba041c2ae
--- /dev/null
+++ b/include/linux/sw842.h
@@ -0,0 +1,12 @@
+#ifndef __SW842_H__
+#define __SW842_H__
+
+#define SW842_MEM_COMPRESS	(0xf000)
+
+int sw842_compress(const u8 *src, unsigned int srclen,
+		   u8 *dst, unsigned int *destlen, void *wmem);
+
+int sw842_decompress(const u8 *src, unsigned int srclen,
+		     u8 *dst, unsigned int *destlen);
+
+#endif
diff --git a/lib/842/842.h b/lib/842/842.h
new file mode 100644
index 000000000000..7c200030acf7
--- /dev/null
+++ b/lib/842/842.h
@@ -0,0 +1,127 @@
+
+#ifndef __842_H__
+#define __842_H__
+
+/* The 842 compressed format is made up of multiple blocks, each of
+ * which have the format:
+ *
+ * <template>[arg1][arg2][arg3][arg4]
+ *
+ * where there are between 0 and 4 template args, depending on the specific
+ * template operation.  For normal operations, each arg is either a specific
+ * number of data bytes to add to the output buffer, or an index pointing
+ * to a previously-written number of data bytes to copy to the output buffer.
+ *
+ * The template code is a 5-bit value.  This code indicates what to do with
+ * the following data.  Template codes from 0 to 0x19 should use the template
+ * table, the static "decomp_ops" table used in decompress.  For each template
+ * (table row), there are between 1 and 4 actions; each action corresponds to
+ * an arg following the template code bits.  Each action is either a "data"
+ * type action, or a "index" type action, and each action results in 2, 4, or 8
+ * bytes being written to the output buffer.  Each template (i.e. all actions
+ * in the table row) will add up to 8 bytes being written to the output buffer.
+ * Any row with less than 4 actions is padded with noop actions, indicated by
+ * N0 (for which there is no corresponding arg in the compressed data buffer).
+ *
+ * "Data" actions, indicated in the table by D2, D4, and D8, mean that the
+ * corresponding arg is 2, 4, or 8 bytes, respectively, in the compressed data
+ * buffer should be copied directly to the output buffer.
+ *
+ * "Index" actions, indicated in the table by I2, I4, and I8, mean the
+ * corresponding arg is an index parameter that points to, respectively, a 2,
+ * 4, or 8 byte value already in the output buffer, that should be copied to
+ * the end of the output buffer.  Essentially, the index points to a position
+ * in a ring buffer that contains the last N bytes of output buffer data.
+ * The number of bits for each index's arg are: 8 bits for I2, 9 bits for I4,
+ * and 8 bits for I8.  Since each index points to a 2, 4, or 8 byte section,
+ * this means that I2 can reference 512 bytes ((2^8 bits = 256) * 2 bytes), I4
+ * can reference 2048 bytes ((2^9 = 512) * 4 bytes), and I8 can reference 2048
+ * bytes ((2^8 = 256) * 8 bytes).  Think of it as a kind-of ring buffer for
+ * each of I2, I4, and I8 that are updated for each byte written to the output
+ * buffer.  In this implementation, the output buffer is directly used for each
+ * index; there is no additional memory required.  Note that the index is into
+ * a ring buffer, not a sliding window; for example, if there have been 260
+ * bytes written to the output buffer, an I2 index of 0 would index to byte 256
+ * in the output buffer, while an I2 index of 16 would index to byte 16 in the
+ * output buffer.
+ *
+ * There are also 3 special template codes; 0x1b for "repeat", 0x1c for
+ * "zeros", and 0x1e for "end".  The "repeat" operation is followed by a 6 bit
+ * arg N indicating how many times to repeat.  The last 8 bytes written to the
+ * output buffer are written again to the output buffer, N + 1 times.  The
+ * "zeros" operation, which has no arg bits, writes 8 zeros to the output
+ * buffer.  The "end" operation, which also has no arg bits, signals the end
+ * of the compressed data.  There may be some number of padding (don't care,
+ * but usually 0) bits after the "end" operation bits, to fill the buffer
+ * length to a specific byte multiple (usually a multiple of 8, 16, or 32
+ * bytes).
+ *
+ * This software implementation also uses one of the undefined template values,
+ * 0x1d as a special "short data" template code, to represent less than 8 bytes
+ * of uncompressed data.  It is followed by a 3 bit arg N indicating how many
+ * data bytes will follow, and then N bytes of data, which should be copied to
+ * the output buffer.  This allows the software 842 compressor to accept input
+ * buffers that are not an exact multiple of 8 bytes long.  However, those
+ * compressed buffers containing this sw-only template will be rejected by
+ * the 842 hardware decompressor, and must be decompressed with this software
+ * library.  The 842 software compression module includes a parameter to
+ * disable using this sw-only "short data" template, and instead simply
+ * reject any input buffer that is not a multiple of 8 bytes long.
+ *
+ * After all actions for each operation code are processed, another template
+ * code is in the next 5 bits.  The decompression ends once the "end" template
+ * code is detected.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#include <linux/sw842.h>
+
+/* special templates */
+#define OP_REPEAT	(0x1B)
+#define OP_ZEROS	(0x1C)
+#define OP_END		(0x1E)
+
+/* sw only template - this is not in the hw design; it's used only by this
+ * software compressor and decompressor, to allow input buffers that aren't
+ * a multiple of 8.
+ */
+#define OP_SHORT_DATA	(0x1D)
+
+/* additional bits of each op param */
+#define OP_BITS		(5)
+#define REPEAT_BITS	(6)
+#define SHORT_DATA_BITS	(3)
+#define I2_BITS		(8)
+#define I4_BITS		(9)
+#define I8_BITS		(8)
+
+#define REPEAT_BITS_MAX		(0x3f)
+#define SHORT_DATA_BITS_MAX	(0x7)
+
+/* Arbitrary values used to indicate action */
+#define OP_ACTION	(0x70)
+#define OP_ACTION_INDEX	(0x10)
+#define OP_ACTION_DATA	(0x20)
+#define OP_ACTION_NOOP	(0x40)
+#define OP_AMOUNT	(0x0f)
+#define OP_AMOUNT_0	(0x00)
+#define OP_AMOUNT_2	(0x02)
+#define OP_AMOUNT_4	(0x04)
+#define OP_AMOUNT_8	(0x08)
+
+#define D2		(OP_ACTION_DATA  | OP_AMOUNT_2)
+#define D4		(OP_ACTION_DATA  | OP_AMOUNT_4)
+#define D8		(OP_ACTION_DATA  | OP_AMOUNT_8)
+#define I2		(OP_ACTION_INDEX | OP_AMOUNT_2)
+#define I4		(OP_ACTION_INDEX | OP_AMOUNT_4)
+#define I8		(OP_ACTION_INDEX | OP_AMOUNT_8)
+#define N0		(OP_ACTION_NOOP  | OP_AMOUNT_0)
+
+/* the max of the regular templates - not including the special templates */
+#define OPS_MAX		(0x1a)
+
+#endif
diff --git a/lib/842/842_compress.c b/lib/842/842_compress.c
new file mode 100644
index 000000000000..7ce68948e68c
--- /dev/null
+++ b/lib/842/842_compress.c
@@ -0,0 +1,626 @@
+/*
+ * 842 Software Compression
+ *
+ * Copyright (C) 2015 Dan Streetman, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See 842.h for details of the 842 compressed format.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define MODULE_NAME "842_compress"
+
+#include <linux/hashtable.h>
+
+#include "842.h"
+#include "842_debugfs.h"
+
+#define SW842_HASHTABLE8_BITS	(10)
+#define SW842_HASHTABLE4_BITS	(11)
+#define SW842_HASHTABLE2_BITS	(10)
+
+/* By default, we allow compressing input buffers of any length, but we must
+ * use the non-standard "short data" template so the decompressor can correctly
+ * reproduce the uncompressed data buffer at the right length.  However the
+ * hardware 842 compressor will not recognize the "short data" template, and
+ * will fail to decompress any compressed buffer containing it (I have no idea
+ * why anyone would want to use software to compress and hardware to decompress
+ * but that's beside the point).  This parameter forces the compression
+ * function to simply reject any input buffer that isn't a multiple of 8 bytes
+ * long, instead of using the "short data" template, so that all compressed
+ * buffers produced by this function will be decompressable by the 842 hardware
+ * decompressor.  Unless you have a specific need for that, leave this disabled
+ * so that any length buffer can be compressed.
+ */
+static bool sw842_strict;
+module_param_named(strict, sw842_strict, bool, 0644);
+
+static u8 comp_ops[OPS_MAX][5] = { /* params size in bits */
+	{ I8, N0, N0, N0, 0x19 }, /* 8 */
+	{ I4, I4, N0, N0, 0x18 }, /* 18 */
+	{ I4, I2, I2, N0, 0x17 }, /* 25 */
+	{ I2, I2, I4, N0, 0x13 }, /* 25 */
+	{ I2, I2, I2, I2, 0x12 }, /* 32 */
+	{ I4, I2, D2, N0, 0x16 }, /* 33 */
+	{ I4, D2, I2, N0, 0x15 }, /* 33 */
+	{ I2, D2, I4, N0, 0x0e }, /* 33 */
+	{ D2, I2, I4, N0, 0x09 }, /* 33 */
+	{ I2, I2, I2, D2, 0x11 }, /* 40 */
+	{ I2, I2, D2, I2, 0x10 }, /* 40 */
+	{ I2, D2, I2, I2, 0x0d }, /* 40 */
+	{ D2, I2, I2, I2, 0x08 }, /* 40 */
+	{ I4, D4, N0, N0, 0x14 }, /* 41 */
+	{ D4, I4, N0, N0, 0x04 }, /* 41 */
+	{ I2, I2, D4, N0, 0x0f }, /* 48 */
+	{ I2, D2, I2, D2, 0x0c }, /* 48 */
+	{ I2, D4, I2, N0, 0x0b }, /* 48 */
+	{ D2, I2, I2, D2, 0x07 }, /* 48 */
+	{ D2, I2, D2, I2, 0x06 }, /* 48 */
+	{ D4, I2, I2, N0, 0x03 }, /* 48 */
+	{ I2, D2, D4, N0, 0x0a }, /* 56 */
+	{ D2, I2, D4, N0, 0x05 }, /* 56 */
+	{ D4, I2, D2, N0, 0x02 }, /* 56 */
+	{ D4, D2, I2, N0, 0x01 }, /* 56 */
+	{ D8, N0, N0, N0, 0x00 }, /* 64 */
+};
+
+struct sw842_hlist_node8 {
+	struct hlist_node node;
+	u64 data;
+	u8 index;
+};
+
+struct sw842_hlist_node4 {
+	struct hlist_node node;
+	u32 data;
+	u16 index;
+};
+
+struct sw842_hlist_node2 {
+	struct hlist_node node;
+	u16 data;
+	u8 index;
+};
+
+#define INDEX_NOT_FOUND		(-1)
+#define INDEX_NOT_CHECKED	(-2)
+
+struct sw842_param {
+	u8 *in;
+	u8 *instart;
+	u64 ilen;
+	u8 *out;
+	u64 olen;
+	u8 bit;
+	u64 data8[1];
+	u32 data4[2];
+	u16 data2[4];
+	int index8[1];
+	int index4[2];
+	int index2[4];
+	DECLARE_HASHTABLE(htable8, SW842_HASHTABLE8_BITS);
+	DECLARE_HASHTABLE(htable4, SW842_HASHTABLE4_BITS);
+	DECLARE_HASHTABLE(htable2, SW842_HASHTABLE2_BITS);
+	struct sw842_hlist_node8 node8[1 << I8_BITS];
+	struct sw842_hlist_node4 node4[1 << I4_BITS];
+	struct sw842_hlist_node2 node2[1 << I2_BITS];
+};
+
+#define get_input_data(p, o, b)						\
+	be##b##_to_cpu(get_unaligned((__be##b *)((p)->in + (o))))
+
+#define init_hashtable_nodes(p, b)	do {			\
+	int _i;							\
+	hash_init((p)->htable##b);				\
+	for (_i = 0; _i < ARRAY_SIZE((p)->node##b); _i++) {	\
+		(p)->node##b[_i].index = _i;			\
+		(p)->node##b[_i].data = 0;			\
+		INIT_HLIST_NODE(&(p)->node##b[_i].node);	\
+	}							\
+} while (0)
+
+#define find_index(p, b, n)	({					\
+	struct sw842_hlist_node##b *_n;					\
+	p->index##b[n] = INDEX_NOT_FOUND;				\
+	hash_for_each_possible(p->htable##b, _n, node, p->data##b[n]) {	\
+		if (p->data##b[n] == _n->data) {			\
+			p->index##b[n] = _n->index;			\
+			break;						\
+		}							\
+	}								\
+	p->index##b[n] >= 0;						\
+})
+
+#define check_index(p, b, n)			\
+	((p)->index##b[n] == INDEX_NOT_CHECKED	\
+	 ? find_index(p, b, n)			\
+	 : (p)->index##b[n] >= 0)
+
+#define replace_hash(p, b, i, d)	do {				\
+	struct sw842_hlist_node##b *_n = &(p)->node##b[(i)+(d)];	\
+	hash_del(&_n->node);						\
+	_n->data = (p)->data##b[d];					\
+	pr_debug("add hash index%x %x pos %x data %lx\n", b,		\
+		 (unsigned int)_n->index,				\
+		 (unsigned int)((p)->in - (p)->instart),		\
+		 (unsigned long)_n->data);				\
+	hash_add((p)->htable##b, &_n->node, _n->data);			\
+} while (0)
+
+static u8 bmask[8] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };
+
+static int add_bits(struct sw842_param *p, u64 d, u8 n);
+
+static int __split_add_bits(struct sw842_param *p, u64 d, u8 n, u8 s)
+{
+	int ret;
+
+	if (n <= s)
+		return -EINVAL;
+
+	ret = add_bits(p, d >> s, n - s);
+	if (ret)
+		return ret;
+	return add_bits(p, d & GENMASK_ULL(s - 1, 0), s);
+}
+
+static int add_bits(struct sw842_param *p, u64 d, u8 n)
+{
+	int b = p->bit, bits = b + n, s = round_up(bits, 8) - bits;
+	u64 o;
+	u8 *out = p->out;
+
+	pr_debug("add %u bits %lx\n", (unsigned char)n, (unsigned long)d);
+
+	if (n > 64)
+		return -EINVAL;
+
+	/* split this up if writing to > 8 bytes (i.e. n == 64 && p->bit > 0),
+	 * or if we're at the end of the output buffer and would write past end
+	 */
+	if (bits > 64)
+		return __split_add_bits(p, d, n, 32);
+	else if (p->olen < 8 && bits > 32 && bits <= 56)
+		return __split_add_bits(p, d, n, 16);
+	else if (p->olen < 4 && bits > 16 && bits <= 24)
+		return __split_add_bits(p, d, n, 8);
+
+	if (DIV_ROUND_UP(bits, 8) > p->olen)
+		return -ENOSPC;
+
+	o = *out & bmask[b];
+	d <<= s;
+
+	if (bits <= 8)
+		*out = o | d;
+	else if (bits <= 16)
+		put_unaligned(cpu_to_be16(o << 8 | d), (__be16 *)out);
+	else if (bits <= 24)
+		put_unaligned(cpu_to_be32(o << 24 | d << 8), (__be32 *)out);
+	else if (bits <= 32)
+		put_unaligned(cpu_to_be32(o << 24 | d), (__be32 *)out);
+	else if (bits <= 40)
+		put_unaligned(cpu_to_be64(o << 56 | d << 24), (__be64 *)out);
+	else if (bits <= 48)
+		put_unaligned(cpu_to_be64(o << 56 | d << 16), (__be64 *)out);
+	else if (bits <= 56)
+		put_unaligned(cpu_to_be64(o << 56 | d << 8), (__be64 *)out);
+	else
+		put_unaligned(cpu_to_be64(o << 56 | d), (__be64 *)out);
+
+	p->bit += n;
+
+	if (p->bit > 7) {
+		p->out += p->bit / 8;
+		p->olen -= p->bit / 8;
+		p->bit %= 8;
+	}
+
+	return 0;
+}
+
+static int add_template(struct sw842_param *p, u8 c)
+{
+	int ret, i, b = 0;
+	u8 *t = comp_ops[c];
+	bool inv = false;
+
+	if (c >= OPS_MAX)
+		return -EINVAL;
+
+	pr_debug("template %x\n", t[4]);
+
+	ret = add_bits(p, t[4], OP_BITS);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < 4; i++) {
+		pr_debug("op %x\n", t[i]);
+
+		switch (t[i] & OP_AMOUNT) {
+		case OP_AMOUNT_8:
+			if (b)
+				inv = true;
+			else if (t[i] & OP_ACTION_INDEX)
+				ret = add_bits(p, p->index8[0], I8_BITS);
+			else if (t[i] & OP_ACTION_DATA)
+				ret = add_bits(p, p->data8[0], 64);
+			else
+				inv = true;
+			break;
+		case OP_AMOUNT_4:
+			if (b == 2 && t[i] & OP_ACTION_DATA)
+				ret = add_bits(p, get_input_data(p, 2, 32), 32);
+			else if (b != 0 && b != 4)
+				inv = true;
+			else if (t[i] & OP_ACTION_INDEX)
+				ret = add_bits(p, p->index4[b >> 2], I4_BITS);
+			else if (t[i] & OP_ACTION_DATA)
+				ret = add_bits(p, p->data4[b >> 2], 32);
+			else
+				inv = true;
+			break;
+		case OP_AMOUNT_2:
+			if (b != 0 && b != 2 && b != 4 && b != 6)
+				inv = true;
+			if (t[i] & OP_ACTION_INDEX)
+				ret = add_bits(p, p->index2[b >> 1], I2_BITS);
+			else if (t[i] & OP_ACTION_DATA)
+				ret = add_bits(p, p->data2[b >> 1], 16);
+			else
+				inv = true;
+			break;
+		case OP_AMOUNT_0:
+			inv = (b != 8) || !(t[i] & OP_ACTION_NOOP);
+			break;
+		default:
+			inv = true;
+			break;
+		}
+
+		if (ret)
+			return ret;
+
+		if (inv) {
+			pr_err("Invalid templ %x op %d : %x %x %x %x\n",
+			       c, i, t[0], t[1], t[2], t[3]);
+			return -EINVAL;
+		}
+
+		b += t[i] & OP_AMOUNT;
+	}
+
+	if (b != 8) {
+		pr_err("Invalid template %x len %x : %x %x %x %x\n",
+		       c, b, t[0], t[1], t[2], t[3]);
+		return -EINVAL;
+	}
+
+	if (sw842_template_counts)
+		atomic_inc(&template_count[t[4]]);
+
+	return 0;
+}
+
+static int add_repeat_template(struct sw842_param *p, u8 r)
+{
+	int ret;
+
+	/* repeat param is 0-based */
+	if (!r || --r > REPEAT_BITS_MAX)
+		return -EINVAL;
+
+	ret = add_bits(p, OP_REPEAT, OP_BITS);
+	if (ret)
+		return ret;
+
+	ret = add_bits(p, r, REPEAT_BITS);
+	if (ret)
+		return ret;
+
+	if (sw842_template_counts)
+		atomic_inc(&template_repeat_count);
+
+	return 0;
+}
+
+static int add_short_data_template(struct sw842_param *p, u8 b)
+{
+	int ret, i;
+
+	if (!b || b > SHORT_DATA_BITS_MAX)
+		return -EINVAL;
+
+	ret = add_bits(p, OP_SHORT_DATA, OP_BITS);
+	if (ret)
+		return ret;
+
+	ret = add_bits(p, b, SHORT_DATA_BITS);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < b; i++) {
+		ret = add_bits(p, p->in[i], 8);
+		if (ret)
+			return ret;
+	}
+
+	if (sw842_template_counts)
+		atomic_inc(&template_short_data_count);
+
+	return 0;
+}
+
+static int add_zeros_template(struct sw842_param *p)
+{
+	int ret = add_bits(p, OP_ZEROS, OP_BITS);
+
+	if (ret)
+		return ret;
+
+	if (sw842_template_counts)
+		atomic_inc(&template_zeros_count);
+
+	return 0;
+}
+
+static int add_end_template(struct sw842_param *p)
+{
+	int ret = add_bits(p, OP_END, OP_BITS);
+
+	if (ret)
+		return ret;
+
+	if (sw842_template_counts)
+		atomic_inc(&template_end_count);
+
+	return 0;
+}
+
+static bool check_template(struct sw842_param *p, u8 c)
+{
+	u8 *t = comp_ops[c];
+	int i, match, b = 0;
+
+	if (c >= OPS_MAX)
+		return false;
+
+	for (i = 0; i < 4; i++) {
+		if (t[i] & OP_ACTION_INDEX) {
+			if (t[i] & OP_AMOUNT_2)
+				match = check_index(p, 2, b >> 1);
+			else if (t[i] & OP_AMOUNT_4)
+				match = check_index(p, 4, b >> 2);
+			else if (t[i] & OP_AMOUNT_8)
+				match = check_index(p, 8, 0);
+			else
+				return false;
+			if (!match)
+				return false;
+		}
+
+		b += t[i] & OP_AMOUNT;
+	}
+
+	return true;
+}
+
+static void get_next_data(struct sw842_param *p)
+{
+	p->data8[0] = get_input_data(p, 0, 64);
+	p->data4[0] = get_input_data(p, 0, 32);
+	p->data4[1] = get_input_data(p, 4, 32);
+	p->data2[0] = get_input_data(p, 0, 16);
+	p->data2[1] = get_input_data(p, 2, 16);
+	p->data2[2] = get_input_data(p, 4, 16);
+	p->data2[3] = get_input_data(p, 6, 16);
+}
+
+/* update the hashtable entries.
+ * only call this after finding/adding the current template
+ * the dataN fields for the current 8 byte block must be already updated
+ */
+static void update_hashtables(struct sw842_param *p)
+{
+	u64 pos = p->in - p->instart;
+	u64 n8 = (pos >> 3) % (1 << I8_BITS);
+	u64 n4 = (pos >> 2) % (1 << I4_BITS);
+	u64 n2 = (pos >> 1) % (1 << I2_BITS);
+
+	replace_hash(p, 8, n8, 0);
+	replace_hash(p, 4, n4, 0);
+	replace_hash(p, 4, n4, 1);
+	replace_hash(p, 2, n2, 0);
+	replace_hash(p, 2, n2, 1);
+	replace_hash(p, 2, n2, 2);
+	replace_hash(p, 2, n2, 3);
+}
+
+/* find the next template to use, and add it
+ * the p->dataN fields must already be set for the current 8 byte block
+ */
+static int process_next(struct sw842_param *p)
+{
+	int ret, i;
+
+	p->index8[0] = INDEX_NOT_CHECKED;
+	p->index4[0] = INDEX_NOT_CHECKED;
+	p->index4[1] = INDEX_NOT_CHECKED;
+	p->index2[0] = INDEX_NOT_CHECKED;
+	p->index2[1] = INDEX_NOT_CHECKED;
+	p->index2[2] = INDEX_NOT_CHECKED;
+	p->index2[3] = INDEX_NOT_CHECKED;
+
+	/* check up to OPS_MAX - 1; last op is our fallback */
+	for (i = 0; i < OPS_MAX - 1; i++) {
+		if (check_template(p, i))
+			break;
+	}
+
+	ret = add_template(p, i);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/**
+ * sw842_compress
+ *
+ * Compress the uncompressed buffer of length @ilen at @in to the output buffer
+ * @out, using no more than @olen bytes, using the 842 compression format.
+ *
+ * Returns: 0 on success, error on failure.  The @olen parameter
+ * will contain the number of output bytes written on success, or
+ * 0 on error.
+ */
+int sw842_compress(const u8 *in, unsigned int ilen,
+		   u8 *out, unsigned int *olen, void *wmem)
+{
+	struct sw842_param *p = (struct sw842_param *)wmem;
+	int ret;
+	u64 last, next, pad, total;
+	u8 repeat_count = 0;
+
+	BUILD_BUG_ON(sizeof(*p) > SW842_MEM_COMPRESS);
+
+	init_hashtable_nodes(p, 8);
+	init_hashtable_nodes(p, 4);
+	init_hashtable_nodes(p, 2);
+
+	p->in = (u8 *)in;
+	p->instart = p->in;
+	p->ilen = ilen;
+	p->out = out;
+	p->olen = *olen;
+	p->bit = 0;
+
+	total = p->olen;
+
+	*olen = 0;
+
+	/* if using strict mode, we can only compress a multiple of 8 */
+	if (sw842_strict && (ilen % 8)) {
+		pr_err("Using strict mode, can't compress len %d\n", ilen);
+		return -EINVAL;
+	}
+
+	/* let's compress at least 8 bytes, mkay? */
+	if (unlikely(ilen < 8))
+		goto skip_comp;
+
+	/* make initial 'last' different so we don't match the first time */
+	last = ~get_unaligned((u64 *)p->in);
+
+	while (p->ilen > 7) {
+		next = get_unaligned((u64 *)p->in);
+
+		/* must get the next data, as we need to update the hashtable
+		 * entries with the new data every time
+		 */
+		get_next_data(p);
+
+		/* we don't care about endianness in last or next;
+		 * we're just comparing 8 bytes to another 8 bytes,
+		 * they're both the same endianness
+		 */
+		if (next == last) {
+			/* repeat count bits are 0-based, so we stop at +1 */
+			if (++repeat_count <= REPEAT_BITS_MAX)
+				goto repeat;
+		}
+		if (repeat_count) {
+			ret = add_repeat_template(p, repeat_count);
+			repeat_count = 0;
+			if (next == last) /* reached max repeat bits */
+				goto repeat;
+		}
+
+		if (next == 0)
+			ret = add_zeros_template(p);
+		else
+			ret = process_next(p);
+
+		if (ret)
+			return ret;
+
+repeat:
+		last = next;
+		update_hashtables(p);
+		p->in += 8;
+		p->ilen -= 8;
+	}
+
+	if (repeat_count) {
+		ret = add_repeat_template(p, repeat_count);
+		if (ret)
+			return ret;
+	}
+
+skip_comp:
+	if (p->ilen > 0) {
+		ret = add_short_data_template(p, p->ilen);
+		if (ret)
+			return ret;
+
+		p->in += p->ilen;
+		p->ilen = 0;
+	}
+
+	ret = add_end_template(p);
+	if (ret)
+		return ret;
+
+	if (p->bit) {
+		p->out++;
+		p->olen--;
+		p->bit = 0;
+	}
+
+	/* pad compressed length to multiple of 8 */
+	pad = (8 - ((total - p->olen) % 8)) % 8;
+	if (pad) {
+		if (pad > p->olen) /* we were so close! */
+			return -ENOSPC;
+		memset(p->out, 0, pad);
+		p->out += pad;
+		p->olen -= pad;
+	}
+
+	if (unlikely((total - p->olen) > UINT_MAX))
+		return -ENOSPC;
+
+	*olen = total - p->olen;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sw842_compress);
+
+static int __init sw842_init(void)
+{
+	if (sw842_template_counts)
+		sw842_debugfs_create();
+
+	return 0;
+}
+module_init(sw842_init);
+
+static void __exit sw842_exit(void)
+{
+	if (sw842_template_counts)
+		sw842_debugfs_remove();
+}
+module_exit(sw842_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Software 842 Compressor");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
diff --git a/lib/842/842_debugfs.h b/lib/842/842_debugfs.h
new file mode 100644
index 000000000000..e7f3bffaf255
--- /dev/null
+++ b/lib/842/842_debugfs.h
@@ -0,0 +1,52 @@
+
+#ifndef __842_DEBUGFS_H__
+#define __842_DEBUGFS_H__
+
+#include <linux/debugfs.h>
+
+static bool sw842_template_counts;
+module_param_named(template_counts, sw842_template_counts, bool, 0444);
+
+static atomic_t template_count[OPS_MAX], template_repeat_count,
+	template_zeros_count, template_short_data_count, template_end_count;
+
+static struct dentry *sw842_debugfs_root;
+
+static int __init sw842_debugfs_create(void)
+{
+	umode_t m = S_IRUGO | S_IWUSR;
+	int i;
+
+	if (!debugfs_initialized())
+		return -ENODEV;
+
+	sw842_debugfs_root = debugfs_create_dir(MODULE_NAME, NULL);
+	if (IS_ERR(sw842_debugfs_root))
+		return PTR_ERR(sw842_debugfs_root);
+
+	for (i = 0; i < ARRAY_SIZE(template_count); i++) {
+		char name[32];
+
+		snprintf(name, 32, "template_%02x", i);
+		debugfs_create_atomic_t(name, m, sw842_debugfs_root,
+					&template_count[i]);
+	}
+	debugfs_create_atomic_t("template_repeat", m, sw842_debugfs_root,
+				&template_repeat_count);
+	debugfs_create_atomic_t("template_zeros", m, sw842_debugfs_root,
+				&template_zeros_count);
+	debugfs_create_atomic_t("template_short_data", m, sw842_debugfs_root,
+				&template_short_data_count);
+	debugfs_create_atomic_t("template_end", m, sw842_debugfs_root,
+				&template_end_count);
+
+	return 0;
+}
+
+static void __exit sw842_debugfs_remove(void)
+{
+	if (sw842_debugfs_root && !IS_ERR(sw842_debugfs_root))
+		debugfs_remove_recursive(sw842_debugfs_root);
+}
+
+#endif
diff --git a/lib/842/842_decompress.c b/lib/842/842_decompress.c
new file mode 100644
index 000000000000..6b2b45aecde3
--- /dev/null
+++ b/lib/842/842_decompress.c
@@ -0,0 +1,405 @@
+/*
+ * 842 Software Decompression
+ *
+ * Copyright (C) 2015 Dan Streetman, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See 842.h for details of the 842 compressed format.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define MODULE_NAME "842_decompress"
+
+#include "842.h"
+#include "842_debugfs.h"
+
+/* rolling fifo sizes */
+#define I2_FIFO_SIZE	(2 * (1 << I2_BITS))
+#define I4_FIFO_SIZE	(4 * (1 << I4_BITS))
+#define I8_FIFO_SIZE	(8 * (1 << I8_BITS))
+
+static u8 decomp_ops[OPS_MAX][4] = {
+	{ D8, N0, N0, N0 },
+	{ D4, D2, I2, N0 },
+	{ D4, I2, D2, N0 },
+	{ D4, I2, I2, N0 },
+	{ D4, I4, N0, N0 },
+	{ D2, I2, D4, N0 },
+	{ D2, I2, D2, I2 },
+	{ D2, I2, I2, D2 },
+	{ D2, I2, I2, I2 },
+	{ D2, I2, I4, N0 },
+	{ I2, D2, D4, N0 },
+	{ I2, D4, I2, N0 },
+	{ I2, D2, I2, D2 },
+	{ I2, D2, I2, I2 },
+	{ I2, D2, I4, N0 },
+	{ I2, I2, D4, N0 },
+	{ I2, I2, D2, I2 },
+	{ I2, I2, I2, D2 },
+	{ I2, I2, I2, I2 },
+	{ I2, I2, I4, N0 },
+	{ I4, D4, N0, N0 },
+	{ I4, D2, I2, N0 },
+	{ I4, I2, D2, N0 },
+	{ I4, I2, I2, N0 },
+	{ I4, I4, N0, N0 },
+	{ I8, N0, N0, N0 }
+};
+
+struct sw842_param {
+	u8 *in;
+	u8 bit;
+	u64 ilen;
+	u8 *out;
+	u8 *ostart;
+	u64 olen;
+};
+
+#define beN_to_cpu(d, s)					\
+	((s) == 2 ? be16_to_cpu(get_unaligned((__be16 *)d)) :	\
+	 (s) == 4 ? be32_to_cpu(get_unaligned((__be32 *)d)) :	\
+	 (s) == 8 ? be64_to_cpu(get_unaligned((__be64 *)d)) :	\
+	 WARN(1, "pr_debug param err invalid size %x\n", s))
+
+static int next_bits(struct sw842_param *p, u64 *d, u8 n);
+
+static int __split_next_bits(struct sw842_param *p, u64 *d, u8 n, u8 s)
+{
+	u64 tmp = 0;
+	int ret;
+
+	if (n <= s) {
+		pr_debug("split_next_bits invalid n %u s %u\n", n, s);
+		return -EINVAL;
+	}
+
+	ret = next_bits(p, &tmp, n - s);
+	if (ret)
+		return ret;
+	ret = next_bits(p, d, s);
+	if (ret)
+		return ret;
+	*d |= tmp << s;
+	return 0;
+}
+
+static int next_bits(struct sw842_param *p, u64 *d, u8 n)
+{
+	u8 *in = p->in, b = p->bit, bits = b + n;
+
+	if (n > 64) {
+		pr_debug("next_bits invalid n %u\n", n);
+		return -EINVAL;
+	}
+
+	/* split this up if reading > 8 bytes, or if we're at the end of
+	 * the input buffer and would read past the end
+	 */
+	if (bits > 64)
+		return __split_next_bits(p, d, n, 32);
+	else if (p->ilen < 8 && bits > 32 && bits <= 56)
+		return __split_next_bits(p, d, n, 16);
+	else if (p->ilen < 4 && bits > 16 && bits <= 24)
+		return __split_next_bits(p, d, n, 8);
+
+	if (DIV_ROUND_UP(bits, 8) > p->ilen)
+		return -EOVERFLOW;
+
+	if (bits <= 8)
+		*d = *in >> (8 - bits);
+	else if (bits <= 16)
+		*d = be16_to_cpu(get_unaligned((__be16 *)in)) >> (16 - bits);
+	else if (bits <= 32)
+		*d = be32_to_cpu(get_unaligned((__be32 *)in)) >> (32 - bits);
+	else
+		*d = be64_to_cpu(get_unaligned((__be64 *)in)) >> (64 - bits);
+
+	*d &= GENMASK_ULL(n - 1, 0);
+
+	p->bit += n;
+
+	if (p->bit > 7) {
+		p->in += p->bit / 8;
+		p->ilen -= p->bit / 8;
+		p->bit %= 8;
+	}
+
+	return 0;
+}
+
+static int do_data(struct sw842_param *p, u8 n)
+{
+	u64 v;
+	int ret;
+
+	if (n > p->olen)
+		return -ENOSPC;
+
+	ret = next_bits(p, &v, n * 8);
+	if (ret)
+		return ret;
+
+	switch (n) {
+	case 2:
+		put_unaligned(cpu_to_be16((u16)v), (__be16 *)p->out);
+		break;
+	case 4:
+		put_unaligned(cpu_to_be32((u32)v), (__be32 *)p->out);
+		break;
+	case 8:
+		put_unaligned(cpu_to_be64((u64)v), (__be64 *)p->out);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	p->out += n;
+	p->olen -= n;
+
+	return 0;
+}
+
+static int __do_index(struct sw842_param *p, u8 size, u8 bits, u64 fsize)
+{
+	u64 index, offset, total = round_down(p->out - p->ostart, 8);
+	int ret;
+
+	ret = next_bits(p, &index, bits);
+	if (ret)
+		return ret;
+
+	offset = index * size;
+
+	/* a ring buffer of fsize is used; correct the offset */
+	if (total > fsize) {
+		/* this is where the current fifo is */
+		u64 section = round_down(total, fsize);
+		/* the current pos in the fifo */
+		u64 pos = total % fsize;
+
+		/* if the offset is past/at the pos, we need to
+		 * go back to the last fifo section
+		 */
+		if (offset >= pos)
+			section -= fsize;
+
+		offset += section;
+	}
+
+	if (offset + size > total) {
+		pr_debug("index%x %lx points past end %lx\n", size,
+			 (unsigned long)offset, (unsigned long)total);
+		return -EINVAL;
+	}
+
+	pr_debug("index%x to %lx off %lx adjoff %lx tot %lx data %lx\n",
+		 size, (unsigned long)index, (unsigned long)(index * size),
+		 (unsigned long)offset, (unsigned long)total,
+		 (unsigned long)beN_to_cpu(&p->ostart[offset], size));
+
+	memcpy(p->out, &p->ostart[offset], size);
+	p->out += size;
+	p->olen -= size;
+
+	return 0;
+}
+
+int do_index(struct sw842_param *p, u8 n)
+{
+	switch (n) {
+	case 2:
+		return __do_index(p, 2, I2_BITS, I2_FIFO_SIZE);
+	case 4:
+		return __do_index(p, 4, I4_BITS, I4_FIFO_SIZE);
+	case 8:
+		return __do_index(p, 8, I8_BITS, I8_FIFO_SIZE);
+	default:
+		return -EINVAL;
+	}
+}
+
+int do_op(struct sw842_param *p, u8 o)
+{
+	int i, ret = 0;
+
+	if (o >= OPS_MAX)
+		return -EINVAL;
+
+	for (i = 0; i < 4; i++) {
+		u8 op = decomp_ops[o][i];
+
+		pr_debug("op is %x\n", op);
+
+		switch (op & OP_ACTION) {
+		case OP_ACTION_DATA:
+			ret = do_data(p, op & OP_AMOUNT);
+			break;
+		case OP_ACTION_INDEX:
+			ret = do_index(p, op & OP_AMOUNT);
+			break;
+		case OP_ACTION_NOOP:
+			break;
+		default:
+			pr_err("Interal error, invalid op %x\n", op);
+			return -EINVAL;
+		}
+
+		if (ret)
+			return ret;
+	}
+
+	if (sw842_template_counts)
+		atomic_inc(&template_count[o]);
+
+	return 0;
+}
+
+/**
+ * sw842_decompress
+ *
+ * Decompress the 842-compressed buffer of length @ilen at @in
+ * to the output buffer @out, using no more than @olen bytes.
+ *
+ * The compressed buffer must be only a single 842-compressed buffer,
+ * with the standard format described in the comments in 842.h
+ * Processing will stop when the 842 "END" template is detected,
+ * not the end of the buffer.
+ *
+ * Returns: 0 on success, error on failure.  The @olen parameter
+ * will contain the number of output bytes written on success, or
+ * 0 on error.
+ */
+int sw842_decompress(const u8 *in, unsigned int ilen,
+		     u8 *out, unsigned int *olen)
+{
+	struct sw842_param p;
+	int ret;
+	u64 op, rep, tmp, bytes, total;
+
+	p.in = (u8 *)in;
+	p.bit = 0;
+	p.ilen = ilen;
+	p.out = out;
+	p.ostart = out;
+	p.olen = *olen;
+
+	total = p.olen;
+
+	*olen = 0;
+
+	do {
+		ret = next_bits(&p, &op, OP_BITS);
+		if (ret)
+			return ret;
+
+		pr_debug("template is %lx\n", (unsigned long)op);
+
+		switch (op) {
+		case OP_REPEAT:
+			ret = next_bits(&p, &rep, REPEAT_BITS);
+			if (ret)
+				return ret;
+
+			if (p.out == out) /* no previous bytes */
+				return -EINVAL;
+
+			/* copy rep + 1 */
+			rep++;
+
+			if (rep * 8 > p.olen)
+				return -ENOSPC;
+
+			while (rep-- > 0) {
+				memcpy(p.out, p.out - 8, 8);
+				p.out += 8;
+				p.olen -= 8;
+			}
+
+			if (sw842_template_counts)
+				atomic_inc(&template_repeat_count);
+
+			break;
+		case OP_ZEROS:
+			if (8 > p.olen)
+				return -ENOSPC;
+
+			memset(p.out, 0, 8);
+			p.out += 8;
+			p.olen -= 8;
+
+			if (sw842_template_counts)
+				atomic_inc(&template_zeros_count);
+
+			break;
+		case OP_SHORT_DATA:
+			ret = next_bits(&p, &bytes, SHORT_DATA_BITS);
+			if (ret)
+				return ret;
+
+			if (!bytes || bytes > SHORT_DATA_BITS_MAX)
+				return -EINVAL;
+
+			while (bytes-- > 0) {
+				ret = next_bits(&p, &tmp, 8);
+				if (ret)
+					return ret;
+				*p.out = (u8)tmp;
+				p.out++;
+				p.olen--;
+			}
+
+			if (sw842_template_counts)
+				atomic_inc(&template_short_data_count);
+
+			break;
+		case OP_END:
+			if (sw842_template_counts)
+				atomic_inc(&template_end_count);
+
+			break;
+		default: /* use template */
+			ret = do_op(&p, op);
+			if (ret)
+				return ret;
+			break;
+		}
+	} while (op != OP_END);
+
+	if (unlikely((total - p.olen) > UINT_MAX))
+		return -ENOSPC;
+
+	*olen = total - p.olen;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sw842_decompress);
+
+static int __init sw842_init(void)
+{
+	if (sw842_template_counts)
+		sw842_debugfs_create();
+
+	return 0;
+}
+module_init(sw842_init);
+
+static void __exit sw842_exit(void)
+{
+	if (sw842_template_counts)
+		sw842_debugfs_remove();
+}
+module_exit(sw842_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Software 842 Decompressor");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
diff --git a/lib/842/Makefile b/lib/842/Makefile
new file mode 100644
index 000000000000..5d24c0baff2e
--- /dev/null
+++ b/lib/842/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_842_COMPRESS) += 842_compress.o
+obj-$(CONFIG_842_DECOMPRESS) += 842_decompress.o
diff --git a/lib/Kconfig b/lib/Kconfig
index 87da53bb1fef..42c925e9caea 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -213,6 +213,12 @@ config RANDOM32_SELFTEST
 #
 # compression support is select'ed if needed
 #
+config 842_COMPRESS
+	tristate
+
+config 842_DECOMPRESS
+	tristate
+
 config ZLIB_INFLATE
 	tristate
 
diff --git a/lib/Makefile b/lib/Makefile
index 58f74d2dd396..a1d67e408e12 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -78,6 +78,8 @@ obj-$(CONFIG_LIBCRC32C)	+= libcrc32c.o
 obj-$(CONFIG_CRC8)	+= crc8.o
 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
 
+obj-$(CONFIG_842_COMPRESS) += 842/
+obj-$(CONFIG_842_DECOMPRESS) += 842/
 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
-- 
cgit v1.2.3


From 7011a122383e36dab594406720fa1d089e0be8f9 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Thu, 7 May 2015 13:49:17 -0400
Subject: crypto: nx - add NX-842 platform frontend driver

Add NX-842 frontend that allows using either the pSeries platform or
PowerNV platform driver (to be added by later patch) for the NX-842
hardware.  Update the MAINTAINERS file to include the new filenames.
Update Kconfig files to clarify titles and descriptions, and correct
dependencies.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 MAINTAINERS                        |   2 +-
 drivers/crypto/Kconfig             |  10 +--
 drivers/crypto/nx/Kconfig          |  35 ++++++---
 drivers/crypto/nx/Makefile         |   4 +-
 drivers/crypto/nx/nx-842-pseries.c |  57 +++++++--------
 drivers/crypto/nx/nx-842.c         | 144 +++++++++++++++++++++++++++++++++++++
 drivers/crypto/nx/nx-842.h         |  32 +++++++++
 include/linux/nx842.h              |  10 +--
 8 files changed, 245 insertions(+), 49 deletions(-)
 create mode 100644 drivers/crypto/nx/nx-842.c
 create mode 100644 drivers/crypto/nx/nx-842.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index a13b4b10ff46..912c9d9f741c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4861,7 +4861,7 @@ F:	drivers/crypto/nx/
 IBM Power 842 compression accelerator
 M:	Dan Streetman <ddstreet@us.ibm.com>
 S:	Supported
-F:	drivers/crypto/nx/nx-842.c
+F:	drivers/crypto/nx/nx-842*
 F:	include/linux/nx842.h
 F:	include/linux/sw842.h
 F:	crypto/842.c
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 8a76a012be61..0889e496d886 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -330,11 +330,13 @@ config CRYPTO_DEV_S5P
 	  algorithms execution.
 
 config CRYPTO_DEV_NX
-	bool "Support for IBM Power7+ in-Nest cryptographic acceleration"
-	depends on PPC64 && IBMVIO && !CPU_LITTLE_ENDIAN
-	default n
+	bool "Support for IBM PowerPC Nest (NX) cryptographic acceleration"
+	depends on PPC64
 	help
-	  Support for Power7+ in-Nest cryptographic acceleration.
+	  This enables support for the NX hardware cryptographic accelerator
+	  coprocessor that is in IBM PowerPC P7+ or later processors.  This
+	  does not actually enable any drivers, it only allows you to select
+	  which acceleration type (encryption and/or compression) to enable.
 
 if CRYPTO_DEV_NX
 	source "drivers/crypto/nx/Kconfig"
diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
index f82616621ae1..34013f7d46c8 100644
--- a/drivers/crypto/nx/Kconfig
+++ b/drivers/crypto/nx/Kconfig
@@ -1,7 +1,9 @@
+
 config CRYPTO_DEV_NX_ENCRYPT
-	tristate "Encryption acceleration support"
-	depends on PPC64 && IBMVIO
+	tristate "Encryption acceleration support on pSeries platform"
+	depends on PPC_PSERIES && IBMVIO && !CPU_LITTLE_ENDIAN
 	default y
+	select CRYPTO_ALGAPI
 	select CRYPTO_AES
 	select CRYPTO_CBC
 	select CRYPTO_ECB
@@ -12,15 +14,30 @@ config CRYPTO_DEV_NX_ENCRYPT
 	select CRYPTO_SHA256
 	select CRYPTO_SHA512
 	help
-	  Support for Power7+ in-Nest encryption acceleration. This
-	  module supports acceleration for AES and SHA2 algorithms. If you
-	  choose 'M' here, this module will be called nx_crypto.
+	  Support for PowerPC Nest (NX) encryption acceleration. This
+	  module supports acceleration for AES and SHA2 algorithms on
+	  the pSeries platform.  If you choose 'M' here, this module
+	  will be called nx_crypto.
 
 config CRYPTO_DEV_NX_COMPRESS
 	tristate "Compression acceleration support"
-	depends on PPC64 && IBMVIO
 	default y
 	help
-	  Support for Power7+ in-Nest compression acceleration. This
-	  module supports acceleration for AES and SHA2 algorithms. If you
-	  choose 'M' here, this module will be called nx_compress.
+	  Support for PowerPC Nest (NX) compression acceleration. This
+	  module supports acceleration for compressing memory with the 842
+	  algorithm.  One of the platform drivers must be selected also.
+	  If you choose 'M' here, this module will be called nx_compress.
+
+if CRYPTO_DEV_NX_COMPRESS
+
+config CRYPTO_DEV_NX_COMPRESS_PSERIES
+	tristate "Compression acceleration support on pSeries platform"
+	depends on PPC_PSERIES && IBMVIO && !CPU_LITTLE_ENDIAN
+	default y
+	help
+	  Support for PowerPC Nest (NX) compression acceleration. This
+	  module supports acceleration for compressing memory with the 842
+	  algorithm.  This supports NX hardware on the pSeries platform.
+	  If you choose 'M' here, this module will be called nx_compress_pseries.
+
+endif
diff --git a/drivers/crypto/nx/Makefile b/drivers/crypto/nx/Makefile
index 8669ffa8dc5c..5d9f4bc15209 100644
--- a/drivers/crypto/nx/Makefile
+++ b/drivers/crypto/nx/Makefile
@@ -11,4 +11,6 @@ nx-crypto-objs := nx.o \
 		  nx-sha512.o
 
 obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS) += nx-compress.o
-nx-compress-objs := nx-842-pseries.o
+obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS_PSERIES) += nx-compress-pseries.o
+nx-compress-objs := nx-842.o
+nx-compress-pseries-objs := nx-842-pseries.o
diff --git a/drivers/crypto/nx/nx-842-pseries.c b/drivers/crypto/nx/nx-842-pseries.c
index 887196e9b50c..9b83c9e7fd73 100644
--- a/drivers/crypto/nx/nx-842-pseries.c
+++ b/drivers/crypto/nx/nx-842-pseries.c
@@ -21,18 +21,13 @@
  *          Seth Jennings <sjenning@linux.vnet.ibm.com>
  */
 
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/nx842.h>
-#include <linux/of.h>
-#include <linux/slab.h>
-
 #include <asm/page.h>
 #include <asm/vio.h>
 
+#include "nx-842.h"
 #include "nx_csbcpb.h" /* struct nx_csbcpb */
 
-#define MODULE_NAME "nx-compress"
+#define MODULE_NAME NX842_PSERIES_MODULE_NAME
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Robert Jennings <rcj@linux.vnet.ibm.com>");
 MODULE_DESCRIPTION("842 H/W Compression driver for IBM Power processors");
@@ -236,18 +231,6 @@ struct nx842_workmem {
 	};
 };
 
-int nx842_get_workmem_size(void)
-{
-	return sizeof(struct nx842_workmem) + NX842_HW_PAGE_SIZE;
-}
-EXPORT_SYMBOL_GPL(nx842_get_workmem_size);
-
-int nx842_get_workmem_size_aligned(void)
-{
-	return sizeof(struct nx842_workmem);
-}
-EXPORT_SYMBOL_GPL(nx842_get_workmem_size_aligned);
-
 static int nx842_validate_result(struct device *dev,
 	struct cop_status_block *csb)
 {
@@ -300,7 +283,7 @@ static int nx842_validate_result(struct device *dev,
 }
 
 /**
- * nx842_compress - Compress data using the 842 algorithm
+ * nx842_pseries_compress - Compress data using the 842 algorithm
  *
  * Compression provide by the NX842 coprocessor on IBM Power systems.
  * The input buffer is compressed and the result is stored in the
@@ -315,7 +298,7 @@ static int nx842_validate_result(struct device *dev,
  * @out: Pointer to output buffer
  * @outlen: Length of output buffer
  * @wrkmem: ptr to buffer for working memory, size determined by
- *          nx842_get_workmem_size()
+ *          NX842_MEM_COMPRESS
  *
  * Returns:
  *   0		Success, output of length @outlen stored in the buffer at @out
@@ -325,8 +308,9 @@ static int nx842_validate_result(struct device *dev,
  *   -EIO	Internal error
  *   -ENODEV	Hardware unavailable
  */
-int nx842_compress(const unsigned char *in, unsigned int inlen,
-		       unsigned char *out, unsigned int *outlen, void *wmem)
+static int nx842_pseries_compress(const unsigned char *in, unsigned int inlen,
+				  unsigned char *out, unsigned int *outlen,
+				  void *wmem)
 {
 	struct nx842_header *hdr;
 	struct nx842_devdata *local_devdata;
@@ -493,13 +477,12 @@ unlock:
 	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nx842_compress);
 
 static int sw842_decompress(const unsigned char *, int, unsigned char *, int *,
 			const void *);
 
 /**
- * nx842_decompress - Decompress data using the 842 algorithm
+ * nx842_pseries_decompress - Decompress data using the 842 algorithm
  *
  * Decompression provide by the NX842 coprocessor on IBM Power systems.
  * The input buffer is decompressed and the result is stored in the
@@ -515,7 +498,7 @@ static int sw842_decompress(const unsigned char *, int, unsigned char *, int *,
  * @out: Pointer to output buffer, must be page aligned
  * @outlen: Length of output buffer, must be PAGE_SIZE
  * @wrkmem: ptr to buffer for working memory, size determined by
- *          nx842_get_workmem_size()
+ *          NX842_MEM_COMPRESS
  *
  * Returns:
  *   0		Success, output of length @outlen stored in the buffer at @out
@@ -525,8 +508,9 @@ static int sw842_decompress(const unsigned char *, int, unsigned char *, int *,
  *   -EINVAL	Bad input data encountered when attempting decompress
  *   -EIO	Internal error
  */
-int nx842_decompress(const unsigned char *in, unsigned int inlen,
-			 unsigned char *out, unsigned int *outlen, void *wmem)
+static int nx842_pseries_decompress(const unsigned char *in, unsigned int inlen,
+				    unsigned char *out, unsigned int *outlen,
+				    void *wmem)
 {
 	struct nx842_header *hdr;
 	struct nx842_devdata *local_devdata;
@@ -694,7 +678,6 @@ unlock:
 	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nx842_decompress);
 
 /**
  * nx842_OF_set_defaults -- Set default (disabled) values for devdata
@@ -1130,6 +1113,12 @@ static struct attribute_group nx842_attribute_group = {
 	.attrs = nx842_sysfs_entries,
 };
 
+static struct nx842_driver nx842_pseries_driver = {
+	.owner =	THIS_MODULE,
+	.compress =	nx842_pseries_compress,
+	.decompress =	nx842_pseries_decompress,
+};
+
 static int __init nx842_probe(struct vio_dev *viodev,
 				  const struct vio_device_id *id)
 {
@@ -1192,6 +1181,8 @@ static int __init nx842_probe(struct vio_dev *viodev,
 		goto error;
 	}
 
+	nx842_register_driver(&nx842_pseries_driver);
+
 	return 0;
 
 error_unlock:
@@ -1222,11 +1213,14 @@ static int __exit nx842_remove(struct vio_dev *viodev)
 	if (old_devdata)
 		kfree(old_devdata->counters);
 	kfree(old_devdata);
+
+	nx842_unregister_driver(&nx842_pseries_driver);
+
 	return 0;
 }
 
 static struct vio_device_id nx842_driver_ids[] = {
-	{"ibm,compression-v1", "ibm,compression"},
+	{NX842_PSERIES_COMPAT_NAME "-v1", NX842_PSERIES_COMPAT_NAME},
 	{"", ""},
 };
 
@@ -1243,6 +1237,8 @@ static int __init nx842_init(void)
 	struct nx842_devdata *new_devdata;
 	pr_info("Registering IBM Power 842 compression driver\n");
 
+	BUILD_BUG_ON(sizeof(struct nx842_workmem) > NX842_MEM_COMPRESS);
+
 	RCU_INIT_POINTER(devdata, NULL);
 	new_devdata = kzalloc(sizeof(*new_devdata), GFP_KERNEL);
 	if (!new_devdata) {
@@ -1272,6 +1268,7 @@ static void __exit nx842_exit(void)
 	if (old_devdata)
 		dev_set_drvdata(old_devdata->dev, NULL);
 	kfree(old_devdata);
+	nx842_unregister_driver(&nx842_pseries_driver);
 	vio_unregister_driver(&nx842_driver);
 }
 
diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
new file mode 100644
index 000000000000..f1f378eef373
--- /dev/null
+++ b/drivers/crypto/nx/nx-842.c
@@ -0,0 +1,144 @@
+/*
+ * Driver frontend for IBM Power 842 compression accelerator
+ *
+ * Copyright (C) 2015 Dan Streetman, IBM Corp
+ *
+ * Designer of the Power data compression engine:
+ *   Bulent Abali <abali@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "nx-842.h"
+
+#define MODULE_NAME "nx-compress"
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
+MODULE_DESCRIPTION("842 H/W Compression driver for IBM Power processors");
+
+/* Only one driver is expected, based on the HW platform */
+static struct nx842_driver *nx842_driver;
+static DEFINE_SPINLOCK(nx842_driver_lock); /* protects driver pointers */
+
+void nx842_register_driver(struct nx842_driver *driver)
+{
+	spin_lock(&nx842_driver_lock);
+
+	if (nx842_driver) {
+		pr_err("can't register driver %s, already using driver %s\n",
+		       driver->owner->name, nx842_driver->owner->name);
+	} else {
+		pr_info("registering driver %s\n", driver->owner->name);
+		nx842_driver = driver;
+	}
+
+	spin_unlock(&nx842_driver_lock);
+}
+EXPORT_SYMBOL_GPL(nx842_register_driver);
+
+void nx842_unregister_driver(struct nx842_driver *driver)
+{
+	spin_lock(&nx842_driver_lock);
+
+	if (nx842_driver == driver) {
+		pr_info("unregistering driver %s\n", driver->owner->name);
+		nx842_driver = NULL;
+	} else if (nx842_driver) {
+		pr_err("can't unregister driver %s, using driver %s\n",
+		       driver->owner->name, nx842_driver->owner->name);
+	} else {
+		pr_err("can't unregister driver %s, no driver in use\n",
+		       driver->owner->name);
+	}
+
+	spin_unlock(&nx842_driver_lock);
+}
+EXPORT_SYMBOL_GPL(nx842_unregister_driver);
+
+static struct nx842_driver *get_driver(void)
+{
+	struct nx842_driver *driver = NULL;
+
+	spin_lock(&nx842_driver_lock);
+
+	driver = nx842_driver;
+
+	if (driver && !try_module_get(driver->owner))
+		driver = NULL;
+
+	spin_unlock(&nx842_driver_lock);
+
+	return driver;
+}
+
+static void put_driver(struct nx842_driver *driver)
+{
+	module_put(driver->owner);
+}
+
+int nx842_compress(const unsigned char *in, unsigned int in_len,
+		   unsigned char *out, unsigned int *out_len,
+		   void *wrkmem)
+{
+	struct nx842_driver *driver = get_driver();
+	int ret;
+
+	if (!driver)
+		return -ENODEV;
+
+	ret = driver->compress(in, in_len, out, out_len, wrkmem);
+
+	put_driver(driver);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nx842_compress);
+
+int nx842_decompress(const unsigned char *in, unsigned int in_len,
+		     unsigned char *out, unsigned int *out_len,
+		     void *wrkmem)
+{
+	struct nx842_driver *driver = get_driver();
+	int ret;
+
+	if (!driver)
+		return -ENODEV;
+
+	ret = driver->decompress(in, in_len, out, out_len, wrkmem);
+
+	put_driver(driver);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nx842_decompress);
+
+static __init int nx842_init(void)
+{
+	pr_info("loading\n");
+
+	if (of_find_compatible_node(NULL, NULL, NX842_PSERIES_COMPAT_NAME))
+		request_module_nowait(NX842_PSERIES_MODULE_NAME);
+	else
+		pr_err("no nx842 driver found.\n");
+
+	pr_info("loaded\n");
+
+	return 0;
+}
+module_init(nx842_init);
+
+static void __exit nx842_exit(void)
+{
+	pr_info("NX842 unloaded\n");
+}
+module_exit(nx842_exit);
diff --git a/drivers/crypto/nx/nx-842.h b/drivers/crypto/nx/nx-842.h
new file mode 100644
index 000000000000..2a5d4e197c72
--- /dev/null
+++ b/drivers/crypto/nx/nx-842.h
@@ -0,0 +1,32 @@
+
+#ifndef __NX_842_H__
+#define __NX_842_H__
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/nx842.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+
+struct nx842_driver {
+	struct module *owner;
+
+	int (*compress)(const unsigned char *in, unsigned int in_len,
+			unsigned char *out, unsigned int *out_len,
+			void *wrkmem);
+	int (*decompress)(const unsigned char *in, unsigned int in_len,
+			  unsigned char *out, unsigned int *out_len,
+			  void *wrkmem);
+};
+
+void nx842_register_driver(struct nx842_driver *driver);
+void nx842_unregister_driver(struct nx842_driver *driver);
+
+
+/* To allow the main nx-compress module to load platform module */
+#define NX842_PSERIES_MODULE_NAME	"nx-compress-pseries"
+#define NX842_PSERIES_COMPAT_NAME	"ibm,compression"
+
+
+#endif /* __NX_842_H__ */
diff --git a/include/linux/nx842.h b/include/linux/nx842.h
index a4d324c6406a..d919c22b7fd6 100644
--- a/include/linux/nx842.h
+++ b/include/linux/nx842.h
@@ -1,11 +1,13 @@
 #ifndef __NX842_H__
 #define __NX842_H__
 
-int nx842_get_workmem_size(void);
-int nx842_get_workmem_size_aligned(void);
+#define __NX842_PSERIES_MEM_COMPRESS	((PAGE_SIZE * 2) + 10240)
+
+#define NX842_MEM_COMPRESS	__NX842_PSERIES_MEM_COMPRESS
+
 int nx842_compress(const unsigned char *in, unsigned int in_len,
-		unsigned char *out, unsigned int *out_len, void *wrkmem);
+		   unsigned char *out, unsigned int *out_len, void *wrkmem);
 int nx842_decompress(const unsigned char *in, unsigned int in_len,
-		unsigned char *out, unsigned int *out_len, void *wrkmem);
+		     unsigned char *out, unsigned int *out_len, void *wrkmem);
 
 #endif
-- 
cgit v1.2.3


From 959e6659b6f74ec1fa4d391a3b88d63dc0189f36 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Thu, 7 May 2015 13:49:18 -0400
Subject: crypto: nx - add nx842 constraints

Add "constraints" for the NX-842 driver.  The constraints are used to
indicate what the current NX-842 platform driver is capable of.  The
constraints tell the NX-842 user what alignment, min and max length, and
length multiple each provided buffers should conform to.  These are
required because the 842 hardware requires buffers to meet specific
constraints that vary based on platform - for example, the pSeries
max length is much lower than the PowerNV max length.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/nx/nx-842-pseries.c | 10 ++++++++++
 drivers/crypto/nx/nx-842.c         | 38 ++++++++++++++++++++++++++++++++++++++
 drivers/crypto/nx/nx-842.h         |  2 ++
 include/linux/nx842.h              |  9 +++++++++
 4 files changed, 59 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/crypto/nx/nx-842-pseries.c b/drivers/crypto/nx/nx-842-pseries.c
index 9b83c9e7fd73..cb481d81df06 100644
--- a/drivers/crypto/nx/nx-842-pseries.c
+++ b/drivers/crypto/nx/nx-842-pseries.c
@@ -40,6 +40,13 @@ MODULE_DESCRIPTION("842 H/W Compression driver for IBM Power processors");
 /* IO buffer must be 128 byte aligned */
 #define IO_BUFFER_ALIGN 128
 
+static struct nx842_constraints nx842_pseries_constraints = {
+	.alignment =	IO_BUFFER_ALIGN,
+	.multiple =	DDE_BUFFER_LAST_MULT,
+	.minimum =	IO_BUFFER_ALIGN,
+	.maximum =	PAGE_SIZE, /* dynamic, max_sync_size */
+};
+
 struct nx842_header {
 	int blocks_nr; /* number of compressed blocks */
 	int offset; /* offset of the first block (from beginning of header) */
@@ -842,6 +849,8 @@ static int nx842_OF_upd_maxsyncop(struct nx842_devdata *devdata,
 		goto out;
 	}
 
+	nx842_pseries_constraints.maximum = devdata->max_sync_size;
+
 	devdata->max_sync_sg = (unsigned int)min(maxsynccop->comp_sg_limit,
 						maxsynccop->decomp_sg_limit);
 	if (devdata->max_sync_sg < 1) {
@@ -1115,6 +1124,7 @@ static struct attribute_group nx842_attribute_group = {
 
 static struct nx842_driver nx842_pseries_driver = {
 	.owner =	THIS_MODULE,
+	.constraints =	&nx842_pseries_constraints,
 	.compress =	nx842_pseries_compress,
 	.decompress =	nx842_pseries_decompress,
 };
diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
index f1f378eef373..160fe2d97336 100644
--- a/drivers/crypto/nx/nx-842.c
+++ b/drivers/crypto/nx/nx-842.c
@@ -86,6 +86,44 @@ static void put_driver(struct nx842_driver *driver)
 	module_put(driver->owner);
 }
 
+/**
+ * nx842_constraints
+ *
+ * This provides the driver's constraints.  Different nx842 implementations
+ * may have varying requirements.  The constraints are:
+ *   @alignment:	All buffers should be aligned to this
+ *   @multiple:		All buffer lengths should be a multiple of this
+ *   @minimum:		Buffer lengths must not be less than this amount
+ *   @maximum:		Buffer lengths must not be more than this amount
+ *
+ * The constraints apply to all buffers and lengths, both input and output,
+ * for both compression and decompression, except for the minimum which
+ * only applies to compression input and decompression output; the
+ * compressed data can be less than the minimum constraint.  It can be
+ * assumed that compressed data will always adhere to the multiple
+ * constraint.
+ *
+ * The driver may succeed even if these constraints are violated;
+ * however the driver can return failure or suffer reduced performance
+ * if any constraint is not met.
+ */
+int nx842_constraints(struct nx842_constraints *c)
+{
+	struct nx842_driver *driver = get_driver();
+	int ret = 0;
+
+	if (!driver)
+		return -ENODEV;
+
+	BUG_ON(!c);
+	memcpy(c, driver->constraints, sizeof(*c));
+
+	put_driver(driver);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nx842_constraints);
+
 int nx842_compress(const unsigned char *in, unsigned int in_len,
 		   unsigned char *out, unsigned int *out_len,
 		   void *wrkmem)
diff --git a/drivers/crypto/nx/nx-842.h b/drivers/crypto/nx/nx-842.h
index 2a5d4e197c72..c6ceb0f1d04c 100644
--- a/drivers/crypto/nx/nx-842.h
+++ b/drivers/crypto/nx/nx-842.h
@@ -12,6 +12,8 @@
 struct nx842_driver {
 	struct module *owner;
 
+	struct nx842_constraints *constraints;
+
 	int (*compress)(const unsigned char *in, unsigned int in_len,
 			unsigned char *out, unsigned int *out_len,
 			void *wrkmem);
diff --git a/include/linux/nx842.h b/include/linux/nx842.h
index d919c22b7fd6..aa1a97e90dea 100644
--- a/include/linux/nx842.h
+++ b/include/linux/nx842.h
@@ -5,6 +5,15 @@
 
 #define NX842_MEM_COMPRESS	__NX842_PSERIES_MEM_COMPRESS
 
+struct nx842_constraints {
+	int alignment;
+	int multiple;
+	int minimum;
+	int maximum;
+};
+
+int nx842_constraints(struct nx842_constraints *constraints);
+
 int nx842_compress(const unsigned char *in, unsigned int in_len,
 		   unsigned char *out, unsigned int *out_len, void *wrkmem);
 int nx842_decompress(const unsigned char *in, unsigned int in_len,
-- 
cgit v1.2.3


From 99182a42b7ef3d5e4180992ce01befd9e87526d2 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Thu, 7 May 2015 13:49:19 -0400
Subject: crypto: nx - add PowerNV platform NX-842 driver

Add driver for NX-842 hardware on the PowerNV platform.

This allows the use of the 842 compression hardware coprocessor on
the PowerNV platform.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/nx/Kconfig          |  10 +
 drivers/crypto/nx/Makefile         |   2 +
 drivers/crypto/nx/nx-842-powernv.c | 625 +++++++++++++++++++++++++++++++++++++
 drivers/crypto/nx/nx-842-pseries.c |   9 -
 drivers/crypto/nx/nx-842.c         |   4 +-
 drivers/crypto/nx/nx-842.h         |  97 ++++++
 include/linux/nx842.h              |   6 +-
 7 files changed, 741 insertions(+), 12 deletions(-)
 create mode 100644 drivers/crypto/nx/nx-842-powernv.c

(limited to 'include/linux')

diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
index 34013f7d46c8..ee9e25956241 100644
--- a/drivers/crypto/nx/Kconfig
+++ b/drivers/crypto/nx/Kconfig
@@ -40,4 +40,14 @@ config CRYPTO_DEV_NX_COMPRESS_PSERIES
 	  algorithm.  This supports NX hardware on the pSeries platform.
 	  If you choose 'M' here, this module will be called nx_compress_pseries.
 
+config CRYPTO_DEV_NX_COMPRESS_POWERNV
+	tristate "Compression acceleration support on PowerNV platform"
+	depends on PPC_POWERNV
+	default y
+	help
+	  Support for PowerPC Nest (NX) compression acceleration. This
+	  module supports acceleration for compressing memory with the 842
+	  algorithm.  This supports NX hardware on the PowerNV platform.
+	  If you choose 'M' here, this module will be called nx_compress_powernv.
+
 endif
diff --git a/drivers/crypto/nx/Makefile b/drivers/crypto/nx/Makefile
index 5d9f4bc15209..6619787423b3 100644
--- a/drivers/crypto/nx/Makefile
+++ b/drivers/crypto/nx/Makefile
@@ -12,5 +12,7 @@ nx-crypto-objs := nx.o \
 
 obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS) += nx-compress.o
 obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS_PSERIES) += nx-compress-pseries.o
+obj-$(CONFIG_CRYPTO_DEV_NX_COMPRESS_POWERNV) += nx-compress-powernv.o
 nx-compress-objs := nx-842.o
 nx-compress-pseries-objs := nx-842-pseries.o
+nx-compress-powernv-objs := nx-842-powernv.o
diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
new file mode 100644
index 000000000000..6a9fb8b2d05b
--- /dev/null
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -0,0 +1,625 @@
+/*
+ * Driver for IBM PowerNV 842 compression accelerator
+ *
+ * Copyright (C) 2015 Dan Streetman, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "nx-842.h"
+
+#include <linux/timer.h>
+
+#include <asm/prom.h>
+#include <asm/icswx.h>
+
+#define MODULE_NAME NX842_POWERNV_MODULE_NAME
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
+MODULE_DESCRIPTION("842 H/W Compression driver for IBM PowerNV processors");
+
+#define WORKMEM_ALIGN	(CRB_ALIGN)
+#define CSB_WAIT_MAX	(5000) /* ms */
+
+struct nx842_workmem {
+	/* Below fields must be properly aligned */
+	struct coprocessor_request_block crb; /* CRB_ALIGN align */
+	struct data_descriptor_entry ddl_in[DDL_LEN_MAX]; /* DDE_ALIGN align */
+	struct data_descriptor_entry ddl_out[DDL_LEN_MAX]; /* DDE_ALIGN align */
+	/* Above fields must be properly aligned */
+
+	ktime_t start;
+
+	char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
+} __packed __aligned(WORKMEM_ALIGN);
+
+struct nx842_coproc {
+	unsigned int chip_id;
+	unsigned int ct;
+	unsigned int ci;
+	struct list_head list;
+};
+
+/* no cpu hotplug on powernv, so this list never changes after init */
+static LIST_HEAD(nx842_coprocs);
+static unsigned int nx842_ct;
+
+/**
+ * setup_indirect_dde - Setup an indirect DDE
+ *
+ * The DDE is setup with the the DDE count, byte count, and address of
+ * first direct DDE in the list.
+ */
+static void setup_indirect_dde(struct data_descriptor_entry *dde,
+			       struct data_descriptor_entry *ddl,
+			       unsigned int dde_count, unsigned int byte_count)
+{
+	dde->flags = 0;
+	dde->count = dde_count;
+	dde->index = 0;
+	dde->length = cpu_to_be32(byte_count);
+	dde->address = cpu_to_be64(nx842_get_pa(ddl));
+}
+
+/**
+ * setup_direct_dde - Setup single DDE from buffer
+ *
+ * The DDE is setup with the buffer and length.  The buffer must be properly
+ * aligned.  The used length is returned.
+ * Returns:
+ *   N    Successfully set up DDE with N bytes
+ */
+static unsigned int setup_direct_dde(struct data_descriptor_entry *dde,
+				     unsigned long pa, unsigned int len)
+{
+	unsigned int l = min_t(unsigned int, len, LEN_ON_PAGE(pa));
+
+	dde->flags = 0;
+	dde->count = 0;
+	dde->index = 0;
+	dde->length = cpu_to_be32(l);
+	dde->address = cpu_to_be64(pa);
+
+	return l;
+}
+
+/**
+ * setup_ddl - Setup DDL from buffer
+ *
+ * Returns:
+ *   0		Successfully set up DDL
+ */
+static int setup_ddl(struct data_descriptor_entry *dde,
+		     struct data_descriptor_entry *ddl,
+		     unsigned char *buf, unsigned int len,
+		     bool in)
+{
+	unsigned long pa = nx842_get_pa(buf);
+	int i, ret, total_len = len;
+
+	if (!IS_ALIGNED(pa, DDE_BUFFER_ALIGN)) {
+		pr_debug("%s buffer pa 0x%lx not 0x%x-byte aligned\n",
+			 in ? "input" : "output", pa, DDE_BUFFER_ALIGN);
+		return -EINVAL;
+	}
+
+	/* only need to check last mult; since buffer must be
+	 * DDE_BUFFER_ALIGN aligned, and that is a multiple of
+	 * DDE_BUFFER_SIZE_MULT, and pre-last page DDE buffers
+	 * are guaranteed a multiple of DDE_BUFFER_SIZE_MULT.
+	 */
+	if (len % DDE_BUFFER_LAST_MULT) {
+		pr_debug("%s buffer len 0x%x not a multiple of 0x%x\n",
+			 in ? "input" : "output", len, DDE_BUFFER_LAST_MULT);
+		if (in)
+			return -EINVAL;
+		len = round_down(len, DDE_BUFFER_LAST_MULT);
+	}
+
+	/* use a single direct DDE */
+	if (len <= LEN_ON_PAGE(pa)) {
+		ret = setup_direct_dde(dde, pa, len);
+		WARN_ON(ret < len);
+		return 0;
+	}
+
+	/* use the DDL */
+	for (i = 0; i < DDL_LEN_MAX && len > 0; i++) {
+		ret = setup_direct_dde(&ddl[i], pa, len);
+		buf += ret;
+		len -= ret;
+		pa = nx842_get_pa(buf);
+	}
+
+	if (len > 0) {
+		pr_debug("0x%x total %s bytes 0x%x too many for DDL.\n",
+			 total_len, in ? "input" : "output", len);
+		if (in)
+			return -EMSGSIZE;
+		total_len -= len;
+	}
+	setup_indirect_dde(dde, ddl, i, total_len);
+
+	return 0;
+}
+
+#define CSB_ERR(csb, msg, ...)					\
+	pr_err("ERROR: " msg " : %02x %02x %02x %02x %08x\n",	\
+	       ##__VA_ARGS__, (csb)->flags,			\
+	       (csb)->cs, (csb)->cc, (csb)->ce,			\
+	       be32_to_cpu((csb)->count))
+
+#define CSB_ERR_ADDR(csb, msg, ...)				\
+	CSB_ERR(csb, msg " at %lx", ##__VA_ARGS__,		\
+		(unsigned long)be64_to_cpu((csb)->address))
+
+/**
+ * wait_for_csb
+ */
+static int wait_for_csb(struct nx842_workmem *wmem,
+			struct coprocessor_status_block *csb)
+{
+	ktime_t start = wmem->start, now = ktime_get();
+	ktime_t timeout = ktime_add_ms(start, CSB_WAIT_MAX);
+
+	while (!(ACCESS_ONCE(csb->flags) & CSB_V)) {
+		cpu_relax();
+		now = ktime_get();
+		if (ktime_after(now, timeout))
+			break;
+	}
+
+	/* hw has updated csb and output buffer */
+	barrier();
+
+	/* check CSB flags */
+	if (!(csb->flags & CSB_V)) {
+		CSB_ERR(csb, "CSB still not valid after %ld us, giving up",
+			(long)ktime_us_delta(now, start));
+		return -ETIMEDOUT;
+	}
+	if (csb->flags & CSB_F) {
+		CSB_ERR(csb, "Invalid CSB format");
+		return -EPROTO;
+	}
+	if (csb->flags & CSB_CH) {
+		CSB_ERR(csb, "Invalid CSB chaining state");
+		return -EPROTO;
+	}
+
+	/* verify CSB completion sequence is 0 */
+	if (csb->cs) {
+		CSB_ERR(csb, "Invalid CSB completion sequence");
+		return -EPROTO;
+	}
+
+	/* check CSB Completion Code */
+	switch (csb->cc) {
+	/* no error */
+	case CSB_CC_SUCCESS:
+		break;
+	case CSB_CC_TPBC_GT_SPBC:
+		/* not an error, but the compressed data is
+		 * larger than the uncompressed data :(
+		 */
+		break;
+
+	/* input data errors */
+	case CSB_CC_OPERAND_OVERLAP:
+		/* input and output buffers overlap */
+		CSB_ERR(csb, "Operand Overlap error");
+		return -EINVAL;
+	case CSB_CC_INVALID_OPERAND:
+		CSB_ERR(csb, "Invalid operand");
+		return -EINVAL;
+	case CSB_CC_NOSPC:
+		/* output buffer too small */
+		return -ENOSPC;
+	case CSB_CC_ABORT:
+		CSB_ERR(csb, "Function aborted");
+		return -EINTR;
+	case CSB_CC_CRC_MISMATCH:
+		CSB_ERR(csb, "CRC mismatch");
+		return -EINVAL;
+	case CSB_CC_TEMPL_INVALID:
+		CSB_ERR(csb, "Compressed data template invalid");
+		return -EINVAL;
+	case CSB_CC_TEMPL_OVERFLOW:
+		CSB_ERR(csb, "Compressed data template shows data past end");
+		return -EINVAL;
+
+	/* these should not happen */
+	case CSB_CC_INVALID_ALIGN:
+		/* setup_ddl should have detected this */
+		CSB_ERR_ADDR(csb, "Invalid alignment");
+		return -EINVAL;
+	case CSB_CC_DATA_LENGTH:
+		/* setup_ddl should have detected this */
+		CSB_ERR(csb, "Invalid data length");
+		return -EINVAL;
+	case CSB_CC_WR_TRANSLATION:
+	case CSB_CC_TRANSLATION:
+	case CSB_CC_TRANSLATION_DUP1:
+	case CSB_CC_TRANSLATION_DUP2:
+	case CSB_CC_TRANSLATION_DUP3:
+	case CSB_CC_TRANSLATION_DUP4:
+	case CSB_CC_TRANSLATION_DUP5:
+	case CSB_CC_TRANSLATION_DUP6:
+		/* should not happen, we use physical addrs */
+		CSB_ERR_ADDR(csb, "Translation error");
+		return -EPROTO;
+	case CSB_CC_WR_PROTECTION:
+	case CSB_CC_PROTECTION:
+	case CSB_CC_PROTECTION_DUP1:
+	case CSB_CC_PROTECTION_DUP2:
+	case CSB_CC_PROTECTION_DUP3:
+	case CSB_CC_PROTECTION_DUP4:
+	case CSB_CC_PROTECTION_DUP5:
+	case CSB_CC_PROTECTION_DUP6:
+		/* should not happen, we use physical addrs */
+		CSB_ERR_ADDR(csb, "Protection error");
+		return -EPROTO;
+	case CSB_CC_PRIVILEGE:
+		/* shouldn't happen, we're in HYP mode */
+		CSB_ERR(csb, "Insufficient Privilege error");
+		return -EPROTO;
+	case CSB_CC_EXCESSIVE_DDE:
+		/* shouldn't happen, setup_ddl doesn't use many dde's */
+		CSB_ERR(csb, "Too many DDEs in DDL");
+		return -EINVAL;
+	case CSB_CC_TRANSPORT:
+		/* shouldn't happen, we setup CRB correctly */
+		CSB_ERR(csb, "Invalid CRB");
+		return -EINVAL;
+	case CSB_CC_SEGMENTED_DDL:
+		/* shouldn't happen, setup_ddl creates DDL right */
+		CSB_ERR(csb, "Segmented DDL error");
+		return -EINVAL;
+	case CSB_CC_DDE_OVERFLOW:
+		/* shouldn't happen, setup_ddl creates DDL right */
+		CSB_ERR(csb, "DDE overflow error");
+		return -EINVAL;
+	case CSB_CC_SESSION:
+		/* should not happen with ICSWX */
+		CSB_ERR(csb, "Session violation error");
+		return -EPROTO;
+	case CSB_CC_CHAIN:
+		/* should not happen, we don't use chained CRBs */
+		CSB_ERR(csb, "Chained CRB error");
+		return -EPROTO;
+	case CSB_CC_SEQUENCE:
+		/* should not happen, we don't use chained CRBs */
+		CSB_ERR(csb, "CRB seqeunce number error");
+		return -EPROTO;
+	case CSB_CC_UNKNOWN_CODE:
+		CSB_ERR(csb, "Unknown subfunction code");
+		return -EPROTO;
+
+	/* hardware errors */
+	case CSB_CC_RD_EXTERNAL:
+	case CSB_CC_RD_EXTERNAL_DUP1:
+	case CSB_CC_RD_EXTERNAL_DUP2:
+	case CSB_CC_RD_EXTERNAL_DUP3:
+		CSB_ERR_ADDR(csb, "Read error outside coprocessor");
+		return -EPROTO;
+	case CSB_CC_WR_EXTERNAL:
+		CSB_ERR_ADDR(csb, "Write error outside coprocessor");
+		return -EPROTO;
+	case CSB_CC_INTERNAL:
+		CSB_ERR(csb, "Internal error in coprocessor");
+		return -EPROTO;
+	case CSB_CC_PROVISION:
+		CSB_ERR(csb, "Storage provision error");
+		return -EPROTO;
+	case CSB_CC_HW:
+		CSB_ERR(csb, "Correctable hardware error");
+		return -EPROTO;
+
+	default:
+		CSB_ERR(csb, "Invalid CC %d", csb->cc);
+		return -EPROTO;
+	}
+
+	/* check Completion Extension state */
+	if (csb->ce & CSB_CE_TERMINATION) {
+		CSB_ERR(csb, "CSB request was terminated");
+		return -EPROTO;
+	}
+	if (csb->ce & CSB_CE_INCOMPLETE) {
+		CSB_ERR(csb, "CSB request not complete");
+		return -EPROTO;
+	}
+	if (!(csb->ce & CSB_CE_TPBC)) {
+		CSB_ERR(csb, "TPBC not provided, unknown target length");
+		return -EPROTO;
+	}
+
+	/* successful completion */
+	pr_debug_ratelimited("Processed %u bytes in %lu us\n", csb->count,
+			     (unsigned long)ktime_us_delta(now, start));
+
+	return 0;
+}
+
+/**
+ * nx842_powernv_function - compress/decompress data using the 842 algorithm
+ *
+ * (De)compression provided by the NX842 coprocessor on IBM PowerNV systems.
+ * This compresses or decompresses the provided input buffer into the provided
+ * output buffer.
+ *
+ * Upon return from this function @outlen contains the length of the
+ * output data.  If there is an error then @outlen will be 0 and an
+ * error will be specified by the return code from this function.
+ *
+ * The @workmem buffer should only be used by one function call at a time.
+ *
+ * @in: input buffer pointer
+ * @inlen: input buffer size
+ * @out: output buffer pointer
+ * @outlenp: output buffer size pointer
+ * @workmem: working memory buffer pointer, must be at least NX842_MEM_COMPRESS
+ * @fc: function code, see CCW Function Codes in nx-842.h
+ *
+ * Returns:
+ *   0		Success, output of length @outlenp stored in the buffer at @out
+ *   -ENODEV	Hardware unavailable
+ *   -ENOSPC	Output buffer is to small
+ *   -EMSGSIZE	Input buffer too large
+ *   -EINVAL	buffer constraints do not fix nx842_constraints
+ *   -EPROTO	hardware error during operation
+ *   -ETIMEDOUT	hardware did not complete operation in reasonable time
+ *   -EINTR	operation was aborted
+ */
+static int nx842_powernv_function(const unsigned char *in, unsigned int inlen,
+				  unsigned char *out, unsigned int *outlenp,
+				  void *workmem, int fc)
+{
+	struct coprocessor_request_block *crb;
+	struct coprocessor_status_block *csb;
+	struct nx842_workmem *wmem;
+	int ret;
+	u64 csb_addr;
+	u32 ccw;
+	unsigned int outlen = *outlenp;
+
+	wmem = PTR_ALIGN(workmem, WORKMEM_ALIGN);
+
+	*outlenp = 0;
+
+	/* shoudn't happen, we don't load without a coproc */
+	if (!nx842_ct) {
+		pr_err_ratelimited("coprocessor CT is 0");
+		return -ENODEV;
+	}
+
+	crb = &wmem->crb;
+	csb = &crb->csb;
+
+	/* Clear any previous values */
+	memset(crb, 0, sizeof(*crb));
+
+	/* set up DDLs */
+	ret = setup_ddl(&crb->source, wmem->ddl_in,
+			(unsigned char *)in, inlen, true);
+	if (ret)
+		return ret;
+	ret = setup_ddl(&crb->target, wmem->ddl_out,
+			out, outlen, false);
+	if (ret)
+		return ret;
+
+	/* set up CCW */
+	ccw = 0;
+	ccw = SET_FIELD(ccw, CCW_CT, nx842_ct);
+	ccw = SET_FIELD(ccw, CCW_CI_842, 0); /* use 0 for hw auto-selection */
+	ccw = SET_FIELD(ccw, CCW_FC_842, fc);
+
+	/* set up CRB's CSB addr */
+	csb_addr = nx842_get_pa(csb) & CRB_CSB_ADDRESS;
+	csb_addr |= CRB_CSB_AT; /* Addrs are phys */
+	crb->csb_addr = cpu_to_be64(csb_addr);
+
+	wmem->start = ktime_get();
+
+	/* do ICSWX */
+	ret = icswx(cpu_to_be32(ccw), crb);
+
+	pr_debug_ratelimited("icswx CR %x ccw %x crb->ccw %x\n", ret,
+			     (unsigned int)ccw,
+			     (unsigned int)be32_to_cpu(crb->ccw));
+
+	switch (ret) {
+	case ICSWX_INITIATED:
+		ret = wait_for_csb(wmem, csb);
+		break;
+	case ICSWX_BUSY:
+		pr_debug_ratelimited("842 Coprocessor busy\n");
+		ret = -EBUSY;
+		break;
+	case ICSWX_REJECTED:
+		pr_err_ratelimited("ICSWX rejected\n");
+		ret = -EPROTO;
+		break;
+	default:
+		pr_err_ratelimited("Invalid ICSWX return code %x\n", ret);
+		ret = -EPROTO;
+		break;
+	}
+
+	if (!ret)
+		*outlenp = be32_to_cpu(csb->count);
+
+	return ret;
+}
+
+/**
+ * nx842_powernv_compress - Compress data using the 842 algorithm
+ *
+ * Compression provided by the NX842 coprocessor on IBM PowerNV systems.
+ * The input buffer is compressed and the result is stored in the
+ * provided output buffer.
+ *
+ * Upon return from this function @outlen contains the length of the
+ * compressed data.  If there is an error then @outlen will be 0 and an
+ * error will be specified by the return code from this function.
+ *
+ * @in: input buffer pointer
+ * @inlen: input buffer size
+ * @out: output buffer pointer
+ * @outlenp: output buffer size pointer
+ * @workmem: working memory buffer pointer, must be at least NX842_MEM_COMPRESS
+ *
+ * Returns: see @nx842_powernv_function()
+ */
+static int nx842_powernv_compress(const unsigned char *in, unsigned int inlen,
+				  unsigned char *out, unsigned int *outlenp,
+				  void *wmem)
+{
+	return nx842_powernv_function(in, inlen, out, outlenp,
+				      wmem, CCW_FC_842_COMP_NOCRC);
+}
+
+/**
+ * nx842_powernv_decompress - Decompress data using the 842 algorithm
+ *
+ * Decompression provided by the NX842 coprocessor on IBM PowerNV systems.
+ * The input buffer is decompressed and the result is stored in the
+ * provided output buffer.
+ *
+ * Upon return from this function @outlen contains the length of the
+ * decompressed data.  If there is an error then @outlen will be 0 and an
+ * error will be specified by the return code from this function.
+ *
+ * @in: input buffer pointer
+ * @inlen: input buffer size
+ * @out: output buffer pointer
+ * @outlenp: output buffer size pointer
+ * @workmem: working memory buffer pointer, must be at least NX842_MEM_COMPRESS
+ *
+ * Returns: see @nx842_powernv_function()
+ */
+static int nx842_powernv_decompress(const unsigned char *in, unsigned int inlen,
+				    unsigned char *out, unsigned int *outlenp,
+				    void *wmem)
+{
+	return nx842_powernv_function(in, inlen, out, outlenp,
+				      wmem, CCW_FC_842_DECOMP_NOCRC);
+}
+
+static int __init nx842_powernv_probe(struct device_node *dn)
+{
+	struct nx842_coproc *coproc;
+	struct property *ct_prop, *ci_prop;
+	unsigned int ct, ci;
+	int chip_id;
+
+	chip_id = of_get_ibm_chip_id(dn);
+	if (chip_id < 0) {
+		pr_err("ibm,chip-id missing\n");
+		return -EINVAL;
+	}
+	ct_prop = of_find_property(dn, "ibm,842-coprocessor-type", NULL);
+	if (!ct_prop) {
+		pr_err("ibm,842-coprocessor-type missing\n");
+		return -EINVAL;
+	}
+	ct = be32_to_cpu(*(unsigned int *)ct_prop->value);
+	ci_prop = of_find_property(dn, "ibm,842-coprocessor-instance", NULL);
+	if (!ci_prop) {
+		pr_err("ibm,842-coprocessor-instance missing\n");
+		return -EINVAL;
+	}
+	ci = be32_to_cpu(*(unsigned int *)ci_prop->value);
+
+	coproc = kmalloc(sizeof(*coproc), GFP_KERNEL);
+	if (!coproc)
+		return -ENOMEM;
+
+	coproc->chip_id = chip_id;
+	coproc->ct = ct;
+	coproc->ci = ci;
+	INIT_LIST_HEAD(&coproc->list);
+	list_add(&coproc->list, &nx842_coprocs);
+
+	pr_info("coprocessor found on chip %d, CT %d CI %d\n", chip_id, ct, ci);
+
+	if (!nx842_ct)
+		nx842_ct = ct;
+	else if (nx842_ct != ct)
+		pr_err("NX842 chip %d, CT %d != first found CT %d\n",
+		       chip_id, ct, nx842_ct);
+
+	return 0;
+}
+
+static struct nx842_constraints nx842_powernv_constraints = {
+	.alignment =	DDE_BUFFER_ALIGN,
+	.multiple =	DDE_BUFFER_LAST_MULT,
+	.minimum =	DDE_BUFFER_LAST_MULT,
+	.maximum =	(DDL_LEN_MAX - 1) * PAGE_SIZE,
+};
+
+static struct nx842_driver nx842_powernv_driver = {
+	.owner =	THIS_MODULE,
+	.constraints =	&nx842_powernv_constraints,
+	.compress =	nx842_powernv_compress,
+	.decompress =	nx842_powernv_decompress,
+};
+
+static __init int nx842_powernv_init(void)
+{
+	struct device_node *dn;
+
+	/* verify workmem size/align restrictions */
+	BUILD_BUG_ON(sizeof(struct nx842_workmem) > NX842_MEM_COMPRESS);
+	BUILD_BUG_ON(WORKMEM_ALIGN % CRB_ALIGN);
+	BUILD_BUG_ON(CRB_ALIGN % DDE_ALIGN);
+	BUILD_BUG_ON(CRB_SIZE % DDE_ALIGN);
+	/* verify buffer size/align restrictions */
+	BUILD_BUG_ON(PAGE_SIZE % DDE_BUFFER_ALIGN);
+	BUILD_BUG_ON(DDE_BUFFER_ALIGN % DDE_BUFFER_SIZE_MULT);
+	BUILD_BUG_ON(DDE_BUFFER_SIZE_MULT % DDE_BUFFER_LAST_MULT);
+
+	pr_info("loading\n");
+
+	for_each_compatible_node(dn, NULL, NX842_POWERNV_COMPAT_NAME)
+		nx842_powernv_probe(dn);
+
+	if (!nx842_ct) {
+		pr_err("no coprocessors found\n");
+		return -ENODEV;
+	}
+
+	nx842_register_driver(&nx842_powernv_driver);
+
+	pr_info("loaded\n");
+
+	return 0;
+}
+module_init(nx842_powernv_init);
+
+static void __exit nx842_powernv_exit(void)
+{
+	struct nx842_coproc *coproc, *n;
+
+	nx842_unregister_driver(&nx842_powernv_driver);
+
+	list_for_each_entry_safe(coproc, n, &nx842_coprocs, list) {
+		list_del(&coproc->list);
+		kfree(coproc);
+	}
+
+	pr_info("unloaded\n");
+}
+module_exit(nx842_powernv_exit);
diff --git a/drivers/crypto/nx/nx-842-pseries.c b/drivers/crypto/nx/nx-842-pseries.c
index cb481d81df06..6db99924652c 100644
--- a/drivers/crypto/nx/nx-842-pseries.c
+++ b/drivers/crypto/nx/nx-842-pseries.c
@@ -160,15 +160,6 @@ static inline unsigned long nx842_get_scatterlist_size(
 	return sl->entry_nr * sizeof(struct nx842_slentry);
 }
 
-static inline unsigned long nx842_get_pa(void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		return page_to_phys(vmalloc_to_page(addr))
-		       + offset_in_page(addr);
-	else
-		return __pa(addr);
-}
-
 static int nx842_build_scatterlist(unsigned long buf, int len,
 			struct nx842_scatterlist *sl)
 {
diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
index 160fe2d97336..bf2823ceaf4e 100644
--- a/drivers/crypto/nx/nx-842.c
+++ b/drivers/crypto/nx/nx-842.c
@@ -164,7 +164,9 @@ static __init int nx842_init(void)
 {
 	pr_info("loading\n");
 
-	if (of_find_compatible_node(NULL, NULL, NX842_PSERIES_COMPAT_NAME))
+	if (of_find_compatible_node(NULL, NULL, NX842_POWERNV_COMPAT_NAME))
+		request_module_nowait(NX842_POWERNV_MODULE_NAME);
+	else if (of_find_compatible_node(NULL, NULL, NX842_PSERIES_COMPAT_NAME))
 		request_module_nowait(NX842_PSERIES_MODULE_NAME);
 	else
 		pr_err("no nx842 driver found.\n");
diff --git a/drivers/crypto/nx/nx-842.h b/drivers/crypto/nx/nx-842.h
index c6ceb0f1d04c..84b15b7448bb 100644
--- a/drivers/crypto/nx/nx-842.h
+++ b/drivers/crypto/nx/nx-842.h
@@ -5,9 +5,104 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/nx842.h>
+#include <linux/sw842.h>
 #include <linux/of.h>
 #include <linux/slab.h>
 #include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/ratelimit.h>
+
+/* Restrictions on Data Descriptor List (DDL) and Entry (DDE) buffers
+ *
+ * From NX P8 workbook, sec 4.9.1 "842 details"
+ *   Each DDE buffer is 128 byte aligned
+ *   Each DDE buffer size is a multiple of 32 bytes (except the last)
+ *   The last DDE buffer size is a multiple of 8 bytes
+ */
+#define DDE_BUFFER_ALIGN	(128)
+#define DDE_BUFFER_SIZE_MULT	(32)
+#define DDE_BUFFER_LAST_MULT	(8)
+
+/* Arbitrary DDL length limit
+ * Allows max buffer size of MAX-1 to MAX pages
+ * (depending on alignment)
+ */
+#define DDL_LEN_MAX		(17)
+
+/* CCW 842 CI/FC masks
+ * NX P8 workbook, section 4.3.1, figure 4-6
+ * "CI/FC Boundary by NX CT type"
+ */
+#define CCW_CI_842		(0x00003ff8)
+#define CCW_FC_842		(0x00000007)
+
+/* CCW Function Codes (FC) for 842
+ * NX P8 workbook, section 4.9, table 4-28
+ * "Function Code Definitions for 842 Memory Compression"
+ */
+#define CCW_FC_842_COMP_NOCRC	(0)
+#define CCW_FC_842_COMP_CRC	(1)
+#define CCW_FC_842_DECOMP_NOCRC	(2)
+#define CCW_FC_842_DECOMP_CRC	(3)
+#define CCW_FC_842_MOVE		(4)
+
+/* CSB CC Error Types for 842
+ * NX P8 workbook, section 4.10.3, table 4-30
+ * "Reported Error Types Summary Table"
+ */
+/* These are all duplicates of existing codes defined in icswx.h. */
+#define CSB_CC_TRANSLATION_DUP1	(80)
+#define CSB_CC_TRANSLATION_DUP2	(82)
+#define CSB_CC_TRANSLATION_DUP3	(84)
+#define CSB_CC_TRANSLATION_DUP4	(86)
+#define CSB_CC_TRANSLATION_DUP5	(92)
+#define CSB_CC_TRANSLATION_DUP6	(94)
+#define CSB_CC_PROTECTION_DUP1	(81)
+#define CSB_CC_PROTECTION_DUP2	(83)
+#define CSB_CC_PROTECTION_DUP3	(85)
+#define CSB_CC_PROTECTION_DUP4	(87)
+#define CSB_CC_PROTECTION_DUP5	(93)
+#define CSB_CC_PROTECTION_DUP6	(95)
+#define CSB_CC_RD_EXTERNAL_DUP1	(89)
+#define CSB_CC_RD_EXTERNAL_DUP2	(90)
+#define CSB_CC_RD_EXTERNAL_DUP3	(91)
+/* These are specific to NX */
+/* 842 codes */
+#define CSB_CC_TPBC_GT_SPBC	(64) /* no error, but >1 comp ratio */
+#define CSB_CC_CRC_MISMATCH	(65) /* decomp crc mismatch */
+#define CSB_CC_TEMPL_INVALID	(66) /* decomp invalid template value */
+#define CSB_CC_TEMPL_OVERFLOW	(67) /* decomp template shows data after end */
+/* sym crypt codes */
+#define CSB_CC_DECRYPT_OVERFLOW	(64)
+/* asym crypt codes */
+#define CSB_CC_MINV_OVERFLOW	(128)
+/* These are reserved for hypervisor use */
+#define CSB_CC_HYP_RESERVE_START	(240)
+#define CSB_CC_HYP_RESERVE_END		(253)
+#define CSB_CC_HYP_NO_HW		(254)
+#define CSB_CC_HYP_HANG_ABORTED		(255)
+
+/* CCB Completion Modes (CM) for 842
+ * NX P8 workbook, section 4.3, figure 4-5
+ * "CRB Details - Normal Cop_Req (CL=00, C=1)"
+ */
+#define CCB_CM_EXTRA_WRITE	(CCB_CM0_ALL_COMPLETIONS & CCB_CM12_STORE)
+#define CCB_CM_INTERRUPT	(CCB_CM0_ALL_COMPLETIONS & CCB_CM12_INTERRUPT)
+
+#define LEN_ON_PAGE(pa)		(PAGE_SIZE - ((pa) & ~PAGE_MASK))
+
+static inline unsigned long nx842_get_pa(void *addr)
+{
+	if (!is_vmalloc_addr(addr))
+		return __pa(addr);
+
+	return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr);
+}
+
+/* Get/Set bit fields */
+#define MASK_LSH(m)		(__builtin_ffsl(m) - 1)
+#define GET_FIELD(v, m)		(((v) & (m)) >> MASK_LSH(m))
+#define SET_FIELD(v, m, val)	(((v) & ~(m)) | (((val) << MASK_LSH(m)) & (m)))
 
 struct nx842_driver {
 	struct module *owner;
@@ -27,6 +122,8 @@ void nx842_unregister_driver(struct nx842_driver *driver);
 
 
 /* To allow the main nx-compress module to load platform module */
+#define NX842_POWERNV_MODULE_NAME	"nx-compress-powernv"
+#define NX842_POWERNV_COMPAT_NAME	"ibm,power-nx"
 #define NX842_PSERIES_MODULE_NAME	"nx-compress-pseries"
 #define NX842_PSERIES_COMPAT_NAME	"ibm,compression"
 
diff --git a/include/linux/nx842.h b/include/linux/nx842.h
index aa1a97e90dea..4ddf68d9c0d4 100644
--- a/include/linux/nx842.h
+++ b/include/linux/nx842.h
@@ -1,9 +1,11 @@
 #ifndef __NX842_H__
 #define __NX842_H__
 
-#define __NX842_PSERIES_MEM_COMPRESS	((PAGE_SIZE * 2) + 10240)
+#define __NX842_PSERIES_MEM_COMPRESS	(10240)
+#define __NX842_POWERNV_MEM_COMPRESS	(1024)
 
-#define NX842_MEM_COMPRESS	__NX842_PSERIES_MEM_COMPRESS
+#define NX842_MEM_COMPRESS	(max_t(unsigned int,			\
+	__NX842_PSERIES_MEM_COMPRESS, __NX842_POWERNV_MEM_COMPRESS))
 
 struct nx842_constraints {
 	int alignment;
-- 
cgit v1.2.3


From b19e7f51a55fe740c18038d1d6957aedfc078d07 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Date: Wed, 29 Apr 2015 18:34:59 +0300
Subject: gpio: gpio-generic: add flag to read out output value from reg_set

The change introduces BGPIOF_READ_OUTPUT_REG_SET flag for gpio-generic
GPIO chip implementation, which allows to get correct configured value
from reg_set register, input value is still get from reg_dat.

Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-generic.c     | 22 +++++++++++++++++++---
 include/linux/basic_mmio_gpio.h |  1 +
 2 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpio-generic.c b/drivers/gpio/gpio-generic.c
index b92a690f5765..9bda3727fac1 100644
--- a/drivers/gpio/gpio-generic.c
+++ b/drivers/gpio/gpio-generic.c
@@ -135,6 +135,17 @@ static unsigned long bgpio_pin2mask_be(struct bgpio_chip *bgc,
 	return 1 << (bgc->bits - 1 - pin);
 }
 
+static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio)
+{
+	struct bgpio_chip *bgc = to_bgpio_chip(gc);
+	unsigned long pinmask = bgc->pin2mask(bgc, gpio);
+
+	if (bgc->dir & pinmask)
+		return bgc->read_reg(bgc->reg_set) & pinmask;
+	else
+		return bgc->read_reg(bgc->reg_dat) & pinmask;
+}
+
 static int bgpio_get(struct gpio_chip *gc, unsigned int gpio)
 {
 	struct bgpio_chip *bgc = to_bgpio_chip(gc);
@@ -416,7 +427,8 @@ static int bgpio_setup_accessors(struct device *dev,
 static int bgpio_setup_io(struct bgpio_chip *bgc,
 			  void __iomem *dat,
 			  void __iomem *set,
-			  void __iomem *clr)
+			  void __iomem *clr,
+			  unsigned long flags)
 {
 
 	bgc->reg_dat = dat;
@@ -437,7 +449,11 @@ static int bgpio_setup_io(struct bgpio_chip *bgc,
 		bgc->gc.set_multiple = bgpio_set_multiple;
 	}
 
-	bgc->gc.get = bgpio_get;
+	if (!(flags & BGPIOF_UNREADABLE_REG_SET) &&
+	    (flags & BGPIOF_READ_OUTPUT_REG_SET))
+		bgc->gc.get = bgpio_get_set;
+	else
+		bgc->gc.get = bgpio_get;
 
 	return 0;
 }
@@ -500,7 +516,7 @@ int bgpio_init(struct bgpio_chip *bgc, struct device *dev,
 	bgc->gc.ngpio = bgc->bits;
 	bgc->gc.request = bgpio_request;
 
-	ret = bgpio_setup_io(bgc, dat, set, clr);
+	ret = bgpio_setup_io(bgc, dat, set, clr, flags);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/basic_mmio_gpio.h b/include/linux/basic_mmio_gpio.h
index 0e97856b2cff..14eea946e640 100644
--- a/include/linux/basic_mmio_gpio.h
+++ b/include/linux/basic_mmio_gpio.h
@@ -74,5 +74,6 @@ int bgpio_init(struct bgpio_chip *bgc, struct device *dev,
 #define BGPIOF_UNREADABLE_REG_SET	BIT(1) /* reg_set is unreadable */
 #define BGPIOF_UNREADABLE_REG_DIR	BIT(2) /* reg_dir is unreadable */
 #define BGPIOF_BIG_ENDIAN_BYTE_ORDER	BIT(3)
+#define BGPIOF_READ_OUTPUT_REG_SET     BIT(4) /* reg_set stores output value */
 
 #endif /* __BASIC_MMIO_GPIO_H */
-- 
cgit v1.2.3


From c884fbd452147e952ae160e750553d00ea4dc4c9 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Wed, 6 May 2015 13:29:06 +0300
Subject: gpio / ACPI: Add support for retrieving GpioInt resources from a
 device

ACPI specification knows two types of GPIOs: GpioIo and GpioInt. The latter
is used to describe that a given device interrupt line is connected to a
specific GPIO pin. Typical ACPI _CRS entry for such device looks like
below:

    Name (_CRS, ResourceTemplate ()
    {
        I2cSerialBus (0x004A, ControllerInitiated, 0x00061A80,
                      AddressingMode7Bit, "\\_SB.PCI0.I2C6",
                      0x00, ResourceConsumer)
        GpioIo (Exclusive, PullDefault, 0x0000, 0x0000,
                IoRestrictionOutputOnly, "\\_SB.GPO0",
                0x00, ResourceConsumer)
        {
            0x004B
        }
        GpioInt (Level, ActiveLow, Shared, PullDefault, 0x0000,
                 "\\_SB.GPO0", 0x00, ResourceConsumer)
        {
            0x004C
        }
    })

Currently drivers need to request a GPIO corresponding to the right GpioInt
and then translate that to Linux IRQ number. This adds unnecessary lines of
boiler-plate code.

We can ease this a bit by introducing acpi_dev_gpio_irq_get() analogous to
of_irq_get(). This function translates given GpioInt resource under the
device in question to the suitable Linux IRQ number.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-acpi.c | 29 +++++++++++++++++++++++++++++
 include/linux/acpi.h        |  7 +++++++
 2 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index d2303d50f561..bff29bb0a3fe 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -514,6 +514,35 @@ struct gpio_desc *acpi_get_gpiod_by_index(struct acpi_device *adev,
 	return lookup.desc ? lookup.desc : ERR_PTR(-ENOENT);
 }
 
+/**
+ * acpi_dev_gpio_irq_get() - Find GpioInt and translate it to Linux IRQ number
+ * @adev: pointer to a ACPI device to get IRQ from
+ * @index: index of GpioInt resource (starting from %0)
+ *
+ * If the device has one or more GpioInt resources, this function can be
+ * used to translate from the GPIO offset in the resource to the Linux IRQ
+ * number.
+ *
+ * Return: Linux IRQ number (>%0) on success, negative errno on failure.
+ */
+int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
+{
+	int idx, i;
+
+	for (i = 0, idx = 0; idx <= index; i++) {
+		struct acpi_gpio_info info;
+		struct gpio_desc *desc;
+
+		desc = acpi_get_gpiod_by_index(adev, NULL, i, &info);
+		if (IS_ERR(desc))
+			break;
+		if (info.gpioint && idx++ == index)
+			return gpiod_to_irq(desc);
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(acpi_dev_gpio_irq_get);
+
 static acpi_status
 acpi_gpio_adr_space_handler(u32 function, acpi_physical_address address,
 			    u32 bits, u64 *value, void *handler_context,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e35e29c..f57c440642cd 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -721,6 +721,8 @@ static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev)
 	if (adev)
 		adev->driver_gpios = NULL;
 }
+
+int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index);
 #else
 static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev,
 			      const struct acpi_gpio_mapping *gpios)
@@ -728,6 +730,11 @@ static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev,
 	return -ENXIO;
 }
 static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) {}
+
+static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
+{
+	return -ENXIO;
+}
 #endif
 
 /* Device properties */
-- 
cgit v1.2.3


From bda0be7ad994812960e9f8f2d2757f72cb4a96cb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 23 Mar 2015 13:37:39 +1100
Subject: security: make inode_follow_link RCU-walk aware

inode_follow_link now takes an inode and rcu flag as well as the
dentry.

inode is used in preference to d_backing_inode(dentry), particularly
in RCU-walk mode.

selinux_inode_follow_link() gets dentry_has_perm() and
inode_has_perm() open-coded into it so that it can call
avc_has_perm_flags() in way that is safe if LOOKUP_RCU is set.

Calling avc_has_perm_flags() with rcu_read_lock() held means
that when avc_has_perm_noaudit calls avc_compute_av(), the attempt
to rcu_read_unlock() before calling security_compute_av() will not
actually drop the RCU read-lock.

However as security_compute_av() is completely in a read_lock()ed
region, it should be safe with the RCU read-lock held.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c               |  5 +++--
 include/linux/security.h | 12 +++++++++---
 security/capability.c    |  3 ++-
 security/security.c      |  7 ++++---
 security/selinux/hooks.c | 16 ++++++++++++++--
 5 files changed, 32 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 33b655de0ec0..0fa7af23cff6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -881,8 +881,9 @@ const char *get_link(struct nameidata *nd)
 
 	touch_atime(&last->link);
 
-	error = security_inode_follow_link(dentry);
-	if (error)
+	error = security_inode_follow_link(dentry, inode,
+					   nd->flags & LOOKUP_RCU);
+	if (unlikely(error))
 		return ERR_PTR(error);
 
 	nd->last_type = LAST_BIND;
diff --git a/include/linux/security.h b/include/linux/security.h
index 62a66202ecf1..52febde52479 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -476,6 +476,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  * @inode_follow_link:
  *	Check permission to follow a symbolic link when looking up a pathname.
  *	@dentry contains the dentry structure for the link.
+ *	@inode contains the inode, which itself is not stable in RCU-walk
+ *	@rcu indicates whether we are in RCU-walk mode.
  *	Return 0 if permission is granted.
  * @inode_permission:
  *	Check permission before accessing an inode.  This hook is called by the
@@ -1551,7 +1553,8 @@ struct security_operations {
 	int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry,
 			     struct inode *new_dir, struct dentry *new_dentry);
 	int (*inode_readlink) (struct dentry *dentry);
-	int (*inode_follow_link) (struct dentry *dentry);
+	int (*inode_follow_link) (struct dentry *dentry, struct inode *inode,
+				  bool rcu);
 	int (*inode_permission) (struct inode *inode, int mask);
 	int (*inode_setattr)	(struct dentry *dentry, struct iattr *attr);
 	int (*inode_getattr) (const struct path *path);
@@ -1837,7 +1840,8 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
 			  struct inode *new_dir, struct dentry *new_dentry,
 			  unsigned int flags);
 int security_inode_readlink(struct dentry *dentry);
-int security_inode_follow_link(struct dentry *dentry);
+int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
+			       bool rcu);
 int security_inode_permission(struct inode *inode, int mask);
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
 int security_inode_getattr(const struct path *path);
@@ -2239,7 +2243,9 @@ static inline int security_inode_readlink(struct dentry *dentry)
 	return 0;
 }
 
-static inline int security_inode_follow_link(struct dentry *dentry)
+static inline int security_inode_follow_link(struct dentry *dentry,
+					     struct inode *inode,
+					     bool rcu)
 {
 	return 0;
 }
diff --git a/security/capability.c b/security/capability.c
index fae99db180ca..7d3f38fe02ba 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -209,7 +209,8 @@ static int cap_inode_readlink(struct dentry *dentry)
 	return 0;
 }
 
-static int cap_inode_follow_link(struct dentry *dentry)
+static int cap_inode_follow_link(struct dentry *dentry, struct inode *inode,
+				 bool rcu)
 {
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index d7c30b03e144..04c8feca081a 100644
--- a/security/security.c
+++ b/security/security.c
@@ -581,11 +581,12 @@ int security_inode_readlink(struct dentry *dentry)
 	return security_ops->inode_readlink(dentry);
 }
 
-int security_inode_follow_link(struct dentry *dentry)
+int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
+			       bool rcu)
 {
-	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
+	if (unlikely(IS_PRIVATE(inode)))
 		return 0;
-	return security_ops->inode_follow_link(dentry);
+	return security_ops->inode_follow_link(dentry, inode, rcu);
 }
 
 int security_inode_permission(struct inode *inode, int mask)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index d56a82967d5e..ffa5a642629a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2861,11 +2861,23 @@ static int selinux_inode_readlink(struct dentry *dentry)
 	return dentry_has_perm(cred, dentry, FILE__READ);
 }
 
-static int selinux_inode_follow_link(struct dentry *dentry)
+static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
+				     bool rcu)
 {
 	const struct cred *cred = current_cred();
+	struct common_audit_data ad;
+	struct inode_security_struct *isec;
+	u32 sid;
 
-	return dentry_has_perm(cred, dentry, FILE__READ);
+	validate_creds(cred);
+
+	ad.type = LSM_AUDIT_DATA_DENTRY;
+	ad.u.dentry = dentry;
+	sid = cred_sid(cred);
+	isec = inode->i_security;
+
+	return avc_has_perm_flags(sid, isec->sid, isec->sclass, FILE__READ, &ad,
+				  rcu ? MAY_NOT_BLOCK : 0);
 }
 
 static noinline int audit_inode_permission(struct inode *inode,
-- 
cgit v1.2.3


From 5f2c4179e129bdc47870a81a65d0aff85aa18293 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 May 2015 11:14:26 -0400
Subject: switch ->put_link() from dentry to inode

only one instance looks at that argument at all; that sole
exception wants inode rather than dentry.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking             |  2 +-
 Documentation/filesystems/vfs.txt             |  2 +-
 drivers/staging/lustre/lustre/llite/symlink.c |  2 +-
 fs/configfs/symlink.c                         |  2 +-
 fs/f2fs/namei.c                               |  2 +-
 fs/fuse/dir.c                                 |  2 +-
 fs/hostfs/hostfs_kern.c                       |  2 +-
 fs/hppfs/hppfs.c                              |  8 ++++----
 fs/kernfs/symlink.c                           |  2 +-
 fs/libfs.c                                    |  2 +-
 fs/namei.c                                    | 13 +++++++------
 fs/overlayfs/inode.c                          |  4 ++--
 fs/proc/inode.c                               |  2 +-
 include/linux/fs.h                            |  6 +++---
 mm/shmem.c                                    |  2 +-
 15 files changed, 27 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 5b5b4f54c033..6a34a0f4d37c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -51,7 +51,7 @@ prototypes:
 			struct inode *, struct dentry *, unsigned int);
 	int (*readlink) (struct dentry *, char __user *,int);
 	const char *(*follow_link) (struct dentry *, void **);
-	void (*put_link) (struct dentry *, void *);
+	void (*put_link) (struct inode *, void *);
 	void (*truncate) (struct inode *);
 	int (*permission) (struct inode *, int, unsigned int);
 	int (*get_acl)(struct inode *, int);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 0dec8c880be6..542d9352d0f2 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -351,7 +351,7 @@ struct inode_operations {
 			struct inode *, struct dentry *, unsigned int);
 	int (*readlink) (struct dentry *, char __user *,int);
 	const char *(*follow_link) (struct dentry *, void **);
-	void (*put_link) (struct dentry *, void *);
+	void (*put_link) (struct inode *, void *);
 	int (*permission) (struct inode *, int);
 	int (*get_acl)(struct inode *, int);
 	int (*setattr) (struct dentry *, struct iattr *);
diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c
index f3be3bf0f66f..69b203651905 100644
--- a/drivers/staging/lustre/lustre/llite/symlink.c
+++ b/drivers/staging/lustre/lustre/llite/symlink.c
@@ -141,7 +141,7 @@ static const char *ll_follow_link(struct dentry *dentry, void **cookie)
 	return symname;
 }
 
-static void ll_put_link(struct dentry *dentry, void *cookie)
+static void ll_put_link(struct inode *unused, void *cookie)
 {
 	ptlrpc_req_finished(cookie);
 }
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 0ace75649009..bc464c26e00e 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -296,7 +296,7 @@ static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
 	return ERR_PTR(error);
 }
 
-static void configfs_put_link(struct dentry *dentry, void *cookie)
+static void configfs_put_link(struct inode *unused, void *cookie)
 {
 	free_page((unsigned long)cookie);
 }
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index cd05a7c91533..71765d062914 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -301,7 +301,7 @@ static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
 	const char *link = page_follow_link_light(dentry, cookie);
 	if (!IS_ERR(link) && !*link) {
 		/* this is broken symlink case */
-		page_put_link(dentry, *cookie);
+		page_put_link(NULL, *cookie);
 		link = ERR_PTR(-ENOENT);
 	}
 	return link;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d5cdef8b7f3a..9e704c124392 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1395,7 +1395,7 @@ static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
 	return link;
 }
 
-static void fuse_put_link(struct dentry *dentry, void *cookie)
+static void fuse_put_link(struct inode *unused, void *cookie)
 {
 	free_page((unsigned long) cookie);
 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7b6ed7a908f6..4a437ab5f296 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -915,7 +915,7 @@ static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
 	return *cookie = link;
 }
 
-static void hostfs_put_link(struct dentry *dentry, void *cookie)
+static void hostfs_put_link(struct inode *unused, void *cookie)
 {
 	__putname(cookie);
 }
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 15a774eb5bbf..2867837909a9 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -649,12 +649,12 @@ static const char *hppfs_follow_link(struct dentry *dentry, void **cookie)
 	return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, cookie);
 }
 
-static void hppfs_put_link(struct dentry *dentry, void *cookie)
+static void hppfs_put_link(struct inode *inode, void *cookie)
 {
-	struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry;
+	struct inode *proc_inode = d_inode(HPPFS_I(inode)->proc_dentry);
 
-	if (d_inode(proc_dentry)->i_op->put_link)
-		d_inode(proc_dentry)->i_op->put_link(proc_dentry, cookie);
+	if (proc_inode->i_op->put_link)
+		proc_inode->i_op->put_link(proc_inode, cookie);
 }
 
 static const struct inode_operations hppfs_dir_iops = {
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 366c5a17475e..f6aa2e5a76b4 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -126,7 +126,7 @@ static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
 	return *cookie = (char *)page;
 }
 
-static void kernfs_iop_put_link(struct dentry *dentry, void *cookie)
+static void kernfs_iop_put_link(struct inode *unused, void *cookie)
 {
 	free_page((unsigned long)cookie);
 }
diff --git a/fs/libfs.c b/fs/libfs.c
index c5f3373e326b..01c337b0fec8 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1024,7 +1024,7 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 }
 EXPORT_SYMBOL(noop_fsync);
 
-void kfree_put_link(struct dentry *dentry, void *cookie)
+void kfree_put_link(struct inode *unused, void *cookie)
 {
 	kfree(cookie);
 }
diff --git a/fs/namei.c b/fs/namei.c
index 0fa7af23cff6..43034046a0e1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -749,7 +749,7 @@ static inline void put_link(struct nameidata *nd)
 	struct saved *last = nd->stack + --nd->depth;
 	struct inode *inode = last->inode;
 	if (last->cookie && inode->i_op->put_link)
-		inode->i_op->put_link(last->link.dentry, last->cookie);
+		inode->i_op->put_link(inode, last->cookie);
 	path_put(&last->link);
 }
 
@@ -4444,17 +4444,18 @@ EXPORT_SYMBOL(readlink_copy);
 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
 	void *cookie;
-	const char *link = dentry->d_inode->i_link;
+	struct inode *inode = d_inode(dentry);
+	const char *link = inode->i_link;
 	int res;
 
 	if (!link) {
-		link = dentry->d_inode->i_op->follow_link(dentry, &cookie);
+		link = inode->i_op->follow_link(dentry, &cookie);
 		if (IS_ERR(link))
 			return PTR_ERR(link);
 	}
 	res = readlink_copy(buffer, buflen, link);
-	if (dentry->d_inode->i_op->put_link)
-		dentry->d_inode->i_op->put_link(dentry, cookie);
+	if (inode->i_op->put_link)
+		inode->i_op->put_link(inode, cookie);
 	return res;
 }
 EXPORT_SYMBOL(generic_readlink);
@@ -4496,7 +4497,7 @@ const char *page_follow_link_light(struct dentry *dentry, void **cookie)
 }
 EXPORT_SYMBOL(page_follow_link_light);
 
-void page_put_link(struct dentry *dentry, void *cookie)
+void page_put_link(struct inode *unused, void *cookie)
 {
 	struct page *page = cookie;
 	kunmap(page);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 9986833c9fcc..308379b2d0b2 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -174,7 +174,7 @@ static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
 	return ret;
 }
 
-static void ovl_put_link(struct dentry *dentry, void *c)
+static void ovl_put_link(struct inode *unused, void *c)
 {
 	struct inode *realinode;
 	struct ovl_link_data *data = c;
@@ -183,7 +183,7 @@ static void ovl_put_link(struct dentry *dentry, void *c)
 		return;
 
 	realinode = data->realdentry->d_inode;
-	realinode->i_op->put_link(data->realdentry, data->cookie);
+	realinode->i_op->put_link(realinode, data->cookie);
 	kfree(data);
 }
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index eb35874fe09c..afe232b9df6e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -402,7 +402,7 @@ static const char *proc_follow_link(struct dentry *dentry, void **cookie)
 	return pde->data;
 }
 
-static void proc_put_link(struct dentry *dentry, void *p)
+static void proc_put_link(struct inode *unused, void *p)
 {
 	unuse_pde(p);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed7c9f298759..f21e3328f991 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1613,7 +1613,7 @@ struct inode_operations {
 	struct posix_acl * (*get_acl)(struct inode *, int);
 
 	int (*readlink) (struct dentry *, char __user *,int);
-	void (*put_link) (struct dentry *, void *);
+	void (*put_link) (struct inode *, void *);
 
 	int (*create) (struct inode *,struct dentry *, umode_t, bool);
 	int (*link) (struct dentry *,struct inode *,struct dentry *);
@@ -2706,12 +2706,12 @@ extern const struct file_operations generic_ro_fops;
 extern int readlink_copy(char __user *, int, const char *);
 extern int page_readlink(struct dentry *, char __user *, int);
 extern const char *page_follow_link_light(struct dentry *, void **);
-extern void page_put_link(struct dentry *, void *);
+extern void page_put_link(struct inode *, void *);
 extern int __page_symlink(struct inode *inode, const char *symname, int len,
 		int nofs);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
-extern void kfree_put_link(struct dentry *, void *);
+extern void kfree_put_link(struct inode *, void *);
 extern int generic_readlink(struct dentry *, char __user *, int);
 extern void generic_fillattr(struct inode *, struct kstat *);
 int vfs_getattr_nosec(struct path *path, struct kstat *stat);
diff --git a/mm/shmem.c b/mm/shmem.c
index e02682267046..a59087edf728 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2486,7 +2486,7 @@ static const char *shmem_follow_link(struct dentry *dentry, void **cookie)
 	return kmap(page);
 }
 
-static void shmem_put_link(struct dentry *dentry, void *cookie)
+static void shmem_put_link(struct inode *unused, void *cookie)
 {
 	struct page *page = cookie;
 	kunmap(page);
-- 
cgit v1.2.3


From ecc087ff14352aed52b8e775b4511e7f9cfc64ec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 May 2015 11:19:14 -0400
Subject: new helper: free_page_put_link()

similar to kfree_put_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/configfs/symlink.c | 7 +------
 fs/fuse/dir.c         | 7 +------
 fs/kernfs/symlink.c   | 7 +------
 fs/libfs.c            | 6 ++++++
 include/linux/fs.h    | 1 +
 5 files changed, 10 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index bc464c26e00e..ec5c8325b503 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -296,15 +296,10 @@ static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
 	return ERR_PTR(error);
 }
 
-static void configfs_put_link(struct inode *unused, void *cookie)
-{
-	free_page((unsigned long)cookie);
-}
-
 const struct inode_operations configfs_symlink_inode_operations = {
 	.follow_link = configfs_follow_link,
 	.readlink = generic_readlink,
-	.put_link = configfs_put_link,
+	.put_link = free_page_put_link,
 	.setattr = configfs_setattr,
 };
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9e704c124392..5e2e08712d3b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1395,11 +1395,6 @@ static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
 	return link;
 }
 
-static void fuse_put_link(struct inode *unused, void *cookie)
-{
-	free_page((unsigned long) cookie);
-}
-
 static int fuse_dir_open(struct inode *inode, struct file *file)
 {
 	return fuse_open_common(inode, file, true);
@@ -1915,7 +1910,7 @@ static const struct inode_operations fuse_common_inode_operations = {
 static const struct inode_operations fuse_symlink_inode_operations = {
 	.setattr	= fuse_setattr,
 	.follow_link	= fuse_follow_link,
-	.put_link	= fuse_put_link,
+	.put_link	= free_page_put_link,
 	.readlink	= generic_readlink,
 	.getattr	= fuse_getattr,
 	.setxattr	= fuse_setxattr,
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index f6aa2e5a76b4..db272528ab5b 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -126,11 +126,6 @@ static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
 	return *cookie = (char *)page;
 }
 
-static void kernfs_iop_put_link(struct inode *unused, void *cookie)
-{
-	free_page((unsigned long)cookie);
-}
-
 const struct inode_operations kernfs_symlink_iops = {
 	.setxattr	= kernfs_iop_setxattr,
 	.removexattr	= kernfs_iop_removexattr,
@@ -138,7 +133,7 @@ const struct inode_operations kernfs_symlink_iops = {
 	.listxattr	= kernfs_iop_listxattr,
 	.readlink	= generic_readlink,
 	.follow_link	= kernfs_iop_follow_link,
-	.put_link	= kernfs_iop_put_link,
+	.put_link	= free_page_put_link,
 	.setattr	= kernfs_iop_setattr,
 	.getattr	= kernfs_iop_getattr,
 	.permission	= kernfs_iop_permission,
diff --git a/fs/libfs.c b/fs/libfs.c
index 01c337b0fec8..65e1feca8b98 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1030,6 +1030,12 @@ void kfree_put_link(struct inode *unused, void *cookie)
 }
 EXPORT_SYMBOL(kfree_put_link);
 
+void free_page_put_link(struct inode *unused, void *cookie)
+{
+	free_page((unsigned long) cookie);
+}
+EXPORT_SYMBOL(free_page_put_link);
+
 /*
  * nop .set_page_dirty method so that people can use .page_mkwrite on
  * anon inodes.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f21e3328f991..8f738512c874 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2712,6 +2712,7 @@ extern int __page_symlink(struct inode *inode, const char *symname, int len,
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_put_link(struct inode *, void *);
+extern void free_page_put_link(struct inode *, void *);
 extern int generic_readlink(struct dentry *, char __user *, int);
 extern void generic_fillattr(struct inode *, struct kstat *);
 int vfs_getattr_nosec(struct path *path, struct kstat *stat);
-- 
cgit v1.2.3


From 140e807da12988e2a925fe029336e7bb67a8d4de Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 May 2015 21:07:08 -0500
Subject: tun: Utilize the normal socket network namespace refcounting.

There is no need for tun to do the weird network namespace refcounting.
The existing network namespace refcounting in tfile has almost exactly
the same lifetime.  So rewrite the code to use the struct sock network
namespace refcounting and remove the unnecessary hand rolled network
namespace refcounting and the unncesary tfile->net.

This change allows the tun code to directly call sock_put bypassing
sock_release and making SOCK_EXTERNALLY_ALLOCATED unnecessary.

Remove the now unncessary tun_release so that if anything tries to use
the sock_release code path the kernel will oops, and let us know about
the bug.

The macvtap code already uses it's internal socket this way.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c   | 24 ++++--------------------
 include/linux/net.h |  1 -
 net/socket.c        |  3 ---
 3 files changed, 4 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e470ae59d405..3262f3e2b8b2 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -146,7 +146,6 @@ struct tun_file {
 	struct socket socket;
 	struct socket_wq wq;
 	struct tun_struct __rcu *tun;
-	struct net *net;
 	struct fasync_struct *fasync;
 	/* only used for fasnyc */
 	unsigned int flags;
@@ -493,10 +492,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 			    tun->dev->reg_state == NETREG_REGISTERED)
 				unregister_netdevice(tun->dev);
 		}
-
-		BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
-				 &tfile->socket.flags));
-		sk_release_kernel(&tfile->sk);
+		sock_put(&tfile->sk);
 	}
 }
 
@@ -1492,18 +1488,10 @@ out:
 	return ret;
 }
 
-static int tun_release(struct socket *sock)
-{
-	if (sock->sk)
-		sock_put(sock->sk);
-	return 0;
-}
-
 /* Ops structure to mimic raw sockets with tun */
 static const struct proto_ops tun_socket_ops = {
 	.sendmsg = tun_sendmsg,
 	.recvmsg = tun_recvmsg,
-	.release = tun_release,
 };
 
 static struct proto tun_proto = {
@@ -1865,7 +1853,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	if (cmd == TUNSETIFF && !tun) {
 		ifr.ifr_name[IFNAMSIZ-1] = '\0';
 
-		ret = tun_set_iff(tfile->net, file, &ifr);
+		ret = tun_set_iff(sock_net(&tfile->sk), file, &ifr);
 
 		if (ret)
 			goto unlock;
@@ -2154,16 +2142,16 @@ out:
 
 static int tun_chr_open(struct inode *inode, struct file * file)
 {
+	struct net *net = current->nsproxy->net_ns;
 	struct tun_file *tfile;
 
 	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
 
-	tfile = (struct tun_file *)sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL,
+	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
 					    &tun_proto);
 	if (!tfile)
 		return -ENOMEM;
 	RCU_INIT_POINTER(tfile->tun, NULL);
-	tfile->net = get_net(current->nsproxy->net_ns);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
 
@@ -2174,13 +2162,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	tfile->socket.ops = &tun_socket_ops;
 
 	sock_init_data(&tfile->socket, &tfile->sk);
-	sk_change_net(&tfile->sk, tfile->net);
 
 	tfile->sk.sk_write_space = tun_sock_write_space;
 	tfile->sk.sk_sndbuf = INT_MAX;
 
 	file->private_data = tfile;
-	set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags);
 	INIT_LIST_HEAD(&tfile->next);
 
 	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
@@ -2191,10 +2177,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 static int tun_chr_close(struct inode *inode, struct file *file)
 {
 	struct tun_file *tfile = file->private_data;
-	struct net *net = tfile->net;
 
 	tun_detach(tfile, true);
-	put_net(net);
 
 	return 0;
 }
diff --git a/include/linux/net.h b/include/linux/net.h
index 738ea48be889..8a5e81d2bdf7 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -38,7 +38,6 @@ struct net;
 #define SOCK_NOSPACE		2
 #define SOCK_PASSCRED		3
 #define SOCK_PASSSEC		4
-#define SOCK_EXTERNALLY_ALLOCATED 5
 
 #ifndef ARCH_HAS_SOCKET_TYPES
 /**
diff --git a/net/socket.c b/net/socket.c
index 884e32997698..b5f1f43ed8f4 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -576,9 +576,6 @@ void sock_release(struct socket *sock)
 	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
 		pr_err("%s: fasync list not empty!\n", __func__);
 
-	if (test_bit(SOCK_EXTERNALLY_ALLOCATED, &sock->flags))
-		return;
-
 	this_cpu_sub(sockets_in_use, 1);
 	if (!sock->file) {
 		iput(SOCK_INODE(sock));
-- 
cgit v1.2.3


From eeb1bd5c40edb0e2fd925c8535e2fdebdbc5cef2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 May 2015 21:08:05 -0500
Subject: net: Add a struct net parameter to sock_create_kern

This is long overdue, and is part of cleaning up how we allocate kernel
sockets that don't reference count struct net.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/block/drbd/drbd_receiver.c |  4 ++--
 fs/afs/rxrpc.c                     |  2 +-
 fs/dlm/lowcomms.c                  | 16 ++++++++--------
 include/linux/net.h                |  2 +-
 net/bluetooth/rfcomm/core.c        |  2 +-
 net/ceph/messenger.c               |  4 ++--
 net/ipv4/af_inet.c                 |  2 +-
 net/ipv4/udp_tunnel.c              |  2 +-
 net/ipv6/ip6_udp_tunnel.c          |  2 +-
 net/l2tp/l2tp_core.c               |  4 ++--
 net/netfilter/ipvs/ip_vs_sync.c    |  4 ++--
 net/rxrpc/ar-local.c               |  4 ++--
 net/socket.c                       |  4 ++--
 13 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index cee20354ac37..c097909c589c 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -598,7 +598,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection)
 	memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
 
 	what = "sock_create_kern";
-	err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
+	err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
 			       SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (err < 0) {
 		sock = NULL;
@@ -693,7 +693,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce
 	memcpy(&my_addr, &connection->my_addr, my_addr_len);
 
 	what = "sock_create_kern";
-	err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
+	err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
 			       SOCK_STREAM, IPPROTO_TCP, &s_listen);
 	if (err) {
 		s_listen = NULL;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 3a57a1b0fb51..b50642870a43 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -85,7 +85,7 @@ int afs_open_socket(void)
 		return -ENOMEM;
 	}
 
-	ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
+	ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
 	if (ret < 0) {
 		destroy_workqueue(afs_async_calls);
 		_leave(" = %d [socket]", ret);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d08e079ea5d3..754fd6c0b747 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -921,8 +921,8 @@ static int tcp_accept_from_sock(struct connection *con)
 	mutex_unlock(&connections_lock);
 
 	memset(&peeraddr, 0, sizeof(peeraddr));
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
-				  IPPROTO_TCP, &newsock);
+	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+				  SOCK_STREAM, IPPROTO_TCP, &newsock);
 	if (result < 0)
 		return -ENOMEM;
 
@@ -1173,8 +1173,8 @@ static void tcp_connect_to_sock(struct connection *con)
 		goto out;
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
-				  IPPROTO_TCP, &sock);
+	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+				  SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (result < 0)
 		goto out_err;
 
@@ -1258,8 +1258,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
 		addr_len = sizeof(struct sockaddr_in6);
 
 	/* Create a socket to communicate with */
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
-				  IPPROTO_TCP, &sock);
+	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+				  SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (result < 0) {
 		log_print("Can't create listening comms socket");
 		goto create_out;
@@ -1365,8 +1365,8 @@ static int sctp_listen_for_all(void)
 
 	log_print("Using SCTP for communications");
 
-	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
-				  IPPROTO_SCTP, &sock);
+	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+				  SOCK_SEQPACKET, IPPROTO_SCTP, &sock);
 	if (result < 0) {
 		log_print("Can't create comms socket, check SCTP is loaded");
 		goto out;
diff --git a/include/linux/net.h b/include/linux/net.h
index 8a5e81d2bdf7..04aa06852771 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -207,7 +207,7 @@ void sock_unregister(int family);
 int __sock_create(struct net *net, int family, int type, int proto,
 		  struct socket **res, int kern);
 int sock_create(int family, int type, int proto, struct socket **res);
-int sock_create_kern(int family, int type, int proto, struct socket **res);
+int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res);
 int sock_create_lite(int family, int type, int proto, struct socket **res);
 void sock_release(struct socket *sock);
 int sock_sendmsg(struct socket *sock, struct msghdr *msg);
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 4fea24275b17..29709fbfd1f5 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -200,7 +200,7 @@ static int rfcomm_l2sock_create(struct socket **sock)
 
 	BT_DBG("");
 
-	err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock);
+	err = sock_create_kern(&init_net, PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock);
 	if (!err) {
 		struct sock *sk = (*sock)->sk;
 		sk->sk_data_ready   = rfcomm_l2data_ready;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 967080a9f043..073262fea6dd 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -480,8 +480,8 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 	int ret;
 
 	BUG_ON(con->sock);
-	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
-			       IPPROTO_TCP, &sock);
+	ret = sock_create_kern(&init_net, con->peer_addr.in_addr.ss_family,
+			       SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (ret)
 		return ret;
 	sock->sk->sk_allocation = GFP_NOFS;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8b47a4d79d04..09f4d024dfe5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1430,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 			 struct net *net)
 {
 	struct socket *sock;
-	int rc = sock_create_kern(family, type, protocol, &sock);
+	int rc = sock_create_kern(&init_net, family, type, protocol, &sock);
 
 	if (rc == 0) {
 		*sk = sock->sk;
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6bb98cc193c9..4e2837476967 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -15,7 +15,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
 	struct socket *sock = NULL;
 	struct sockaddr_in udp_addr;
 
-	err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+	err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &sock);
 	if (err < 0)
 		goto error;
 
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index bba8903e871f..478576b61214 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -19,7 +19,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
 	int err;
 	struct socket *sock = NULL;
 
-	err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock);
+	err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, 0, &sock);
 	if (err < 0)
 		goto error;
 
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index a29a504492af..ae513a2fe7f3 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1399,7 +1399,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
 		if (cfg->local_ip6 && cfg->peer_ip6) {
 			struct sockaddr_l2tpip6 ip6_addr = {0};
 
-			err = sock_create_kern(AF_INET6, SOCK_DGRAM,
+			err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM,
 					  IPPROTO_L2TP, &sock);
 			if (err < 0)
 				goto out;
@@ -1429,7 +1429,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
 		{
 			struct sockaddr_l2tpip ip_addr = {0};
 
-			err = sock_create_kern(AF_INET, SOCK_DGRAM,
+			err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM,
 					  IPPROTO_L2TP, &sock);
 			if (err < 0)
 				goto out;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 19b9cce6c210..2e9a5b5d1239 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1458,7 +1458,7 @@ static struct socket *make_send_sock(struct net *net, int id)
 	int result;
 
 	/* First create a socket move it to right name space later */
-	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
@@ -1518,7 +1518,7 @@ static struct socket *make_receive_sock(struct net *net, int id)
 	int result;
 
 	/* First create a socket */
-	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c
index ca904ed5400a..78483b4602bf 100644
--- a/net/rxrpc/ar-local.c
+++ b/net/rxrpc/ar-local.c
@@ -73,8 +73,8 @@ static int rxrpc_create_local(struct rxrpc_local *local)
 	_enter("%p{%d}", local, local->srx.transport_type);
 
 	/* create a socket to represent the local endpoint */
-	ret = sock_create_kern(PF_INET, local->srx.transport_type, IPPROTO_UDP,
-			       &local->socket);
+	ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
+			       IPPROTO_UDP, &local->socket);
 	if (ret < 0) {
 		_leave(" = %d [socket]", ret);
 		return ret;
diff --git a/net/socket.c b/net/socket.c
index b5f1f43ed8f4..9963a0b53a64 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1210,9 +1210,9 @@ int sock_create(int family, int type, int protocol, struct socket **res)
 }
 EXPORT_SYMBOL(sock_create);
 
-int sock_create_kern(int family, int type, int protocol, struct socket **res)
+int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res)
 {
-	return __sock_create(&init_net, family, type, protocol, res, 1);
+	return __sock_create(net, family, type, protocol, res, 1);
 }
 EXPORT_SYMBOL(sock_create_kern);
 
-- 
cgit v1.2.3


From 11aa9c28b4209242a9de0a661a7b3405adb568a0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 May 2015 21:09:13 -0500
Subject: net: Pass kern from net_proto_family.create to sk_alloc

In preparation for changing how struct net is refcounted
on kernel sockets pass the knowledge that we are creating
a kernel socket from sock_create_kern through to sk_alloc.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/af_alg.c                |  4 ++--
 drivers/isdn/mISDN/socket.c    | 12 ++++++------
 drivers/net/macvtap.c          |  2 +-
 drivers/net/ppp/pppoe.c        |  4 ++--
 drivers/net/ppp/pppox.c        |  2 +-
 drivers/net/ppp/pptp.c         |  4 ++--
 drivers/net/tun.c              |  2 +-
 include/linux/if_pppox.h       |  2 +-
 include/net/af_vsock.h         |  2 +-
 include/net/llc_conn.h         |  2 +-
 include/net/sock.h             |  2 +-
 net/appletalk/ddp.c            |  2 +-
 net/atm/common.c               |  4 ++--
 net/atm/common.h               |  2 +-
 net/atm/pvc.c                  |  2 +-
 net/atm/svc.c                  |  2 +-
 net/ax25/af_ax25.c             |  4 ++--
 net/bluetooth/bnep/sock.c      |  2 +-
 net/bluetooth/cmtp/sock.c      |  2 +-
 net/bluetooth/hci_sock.c       |  2 +-
 net/bluetooth/hidp/sock.c      |  2 +-
 net/bluetooth/l2cap_sock.c     | 10 +++++-----
 net/bluetooth/rfcomm/sock.c    |  8 ++++----
 net/bluetooth/sco.c            |  8 ++++----
 net/caif/caif_socket.c         |  2 +-
 net/can/af_can.c               |  2 +-
 net/core/sock.c                |  3 ++-
 net/decnet/af_decnet.c         |  8 ++++----
 net/ieee802154/socket.c        |  2 +-
 net/ipv4/af_inet.c             |  2 +-
 net/ipv6/af_inet6.c            |  2 +-
 net/ipx/af_ipx.c               |  2 +-
 net/irda/af_irda.c             |  2 +-
 net/iucv/af_iucv.c             | 10 +++++-----
 net/key/af_key.c               |  2 +-
 net/l2tp/l2tp_ppp.c            |  4 ++--
 net/llc/af_llc.c               |  2 +-
 net/llc/llc_conn.c             |  6 +++---
 net/netlink/af_netlink.c       | 11 +++++------
 net/netrom/af_netrom.c         |  4 ++--
 net/nfc/af_nfc.c               |  2 +-
 net/nfc/llcp.h                 |  2 +-
 net/nfc/llcp_core.c            |  2 +-
 net/nfc/llcp_sock.c            |  8 ++++----
 net/nfc/nfc.h                  |  2 +-
 net/nfc/rawsock.c              |  4 ++--
 net/packet/af_packet.c         |  2 +-
 net/phonet/af_phonet.c         |  2 +-
 net/phonet/pep.c               |  2 +-
 net/rds/af_rds.c               |  2 +-
 net/rose/af_rose.c             |  4 ++--
 net/rxrpc/af_rxrpc.c           |  2 +-
 net/sctp/ipv6.c                |  2 +-
 net/sctp/protocol.c            |  2 +-
 net/tipc/socket.c              |  2 +-
 net/unix/af_unix.c             |  8 ++++----
 net/vmw_vsock/af_vsock.c       |  7 ++++---
 net/vmw_vsock/vmci_transport.c |  2 +-
 net/x25/af_x25.c               |  8 ++++----
 59 files changed, 109 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index f22cc56fd1b3..5ad0d5354535 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -244,7 +244,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
 	if (!type)
 		goto unlock;
 
-	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto);
+	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, 0);
 	err = -ENOMEM;
 	if (!sk2)
 		goto unlock;
@@ -324,7 +324,7 @@ static int alg_create(struct net *net, struct socket *sock, int protocol,
 		return -EPROTONOSUPPORT;
 
 	err = -ENOMEM;
-	sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto);
+	sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern);
 	if (!sk)
 		goto out;
 
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 8dc7290089bb..0d29b5a6356d 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -601,14 +601,14 @@ static const struct proto_ops data_sock_ops = {
 };
 
 static int
-data_sock_create(struct net *net, struct socket *sock, int protocol)
+data_sock_create(struct net *net, struct socket *sock, int protocol, int kern)
 {
 	struct sock *sk;
 
 	if (sock->type != SOCK_DGRAM)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto);
+	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -756,14 +756,14 @@ static const struct proto_ops base_sock_ops = {
 
 
 static int
-base_sock_create(struct net *net, struct socket *sock, int protocol)
+base_sock_create(struct net *net, struct socket *sock, int protocol, int kern)
 {
 	struct sock *sk;
 
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto);
+	sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -785,7 +785,7 @@ mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern)
 
 	switch (proto) {
 	case ISDN_P_BASE:
-		err = base_sock_create(net, sock, proto);
+		err = base_sock_create(net, sock, proto, kern);
 		break;
 	case ISDN_P_TE_S0:
 	case ISDN_P_NT_S0:
@@ -799,7 +799,7 @@ mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern)
 	case ISDN_P_B_L2DTMF:
 	case ISDN_P_B_L2DSP:
 	case ISDN_P_B_L2DSPHDLC:
-		err = data_sock_create(net, sock, proto);
+		err = data_sock_create(net, sock, proto, kern);
 		break;
 	default:
 		return err;
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 8c350c5d54ad..0398631a3c24 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -476,7 +476,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 
 	err = -ENOMEM;
 	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
-					     &macvtap_proto);
+					     &macvtap_proto, 0);
 	if (!q)
 		goto out;
 
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index aa1dd926623a..f86c5ab334aa 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -546,11 +546,11 @@ static struct proto pppoe_sk_proto __read_mostly = {
  * Initialize a new struct sock.
  *
  **********************************************************************/
-static int pppoe_create(struct net *net, struct socket *sock)
+static int pppoe_create(struct net *net, struct socket *sock, int kern)
 {
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto);
+	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c
index 2940e9fe351b..0e1b30622477 100644
--- a/drivers/net/ppp/pppox.c
+++ b/drivers/net/ppp/pppox.c
@@ -118,7 +118,7 @@ static int pppox_create(struct net *net, struct socket *sock, int protocol,
 	    !try_module_get(pppox_protos[protocol]->owner))
 		goto out;
 
-	rc = pppox_protos[protocol]->create(net, sock);
+	rc = pppox_protos[protocol]->create(net, sock, kern);
 
 	module_put(pppox_protos[protocol]->owner);
 out:
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index e3bfbd4d0136..14839bc0aaf5 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -561,14 +561,14 @@ static void pptp_sock_destruct(struct sock *sk)
 	skb_queue_purge(&sk->sk_receive_queue);
 }
 
-static int pptp_create(struct net *net, struct socket *sock)
+static int pptp_create(struct net *net, struct socket *sock, int kern)
 {
 	int error = -ENOMEM;
 	struct sock *sk;
 	struct pppox_sock *po;
 	struct pptp_opt *opt;
 
-	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto);
+	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto, kern);
 	if (!sk)
 		goto out;
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3262f3e2b8b2..1a1c4f7b3ec5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2148,7 +2148,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
 
 	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
-					    &tun_proto);
+					    &tun_proto, 0);
 	if (!tfile)
 		return -ENOMEM;
 	RCU_INIT_POINTER(tfile->tun, NULL);
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index 66a7d7600f43..b49cf923becc 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -74,7 +74,7 @@ static inline struct sock *sk_pppox(struct pppox_sock *po)
 struct module;
 
 struct pppox_proto {
-	int		(*create)(struct net *net, struct socket *sock);
+	int		(*create)(struct net *net, struct socket *sock, int kern);
 	int		(*ioctl)(struct socket *sock, unsigned int cmd,
 				 unsigned long arg);
 	struct module	*owner;
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 172632dd9930..db639a4c5ab8 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -74,7 +74,7 @@ void vsock_pending_work(struct work_struct *work);
 struct sock *__vsock_create(struct net *net,
 			    struct socket *sock,
 			    struct sock *parent,
-			    gfp_t priority, unsigned short type);
+			    gfp_t priority, unsigned short type, int kern);
 
 /**** TRANSPORT ****/
 
diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h
index 0134681acc4c..fe994d2e5286 100644
--- a/include/net/llc_conn.h
+++ b/include/net/llc_conn.h
@@ -96,7 +96,7 @@ static __inline__ char llc_backlog_type(struct sk_buff *skb)
 }
 
 struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority,
-			  struct proto *prot);
+			  struct proto *prot, int kern);
 void llc_sk_free(struct sock *sk);
 
 void llc_sk_reset(struct sock *sk);
diff --git a/include/net/sock.h b/include/net/sock.h
index 3a4898ec8c67..d8dcf91732b0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1514,7 +1514,7 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
 
 
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
-		      struct proto *prot);
+		      struct proto *prot, int kern);
 void sk_free(struct sock *sk);
 void sk_release_kernel(struct sock *sk);
 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 3b7ad43c7dad..d5871ac493eb 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1030,7 +1030,7 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
 		goto out;
 	rc = -ENOMEM;
-	sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto);
+	sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern);
 	if (!sk)
 		goto out;
 	rc = 0;
diff --git a/net/atm/common.c b/net/atm/common.c
index ed0466637e13..49a872db7e42 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -141,7 +141,7 @@ static struct proto vcc_proto = {
 	.release_cb = vcc_release_cb,
 };
 
-int vcc_create(struct net *net, struct socket *sock, int protocol, int family)
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern)
 {
 	struct sock *sk;
 	struct atm_vcc *vcc;
@@ -149,7 +149,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family)
 	sock->sk = NULL;
 	if (sock->type == SOCK_STREAM)
 		return -EINVAL;
-	sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto);
+	sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 	sock_init_data(sock, sk);
diff --git a/net/atm/common.h b/net/atm/common.h
index 4d6f5b2068ac..959436b87182 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -10,7 +10,7 @@
 #include <linux/poll.h> /* for poll_table */
 
 
-int vcc_create(struct net *net, struct socket *sock, int protocol, int family);
+int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern);
 int vcc_release(struct socket *sock);
 int vcc_connect(struct socket *sock, int itf, short vpi, int vci);
 int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index ae0324021407..040207ec399f 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -136,7 +136,7 @@ static int pvc_create(struct net *net, struct socket *sock, int protocol,
 		return -EAFNOSUPPORT;
 
 	sock->ops = &pvc_proto_ops;
-	return vcc_create(net, sock, protocol, PF_ATMPVC);
+	return vcc_create(net, sock, protocol, PF_ATMPVC, kern);
 }
 
 static const struct net_proto_family pvc_family_ops = {
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 1ba23f5018e7..3fa0a9ee98d1 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -660,7 +660,7 @@ static int svc_create(struct net *net, struct socket *sock, int protocol,
 		return -EAFNOSUPPORT;
 
 	sock->ops = &svc_proto_ops;
-	error = vcc_create(net, sock, protocol, AF_ATMSVC);
+	error = vcc_create(net, sock, protocol, AF_ATMSVC, kern);
 	if (error)
 		return error;
 	ATM_SD(sock)->local.sas_family = AF_ATMSVC;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 330c1f4a5a0b..4273533d22b1 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -855,7 +855,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 	}
 
-	sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto);
+	sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
@@ -881,7 +881,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
 	struct sock *sk;
 	ax25_cb *ax25, *oax25;
 
-	sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC,	osk->sk_prot);
+	sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0);
 	if (sk == NULL)
 		return NULL;
 
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index bde2bdd9e929..b5116fa9835e 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -202,7 +202,7 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index d82787d417bd..ce86a7bae844 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -205,7 +205,7 @@ static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 56f9edbf3d05..5b14dcafcd08 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1377,7 +1377,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol,
 
 	sock->ops = &hci_sock_ops;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index cb3fdde1968a..008ba439bd62 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -235,7 +235,7 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a7278f05eafb..244287706f91 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -43,7 +43,7 @@ static struct bt_sock_list l2cap_sk_list = {
 static const struct proto_ops l2cap_sock_ops;
 static void l2cap_sock_init(struct sock *sk, struct sock *parent);
 static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
-				     int proto, gfp_t prio);
+				     int proto, gfp_t prio, int kern);
 
 bool l2cap_is_socket(struct socket *sock)
 {
@@ -1193,7 +1193,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
 	}
 
 	sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP,
-			      GFP_ATOMIC);
+			      GFP_ATOMIC, 0);
 	if (!sk) {
 		release_sock(parent);
 		return NULL;
@@ -1523,12 +1523,12 @@ static struct proto l2cap_proto = {
 };
 
 static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
-				     int proto, gfp_t prio)
+				     int proto, gfp_t prio, int kern)
 {
 	struct sock *sk;
 	struct l2cap_chan *chan;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -1574,7 +1574,7 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol,
 
 	sock->ops = &l2cap_sock_ops;
 
-	sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 825e8fb5114b..b2338e971b33 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -269,12 +269,12 @@ static struct proto rfcomm_proto = {
 	.obj_size	= sizeof(struct rfcomm_pinfo)
 };
 
-static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern)
 {
 	struct rfcomm_dlc *d;
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock,
 
 	sock->ops = &rfcomm_sock_ops;
 
-	sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -969,7 +969,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
 		goto done;
 	}
 
-	sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC);
+	sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0);
 	if (!sk)
 		goto done;
 
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 4322c833e748..6b6e59dc54cf 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -460,11 +460,11 @@ static struct proto sco_proto = {
 	.obj_size	= sizeof(struct sco_pinfo)
 };
 
-static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio)
+static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern)
 {
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto);
+	sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -501,7 +501,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
 
 	sock->ops = &sco_sock_ops;
 
-	sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC);
+	sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -1026,7 +1026,7 @@ static void sco_conn_ready(struct sco_conn *conn)
 		bh_lock_sock(parent);
 
 		sk = sco_sock_alloc(sock_net(parent), NULL,
-				    BTPROTO_SCO, GFP_ATOMIC);
+				    BTPROTO_SCO, GFP_ATOMIC, 0);
 		if (!sk) {
 			bh_unlock_sock(parent);
 			sco_conn_unlock(conn);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 4ec0c803aef1..78a04ebb113c 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -1047,7 +1047,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol,
 	 * is really not used at all in the net/core or socket.c but the
 	 * initialization makes sure that sock->state is not uninitialized.
 	 */
-	sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot);
+	sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 32d710eaf1fc..d4d404bdfc9a 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -179,7 +179,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
 
 	sock->ops = cp->ops;
 
-	sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot);
+	sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern);
 	if (!sk) {
 		err = -ENOMEM;
 		goto errout;
diff --git a/net/core/sock.c b/net/core/sock.c
index e891bcf325ca..cbc3789b830c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1396,9 +1396,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx);
  *	@family: protocol family
  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
  *	@prot: struct proto associated with this new sock instance
+ *	@kern: is this to be a kernel socket?
  */
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
-		      struct proto *prot)
+		      struct proto *prot, int kern)
 {
 	struct sock *sk;
 
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 754484b3cd0e..675cf94e04f8 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -468,10 +468,10 @@ static struct proto dn_proto = {
 	.obj_size		= sizeof(struct dn_sock),
 };
 
-static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp)
+static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern)
 {
 	struct dn_scp *scp;
-	struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto);
+	struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern);
 
 	if  (!sk)
 		goto out;
@@ -693,7 +693,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol,
 	}
 
 
-	if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL)
+	if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL)
 		return -ENOBUFS;
 
 	sk->sk_protocol = protocol;
@@ -1096,7 +1096,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags)
 
 	cb = DN_SKB_CB(skb);
 	sk->sk_ack_backlog--;
-	newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation);
+	newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0);
 	if (newsk == NULL) {
 		release_sock(sk);
 		kfree_skb(skb);
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index b60c65f70346..7aaaf967df58 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -1014,7 +1014,7 @@ static int ieee802154_create(struct net *net, struct socket *sock,
 	}
 
 	rc = -ENOMEM;
-	sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto);
+	sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern);
 	if (!sk)
 		goto out;
 	rc = 0;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 09f4d024dfe5..e2dd9cb99d61 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -317,7 +317,7 @@ lookup_protocol:
 	WARN_ON(!answer_prot->slab);
 
 	err = -ENOBUFS;
-	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
+	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
 	if (!sk)
 		goto out;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 4632afa57e05..f3866c0b6cfe 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -167,7 +167,7 @@ lookup_protocol:
 	WARN_ON(!answer_prot->slab);
 
 	err = -ENOBUFS;
-	sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot);
+	sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern);
 	if (!sk)
 		goto out;
 
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 4ea5d7497b5f..48d0dc89b58d 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1347,7 +1347,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol,
 		goto out;
 
 	rc = -ENOMEM;
-	sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto);
+	sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern);
 	if (!sk)
 		goto out;
 
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index ee0ea25c8e7a..fae6822cc367 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1100,7 +1100,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol,
 	}
 
 	/* Allocate networking socket */
-	sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto);
+	sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 6daa52a18d40..918151c11348 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -535,12 +535,12 @@ static void iucv_sock_init(struct sock *sk, struct sock *parent)
 		sk->sk_type = parent->sk_type;
 }
 
-static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio)
+static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern)
 {
 	struct sock *sk;
 	struct iucv_sock *iucv;
 
-	sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto);
+	sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto, kern);
 	if (!sk)
 		return NULL;
 	iucv = iucv_sk(sk);
@@ -602,7 +602,7 @@ static int iucv_sock_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 	}
 
-	sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL);
+	sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -1723,7 +1723,7 @@ static int iucv_callback_connreq(struct iucv_path *path,
 	}
 
 	/* Create the new socket */
-	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
 	if (!nsk) {
 		err = pr_iucv->path_sever(path, user_data);
 		iucv_path_free(path);
@@ -1933,7 +1933,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
 		goto out;
 	}
 
-	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC);
+	nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
 	bh_lock_sock(sk);
 	if ((sk->sk_state != IUCV_LISTEN) ||
 	    sk_acceptq_is_full(sk) ||
diff --git a/net/key/af_key.c b/net/key/af_key.c
index f0d52d721b3a..9e834ec475a9 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -149,7 +149,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
 		return -EPROTONOSUPPORT;
 
 	err = -ENOMEM;
-	sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto);
+	sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern);
 	if (sk == NULL)
 		goto out;
 
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index e9b0dec56b8e..f56c9f69e9f2 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -542,12 +542,12 @@ static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb)
 
 /* socket() handler. Initialize a new struct sock.
  */
-static int pppol2tp_create(struct net *net, struct socket *sock)
+static int pppol2tp_create(struct net *net, struct socket *sock, int kern)
 {
 	int error = -ENOMEM;
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
+	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern);
 	if (!sk)
 		goto out;
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 17a8dff06090..8fd9febaa5ba 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -168,7 +168,7 @@ static int llc_ui_create(struct net *net, struct socket *sock, int protocol,
 
 	if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) {
 		rc = -ENOMEM;
-		sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto);
+		sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern);
 		if (sk) {
 			rc = 0;
 			llc_ui_sk_init(sock, sk);
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 81a61fce3afb..3e821daf9dd4 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -768,7 +768,7 @@ static struct sock *llc_create_incoming_sock(struct sock *sk,
 					     struct llc_addr *daddr)
 {
 	struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC,
-					  sk->sk_prot);
+					  sk->sk_prot, 0);
 	struct llc_sock *newllc, *llc = llc_sk(sk);
 
 	if (!newsk)
@@ -931,9 +931,9 @@ static void llc_sk_init(struct sock *sk)
  *	Allocates a LLC sock and initializes it. Returns the new LLC sock
  *	or %NULL if there's no memory available for one
  */
-struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot)
+struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern)
 {
-	struct sock *sk = sk_alloc(net, family, priority, prot);
+	struct sock *sk = sk_alloc(net, family, priority, prot, kern);
 
 	if (!sk)
 		goto out;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a5fff75accf8..be6665ab7f40 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1119,14 +1119,15 @@ static struct proto netlink_proto = {
 };
 
 static int __netlink_create(struct net *net, struct socket *sock,
-			    struct mutex *cb_mutex, int protocol)
+			    struct mutex *cb_mutex, int protocol,
+			    int kern)
 {
 	struct sock *sk;
 	struct netlink_sock *nlk;
 
 	sock->ops = &netlink_ops;
 
-	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
+	sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
@@ -1188,7 +1189,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
 	if (err < 0)
 		goto out;
 
-	err = __netlink_create(net, sock, cb_mutex, protocol);
+	err = __netlink_create(net, sock, cb_mutex, protocol, kern);
 	if (err < 0)
 		goto out_module;
 
@@ -2515,14 +2516,12 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module,
 
 	if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
 		return NULL;
-
 	/*
 	 * We have to just have a reference on the net from sk, but don't
 	 * get_net it. Besides, we cannot get and then put the net here.
 	 * So we create one inside init_net and the move it to net.
 	 */
-
-	if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0)
+	if (__netlink_create(&init_net, sock, cb_mutex, unit, 0) < 0)
 		goto out_sock_release_nosk;
 
 	sk = sock->sk;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index b987fd56c3c5..ed212ffc1d9d 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -433,7 +433,7 @@ static int nr_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_SEQPACKET || protocol != 0)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto);
+	sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern);
 	if (sk  == NULL)
 		return -ENOMEM;
 
@@ -476,7 +476,7 @@ static struct sock *nr_make_new(struct sock *osk)
 	if (osk->sk_type != SOCK_SEQPACKET)
 		return NULL;
 
-	sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot);
+	sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0);
 	if (sk == NULL)
 		return NULL;
 
diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c
index 2277276f52bc..54e40fa47822 100644
--- a/net/nfc/af_nfc.c
+++ b/net/nfc/af_nfc.c
@@ -40,7 +40,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto,
 
 	read_lock(&proto_tab_lock);
 	if (proto_tab[proto] &&	try_module_get(proto_tab[proto]->owner)) {
-		rc = proto_tab[proto]->create(net, sock, proto_tab[proto]);
+		rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern);
 		module_put(proto_tab[proto]->owner);
 	}
 	read_unlock(&proto_tab_lock);
diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h
index de1789e3cc82..1f68724d44d3 100644
--- a/net/nfc/llcp.h
+++ b/net/nfc/llcp.h
@@ -225,7 +225,7 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local,
 			       struct sk_buff *skb, u8 direction);
 
 /* Sock API */
-struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp);
+struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern);
 void nfc_llcp_sock_free(struct nfc_llcp_sock *sock);
 void nfc_llcp_accept_unlink(struct sock *sk);
 void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk);
diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
index b18f07ccb504..98876274a1ee 100644
--- a/net/nfc/llcp_core.c
+++ b/net/nfc/llcp_core.c
@@ -934,7 +934,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
 		sock->ssap = ssap;
 	}
 
-	new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC);
+	new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0);
 	if (new_sk == NULL) {
 		reason = LLCP_DM_REJ;
 		release_sock(&sock->sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 9578bd6a4f3e..b7de0da46acd 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -942,12 +942,12 @@ static void llcp_sock_destruct(struct sock *sk)
 	}
 }
 
-struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp)
+struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern)
 {
 	struct sock *sk;
 	struct nfc_llcp_sock *llcp_sock;
 
-	sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto);
+	sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -993,7 +993,7 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock)
 }
 
 static int llcp_sock_create(struct net *net, struct socket *sock,
-			    const struct nfc_protocol *nfc_proto)
+			    const struct nfc_protocol *nfc_proto, int kern)
 {
 	struct sock *sk;
 
@@ -1009,7 +1009,7 @@ static int llcp_sock_create(struct net *net, struct socket *sock,
 	else
 		sock->ops = &llcp_sock_ops;
 
-	sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC);
+	sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index a8ce80b47720..5c93e8412a26 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -30,7 +30,7 @@ struct nfc_protocol {
 	struct proto *proto;
 	struct module *owner;
 	int (*create)(struct net *net, struct socket *sock,
-		      const struct nfc_protocol *nfc_proto);
+		      const struct nfc_protocol *nfc_proto, int kern);
 };
 
 struct nfc_rawsock {
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 82b4e8024778..e9a91488fe3d 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -334,7 +334,7 @@ static void rawsock_destruct(struct sock *sk)
 }
 
 static int rawsock_create(struct net *net, struct socket *sock,
-			  const struct nfc_protocol *nfc_proto)
+			  const struct nfc_protocol *nfc_proto, int kern)
 {
 	struct sock *sk;
 
@@ -348,7 +348,7 @@ static int rawsock_create(struct net *net, struct socket *sock,
 	else
 		sock->ops = &rawsock_ops;
 
-	sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto);
+	sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 5102c3cc4eec..94713276a1d9 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2832,7 +2832,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock->state = SS_UNCONNECTED;
 
 	err = -ENOBUFS;
-	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
+	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
 	if (sk == NULL)
 		goto out;
 
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 32ab87d34828..10d42f3220ab 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -97,7 +97,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
 		goto out;
 	}
 
-	sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot);
+	sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern);
 	if (sk == NULL) {
 		err = -ENOMEM;
 		goto out;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 6de2aeb98a1f..850a86cde0b3 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -845,7 +845,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp)
 	}
 
 	/* Create a new to-be-accepted sock */
-	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot);
+	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0);
 	if (!newsk) {
 		pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
 		err = -ENOBUFS;
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 10443377fb9d..3d83641f2861 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -440,7 +440,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_SEQPACKET || protocol)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
+	sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 8ae603069a1a..36dbc2da3661 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -520,7 +520,7 @@ static int rose_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type != SOCK_SEQPACKET || protocol != 0)
 		return -ESOCKTNOSUPPORT;
 
-	sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto);
+	sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
@@ -559,7 +559,7 @@ static struct sock *rose_make_new(struct sock *osk)
 	if (osk->sk_type != SOCK_SEQPACKET)
 		return NULL;
 
-	sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto);
+	sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0);
 	if (sk == NULL)
 		return NULL;
 
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 0095b9a0b779..25d60ed15284 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -632,7 +632,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
 	sock->ops = &rxrpc_rpc_ops;
 	sock->state = SS_UNCONNECTED;
 
-	sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto);
+	sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern);
 	if (!sk)
 		return -ENOMEM;
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0e4198ee2370..e703ff7fed40 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -635,7 +635,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct sctp6_sock *newsctp6sk;
 
-	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot);
+	newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0);
 	if (!newsk)
 		goto out;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 53b7acde9aa3..59e80356672b 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -550,7 +550,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk,
 					     struct sctp_association *asoc)
 {
 	struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL,
-			sk->sk_prot);
+			sk->sk_prot, 0);
 	struct inet_sock *newinet;
 
 	if (!newsk)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 9074b5cede38..8f3c8e2cef8e 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -342,7 +342,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 	}
 
 	/* Allocate socket's protocol area */
-	sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto);
+	sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern);
 	if (sk == NULL)
 		return -ENOMEM;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5266ea7b922b..941b3d26e3bf 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -620,7 +620,7 @@ static struct proto unix_proto = {
  */
 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 
-static struct sock *unix_create1(struct net *net, struct socket *sock)
+static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
 {
 	struct sock *sk = NULL;
 	struct unix_sock *u;
@@ -629,7 +629,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 		goto out;
 
-	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
+	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
 	if (!sk)
 		goto out;
 
@@ -688,7 +688,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 	}
 
-	return unix_create1(net, sock) ? 0 : -ENOMEM;
+	return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
 }
 
 static int unix_release(struct socket *sock)
@@ -1088,7 +1088,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	err = -ENOMEM;
 
 	/* create new sock for complete connection */
-	newsk = unix_create1(sock_net(sk), NULL);
+	newsk = unix_create1(sock_net(sk), NULL, 0);
 	if (newsk == NULL)
 		goto out;
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 2ec86e652a19..df5fc6b340f1 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -581,13 +581,14 @@ struct sock *__vsock_create(struct net *net,
 			    struct socket *sock,
 			    struct sock *parent,
 			    gfp_t priority,
-			    unsigned short type)
+			    unsigned short type,
+			    int kern)
 {
 	struct sock *sk;
 	struct vsock_sock *psk;
 	struct vsock_sock *vsk;
 
-	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto);
+	sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern);
 	if (!sk)
 		return NULL;
 
@@ -1866,7 +1867,7 @@ static int vsock_create(struct net *net, struct socket *sock,
 
 	sock->state = SS_UNCONNECTED;
 
-	return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM;
+	return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM;
 }
 
 static const struct net_proto_family vsock_family_ops = {
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index c294da095461..1f63daff3965 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1022,7 +1022,7 @@ static int vmci_transport_recv_listen(struct sock *sk,
 	}
 
 	pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
-				 sk->sk_type);
+				 sk->sk_type, 0);
 	if (!pending) {
 		vmci_transport_send_reset(sk, pkt);
 		return -ENOMEM;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index c3ab230e4493..a750f330b8dd 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -515,10 +515,10 @@ static struct proto x25_proto = {
 	.obj_size = sizeof(struct x25_sock),
 };
 
-static struct sock *x25_alloc_socket(struct net *net)
+static struct sock *x25_alloc_socket(struct net *net, int kern)
 {
 	struct x25_sock *x25;
-	struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto);
+	struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern);
 
 	if (!sk)
 		goto out;
@@ -553,7 +553,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol,
 		goto out;
 
 	rc = -ENOBUFS;
-	if ((sk = x25_alloc_socket(net)) == NULL)
+	if ((sk = x25_alloc_socket(net, kern)) == NULL)
 		goto out;
 
 	x25 = x25_sk(sk);
@@ -602,7 +602,7 @@ static struct sock *x25_make_new(struct sock *osk)
 	if (osk->sk_type != SOCK_SEQPACKET)
 		goto out;
 
-	if ((sk = x25_alloc_socket(sock_net(osk))) == NULL)
+	if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL)
 		goto out;
 
 	x25 = x25_sk(sk);
-- 
cgit v1.2.3


From d2788d34885d4ce5ba17a8996fd95d28942e574e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 9 May 2015 22:51:32 +0200
Subject: net: sched: further simplify handle_ing

Ingress qdisc has no other purpose than calling into tc_classify()
that executes attached classifier(s) and action(s).

It has a 1:1 relationship to dev->ingress_queue. After having commit
087c1a601ad7 ("net: sched: run ingress qdisc without locks") removed
the central ingress lock, one major contention point is gone.

The extra indirection layers however, are not necessary for calling
into ingress qdisc. pktgen calling locally into netif_receive_skb()
with a dummy u32, single CPU result on a Supermicro X10SLM-F, Xeon
E3-1240: before ~21,1 Mpps, after patch ~22,9 Mpps.

We can redirect the private classifier list to the netdev directly,
without changing any classifier API bits (!) and execute on that from
handle_ing() side. The __QDISC_STATE_DEACTIVATE test can be removed,
ingress qdisc doesn't have a queue and thus dev_deactivate_queue()
is also not applicable, ingress_cl_list provides similar behaviour.
In other words, ingress qdisc acts like TCQ_F_BUILTIN qdisc.

One next possible step is the removal of the dev's ingress (dummy)
netdev_queue, and to only have the list member in the netdevice
itself.

Note, the filter chain is RCU protected and individual filter elements
are being kfree'd by sched subsystem after RCU grace period. RCU read
lock is being held by __netif_receive_skb_core().

Joint work with Alexei Starovoitov.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 ++++
 net/core/dev.c            | 30 ++++++++++++++----------
 net/sched/sch_ingress.c   | 58 ++++++++---------------------------------------
 3 files changed, 31 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1899c74a7127..c4e1caf6056f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1655,7 +1655,11 @@ struct net_device {
 	rx_handler_func_t __rcu	*rx_handler;
 	void __rcu		*rx_handler_data;
 
+#if CONFIG_NET_CLS_ACT
+	struct tcf_proto __rcu  *ingress_cl_list;
+#endif
 	struct netdev_queue __rcu *ingress_queue;
+
 	unsigned char		broadcast[MAX_ADDR_LEN];
 #ifdef CONFIG_RFS_ACCEL
 	struct cpu_rmap		*rx_cpu_rmap;
diff --git a/net/core/dev.c b/net/core/dev.c
index 8a757464bfa2..e5f77c40bbd1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3525,31 +3525,37 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 					 struct packet_type **pt_prev,
 					 int *ret, struct net_device *orig_dev)
 {
-	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
-	struct Qdisc *q;
+	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
+	struct tcf_result cl_res;
 
 	/* If there's at least one ingress present somewhere (so
 	 * we get here via enabled static key), remaining devices
 	 * that are not configured with an ingress qdisc will bail
-	 * out w/o the rcu_dereference().
+	 * out here.
 	 */
-	if (!rxq || (q = rcu_dereference(rxq->qdisc)) == &noop_qdisc)
+	if (!cl)
 		return skb;
-
 	if (*pt_prev) {
 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
 		*pt_prev = NULL;
 	}
 
+	qdisc_bstats_update_cpu(cl->q, skb);
 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
 
-	if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
-		switch (qdisc_enqueue_root(skb, q)) {
-		case TC_ACT_SHOT:
-		case TC_ACT_STOLEN:
-			kfree_skb(skb);
-			return NULL;
-		}
+	switch (tc_classify(skb, cl, &cl_res)) {
+	case TC_ACT_OK:
+	case TC_ACT_RECLASSIFY:
+		skb->tc_index = TC_H_MIN(cl_res.classid);
+		break;
+	case TC_ACT_SHOT:
+		qdisc_qstats_drop_cpu(cl->q);
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+		kfree_skb(skb);
+		return NULL;
+	default:
+		break;
 	}
 
 	return skb;
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index a89cc3278bfb..e7c648fa9dc3 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -12,16 +12,10 @@
 #include <linux/list.h>
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
+
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 
-
-struct ingress_qdisc_data {
-	struct tcf_proto __rcu	*filter_list;
-};
-
-/* ------------------------- Class/flow operations ------------------------- */
-
 static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
 {
 	return NULL;
@@ -49,45 +43,11 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch,
 						 unsigned long cl)
 {
-	struct ingress_qdisc_data *p = qdisc_priv(sch);
-
-	return &p->filter_list;
-}
-
-/* --------------------------- Qdisc operations ---------------------------- */
+	struct net_device *dev = qdisc_dev(sch);
 
-static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
-{
-	struct ingress_qdisc_data *p = qdisc_priv(sch);
-	struct tcf_result res;
-	struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
-	int result;
-
-	result = tc_classify(skb, fl, &res);
-
-	qdisc_bstats_update_cpu(sch, skb);
-	switch (result) {
-	case TC_ACT_SHOT:
-		result = TC_ACT_SHOT;
-		qdisc_qstats_drop_cpu(sch);
-		break;
-	case TC_ACT_STOLEN:
-	case TC_ACT_QUEUED:
-		result = TC_ACT_STOLEN;
-		break;
-	case TC_ACT_RECLASSIFY:
-	case TC_ACT_OK:
-		skb->tc_index = TC_H_MIN(res.classid);
-	default:
-		result = TC_ACT_OK;
-		break;
-	}
-
-	return result;
+	return &dev->ingress_cl_list;
 }
 
-/* ------------------------------------------------------------- */
-
 static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	net_inc_ingress_queue();
@@ -98,9 +58,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
 
 static void ingress_destroy(struct Qdisc *sch)
 {
-	struct ingress_qdisc_data *p = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
 
-	tcf_destroy_chain(&p->filter_list);
+	tcf_destroy_chain(&dev->ingress_cl_list);
 	net_dec_ingress_queue();
 }
 
@@ -111,6 +71,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
+
 	return nla_nest_end(skb, nest);
 
 nla_put_failure:
@@ -131,8 +92,6 @@ static const struct Qdisc_class_ops ingress_class_ops = {
 static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
 	.cl_ops		=	&ingress_class_ops,
 	.id		=	"ingress",
-	.priv_size	=	sizeof(struct ingress_qdisc_data),
-	.enqueue	=	ingress_enqueue,
 	.init		=	ingress_init,
 	.destroy	=	ingress_destroy,
 	.dump		=	ingress_dump,
@@ -149,6 +108,7 @@ static void __exit ingress_module_exit(void)
 	unregister_qdisc(&ingress_qdisc_ops);
 }
 
-module_init(ingress_module_init)
-module_exit(ingress_module_exit)
+module_init(ingress_module_init);
+module_exit(ingress_module_exit);
+
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 6be109b31ccdb9c98e7be12687171f6602527a5d Mon Sep 17 00:00:00 2001
From: Arun Ramamurthy <arun.ramamurthy@broadcom.com>
Date: Wed, 22 Apr 2015 16:04:11 -0700
Subject: phy: core: Add devm_of_phy_get_by_index to phy-core

Some generic drivers, such as ehci, may use multiple phys and for such
drivers referencing phy(s) by name(s) does not make sense. Instead of
inventing new naming schemes and using custom code to iterate through them,
such drivers are better of using nameless phy bindings and using this newly
introduced API to iterate through them.

Signed-off-by: Arun Ramamurthy <arun.ramamurthy@broadcom.com>
Reviewed-by: Ray Jui <rjui@broadcom.com>
Reviewed-by: Scott Branden <sbranden@broadcom.com>
[kishon@ti.com: fix compilation errors]
Signed-off-by: Kishon Vijay Abraham I <kishon@ti.com>
---
 Documentation/phy.txt   |  7 ++++++-
 drivers/phy/phy-core.c  | 32 ++++++++++++++++++++++++++++++++
 include/linux/phy/phy.h |  9 +++++++++
 3 files changed, 47 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/phy.txt b/Documentation/phy.txt
index 371361c69a4b..b388c5af9e72 100644
--- a/Documentation/phy.txt
+++ b/Documentation/phy.txt
@@ -76,6 +76,8 @@ struct phy *phy_get(struct device *dev, const char *string);
 struct phy *phy_optional_get(struct device *dev, const char *string);
 struct phy *devm_phy_get(struct device *dev, const char *string);
 struct phy *devm_phy_optional_get(struct device *dev, const char *string);
+struct phy *devm_of_phy_get_by_index(struct device *dev, struct device_node *np,
+				     int index);
 
 phy_get, phy_optional_get, devm_phy_get and devm_phy_optional_get can
 be used to get the PHY. In the case of dt boot, the string arguments
@@ -86,7 +88,10 @@ successful PHY get. On driver detach, release function is invoked on
 the the devres data and devres data is freed. phy_optional_get and
 devm_phy_optional_get should be used when the phy is optional. These
 two functions will never return -ENODEV, but instead returns NULL when
-the phy cannot be found.
+the phy cannot be found.Some generic drivers, such as ehci, may use multiple
+phys and for such drivers referencing phy(s) by name(s) does not make sense. In
+this case, devm_of_phy_get_by_index can be used to get a phy reference based on
+the index.
 
 It should be noted that NULL is a valid phy reference. All phy
 consumer calls on the NULL phy become NOPs. That is the release calls,
diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c
index 3791838f4bd4..964a84d5a580 100644
--- a/drivers/phy/phy-core.c
+++ b/drivers/phy/phy-core.c
@@ -622,6 +622,38 @@ struct phy *devm_of_phy_get(struct device *dev, struct device_node *np,
 }
 EXPORT_SYMBOL_GPL(devm_of_phy_get);
 
+/**
+ * devm_of_phy_get_by_index() - lookup and obtain a reference to a phy by index.
+ * @dev: device that requests this phy
+ * @np: node containing the phy
+ * @index: index of the phy
+ *
+ * Gets the phy using _of_phy_get(), and associates a device with it using
+ * devres. On driver detach, release function is invoked on the devres data,
+ * then, devres data is freed.
+ *
+ */
+struct phy *devm_of_phy_get_by_index(struct device *dev, struct device_node *np,
+				     int index)
+{
+	struct phy **ptr, *phy;
+
+	ptr = devres_alloc(devm_phy_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	phy = _of_phy_get(np, index);
+	if (!IS_ERR(phy)) {
+		*ptr = phy;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return phy;
+}
+EXPORT_SYMBOL_GPL(devm_of_phy_get_by_index);
+
 /**
  * phy_create() - create a new phy
  * @dev: device that is creating the new phy
diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index a0197fa1b116..8cf05e341cff 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -133,6 +133,8 @@ struct phy *devm_phy_get(struct device *dev, const char *string);
 struct phy *devm_phy_optional_get(struct device *dev, const char *string);
 struct phy *devm_of_phy_get(struct device *dev, struct device_node *np,
 			    const char *con_id);
+struct phy *devm_of_phy_get_by_index(struct device *dev, struct device_node *np,
+				     int index);
 void phy_put(struct phy *phy);
 void devm_phy_put(struct device *dev, struct phy *phy);
 struct phy *of_phy_get(struct device_node *np, const char *con_id);
@@ -261,6 +263,13 @@ static inline struct phy *devm_of_phy_get(struct device *dev,
 	return ERR_PTR(-ENOSYS);
 }
 
+static inline struct phy *devm_of_phy_get_by_index(struct device *dev,
+						   struct device_node *np,
+						   int index)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
 static inline void phy_put(struct phy *phy)
 {
 }
-- 
cgit v1.2.3


From 4cda01e86f68dacc758b2daf6e8809f2ce915b85 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 11 May 2015 19:28:49 +0200
Subject: net: sched: fix typo in net_device ifdef

This should have been #ifdef not #if.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Fixes: d2788d34885d ("net: sched: further simplify handle_ing")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c4e1caf6056f..a6d706b2a947 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1655,7 +1655,7 @@ struct net_device {
 	rx_handler_func_t __rcu	*rx_handler;
 	void __rcu		*rx_handler_data;
 
-#if CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_CLS_ACT
 	struct tcf_proto __rcu  *ingress_cl_list;
 #endif
 	struct netdev_queue __rcu *ingress_queue;
-- 
cgit v1.2.3


From 5844feeaa4154d1c46d3462c7a4653d22356d8b4 Mon Sep 17 00:00:00 2001
From: Brian Norris <computersforpeace@gmail.com>
Date: Fri, 23 Jan 2015 00:22:27 -0800
Subject: mtd: nand: add common DT init code

These are already-documented common bindings for NAND chips. Let's
handle them in nand_base.

If NAND controller drivers need to act on this data before bringing up
the NAND chip (e.g., fill out ECC callback functions, change HW modes,
etc.), then they can do so between calling nand_scan_ident() and
nand_scan_tail().

Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 drivers/mtd/nand/nand_base.c | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/nand.h     |  5 +++++
 2 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 0afe76315c38..c4619e3ac4ab 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -48,6 +48,7 @@
 #include <linux/leds.h>
 #include <linux/io.h>
 #include <linux/mtd/partitions.h>
+#include <linux/of_mtd.h>
 
 /* Define default oob placement schemes for large and small page devices */
 static struct nand_ecclayout nand_oob_8 = {
@@ -3795,6 +3796,39 @@ ident_done:
 	return type;
 }
 
+static int nand_dt_init(struct mtd_info *mtd, struct nand_chip *chip,
+			struct device_node *dn)
+{
+	int ecc_mode, ecc_strength, ecc_step;
+
+	if (of_get_nand_bus_width(dn) == 16)
+		chip->options |= NAND_BUSWIDTH_16;
+
+	if (of_get_nand_on_flash_bbt(dn))
+		chip->bbt_options |= NAND_BBT_USE_FLASH;
+
+	ecc_mode = of_get_nand_ecc_mode(dn);
+	ecc_strength = of_get_nand_ecc_strength(dn);
+	ecc_step = of_get_nand_ecc_step_size(dn);
+
+	if ((ecc_step >= 0 && !(ecc_strength >= 0)) ||
+	    (!(ecc_step >= 0) && ecc_strength >= 0)) {
+		pr_err("must set both strength and step size in DT\n");
+		return -EINVAL;
+	}
+
+	if (ecc_mode >= 0)
+		chip->ecc.mode = ecc_mode;
+
+	if (ecc_strength >= 0)
+		chip->ecc.strength = ecc_strength;
+
+	if (ecc_step > 0)
+		chip->ecc.size = ecc_step;
+
+	return 0;
+}
+
 /**
  * nand_scan_ident - [NAND Interface] Scan for the NAND device
  * @mtd: MTD device structure
@@ -3812,6 +3846,13 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips,
 	int i, nand_maf_id, nand_dev_id;
 	struct nand_chip *chip = mtd->priv;
 	struct nand_flash_dev *type;
+	int ret;
+
+	if (chip->dn) {
+		ret = nand_dt_init(mtd, chip, chip->dn);
+		if (ret)
+			return ret;
+	}
 
 	/* Set the default functions */
 	nand_set_defaults(chip, chip->options & NAND_BUSWIDTH_16);
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 6c51876941f3..f25e2bdd188c 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -26,6 +26,8 @@
 
 struct mtd_info;
 struct nand_flash_dev;
+struct device_node;
+
 /* Scan and identify a NAND device */
 extern int nand_scan(struct mtd_info *mtd, int max_chips);
 /*
@@ -542,6 +544,7 @@ struct nand_buffers {
  *			flash device
  * @IO_ADDR_W:		[BOARDSPECIFIC] address to write the 8 I/O lines of the
  *			flash device.
+ * @dn:			[BOARDSPECIFIC] device node describing this instance
  * @read_byte:		[REPLACEABLE] read one byte from the chip
  * @read_word:		[REPLACEABLE] read one word from the chip
  * @write_byte:		[REPLACEABLE] write a single byte to the chip on the
@@ -644,6 +647,8 @@ struct nand_chip {
 	void __iomem *IO_ADDR_R;
 	void __iomem *IO_ADDR_W;
 
+	struct device_node *dn;
+
 	uint8_t (*read_byte)(struct mtd_info *mtd);
 	u16 (*read_word)(struct mtd_info *mtd);
 	void (*write_byte)(struct mtd_info *mtd, uint8_t byte);
-- 
cgit v1.2.3


From 9d0be7f4810257a9b0fc78fff641f14409f14ab3 Mon Sep 17 00:00:00 2001
From: Eduardo Valentin <edubezval@gmail.com>
Date: Mon, 11 May 2015 19:34:23 -0700
Subject: thermal: support slope and offset coefficients

It is common to have a linear extrapolation from
the current sensor readings and the actual temperature
value. This is specially the case when the sensor
is in use to extrapolate hotspots.

This patch adds slope and offset constants for
single sensor linear extrapolation equation. Because
the same sensor can be use in different locations,
from board to board, these constants are added
as part of thermal_zone_params.

The constants are available through sysfs.

It is up to the device driver to determine
the usage of these values.

Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
---
 Documentation/thermal/sysfs-api.txt | 16 ++++++++++++++++
 drivers/thermal/thermal_core.c      |  4 ++++
 include/linux/thermal.h             | 11 +++++++++++
 3 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index 7d44d7f1a71b..c1f6864a8c5d 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -190,6 +190,8 @@ Thermal zone device sys I/F, created once it's registered:
     |---k_i:                    PID's integral term in the power allocator gov
     |---k_d:                    PID's derivative term in the power allocator
     |---integral_cutoff:        Offset above which errors are accumulated
+    |---slope:                  Slope constant applied as linear extrapolation
+    |---offset:                 Offset constant applied as linear extrapolation
 
 Thermal cooling device sys I/F, created once it's registered:
 /sys/class/thermal/cooling_device[0-*]:
@@ -359,6 +361,20 @@ integral_cutoff
 	Documentation/thermal/power_allocator.txt
 	RW, Optional
 
+slope
+	The slope constant used in a linear extrapolation model
+	to determine a hotspot temperature based off the sensor's
+	raw readings. It is up to the device driver to determine
+	the usage of these values.
+	RW, Optional
+
+offset
+	The offset constant used in a linear extrapolation model
+	to determine a hotspot temperature based off the sensor's
+	raw readings. It is up to the device driver to determine
+	the usage of these values.
+	RW, Optional
+
 *****************************
 * Cooling device attributes *
 *****************************
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 962de1847cc0..04659bfb888b 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -944,6 +944,8 @@ create_s32_tzp_attr(k_pu);
 create_s32_tzp_attr(k_i);
 create_s32_tzp_attr(k_d);
 create_s32_tzp_attr(integral_cutoff);
+create_s32_tzp_attr(slope);
+create_s32_tzp_attr(offset);
 #undef create_s32_tzp_attr
 
 static struct device_attribute *dev_tzp_attrs[] = {
@@ -953,6 +955,8 @@ static struct device_attribute *dev_tzp_attrs[] = {
 	&dev_attr_k_i,
 	&dev_attr_k_d,
 	&dev_attr_integral_cutoff,
+	&dev_attr_slope,
+	&dev_attr_offset,
 };
 
 static int create_tzp_attrs(struct device *dev)
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 6bbe11c97cea..037e9df2f610 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -302,6 +302,17 @@ struct thermal_zone_params {
 
 	/* threshold below which the error is no longer accumulated */
 	s32 integral_cutoff;
+
+	/*
+	 * @slope:	slope of a linear temperature adjustment curve.
+	 * 		Used by thermal zone drivers.
+	 */
+	int slope;
+	/*
+	 * @offset:	offset of a linear temperature adjustment curve.
+	 * 		Used by thermal zone drivers (default 0).
+	 */
+	int offset;
 };
 
 struct thermal_genl_event {
-- 
cgit v1.2.3


From 05ae797566a66d159cf1e2ee11bf3f6fae40c8eb Mon Sep 17 00:00:00 2001
From: Andrew Bresticker <abrestic@chromium.org>
Date: Mon, 4 May 2015 10:36:35 -0700
Subject: mailbox: Make mbox_chan_ops const

The mailbox controller's channel ops ought to be read-only.  Update
all the mailbox drivers to make their mbox_chan_ops const as well.

Signed-off-by: Andrew Bresticker <abrestic@chromium.org>
Cc: Ashwin Chaugule <ashwin.chaugule@linaro.org>
Cc: Ley Foon Tan <lftan@altera.com>
Acked-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/arm_mhu.c          | 2 +-
 drivers/mailbox/mailbox-altera.c   | 2 +-
 drivers/mailbox/omap-mailbox.c     | 2 +-
 drivers/mailbox/pcc.c              | 2 +-
 include/linux/mailbox_controller.h | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mailbox/arm_mhu.c b/drivers/mailbox/arm_mhu.c
index ac693c635357..d9e99f981aa9 100644
--- a/drivers/mailbox/arm_mhu.c
+++ b/drivers/mailbox/arm_mhu.c
@@ -110,7 +110,7 @@ static void mhu_shutdown(struct mbox_chan *chan)
 	free_irq(mlink->irq, chan);
 }
 
-static struct mbox_chan_ops mhu_ops = {
+static const struct mbox_chan_ops mhu_ops = {
 	.send_data = mhu_send_data,
 	.startup = mhu_startup,
 	.shutdown = mhu_shutdown,
diff --git a/drivers/mailbox/mailbox-altera.c b/drivers/mailbox/mailbox-altera.c
index a266265677d3..bb682c926b0a 100644
--- a/drivers/mailbox/mailbox-altera.c
+++ b/drivers/mailbox/mailbox-altera.c
@@ -285,7 +285,7 @@ static void altera_mbox_shutdown(struct mbox_chan *chan)
 	}
 }
 
-static struct mbox_chan_ops altera_mbox_ops = {
+static const struct mbox_chan_ops altera_mbox_ops = {
 	.send_data = altera_mbox_send_data,
 	.startup = altera_mbox_startup,
 	.shutdown = altera_mbox_shutdown,
diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index 0f332c178b07..03f8545ba037 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -604,7 +604,7 @@ static int omap_mbox_chan_send_data(struct mbox_chan *chan, void *data)
 	return ret;
 }
 
-static struct mbox_chan_ops omap_mbox_chan_ops = {
+static const struct mbox_chan_ops omap_mbox_chan_ops = {
 	.startup        = omap_mbox_chan_startup,
 	.send_data      = omap_mbox_chan_send_data,
 	.shutdown       = omap_mbox_chan_shutdown,
diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c
index 7e91d68a3ac3..26d121d1d501 100644
--- a/drivers/mailbox/pcc.c
+++ b/drivers/mailbox/pcc.c
@@ -198,7 +198,7 @@ static int pcc_send_data(struct mbox_chan *chan, void *data)
 	return 0;
 }
 
-static struct mbox_chan_ops pcc_chan_ops = {
+static const struct mbox_chan_ops pcc_chan_ops = {
 	.send_data = pcc_send_data,
 };
 
diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h
index d4cf96f07cfc..68c42454439b 100644
--- a/include/linux/mailbox_controller.h
+++ b/include/linux/mailbox_controller.h
@@ -72,7 +72,7 @@ struct mbox_chan_ops {
  */
 struct mbox_controller {
 	struct device *dev;
-	struct mbox_chan_ops *ops;
+	const struct mbox_chan_ops *ops;
 	struct mbox_chan *chans;
 	int num_chans;
 	bool txdone_irq;
-- 
cgit v1.2.3


From 3c4ed7bdf5997d8020cbb8d4abbef2fcfb9f1284 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Sat, 2 May 2015 15:10:46 -0700
Subject: LSM: Split security.h

The security.h header file serves two purposes,
interfaces for users of the security modules and
interfaces for security modules. Users of the
security modules don't need to know about what's
in the security_operations structure, so pull it
out into it's own header, lsm_hooks.h

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/lsm_hooks.h | 358 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/security.h  | 305 ---------------------------------------
 security/apparmor/lsm.c   |   2 +-
 security/capability.c     |   2 +-
 security/security.c       |   2 +-
 security/selinux/hooks.c  |   2 +-
 security/smack/smack.h    |   2 +-
 security/tomoyo/tomoyo.c  |   2 +-
 security/yama/yama_lsm.c  |   2 +-
 9 files changed, 365 insertions(+), 312 deletions(-)
 create mode 100644 include/linux/lsm_hooks.h

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
new file mode 100644
index 000000000000..c60f81b2d18c
--- /dev/null
+++ b/include/linux/lsm_hooks.h
@@ -0,0 +1,358 @@
+/*
+ * Linux Security Module interfaces
+ *
+ * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
+ * Copyright (C) 2001 Greg Kroah-Hartman <greg@kroah.com>
+ * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
+ * Copyright (C) 2001 James Morris <jmorris@intercode.com.au>
+ * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group)
+ * Copyright (C) 2015 Intel Corporation.
+ * Copyright (C) 2015 Casey Schaufler <casey@schaufler-ca.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation; either version 2 of the License, or
+ *	(at your option) any later version.
+ *
+ *	Due to this file being licensed under the GPL there is controversy over
+ *	whether this permits you to write a module that #includes this file
+ *	without placing your module under the GPL.  Please consult a lawyer for
+ *	advice before doing this.
+ *
+ */
+
+#ifndef __LINUX_LSM_HOOKS_H
+#define __LINUX_LSM_HOOKS_H
+
+#include <linux/security.h>
+
+/* Maximum number of letters for an LSM name string */
+#define SECURITY_NAME_MAX	10
+
+#ifdef CONFIG_SECURITY
+
+struct security_operations {
+	char name[SECURITY_NAME_MAX + 1];
+
+	int (*binder_set_context_mgr)(struct task_struct *mgr);
+	int (*binder_transaction)(struct task_struct *from,
+					struct task_struct *to);
+	int (*binder_transfer_binder)(struct task_struct *from,
+					struct task_struct *to);
+	int (*binder_transfer_file)(struct task_struct *from,
+					struct task_struct *to,
+					struct file *file);
+
+	int (*ptrace_access_check)(struct task_struct *child,
+					unsigned int mode);
+	int (*ptrace_traceme)(struct task_struct *parent);
+	int (*capget)(struct task_struct *target, kernel_cap_t *effective,
+			kernel_cap_t *inheritable, kernel_cap_t *permitted);
+	int (*capset)(struct cred *new, const struct cred *old,
+			const kernel_cap_t *effective,
+			const kernel_cap_t *inheritable,
+			const kernel_cap_t *permitted);
+	int (*capable)(const struct cred *cred, struct user_namespace *ns,
+			int cap, int audit);
+	int (*quotactl)(int cmds, int type, int id, struct super_block *sb);
+	int (*quota_on)(struct dentry *dentry);
+	int (*syslog)(int type);
+	int (*settime)(const struct timespec *ts, const struct timezone *tz);
+	int (*vm_enough_memory)(struct mm_struct *mm, long pages);
+
+	int (*bprm_set_creds)(struct linux_binprm *bprm);
+	int (*bprm_check_security)(struct linux_binprm *bprm);
+	int (*bprm_secureexec)(struct linux_binprm *bprm);
+	void (*bprm_committing_creds)(struct linux_binprm *bprm);
+	void (*bprm_committed_creds)(struct linux_binprm *bprm);
+
+	int (*sb_alloc_security)(struct super_block *sb);
+	void (*sb_free_security)(struct super_block *sb);
+	int (*sb_copy_data)(char *orig, char *copy);
+	int (*sb_remount)(struct super_block *sb, void *data);
+	int (*sb_kern_mount)(struct super_block *sb, int flags, void *data);
+	int (*sb_show_options)(struct seq_file *m, struct super_block *sb);
+	int (*sb_statfs)(struct dentry *dentry);
+	int (*sb_mount)(const char *dev_name, struct path *path,
+			const char *type, unsigned long flags, void *data);
+	int (*sb_umount)(struct vfsmount *mnt, int flags);
+	int (*sb_pivotroot)(struct path *old_path, struct path *new_path);
+	int (*sb_set_mnt_opts)(struct super_block *sb,
+				struct security_mnt_opts *opts,
+				unsigned long kern_flags,
+				unsigned long *set_kern_flags);
+	int (*sb_clone_mnt_opts)(const struct super_block *oldsb,
+					struct super_block *newsb);
+	int (*sb_parse_opts_str)(char *options, struct security_mnt_opts *opts);
+	int (*dentry_init_security)(struct dentry *dentry, int mode,
+					struct qstr *name, void **ctx,
+					u32 *ctxlen);
+
+
+#ifdef CONFIG_SECURITY_PATH
+	int (*path_unlink)(struct path *dir, struct dentry *dentry);
+	int (*path_mkdir)(struct path *dir, struct dentry *dentry,
+				umode_t mode);
+	int (*path_rmdir)(struct path *dir, struct dentry *dentry);
+	int (*path_mknod)(struct path *dir, struct dentry *dentry,
+				umode_t mode, unsigned int dev);
+	int (*path_truncate)(struct path *path);
+	int (*path_symlink)(struct path *dir, struct dentry *dentry,
+				const char *old_name);
+	int (*path_link)(struct dentry *old_dentry, struct path *new_dir,
+				struct dentry *new_dentry);
+	int (*path_rename)(struct path *old_dir, struct dentry *old_dentry,
+				struct path *new_dir,
+				struct dentry *new_dentry);
+	int (*path_chmod)(struct path *path, umode_t mode);
+	int (*path_chown)(struct path *path, kuid_t uid, kgid_t gid);
+	int (*path_chroot)(struct path *path);
+#endif
+
+	int (*inode_alloc_security)(struct inode *inode);
+	void (*inode_free_security)(struct inode *inode);
+	int (*inode_init_security)(struct inode *inode, struct inode *dir,
+					const struct qstr *qstr,
+					const char **name, void **value,
+					size_t *len);
+	int (*inode_create)(struct inode *dir, struct dentry *dentry,
+				umode_t mode);
+	int (*inode_link)(struct dentry *old_dentry, struct inode *dir,
+				struct dentry *new_dentry);
+	int (*inode_unlink)(struct inode *dir, struct dentry *dentry);
+	int (*inode_symlink)(struct inode *dir, struct dentry *dentry,
+				const char *old_name);
+	int (*inode_mkdir)(struct inode *dir, struct dentry *dentry,
+				umode_t mode);
+	int (*inode_rmdir)(struct inode *dir, struct dentry *dentry);
+	int (*inode_mknod)(struct inode *dir, struct dentry *dentry,
+				umode_t mode, dev_t dev);
+	int (*inode_rename)(struct inode *old_dir, struct dentry *old_dentry,
+				struct inode *new_dir,
+				struct dentry *new_dentry);
+	int (*inode_readlink)(struct dentry *dentry);
+	int (*inode_follow_link)(struct dentry *dentry, struct nameidata *nd);
+	int (*inode_permission)(struct inode *inode, int mask);
+	int (*inode_setattr)(struct dentry *dentry, struct iattr *attr);
+	int (*inode_getattr)(const struct path *path);
+	int (*inode_setxattr)(struct dentry *dentry, const char *name,
+				const void *value, size_t size, int flags);
+	void (*inode_post_setxattr)(struct dentry *dentry, const char *name,
+					const void *value, size_t size,
+					int flags);
+	int (*inode_getxattr)(struct dentry *dentry, const char *name);
+	int (*inode_listxattr)(struct dentry *dentry);
+	int (*inode_removexattr)(struct dentry *dentry, const char *name);
+	int (*inode_need_killpriv)(struct dentry *dentry);
+	int (*inode_killpriv)(struct dentry *dentry);
+	int (*inode_getsecurity)(const struct inode *inode, const char *name,
+					void **buffer, bool alloc);
+	int (*inode_setsecurity)(struct inode *inode, const char *name,
+					const void *value, size_t size,
+					int flags);
+	int (*inode_listsecurity)(struct inode *inode, char *buffer,
+					size_t buffer_size);
+	void (*inode_getsecid)(const struct inode *inode, u32 *secid);
+
+	int (*file_permission)(struct file *file, int mask);
+	int (*file_alloc_security)(struct file *file);
+	void (*file_free_security)(struct file *file);
+	int (*file_ioctl)(struct file *file, unsigned int cmd,
+				unsigned long arg);
+	int (*mmap_addr)(unsigned long addr);
+	int (*mmap_file)(struct file *file, unsigned long reqprot,
+				unsigned long prot, unsigned long flags);
+	int (*file_mprotect)(struct vm_area_struct *vma, unsigned long reqprot,
+				unsigned long prot);
+	int (*file_lock)(struct file *file, unsigned int cmd);
+	int (*file_fcntl)(struct file *file, unsigned int cmd,
+				unsigned long arg);
+	void (*file_set_fowner)(struct file *file);
+	int (*file_send_sigiotask)(struct task_struct *tsk,
+					struct fown_struct *fown, int sig);
+	int (*file_receive)(struct file *file);
+	int (*file_open)(struct file *file, const struct cred *cred);
+
+	int (*task_create)(unsigned long clone_flags);
+	void (*task_free)(struct task_struct *task);
+	int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp);
+	void (*cred_free)(struct cred *cred);
+	int (*cred_prepare)(struct cred *new, const struct cred *old,
+				gfp_t gfp);
+	void (*cred_transfer)(struct cred *new, const struct cred *old);
+	int (*kernel_act_as)(struct cred *new, u32 secid);
+	int (*kernel_create_files_as)(struct cred *new, struct inode *inode);
+	int (*kernel_fw_from_file)(struct file *file, char *buf, size_t size);
+	int (*kernel_module_request)(char *kmod_name);
+	int (*kernel_module_from_file)(struct file *file);
+	int (*task_fix_setuid)(struct cred *new, const struct cred *old,
+				int flags);
+	int (*task_setpgid)(struct task_struct *p, pid_t pgid);
+	int (*task_getpgid)(struct task_struct *p);
+	int (*task_getsid)(struct task_struct *p);
+	void (*task_getsecid)(struct task_struct *p, u32 *secid);
+	int (*task_setnice)(struct task_struct *p, int nice);
+	int (*task_setioprio)(struct task_struct *p, int ioprio);
+	int (*task_getioprio)(struct task_struct *p);
+	int (*task_setrlimit)(struct task_struct *p, unsigned int resource,
+				struct rlimit *new_rlim);
+	int (*task_setscheduler)(struct task_struct *p);
+	int (*task_getscheduler)(struct task_struct *p);
+	int (*task_movememory)(struct task_struct *p);
+	int (*task_kill)(struct task_struct *p, struct siginfo *info,
+				int sig, u32 secid);
+	int (*task_wait)(struct task_struct *p);
+	int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3,
+				unsigned long arg4, unsigned long arg5);
+	void (*task_to_inode)(struct task_struct *p, struct inode *inode);
+
+	int (*ipc_permission)(struct kern_ipc_perm *ipcp, short flag);
+	void (*ipc_getsecid)(struct kern_ipc_perm *ipcp, u32 *secid);
+
+	int (*msg_msg_alloc_security)(struct msg_msg *msg);
+	void (*msg_msg_free_security)(struct msg_msg *msg);
+
+	int (*msg_queue_alloc_security)(struct msg_queue *msq);
+	void (*msg_queue_free_security)(struct msg_queue *msq);
+	int (*msg_queue_associate)(struct msg_queue *msq, int msqflg);
+	int (*msg_queue_msgctl)(struct msg_queue *msq, int cmd);
+	int (*msg_queue_msgsnd)(struct msg_queue *msq, struct msg_msg *msg,
+				int msqflg);
+	int (*msg_queue_msgrcv)(struct msg_queue *msq, struct msg_msg *msg,
+				struct task_struct *target, long type,
+				int mode);
+
+	int (*shm_alloc_security)(struct shmid_kernel *shp);
+	void (*shm_free_security)(struct shmid_kernel *shp);
+	int (*shm_associate)(struct shmid_kernel *shp, int shmflg);
+	int (*shm_shmctl)(struct shmid_kernel *shp, int cmd);
+	int (*shm_shmat)(struct shmid_kernel *shp, char __user *shmaddr,
+				int shmflg);
+
+	int (*sem_alloc_security)(struct sem_array *sma);
+	void (*sem_free_security)(struct sem_array *sma);
+	int (*sem_associate)(struct sem_array *sma, int semflg);
+	int (*sem_semctl)(struct sem_array *sma, int cmd);
+	int (*sem_semop)(struct sem_array *sma, struct sembuf *sops,
+				unsigned nsops, int alter);
+
+	int (*netlink_send)(struct sock *sk, struct sk_buff *skb);
+
+	void (*d_instantiate)(struct dentry *dentry, struct inode *inode);
+
+	int (*getprocattr)(struct task_struct *p, char *name, char **value);
+	int (*setprocattr)(struct task_struct *p, char *name, void *value,
+				size_t size);
+	int (*ismaclabel)(const char *name);
+	int (*secid_to_secctx)(u32 secid, char **secdata, u32 *seclen);
+	int (*secctx_to_secid)(const char *secdata, u32 seclen, u32 *secid);
+	void (*release_secctx)(char *secdata, u32 seclen);
+
+	int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen);
+	int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen);
+	int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen);
+
+#ifdef CONFIG_SECURITY_NETWORK
+	int (*unix_stream_connect)(struct sock *sock, struct sock *other,
+					struct sock *newsk);
+	int (*unix_may_send)(struct socket *sock, struct socket *other);
+
+	int (*socket_create)(int family, int type, int protocol, int kern);
+	int (*socket_post_create)(struct socket *sock, int family, int type,
+					int protocol, int kern);
+	int (*socket_bind)(struct socket *sock, struct sockaddr *address,
+				int addrlen);
+	int (*socket_connect)(struct socket *sock, struct sockaddr *address,
+				int addrlen);
+	int (*socket_listen)(struct socket *sock, int backlog);
+	int (*socket_accept)(struct socket *sock, struct socket *newsock);
+	int (*socket_sendmsg)(struct socket *sock, struct msghdr *msg,
+				int size);
+	int (*socket_recvmsg)(struct socket *sock, struct msghdr *msg,
+				int size, int flags);
+	int (*socket_getsockname)(struct socket *sock);
+	int (*socket_getpeername)(struct socket *sock);
+	int (*socket_getsockopt)(struct socket *sock, int level, int optname);
+	int (*socket_setsockopt)(struct socket *sock, int level, int optname);
+	int (*socket_shutdown)(struct socket *sock, int how);
+	int (*socket_sock_rcv_skb)(struct sock *sk, struct sk_buff *skb);
+	int (*socket_getpeersec_stream)(struct socket *sock,
+					char __user *optval,
+					int __user *optlen, unsigned len);
+	int (*socket_getpeersec_dgram)(struct socket *sock,
+					struct sk_buff *skb, u32 *secid);
+	int (*sk_alloc_security)(struct sock *sk, int family, gfp_t priority);
+	void (*sk_free_security)(struct sock *sk);
+	void (*sk_clone_security)(const struct sock *sk, struct sock *newsk);
+	void (*sk_getsecid)(struct sock *sk, u32 *secid);
+	void (*sock_graft)(struct sock *sk, struct socket *parent);
+	int (*inet_conn_request)(struct sock *sk, struct sk_buff *skb,
+					struct request_sock *req);
+	void (*inet_csk_clone)(struct sock *newsk,
+				const struct request_sock *req);
+	void (*inet_conn_established)(struct sock *sk, struct sk_buff *skb);
+	int (*secmark_relabel_packet)(u32 secid);
+	void (*secmark_refcount_inc)(void);
+	void (*secmark_refcount_dec)(void);
+	void (*req_classify_flow)(const struct request_sock *req,
+					struct flowi *fl);
+	int (*tun_dev_alloc_security)(void **security);
+	void (*tun_dev_free_security)(void *security);
+	int (*tun_dev_create)(void);
+	int (*tun_dev_attach_queue)(void *security);
+	int (*tun_dev_attach)(struct sock *sk, void *security);
+	int (*tun_dev_open)(void *security);
+#endif	/* CONFIG_SECURITY_NETWORK */
+
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+	int (*xfrm_policy_alloc_security)(struct xfrm_sec_ctx **ctxp,
+					  struct xfrm_user_sec_ctx *sec_ctx,
+						gfp_t gfp);
+	int (*xfrm_policy_clone_security)(struct xfrm_sec_ctx *old_ctx,
+						struct xfrm_sec_ctx **new_ctx);
+	void (*xfrm_policy_free_security)(struct xfrm_sec_ctx *ctx);
+	int (*xfrm_policy_delete_security)(struct xfrm_sec_ctx *ctx);
+	int (*xfrm_state_alloc)(struct xfrm_state *x,
+				struct xfrm_user_sec_ctx *sec_ctx);
+	int (*xfrm_state_alloc_acquire)(struct xfrm_state *x,
+					struct xfrm_sec_ctx *polsec,
+					u32 secid);
+	void (*xfrm_state_free_security)(struct xfrm_state *x);
+	int (*xfrm_state_delete_security)(struct xfrm_state *x);
+	int (*xfrm_policy_lookup)(struct xfrm_sec_ctx *ctx, u32 fl_secid,
+					u8 dir);
+	int (*xfrm_state_pol_flow_match)(struct xfrm_state *x,
+						struct xfrm_policy *xp,
+						const struct flowi *fl);
+	int (*xfrm_decode_session)(struct sk_buff *skb, u32 *secid, int ckall);
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
+
+	/* key management security hooks */
+#ifdef CONFIG_KEYS
+	int (*key_alloc)(struct key *key, const struct cred *cred,
+				unsigned long flags);
+	void (*key_free)(struct key *key);
+	int (*key_permission)(key_ref_t key_ref, const struct cred *cred,
+				unsigned perm);
+	int (*key_getsecurity)(struct key *key, char **_buffer);
+#endif	/* CONFIG_KEYS */
+
+#ifdef CONFIG_AUDIT
+	int (*audit_rule_init)(u32 field, u32 op, char *rulestr,
+				void **lsmrule);
+	int (*audit_rule_known)(struct audit_krule *krule);
+	int (*audit_rule_match)(u32 secid, u32 field, u32 op, void *lsmrule,
+				struct audit_context *actx);
+	void (*audit_rule_free)(void *lsmrule);
+#endif /* CONFIG_AUDIT */
+};
+
+/* prototypes */
+extern int security_module_enable(struct security_operations *ops);
+extern int register_security(struct security_operations *ops);
+extern void __init security_fixup_ops(struct security_operations *ops);
+extern void reset_security_ops(void);
+
+#endif /* CONFIG_SECURITY */
+
+#endif /* ! __LINUX_LSM_HOOKS_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index 18264ea9e314..f3d42c636f27 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -116,8 +116,6 @@ struct seq_file;
 
 extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
 
-void reset_security_ops(void);
-
 #ifdef CONFIG_MMU
 extern unsigned long mmap_min_addr;
 extern unsigned long dac_mmap_min_addr;
@@ -1457,312 +1455,9 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	@ctxlen points to the place to put the length of @ctx.
  * This is the main security structure.
  */
-struct security_operations {
-	char name[SECURITY_NAME_MAX + 1];
-
-	int (*binder_set_context_mgr) (struct task_struct *mgr);
-	int (*binder_transaction) (struct task_struct *from,
-				   struct task_struct *to);
-	int (*binder_transfer_binder) (struct task_struct *from,
-				       struct task_struct *to);
-	int (*binder_transfer_file) (struct task_struct *from,
-				     struct task_struct *to, struct file *file);
-
-	int (*ptrace_access_check) (struct task_struct *child, unsigned int mode);
-	int (*ptrace_traceme) (struct task_struct *parent);
-	int (*capget) (struct task_struct *target,
-		       kernel_cap_t *effective,
-		       kernel_cap_t *inheritable, kernel_cap_t *permitted);
-	int (*capset) (struct cred *new,
-		       const struct cred *old,
-		       const kernel_cap_t *effective,
-		       const kernel_cap_t *inheritable,
-		       const kernel_cap_t *permitted);
-	int (*capable) (const struct cred *cred, struct user_namespace *ns,
-			int cap, int audit);
-	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
-	int (*quota_on) (struct dentry *dentry);
-	int (*syslog) (int type);
-	int (*settime) (const struct timespec *ts, const struct timezone *tz);
-	int (*vm_enough_memory) (struct mm_struct *mm, long pages);
-
-	int (*bprm_set_creds) (struct linux_binprm *bprm);
-	int (*bprm_check_security) (struct linux_binprm *bprm);
-	int (*bprm_secureexec) (struct linux_binprm *bprm);
-	void (*bprm_committing_creds) (struct linux_binprm *bprm);
-	void (*bprm_committed_creds) (struct linux_binprm *bprm);
-
-	int (*sb_alloc_security) (struct super_block *sb);
-	void (*sb_free_security) (struct super_block *sb);
-	int (*sb_copy_data) (char *orig, char *copy);
-	int (*sb_remount) (struct super_block *sb, void *data);
-	int (*sb_kern_mount) (struct super_block *sb, int flags, void *data);
-	int (*sb_show_options) (struct seq_file *m, struct super_block *sb);
-	int (*sb_statfs) (struct dentry *dentry);
-	int (*sb_mount) (const char *dev_name, struct path *path,
-			 const char *type, unsigned long flags, void *data);
-	int (*sb_umount) (struct vfsmount *mnt, int flags);
-	int (*sb_pivotroot) (struct path *old_path,
-			     struct path *new_path);
-	int (*sb_set_mnt_opts) (struct super_block *sb,
-				struct security_mnt_opts *opts,
-				unsigned long kern_flags,
-				unsigned long *set_kern_flags);
-	int (*sb_clone_mnt_opts) (const struct super_block *oldsb,
-				   struct super_block *newsb);
-	int (*sb_parse_opts_str) (char *options, struct security_mnt_opts *opts);
-	int (*dentry_init_security) (struct dentry *dentry, int mode,
-					struct qstr *name, void **ctx,
-					u32 *ctxlen);
-
-
-#ifdef CONFIG_SECURITY_PATH
-	int (*path_unlink) (struct path *dir, struct dentry *dentry);
-	int (*path_mkdir) (struct path *dir, struct dentry *dentry, umode_t mode);
-	int (*path_rmdir) (struct path *dir, struct dentry *dentry);
-	int (*path_mknod) (struct path *dir, struct dentry *dentry, umode_t mode,
-			   unsigned int dev);
-	int (*path_truncate) (struct path *path);
-	int (*path_symlink) (struct path *dir, struct dentry *dentry,
-			     const char *old_name);
-	int (*path_link) (struct dentry *old_dentry, struct path *new_dir,
-			  struct dentry *new_dentry);
-	int (*path_rename) (struct path *old_dir, struct dentry *old_dentry,
-			    struct path *new_dir, struct dentry *new_dentry);
-	int (*path_chmod) (struct path *path, umode_t mode);
-	int (*path_chown) (struct path *path, kuid_t uid, kgid_t gid);
-	int (*path_chroot) (struct path *path);
-#endif
-
-	int (*inode_alloc_security) (struct inode *inode);
-	void (*inode_free_security) (struct inode *inode);
-	int (*inode_init_security) (struct inode *inode, struct inode *dir,
-				    const struct qstr *qstr, const char **name,
-				    void **value, size_t *len);
-	int (*inode_create) (struct inode *dir,
-			     struct dentry *dentry, umode_t mode);
-	int (*inode_link) (struct dentry *old_dentry,
-			   struct inode *dir, struct dentry *new_dentry);
-	int (*inode_unlink) (struct inode *dir, struct dentry *dentry);
-	int (*inode_symlink) (struct inode *dir,
-			      struct dentry *dentry, const char *old_name);
-	int (*inode_mkdir) (struct inode *dir, struct dentry *dentry, umode_t mode);
-	int (*inode_rmdir) (struct inode *dir, struct dentry *dentry);
-	int (*inode_mknod) (struct inode *dir, struct dentry *dentry,
-			    umode_t mode, dev_t dev);
-	int (*inode_rename) (struct inode *old_dir, struct dentry *old_dentry,
-			     struct inode *new_dir, struct dentry *new_dentry);
-	int (*inode_readlink) (struct dentry *dentry);
-	int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd);
-	int (*inode_permission) (struct inode *inode, int mask);
-	int (*inode_setattr)	(struct dentry *dentry, struct iattr *attr);
-	int (*inode_getattr) (const struct path *path);
-	int (*inode_setxattr) (struct dentry *dentry, const char *name,
-			       const void *value, size_t size, int flags);
-	void (*inode_post_setxattr) (struct dentry *dentry, const char *name,
-				     const void *value, size_t size, int flags);
-	int (*inode_getxattr) (struct dentry *dentry, const char *name);
-	int (*inode_listxattr) (struct dentry *dentry);
-	int (*inode_removexattr) (struct dentry *dentry, const char *name);
-	int (*inode_need_killpriv) (struct dentry *dentry);
-	int (*inode_killpriv) (struct dentry *dentry);
-	int (*inode_getsecurity) (const struct inode *inode, const char *name, void **buffer, bool alloc);
-	int (*inode_setsecurity) (struct inode *inode, const char *name, const void *value, size_t size, int flags);
-	int (*inode_listsecurity) (struct inode *inode, char *buffer, size_t buffer_size);
-	void (*inode_getsecid) (const struct inode *inode, u32 *secid);
-
-	int (*file_permission) (struct file *file, int mask);
-	int (*file_alloc_security) (struct file *file);
-	void (*file_free_security) (struct file *file);
-	int (*file_ioctl) (struct file *file, unsigned int cmd,
-			   unsigned long arg);
-	int (*mmap_addr) (unsigned long addr);
-	int (*mmap_file) (struct file *file,
-			  unsigned long reqprot, unsigned long prot,
-			  unsigned long flags);
-	int (*file_mprotect) (struct vm_area_struct *vma,
-			      unsigned long reqprot,
-			      unsigned long prot);
-	int (*file_lock) (struct file *file, unsigned int cmd);
-	int (*file_fcntl) (struct file *file, unsigned int cmd,
-			   unsigned long arg);
-	void (*file_set_fowner) (struct file *file);
-	int (*file_send_sigiotask) (struct task_struct *tsk,
-				    struct fown_struct *fown, int sig);
-	int (*file_receive) (struct file *file);
-	int (*file_open) (struct file *file, const struct cred *cred);
-
-	int (*task_create) (unsigned long clone_flags);
-	void (*task_free) (struct task_struct *task);
-	int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp);
-	void (*cred_free) (struct cred *cred);
-	int (*cred_prepare)(struct cred *new, const struct cred *old,
-			    gfp_t gfp);
-	void (*cred_transfer)(struct cred *new, const struct cred *old);
-	int (*kernel_act_as)(struct cred *new, u32 secid);
-	int (*kernel_create_files_as)(struct cred *new, struct inode *inode);
-	int (*kernel_fw_from_file)(struct file *file, char *buf, size_t size);
-	int (*kernel_module_request)(char *kmod_name);
-	int (*kernel_module_from_file)(struct file *file);
-	int (*task_fix_setuid) (struct cred *new, const struct cred *old,
-				int flags);
-	int (*task_setpgid) (struct task_struct *p, pid_t pgid);
-	int (*task_getpgid) (struct task_struct *p);
-	int (*task_getsid) (struct task_struct *p);
-	void (*task_getsecid) (struct task_struct *p, u32 *secid);
-	int (*task_setnice) (struct task_struct *p, int nice);
-	int (*task_setioprio) (struct task_struct *p, int ioprio);
-	int (*task_getioprio) (struct task_struct *p);
-	int (*task_setrlimit) (struct task_struct *p, unsigned int resource,
-			struct rlimit *new_rlim);
-	int (*task_setscheduler) (struct task_struct *p);
-	int (*task_getscheduler) (struct task_struct *p);
-	int (*task_movememory) (struct task_struct *p);
-	int (*task_kill) (struct task_struct *p,
-			  struct siginfo *info, int sig, u32 secid);
-	int (*task_wait) (struct task_struct *p);
-	int (*task_prctl) (int option, unsigned long arg2,
-			   unsigned long arg3, unsigned long arg4,
-			   unsigned long arg5);
-	void (*task_to_inode) (struct task_struct *p, struct inode *inode);
-
-	int (*ipc_permission) (struct kern_ipc_perm *ipcp, short flag);
-	void (*ipc_getsecid) (struct kern_ipc_perm *ipcp, u32 *secid);
-
-	int (*msg_msg_alloc_security) (struct msg_msg *msg);
-	void (*msg_msg_free_security) (struct msg_msg *msg);
-
-	int (*msg_queue_alloc_security) (struct msg_queue *msq);
-	void (*msg_queue_free_security) (struct msg_queue *msq);
-	int (*msg_queue_associate) (struct msg_queue *msq, int msqflg);
-	int (*msg_queue_msgctl) (struct msg_queue *msq, int cmd);
-	int (*msg_queue_msgsnd) (struct msg_queue *msq,
-				 struct msg_msg *msg, int msqflg);
-	int (*msg_queue_msgrcv) (struct msg_queue *msq,
-				 struct msg_msg *msg,
-				 struct task_struct *target,
-				 long type, int mode);
-
-	int (*shm_alloc_security) (struct shmid_kernel *shp);
-	void (*shm_free_security) (struct shmid_kernel *shp);
-	int (*shm_associate) (struct shmid_kernel *shp, int shmflg);
-	int (*shm_shmctl) (struct shmid_kernel *shp, int cmd);
-	int (*shm_shmat) (struct shmid_kernel *shp,
-			  char __user *shmaddr, int shmflg);
-
-	int (*sem_alloc_security) (struct sem_array *sma);
-	void (*sem_free_security) (struct sem_array *sma);
-	int (*sem_associate) (struct sem_array *sma, int semflg);
-	int (*sem_semctl) (struct sem_array *sma, int cmd);
-	int (*sem_semop) (struct sem_array *sma,
-			  struct sembuf *sops, unsigned nsops, int alter);
-
-	int (*netlink_send) (struct sock *sk, struct sk_buff *skb);
-
-	void (*d_instantiate) (struct dentry *dentry, struct inode *inode);
-
-	int (*getprocattr) (struct task_struct *p, char *name, char **value);
-	int (*setprocattr) (struct task_struct *p, char *name, void *value, size_t size);
-	int (*ismaclabel) (const char *name);
-	int (*secid_to_secctx) (u32 secid, char **secdata, u32 *seclen);
-	int (*secctx_to_secid) (const char *secdata, u32 seclen, u32 *secid);
-	void (*release_secctx) (char *secdata, u32 seclen);
-
-	int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen);
-	int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen);
-	int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen);
-
-#ifdef CONFIG_SECURITY_NETWORK
-	int (*unix_stream_connect) (struct sock *sock, struct sock *other, struct sock *newsk);
-	int (*unix_may_send) (struct socket *sock, struct socket *other);
-
-	int (*socket_create) (int family, int type, int protocol, int kern);
-	int (*socket_post_create) (struct socket *sock, int family,
-				   int type, int protocol, int kern);
-	int (*socket_bind) (struct socket *sock,
-			    struct sockaddr *address, int addrlen);
-	int (*socket_connect) (struct socket *sock,
-			       struct sockaddr *address, int addrlen);
-	int (*socket_listen) (struct socket *sock, int backlog);
-	int (*socket_accept) (struct socket *sock, struct socket *newsock);
-	int (*socket_sendmsg) (struct socket *sock,
-			       struct msghdr *msg, int size);
-	int (*socket_recvmsg) (struct socket *sock,
-			       struct msghdr *msg, int size, int flags);
-	int (*socket_getsockname) (struct socket *sock);
-	int (*socket_getpeername) (struct socket *sock);
-	int (*socket_getsockopt) (struct socket *sock, int level, int optname);
-	int (*socket_setsockopt) (struct socket *sock, int level, int optname);
-	int (*socket_shutdown) (struct socket *sock, int how);
-	int (*socket_sock_rcv_skb) (struct sock *sk, struct sk_buff *skb);
-	int (*socket_getpeersec_stream) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len);
-	int (*socket_getpeersec_dgram) (struct socket *sock, struct sk_buff *skb, u32 *secid);
-	int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority);
-	void (*sk_free_security) (struct sock *sk);
-	void (*sk_clone_security) (const struct sock *sk, struct sock *newsk);
-	void (*sk_getsecid) (struct sock *sk, u32 *secid);
-	void (*sock_graft) (struct sock *sk, struct socket *parent);
-	int (*inet_conn_request) (struct sock *sk, struct sk_buff *skb,
-				  struct request_sock *req);
-	void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req);
-	void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb);
-	int (*secmark_relabel_packet) (u32 secid);
-	void (*secmark_refcount_inc) (void);
-	void (*secmark_refcount_dec) (void);
-	void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl);
-	int (*tun_dev_alloc_security) (void **security);
-	void (*tun_dev_free_security) (void *security);
-	int (*tun_dev_create) (void);
-	int (*tun_dev_attach_queue) (void *security);
-	int (*tun_dev_attach) (struct sock *sk, void *security);
-	int (*tun_dev_open) (void *security);
-#endif	/* CONFIG_SECURITY_NETWORK */
-
-#ifdef CONFIG_SECURITY_NETWORK_XFRM
-	int (*xfrm_policy_alloc_security) (struct xfrm_sec_ctx **ctxp,
-			struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp);
-	int (*xfrm_policy_clone_security) (struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctx);
-	void (*xfrm_policy_free_security) (struct xfrm_sec_ctx *ctx);
-	int (*xfrm_policy_delete_security) (struct xfrm_sec_ctx *ctx);
-	int (*xfrm_state_alloc) (struct xfrm_state *x,
-				 struct xfrm_user_sec_ctx *sec_ctx);
-	int (*xfrm_state_alloc_acquire) (struct xfrm_state *x,
-					 struct xfrm_sec_ctx *polsec,
-					 u32 secid);
-	void (*xfrm_state_free_security) (struct xfrm_state *x);
-	int (*xfrm_state_delete_security) (struct xfrm_state *x);
-	int (*xfrm_policy_lookup) (struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir);
-	int (*xfrm_state_pol_flow_match) (struct xfrm_state *x,
-					  struct xfrm_policy *xp,
-					  const struct flowi *fl);
-	int (*xfrm_decode_session) (struct sk_buff *skb, u32 *secid, int ckall);
-#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
-
-	/* key management security hooks */
-#ifdef CONFIG_KEYS
-	int (*key_alloc) (struct key *key, const struct cred *cred, unsigned long flags);
-	void (*key_free) (struct key *key);
-	int (*key_permission) (key_ref_t key_ref,
-			       const struct cred *cred,
-			       unsigned perm);
-	int (*key_getsecurity)(struct key *key, char **_buffer);
-#endif	/* CONFIG_KEYS */
-
-#ifdef CONFIG_AUDIT
-	int (*audit_rule_init) (u32 field, u32 op, char *rulestr, void **lsmrule);
-	int (*audit_rule_known) (struct audit_krule *krule);
-	int (*audit_rule_match) (u32 secid, u32 field, u32 op, void *lsmrule,
-				 struct audit_context *actx);
-	void (*audit_rule_free) (void *lsmrule);
-#endif /* CONFIG_AUDIT */
-};
 
 /* prototypes */
 extern int security_init(void);
-extern int security_module_enable(struct security_operations *ops);
-extern int register_security(struct security_operations *ops);
-extern void __init security_fixup_ops(struct security_operations *ops);
-
 
 /* Security operations */
 int security_binder_set_context_mgr(struct task_struct *mgr);
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index e5f1561439db..fead41bd0440 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -12,7 +12,7 @@
  * License.
  */
 
-#include <linux/security.h>
+#include <linux/lsm_hooks.h>
 #include <linux/moduleparam.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
diff --git a/security/capability.c b/security/capability.c
index 0d03fcc489a4..513015feffd7 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -10,7 +10,7 @@
  *
  */
 
-#include <linux/security.h>
+#include <linux/lsm_hooks.h>
 
 static int cap_binder_set_context_mgr(struct task_struct *mgr)
 {
diff --git a/security/security.c b/security/security.c
index 8e9b1f4b9b45..9c95fe0c8d75 100644
--- a/security/security.c
+++ b/security/security.c
@@ -16,7 +16,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/security.h>
+#include <linux/lsm_hooks.h>
 #include <linux/integrity.h>
 #include <linux/ima.h>
 #include <linux/evm.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28affba..40e3f7757ec7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -29,7 +29,7 @@
 #include <linux/tracehook.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
-#include <linux/security.h>
+#include <linux/lsm_hooks.h>
 #include <linux/xattr.h>
 #include <linux/capability.h>
 #include <linux/unistd.h>
diff --git a/security/smack/smack.h b/security/smack/smack.h
index 49eada6266ec..262dad8dfbc6 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -15,7 +15,7 @@
 
 #include <linux/capability.h>
 #include <linux/spinlock.h>
-#include <linux/security.h>
+#include <linux/lsm_hooks.h>
 #include <linux/in.h>
 #include <net/netlabel.h>
 #include <linux/list.h>
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 57c88d52ffa5..2f7b46855f48 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -4,7 +4,7 @@
  * Copyright (C) 2005-2011  NTT DATA CORPORATION
  */
 
-#include <linux/security.h>
+#include <linux/lsm_hooks.h>
 #include "common.h"
 
 /**
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 24aae2ae2b30..14557ffa7b4d 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -12,7 +12,7 @@
  *
  */
 
-#include <linux/security.h>
+#include <linux/lsm_hooks.h>
 #include <linux/sysctl.h>
 #include <linux/ptrace.h>
 #include <linux/prctl.h>
-- 
cgit v1.2.3


From fe7bb272ee72b5cc377e02b556d0d718d12bbede Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Sat, 2 May 2015 15:10:53 -0700
Subject: LSM: Add the comment to lsm_hooks.h

Add the large comment describing the content of the
security_operations structure to lsm_hooks.h. This
wasn't done in the previous (1/7) patch because it
would have exceeded the mail list size limits.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/lsm_hooks.h | 1279 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1279 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index c60f81b2d18c..b4c91de510c2 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -31,6 +31,1285 @@
 
 #ifdef CONFIG_SECURITY
 
+/**
+ * struct security_operations - main security structure
+ *
+ * Security module identifier.
+ *
+ * @name:
+ *	A string that acts as a unique identifier for the LSM with max number
+ *	of characters = SECURITY_NAME_MAX.
+ *
+ * Security hooks for program execution operations.
+ *
+ * @bprm_set_creds:
+ *	Save security information in the bprm->security field, typically based
+ *	on information about the bprm->file, for later use by the apply_creds
+ *	hook.  This hook may also optionally check permissions (e.g. for
+ *	transitions between security domains).
+ *	This hook may be called multiple times during a single execve, e.g. for
+ *	interpreters.  The hook can tell whether it has already been called by
+ *	checking to see if @bprm->security is non-NULL.  If so, then the hook
+ *	may decide either to retain the security information saved earlier or
+ *	to replace it.
+ *	@bprm contains the linux_binprm structure.
+ *	Return 0 if the hook is successful and permission is granted.
+ * @bprm_check_security:
+ *	This hook mediates the point when a search for a binary handler will
+ *	begin.  It allows a check the @bprm->security value which is set in the
+ *	preceding set_creds call.  The primary difference from set_creds is
+ *	that the argv list and envp list are reliably available in @bprm.  This
+ *	hook may be called multiple times during a single execve; and in each
+ *	pass set_creds is called first.
+ *	@bprm contains the linux_binprm structure.
+ *	Return 0 if the hook is successful and permission is granted.
+ * @bprm_committing_creds:
+ *	Prepare to install the new security attributes of a process being
+ *	transformed by an execve operation, based on the old credentials
+ *	pointed to by @current->cred and the information set in @bprm->cred by
+ *	the bprm_set_creds hook.  @bprm points to the linux_binprm structure.
+ *	This hook is a good place to perform state changes on the process such
+ *	as closing open file descriptors to which access will no longer be
+ *	granted when the attributes are changed.  This is called immediately
+ *	before commit_creds().
+ * @bprm_committed_creds:
+ *	Tidy up after the installation of the new security attributes of a
+ *	process being transformed by an execve operation.  The new credentials
+ *	have, by this point, been set to @current->cred.  @bprm points to the
+ *	linux_binprm structure.  This hook is a good place to perform state
+ *	changes on the process such as clearing out non-inheritable signal
+ *	state.  This is called immediately after commit_creds().
+ * @bprm_secureexec:
+ *	Return a boolean value (0 or 1) indicating whether a "secure exec"
+ *	is required.  The flag is passed in the auxiliary table
+ *	on the initial stack to the ELF interpreter to indicate whether libc
+ *	should enable secure mode.
+ *	@bprm contains the linux_binprm structure.
+ *
+ * Security hooks for filesystem operations.
+ *
+ * @sb_alloc_security:
+ *	Allocate and attach a security structure to the sb->s_security field.
+ *	The s_security field is initialized to NULL when the structure is
+ *	allocated.
+ *	@sb contains the super_block structure to be modified.
+ *	Return 0 if operation was successful.
+ * @sb_free_security:
+ *	Deallocate and clear the sb->s_security field.
+ *	@sb contains the super_block structure to be modified.
+ * @sb_statfs:
+ *	Check permission before obtaining filesystem statistics for the @mnt
+ *	mountpoint.
+ *	@dentry is a handle on the superblock for the filesystem.
+ *	Return 0 if permission is granted.
+ * @sb_mount:
+ *	Check permission before an object specified by @dev_name is mounted on
+ *	the mount point named by @nd.  For an ordinary mount, @dev_name
+ *	identifies a device if the file system type requires a device.  For a
+ *	remount (@flags & MS_REMOUNT), @dev_name is irrelevant.  For a
+ *	loopback/bind mount (@flags & MS_BIND), @dev_name identifies the
+ *	pathname of the object being mounted.
+ *	@dev_name contains the name for object being mounted.
+ *	@path contains the path for mount point object.
+ *	@type contains the filesystem type.
+ *	@flags contains the mount flags.
+ *	@data contains the filesystem-specific data.
+ *	Return 0 if permission is granted.
+ * @sb_copy_data:
+ *	Allow mount option data to be copied prior to parsing by the filesystem,
+ *	so that the security module can extract security-specific mount
+ *	options cleanly (a filesystem may modify the data e.g. with strsep()).
+ *	This also allows the original mount data to be stripped of security-
+ *	specific options to avoid having to make filesystems aware of them.
+ *	@type the type of filesystem being mounted.
+ *	@orig the original mount data copied from userspace.
+ *	@copy copied data which will be passed to the security module.
+ *	Returns 0 if the copy was successful.
+ * @sb_remount:
+ *	Extracts security system specific mount options and verifies no changes
+ *	are being made to those options.
+ *	@sb superblock being remounted
+ *	@data contains the filesystem-specific data.
+ *	Return 0 if permission is granted.
+ * @sb_umount:
+ *	Check permission before the @mnt file system is unmounted.
+ *	@mnt contains the mounted file system.
+ *	@flags contains the unmount flags, e.g. MNT_FORCE.
+ *	Return 0 if permission is granted.
+ * @sb_pivotroot:
+ *	Check permission before pivoting the root filesystem.
+ *	@old_path contains the path for the new location of the
+ *	current root (put_old).
+ *	@new_path contains the path for the new root (new_root).
+ *	Return 0 if permission is granted.
+ * @sb_set_mnt_opts:
+ *	Set the security relevant mount options used for a superblock
+ *	@sb the superblock to set security mount options for
+ *	@opts binary data structure containing all lsm mount data
+ * @sb_clone_mnt_opts:
+ *	Copy all security options from a given superblock to another
+ *	@oldsb old superblock which contain information to clone
+ *	@newsb new superblock which needs filled in
+ * @sb_parse_opts_str:
+ *	Parse a string of security data filling in the opts structure
+ *	@options string containing all mount options known by the LSM
+ *	@opts binary data structure usable by the LSM
+ * @dentry_init_security:
+ *	Compute a context for a dentry as the inode is not yet available
+ *	since NFSv4 has no label backed by an EA anyway.
+ *	@dentry dentry to use in calculating the context.
+ *	@mode mode used to determine resource type.
+ *	@name name of the last path component used to create file
+ *	@ctx pointer to place the pointer to the resulting context in.
+ *	@ctxlen point to place the length of the resulting context.
+ *
+ *
+ * Security hooks for inode operations.
+ *
+ * @inode_alloc_security:
+ *	Allocate and attach a security structure to @inode->i_security.  The
+ *	i_security field is initialized to NULL when the inode structure is
+ *	allocated.
+ *	@inode contains the inode structure.
+ *	Return 0 if operation was successful.
+ * @inode_free_security:
+ *	@inode contains the inode structure.
+ *	Deallocate the inode security structure and set @inode->i_security to
+ *	NULL.
+ * @inode_init_security:
+ *	Obtain the security attribute name suffix and value to set on a newly
+ *	created inode and set up the incore security field for the new inode.
+ *	This hook is called by the fs code as part of the inode creation
+ *	transaction and provides for atomic labeling of the inode, unlike
+ *	the post_create/mkdir/... hooks called by the VFS.  The hook function
+ *	is expected to allocate the name and value via kmalloc, with the caller
+ *	being responsible for calling kfree after using them.
+ *	If the security module does not use security attributes or does
+ *	not wish to put a security attribute on this particular inode,
+ *	then it should return -EOPNOTSUPP to skip this processing.
+ *	@inode contains the inode structure of the newly created inode.
+ *	@dir contains the inode structure of the parent directory.
+ *	@qstr contains the last path component of the new object
+ *	@name will be set to the allocated name suffix (e.g. selinux).
+ *	@value will be set to the allocated attribute value.
+ *	@len will be set to the length of the value.
+ *	Returns 0 if @name and @value have been successfully set,
+ *		-EOPNOTSUPP if no security attribute is needed, or
+ *		-ENOMEM on memory allocation failure.
+ * @inode_create:
+ *	Check permission to create a regular file.
+ *	@dir contains inode structure of the parent of the new file.
+ *	@dentry contains the dentry structure for the file to be created.
+ *	@mode contains the file mode of the file to be created.
+ *	Return 0 if permission is granted.
+ * @inode_link:
+ *	Check permission before creating a new hard link to a file.
+ *	@old_dentry contains the dentry structure for an existing
+ *	link to the file.
+ *	@dir contains the inode structure of the parent directory
+ *	of the new link.
+ *	@new_dentry contains the dentry structure for the new link.
+ *	Return 0 if permission is granted.
+ * @path_link:
+ *	Check permission before creating a new hard link to a file.
+ *	@old_dentry contains the dentry structure for an existing link
+ *	to the file.
+ *	@new_dir contains the path structure of the parent directory of
+ *	the new link.
+ *	@new_dentry contains the dentry structure for the new link.
+ *	Return 0 if permission is granted.
+ * @inode_unlink:
+ *	Check the permission to remove a hard link to a file.
+ *	@dir contains the inode structure of parent directory of the file.
+ *	@dentry contains the dentry structure for file to be unlinked.
+ *	Return 0 if permission is granted.
+ * @path_unlink:
+ *	Check the permission to remove a hard link to a file.
+ *	@dir contains the path structure of parent directory of the file.
+ *	@dentry contains the dentry structure for file to be unlinked.
+ *	Return 0 if permission is granted.
+ * @inode_symlink:
+ *	Check the permission to create a symbolic link to a file.
+ *	@dir contains the inode structure of parent directory of
+ *	the symbolic link.
+ *	@dentry contains the dentry structure of the symbolic link.
+ *	@old_name contains the pathname of file.
+ *	Return 0 if permission is granted.
+ * @path_symlink:
+ *	Check the permission to create a symbolic link to a file.
+ *	@dir contains the path structure of parent directory of
+ *	the symbolic link.
+ *	@dentry contains the dentry structure of the symbolic link.
+ *	@old_name contains the pathname of file.
+ *	Return 0 if permission is granted.
+ * @inode_mkdir:
+ *	Check permissions to create a new directory in the existing directory
+ *	associated with inode structure @dir.
+ *	@dir contains the inode structure of parent of the directory
+ *	to be created.
+ *	@dentry contains the dentry structure of new directory.
+ *	@mode contains the mode of new directory.
+ *	Return 0 if permission is granted.
+ * @path_mkdir:
+ *	Check permissions to create a new directory in the existing directory
+ *	associated with path structure @path.
+ *	@dir contains the path structure of parent of the directory
+ *	to be created.
+ *	@dentry contains the dentry structure of new directory.
+ *	@mode contains the mode of new directory.
+ *	Return 0 if permission is granted.
+ * @inode_rmdir:
+ *	Check the permission to remove a directory.
+ *	@dir contains the inode structure of parent of the directory
+ *	to be removed.
+ *	@dentry contains the dentry structure of directory to be removed.
+ *	Return 0 if permission is granted.
+ * @path_rmdir:
+ *	Check the permission to remove a directory.
+ *	@dir contains the path structure of parent of the directory to be
+ *	removed.
+ *	@dentry contains the dentry structure of directory to be removed.
+ *	Return 0 if permission is granted.
+ * @inode_mknod:
+ *	Check permissions when creating a special file (or a socket or a fifo
+ *	file created via the mknod system call).  Note that if mknod operation
+ *	is being done for a regular file, then the create hook will be called
+ *	and not this hook.
+ *	@dir contains the inode structure of parent of the new file.
+ *	@dentry contains the dentry structure of the new file.
+ *	@mode contains the mode of the new file.
+ *	@dev contains the device number.
+ *	Return 0 if permission is granted.
+ * @path_mknod:
+ *	Check permissions when creating a file. Note that this hook is called
+ *	even if mknod operation is being done for a regular file.
+ *	@dir contains the path structure of parent of the new file.
+ *	@dentry contains the dentry structure of the new file.
+ *	@mode contains the mode of the new file.
+ *	@dev contains the undecoded device number. Use new_decode_dev() to get
+ *	the decoded device number.
+ *	Return 0 if permission is granted.
+ * @inode_rename:
+ *	Check for permission to rename a file or directory.
+ *	@old_dir contains the inode structure for parent of the old link.
+ *	@old_dentry contains the dentry structure of the old link.
+ *	@new_dir contains the inode structure for parent of the new link.
+ *	@new_dentry contains the dentry structure of the new link.
+ *	Return 0 if permission is granted.
+ * @path_rename:
+ *	Check for permission to rename a file or directory.
+ *	@old_dir contains the path structure for parent of the old link.
+ *	@old_dentry contains the dentry structure of the old link.
+ *	@new_dir contains the path structure for parent of the new link.
+ *	@new_dentry contains the dentry structure of the new link.
+ *	Return 0 if permission is granted.
+ * @path_chmod:
+ *	Check for permission to change DAC's permission of a file or directory.
+ *	@dentry contains the dentry structure.
+ *	@mnt contains the vfsmnt structure.
+ *	@mode contains DAC's mode.
+ *	Return 0 if permission is granted.
+ * @path_chown:
+ *	Check for permission to change owner/group of a file or directory.
+ *	@path contains the path structure.
+ *	@uid contains new owner's ID.
+ *	@gid contains new group's ID.
+ *	Return 0 if permission is granted.
+ * @path_chroot:
+ *	Check for permission to change root directory.
+ *	@path contains the path structure.
+ *	Return 0 if permission is granted.
+ * @inode_readlink:
+ *	Check the permission to read the symbolic link.
+ *	@dentry contains the dentry structure for the file link.
+ *	Return 0 if permission is granted.
+ * @inode_follow_link:
+ *	Check permission to follow a symbolic link when looking up a pathname.
+ *	@dentry contains the dentry structure for the link.
+ *	@nd contains the nameidata structure for the parent directory.
+ *	Return 0 if permission is granted.
+ * @inode_permission:
+ *	Check permission before accessing an inode.  This hook is called by the
+ *	existing Linux permission function, so a security module can use it to
+ *	provide additional checking for existing Linux permission checks.
+ *	Notice that this hook is called when a file is opened (as well as many
+ *	other operations), whereas the file_security_ops permission hook is
+ *	called when the actual read/write operations are performed.
+ *	@inode contains the inode structure to check.
+ *	@mask contains the permission mask.
+ *	Return 0 if permission is granted.
+ * @inode_setattr:
+ *	Check permission before setting file attributes.  Note that the kernel
+ *	call to notify_change is performed from several locations, whenever
+ *	file attributes change (such as when a file is truncated, chown/chmod
+ *	operations, transferring disk quotas, etc).
+ *	@dentry contains the dentry structure for the file.
+ *	@attr is the iattr structure containing the new file attributes.
+ *	Return 0 if permission is granted.
+ * @path_truncate:
+ *	Check permission before truncating a file.
+ *	@path contains the path structure for the file.
+ *	Return 0 if permission is granted.
+ * @inode_getattr:
+ *	Check permission before obtaining file attributes.
+ *	@mnt is the vfsmount where the dentry was looked up
+ *	@dentry contains the dentry structure for the file.
+ *	Return 0 if permission is granted.
+ * @inode_setxattr:
+ *	Check permission before setting the extended attributes
+ *	@value identified by @name for @dentry.
+ *	Return 0 if permission is granted.
+ * @inode_post_setxattr:
+ *	Update inode security field after successful setxattr operation.
+ *	@value identified by @name for @dentry.
+ * @inode_getxattr:
+ *	Check permission before obtaining the extended attributes
+ *	identified by @name for @dentry.
+ *	Return 0 if permission is granted.
+ * @inode_listxattr:
+ *	Check permission before obtaining the list of extended attribute
+ *	names for @dentry.
+ *	Return 0 if permission is granted.
+ * @inode_removexattr:
+ *	Check permission before removing the extended attribute
+ *	identified by @name for @dentry.
+ *	Return 0 if permission is granted.
+ * @inode_getsecurity:
+ *	Retrieve a copy of the extended attribute representation of the
+ *	security label associated with @name for @inode via @buffer.  Note that
+ *	@name is the remainder of the attribute name after the security prefix
+ *	has been removed. @alloc is used to specify of the call should return a
+ *	value via the buffer or just the value length Return size of buffer on
+ *	success.
+ * @inode_setsecurity:
+ *	Set the security label associated with @name for @inode from the
+ *	extended attribute value @value.  @size indicates the size of the
+ *	@value in bytes.  @flags may be XATTR_CREATE, XATTR_REPLACE, or 0.
+ *	Note that @name is the remainder of the attribute name after the
+ *	security. prefix has been removed.
+ *	Return 0 on success.
+ * @inode_listsecurity:
+ *	Copy the extended attribute names for the security labels
+ *	associated with @inode into @buffer.  The maximum size of @buffer
+ *	is specified by @buffer_size.  @buffer may be NULL to request
+ *	the size of the buffer required.
+ *	Returns number of bytes used/required on success.
+ * @inode_need_killpriv:
+ *	Called when an inode has been changed.
+ *	@dentry is the dentry being changed.
+ *	Return <0 on error to abort the inode change operation.
+ *	Return 0 if inode_killpriv does not need to be called.
+ *	Return >0 if inode_killpriv does need to be called.
+ * @inode_killpriv:
+ *	The setuid bit is being removed.  Remove similar security labels.
+ *	Called with the dentry->d_inode->i_mutex held.
+ *	@dentry is the dentry being changed.
+ *	Return 0 on success.  If error is returned, then the operation
+ *	causing setuid bit removal is failed.
+ * @inode_getsecid:
+ *	Get the secid associated with the node.
+ *	@inode contains a pointer to the inode.
+ *	@secid contains a pointer to the location where result will be saved.
+ *	In case of failure, @secid will be set to zero.
+ *
+ * Security hooks for file operations
+ *
+ * @file_permission:
+ *	Check file permissions before accessing an open file.  This hook is
+ *	called by various operations that read or write files.  A security
+ *	module can use this hook to perform additional checking on these
+ *	operations, e.g.  to revalidate permissions on use to support privilege
+ *	bracketing or policy changes.  Notice that this hook is used when the
+ *	actual read/write operations are performed, whereas the
+ *	inode_security_ops hook is called when a file is opened (as well as
+ *	many other operations).
+ *	Caveat:  Although this hook can be used to revalidate permissions for
+ *	various system call operations that read or write files, it does not
+ *	address the revalidation of permissions for memory-mapped files.
+ *	Security modules must handle this separately if they need such
+ *	revalidation.
+ *	@file contains the file structure being accessed.
+ *	@mask contains the requested permissions.
+ *	Return 0 if permission is granted.
+ * @file_alloc_security:
+ *	Allocate and attach a security structure to the file->f_security field.
+ *	The security field is initialized to NULL when the structure is first
+ *	created.
+ *	@file contains the file structure to secure.
+ *	Return 0 if the hook is successful and permission is granted.
+ * @file_free_security:
+ *	Deallocate and free any security structures stored in file->f_security.
+ *	@file contains the file structure being modified.
+ * @file_ioctl:
+ *	@file contains the file structure.
+ *	@cmd contains the operation to perform.
+ *	@arg contains the operational arguments.
+ *	Check permission for an ioctl operation on @file.  Note that @arg
+ *	sometimes represents a user space pointer; in other cases, it may be a
+ *	simple integer value.  When @arg represents a user space pointer, it
+ *	should never be used by the security module.
+ *	Return 0 if permission is granted.
+ * @mmap_addr :
+ *	Check permissions for a mmap operation at @addr.
+ *	@addr contains virtual address that will be used for the operation.
+ *	Return 0 if permission is granted.
+ * @mmap_file :
+ *	Check permissions for a mmap operation.  The @file may be NULL, e.g.
+ *	if mapping anonymous memory.
+ *	@file contains the file structure for file to map (may be NULL).
+ *	@reqprot contains the protection requested by the application.
+ *	@prot contains the protection that will be applied by the kernel.
+ *	@flags contains the operational flags.
+ *	Return 0 if permission is granted.
+ * @file_mprotect:
+ *	Check permissions before changing memory access permissions.
+ *	@vma contains the memory region to modify.
+ *	@reqprot contains the protection requested by the application.
+ *	@prot contains the protection that will be applied by the kernel.
+ *	Return 0 if permission is granted.
+ * @file_lock:
+ *	Check permission before performing file locking operations.
+ *	Note: this hook mediates both flock and fcntl style locks.
+ *	@file contains the file structure.
+ *	@cmd contains the posix-translated lock operation to perform
+ *	(e.g. F_RDLCK, F_WRLCK).
+ *	Return 0 if permission is granted.
+ * @file_fcntl:
+ *	Check permission before allowing the file operation specified by @cmd
+ *	from being performed on the file @file.  Note that @arg sometimes
+ *	represents a user space pointer; in other cases, it may be a simple
+ *	integer value.  When @arg represents a user space pointer, it should
+ *	never be used by the security module.
+ *	@file contains the file structure.
+ *	@cmd contains the operation to be performed.
+ *	@arg contains the operational arguments.
+ *	Return 0 if permission is granted.
+ * @file_set_fowner:
+ *	Save owner security information (typically from current->security) in
+ *	file->f_security for later use by the send_sigiotask hook.
+ *	@file contains the file structure to update.
+ *	Return 0 on success.
+ * @file_send_sigiotask:
+ *	Check permission for the file owner @fown to send SIGIO or SIGURG to the
+ *	process @tsk.  Note that this hook is sometimes called from interrupt.
+ *	Note that the fown_struct, @fown, is never outside the context of a
+ *	struct file, so the file structure (and associated security information)
+ *	can always be obtained:
+ *		container_of(fown, struct file, f_owner)
+ *	@tsk contains the structure of task receiving signal.
+ *	@fown contains the file owner information.
+ *	@sig is the signal that will be sent.  When 0, kernel sends SIGIO.
+ *	Return 0 if permission is granted.
+ * @file_receive:
+ *	This hook allows security modules to control the ability of a process
+ *	to receive an open file descriptor via socket IPC.
+ *	@file contains the file structure being received.
+ *	Return 0 if permission is granted.
+ * @file_open
+ *	Save open-time permission checking state for later use upon
+ *	file_permission, and recheck access if anything has changed
+ *	since inode_permission.
+ *
+ * Security hooks for task operations.
+ *
+ * @task_create:
+ *	Check permission before creating a child process.  See the clone(2)
+ *	manual page for definitions of the @clone_flags.
+ *	@clone_flags contains the flags indicating what should be shared.
+ *	Return 0 if permission is granted.
+ * @task_free:
+ *	@task task being freed
+ *	Handle release of task-related resources. (Note that this can be called
+ *	from interrupt context.)
+ * @cred_alloc_blank:
+ *	@cred points to the credentials.
+ *	@gfp indicates the atomicity of any memory allocations.
+ *	Only allocate sufficient memory and attach to @cred such that
+ *	cred_transfer() will not get ENOMEM.
+ * @cred_free:
+ *	@cred points to the credentials.
+ *	Deallocate and clear the cred->security field in a set of credentials.
+ * @cred_prepare:
+ *	@new points to the new credentials.
+ *	@old points to the original credentials.
+ *	@gfp indicates the atomicity of any memory allocations.
+ *	Prepare a new set of credentials by copying the data from the old set.
+ * @cred_transfer:
+ *	@new points to the new credentials.
+ *	@old points to the original credentials.
+ *	Transfer data from original creds to new creds
+ * @kernel_act_as:
+ *	Set the credentials for a kernel service to act as (subjective context).
+ *	@new points to the credentials to be modified.
+ *	@secid specifies the security ID to be set
+ *	The current task must be the one that nominated @secid.
+ *	Return 0 if successful.
+ * @kernel_create_files_as:
+ *	Set the file creation context in a set of credentials to be the same as
+ *	the objective context of the specified inode.
+ *	@new points to the credentials to be modified.
+ *	@inode points to the inode to use as a reference.
+ *	The current task must be the one that nominated @inode.
+ *	Return 0 if successful.
+ * @kernel_fw_from_file:
+ *	Load firmware from userspace (not called for built-in firmware).
+ *	@file contains the file structure pointing to the file containing
+ *	the firmware to load. This argument will be NULL if the firmware
+ *	was loaded via the uevent-triggered blob-based interface exposed
+ *	by CONFIG_FW_LOADER_USER_HELPER.
+ *	@buf pointer to buffer containing firmware contents.
+ *	@size length of the firmware contents.
+ *	Return 0 if permission is granted.
+ * @kernel_module_request:
+ *	Ability to trigger the kernel to automatically upcall to userspace for
+ *	userspace to load a kernel module with the given name.
+ *	@kmod_name name of the module requested by the kernel
+ *	Return 0 if successful.
+ * @kernel_module_from_file:
+ *	Load a kernel module from userspace.
+ *	@file contains the file structure pointing to the file containing
+ *	the kernel module to load. If the module is being loaded from a blob,
+ *	this argument will be NULL.
+ *	Return 0 if permission is granted.
+ * @task_fix_setuid:
+ *	Update the module's state after setting one or more of the user
+ *	identity attributes of the current process.  The @flags parameter
+ *	indicates which of the set*uid system calls invoked this hook.  If
+ *	@new is the set of credentials that will be installed.  Modifications
+ *	should be made to this rather than to @current->cred.
+ *	@old is the set of credentials that are being replaces
+ *	@flags contains one of the LSM_SETID_* values.
+ *	Return 0 on success.
+ * @task_setpgid:
+ *	Check permission before setting the process group identifier of the
+ *	process @p to @pgid.
+ *	@p contains the task_struct for process being modified.
+ *	@pgid contains the new pgid.
+ *	Return 0 if permission is granted.
+ * @task_getpgid:
+ *	Check permission before getting the process group identifier of the
+ *	process @p.
+ *	@p contains the task_struct for the process.
+ *	Return 0 if permission is granted.
+ * @task_getsid:
+ *	Check permission before getting the session identifier of the process
+ *	@p.
+ *	@p contains the task_struct for the process.
+ *	Return 0 if permission is granted.
+ * @task_getsecid:
+ *	Retrieve the security identifier of the process @p.
+ *	@p contains the task_struct for the process and place is into @secid.
+ *	In case of failure, @secid will be set to zero.
+ *
+ * @task_setnice:
+ *	Check permission before setting the nice value of @p to @nice.
+ *	@p contains the task_struct of process.
+ *	@nice contains the new nice value.
+ *	Return 0 if permission is granted.
+ * @task_setioprio
+ *	Check permission before setting the ioprio value of @p to @ioprio.
+ *	@p contains the task_struct of process.
+ *	@ioprio contains the new ioprio value
+ *	Return 0 if permission is granted.
+ * @task_getioprio
+ *	Check permission before getting the ioprio value of @p.
+ *	@p contains the task_struct of process.
+ *	Return 0 if permission is granted.
+ * @task_setrlimit:
+ *	Check permission before setting the resource limits of the current
+ *	process for @resource to @new_rlim.  The old resource limit values can
+ *	be examined by dereferencing (current->signal->rlim + resource).
+ *	@resource contains the resource whose limit is being set.
+ *	@new_rlim contains the new limits for @resource.
+ *	Return 0 if permission is granted.
+ * @task_setscheduler:
+ *	Check permission before setting scheduling policy and/or parameters of
+ *	process @p based on @policy and @lp.
+ *	@p contains the task_struct for process.
+ *	@policy contains the scheduling policy.
+ *	@lp contains the scheduling parameters.
+ *	Return 0 if permission is granted.
+ * @task_getscheduler:
+ *	Check permission before obtaining scheduling information for process
+ *	@p.
+ *	@p contains the task_struct for process.
+ *	Return 0 if permission is granted.
+ * @task_movememory
+ *	Check permission before moving memory owned by process @p.
+ *	@p contains the task_struct for process.
+ *	Return 0 if permission is granted.
+ * @task_kill:
+ *	Check permission before sending signal @sig to @p.  @info can be NULL,
+ *	the constant 1, or a pointer to a siginfo structure.  If @info is 1 or
+ *	SI_FROMKERNEL(info) is true, then the signal should be viewed as coming
+ *	from the kernel and should typically be permitted.
+ *	SIGIO signals are handled separately by the send_sigiotask hook in
+ *	file_security_ops.
+ *	@p contains the task_struct for process.
+ *	@info contains the signal information.
+ *	@sig contains the signal value.
+ *	@secid contains the sid of the process where the signal originated
+ *	Return 0 if permission is granted.
+ * @task_wait:
+ *	Check permission before allowing a process to reap a child process @p
+ *	and collect its status information.
+ *	@p contains the task_struct for process.
+ *	Return 0 if permission is granted.
+ * @task_prctl:
+ *	Check permission before performing a process control operation on the
+ *	current process.
+ *	@option contains the operation.
+ *	@arg2 contains a argument.
+ *	@arg3 contains a argument.
+ *	@arg4 contains a argument.
+ *	@arg5 contains a argument.
+ *	Return -ENOSYS if no-one wanted to handle this op, any other value to
+ *	cause prctl() to return immediately with that value.
+ * @task_to_inode:
+ *	Set the security attributes for an inode based on an associated task's
+ *	security attributes, e.g. for /proc/pid inodes.
+ *	@p contains the task_struct for the task.
+ *	@inode contains the inode structure for the inode.
+ *
+ * Security hooks for Netlink messaging.
+ *
+ * @netlink_send:
+ *	Save security information for a netlink message so that permission
+ *	checking can be performed when the message is processed.  The security
+ *	information can be saved using the eff_cap field of the
+ *	netlink_skb_parms structure.  Also may be used to provide fine
+ *	grained control over message transmission.
+ *	@sk associated sock of task sending the message.
+ *	@skb contains the sk_buff structure for the netlink message.
+ *	Return 0 if the information was successfully saved and message
+ *	is allowed to be transmitted.
+ *
+ * Security hooks for Unix domain networking.
+ *
+ * @unix_stream_connect:
+ *	Check permissions before establishing a Unix domain stream connection
+ *	between @sock and @other.
+ *	@sock contains the sock structure.
+ *	@other contains the peer sock structure.
+ *	@newsk contains the new sock structure.
+ *	Return 0 if permission is granted.
+ * @unix_may_send:
+ *	Check permissions before connecting or sending datagrams from @sock to
+ *	@other.
+ *	@sock contains the socket structure.
+ *	@other contains the peer socket structure.
+ *	Return 0 if permission is granted.
+ *
+ * The @unix_stream_connect and @unix_may_send hooks were necessary because
+ * Linux provides an alternative to the conventional file name space for Unix
+ * domain sockets.  Whereas binding and connecting to sockets in the file name
+ * space is mediated by the typical file permissions (and caught by the mknod
+ * and permission hooks in inode_security_ops), binding and connecting to
+ * sockets in the abstract name space is completely unmediated.  Sufficient
+ * control of Unix domain sockets in the abstract name space isn't possible
+ * using only the socket layer hooks, since we need to know the actual target
+ * socket, which is not looked up until we are inside the af_unix code.
+ *
+ * Security hooks for socket operations.
+ *
+ * @socket_create:
+ *	Check permissions prior to creating a new socket.
+ *	@family contains the requested protocol family.
+ *	@type contains the requested communications type.
+ *	@protocol contains the requested protocol.
+ *	@kern set to 1 if a kernel socket.
+ *	Return 0 if permission is granted.
+ * @socket_post_create:
+ *	This hook allows a module to update or allocate a per-socket security
+ *	structure. Note that the security field was not added directly to the
+ *	socket structure, but rather, the socket security information is stored
+ *	in the associated inode.  Typically, the inode alloc_security hook will
+ *	allocate and and attach security information to
+ *	sock->inode->i_security.  This hook may be used to update the
+ *	sock->inode->i_security field with additional information that wasn't
+ *	available when the inode was allocated.
+ *	@sock contains the newly created socket structure.
+ *	@family contains the requested protocol family.
+ *	@type contains the requested communications type.
+ *	@protocol contains the requested protocol.
+ *	@kern set to 1 if a kernel socket.
+ * @socket_bind:
+ *	Check permission before socket protocol layer bind operation is
+ *	performed and the socket @sock is bound to the address specified in the
+ *	@address parameter.
+ *	@sock contains the socket structure.
+ *	@address contains the address to bind to.
+ *	@addrlen contains the length of address.
+ *	Return 0 if permission is granted.
+ * @socket_connect:
+ *	Check permission before socket protocol layer connect operation
+ *	attempts to connect socket @sock to a remote address, @address.
+ *	@sock contains the socket structure.
+ *	@address contains the address of remote endpoint.
+ *	@addrlen contains the length of address.
+ *	Return 0 if permission is granted.
+ * @socket_listen:
+ *	Check permission before socket protocol layer listen operation.
+ *	@sock contains the socket structure.
+ *	@backlog contains the maximum length for the pending connection queue.
+ *	Return 0 if permission is granted.
+ * @socket_accept:
+ *	Check permission before accepting a new connection.  Note that the new
+ *	socket, @newsock, has been created and some information copied to it,
+ *	but the accept operation has not actually been performed.
+ *	@sock contains the listening socket structure.
+ *	@newsock contains the newly created server socket for connection.
+ *	Return 0 if permission is granted.
+ * @socket_sendmsg:
+ *	Check permission before transmitting a message to another socket.
+ *	@sock contains the socket structure.
+ *	@msg contains the message to be transmitted.
+ *	@size contains the size of message.
+ *	Return 0 if permission is granted.
+ * @socket_recvmsg:
+ *	Check permission before receiving a message from a socket.
+ *	@sock contains the socket structure.
+ *	@msg contains the message structure.
+ *	@size contains the size of message structure.
+ *	@flags contains the operational flags.
+ *	Return 0 if permission is granted.
+ * @socket_getsockname:
+ *	Check permission before the local address (name) of the socket object
+ *	@sock is retrieved.
+ *	@sock contains the socket structure.
+ *	Return 0 if permission is granted.
+ * @socket_getpeername:
+ *	Check permission before the remote address (name) of a socket object
+ *	@sock is retrieved.
+ *	@sock contains the socket structure.
+ *	Return 0 if permission is granted.
+ * @socket_getsockopt:
+ *	Check permissions before retrieving the options associated with socket
+ *	@sock.
+ *	@sock contains the socket structure.
+ *	@level contains the protocol level to retrieve option from.
+ *	@optname contains the name of option to retrieve.
+ *	Return 0 if permission is granted.
+ * @socket_setsockopt:
+ *	Check permissions before setting the options associated with socket
+ *	@sock.
+ *	@sock contains the socket structure.
+ *	@level contains the protocol level to set options for.
+ *	@optname contains the name of the option to set.
+ *	Return 0 if permission is granted.
+ * @socket_shutdown:
+ *	Checks permission before all or part of a connection on the socket
+ *	@sock is shut down.
+ *	@sock contains the socket structure.
+ *	@how contains the flag indicating how future sends and receives
+ *	are handled.
+ *	Return 0 if permission is granted.
+ * @socket_sock_rcv_skb:
+ *	Check permissions on incoming network packets.  This hook is distinct
+ *	from Netfilter's IP input hooks since it is the first time that the
+ *	incoming sk_buff @skb has been associated with a particular socket, @sk.
+ *	Must not sleep inside this hook because some callers hold spinlocks.
+ *	@sk contains the sock (not socket) associated with the incoming sk_buff.
+ *	@skb contains the incoming network data.
+ * @socket_getpeersec_stream:
+ *	This hook allows the security module to provide peer socket security
+ *	state for unix or connected tcp sockets to userspace via getsockopt
+ *	SO_GETPEERSEC.  For tcp sockets this can be meaningful if the
+ *	socket is associated with an ipsec SA.
+ *	@sock is the local socket.
+ *	@optval userspace memory where the security state is to be copied.
+ *	@optlen userspace int where the module should copy the actual length
+ *	of the security state.
+ *	@len as input is the maximum length to copy to userspace provided
+ *	by the caller.
+ *	Return 0 if all is well, otherwise, typical getsockopt return
+ *	values.
+ * @socket_getpeersec_dgram:
+ *	This hook allows the security module to provide peer socket security
+ *	state for udp sockets on a per-packet basis to userspace via
+ *	getsockopt SO_GETPEERSEC.  The application must first have indicated
+ *	the IP_PASSSEC option via getsockopt.  It can then retrieve the
+ *	security state returned by this hook for a packet via the SCM_SECURITY
+ *	ancillary message type.
+ *	@skb is the skbuff for the packet being queried
+ *	@secdata is a pointer to a buffer in which to copy the security data
+ *	@seclen is the maximum length for @secdata
+ *	Return 0 on success, error on failure.
+ * @sk_alloc_security:
+ *	Allocate and attach a security structure to the sk->sk_security field,
+ *	which is used to copy security attributes between local stream sockets.
+ * @sk_free_security:
+ *	Deallocate security structure.
+ * @sk_clone_security:
+ *	Clone/copy security structure.
+ * @sk_getsecid:
+ *	Retrieve the LSM-specific secid for the sock to enable caching
+ *	of network authorizations.
+ * @sock_graft:
+ *	Sets the socket's isec sid to the sock's sid.
+ * @inet_conn_request:
+ *	Sets the openreq's sid to socket's sid with MLS portion taken
+ *	from peer sid.
+ * @inet_csk_clone:
+ *	Sets the new child socket's sid to the openreq sid.
+ * @inet_conn_established:
+ *	Sets the connection's peersid to the secmark on skb.
+ * @secmark_relabel_packet:
+ *	check if the process should be allowed to relabel packets to
+ *	the given secid
+ * @security_secmark_refcount_inc
+ *	tells the LSM to increment the number of secmark labeling rules loaded
+ * @security_secmark_refcount_dec
+ *	tells the LSM to decrement the number of secmark labeling rules loaded
+ * @req_classify_flow:
+ *	Sets the flow's sid to the openreq sid.
+ * @tun_dev_alloc_security:
+ *	This hook allows a module to allocate a security structure for a TUN
+ *	device.
+ *	@security pointer to a security structure pointer.
+ *	Returns a zero on success, negative values on failure.
+ * @tun_dev_free_security:
+ *	This hook allows a module to free the security structure for a TUN
+ *	device.
+ *	@security pointer to the TUN device's security structure
+ * @tun_dev_create:
+ *	Check permissions prior to creating a new TUN device.
+ * @tun_dev_attach_queue:
+ *	Check permissions prior to attaching to a TUN device queue.
+ *	@security pointer to the TUN device's security structure.
+ * @tun_dev_attach:
+ *	This hook can be used by the module to update any security state
+ *	associated with the TUN device's sock structure.
+ *	@sk contains the existing sock structure.
+ *	@security pointer to the TUN device's security structure.
+ * @tun_dev_open:
+ *	This hook can be used by the module to update any security state
+ *	associated with the TUN device's security structure.
+ *	@security pointer to the TUN devices's security structure.
+ *
+ * Security hooks for XFRM operations.
+ *
+ * @xfrm_policy_alloc_security:
+ *	@ctxp is a pointer to the xfrm_sec_ctx being added to Security Policy
+ *	Database used by the XFRM system.
+ *	@sec_ctx contains the security context information being provided by
+ *	the user-level policy update program (e.g., setkey).
+ *	Allocate a security structure to the xp->security field; the security
+ *	field is initialized to NULL when the xfrm_policy is allocated.
+ *	Return 0 if operation was successful (memory to allocate, legal context)
+ *	@gfp is to specify the context for the allocation
+ * @xfrm_policy_clone_security:
+ *	@old_ctx contains an existing xfrm_sec_ctx.
+ *	@new_ctxp contains a new xfrm_sec_ctx being cloned from old.
+ *	Allocate a security structure in new_ctxp that contains the
+ *	information from the old_ctx structure.
+ *	Return 0 if operation was successful (memory to allocate).
+ * @xfrm_policy_free_security:
+ *	@ctx contains the xfrm_sec_ctx
+ *	Deallocate xp->security.
+ * @xfrm_policy_delete_security:
+ *	@ctx contains the xfrm_sec_ctx.
+ *	Authorize deletion of xp->security.
+ * @xfrm_state_alloc:
+ *	@x contains the xfrm_state being added to the Security Association
+ *	Database by the XFRM system.
+ *	@sec_ctx contains the security context information being provided by
+ *	the user-level SA generation program (e.g., setkey or racoon).
+ *	Allocate a security structure to the x->security field; the security
+ *	field is initialized to NULL when the xfrm_state is allocated. Set the
+ *	context to correspond to sec_ctx. Return 0 if operation was successful
+ *	(memory to allocate, legal context).
+ * @xfrm_state_alloc_acquire:
+ *	@x contains the xfrm_state being added to the Security Association
+ *	Database by the XFRM system.
+ *	@polsec contains the policy's security context.
+ *	@secid contains the secid from which to take the mls portion of the
+ *	context.
+ *	Allocate a security structure to the x->security field; the security
+ *	field is initialized to NULL when the xfrm_state is allocated. Set the
+ *	context to correspond to secid. Return 0 if operation was successful
+ *	(memory to allocate, legal context).
+ * @xfrm_state_free_security:
+ *	@x contains the xfrm_state.
+ *	Deallocate x->security.
+ * @xfrm_state_delete_security:
+ *	@x contains the xfrm_state.
+ *	Authorize deletion of x->security.
+ * @xfrm_policy_lookup:
+ *	@ctx contains the xfrm_sec_ctx for which the access control is being
+ *	checked.
+ *	@fl_secid contains the flow security label that is used to authorize
+ *	access to the policy xp.
+ *	@dir contains the direction of the flow (input or output).
+ *	Check permission when a flow selects a xfrm_policy for processing
+ *	XFRMs on a packet.  The hook is called when selecting either a
+ *	per-socket policy or a generic xfrm policy.
+ *	Return 0 if permission is granted, -ESRCH otherwise, or -errno
+ *	on other errors.
+ * @xfrm_state_pol_flow_match:
+ *	@x contains the state to match.
+ *	@xp contains the policy to check for a match.
+ *	@fl contains the flow to check for a match.
+ *	Return 1 if there is a match.
+ * @xfrm_decode_session:
+ *	@skb points to skb to decode.
+ *	@secid points to the flow key secid to set.
+ *	@ckall says if all xfrms used should be checked for same secid.
+ *	Return 0 if ckall is zero or all xfrms used have the same secid.
+ *
+ * Security hooks affecting all Key Management operations
+ *
+ * @key_alloc:
+ *	Permit allocation of a key and assign security data. Note that key does
+ *	not have a serial number assigned at this point.
+ *	@key points to the key.
+ *	@flags is the allocation flags
+ *	Return 0 if permission is granted, -ve error otherwise.
+ * @key_free:
+ *	Notification of destruction; free security data.
+ *	@key points to the key.
+ *	No return value.
+ * @key_permission:
+ *	See whether a specific operational right is granted to a process on a
+ *	key.
+ *	@key_ref refers to the key (key pointer + possession attribute bit).
+ *	@cred points to the credentials to provide the context against which to
+ *	evaluate the security data on the key.
+ *	@perm describes the combination of permissions required of this key.
+ *	Return 0 if permission is granted, -ve error otherwise.
+ * @key_getsecurity:
+ *	Get a textual representation of the security context attached to a key
+ *	for the purposes of honouring KEYCTL_GETSECURITY.  This function
+ *	allocates the storage for the NUL-terminated string and the caller
+ *	should free it.
+ *	@key points to the key to be queried.
+ *	@_buffer points to a pointer that should be set to point to the
+ *	resulting string (if no label or an error occurs).
+ *	Return the length of the string (including terminating NUL) or -ve if
+ *	an error.
+ *	May also return 0 (and a NULL buffer pointer) if there is no label.
+ *
+ * Security hooks affecting all System V IPC operations.
+ *
+ * @ipc_permission:
+ *	Check permissions for access to IPC
+ *	@ipcp contains the kernel IPC permission structure
+ *	@flag contains the desired (requested) permission set
+ *	Return 0 if permission is granted.
+ * @ipc_getsecid:
+ *	Get the secid associated with the ipc object.
+ *	@ipcp contains the kernel IPC permission structure.
+ *	@secid contains a pointer to the location where result will be saved.
+ *	In case of failure, @secid will be set to zero.
+ *
+ * Security hooks for individual messages held in System V IPC message queues
+ * @msg_msg_alloc_security:
+ *	Allocate and attach a security structure to the msg->security field.
+ *	The security field is initialized to NULL when the structure is first
+ *	created.
+ *	@msg contains the message structure to be modified.
+ *	Return 0 if operation was successful and permission is granted.
+ * @msg_msg_free_security:
+ *	Deallocate the security structure for this message.
+ *	@msg contains the message structure to be modified.
+ *
+ * Security hooks for System V IPC Message Queues
+ *
+ * @msg_queue_alloc_security:
+ *	Allocate and attach a security structure to the
+ *	msq->q_perm.security field. The security field is initialized to
+ *	NULL when the structure is first created.
+ *	@msq contains the message queue structure to be modified.
+ *	Return 0 if operation was successful and permission is granted.
+ * @msg_queue_free_security:
+ *	Deallocate security structure for this message queue.
+ *	@msq contains the message queue structure to be modified.
+ * @msg_queue_associate:
+ *	Check permission when a message queue is requested through the
+ *	msgget system call.  This hook is only called when returning the
+ *	message queue identifier for an existing message queue, not when a
+ *	new message queue is created.
+ *	@msq contains the message queue to act upon.
+ *	@msqflg contains the operation control flags.
+ *	Return 0 if permission is granted.
+ * @msg_queue_msgctl:
+ *	Check permission when a message control operation specified by @cmd
+ *	is to be performed on the message queue @msq.
+ *	The @msq may be NULL, e.g. for IPC_INFO or MSG_INFO.
+ *	@msq contains the message queue to act upon.  May be NULL.
+ *	@cmd contains the operation to be performed.
+ *	Return 0 if permission is granted.
+ * @msg_queue_msgsnd:
+ *	Check permission before a message, @msg, is enqueued on the message
+ *	queue, @msq.
+ *	@msq contains the message queue to send message to.
+ *	@msg contains the message to be enqueued.
+ *	@msqflg contains operational flags.
+ *	Return 0 if permission is granted.
+ * @msg_queue_msgrcv:
+ *	Check permission before a message, @msg, is removed from the message
+ *	queue, @msq.  The @target task structure contains a pointer to the
+ *	process that will be receiving the message (not equal to the current
+ *	process when inline receives are being performed).
+ *	@msq contains the message queue to retrieve message from.
+ *	@msg contains the message destination.
+ *	@target contains the task structure for recipient process.
+ *	@type contains the type of message requested.
+ *	@mode contains the operational flags.
+ *	Return 0 if permission is granted.
+ *
+ * Security hooks for System V Shared Memory Segments
+ *
+ * @shm_alloc_security:
+ *	Allocate and attach a security structure to the shp->shm_perm.security
+ *	field.  The security field is initialized to NULL when the structure is
+ *	first created.
+ *	@shp contains the shared memory structure to be modified.
+ *	Return 0 if operation was successful and permission is granted.
+ * @shm_free_security:
+ *	Deallocate the security struct for this memory segment.
+ *	@shp contains the shared memory structure to be modified.
+ * @shm_associate:
+ *	Check permission when a shared memory region is requested through the
+ *	shmget system call.  This hook is only called when returning the shared
+ *	memory region identifier for an existing region, not when a new shared
+ *	memory region is created.
+ *	@shp contains the shared memory structure to be modified.
+ *	@shmflg contains the operation control flags.
+ *	Return 0 if permission is granted.
+ * @shm_shmctl:
+ *	Check permission when a shared memory control operation specified by
+ *	@cmd is to be performed on the shared memory region @shp.
+ *	The @shp may be NULL, e.g. for IPC_INFO or SHM_INFO.
+ *	@shp contains shared memory structure to be modified.
+ *	@cmd contains the operation to be performed.
+ *	Return 0 if permission is granted.
+ * @shm_shmat:
+ *	Check permissions prior to allowing the shmat system call to attach the
+ *	shared memory segment @shp to the data segment of the calling process.
+ *	The attaching address is specified by @shmaddr.
+ *	@shp contains the shared memory structure to be modified.
+ *	@shmaddr contains the address to attach memory region to.
+ *	@shmflg contains the operational flags.
+ *	Return 0 if permission is granted.
+ *
+ * Security hooks for System V Semaphores
+ *
+ * @sem_alloc_security:
+ *	Allocate and attach a security structure to the sma->sem_perm.security
+ *	field.  The security field is initialized to NULL when the structure is
+ *	first created.
+ *	@sma contains the semaphore structure
+ *	Return 0 if operation was successful and permission is granted.
+ * @sem_free_security:
+ *	deallocate security struct for this semaphore
+ *	@sma contains the semaphore structure.
+ * @sem_associate:
+ *	Check permission when a semaphore is requested through the semget
+ *	system call.  This hook is only called when returning the semaphore
+ *	identifier for an existing semaphore, not when a new one must be
+ *	created.
+ *	@sma contains the semaphore structure.
+ *	@semflg contains the operation control flags.
+ *	Return 0 if permission is granted.
+ * @sem_semctl:
+ *	Check permission when a semaphore operation specified by @cmd is to be
+ *	performed on the semaphore @sma.  The @sma may be NULL, e.g. for
+ *	IPC_INFO or SEM_INFO.
+ *	@sma contains the semaphore structure.  May be NULL.
+ *	@cmd contains the operation to be performed.
+ *	Return 0 if permission is granted.
+ * @sem_semop
+ *	Check permissions before performing operations on members of the
+ *	semaphore set @sma.  If the @alter flag is nonzero, the semaphore set
+ *	may be modified.
+ *	@sma contains the semaphore structure.
+ *	@sops contains the operations to perform.
+ *	@nsops contains the number of operations to perform.
+ *	@alter contains the flag indicating whether changes are to be made.
+ *	Return 0 if permission is granted.
+ *
+ * @binder_set_context_mgr
+ *	Check whether @mgr is allowed to be the binder context manager.
+ *	@mgr contains the task_struct for the task being registered.
+ *	Return 0 if permission is granted.
+ * @binder_transaction
+ *	Check whether @from is allowed to invoke a binder transaction call
+ *	to @to.
+ *	@from contains the task_struct for the sending task.
+ *	@to contains the task_struct for the receiving task.
+ * @binder_transfer_binder
+ *	Check whether @from is allowed to transfer a binder reference to @to.
+ *	@from contains the task_struct for the sending task.
+ *	@to contains the task_struct for the receiving task.
+ * @binder_transfer_file
+ *	Check whether @from is allowed to transfer @file to @to.
+ *	@from contains the task_struct for the sending task.
+ *	@file contains the struct file being transferred.
+ *	@to contains the task_struct for the receiving task.
+ *
+ * @ptrace_access_check:
+ *	Check permission before allowing the current process to trace the
+ *	@child process.
+ *	Security modules may also want to perform a process tracing check
+ *	during an execve in the set_security or apply_creds hooks of
+ *	tracing check during an execve in the bprm_set_creds hook of
+ *	binprm_security_ops if the process is being traced and its security
+ *	attributes would be changed by the execve.
+ *	@child contains the task_struct structure for the target process.
+ *	@mode contains the PTRACE_MODE flags indicating the form of access.
+ *	Return 0 if permission is granted.
+ * @ptrace_traceme:
+ *	Check that the @parent process has sufficient permission to trace the
+ *	current process before allowing the current process to present itself
+ *	to the @parent process for tracing.
+ *	@parent contains the task_struct structure for debugger process.
+ *	Return 0 if permission is granted.
+ * @capget:
+ *	Get the @effective, @inheritable, and @permitted capability sets for
+ *	the @target process.  The hook may also perform permission checking to
+ *	determine if the current process is allowed to see the capability sets
+ *	of the @target process.
+ *	@target contains the task_struct structure for target process.
+ *	@effective contains the effective capability set.
+ *	@inheritable contains the inheritable capability set.
+ *	@permitted contains the permitted capability set.
+ *	Return 0 if the capability sets were successfully obtained.
+ * @capset:
+ *	Set the @effective, @inheritable, and @permitted capability sets for
+ *	the current process.
+ *	@new contains the new credentials structure for target process.
+ *	@old contains the current credentials structure for target process.
+ *	@effective contains the effective capability set.
+ *	@inheritable contains the inheritable capability set.
+ *	@permitted contains the permitted capability set.
+ *	Return 0 and update @new if permission is granted.
+ * @capable:
+ *	Check whether the @tsk process has the @cap capability in the indicated
+ *	credentials.
+ *	@cred contains the credentials to use.
+ *	@ns contains the user namespace we want the capability in
+ *	@cap contains the capability <include/linux/capability.h>.
+ *	@audit: Whether to write an audit message or not
+ *	Return 0 if the capability is granted for @tsk.
+ * @syslog:
+ *	Check permission before accessing the kernel message ring or changing
+ *	logging to the console.
+ *	See the syslog(2) manual page for an explanation of the @type values.
+ *	@type contains the type of action.
+ *	@from_file indicates the context of action (if it came from /proc).
+ *	Return 0 if permission is granted.
+ * @settime:
+ *	Check permission to change the system time.
+ *	struct timespec and timezone are defined in include/linux/time.h
+ *	@ts contains new time
+ *	@tz contains new timezone
+ *	Return 0 if permission is granted.
+ * @vm_enough_memory:
+ *	Check permissions for allocating a new virtual mapping.
+ *	@mm contains the mm struct it is being added to.
+ *	@pages contains the number of pages.
+ *	Return 0 if permission is granted.
+ *
+ * @ismaclabel:
+ *	Check if the extended attribute specified by @name
+ *	represents a MAC label. Returns 1 if name is a MAC
+ *	attribute otherwise returns 0.
+ *	@name full extended attribute name to check against
+ *	LSM as a MAC label.
+ *
+ * @secid_to_secctx:
+ *	Convert secid to security context.  If secdata is NULL the length of
+ *	the result will be returned in seclen, but no secdata will be returned.
+ *	This does mean that the length could change between calls to check the
+ *	length and the next call which actually allocates and returns the
+ *	secdata.
+ *	@secid contains the security ID.
+ *	@secdata contains the pointer that stores the converted security
+ *	context.
+ *	@seclen pointer which contains the length of the data
+ * @secctx_to_secid:
+ *	Convert security context to secid.
+ *	@secid contains the pointer to the generated security ID.
+ *	@secdata contains the security context.
+ *
+ * @release_secctx:
+ *	Release the security context.
+ *	@secdata contains the security context.
+ *	@seclen contains the length of the security context.
+ *
+ * Security hooks for Audit
+ *
+ * @audit_rule_init:
+ *	Allocate and initialize an LSM audit rule structure.
+ *	@field contains the required Audit action.
+ *	Fields flags are defined in include/linux/audit.h
+ *	@op contains the operator the rule uses.
+ *	@rulestr contains the context where the rule will be applied to.
+ *	@lsmrule contains a pointer to receive the result.
+ *	Return 0 if @lsmrule has been successfully set,
+ *	-EINVAL in case of an invalid rule.
+ *
+ * @audit_rule_known:
+ *	Specifies whether given @rule contains any fields related to
+ *	current LSM.
+ *	@rule contains the audit rule of interest.
+ *	Return 1 in case of relation found, 0 otherwise.
+ *
+ * @audit_rule_match:
+ *	Determine if given @secid matches a rule previously approved
+ *	by @audit_rule_known.
+ *	@secid contains the security id in question.
+ *	@field contains the field which relates to current LSM.
+ *	@op contains the operator that will be used for matching.
+ *	@rule points to the audit rule that will be checked against.
+ *	@actx points to the audit context associated with the check.
+ *	Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure.
+ *
+ * @audit_rule_free:
+ *	Deallocate the LSM audit rule structure previously allocated by
+ *	audit_rule_init.
+ *	@rule contains the allocated rule
+ *
+ * @inode_notifysecctx:
+ *	Notify the security module of what the security context of an inode
+ *	should be.  Initializes the incore security context managed by the
+ *	security module for this inode.  Example usage:  NFS client invokes
+ *	this hook to initialize the security context in its incore inode to the
+ *	value provided by the server for the file when the server returned the
+ *	file's attributes to the client.
+ *
+ *	Must be called with inode->i_mutex locked.
+ *
+ *	@inode we wish to set the security context of.
+ *	@ctx contains the string which we wish to set in the inode.
+ *	@ctxlen contains the length of @ctx.
+ *
+ * @inode_setsecctx:
+ *	Change the security context of an inode.  Updates the
+ *	incore security context managed by the security module and invokes the
+ *	fs code as needed (via __vfs_setxattr_noperm) to update any backing
+ *	xattrs that represent the context.  Example usage:  NFS server invokes
+ *	this hook to change the security context in its incore inode and on the
+ *	backing filesystem to a value provided by the client on a SETATTR
+ *	operation.
+ *
+ *	Must be called with inode->i_mutex locked.
+ *
+ *	@dentry contains the inode we wish to set the security context of.
+ *	@ctx contains the string which we wish to set in the inode.
+ *	@ctxlen contains the length of @ctx.
+ *
+ * @inode_getsecctx:
+ *	On success, returns 0 and fills out @ctx and @ctxlen with the security
+ *	context for the given @inode.
+ *
+ *	@inode we wish to get the security context of.
+ *	@ctx is a pointer in which to place the allocated security context.
+ *	@ctxlen points to the place to put the length of @ctx.
+ * This is the main security structure.
+ */
+
 struct security_operations {
 	char name[SECURITY_NAME_MAX + 1];
 
-- 
cgit v1.2.3


From 346033a28fb16b83dac2a74d8025ff8ee64a2c9b Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Sat, 2 May 2015 15:11:14 -0700
Subject: LSM: Remove a comment from security.h

Remove the large comment describing the content of the
security_operations structure from security.h. This
wasn't done in the previous (2/7) patch because it
would have exceeded the mail list size limits.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/security.h | 1270 ----------------------------------------------
 1 file changed, 1270 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index f3d42c636f27..a2a100e7ac6e 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -186,1276 +186,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
 	opts->num_mnt_opts = 0;
 }
 
-/**
- * struct security_operations - main security structure
- *
- * Security module identifier.
- *
- * @name:
- *	A string that acts as a unique identifier for the LSM with max number
- *	of characters = SECURITY_NAME_MAX.
- *
- * Security hooks for program execution operations.
- *
- * @bprm_set_creds:
- *	Save security information in the bprm->security field, typically based
- *	on information about the bprm->file, for later use by the apply_creds
- *	hook.  This hook may also optionally check permissions (e.g. for
- *	transitions between security domains).
- *	This hook may be called multiple times during a single execve, e.g. for
- *	interpreters.  The hook can tell whether it has already been called by
- *	checking to see if @bprm->security is non-NULL.  If so, then the hook
- *	may decide either to retain the security information saved earlier or
- *	to replace it.
- *	@bprm contains the linux_binprm structure.
- *	Return 0 if the hook is successful and permission is granted.
- * @bprm_check_security:
- *	This hook mediates the point when a search for a binary handler will
- *	begin.  It allows a check the @bprm->security value which is set in the
- *	preceding set_creds call.  The primary difference from set_creds is
- *	that the argv list and envp list are reliably available in @bprm.  This
- *	hook may be called multiple times during a single execve; and in each
- *	pass set_creds is called first.
- *	@bprm contains the linux_binprm structure.
- *	Return 0 if the hook is successful and permission is granted.
- * @bprm_committing_creds:
- *	Prepare to install the new security attributes of a process being
- *	transformed by an execve operation, based on the old credentials
- *	pointed to by @current->cred and the information set in @bprm->cred by
- *	the bprm_set_creds hook.  @bprm points to the linux_binprm structure.
- *	This hook is a good place to perform state changes on the process such
- *	as closing open file descriptors to which access will no longer be
- *	granted when the attributes are changed.  This is called immediately
- *	before commit_creds().
- * @bprm_committed_creds:
- *	Tidy up after the installation of the new security attributes of a
- *	process being transformed by an execve operation.  The new credentials
- *	have, by this point, been set to @current->cred.  @bprm points to the
- *	linux_binprm structure.  This hook is a good place to perform state
- *	changes on the process such as clearing out non-inheritable signal
- *	state.  This is called immediately after commit_creds().
- * @bprm_secureexec:
- *	Return a boolean value (0 or 1) indicating whether a "secure exec"
- *	is required.  The flag is passed in the auxiliary table
- *	on the initial stack to the ELF interpreter to indicate whether libc
- *	should enable secure mode.
- *	@bprm contains the linux_binprm structure.
- *
- * Security hooks for filesystem operations.
- *
- * @sb_alloc_security:
- *	Allocate and attach a security structure to the sb->s_security field.
- *	The s_security field is initialized to NULL when the structure is
- *	allocated.
- *	@sb contains the super_block structure to be modified.
- *	Return 0 if operation was successful.
- * @sb_free_security:
- *	Deallocate and clear the sb->s_security field.
- *	@sb contains the super_block structure to be modified.
- * @sb_statfs:
- *	Check permission before obtaining filesystem statistics for the @mnt
- *	mountpoint.
- *	@dentry is a handle on the superblock for the filesystem.
- *	Return 0 if permission is granted.
- * @sb_mount:
- *	Check permission before an object specified by @dev_name is mounted on
- *	the mount point named by @nd.  For an ordinary mount, @dev_name
- *	identifies a device if the file system type requires a device.  For a
- *	remount (@flags & MS_REMOUNT), @dev_name is irrelevant.  For a
- *	loopback/bind mount (@flags & MS_BIND), @dev_name identifies the
- *	pathname of the object being mounted.
- *	@dev_name contains the name for object being mounted.
- *	@path contains the path for mount point object.
- *	@type contains the filesystem type.
- *	@flags contains the mount flags.
- *	@data contains the filesystem-specific data.
- *	Return 0 if permission is granted.
- * @sb_copy_data:
- *	Allow mount option data to be copied prior to parsing by the filesystem,
- *	so that the security module can extract security-specific mount
- *	options cleanly (a filesystem may modify the data e.g. with strsep()).
- *	This also allows the original mount data to be stripped of security-
- *	specific options to avoid having to make filesystems aware of them.
- *	@type the type of filesystem being mounted.
- *	@orig the original mount data copied from userspace.
- *	@copy copied data which will be passed to the security module.
- *	Returns 0 if the copy was successful.
- * @sb_remount:
- *	Extracts security system specific mount options and verifies no changes
- *	are being made to those options.
- *	@sb superblock being remounted
- *	@data contains the filesystem-specific data.
- *	Return 0 if permission is granted.
- * @sb_umount:
- *	Check permission before the @mnt file system is unmounted.
- *	@mnt contains the mounted file system.
- *	@flags contains the unmount flags, e.g. MNT_FORCE.
- *	Return 0 if permission is granted.
- * @sb_pivotroot:
- *	Check permission before pivoting the root filesystem.
- *	@old_path contains the path for the new location of the current root (put_old).
- *	@new_path contains the path for the new root (new_root).
- *	Return 0 if permission is granted.
- * @sb_set_mnt_opts:
- *	Set the security relevant mount options used for a superblock
- *	@sb the superblock to set security mount options for
- *	@opts binary data structure containing all lsm mount data
- * @sb_clone_mnt_opts:
- *	Copy all security options from a given superblock to another
- *	@oldsb old superblock which contain information to clone
- *	@newsb new superblock which needs filled in
- * @sb_parse_opts_str:
- *	Parse a string of security data filling in the opts structure
- *	@options string containing all mount options known by the LSM
- *	@opts binary data structure usable by the LSM
- * @dentry_init_security:
- *	Compute a context for a dentry as the inode is not yet available
- *	since NFSv4 has no label backed by an EA anyway.
- *	@dentry dentry to use in calculating the context.
- *	@mode mode used to determine resource type.
- *	@name name of the last path component used to create file
- *	@ctx pointer to place the pointer to the resulting context in.
- *	@ctxlen point to place the length of the resulting context.
- *
- *
- * Security hooks for inode operations.
- *
- * @inode_alloc_security:
- *	Allocate and attach a security structure to @inode->i_security.  The
- *	i_security field is initialized to NULL when the inode structure is
- *	allocated.
- *	@inode contains the inode structure.
- *	Return 0 if operation was successful.
- * @inode_free_security:
- *	@inode contains the inode structure.
- *	Deallocate the inode security structure and set @inode->i_security to
- *	NULL.
- * @inode_init_security:
- *	Obtain the security attribute name suffix and value to set on a newly
- *	created inode and set up the incore security field for the new inode.
- *	This hook is called by the fs code as part of the inode creation
- *	transaction and provides for atomic labeling of the inode, unlike
- *	the post_create/mkdir/... hooks called by the VFS.  The hook function
- *	is expected to allocate the name and value via kmalloc, with the caller
- *	being responsible for calling kfree after using them.
- *	If the security module does not use security attributes or does
- *	not wish to put a security attribute on this particular inode,
- *	then it should return -EOPNOTSUPP to skip this processing.
- *	@inode contains the inode structure of the newly created inode.
- *	@dir contains the inode structure of the parent directory.
- *	@qstr contains the last path component of the new object
- *	@name will be set to the allocated name suffix (e.g. selinux).
- *	@value will be set to the allocated attribute value.
- *	@len will be set to the length of the value.
- *	Returns 0 if @name and @value have been successfully set,
- *		-EOPNOTSUPP if no security attribute is needed, or
- *		-ENOMEM on memory allocation failure.
- * @inode_create:
- *	Check permission to create a regular file.
- *	@dir contains inode structure of the parent of the new file.
- *	@dentry contains the dentry structure for the file to be created.
- *	@mode contains the file mode of the file to be created.
- *	Return 0 if permission is granted.
- * @inode_link:
- *	Check permission before creating a new hard link to a file.
- *	@old_dentry contains the dentry structure for an existing link to the file.
- *	@dir contains the inode structure of the parent directory of the new link.
- *	@new_dentry contains the dentry structure for the new link.
- *	Return 0 if permission is granted.
- * @path_link:
- *	Check permission before creating a new hard link to a file.
- *	@old_dentry contains the dentry structure for an existing link
- *	to the file.
- *	@new_dir contains the path structure of the parent directory of
- *	the new link.
- *	@new_dentry contains the dentry structure for the new link.
- *	Return 0 if permission is granted.
- * @inode_unlink:
- *	Check the permission to remove a hard link to a file.
- *	@dir contains the inode structure of parent directory of the file.
- *	@dentry contains the dentry structure for file to be unlinked.
- *	Return 0 if permission is granted.
- * @path_unlink:
- *	Check the permission to remove a hard link to a file.
- *	@dir contains the path structure of parent directory of the file.
- *	@dentry contains the dentry structure for file to be unlinked.
- *	Return 0 if permission is granted.
- * @inode_symlink:
- *	Check the permission to create a symbolic link to a file.
- *	@dir contains the inode structure of parent directory of the symbolic link.
- *	@dentry contains the dentry structure of the symbolic link.
- *	@old_name contains the pathname of file.
- *	Return 0 if permission is granted.
- * @path_symlink:
- *	Check the permission to create a symbolic link to a file.
- *	@dir contains the path structure of parent directory of
- *	the symbolic link.
- *	@dentry contains the dentry structure of the symbolic link.
- *	@old_name contains the pathname of file.
- *	Return 0 if permission is granted.
- * @inode_mkdir:
- *	Check permissions to create a new directory in the existing directory
- *	associated with inode structure @dir.
- *	@dir contains the inode structure of parent of the directory to be created.
- *	@dentry contains the dentry structure of new directory.
- *	@mode contains the mode of new directory.
- *	Return 0 if permission is granted.
- * @path_mkdir:
- *	Check permissions to create a new directory in the existing directory
- *	associated with path structure @path.
- *	@dir contains the path structure of parent of the directory
- *	to be created.
- *	@dentry contains the dentry structure of new directory.
- *	@mode contains the mode of new directory.
- *	Return 0 if permission is granted.
- * @inode_rmdir:
- *	Check the permission to remove a directory.
- *	@dir contains the inode structure of parent of the directory to be removed.
- *	@dentry contains the dentry structure of directory to be removed.
- *	Return 0 if permission is granted.
- * @path_rmdir:
- *	Check the permission to remove a directory.
- *	@dir contains the path structure of parent of the directory to be
- *	removed.
- *	@dentry contains the dentry structure of directory to be removed.
- *	Return 0 if permission is granted.
- * @inode_mknod:
- *	Check permissions when creating a special file (or a socket or a fifo
- *	file created via the mknod system call).  Note that if mknod operation
- *	is being done for a regular file, then the create hook will be called
- *	and not this hook.
- *	@dir contains the inode structure of parent of the new file.
- *	@dentry contains the dentry structure of the new file.
- *	@mode contains the mode of the new file.
- *	@dev contains the device number.
- *	Return 0 if permission is granted.
- * @path_mknod:
- *	Check permissions when creating a file. Note that this hook is called
- *	even if mknod operation is being done for a regular file.
- *	@dir contains the path structure of parent of the new file.
- *	@dentry contains the dentry structure of the new file.
- *	@mode contains the mode of the new file.
- *	@dev contains the undecoded device number. Use new_decode_dev() to get
- *	the decoded device number.
- *	Return 0 if permission is granted.
- * @inode_rename:
- *	Check for permission to rename a file or directory.
- *	@old_dir contains the inode structure for parent of the old link.
- *	@old_dentry contains the dentry structure of the old link.
- *	@new_dir contains the inode structure for parent of the new link.
- *	@new_dentry contains the dentry structure of the new link.
- *	Return 0 if permission is granted.
- * @path_rename:
- *	Check for permission to rename a file or directory.
- *	@old_dir contains the path structure for parent of the old link.
- *	@old_dentry contains the dentry structure of the old link.
- *	@new_dir contains the path structure for parent of the new link.
- *	@new_dentry contains the dentry structure of the new link.
- *	Return 0 if permission is granted.
- * @path_chmod:
- *	Check for permission to change DAC's permission of a file or directory.
- *	@dentry contains the dentry structure.
- *	@mnt contains the vfsmnt structure.
- *	@mode contains DAC's mode.
- *	Return 0 if permission is granted.
- * @path_chown:
- *	Check for permission to change owner/group of a file or directory.
- *	@path contains the path structure.
- *	@uid contains new owner's ID.
- *	@gid contains new group's ID.
- *	Return 0 if permission is granted.
- * @path_chroot:
- *	Check for permission to change root directory.
- *	@path contains the path structure.
- *	Return 0 if permission is granted.
- * @inode_readlink:
- *	Check the permission to read the symbolic link.
- *	@dentry contains the dentry structure for the file link.
- *	Return 0 if permission is granted.
- * @inode_follow_link:
- *	Check permission to follow a symbolic link when looking up a pathname.
- *	@dentry contains the dentry structure for the link.
- *	@nd contains the nameidata structure for the parent directory.
- *	Return 0 if permission is granted.
- * @inode_permission:
- *	Check permission before accessing an inode.  This hook is called by the
- *	existing Linux permission function, so a security module can use it to
- *	provide additional checking for existing Linux permission checks.
- *	Notice that this hook is called when a file is opened (as well as many
- *	other operations), whereas the file_security_ops permission hook is
- *	called when the actual read/write operations are performed.
- *	@inode contains the inode structure to check.
- *	@mask contains the permission mask.
- *	Return 0 if permission is granted.
- * @inode_setattr:
- *	Check permission before setting file attributes.  Note that the kernel
- *	call to notify_change is performed from several locations, whenever
- *	file attributes change (such as when a file is truncated, chown/chmod
- *	operations, transferring disk quotas, etc).
- *	@dentry contains the dentry structure for the file.
- *	@attr is the iattr structure containing the new file attributes.
- *	Return 0 if permission is granted.
- * @path_truncate:
- *	Check permission before truncating a file.
- *	@path contains the path structure for the file.
- *	Return 0 if permission is granted.
- * @inode_getattr:
- *	Check permission before obtaining file attributes.
- *	@mnt is the vfsmount where the dentry was looked up
- *	@dentry contains the dentry structure for the file.
- *	Return 0 if permission is granted.
- * @inode_setxattr:
- *	Check permission before setting the extended attributes
- *	@value identified by @name for @dentry.
- *	Return 0 if permission is granted.
- * @inode_post_setxattr:
- *	Update inode security field after successful setxattr operation.
- *	@value identified by @name for @dentry.
- * @inode_getxattr:
- *	Check permission before obtaining the extended attributes
- *	identified by @name for @dentry.
- *	Return 0 if permission is granted.
- * @inode_listxattr:
- *	Check permission before obtaining the list of extended attribute
- *	names for @dentry.
- *	Return 0 if permission is granted.
- * @inode_removexattr:
- *	Check permission before removing the extended attribute
- *	identified by @name for @dentry.
- *	Return 0 if permission is granted.
- * @inode_getsecurity:
- *	Retrieve a copy of the extended attribute representation of the
- *	security label associated with @name for @inode via @buffer.  Note that
- *	@name is the remainder of the attribute name after the security prefix
- *	has been removed. @alloc is used to specify of the call should return a
- *	value via the buffer or just the value length Return size of buffer on
- *	success.
- * @inode_setsecurity:
- *	Set the security label associated with @name for @inode from the
- *	extended attribute value @value.  @size indicates the size of the
- *	@value in bytes.  @flags may be XATTR_CREATE, XATTR_REPLACE, or 0.
- *	Note that @name is the remainder of the attribute name after the
- *	security. prefix has been removed.
- *	Return 0 on success.
- * @inode_listsecurity:
- *	Copy the extended attribute names for the security labels
- *	associated with @inode into @buffer.  The maximum size of @buffer
- *	is specified by @buffer_size.  @buffer may be NULL to request
- *	the size of the buffer required.
- *	Returns number of bytes used/required on success.
- * @inode_need_killpriv:
- *	Called when an inode has been changed.
- *	@dentry is the dentry being changed.
- *	Return <0 on error to abort the inode change operation.
- *	Return 0 if inode_killpriv does not need to be called.
- *	Return >0 if inode_killpriv does need to be called.
- * @inode_killpriv:
- *	The setuid bit is being removed.  Remove similar security labels.
- *	Called with the dentry->d_inode->i_mutex held.
- *	@dentry is the dentry being changed.
- *	Return 0 on success.  If error is returned, then the operation
- *	causing setuid bit removal is failed.
- * @inode_getsecid:
- *	Get the secid associated with the node.
- *	@inode contains a pointer to the inode.
- *	@secid contains a pointer to the location where result will be saved.
- *	In case of failure, @secid will be set to zero.
- *
- * Security hooks for file operations
- *
- * @file_permission:
- *	Check file permissions before accessing an open file.  This hook is
- *	called by various operations that read or write files.  A security
- *	module can use this hook to perform additional checking on these
- *	operations, e.g.  to revalidate permissions on use to support privilege
- *	bracketing or policy changes.  Notice that this hook is used when the
- *	actual read/write operations are performed, whereas the
- *	inode_security_ops hook is called when a file is opened (as well as
- *	many other operations).
- *	Caveat:  Although this hook can be used to revalidate permissions for
- *	various system call operations that read or write files, it does not
- *	address the revalidation of permissions for memory-mapped files.
- *	Security modules must handle this separately if they need such
- *	revalidation.
- *	@file contains the file structure being accessed.
- *	@mask contains the requested permissions.
- *	Return 0 if permission is granted.
- * @file_alloc_security:
- *	Allocate and attach a security structure to the file->f_security field.
- *	The security field is initialized to NULL when the structure is first
- *	created.
- *	@file contains the file structure to secure.
- *	Return 0 if the hook is successful and permission is granted.
- * @file_free_security:
- *	Deallocate and free any security structures stored in file->f_security.
- *	@file contains the file structure being modified.
- * @file_ioctl:
- *	@file contains the file structure.
- *	@cmd contains the operation to perform.
- *	@arg contains the operational arguments.
- *	Check permission for an ioctl operation on @file.  Note that @arg
- *	sometimes represents a user space pointer; in other cases, it may be a
- *	simple integer value.  When @arg represents a user space pointer, it
- *	should never be used by the security module.
- *	Return 0 if permission is granted.
- * @mmap_addr :
- *	Check permissions for a mmap operation at @addr.
- *	@addr contains virtual address that will be used for the operation.
- *	Return 0 if permission is granted.
- * @mmap_file :
- *	Check permissions for a mmap operation.  The @file may be NULL, e.g.
- *	if mapping anonymous memory.
- *	@file contains the file structure for file to map (may be NULL).
- *	@reqprot contains the protection requested by the application.
- *	@prot contains the protection that will be applied by the kernel.
- *	@flags contains the operational flags.
- *	Return 0 if permission is granted.
- * @file_mprotect:
- *	Check permissions before changing memory access permissions.
- *	@vma contains the memory region to modify.
- *	@reqprot contains the protection requested by the application.
- *	@prot contains the protection that will be applied by the kernel.
- *	Return 0 if permission is granted.
- * @file_lock:
- *	Check permission before performing file locking operations.
- *	Note: this hook mediates both flock and fcntl style locks.
- *	@file contains the file structure.
- *	@cmd contains the posix-translated lock operation to perform
- *	(e.g. F_RDLCK, F_WRLCK).
- *	Return 0 if permission is granted.
- * @file_fcntl:
- *	Check permission before allowing the file operation specified by @cmd
- *	from being performed on the file @file.  Note that @arg sometimes
- *	represents a user space pointer; in other cases, it may be a simple
- *	integer value.  When @arg represents a user space pointer, it should
- *	never be used by the security module.
- *	@file contains the file structure.
- *	@cmd contains the operation to be performed.
- *	@arg contains the operational arguments.
- *	Return 0 if permission is granted.
- * @file_set_fowner:
- *	Save owner security information (typically from current->security) in
- *	file->f_security for later use by the send_sigiotask hook.
- *	@file contains the file structure to update.
- *	Return 0 on success.
- * @file_send_sigiotask:
- *	Check permission for the file owner @fown to send SIGIO or SIGURG to the
- *	process @tsk.  Note that this hook is sometimes called from interrupt.
- *	Note that the fown_struct, @fown, is never outside the context of a
- *	struct file, so the file structure (and associated security information)
- *	can always be obtained:
- *		container_of(fown, struct file, f_owner)
- *	@tsk contains the structure of task receiving signal.
- *	@fown contains the file owner information.
- *	@sig is the signal that will be sent.  When 0, kernel sends SIGIO.
- *	Return 0 if permission is granted.
- * @file_receive:
- *	This hook allows security modules to control the ability of a process
- *	to receive an open file descriptor via socket IPC.
- *	@file contains the file structure being received.
- *	Return 0 if permission is granted.
- * @file_open
- *	Save open-time permission checking state for later use upon
- *	file_permission, and recheck access if anything has changed
- *	since inode_permission.
- *
- * Security hooks for task operations.
- *
- * @task_create:
- *	Check permission before creating a child process.  See the clone(2)
- *	manual page for definitions of the @clone_flags.
- *	@clone_flags contains the flags indicating what should be shared.
- *	Return 0 if permission is granted.
- * @task_free:
- *	@task task being freed
- *	Handle release of task-related resources. (Note that this can be called
- *	from interrupt context.)
- * @cred_alloc_blank:
- *	@cred points to the credentials.
- *	@gfp indicates the atomicity of any memory allocations.
- *	Only allocate sufficient memory and attach to @cred such that
- *	cred_transfer() will not get ENOMEM.
- * @cred_free:
- *	@cred points to the credentials.
- *	Deallocate and clear the cred->security field in a set of credentials.
- * @cred_prepare:
- *	@new points to the new credentials.
- *	@old points to the original credentials.
- *	@gfp indicates the atomicity of any memory allocations.
- *	Prepare a new set of credentials by copying the data from the old set.
- * @cred_transfer:
- *	@new points to the new credentials.
- *	@old points to the original credentials.
- *	Transfer data from original creds to new creds
- * @kernel_act_as:
- *	Set the credentials for a kernel service to act as (subjective context).
- *	@new points to the credentials to be modified.
- *	@secid specifies the security ID to be set
- *	The current task must be the one that nominated @secid.
- *	Return 0 if successful.
- * @kernel_create_files_as:
- *	Set the file creation context in a set of credentials to be the same as
- *	the objective context of the specified inode.
- *	@new points to the credentials to be modified.
- *	@inode points to the inode to use as a reference.
- *	The current task must be the one that nominated @inode.
- *	Return 0 if successful.
- * @kernel_fw_from_file:
- *	Load firmware from userspace (not called for built-in firmware).
- *	@file contains the file structure pointing to the file containing
- *	the firmware to load. This argument will be NULL if the firmware
- *	was loaded via the uevent-triggered blob-based interface exposed
- *	by CONFIG_FW_LOADER_USER_HELPER.
- *	@buf pointer to buffer containing firmware contents.
- *	@size length of the firmware contents.
- *	Return 0 if permission is granted.
- * @kernel_module_request:
- *	Ability to trigger the kernel to automatically upcall to userspace for
- *	userspace to load a kernel module with the given name.
- *	@kmod_name name of the module requested by the kernel
- *	Return 0 if successful.
- * @kernel_module_from_file:
- *	Load a kernel module from userspace.
- *	@file contains the file structure pointing to the file containing
- *	the kernel module to load. If the module is being loaded from a blob,
- *	this argument will be NULL.
- *	Return 0 if permission is granted.
- * @task_fix_setuid:
- *	Update the module's state after setting one or more of the user
- *	identity attributes of the current process.  The @flags parameter
- *	indicates which of the set*uid system calls invoked this hook.  If
- *	@new is the set of credentials that will be installed.  Modifications
- *	should be made to this rather than to @current->cred.
- *	@old is the set of credentials that are being replaces
- *	@flags contains one of the LSM_SETID_* values.
- *	Return 0 on success.
- * @task_setpgid:
- *	Check permission before setting the process group identifier of the
- *	process @p to @pgid.
- *	@p contains the task_struct for process being modified.
- *	@pgid contains the new pgid.
- *	Return 0 if permission is granted.
- * @task_getpgid:
- *	Check permission before getting the process group identifier of the
- *	process @p.
- *	@p contains the task_struct for the process.
- *	Return 0 if permission is granted.
- * @task_getsid:
- *	Check permission before getting the session identifier of the process
- *	@p.
- *	@p contains the task_struct for the process.
- *	Return 0 if permission is granted.
- * @task_getsecid:
- *	Retrieve the security identifier of the process @p.
- *	@p contains the task_struct for the process and place is into @secid.
- *	In case of failure, @secid will be set to zero.
- *
- * @task_setnice:
- *	Check permission before setting the nice value of @p to @nice.
- *	@p contains the task_struct of process.
- *	@nice contains the new nice value.
- *	Return 0 if permission is granted.
- * @task_setioprio
- *	Check permission before setting the ioprio value of @p to @ioprio.
- *	@p contains the task_struct of process.
- *	@ioprio contains the new ioprio value
- *	Return 0 if permission is granted.
- * @task_getioprio
- *	Check permission before getting the ioprio value of @p.
- *	@p contains the task_struct of process.
- *	Return 0 if permission is granted.
- * @task_setrlimit:
- *	Check permission before setting the resource limits of the current
- *	process for @resource to @new_rlim.  The old resource limit values can
- *	be examined by dereferencing (current->signal->rlim + resource).
- *	@resource contains the resource whose limit is being set.
- *	@new_rlim contains the new limits for @resource.
- *	Return 0 if permission is granted.
- * @task_setscheduler:
- *	Check permission before setting scheduling policy and/or parameters of
- *	process @p based on @policy and @lp.
- *	@p contains the task_struct for process.
- *	@policy contains the scheduling policy.
- *	@lp contains the scheduling parameters.
- *	Return 0 if permission is granted.
- * @task_getscheduler:
- *	Check permission before obtaining scheduling information for process
- *	@p.
- *	@p contains the task_struct for process.
- *	Return 0 if permission is granted.
- * @task_movememory
- *	Check permission before moving memory owned by process @p.
- *	@p contains the task_struct for process.
- *	Return 0 if permission is granted.
- * @task_kill:
- *	Check permission before sending signal @sig to @p.  @info can be NULL,
- *	the constant 1, or a pointer to a siginfo structure.  If @info is 1 or
- *	SI_FROMKERNEL(info) is true, then the signal should be viewed as coming
- *	from the kernel and should typically be permitted.
- *	SIGIO signals are handled separately by the send_sigiotask hook in
- *	file_security_ops.
- *	@p contains the task_struct for process.
- *	@info contains the signal information.
- *	@sig contains the signal value.
- *	@secid contains the sid of the process where the signal originated
- *	Return 0 if permission is granted.
- * @task_wait:
- *	Check permission before allowing a process to reap a child process @p
- *	and collect its status information.
- *	@p contains the task_struct for process.
- *	Return 0 if permission is granted.
- * @task_prctl:
- *	Check permission before performing a process control operation on the
- *	current process.
- *	@option contains the operation.
- *	@arg2 contains a argument.
- *	@arg3 contains a argument.
- *	@arg4 contains a argument.
- *	@arg5 contains a argument.
- *	Return -ENOSYS if no-one wanted to handle this op, any other value to
- *	cause prctl() to return immediately with that value.
- * @task_to_inode:
- *	Set the security attributes for an inode based on an associated task's
- *	security attributes, e.g. for /proc/pid inodes.
- *	@p contains the task_struct for the task.
- *	@inode contains the inode structure for the inode.
- *
- * Security hooks for Netlink messaging.
- *
- * @netlink_send:
- *	Save security information for a netlink message so that permission
- *	checking can be performed when the message is processed.  The security
- *	information can be saved using the eff_cap field of the
- *	netlink_skb_parms structure.  Also may be used to provide fine
- *	grained control over message transmission.
- *	@sk associated sock of task sending the message.
- *	@skb contains the sk_buff structure for the netlink message.
- *	Return 0 if the information was successfully saved and message
- *	is allowed to be transmitted.
- *
- * Security hooks for Unix domain networking.
- *
- * @unix_stream_connect:
- *	Check permissions before establishing a Unix domain stream connection
- *	between @sock and @other.
- *	@sock contains the sock structure.
- *	@other contains the peer sock structure.
- *	@newsk contains the new sock structure.
- *	Return 0 if permission is granted.
- * @unix_may_send:
- *	Check permissions before connecting or sending datagrams from @sock to
- *	@other.
- *	@sock contains the socket structure.
- *	@other contains the peer socket structure.
- *	Return 0 if permission is granted.
- *
- * The @unix_stream_connect and @unix_may_send hooks were necessary because
- * Linux provides an alternative to the conventional file name space for Unix
- * domain sockets.  Whereas binding and connecting to sockets in the file name
- * space is mediated by the typical file permissions (and caught by the mknod
- * and permission hooks in inode_security_ops), binding and connecting to
- * sockets in the abstract name space is completely unmediated.  Sufficient
- * control of Unix domain sockets in the abstract name space isn't possible
- * using only the socket layer hooks, since we need to know the actual target
- * socket, which is not looked up until we are inside the af_unix code.
- *
- * Security hooks for socket operations.
- *
- * @socket_create:
- *	Check permissions prior to creating a new socket.
- *	@family contains the requested protocol family.
- *	@type contains the requested communications type.
- *	@protocol contains the requested protocol.
- *	@kern set to 1 if a kernel socket.
- *	Return 0 if permission is granted.
- * @socket_post_create:
- *	This hook allows a module to update or allocate a per-socket security
- *	structure. Note that the security field was not added directly to the
- *	socket structure, but rather, the socket security information is stored
- *	in the associated inode.  Typically, the inode alloc_security hook will
- *	allocate and and attach security information to
- *	sock->inode->i_security.  This hook may be used to update the
- *	sock->inode->i_security field with additional information that wasn't
- *	available when the inode was allocated.
- *	@sock contains the newly created socket structure.
- *	@family contains the requested protocol family.
- *	@type contains the requested communications type.
- *	@protocol contains the requested protocol.
- *	@kern set to 1 if a kernel socket.
- * @socket_bind:
- *	Check permission before socket protocol layer bind operation is
- *	performed and the socket @sock is bound to the address specified in the
- *	@address parameter.
- *	@sock contains the socket structure.
- *	@address contains the address to bind to.
- *	@addrlen contains the length of address.
- *	Return 0 if permission is granted.
- * @socket_connect:
- *	Check permission before socket protocol layer connect operation
- *	attempts to connect socket @sock to a remote address, @address.
- *	@sock contains the socket structure.
- *	@address contains the address of remote endpoint.
- *	@addrlen contains the length of address.
- *	Return 0 if permission is granted.
- * @socket_listen:
- *	Check permission before socket protocol layer listen operation.
- *	@sock contains the socket structure.
- *	@backlog contains the maximum length for the pending connection queue.
- *	Return 0 if permission is granted.
- * @socket_accept:
- *	Check permission before accepting a new connection.  Note that the new
- *	socket, @newsock, has been created and some information copied to it,
- *	but the accept operation has not actually been performed.
- *	@sock contains the listening socket structure.
- *	@newsock contains the newly created server socket for connection.
- *	Return 0 if permission is granted.
- * @socket_sendmsg:
- *	Check permission before transmitting a message to another socket.
- *	@sock contains the socket structure.
- *	@msg contains the message to be transmitted.
- *	@size contains the size of message.
- *	Return 0 if permission is granted.
- * @socket_recvmsg:
- *	Check permission before receiving a message from a socket.
- *	@sock contains the socket structure.
- *	@msg contains the message structure.
- *	@size contains the size of message structure.
- *	@flags contains the operational flags.
- *	Return 0 if permission is granted.
- * @socket_getsockname:
- *	Check permission before the local address (name) of the socket object
- *	@sock is retrieved.
- *	@sock contains the socket structure.
- *	Return 0 if permission is granted.
- * @socket_getpeername:
- *	Check permission before the remote address (name) of a socket object
- *	@sock is retrieved.
- *	@sock contains the socket structure.
- *	Return 0 if permission is granted.
- * @socket_getsockopt:
- *	Check permissions before retrieving the options associated with socket
- *	@sock.
- *	@sock contains the socket structure.
- *	@level contains the protocol level to retrieve option from.
- *	@optname contains the name of option to retrieve.
- *	Return 0 if permission is granted.
- * @socket_setsockopt:
- *	Check permissions before setting the options associated with socket
- *	@sock.
- *	@sock contains the socket structure.
- *	@level contains the protocol level to set options for.
- *	@optname contains the name of the option to set.
- *	Return 0 if permission is granted.
- * @socket_shutdown:
- *	Checks permission before all or part of a connection on the socket
- *	@sock is shut down.
- *	@sock contains the socket structure.
- *	@how contains the flag indicating how future sends and receives are handled.
- *	Return 0 if permission is granted.
- * @socket_sock_rcv_skb:
- *	Check permissions on incoming network packets.  This hook is distinct
- *	from Netfilter's IP input hooks since it is the first time that the
- *	incoming sk_buff @skb has been associated with a particular socket, @sk.
- *	Must not sleep inside this hook because some callers hold spinlocks.
- *	@sk contains the sock (not socket) associated with the incoming sk_buff.
- *	@skb contains the incoming network data.
- * @socket_getpeersec_stream:
- *	This hook allows the security module to provide peer socket security
- *	state for unix or connected tcp sockets to userspace via getsockopt
- *	SO_GETPEERSEC.  For tcp sockets this can be meaningful if the
- *	socket is associated with an ipsec SA.
- *	@sock is the local socket.
- *	@optval userspace memory where the security state is to be copied.
- *	@optlen userspace int where the module should copy the actual length
- *	of the security state.
- *	@len as input is the maximum length to copy to userspace provided
- *	by the caller.
- *	Return 0 if all is well, otherwise, typical getsockopt return
- *	values.
- * @socket_getpeersec_dgram:
- *	This hook allows the security module to provide peer socket security
- *	state for udp sockets on a per-packet basis to userspace via
- *	getsockopt SO_GETPEERSEC.  The application must first have indicated
- *	the IP_PASSSEC option via getsockopt.  It can then retrieve the
- *	security state returned by this hook for a packet via the SCM_SECURITY
- *	ancillary message type.
- *	@skb is the skbuff for the packet being queried
- *	@secdata is a pointer to a buffer in which to copy the security data
- *	@seclen is the maximum length for @secdata
- *	Return 0 on success, error on failure.
- * @sk_alloc_security:
- *	Allocate and attach a security structure to the sk->sk_security field,
- *	which is used to copy security attributes between local stream sockets.
- * @sk_free_security:
- *	Deallocate security structure.
- * @sk_clone_security:
- *	Clone/copy security structure.
- * @sk_getsecid:
- *	Retrieve the LSM-specific secid for the sock to enable caching of network
- *	authorizations.
- * @sock_graft:
- *	Sets the socket's isec sid to the sock's sid.
- * @inet_conn_request:
- *	Sets the openreq's sid to socket's sid with MLS portion taken from peer sid.
- * @inet_csk_clone:
- *	Sets the new child socket's sid to the openreq sid.
- * @inet_conn_established:
- *	Sets the connection's peersid to the secmark on skb.
- * @secmark_relabel_packet:
- *	check if the process should be allowed to relabel packets to the given secid
- * @security_secmark_refcount_inc
- *	tells the LSM to increment the number of secmark labeling rules loaded
- * @security_secmark_refcount_dec
- *	tells the LSM to decrement the number of secmark labeling rules loaded
- * @req_classify_flow:
- *	Sets the flow's sid to the openreq sid.
- * @tun_dev_alloc_security:
- *	This hook allows a module to allocate a security structure for a TUN
- *	device.
- *	@security pointer to a security structure pointer.
- *	Returns a zero on success, negative values on failure.
- * @tun_dev_free_security:
- *	This hook allows a module to free the security structure for a TUN
- *	device.
- *	@security pointer to the TUN device's security structure
- * @tun_dev_create:
- *	Check permissions prior to creating a new TUN device.
- * @tun_dev_attach_queue:
- *	Check permissions prior to attaching to a TUN device queue.
- *	@security pointer to the TUN device's security structure.
- * @tun_dev_attach:
- *	This hook can be used by the module to update any security state
- *	associated with the TUN device's sock structure.
- *	@sk contains the existing sock structure.
- *	@security pointer to the TUN device's security structure.
- * @tun_dev_open:
- *	This hook can be used by the module to update any security state
- *	associated with the TUN device's security structure.
- *	@security pointer to the TUN devices's security structure.
- * @skb_owned_by:
- *	This hook sets the packet's owning sock.
- *	@skb is the packet.
- *	@sk the sock which owns the packet.
- *
- * Security hooks for XFRM operations.
- *
- * @xfrm_policy_alloc_security:
- *	@ctxp is a pointer to the xfrm_sec_ctx being added to Security Policy
- *	Database used by the XFRM system.
- *	@sec_ctx contains the security context information being provided by
- *	the user-level policy update program (e.g., setkey).
- *	Allocate a security structure to the xp->security field; the security
- *	field is initialized to NULL when the xfrm_policy is allocated.
- *	Return 0 if operation was successful (memory to allocate, legal context)
- *	@gfp is to specify the context for the allocation
- * @xfrm_policy_clone_security:
- *	@old_ctx contains an existing xfrm_sec_ctx.
- *	@new_ctxp contains a new xfrm_sec_ctx being cloned from old.
- *	Allocate a security structure in new_ctxp that contains the
- *	information from the old_ctx structure.
- *	Return 0 if operation was successful (memory to allocate).
- * @xfrm_policy_free_security:
- *	@ctx contains the xfrm_sec_ctx
- *	Deallocate xp->security.
- * @xfrm_policy_delete_security:
- *	@ctx contains the xfrm_sec_ctx.
- *	Authorize deletion of xp->security.
- * @xfrm_state_alloc:
- *	@x contains the xfrm_state being added to the Security Association
- *	Database by the XFRM system.
- *	@sec_ctx contains the security context information being provided by
- *	the user-level SA generation program (e.g., setkey or racoon).
- *	Allocate a security structure to the x->security field; the security
- *	field is initialized to NULL when the xfrm_state is allocated. Set the
- *	context to correspond to sec_ctx. Return 0 if operation was successful
- *	(memory to allocate, legal context).
- * @xfrm_state_alloc_acquire:
- *	@x contains the xfrm_state being added to the Security Association
- *	Database by the XFRM system.
- *	@polsec contains the policy's security context.
- *	@secid contains the secid from which to take the mls portion of the
- *	context.
- *	Allocate a security structure to the x->security field; the security
- *	field is initialized to NULL when the xfrm_state is allocated. Set the
- *	context to correspond to secid. Return 0 if operation was successful
- *	(memory to allocate, legal context).
- * @xfrm_state_free_security:
- *	@x contains the xfrm_state.
- *	Deallocate x->security.
- * @xfrm_state_delete_security:
- *	@x contains the xfrm_state.
- *	Authorize deletion of x->security.
- * @xfrm_policy_lookup:
- *	@ctx contains the xfrm_sec_ctx for which the access control is being
- *	checked.
- *	@fl_secid contains the flow security label that is used to authorize
- *	access to the policy xp.
- *	@dir contains the direction of the flow (input or output).
- *	Check permission when a flow selects a xfrm_policy for processing
- *	XFRMs on a packet.  The hook is called when selecting either a
- *	per-socket policy or a generic xfrm policy.
- *	Return 0 if permission is granted, -ESRCH otherwise, or -errno
- *	on other errors.
- * @xfrm_state_pol_flow_match:
- *	@x contains the state to match.
- *	@xp contains the policy to check for a match.
- *	@fl contains the flow to check for a match.
- *	Return 1 if there is a match.
- * @xfrm_decode_session:
- *	@skb points to skb to decode.
- *	@secid points to the flow key secid to set.
- *	@ckall says if all xfrms used should be checked for same secid.
- *	Return 0 if ckall is zero or all xfrms used have the same secid.
- *
- * Security hooks affecting all Key Management operations
- *
- * @key_alloc:
- *	Permit allocation of a key and assign security data. Note that key does
- *	not have a serial number assigned at this point.
- *	@key points to the key.
- *	@flags is the allocation flags
- *	Return 0 if permission is granted, -ve error otherwise.
- * @key_free:
- *	Notification of destruction; free security data.
- *	@key points to the key.
- *	No return value.
- * @key_permission:
- *	See whether a specific operational right is granted to a process on a
- *	key.
- *	@key_ref refers to the key (key pointer + possession attribute bit).
- *	@cred points to the credentials to provide the context against which to
- *	evaluate the security data on the key.
- *	@perm describes the combination of permissions required of this key.
- *	Return 0 if permission is granted, -ve error otherwise.
- * @key_getsecurity:
- *	Get a textual representation of the security context attached to a key
- *	for the purposes of honouring KEYCTL_GETSECURITY.  This function
- *	allocates the storage for the NUL-terminated string and the caller
- *	should free it.
- *	@key points to the key to be queried.
- *	@_buffer points to a pointer that should be set to point to the
- *	resulting string (if no label or an error occurs).
- *	Return the length of the string (including terminating NUL) or -ve if
- *	an error.
- *	May also return 0 (and a NULL buffer pointer) if there is no label.
- *
- * Security hooks affecting all System V IPC operations.
- *
- * @ipc_permission:
- *	Check permissions for access to IPC
- *	@ipcp contains the kernel IPC permission structure
- *	@flag contains the desired (requested) permission set
- *	Return 0 if permission is granted.
- * @ipc_getsecid:
- *	Get the secid associated with the ipc object.
- *	@ipcp contains the kernel IPC permission structure.
- *	@secid contains a pointer to the location where result will be saved.
- *	In case of failure, @secid will be set to zero.
- *
- * Security hooks for individual messages held in System V IPC message queues
- * @msg_msg_alloc_security:
- *	Allocate and attach a security structure to the msg->security field.
- *	The security field is initialized to NULL when the structure is first
- *	created.
- *	@msg contains the message structure to be modified.
- *	Return 0 if operation was successful and permission is granted.
- * @msg_msg_free_security:
- *	Deallocate the security structure for this message.
- *	@msg contains the message structure to be modified.
- *
- * Security hooks for System V IPC Message Queues
- *
- * @msg_queue_alloc_security:
- *	Allocate and attach a security structure to the
- *	msq->q_perm.security field. The security field is initialized to
- *	NULL when the structure is first created.
- *	@msq contains the message queue structure to be modified.
- *	Return 0 if operation was successful and permission is granted.
- * @msg_queue_free_security:
- *	Deallocate security structure for this message queue.
- *	@msq contains the message queue structure to be modified.
- * @msg_queue_associate:
- *	Check permission when a message queue is requested through the
- *	msgget system call.  This hook is only called when returning the
- *	message queue identifier for an existing message queue, not when a
- *	new message queue is created.
- *	@msq contains the message queue to act upon.
- *	@msqflg contains the operation control flags.
- *	Return 0 if permission is granted.
- * @msg_queue_msgctl:
- *	Check permission when a message control operation specified by @cmd
- *	is to be performed on the message queue @msq.
- *	The @msq may be NULL, e.g. for IPC_INFO or MSG_INFO.
- *	@msq contains the message queue to act upon.  May be NULL.
- *	@cmd contains the operation to be performed.
- *	Return 0 if permission is granted.
- * @msg_queue_msgsnd:
- *	Check permission before a message, @msg, is enqueued on the message
- *	queue, @msq.
- *	@msq contains the message queue to send message to.
- *	@msg contains the message to be enqueued.
- *	@msqflg contains operational flags.
- *	Return 0 if permission is granted.
- * @msg_queue_msgrcv:
- *	Check permission before a message, @msg, is removed from the message
- *	queue, @msq.  The @target task structure contains a pointer to the
- *	process that will be receiving the message (not equal to the current
- *	process when inline receives are being performed).
- *	@msq contains the message queue to retrieve message from.
- *	@msg contains the message destination.
- *	@target contains the task structure for recipient process.
- *	@type contains the type of message requested.
- *	@mode contains the operational flags.
- *	Return 0 if permission is granted.
- *
- * Security hooks for System V Shared Memory Segments
- *
- * @shm_alloc_security:
- *	Allocate and attach a security structure to the shp->shm_perm.security
- *	field.  The security field is initialized to NULL when the structure is
- *	first created.
- *	@shp contains the shared memory structure to be modified.
- *	Return 0 if operation was successful and permission is granted.
- * @shm_free_security:
- *	Deallocate the security struct for this memory segment.
- *	@shp contains the shared memory structure to be modified.
- * @shm_associate:
- *	Check permission when a shared memory region is requested through the
- *	shmget system call.  This hook is only called when returning the shared
- *	memory region identifier for an existing region, not when a new shared
- *	memory region is created.
- *	@shp contains the shared memory structure to be modified.
- *	@shmflg contains the operation control flags.
- *	Return 0 if permission is granted.
- * @shm_shmctl:
- *	Check permission when a shared memory control operation specified by
- *	@cmd is to be performed on the shared memory region @shp.
- *	The @shp may be NULL, e.g. for IPC_INFO or SHM_INFO.
- *	@shp contains shared memory structure to be modified.
- *	@cmd contains the operation to be performed.
- *	Return 0 if permission is granted.
- * @shm_shmat:
- *	Check permissions prior to allowing the shmat system call to attach the
- *	shared memory segment @shp to the data segment of the calling process.
- *	The attaching address is specified by @shmaddr.
- *	@shp contains the shared memory structure to be modified.
- *	@shmaddr contains the address to attach memory region to.
- *	@shmflg contains the operational flags.
- *	Return 0 if permission is granted.
- *
- * Security hooks for System V Semaphores
- *
- * @sem_alloc_security:
- *	Allocate and attach a security structure to the sma->sem_perm.security
- *	field.  The security field is initialized to NULL when the structure is
- *	first created.
- *	@sma contains the semaphore structure
- *	Return 0 if operation was successful and permission is granted.
- * @sem_free_security:
- *	deallocate security struct for this semaphore
- *	@sma contains the semaphore structure.
- * @sem_associate:
- *	Check permission when a semaphore is requested through the semget
- *	system call.  This hook is only called when returning the semaphore
- *	identifier for an existing semaphore, not when a new one must be
- *	created.
- *	@sma contains the semaphore structure.
- *	@semflg contains the operation control flags.
- *	Return 0 if permission is granted.
- * @sem_semctl:
- *	Check permission when a semaphore operation specified by @cmd is to be
- *	performed on the semaphore @sma.  The @sma may be NULL, e.g. for
- *	IPC_INFO or SEM_INFO.
- *	@sma contains the semaphore structure.  May be NULL.
- *	@cmd contains the operation to be performed.
- *	Return 0 if permission is granted.
- * @sem_semop
- *	Check permissions before performing operations on members of the
- *	semaphore set @sma.  If the @alter flag is nonzero, the semaphore set
- *	may be modified.
- *	@sma contains the semaphore structure.
- *	@sops contains the operations to perform.
- *	@nsops contains the number of operations to perform.
- *	@alter contains the flag indicating whether changes are to be made.
- *	Return 0 if permission is granted.
- *
- * @binder_set_context_mgr
- *	Check whether @mgr is allowed to be the binder context manager.
- *	@mgr contains the task_struct for the task being registered.
- *	Return 0 if permission is granted.
- * @binder_transaction
- *	Check whether @from is allowed to invoke a binder transaction call
- *	to @to.
- *	@from contains the task_struct for the sending task.
- *	@to contains the task_struct for the receiving task.
- * @binder_transfer_binder
- *	Check whether @from is allowed to transfer a binder reference to @to.
- *	@from contains the task_struct for the sending task.
- *	@to contains the task_struct for the receiving task.
- * @binder_transfer_file
- *	Check whether @from is allowed to transfer @file to @to.
- *	@from contains the task_struct for the sending task.
- *	@file contains the struct file being transferred.
- *	@to contains the task_struct for the receiving task.
- *
- * @ptrace_access_check:
- *	Check permission before allowing the current process to trace the
- *	@child process.
- *	Security modules may also want to perform a process tracing check
- *	during an execve in the set_security or apply_creds hooks of
- *	tracing check during an execve in the bprm_set_creds hook of
- *	binprm_security_ops if the process is being traced and its security
- *	attributes would be changed by the execve.
- *	@child contains the task_struct structure for the target process.
- *	@mode contains the PTRACE_MODE flags indicating the form of access.
- *	Return 0 if permission is granted.
- * @ptrace_traceme:
- *	Check that the @parent process has sufficient permission to trace the
- *	current process before allowing the current process to present itself
- *	to the @parent process for tracing.
- *	@parent contains the task_struct structure for debugger process.
- *	Return 0 if permission is granted.
- * @capget:
- *	Get the @effective, @inheritable, and @permitted capability sets for
- *	the @target process.  The hook may also perform permission checking to
- *	determine if the current process is allowed to see the capability sets
- *	of the @target process.
- *	@target contains the task_struct structure for target process.
- *	@effective contains the effective capability set.
- *	@inheritable contains the inheritable capability set.
- *	@permitted contains the permitted capability set.
- *	Return 0 if the capability sets were successfully obtained.
- * @capset:
- *	Set the @effective, @inheritable, and @permitted capability sets for
- *	the current process.
- *	@new contains the new credentials structure for target process.
- *	@old contains the current credentials structure for target process.
- *	@effective contains the effective capability set.
- *	@inheritable contains the inheritable capability set.
- *	@permitted contains the permitted capability set.
- *	Return 0 and update @new if permission is granted.
- * @capable:
- *	Check whether the @tsk process has the @cap capability in the indicated
- *	credentials.
- *	@cred contains the credentials to use.
- *	@ns contains the user namespace we want the capability in
- *	@cap contains the capability <include/linux/capability.h>.
- *	@audit: Whether to write an audit message or not
- *	Return 0 if the capability is granted for @tsk.
- * @syslog:
- *	Check permission before accessing the kernel message ring or changing
- *	logging to the console.
- *	See the syslog(2) manual page for an explanation of the @type values.
- *	@type contains the type of action.
- *	@from_file indicates the context of action (if it came from /proc).
- *	Return 0 if permission is granted.
- * @settime:
- *	Check permission to change the system time.
- *	struct timespec and timezone are defined in include/linux/time.h
- *	@ts contains new time
- *	@tz contains new timezone
- *	Return 0 if permission is granted.
- * @vm_enough_memory:
- *	Check permissions for allocating a new virtual mapping.
- *	@mm contains the mm struct it is being added to.
- *	@pages contains the number of pages.
- *	Return 0 if permission is granted.
- *
- * @ismaclabel:
- *	Check if the extended attribute specified by @name
- *	represents a MAC label. Returns 1 if name is a MAC
- *	attribute otherwise returns 0.
- *	@name full extended attribute name to check against
- *	LSM as a MAC label.
- *
- * @secid_to_secctx:
- *	Convert secid to security context.  If secdata is NULL the length of
- *	the result will be returned in seclen, but no secdata will be returned.
- *	This does mean that the length could change between calls to check the
- *	length and the next call which actually allocates and returns the secdata.
- *	@secid contains the security ID.
- *	@secdata contains the pointer that stores the converted security context.
- *	@seclen pointer which contains the length of the data
- * @secctx_to_secid:
- *	Convert security context to secid.
- *	@secid contains the pointer to the generated security ID.
- *	@secdata contains the security context.
- *
- * @release_secctx:
- *	Release the security context.
- *	@secdata contains the security context.
- *	@seclen contains the length of the security context.
- *
- * Security hooks for Audit
- *
- * @audit_rule_init:
- *	Allocate and initialize an LSM audit rule structure.
- *	@field contains the required Audit action. Fields flags are defined in include/linux/audit.h
- *	@op contains the operator the rule uses.
- *	@rulestr contains the context where the rule will be applied to.
- *	@lsmrule contains a pointer to receive the result.
- *	Return 0 if @lsmrule has been successfully set,
- *	-EINVAL in case of an invalid rule.
- *
- * @audit_rule_known:
- *	Specifies whether given @rule contains any fields related to current LSM.
- *	@rule contains the audit rule of interest.
- *	Return 1 in case of relation found, 0 otherwise.
- *
- * @audit_rule_match:
- *	Determine if given @secid matches a rule previously approved
- *	by @audit_rule_known.
- *	@secid contains the security id in question.
- *	@field contains the field which relates to current LSM.
- *	@op contains the operator that will be used for matching.
- *	@rule points to the audit rule that will be checked against.
- *	@actx points to the audit context associated with the check.
- *	Return 1 if secid matches the rule, 0 if it does not, -ERRNO on failure.
- *
- * @audit_rule_free:
- *	Deallocate the LSM audit rule structure previously allocated by
- *	audit_rule_init.
- *	@rule contains the allocated rule
- *
- * @inode_notifysecctx:
- *	Notify the security module of what the security context of an inode
- *	should be.  Initializes the incore security context managed by the
- *	security module for this inode.  Example usage:  NFS client invokes
- *	this hook to initialize the security context in its incore inode to the
- *	value provided by the server for the file when the server returned the
- *	file's attributes to the client.
- *
- * 	Must be called with inode->i_mutex locked.
- *
- * 	@inode we wish to set the security context of.
- * 	@ctx contains the string which we wish to set in the inode.
- * 	@ctxlen contains the length of @ctx.
- *
- * @inode_setsecctx:
- * 	Change the security context of an inode.  Updates the
- * 	incore security context managed by the security module and invokes the
- * 	fs code as needed (via __vfs_setxattr_noperm) to update any backing
- * 	xattrs that represent the context.  Example usage:  NFS server invokes
- * 	this hook to change the security context in its incore inode and on the
- * 	backing filesystem to a value provided by the client on a SETATTR
- * 	operation.
- *
- * 	Must be called with inode->i_mutex locked.
- *
- * 	@dentry contains the inode we wish to set the security context of.
- * 	@ctx contains the string which we wish to set in the inode.
- * 	@ctxlen contains the length of @ctx.
- *
- * @inode_getsecctx:
- *	On success, returns 0 and fills out @ctx and @ctxlen with the security
- *	context for the given @inode.
- *
- * 	@inode we wish to get the security context of.
- *	@ctx is a pointer in which to place the allocated security context.
- *	@ctxlen points to the place to put the length of @ctx.
- * This is the main security structure.
- */
-
 /* prototypes */
 extern int security_init(void);
 
-- 
cgit v1.2.3


From e20b043a6902ecb61c2c84355c3bae5149f391db Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Sat, 2 May 2015 15:11:36 -0700
Subject: LSM: Add security module hook list heads

Add a list header for each security hook. They aren't used until
later in the patch series. They are grouped together in a structure
so that there doesn't need to be an external address for each.

Macro-ize the initialization of the security_operations
for each security module in anticipation of changing out
the security_operations structure.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/lsm_hooks.h  | 220 +++++++++++++++++++++++++
 security/apparmor/lsm.c    |  84 +++++-----
 security/selinux/hooks.c   | 398 +++++++++++++++++++++++----------------------
 security/smack/smack_lsm.c | 258 ++++++++++++++---------------
 security/tomoyo/tomoyo.c   |  58 +++----
 security/yama/yama_lsm.c   |  10 +-
 6 files changed, 626 insertions(+), 402 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index b4c91de510c2..27dd6fcacccc 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1626,6 +1626,226 @@ struct security_operations {
 #endif /* CONFIG_AUDIT */
 };
 
+struct security_hook_heads {
+	struct list_head binder_set_context_mgr;
+	struct list_head binder_transaction;
+	struct list_head binder_transfer_binder;
+	struct list_head binder_transfer_file;
+	struct list_head ptrace_access_check;
+	struct list_head ptrace_traceme;
+	struct list_head capget;
+	struct list_head capset;
+	struct list_head capable;
+	struct list_head quotactl;
+	struct list_head quota_on;
+	struct list_head syslog;
+	struct list_head settime;
+	struct list_head vm_enough_memory;
+	struct list_head bprm_set_creds;
+	struct list_head bprm_check_security;
+	struct list_head bprm_secureexec;
+	struct list_head bprm_committing_creds;
+	struct list_head bprm_committed_creds;
+	struct list_head sb_alloc_security;
+	struct list_head sb_free_security;
+	struct list_head sb_copy_data;
+	struct list_head sb_remount;
+	struct list_head sb_kern_mount;
+	struct list_head sb_show_options;
+	struct list_head sb_statfs;
+	struct list_head sb_mount;
+	struct list_head sb_umount;
+	struct list_head sb_pivotroot;
+	struct list_head sb_set_mnt_opts;
+	struct list_head sb_clone_mnt_opts;
+	struct list_head sb_parse_opts_str;
+	struct list_head dentry_init_security;
+#ifdef CONFIG_SECURITY_PATH
+	struct list_head path_unlink;
+	struct list_head path_mkdir;
+	struct list_head path_rmdir;
+	struct list_head path_mknod;
+	struct list_head path_truncate;
+	struct list_head path_symlink;
+	struct list_head path_link;
+	struct list_head path_rename;
+	struct list_head path_chmod;
+	struct list_head path_chown;
+	struct list_head path_chroot;
+#endif
+	struct list_head inode_alloc_security;
+	struct list_head inode_free_security;
+	struct list_head inode_init_security;
+	struct list_head inode_create;
+	struct list_head inode_link;
+	struct list_head inode_unlink;
+	struct list_head inode_symlink;
+	struct list_head inode_mkdir;
+	struct list_head inode_rmdir;
+	struct list_head inode_mknod;
+	struct list_head inode_rename;
+	struct list_head inode_readlink;
+	struct list_head inode_follow_link;
+	struct list_head inode_permission;
+	struct list_head inode_setattr;
+	struct list_head inode_getattr;
+	struct list_head inode_setxattr;
+	struct list_head inode_post_setxattr;
+	struct list_head inode_getxattr;
+	struct list_head inode_listxattr;
+	struct list_head inode_removexattr;
+	struct list_head inode_need_killpriv;
+	struct list_head inode_killpriv;
+	struct list_head inode_getsecurity;
+	struct list_head inode_setsecurity;
+	struct list_head inode_listsecurity;
+	struct list_head inode_getsecid;
+	struct list_head file_permission;
+	struct list_head file_alloc_security;
+	struct list_head file_free_security;
+	struct list_head file_ioctl;
+	struct list_head mmap_addr;
+	struct list_head mmap_file;
+	struct list_head file_mprotect;
+	struct list_head file_lock;
+	struct list_head file_fcntl;
+	struct list_head file_set_fowner;
+	struct list_head file_send_sigiotask;
+	struct list_head file_receive;
+	struct list_head file_open;
+	struct list_head task_create;
+	struct list_head task_free;
+	struct list_head cred_alloc_blank;
+	struct list_head cred_free;
+	struct list_head cred_prepare;
+	struct list_head cred_transfer;
+	struct list_head kernel_act_as;
+	struct list_head kernel_create_files_as;
+	struct list_head kernel_fw_from_file;
+	struct list_head kernel_module_request;
+	struct list_head kernel_module_from_file;
+	struct list_head task_fix_setuid;
+	struct list_head task_setpgid;
+	struct list_head task_getpgid;
+	struct list_head task_getsid;
+	struct list_head task_getsecid;
+	struct list_head task_setnice;
+	struct list_head task_setioprio;
+	struct list_head task_getioprio;
+	struct list_head task_setrlimit;
+	struct list_head task_setscheduler;
+	struct list_head task_getscheduler;
+	struct list_head task_movememory;
+	struct list_head task_kill;
+	struct list_head task_wait;
+	struct list_head task_prctl;
+	struct list_head task_to_inode;
+	struct list_head ipc_permission;
+	struct list_head ipc_getsecid;
+	struct list_head msg_msg_alloc_security;
+	struct list_head msg_msg_free_security;
+	struct list_head msg_queue_alloc_security;
+	struct list_head msg_queue_free_security;
+	struct list_head msg_queue_associate;
+	struct list_head msg_queue_msgctl;
+	struct list_head msg_queue_msgsnd;
+	struct list_head msg_queue_msgrcv;
+	struct list_head shm_alloc_security;
+	struct list_head shm_free_security;
+	struct list_head shm_associate;
+	struct list_head shm_shmctl;
+	struct list_head shm_shmat;
+	struct list_head sem_alloc_security;
+	struct list_head sem_free_security;
+	struct list_head sem_associate;
+	struct list_head sem_semctl;
+	struct list_head sem_semop;
+	struct list_head netlink_send;
+	struct list_head d_instantiate;
+	struct list_head getprocattr;
+	struct list_head setprocattr;
+	struct list_head ismaclabel;
+	struct list_head secid_to_secctx;
+	struct list_head secctx_to_secid;
+	struct list_head release_secctx;
+	struct list_head inode_notifysecctx;
+	struct list_head inode_setsecctx;
+	struct list_head inode_getsecctx;
+#ifdef CONFIG_SECURITY_NETWORK
+	struct list_head unix_stream_connect;
+	struct list_head unix_may_send;
+	struct list_head socket_create;
+	struct list_head socket_post_create;
+	struct list_head socket_bind;
+	struct list_head socket_connect;
+	struct list_head socket_listen;
+	struct list_head socket_accept;
+	struct list_head socket_sendmsg;
+	struct list_head socket_recvmsg;
+	struct list_head socket_getsockname;
+	struct list_head socket_getpeername;
+	struct list_head socket_getsockopt;
+	struct list_head socket_setsockopt;
+	struct list_head socket_shutdown;
+	struct list_head socket_sock_rcv_skb;
+	struct list_head socket_getpeersec_stream;
+	struct list_head socket_getpeersec_dgram;
+	struct list_head sk_alloc_security;
+	struct list_head sk_free_security;
+	struct list_head sk_clone_security;
+	struct list_head sk_getsecid;
+	struct list_head sock_graft;
+	struct list_head inet_conn_request;
+	struct list_head inet_csk_clone;
+	struct list_head inet_conn_established;
+	struct list_head secmark_relabel_packet;
+	struct list_head secmark_refcount_inc;
+	struct list_head secmark_refcount_dec;
+	struct list_head req_classify_flow;
+	struct list_head tun_dev_alloc_security;
+	struct list_head tun_dev_free_security;
+	struct list_head tun_dev_create;
+	struct list_head tun_dev_attach_queue;
+	struct list_head tun_dev_attach;
+	struct list_head tun_dev_open;
+	struct list_head skb_owned_by;
+#endif	/* CONFIG_SECURITY_NETWORK */
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+	struct list_head xfrm_policy_alloc_security;
+	struct list_head xfrm_policy_clone_security;
+	struct list_head xfrm_policy_free_security;
+	struct list_head xfrm_policy_delete_security;
+	struct list_head xfrm_state_alloc;
+	struct list_head xfrm_state_alloc_acquire;
+	struct list_head xfrm_state_free_security;
+	struct list_head xfrm_state_delete_security;
+	struct list_head xfrm_policy_lookup;
+	struct list_head xfrm_state_pol_flow_match;
+	struct list_head xfrm_decode_session;
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
+#ifdef CONFIG_KEYS
+	struct list_head key_alloc;
+	struct list_head key_free;
+	struct list_head key_permission;
+	struct list_head key_getsecurity;
+#endif	/* CONFIG_KEYS */
+#ifdef CONFIG_AUDIT
+	struct list_head audit_rule_init;
+	struct list_head audit_rule_known;
+	struct list_head audit_rule_match;
+	struct list_head audit_rule_free;
+#endif /* CONFIG_AUDIT */
+};
+
+/*
+ * Initializing a security_hook_list structure takes
+ * up a lot of space in a source file. This macro takes
+ * care of the common case and reduces the amount of
+ * text involved.
+ * Casey says: Comment is true in the next patch.
+ */
+#define LSM_HOOK_INIT(HEAD, HOOK)	.HEAD = HOOK
+
 /* prototypes */
 extern int security_module_enable(struct security_operations *ops);
 extern int register_security(struct security_operations *ops);
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index fead41bd0440..f54253258fb8 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -616,48 +616,48 @@ static int apparmor_task_setrlimit(struct task_struct *task,
 }
 
 static struct security_operations apparmor_ops = {
-	.name =				"apparmor",
-
-	.ptrace_access_check =		apparmor_ptrace_access_check,
-	.ptrace_traceme =		apparmor_ptrace_traceme,
-	.capget =			apparmor_capget,
-	.capable =			apparmor_capable,
-
-	.path_link =			apparmor_path_link,
-	.path_unlink =			apparmor_path_unlink,
-	.path_symlink =			apparmor_path_symlink,
-	.path_mkdir =			apparmor_path_mkdir,
-	.path_rmdir =			apparmor_path_rmdir,
-	.path_mknod =			apparmor_path_mknod,
-	.path_rename =			apparmor_path_rename,
-	.path_chmod =			apparmor_path_chmod,
-	.path_chown =			apparmor_path_chown,
-	.path_truncate =		apparmor_path_truncate,
-	.inode_getattr =                apparmor_inode_getattr,
-
-	.file_open =			apparmor_file_open,
-	.file_permission =		apparmor_file_permission,
-	.file_alloc_security =		apparmor_file_alloc_security,
-	.file_free_security =		apparmor_file_free_security,
-	.mmap_file =			apparmor_mmap_file,
-	.mmap_addr =			cap_mmap_addr,
-	.file_mprotect =		apparmor_file_mprotect,
-	.file_lock =			apparmor_file_lock,
-
-	.getprocattr =			apparmor_getprocattr,
-	.setprocattr =			apparmor_setprocattr,
-
-	.cred_alloc_blank =		apparmor_cred_alloc_blank,
-	.cred_free =			apparmor_cred_free,
-	.cred_prepare =			apparmor_cred_prepare,
-	.cred_transfer =		apparmor_cred_transfer,
-
-	.bprm_set_creds =		apparmor_bprm_set_creds,
-	.bprm_committing_creds =	apparmor_bprm_committing_creds,
-	.bprm_committed_creds =		apparmor_bprm_committed_creds,
-	.bprm_secureexec =		apparmor_bprm_secureexec,
-
-	.task_setrlimit =		apparmor_task_setrlimit,
+	LSM_HOOK_INIT(name, "apparmor"),
+
+	LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check),
+	LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme),
+	LSM_HOOK_INIT(capget, apparmor_capget),
+	LSM_HOOK_INIT(capable, apparmor_capable),
+
+	LSM_HOOK_INIT(path_link, apparmor_path_link),
+	LSM_HOOK_INIT(path_unlink, apparmor_path_unlink),
+	LSM_HOOK_INIT(path_symlink, apparmor_path_symlink),
+	LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir),
+	LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir),
+	LSM_HOOK_INIT(path_mknod, apparmor_path_mknod),
+	LSM_HOOK_INIT(path_rename, apparmor_path_rename),
+	LSM_HOOK_INIT(path_chmod, apparmor_path_chmod),
+	LSM_HOOK_INIT(path_chown, apparmor_path_chown),
+	LSM_HOOK_INIT(path_truncate, apparmor_path_truncate),
+	LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr),
+
+	LSM_HOOK_INIT(file_open, apparmor_file_open),
+	LSM_HOOK_INIT(file_permission, apparmor_file_permission),
+	LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security),
+	LSM_HOOK_INIT(file_free_security, apparmor_file_free_security),
+	LSM_HOOK_INIT(mmap_file, apparmor_mmap_file),
+	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
+	LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect),
+	LSM_HOOK_INIT(file_lock, apparmor_file_lock),
+
+	LSM_HOOK_INIT(getprocattr, apparmor_getprocattr),
+	LSM_HOOK_INIT(setprocattr, apparmor_setprocattr),
+
+	LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank),
+	LSM_HOOK_INIT(cred_free, apparmor_cred_free),
+	LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare),
+	LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer),
+
+	LSM_HOOK_INIT(bprm_set_creds, apparmor_bprm_set_creds),
+	LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds),
+	LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds),
+	LSM_HOOK_INIT(bprm_secureexec, apparmor_bprm_secureexec),
+
+	LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit),
 };
 
 /*
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 40e3f7757ec7..0cf105f346d4 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -5841,211 +5841,215 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
 #endif
 
 static struct security_operations selinux_ops = {
-	.name =				"selinux",
-
-	.binder_set_context_mgr =	selinux_binder_set_context_mgr,
-	.binder_transaction =		selinux_binder_transaction,
-	.binder_transfer_binder =	selinux_binder_transfer_binder,
-	.binder_transfer_file =		selinux_binder_transfer_file,
-
-	.ptrace_access_check =		selinux_ptrace_access_check,
-	.ptrace_traceme =		selinux_ptrace_traceme,
-	.capget =			selinux_capget,
-	.capset =			selinux_capset,
-	.capable =			selinux_capable,
-	.quotactl =			selinux_quotactl,
-	.quota_on =			selinux_quota_on,
-	.syslog =			selinux_syslog,
-	.vm_enough_memory =		selinux_vm_enough_memory,
-
-	.netlink_send =			selinux_netlink_send,
-
-	.bprm_set_creds =		selinux_bprm_set_creds,
-	.bprm_committing_creds =	selinux_bprm_committing_creds,
-	.bprm_committed_creds =		selinux_bprm_committed_creds,
-	.bprm_secureexec =		selinux_bprm_secureexec,
-
-	.sb_alloc_security =		selinux_sb_alloc_security,
-	.sb_free_security =		selinux_sb_free_security,
-	.sb_copy_data =			selinux_sb_copy_data,
-	.sb_remount =			selinux_sb_remount,
-	.sb_kern_mount =		selinux_sb_kern_mount,
-	.sb_show_options =		selinux_sb_show_options,
-	.sb_statfs =			selinux_sb_statfs,
-	.sb_mount =			selinux_mount,
-	.sb_umount =			selinux_umount,
-	.sb_set_mnt_opts =		selinux_set_mnt_opts,
-	.sb_clone_mnt_opts =		selinux_sb_clone_mnt_opts,
-	.sb_parse_opts_str = 		selinux_parse_opts_str,
-
-	.dentry_init_security =		selinux_dentry_init_security,
-
-	.inode_alloc_security =		selinux_inode_alloc_security,
-	.inode_free_security =		selinux_inode_free_security,
-	.inode_init_security =		selinux_inode_init_security,
-	.inode_create =			selinux_inode_create,
-	.inode_link =			selinux_inode_link,
-	.inode_unlink =			selinux_inode_unlink,
-	.inode_symlink =		selinux_inode_symlink,
-	.inode_mkdir =			selinux_inode_mkdir,
-	.inode_rmdir =			selinux_inode_rmdir,
-	.inode_mknod =			selinux_inode_mknod,
-	.inode_rename =			selinux_inode_rename,
-	.inode_readlink =		selinux_inode_readlink,
-	.inode_follow_link =		selinux_inode_follow_link,
-	.inode_permission =		selinux_inode_permission,
-	.inode_setattr =		selinux_inode_setattr,
-	.inode_getattr =		selinux_inode_getattr,
-	.inode_setxattr =		selinux_inode_setxattr,
-	.inode_post_setxattr =		selinux_inode_post_setxattr,
-	.inode_getxattr =		selinux_inode_getxattr,
-	.inode_listxattr =		selinux_inode_listxattr,
-	.inode_removexattr =		selinux_inode_removexattr,
-	.inode_getsecurity =		selinux_inode_getsecurity,
-	.inode_setsecurity =		selinux_inode_setsecurity,
-	.inode_listsecurity =		selinux_inode_listsecurity,
-	.inode_getsecid =		selinux_inode_getsecid,
-
-	.file_permission =		selinux_file_permission,
-	.file_alloc_security =		selinux_file_alloc_security,
-	.file_free_security =		selinux_file_free_security,
-	.file_ioctl =			selinux_file_ioctl,
-	.mmap_file =			selinux_mmap_file,
-	.mmap_addr =			selinux_mmap_addr,
-	.file_mprotect =		selinux_file_mprotect,
-	.file_lock =			selinux_file_lock,
-	.file_fcntl =			selinux_file_fcntl,
-	.file_set_fowner =		selinux_file_set_fowner,
-	.file_send_sigiotask =		selinux_file_send_sigiotask,
-	.file_receive =			selinux_file_receive,
-
-	.file_open =			selinux_file_open,
-
-	.task_create =			selinux_task_create,
-	.cred_alloc_blank =		selinux_cred_alloc_blank,
-	.cred_free =			selinux_cred_free,
-	.cred_prepare =			selinux_cred_prepare,
-	.cred_transfer =		selinux_cred_transfer,
-	.kernel_act_as =		selinux_kernel_act_as,
-	.kernel_create_files_as =	selinux_kernel_create_files_as,
-	.kernel_module_request =	selinux_kernel_module_request,
-	.task_setpgid =			selinux_task_setpgid,
-	.task_getpgid =			selinux_task_getpgid,
-	.task_getsid =			selinux_task_getsid,
-	.task_getsecid =		selinux_task_getsecid,
-	.task_setnice =			selinux_task_setnice,
-	.task_setioprio =		selinux_task_setioprio,
-	.task_getioprio =		selinux_task_getioprio,
-	.task_setrlimit =		selinux_task_setrlimit,
-	.task_setscheduler =		selinux_task_setscheduler,
-	.task_getscheduler =		selinux_task_getscheduler,
-	.task_movememory =		selinux_task_movememory,
-	.task_kill =			selinux_task_kill,
-	.task_wait =			selinux_task_wait,
-	.task_to_inode =		selinux_task_to_inode,
-
-	.ipc_permission =		selinux_ipc_permission,
-	.ipc_getsecid =			selinux_ipc_getsecid,
-
-	.msg_msg_alloc_security =	selinux_msg_msg_alloc_security,
-	.msg_msg_free_security =	selinux_msg_msg_free_security,
-
-	.msg_queue_alloc_security =	selinux_msg_queue_alloc_security,
-	.msg_queue_free_security =	selinux_msg_queue_free_security,
-	.msg_queue_associate =		selinux_msg_queue_associate,
-	.msg_queue_msgctl =		selinux_msg_queue_msgctl,
-	.msg_queue_msgsnd =		selinux_msg_queue_msgsnd,
-	.msg_queue_msgrcv =		selinux_msg_queue_msgrcv,
-
-	.shm_alloc_security =		selinux_shm_alloc_security,
-	.shm_free_security =		selinux_shm_free_security,
-	.shm_associate =		selinux_shm_associate,
-	.shm_shmctl =			selinux_shm_shmctl,
-	.shm_shmat =			selinux_shm_shmat,
-
-	.sem_alloc_security =		selinux_sem_alloc_security,
-	.sem_free_security =		selinux_sem_free_security,
-	.sem_associate =		selinux_sem_associate,
-	.sem_semctl =			selinux_sem_semctl,
-	.sem_semop =			selinux_sem_semop,
-
-	.d_instantiate =		selinux_d_instantiate,
-
-	.getprocattr =			selinux_getprocattr,
-	.setprocattr =			selinux_setprocattr,
-
-	.ismaclabel =			selinux_ismaclabel,
-	.secid_to_secctx =		selinux_secid_to_secctx,
-	.secctx_to_secid =		selinux_secctx_to_secid,
-	.release_secctx =		selinux_release_secctx,
-	.inode_notifysecctx =		selinux_inode_notifysecctx,
-	.inode_setsecctx =		selinux_inode_setsecctx,
-	.inode_getsecctx =		selinux_inode_getsecctx,
-
-	.unix_stream_connect =		selinux_socket_unix_stream_connect,
-	.unix_may_send =		selinux_socket_unix_may_send,
-
-	.socket_create =		selinux_socket_create,
-	.socket_post_create =		selinux_socket_post_create,
-	.socket_bind =			selinux_socket_bind,
-	.socket_connect =		selinux_socket_connect,
-	.socket_listen =		selinux_socket_listen,
-	.socket_accept =		selinux_socket_accept,
-	.socket_sendmsg =		selinux_socket_sendmsg,
-	.socket_recvmsg =		selinux_socket_recvmsg,
-	.socket_getsockname =		selinux_socket_getsockname,
-	.socket_getpeername =		selinux_socket_getpeername,
-	.socket_getsockopt =		selinux_socket_getsockopt,
-	.socket_setsockopt =		selinux_socket_setsockopt,
-	.socket_shutdown =		selinux_socket_shutdown,
-	.socket_sock_rcv_skb =		selinux_socket_sock_rcv_skb,
-	.socket_getpeersec_stream =	selinux_socket_getpeersec_stream,
-	.socket_getpeersec_dgram =	selinux_socket_getpeersec_dgram,
-	.sk_alloc_security =		selinux_sk_alloc_security,
-	.sk_free_security =		selinux_sk_free_security,
-	.sk_clone_security =		selinux_sk_clone_security,
-	.sk_getsecid =			selinux_sk_getsecid,
-	.sock_graft =			selinux_sock_graft,
-	.inet_conn_request =		selinux_inet_conn_request,
-	.inet_csk_clone =		selinux_inet_csk_clone,
-	.inet_conn_established =	selinux_inet_conn_established,
-	.secmark_relabel_packet =	selinux_secmark_relabel_packet,
-	.secmark_refcount_inc =		selinux_secmark_refcount_inc,
-	.secmark_refcount_dec =		selinux_secmark_refcount_dec,
-	.req_classify_flow =		selinux_req_classify_flow,
-	.tun_dev_alloc_security =	selinux_tun_dev_alloc_security,
-	.tun_dev_free_security =	selinux_tun_dev_free_security,
-	.tun_dev_create =		selinux_tun_dev_create,
-	.tun_dev_attach_queue =		selinux_tun_dev_attach_queue,
-	.tun_dev_attach =		selinux_tun_dev_attach,
-	.tun_dev_open =			selinux_tun_dev_open,
+	LSM_HOOK_INIT(name, "selinux"),
+
+	LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
+	LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
+	LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder),
+	LSM_HOOK_INIT(binder_transfer_file, selinux_binder_transfer_file),
+
+	LSM_HOOK_INIT(ptrace_access_check, selinux_ptrace_access_check),
+	LSM_HOOK_INIT(ptrace_traceme, selinux_ptrace_traceme),
+	LSM_HOOK_INIT(capget, selinux_capget),
+	LSM_HOOK_INIT(capset, selinux_capset),
+	LSM_HOOK_INIT(capable, selinux_capable),
+	LSM_HOOK_INIT(quotactl, selinux_quotactl),
+	LSM_HOOK_INIT(quota_on, selinux_quota_on),
+	LSM_HOOK_INIT(syslog, selinux_syslog),
+	LSM_HOOK_INIT(vm_enough_memory, selinux_vm_enough_memory),
+
+	LSM_HOOK_INIT(netlink_send, selinux_netlink_send),
+
+	LSM_HOOK_INIT(bprm_set_creds, selinux_bprm_set_creds),
+	LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
+	LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),
+	LSM_HOOK_INIT(bprm_secureexec, selinux_bprm_secureexec),
+
+	LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security),
+	LSM_HOOK_INIT(sb_free_security, selinux_sb_free_security),
+	LSM_HOOK_INIT(sb_copy_data, selinux_sb_copy_data),
+	LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
+	LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount),
+	LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options),
+	LSM_HOOK_INIT(sb_statfs, selinux_sb_statfs),
+	LSM_HOOK_INIT(sb_mount, selinux_mount),
+	LSM_HOOK_INIT(sb_umount, selinux_umount),
+	LSM_HOOK_INIT(sb_set_mnt_opts, selinux_set_mnt_opts),
+	LSM_HOOK_INIT(sb_clone_mnt_opts, selinux_sb_clone_mnt_opts),
+	LSM_HOOK_INIT(sb_parse_opts_str, selinux_parse_opts_str),
+
+	LSM_HOOK_INIT(dentry_init_security, selinux_dentry_init_security),
+
+	LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security),
+	LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security),
+	LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security),
+	LSM_HOOK_INIT(inode_create, selinux_inode_create),
+	LSM_HOOK_INIT(inode_link, selinux_inode_link),
+	LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink),
+	LSM_HOOK_INIT(inode_symlink, selinux_inode_symlink),
+	LSM_HOOK_INIT(inode_mkdir, selinux_inode_mkdir),
+	LSM_HOOK_INIT(inode_rmdir, selinux_inode_rmdir),
+	LSM_HOOK_INIT(inode_mknod, selinux_inode_mknod),
+	LSM_HOOK_INIT(inode_rename, selinux_inode_rename),
+	LSM_HOOK_INIT(inode_readlink, selinux_inode_readlink),
+	LSM_HOOK_INIT(inode_follow_link, selinux_inode_follow_link),
+	LSM_HOOK_INIT(inode_permission, selinux_inode_permission),
+	LSM_HOOK_INIT(inode_setattr, selinux_inode_setattr),
+	LSM_HOOK_INIT(inode_getattr, selinux_inode_getattr),
+	LSM_HOOK_INIT(inode_setxattr, selinux_inode_setxattr),
+	LSM_HOOK_INIT(inode_post_setxattr, selinux_inode_post_setxattr),
+	LSM_HOOK_INIT(inode_getxattr, selinux_inode_getxattr),
+	LSM_HOOK_INIT(inode_listxattr, selinux_inode_listxattr),
+	LSM_HOOK_INIT(inode_removexattr, selinux_inode_removexattr),
+	LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity),
+	LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity),
+	LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity),
+	LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid),
+
+	LSM_HOOK_INIT(file_permission, selinux_file_permission),
+	LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
+	LSM_HOOK_INIT(file_free_security, selinux_file_free_security),
+	LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
+	LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
+	LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
+	LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
+	LSM_HOOK_INIT(file_lock, selinux_file_lock),
+	LSM_HOOK_INIT(file_fcntl, selinux_file_fcntl),
+	LSM_HOOK_INIT(file_set_fowner, selinux_file_set_fowner),
+	LSM_HOOK_INIT(file_send_sigiotask, selinux_file_send_sigiotask),
+	LSM_HOOK_INIT(file_receive, selinux_file_receive),
+
+	LSM_HOOK_INIT(file_open, selinux_file_open),
+
+	LSM_HOOK_INIT(task_create, selinux_task_create),
+	LSM_HOOK_INIT(cred_alloc_blank, selinux_cred_alloc_blank),
+	LSM_HOOK_INIT(cred_free, selinux_cred_free),
+	LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
+	LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
+	LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as),
+	LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as),
+	LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request),
+	LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid),
+	LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid),
+	LSM_HOOK_INIT(task_getsid, selinux_task_getsid),
+	LSM_HOOK_INIT(task_getsecid, selinux_task_getsecid),
+	LSM_HOOK_INIT(task_setnice, selinux_task_setnice),
+	LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio),
+	LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio),
+	LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit),
+	LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler),
+	LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler),
+	LSM_HOOK_INIT(task_movememory, selinux_task_movememory),
+	LSM_HOOK_INIT(task_kill, selinux_task_kill),
+	LSM_HOOK_INIT(task_wait, selinux_task_wait),
+	LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode),
+
+	LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
+	LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid),
+
+	LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security),
+	LSM_HOOK_INIT(msg_msg_free_security, selinux_msg_msg_free_security),
+
+	LSM_HOOK_INIT(msg_queue_alloc_security,
+			selinux_msg_queue_alloc_security),
+	LSM_HOOK_INIT(msg_queue_free_security, selinux_msg_queue_free_security),
+	LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
+	LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
+	LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd),
+	LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv),
+
+	LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security),
+	LSM_HOOK_INIT(shm_free_security, selinux_shm_free_security),
+	LSM_HOOK_INIT(shm_associate, selinux_shm_associate),
+	LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl),
+	LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat),
+
+	LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
+	LSM_HOOK_INIT(sem_free_security, selinux_sem_free_security),
+	LSM_HOOK_INIT(sem_associate, selinux_sem_associate),
+	LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl),
+	LSM_HOOK_INIT(sem_semop, selinux_sem_semop),
+
+	LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate),
+
+	LSM_HOOK_INIT(getprocattr, selinux_getprocattr),
+	LSM_HOOK_INIT(setprocattr, selinux_setprocattr),
+
+	LSM_HOOK_INIT(ismaclabel, selinux_ismaclabel),
+	LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx),
+	LSM_HOOK_INIT(secctx_to_secid, selinux_secctx_to_secid),
+	LSM_HOOK_INIT(release_secctx, selinux_release_secctx),
+	LSM_HOOK_INIT(inode_notifysecctx, selinux_inode_notifysecctx),
+	LSM_HOOK_INIT(inode_setsecctx, selinux_inode_setsecctx),
+	LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
+
+	LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect),
+	LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send),
+
+	LSM_HOOK_INIT(socket_create, selinux_socket_create),
+	LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create),
+	LSM_HOOK_INIT(socket_bind, selinux_socket_bind),
+	LSM_HOOK_INIT(socket_connect, selinux_socket_connect),
+	LSM_HOOK_INIT(socket_listen, selinux_socket_listen),
+	LSM_HOOK_INIT(socket_accept, selinux_socket_accept),
+	LSM_HOOK_INIT(socket_sendmsg, selinux_socket_sendmsg),
+	LSM_HOOK_INIT(socket_recvmsg, selinux_socket_recvmsg),
+	LSM_HOOK_INIT(socket_getsockname, selinux_socket_getsockname),
+	LSM_HOOK_INIT(socket_getpeername, selinux_socket_getpeername),
+	LSM_HOOK_INIT(socket_getsockopt, selinux_socket_getsockopt),
+	LSM_HOOK_INIT(socket_setsockopt, selinux_socket_setsockopt),
+	LSM_HOOK_INIT(socket_shutdown, selinux_socket_shutdown),
+	LSM_HOOK_INIT(socket_sock_rcv_skb, selinux_socket_sock_rcv_skb),
+	LSM_HOOK_INIT(socket_getpeersec_stream,
+			selinux_socket_getpeersec_stream),
+	LSM_HOOK_INIT(socket_getpeersec_dgram, selinux_socket_getpeersec_dgram),
+	LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security),
+	LSM_HOOK_INIT(sk_free_security, selinux_sk_free_security),
+	LSM_HOOK_INIT(sk_clone_security, selinux_sk_clone_security),
+	LSM_HOOK_INIT(sk_getsecid, selinux_sk_getsecid),
+	LSM_HOOK_INIT(sock_graft, selinux_sock_graft),
+	LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request),
+	LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone),
+	LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established),
+	LSM_HOOK_INIT(secmark_relabel_packet, selinux_secmark_relabel_packet),
+	LSM_HOOK_INIT(secmark_refcount_inc, selinux_secmark_refcount_inc),
+	LSM_HOOK_INIT(secmark_refcount_dec, selinux_secmark_refcount_dec),
+	LSM_HOOK_INIT(req_classify_flow, selinux_req_classify_flow),
+	LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security),
+	LSM_HOOK_INIT(tun_dev_free_security, selinux_tun_dev_free_security),
+	LSM_HOOK_INIT(tun_dev_create, selinux_tun_dev_create),
+	LSM_HOOK_INIT(tun_dev_attach_queue, selinux_tun_dev_attach_queue),
+	LSM_HOOK_INIT(tun_dev_attach, selinux_tun_dev_attach),
+	LSM_HOOK_INIT(tun_dev_open, selinux_tun_dev_open),
 
 #ifdef CONFIG_SECURITY_NETWORK_XFRM
-	.xfrm_policy_alloc_security =	selinux_xfrm_policy_alloc,
-	.xfrm_policy_clone_security =	selinux_xfrm_policy_clone,
-	.xfrm_policy_free_security =	selinux_xfrm_policy_free,
-	.xfrm_policy_delete_security =	selinux_xfrm_policy_delete,
-	.xfrm_state_alloc =		selinux_xfrm_state_alloc,
-	.xfrm_state_alloc_acquire =	selinux_xfrm_state_alloc_acquire,
-	.xfrm_state_free_security =	selinux_xfrm_state_free,
-	.xfrm_state_delete_security =	selinux_xfrm_state_delete,
-	.xfrm_policy_lookup =		selinux_xfrm_policy_lookup,
-	.xfrm_state_pol_flow_match =	selinux_xfrm_state_pol_flow_match,
-	.xfrm_decode_session =		selinux_xfrm_decode_session,
+	LSM_HOOK_INIT(xfrm_policy_alloc_security, selinux_xfrm_policy_alloc),
+	LSM_HOOK_INIT(xfrm_policy_clone_security, selinux_xfrm_policy_clone),
+	LSM_HOOK_INIT(xfrm_policy_free_security, selinux_xfrm_policy_free),
+	LSM_HOOK_INIT(xfrm_policy_delete_security, selinux_xfrm_policy_delete),
+	LSM_HOOK_INIT(xfrm_state_alloc, selinux_xfrm_state_alloc),
+	LSM_HOOK_INIT(xfrm_state_alloc_acquire,
+			selinux_xfrm_state_alloc_acquire),
+	LSM_HOOK_INIT(xfrm_state_free_security, selinux_xfrm_state_free),
+	LSM_HOOK_INIT(xfrm_state_delete_security, selinux_xfrm_state_delete),
+	LSM_HOOK_INIT(xfrm_policy_lookup, selinux_xfrm_policy_lookup),
+	LSM_HOOK_INIT(xfrm_state_pol_flow_match,
+			selinux_xfrm_state_pol_flow_match),
+	LSM_HOOK_INIT(xfrm_decode_session, selinux_xfrm_decode_session),
 #endif
 
 #ifdef CONFIG_KEYS
-	.key_alloc =			selinux_key_alloc,
-	.key_free =			selinux_key_free,
-	.key_permission =		selinux_key_permission,
-	.key_getsecurity =		selinux_key_getsecurity,
+	LSM_HOOK_INIT(key_alloc, selinux_key_alloc),
+	LSM_HOOK_INIT(key_free, selinux_key_free),
+	LSM_HOOK_INIT(key_permission, selinux_key_permission),
+	LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
 #endif
 
 #ifdef CONFIG_AUDIT
-	.audit_rule_init =		selinux_audit_rule_init,
-	.audit_rule_known =		selinux_audit_rule_known,
-	.audit_rule_match =		selinux_audit_rule_match,
-	.audit_rule_free =		selinux_audit_rule_free,
+	LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
+	LSM_HOOK_INIT(audit_rule_known, selinux_audit_rule_known),
+	LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match),
+	LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free),
 #endif
 };
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index b644757886bc..4313bf44c3f0 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4267,146 +4267,146 @@ static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
 }
 
 struct security_operations smack_ops = {
-	.name =				"smack",
-
-	.ptrace_access_check =		smack_ptrace_access_check,
-	.ptrace_traceme =		smack_ptrace_traceme,
-	.syslog = 			smack_syslog,
-
-	.sb_alloc_security = 		smack_sb_alloc_security,
-	.sb_free_security = 		smack_sb_free_security,
-	.sb_copy_data = 		smack_sb_copy_data,
-	.sb_kern_mount = 		smack_sb_kern_mount,
-	.sb_statfs = 			smack_sb_statfs,
-
-	.bprm_set_creds =		smack_bprm_set_creds,
-	.bprm_committing_creds =	smack_bprm_committing_creds,
-	.bprm_secureexec =		smack_bprm_secureexec,
-
-	.inode_alloc_security = 	smack_inode_alloc_security,
-	.inode_free_security = 		smack_inode_free_security,
-	.inode_init_security = 		smack_inode_init_security,
-	.inode_link = 			smack_inode_link,
-	.inode_unlink = 		smack_inode_unlink,
-	.inode_rmdir = 			smack_inode_rmdir,
-	.inode_rename = 		smack_inode_rename,
-	.inode_permission = 		smack_inode_permission,
-	.inode_setattr = 		smack_inode_setattr,
-	.inode_getattr = 		smack_inode_getattr,
-	.inode_setxattr = 		smack_inode_setxattr,
-	.inode_post_setxattr = 		smack_inode_post_setxattr,
-	.inode_getxattr = 		smack_inode_getxattr,
-	.inode_removexattr = 		smack_inode_removexattr,
-	.inode_getsecurity = 		smack_inode_getsecurity,
-	.inode_setsecurity = 		smack_inode_setsecurity,
-	.inode_listsecurity = 		smack_inode_listsecurity,
-	.inode_getsecid =		smack_inode_getsecid,
-
-	.file_permission = 		smack_file_permission,
-	.file_alloc_security = 		smack_file_alloc_security,
-	.file_free_security = 		smack_file_free_security,
-	.file_ioctl = 			smack_file_ioctl,
-	.file_lock = 			smack_file_lock,
-	.file_fcntl = 			smack_file_fcntl,
-	.mmap_file =			smack_mmap_file,
-	.mmap_addr =			cap_mmap_addr,
-	.file_set_fowner = 		smack_file_set_fowner,
-	.file_send_sigiotask = 		smack_file_send_sigiotask,
-	.file_receive = 		smack_file_receive,
-
-	.file_open =			smack_file_open,
-
-	.cred_alloc_blank =		smack_cred_alloc_blank,
-	.cred_free =			smack_cred_free,
-	.cred_prepare =			smack_cred_prepare,
-	.cred_transfer =		smack_cred_transfer,
-	.kernel_act_as =		smack_kernel_act_as,
-	.kernel_create_files_as =	smack_kernel_create_files_as,
-	.task_setpgid = 		smack_task_setpgid,
-	.task_getpgid = 		smack_task_getpgid,
-	.task_getsid = 			smack_task_getsid,
-	.task_getsecid = 		smack_task_getsecid,
-	.task_setnice = 		smack_task_setnice,
-	.task_setioprio = 		smack_task_setioprio,
-	.task_getioprio = 		smack_task_getioprio,
-	.task_setscheduler = 		smack_task_setscheduler,
-	.task_getscheduler = 		smack_task_getscheduler,
-	.task_movememory = 		smack_task_movememory,
-	.task_kill = 			smack_task_kill,
-	.task_wait = 			smack_task_wait,
-	.task_to_inode = 		smack_task_to_inode,
-
-	.ipc_permission = 		smack_ipc_permission,
-	.ipc_getsecid =			smack_ipc_getsecid,
-
-	.msg_msg_alloc_security = 	smack_msg_msg_alloc_security,
-	.msg_msg_free_security = 	smack_msg_msg_free_security,
-
-	.msg_queue_alloc_security = 	smack_msg_queue_alloc_security,
-	.msg_queue_free_security = 	smack_msg_queue_free_security,
-	.msg_queue_associate = 		smack_msg_queue_associate,
-	.msg_queue_msgctl = 		smack_msg_queue_msgctl,
-	.msg_queue_msgsnd = 		smack_msg_queue_msgsnd,
-	.msg_queue_msgrcv = 		smack_msg_queue_msgrcv,
-
-	.shm_alloc_security = 		smack_shm_alloc_security,
-	.shm_free_security = 		smack_shm_free_security,
-	.shm_associate = 		smack_shm_associate,
-	.shm_shmctl = 			smack_shm_shmctl,
-	.shm_shmat = 			smack_shm_shmat,
-
-	.sem_alloc_security = 		smack_sem_alloc_security,
-	.sem_free_security = 		smack_sem_free_security,
-	.sem_associate = 		smack_sem_associate,
-	.sem_semctl = 			smack_sem_semctl,
-	.sem_semop = 			smack_sem_semop,
-
-	.d_instantiate = 		smack_d_instantiate,
-
-	.getprocattr = 			smack_getprocattr,
-	.setprocattr = 			smack_setprocattr,
-
-	.unix_stream_connect = 		smack_unix_stream_connect,
-	.unix_may_send = 		smack_unix_may_send,
-
-	.socket_post_create = 		smack_socket_post_create,
+	LSM_HOOK_INIT(name, "smack"),
+
+	LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
+	LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
+	LSM_HOOK_INIT(syslog, smack_syslog),
+
+	LSM_HOOK_INIT(sb_alloc_security, smack_sb_alloc_security),
+	LSM_HOOK_INIT(sb_free_security, smack_sb_free_security),
+	LSM_HOOK_INIT(sb_copy_data, smack_sb_copy_data),
+	LSM_HOOK_INIT(sb_kern_mount, smack_sb_kern_mount),
+	LSM_HOOK_INIT(sb_statfs, smack_sb_statfs),
+
+	LSM_HOOK_INIT(bprm_set_creds, smack_bprm_set_creds),
+	LSM_HOOK_INIT(bprm_committing_creds, smack_bprm_committing_creds),
+	LSM_HOOK_INIT(bprm_secureexec, smack_bprm_secureexec),
+
+	LSM_HOOK_INIT(inode_alloc_security, smack_inode_alloc_security),
+	LSM_HOOK_INIT(inode_free_security, smack_inode_free_security),
+	LSM_HOOK_INIT(inode_init_security, smack_inode_init_security),
+	LSM_HOOK_INIT(inode_link, smack_inode_link),
+	LSM_HOOK_INIT(inode_unlink, smack_inode_unlink),
+	LSM_HOOK_INIT(inode_rmdir, smack_inode_rmdir),
+	LSM_HOOK_INIT(inode_rename, smack_inode_rename),
+	LSM_HOOK_INIT(inode_permission, smack_inode_permission),
+	LSM_HOOK_INIT(inode_setattr, smack_inode_setattr),
+	LSM_HOOK_INIT(inode_getattr, smack_inode_getattr),
+	LSM_HOOK_INIT(inode_setxattr, smack_inode_setxattr),
+	LSM_HOOK_INIT(inode_post_setxattr, smack_inode_post_setxattr),
+	LSM_HOOK_INIT(inode_getxattr, smack_inode_getxattr),
+	LSM_HOOK_INIT(inode_removexattr, smack_inode_removexattr),
+	LSM_HOOK_INIT(inode_getsecurity, smack_inode_getsecurity),
+	LSM_HOOK_INIT(inode_setsecurity, smack_inode_setsecurity),
+	LSM_HOOK_INIT(inode_listsecurity, smack_inode_listsecurity),
+	LSM_HOOK_INIT(inode_getsecid, smack_inode_getsecid),
+
+	LSM_HOOK_INIT(file_permission, smack_file_permission),
+	LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
+	LSM_HOOK_INIT(file_free_security, smack_file_free_security),
+	LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
+	LSM_HOOK_INIT(file_lock, smack_file_lock),
+	LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
+	LSM_HOOK_INIT(mmap_file, smack_mmap_file),
+	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
+	LSM_HOOK_INIT(file_set_fowner, smack_file_set_fowner),
+	LSM_HOOK_INIT(file_send_sigiotask, smack_file_send_sigiotask),
+	LSM_HOOK_INIT(file_receive, smack_file_receive),
+
+	LSM_HOOK_INIT(file_open, smack_file_open),
+
+	LSM_HOOK_INIT(cred_alloc_blank, smack_cred_alloc_blank),
+	LSM_HOOK_INIT(cred_free, smack_cred_free),
+	LSM_HOOK_INIT(cred_prepare, smack_cred_prepare),
+	LSM_HOOK_INIT(cred_transfer, smack_cred_transfer),
+	LSM_HOOK_INIT(kernel_act_as, smack_kernel_act_as),
+	LSM_HOOK_INIT(kernel_create_files_as, smack_kernel_create_files_as),
+	LSM_HOOK_INIT(task_setpgid, smack_task_setpgid),
+	LSM_HOOK_INIT(task_getpgid, smack_task_getpgid),
+	LSM_HOOK_INIT(task_getsid, smack_task_getsid),
+	LSM_HOOK_INIT(task_getsecid, smack_task_getsecid),
+	LSM_HOOK_INIT(task_setnice, smack_task_setnice),
+	LSM_HOOK_INIT(task_setioprio, smack_task_setioprio),
+	LSM_HOOK_INIT(task_getioprio, smack_task_getioprio),
+	LSM_HOOK_INIT(task_setscheduler, smack_task_setscheduler),
+	LSM_HOOK_INIT(task_getscheduler, smack_task_getscheduler),
+	LSM_HOOK_INIT(task_movememory, smack_task_movememory),
+	LSM_HOOK_INIT(task_kill, smack_task_kill),
+	LSM_HOOK_INIT(task_wait, smack_task_wait),
+	LSM_HOOK_INIT(task_to_inode, smack_task_to_inode),
+
+	LSM_HOOK_INIT(ipc_permission, smack_ipc_permission),
+	LSM_HOOK_INIT(ipc_getsecid, smack_ipc_getsecid),
+
+	LSM_HOOK_INIT(msg_msg_alloc_security, smack_msg_msg_alloc_security),
+	LSM_HOOK_INIT(msg_msg_free_security, smack_msg_msg_free_security),
+
+	LSM_HOOK_INIT(msg_queue_alloc_security, smack_msg_queue_alloc_security),
+	LSM_HOOK_INIT(msg_queue_free_security, smack_msg_queue_free_security),
+	LSM_HOOK_INIT(msg_queue_associate, smack_msg_queue_associate),
+	LSM_HOOK_INIT(msg_queue_msgctl, smack_msg_queue_msgctl),
+	LSM_HOOK_INIT(msg_queue_msgsnd, smack_msg_queue_msgsnd),
+	LSM_HOOK_INIT(msg_queue_msgrcv, smack_msg_queue_msgrcv),
+
+	LSM_HOOK_INIT(shm_alloc_security, smack_shm_alloc_security),
+	LSM_HOOK_INIT(shm_free_security, smack_shm_free_security),
+	LSM_HOOK_INIT(shm_associate, smack_shm_associate),
+	LSM_HOOK_INIT(shm_shmctl, smack_shm_shmctl),
+	LSM_HOOK_INIT(shm_shmat, smack_shm_shmat),
+
+	LSM_HOOK_INIT(sem_alloc_security, smack_sem_alloc_security),
+	LSM_HOOK_INIT(sem_free_security, smack_sem_free_security),
+	LSM_HOOK_INIT(sem_associate, smack_sem_associate),
+	LSM_HOOK_INIT(sem_semctl, smack_sem_semctl),
+	LSM_HOOK_INIT(sem_semop, smack_sem_semop),
+
+	LSM_HOOK_INIT(d_instantiate, smack_d_instantiate),
+
+	LSM_HOOK_INIT(getprocattr, smack_getprocattr),
+	LSM_HOOK_INIT(setprocattr, smack_setprocattr),
+
+	LSM_HOOK_INIT(unix_stream_connect, smack_unix_stream_connect),
+	LSM_HOOK_INIT(unix_may_send, smack_unix_may_send),
+
+	LSM_HOOK_INIT(socket_post_create, smack_socket_post_create),
 #ifndef CONFIG_SECURITY_SMACK_NETFILTER
-	.socket_bind =			smack_socket_bind,
+	LSM_HOOK_INIT(socket_bind, smack_socket_bind),
 #endif /* CONFIG_SECURITY_SMACK_NETFILTER */
-	.socket_connect =		smack_socket_connect,
-	.socket_sendmsg =		smack_socket_sendmsg,
-	.socket_sock_rcv_skb = 		smack_socket_sock_rcv_skb,
-	.socket_getpeersec_stream =	smack_socket_getpeersec_stream,
-	.socket_getpeersec_dgram =	smack_socket_getpeersec_dgram,
-	.sk_alloc_security = 		smack_sk_alloc_security,
-	.sk_free_security = 		smack_sk_free_security,
-	.sock_graft = 			smack_sock_graft,
-	.inet_conn_request = 		smack_inet_conn_request,
-	.inet_csk_clone =		smack_inet_csk_clone,
+	LSM_HOOK_INIT(socket_connect, smack_socket_connect),
+	LSM_HOOK_INIT(socket_sendmsg, smack_socket_sendmsg),
+	LSM_HOOK_INIT(socket_sock_rcv_skb, smack_socket_sock_rcv_skb),
+	LSM_HOOK_INIT(socket_getpeersec_stream, smack_socket_getpeersec_stream),
+	LSM_HOOK_INIT(socket_getpeersec_dgram, smack_socket_getpeersec_dgram),
+	LSM_HOOK_INIT(sk_alloc_security, smack_sk_alloc_security),
+	LSM_HOOK_INIT(sk_free_security, smack_sk_free_security),
+	LSM_HOOK_INIT(sock_graft, smack_sock_graft),
+	LSM_HOOK_INIT(inet_conn_request, smack_inet_conn_request),
+	LSM_HOOK_INIT(inet_csk_clone, smack_inet_csk_clone),
 
  /* key management security hooks */
 #ifdef CONFIG_KEYS
-	.key_alloc = 			smack_key_alloc,
-	.key_free = 			smack_key_free,
-	.key_permission = 		smack_key_permission,
-	.key_getsecurity =		smack_key_getsecurity,
+	LSM_HOOK_INIT(key_alloc, smack_key_alloc),
+	LSM_HOOK_INIT(key_free, smack_key_free),
+	LSM_HOOK_INIT(key_permission, smack_key_permission),
+	LSM_HOOK_INIT(key_getsecurity, smack_key_getsecurity),
 #endif /* CONFIG_KEYS */
 
  /* Audit hooks */
 #ifdef CONFIG_AUDIT
-	.audit_rule_init =		smack_audit_rule_init,
-	.audit_rule_known =		smack_audit_rule_known,
-	.audit_rule_match =		smack_audit_rule_match,
-	.audit_rule_free =		smack_audit_rule_free,
+	LSM_HOOK_INIT(audit_rule_init, smack_audit_rule_init),
+	LSM_HOOK_INIT(audit_rule_known, smack_audit_rule_known),
+	LSM_HOOK_INIT(audit_rule_match, smack_audit_rule_match),
+	LSM_HOOK_INIT(audit_rule_free, smack_audit_rule_free),
 #endif /* CONFIG_AUDIT */
 
-	.ismaclabel =			smack_ismaclabel,
-	.secid_to_secctx = 		smack_secid_to_secctx,
-	.secctx_to_secid = 		smack_secctx_to_secid,
-	.release_secctx = 		smack_release_secctx,
-	.inode_notifysecctx =		smack_inode_notifysecctx,
-	.inode_setsecctx =		smack_inode_setsecctx,
-	.inode_getsecctx =		smack_inode_getsecctx,
+	LSM_HOOK_INIT(ismaclabel, smack_ismaclabel),
+	LSM_HOOK_INIT(secid_to_secctx, smack_secid_to_secctx),
+	LSM_HOOK_INIT(secctx_to_secid, smack_secctx_to_secid),
+	LSM_HOOK_INIT(release_secctx, smack_release_secctx),
+	LSM_HOOK_INIT(inode_notifysecctx, smack_inode_notifysecctx),
+	LSM_HOOK_INIT(inode_setsecctx, smack_inode_setsecctx),
+	LSM_HOOK_INIT(inode_getsecctx, smack_inode_getsecctx),
 };
 
 
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index 2f7b46855f48..bce13583efda 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -503,35 +503,35 @@ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg,
  * registering TOMOYO.
  */
 static struct security_operations tomoyo_security_ops = {
-	.name                = "tomoyo",
-	.cred_alloc_blank    = tomoyo_cred_alloc_blank,
-	.cred_prepare        = tomoyo_cred_prepare,
-	.cred_transfer	     = tomoyo_cred_transfer,
-	.cred_free           = tomoyo_cred_free,
-	.bprm_set_creds      = tomoyo_bprm_set_creds,
-	.bprm_check_security = tomoyo_bprm_check_security,
-	.file_fcntl          = tomoyo_file_fcntl,
-	.file_open           = tomoyo_file_open,
-	.path_truncate       = tomoyo_path_truncate,
-	.path_unlink         = tomoyo_path_unlink,
-	.path_mkdir          = tomoyo_path_mkdir,
-	.path_rmdir          = tomoyo_path_rmdir,
-	.path_symlink        = tomoyo_path_symlink,
-	.path_mknod          = tomoyo_path_mknod,
-	.path_link           = tomoyo_path_link,
-	.path_rename         = tomoyo_path_rename,
-	.inode_getattr       = tomoyo_inode_getattr,
-	.file_ioctl          = tomoyo_file_ioctl,
-	.path_chmod          = tomoyo_path_chmod,
-	.path_chown          = tomoyo_path_chown,
-	.path_chroot         = tomoyo_path_chroot,
-	.sb_mount            = tomoyo_sb_mount,
-	.sb_umount           = tomoyo_sb_umount,
-	.sb_pivotroot        = tomoyo_sb_pivotroot,
-	.socket_bind         = tomoyo_socket_bind,
-	.socket_connect      = tomoyo_socket_connect,
-	.socket_listen       = tomoyo_socket_listen,
-	.socket_sendmsg      = tomoyo_socket_sendmsg,
+	LSM_HOOK_INIT(name, "tomoyo"),
+	LSM_HOOK_INIT(cred_alloc_blank, tomoyo_cred_alloc_blank),
+	LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare),
+	LSM_HOOK_INIT(cred_transfer, tomoyo_cred_transfer),
+	LSM_HOOK_INIT(cred_free, tomoyo_cred_free),
+	LSM_HOOK_INIT(bprm_set_creds, tomoyo_bprm_set_creds),
+	LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security),
+	LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl),
+	LSM_HOOK_INIT(file_open, tomoyo_file_open),
+	LSM_HOOK_INIT(path_truncate, tomoyo_path_truncate),
+	LSM_HOOK_INIT(path_unlink, tomoyo_path_unlink),
+	LSM_HOOK_INIT(path_mkdir, tomoyo_path_mkdir),
+	LSM_HOOK_INIT(path_rmdir, tomoyo_path_rmdir),
+	LSM_HOOK_INIT(path_symlink, tomoyo_path_symlink),
+	LSM_HOOK_INIT(path_mknod, tomoyo_path_mknod),
+	LSM_HOOK_INIT(path_link, tomoyo_path_link),
+	LSM_HOOK_INIT(path_rename, tomoyo_path_rename),
+	LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr),
+	LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl),
+	LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod),
+	LSM_HOOK_INIT(path_chown, tomoyo_path_chown),
+	LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot),
+	LSM_HOOK_INIT(sb_mount, tomoyo_sb_mount),
+	LSM_HOOK_INIT(sb_umount, tomoyo_sb_umount),
+	LSM_HOOK_INIT(sb_pivotroot, tomoyo_sb_pivotroot),
+	LSM_HOOK_INIT(socket_bind, tomoyo_socket_bind),
+	LSM_HOOK_INIT(socket_connect, tomoyo_socket_connect),
+	LSM_HOOK_INIT(socket_listen, tomoyo_socket_listen),
+	LSM_HOOK_INIT(socket_sendmsg, tomoyo_socket_sendmsg),
 };
 
 /* Lock for GC. */
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 14557ffa7b4d..23dd4c6246b2 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -366,12 +366,12 @@ int yama_ptrace_traceme(struct task_struct *parent)
 
 #ifndef CONFIG_SECURITY_YAMA_STACKED
 static struct security_operations yama_ops = {
-	.name =			"yama",
+	LSM_HOOK_INIT(name, "yama"),
 
-	.ptrace_access_check =	yama_ptrace_access_check,
-	.ptrace_traceme =	yama_ptrace_traceme,
-	.task_prctl =		yama_task_prctl,
-	.task_free =		yama_task_free,
+	LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check),
+	LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme),
+	LSM_HOOK_INIT(task_prctl, yama_task_prctl),
+	LSM_HOOK_INIT(task_free, yama_task_free),
 };
 #endif
 
-- 
cgit v1.2.3


From b1d9e6b0646d0e5ee5d9050bd236b6c65d66faef Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Sat, 2 May 2015 15:11:42 -0700
Subject: LSM: Switch to lists of hooks

Instead of using a vector of security operations
with explicit, special case stacking of the capability
and yama hooks use lists of hooks with capability and
yama hooks included as appropriate.

The security_operations structure is no longer required.
Instead, there is a union of the function pointers that
allows all the hooks lists to use a common mechanism for
list management while retaining typing. Each module
supplies an array describing the hooks it provides instead
of a sparsely populated security_operations structure.
The description includes the element that gets put on
the hook list, avoiding the issues surrounding individual
element allocation.

The method for registering security modules is changed to
reflect the information available. The method for removing
a module, currently only used by SELinux, has also changed.
It should be generic now, however if there are potential
race conditions based on ordering of hook removal that needs
to be addressed by the calling module.

The security hooks are called from the lists and the first
failure is returned.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Paul Moore <paul@paul-moore.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/lsm_hooks.h  |  77 ++++--
 include/linux/security.h   |  46 +---
 security/Makefile          |   2 +-
 security/apparmor/domain.c |  12 +-
 security/apparmor/lsm.c    |  51 ++--
 security/commoncap.c       |  41 +++-
 security/security.c        | 570 +++++++++++++++++++++++++++++++++++++--------
 security/selinux/hooks.c   |  94 ++------
 security/smack/smack.h     |   2 -
 security/smack/smack_lsm.c |  53 +----
 security/smack/smackfs.c   |   2 +-
 security/tomoyo/tomoyo.c   |  14 +-
 security/yama/yama_lsm.c   |  50 ++--
 13 files changed, 627 insertions(+), 387 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 27dd6fcacccc..f014f2596e22 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -25,21 +25,10 @@
 #define __LINUX_LSM_HOOKS_H
 
 #include <linux/security.h>
-
-/* Maximum number of letters for an LSM name string */
-#define SECURITY_NAME_MAX	10
-
-#ifdef CONFIG_SECURITY
+#include <linux/init.h>
+#include <linux/rculist.h>
 
 /**
- * struct security_operations - main security structure
- *
- * Security module identifier.
- *
- * @name:
- *	A string that acts as a unique identifier for the LSM with max number
- *	of characters = SECURITY_NAME_MAX.
- *
  * Security hooks for program execution operations.
  *
  * @bprm_set_creds:
@@ -1310,9 +1299,7 @@
  * This is the main security structure.
  */
 
-struct security_operations {
-	char name[SECURITY_NAME_MAX + 1];
-
+union security_list_options {
 	int (*binder_set_context_mgr)(struct task_struct *mgr);
 	int (*binder_transaction)(struct task_struct *from,
 					struct task_struct *to);
@@ -1837,21 +1824,63 @@ struct security_hook_heads {
 #endif /* CONFIG_AUDIT */
 };
 
+/*
+ * Security module hook list structure.
+ * For use with generic list macros for common operations.
+ */
+struct security_hook_list {
+	struct list_head		list;
+	struct list_head		*head;
+	union security_list_options	hook;
+};
+
 /*
  * Initializing a security_hook_list structure takes
  * up a lot of space in a source file. This macro takes
  * care of the common case and reduces the amount of
  * text involved.
- * Casey says: Comment is true in the next patch.
  */
-#define LSM_HOOK_INIT(HEAD, HOOK)	.HEAD = HOOK
+#define LSM_HOOK_INIT(HEAD, HOOK) \
+	{ .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } }
+
+extern struct security_hook_heads security_hook_heads;
+
+static inline void security_add_hooks(struct security_hook_list *hooks,
+				      int count)
+{
+	int i;
 
-/* prototypes */
-extern int security_module_enable(struct security_operations *ops);
-extern int register_security(struct security_operations *ops);
-extern void __init security_fixup_ops(struct security_operations *ops);
-extern void reset_security_ops(void);
+	for (i = 0; i < count; i++)
+		list_add_tail_rcu(&hooks[i].list, hooks[i].head);
+}
 
-#endif /* CONFIG_SECURITY */
+#ifdef CONFIG_SECURITY_SELINUX_DISABLE
+/*
+ * Assuring the safety of deleting a security module is up to
+ * the security module involved. This may entail ordering the
+ * module's hook list in a particular way, refusing to disable
+ * the module once a policy is loaded or any number of other
+ * actions better imagined than described.
+ *
+ * The name of the configuration option reflects the only module
+ * that currently uses the mechanism. Any developer who thinks
+ * disabling their module is a good idea needs to be at least as
+ * careful as the SELinux team.
+ */
+static inline void security_delete_hooks(struct security_hook_list *hooks,
+						int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++)
+		list_del_rcu(&hooks[i].list);
+}
+#endif /* CONFIG_SECURITY_SELINUX_DISABLE */
+
+extern int __init security_module_enable(const char *module);
+extern void __init capability_add_hooks(void);
+#ifdef CONFIG_SECURITY_YAMA_STACKED
+void __init yama_add_hooks(void);
+#endif
 
 #endif /* ! __LINUX_LSM_HOOKS_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index a2a100e7ac6e..8c8175d41b4c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/string.h>
+#include <linux/mm.h>
 
 struct linux_binprm;
 struct cred;
@@ -54,9 +55,6 @@ struct xattr;
 struct xfrm_sec_ctx;
 struct mm_struct;
 
-/* Maximum number of letters for an LSM name string */
-#define SECURITY_NAME_MAX	10
-
 /* If capable should audit the security request */
 #define SECURITY_CAP_NOAUDIT 0
 #define SECURITY_CAP_AUDIT 1
@@ -69,10 +67,7 @@ struct audit_krule;
 struct user_namespace;
 struct timezone;
 
-/*
- * These functions are in security/capability.c and are used
- * as the default capabilities functions
- */
+/* These functions are in security/commoncap.c */
 extern int cap_capable(const struct cred *cred, struct user_namespace *ns,
 		       int cap, int audit);
 extern int cap_settime(const struct timespec *ts, const struct timezone *tz);
@@ -114,8 +109,6 @@ struct xfrm_state;
 struct xfrm_user_sec_ctx;
 struct seq_file;
 
-extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb);
-
 #ifdef CONFIG_MMU
 extern unsigned long mmap_min_addr;
 extern unsigned long dac_mmap_min_addr;
@@ -472,7 +465,7 @@ static inline int security_settime(const struct timespec *ts,
 
 static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
 {
-	return cap_vm_enough_memory(mm, pages);
+	return __vm_enough_memory(mm, pages, cap_vm_enough_memory(mm, pages));
 }
 
 static inline int security_bprm_set_creds(struct linux_binprm *bprm)
@@ -1075,7 +1068,7 @@ static inline int security_setprocattr(struct task_struct *p, char *name, void *
 
 static inline int security_netlink_send(struct sock *sk, struct sk_buff *skb)
 {
-	return cap_netlink_send(sk, skb);
+	return 0;
 }
 
 static inline int security_ismaclabel(const char *name)
@@ -1643,36 +1636,5 @@ static inline void free_secdata(void *secdata)
 { }
 #endif /* CONFIG_SECURITY */
 
-#ifdef CONFIG_SECURITY_YAMA
-extern int yama_ptrace_access_check(struct task_struct *child,
-				    unsigned int mode);
-extern int yama_ptrace_traceme(struct task_struct *parent);
-extern void yama_task_free(struct task_struct *task);
-extern int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3,
-			   unsigned long arg4, unsigned long arg5);
-#else
-static inline int yama_ptrace_access_check(struct task_struct *child,
-					   unsigned int mode)
-{
-	return 0;
-}
-
-static inline int yama_ptrace_traceme(struct task_struct *parent)
-{
-	return 0;
-}
-
-static inline void yama_task_free(struct task_struct *task)
-{
-}
-
-static inline int yama_task_prctl(int option, unsigned long arg2,
-				  unsigned long arg3, unsigned long arg4,
-				  unsigned long arg5)
-{
-	return -ENOSYS;
-}
-#endif /* CONFIG_SECURITY_YAMA */
-
 #endif /* ! __LINUX_SECURITY_H */
 
diff --git a/security/Makefile b/security/Makefile
index 05f1c934d74b..c9bfbc84ff50 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -14,7 +14,7 @@ obj-y					+= commoncap.o
 obj-$(CONFIG_MMU)			+= min_addr.o
 
 # Object file lists
-obj-$(CONFIG_SECURITY)			+= security.o capability.o
+obj-$(CONFIG_SECURITY)			+= security.o
 obj-$(CONFIG_SECURITYFS)		+= inode.o
 obj-$(CONFIG_SECURITY_SELINUX)		+= selinux/
 obj-$(CONFIG_SECURITY_SMACK)		+= smack/
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index d97cba3e3849..dc0027b28b04 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -347,9 +347,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
 		file_inode(bprm->file)->i_mode
 	};
 	const char *name = NULL, *target = NULL, *info = NULL;
-	int error = cap_bprm_set_creds(bprm);
-	if (error)
-		return error;
+	int error = 0;
 
 	if (bprm->cred_prepared)
 		return 0;
@@ -531,15 +529,13 @@ cleanup:
  */
 int apparmor_bprm_secureexec(struct linux_binprm *bprm)
 {
-	int ret = cap_bprm_secureexec(bprm);
-
 	/* the decision to use secure exec is computed in set_creds
 	 * and stored in bprm->unsafe.
 	 */
-	if (!ret && (bprm->unsafe & AA_SECURE_X_NEEDED))
-		ret = 1;
+	if (bprm->unsafe & AA_SECURE_X_NEEDED)
+		return 1;
 
-	return ret;
+	return 0;
 }
 
 /**
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index f54253258fb8..5696874e8062 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -96,19 +96,11 @@ static void apparmor_cred_transfer(struct cred *new, const struct cred *old)
 static int apparmor_ptrace_access_check(struct task_struct *child,
 					unsigned int mode)
 {
-	int error = cap_ptrace_access_check(child, mode);
-	if (error)
-		return error;
-
 	return aa_ptrace(current, child, mode);
 }
 
 static int apparmor_ptrace_traceme(struct task_struct *parent)
 {
-	int error = cap_ptrace_traceme(parent);
-	if (error)
-		return error;
-
 	return aa_ptrace(parent, current, PTRACE_MODE_ATTACH);
 }
 
@@ -123,10 +115,10 @@ static int apparmor_capget(struct task_struct *target, kernel_cap_t *effective,
 	cred = __task_cred(target);
 	profile = aa_cred_profile(cred);
 
-	*effective = cred->cap_effective;
-	*inheritable = cred->cap_inheritable;
-	*permitted = cred->cap_permitted;
-
+	/*
+	 * cap_capget is stacked ahead of this and will
+	 * initialize effective and permitted.
+	 */
 	if (!unconfined(profile) && !COMPLAIN_MODE(profile)) {
 		*effective = cap_intersect(*effective, profile->caps.allow);
 		*permitted = cap_intersect(*permitted, profile->caps.allow);
@@ -140,13 +132,11 @@ static int apparmor_capable(const struct cred *cred, struct user_namespace *ns,
 			    int cap, int audit)
 {
 	struct aa_profile *profile;
-	/* cap_capable returns 0 on success, else -EPERM */
-	int error = cap_capable(cred, ns, cap, audit);
-	if (!error) {
-		profile = aa_cred_profile(cred);
-		if (!unconfined(profile))
-			error = aa_capable(profile, cap, audit);
-	}
+	int error = 0;
+
+	profile = aa_cred_profile(cred);
+	if (!unconfined(profile))
+		error = aa_capable(profile, cap, audit);
 	return error;
 }
 
@@ -615,9 +605,7 @@ static int apparmor_task_setrlimit(struct task_struct *task,
 	return error;
 }
 
-static struct security_operations apparmor_ops = {
-	LSM_HOOK_INIT(name, "apparmor"),
-
+static struct security_hook_list apparmor_hooks[] = {
 	LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme),
 	LSM_HOOK_INIT(capget, apparmor_capget),
@@ -640,7 +628,6 @@ static struct security_operations apparmor_ops = {
 	LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security),
 	LSM_HOOK_INIT(file_free_security, apparmor_file_free_security),
 	LSM_HOOK_INIT(mmap_file, apparmor_mmap_file),
-	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
 	LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect),
 	LSM_HOOK_INIT(file_lock, apparmor_file_lock),
 
@@ -898,7 +885,7 @@ static int __init apparmor_init(void)
 {
 	int error;
 
-	if (!apparmor_enabled || !security_module_enable(&apparmor_ops)) {
+	if (!apparmor_enabled || !security_module_enable("apparmor")) {
 		aa_info_message("AppArmor disabled by boot time parameter");
 		apparmor_enabled = 0;
 		return 0;
@@ -913,17 +900,10 @@ static int __init apparmor_init(void)
 	error = set_init_cxt();
 	if (error) {
 		AA_ERROR("Failed to set context on init task\n");
-		goto register_security_out;
-	}
-
-	error = register_security(&apparmor_ops);
-	if (error) {
-		struct cred *cred = (struct cred *)current->real_cred;
-		aa_free_task_context(cred_cxt(cred));
-		cred_cxt(cred) = NULL;
-		AA_ERROR("Unable to register AppArmor\n");
-		goto register_security_out;
+		aa_free_root_ns();
+		goto alloc_out;
 	}
+	security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks));
 
 	/* Report that AppArmor successfully initialized */
 	apparmor_initialized = 1;
@@ -936,9 +916,6 @@ static int __init apparmor_init(void)
 
 	return error;
 
-register_security_out:
-	aa_free_root_ns();
-
 alloc_out:
 	aa_destroy_aafs();
 
diff --git a/security/commoncap.c b/security/commoncap.c
index f2875cd9f677..d103f5a4043d 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -12,7 +12,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <linux/security.h>
+#include <linux/lsm_hooks.h>
 #include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
@@ -53,11 +53,6 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
 	}
 }
 
-int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
-{
-	return 0;
-}
-
 /**
  * cap_capable - Determine whether a task has a particular effective capability
  * @cred: The credentials to use
@@ -941,7 +936,7 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
  * @pages: The size of the mapping
  *
  * Determine whether the allocation of a new virtual mapping by the current
- * task is permitted, returning 0 if permission is granted, -ve if not.
+ * task is permitted, returning 1 if permission is granted, 0 if not.
  */
 int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 {
@@ -950,7 +945,7 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 	if (cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN,
 			SECURITY_CAP_NOAUDIT) == 0)
 		cap_sys_admin = 1;
-	return __vm_enough_memory(mm, pages, cap_sys_admin);
+	return cap_sys_admin;
 }
 
 /*
@@ -981,3 +976,33 @@ int cap_mmap_file(struct file *file, unsigned long reqprot,
 {
 	return 0;
 }
+
+#ifdef CONFIG_SECURITY
+
+struct security_hook_list capability_hooks[] = {
+	LSM_HOOK_INIT(capable, cap_capable),
+	LSM_HOOK_INIT(settime, cap_settime),
+	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
+	LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
+	LSM_HOOK_INIT(capget, cap_capget),
+	LSM_HOOK_INIT(capset, cap_capset),
+	LSM_HOOK_INIT(bprm_set_creds, cap_bprm_set_creds),
+	LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec),
+	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
+	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
+	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
+	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
+	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
+	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
+	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
+	LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
+	LSM_HOOK_INIT(task_setnice, cap_task_setnice),
+	LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
+};
+
+void __init capability_add_hooks(void)
+{
+	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks));
+}
+
+#endif /* CONFIG_SECURITY */
diff --git a/security/security.c b/security/security.c
index 02dc72006afa..bd4c5f6a5b78 100644
--- a/security/security.c
+++ b/security/security.c
@@ -29,24 +29,13 @@
 
 #define MAX_LSM_EVM_XATTR	2
 
+/* Maximum number of letters for an LSM name string */
+#define SECURITY_NAME_MAX	10
+
 /* Boot-time LSM user choice */
 static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] =
 	CONFIG_DEFAULT_SECURITY;
 
-static struct security_operations *security_ops;
-static struct security_operations default_security_ops = {
-	.name	= "default",
-};
-
-static inline int __init verify(struct security_operations *ops)
-{
-	/* verify the security_operations structure exists */
-	if (!ops)
-		return -EINVAL;
-	security_fixup_ops(ops);
-	return 0;
-}
-
 static void __init do_security_initcalls(void)
 {
 	initcall_t *call;
@@ -64,20 +53,27 @@ static void __init do_security_initcalls(void)
  */
 int __init security_init(void)
 {
-	printk(KERN_INFO "Security Framework initialized\n");
+	pr_info("Security Framework initialized\n");
 
-	security_fixup_ops(&default_security_ops);
-	security_ops = &default_security_ops;
+	/*
+	 * Always load the capability module.
+	 */
+	capability_add_hooks();
+#ifdef CONFIG_SECURITY_YAMA_STACKED
+	/*
+	 * If Yama is configured for stacking load it next.
+	 */
+	yama_add_hooks();
+#endif
+	/*
+	 * Load the chosen module if there is one.
+	 * This will also find yama if it is stacking
+	 */
 	do_security_initcalls();
 
 	return 0;
 }
 
-void reset_security_ops(void)
-{
-	security_ops = &default_security_ops;
-}
-
 /* Save user chosen LSM */
 static int __init choose_lsm(char *str)
 {
@@ -88,7 +84,7 @@ __setup("security=", choose_lsm);
 
 /**
  * security_module_enable - Load given security module on boot ?
- * @ops: a pointer to the struct security_operations that is to be checked.
+ * @module: the name of the module
  *
  * Each LSM must pass this method before registering its own operations
  * to avoid security registration races. This method may also be used
@@ -100,41 +96,13 @@ __setup("security=", choose_lsm);
  *	 choose an alternate LSM at boot time.
  * Otherwise, return false.
  */
-int __init security_module_enable(struct security_operations *ops)
+int __init security_module_enable(const char *module)
 {
-	return !strcmp(ops->name, chosen_lsm);
-}
-
-/**
- * register_security - registers a security framework with the kernel
- * @ops: a pointer to the struct security_options that is to be registered
- *
- * This function allows a security module to register itself with the
- * kernel security subsystem.  Some rudimentary checking is done on the @ops
- * value passed to this function. You'll need to check first if your LSM
- * is allowed to register its @ops by calling security_module_enable(@ops).
- *
- * If there is already a security module registered with the kernel,
- * an error will be returned.  Otherwise %0 is returned on success.
- */
-int __init register_security(struct security_operations *ops)
-{
-	if (verify(ops)) {
-		printk(KERN_DEBUG "%s could not verify "
-		       "security_operations structure.\n", __func__);
-		return -EINVAL;
-	}
-
-	if (security_ops != &default_security_ops)
-		return -EAGAIN;
-
-	security_ops = ops;
-
-	return 0;
+	return !strcmp(module, chosen_lsm);
 }
 
 /*
- * Hook operation macros.
+ * Hook list operation macros.
  *
  * call_void_hook:
  *	This is a hook that does not return a value.
@@ -143,8 +111,27 @@ int __init register_security(struct security_operations *ops)
  *	This is a hook that returns a value.
  */
 
-#define call_void_hook(FUNC, ...)	security_ops->FUNC(__VA_ARGS__)
-#define call_int_hook(FUNC, IRC, ...)	security_ops->FUNC(__VA_ARGS__)
+#define call_void_hook(FUNC, ...)				\
+	do {							\
+		struct security_hook_list *P;			\
+								\
+		list_for_each_entry(P, &security_hook_heads.FUNC, list)	\
+			P->hook.FUNC(__VA_ARGS__);		\
+	} while (0)
+
+#define call_int_hook(FUNC, IRC, ...) ({			\
+	int RC = IRC;						\
+	do {							\
+		struct security_hook_list *P;			\
+								\
+		list_for_each_entry(P, &security_hook_heads.FUNC, list) { \
+			RC = P->hook.FUNC(__VA_ARGS__);		\
+			if (RC != 0)				\
+				break;				\
+		}						\
+	} while (0);						\
+	RC;							\
+})
 
 /* Security operations */
 
@@ -173,23 +160,11 @@ int security_binder_transfer_file(struct task_struct *from,
 
 int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
 {
-#ifdef CONFIG_SECURITY_YAMA_STACKED
-	int rc;
-	rc = yama_ptrace_access_check(child, mode);
-	if (rc)
-		return rc;
-#endif
 	return call_int_hook(ptrace_access_check, 0, child, mode);
 }
 
 int security_ptrace_traceme(struct task_struct *parent)
 {
-#ifdef CONFIG_SECURITY_YAMA_STACKED
-	int rc;
-	rc = yama_ptrace_traceme(parent);
-	if (rc)
-		return rc;
-#endif
 	return call_int_hook(ptrace_traceme, 0, parent);
 }
 
@@ -245,7 +220,25 @@ int security_settime(const struct timespec *ts, const struct timezone *tz)
 
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
 {
-	return call_int_hook(vm_enough_memory, 0, mm, pages);
+	struct security_hook_list *hp;
+	int cap_sys_admin = 1;
+	int rc;
+
+	/*
+	 * The module will respond with a positive value if
+	 * it thinks the __vm_enough_memory() call should be
+	 * made with the cap_sys_admin set. If all of the modules
+	 * agree that it should be set it will. If any module
+	 * thinks it should not be set it won't.
+	 */
+	list_for_each_entry(hp, &security_hook_heads.vm_enough_memory, list) {
+		rc = hp->hook.vm_enough_memory(mm, pages);
+		if (rc <= 0) {
+			cap_sys_admin = 0;
+			break;
+		}
+	}
+	return __vm_enough_memory(mm, pages, cap_sys_admin);
 }
 
 int security_bprm_set_creds(struct linux_binprm *bprm)
@@ -335,8 +328,9 @@ int security_sb_set_mnt_opts(struct super_block *sb,
 				unsigned long kern_flags,
 				unsigned long *set_kern_flags)
 {
-	return call_int_hook(sb_set_mnt_opts, 0, sb, opts, kern_flags,
-						set_kern_flags);
+	return call_int_hook(sb_set_mnt_opts,
+				opts->num_mnt_opts ? -EOPNOTSUPP : 0, sb,
+				opts, kern_flags, set_kern_flags);
 }
 EXPORT_SYMBOL(security_sb_set_mnt_opts);
 
@@ -369,8 +363,8 @@ int security_dentry_init_security(struct dentry *dentry, int mode,
 					struct qstr *name, void **ctx,
 					u32 *ctxlen)
 {
-	return call_int_hook(dentry_init_security, 0, dentry, mode, name,
-							ctx, ctxlen);
+	return call_int_hook(dentry_init_security, -EOPNOTSUPP, dentry, mode,
+				name, ctx, ctxlen);
 }
 EXPORT_SYMBOL(security_dentry_init_security);
 
@@ -390,7 +384,7 @@ int security_inode_init_security(struct inode *inode, struct inode *dir,
 							 NULL, NULL, NULL);
 	memset(new_xattrs, 0, sizeof(new_xattrs));
 	lsm_xattr = new_xattrs;
-	ret = call_int_hook(inode_init_security, 0, inode, dir, qstr,
+	ret = call_int_hook(inode_init_security, -EOPNOTSUPP, inode, dir, qstr,
 						&lsm_xattr->name,
 						&lsm_xattr->value,
 						&lsm_xattr->value_len);
@@ -636,8 +630,15 @@ int security_inode_setxattr(struct dentry *dentry, const char *name,
 
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
 		return 0;
-	ret = call_int_hook(inode_setxattr, 0, dentry, name, value, size,
+	/*
+	 * SELinux and Smack integrate the cap call,
+	 * so assume that all LSMs supplying this call do so.
+	 */
+	ret = call_int_hook(inode_setxattr, 1, dentry, name, value, size,
 				flags);
+
+	if (ret == 1)
+		ret = cap_inode_setxattr(dentry, name, value, size, flags);
 	if (ret)
 		return ret;
 	ret = ima_inode_setxattr(dentry, name, value, size);
@@ -675,7 +676,13 @@ int security_inode_removexattr(struct dentry *dentry, const char *name)
 
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
 		return 0;
-	ret = call_int_hook(inode_removexattr, 0, dentry, name);
+	/*
+	 * SELinux and Smack integrate the cap call,
+	 * so assume that all LSMs supplying this call do so.
+	 */
+	ret = call_int_hook(inode_removexattr, 1, dentry, name);
+	if (ret == 1)
+		ret = cap_inode_removexattr(dentry, name);
 	if (ret)
 		return ret;
 	ret = ima_inode_removexattr(dentry, name);
@@ -698,15 +705,16 @@ int security_inode_getsecurity(const struct inode *inode, const char *name, void
 {
 	if (unlikely(IS_PRIVATE(inode)))
 		return -EOPNOTSUPP;
-	return call_int_hook(inode_getsecurity, 0, inode, name, buffer, alloc);
+	return call_int_hook(inode_getsecurity, -EOPNOTSUPP, inode, name,
+				buffer, alloc);
 }
 
 int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags)
 {
 	if (unlikely(IS_PRIVATE(inode)))
 		return -EOPNOTSUPP;
-	return call_int_hook(inode_setsecurity, 0, inode, name, value, size,
-				flags);
+	return call_int_hook(inode_setsecurity, -EOPNOTSUPP, inode, name,
+				value, size, flags);
 }
 
 int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
@@ -847,9 +855,6 @@ int security_task_create(unsigned long clone_flags)
 
 void security_task_free(struct task_struct *task)
 {
-#ifdef CONFIG_SECURITY_YAMA_STACKED
-	yama_task_free(task);
-#endif
 	call_void_hook(task_free, task);
 }
 
@@ -932,6 +937,7 @@ int security_task_getsid(struct task_struct *p)
 
 void security_task_getsecid(struct task_struct *p, u32 *secid)
 {
+	*secid = 0;
 	call_void_hook(task_getsecid, p, secid);
 }
 EXPORT_SYMBOL(security_task_getsecid);
@@ -986,13 +992,19 @@ int security_task_wait(struct task_struct *p)
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			 unsigned long arg4, unsigned long arg5)
 {
-#ifdef CONFIG_SECURITY_YAMA_STACKED
-	int rc;
-	rc = yama_task_prctl(option, arg2, arg3, arg4, arg5);
-	if (rc != -ENOSYS)
-		return rc;
-#endif
-	return call_int_hook(task_prctl, 0, option, arg2, arg3, arg4, arg5);
+	int thisrc;
+	int rc = -ENOSYS;
+	struct security_hook_list *hp;
+
+	list_for_each_entry(hp, &security_hook_heads.task_prctl, list) {
+		thisrc = hp->hook.task_prctl(option, arg2, arg3, arg4, arg5);
+		if (thisrc != -ENOSYS) {
+			rc = thisrc;
+			if (thisrc != 0)
+				break;
+		}
+	}
+	return rc;
 }
 
 void security_task_to_inode(struct task_struct *p, struct inode *inode)
@@ -1007,6 +1019,7 @@ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 
 void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
 {
+	*secid = 0;
 	call_void_hook(ipc_getsecid, ipcp, secid);
 }
 
@@ -1113,12 +1126,12 @@ EXPORT_SYMBOL(security_d_instantiate);
 
 int security_getprocattr(struct task_struct *p, char *name, char **value)
 {
-	return call_int_hook(getprocattr, 0, p, name, value);
+	return call_int_hook(getprocattr, -EINVAL, p, name, value);
 }
 
 int security_setprocattr(struct task_struct *p, char *name, void *value, size_t size)
 {
-	return call_int_hook(setprocattr, 0, p, name, value, size);
+	return call_int_hook(setprocattr, -EINVAL, p, name, value, size);
 }
 
 int security_netlink_send(struct sock *sk, struct sk_buff *skb)
@@ -1134,12 +1147,14 @@ EXPORT_SYMBOL(security_ismaclabel);
 
 int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
-	return call_int_hook(secid_to_secctx, 0, secid, secdata, seclen);
+	return call_int_hook(secid_to_secctx, -EOPNOTSUPP, secid, secdata,
+				seclen);
 }
 EXPORT_SYMBOL(security_secid_to_secctx);
 
 int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
 {
+	*secid = 0;
 	return call_int_hook(secctx_to_secid, 0, secdata, seclen, secid);
 }
 EXPORT_SYMBOL(security_secctx_to_secid);
@@ -1164,7 +1179,7 @@ EXPORT_SYMBOL(security_inode_setsecctx);
 
 int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
 {
-	return call_int_hook(inode_getsecctx, 0, inode, ctx, ctxlen);
+	return call_int_hook(inode_getsecctx, -EOPNOTSUPP, inode, ctx, ctxlen);
 }
 EXPORT_SYMBOL(security_inode_getsecctx);
 
@@ -1259,8 +1274,8 @@ EXPORT_SYMBOL(security_sock_rcv_skb);
 int security_socket_getpeersec_stream(struct socket *sock, char __user *optval,
 				      int __user *optlen, unsigned len)
 {
-	return call_int_hook(socket_getpeersec_stream, 0, sock, optval,
-				optlen, len);
+	return call_int_hook(socket_getpeersec_stream, -ENOPROTOOPT, sock,
+				optval, optlen, len);
 }
 
 int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid)
@@ -1438,7 +1453,24 @@ int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
 				       struct xfrm_policy *xp,
 				       const struct flowi *fl)
 {
-	return call_int_hook(xfrm_state_pol_flow_match, 0, x, xp, fl);
+	struct security_hook_list *hp;
+	int rc = 1;
+
+	/*
+	 * Since this function is expected to return 0 or 1, the judgment
+	 * becomes difficult if multiple LSMs supply this call. Fortunately,
+	 * we can use the first LSM's judgment because currently only SELinux
+	 * supplies this call.
+	 *
+	 * For speed optimization, we explicitly break the loop rather than
+	 * using the macro
+	 */
+	list_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match,
+				list) {
+		rc = hp->hook.xfrm_state_pol_flow_match(x, xp, fl);
+		break;
+	}
+	return rc;
 }
 
 int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
@@ -1478,6 +1510,7 @@ int security_key_permission(key_ref_t key_ref,
 
 int security_key_getsecurity(struct key *key, char **_buffer)
 {
+	*_buffer = NULL;
 	return call_int_hook(key_getsecurity, 0, key, _buffer);
 }
 
@@ -1506,5 +1539,350 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule,
 	return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule,
 				actx);
 }
+#endif /* CONFIG_AUDIT */
 
+struct security_hook_heads security_hook_heads = {
+	.binder_set_context_mgr =
+		LIST_HEAD_INIT(security_hook_heads.binder_set_context_mgr),
+	.binder_transaction =
+		LIST_HEAD_INIT(security_hook_heads.binder_transaction),
+	.binder_transfer_binder =
+		LIST_HEAD_INIT(security_hook_heads.binder_transfer_binder),
+	.binder_transfer_file =
+		LIST_HEAD_INIT(security_hook_heads.binder_transfer_file),
+
+	.ptrace_access_check =
+		LIST_HEAD_INIT(security_hook_heads.ptrace_access_check),
+	.ptrace_traceme =
+		LIST_HEAD_INIT(security_hook_heads.ptrace_traceme),
+	.capget =	LIST_HEAD_INIT(security_hook_heads.capget),
+	.capset =	LIST_HEAD_INIT(security_hook_heads.capset),
+	.capable =	LIST_HEAD_INIT(security_hook_heads.capable),
+	.quotactl =	LIST_HEAD_INIT(security_hook_heads.quotactl),
+	.quota_on =	LIST_HEAD_INIT(security_hook_heads.quota_on),
+	.syslog =	LIST_HEAD_INIT(security_hook_heads.syslog),
+	.settime =	LIST_HEAD_INIT(security_hook_heads.settime),
+	.vm_enough_memory =
+		LIST_HEAD_INIT(security_hook_heads.vm_enough_memory),
+	.bprm_set_creds =
+		LIST_HEAD_INIT(security_hook_heads.bprm_set_creds),
+	.bprm_check_security =
+		LIST_HEAD_INIT(security_hook_heads.bprm_check_security),
+	.bprm_secureexec =
+		LIST_HEAD_INIT(security_hook_heads.bprm_secureexec),
+	.bprm_committing_creds =
+		LIST_HEAD_INIT(security_hook_heads.bprm_committing_creds),
+	.bprm_committed_creds =
+		LIST_HEAD_INIT(security_hook_heads.bprm_committed_creds),
+	.sb_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.sb_alloc_security),
+	.sb_free_security =
+		LIST_HEAD_INIT(security_hook_heads.sb_free_security),
+	.sb_copy_data =	LIST_HEAD_INIT(security_hook_heads.sb_copy_data),
+	.sb_remount =	LIST_HEAD_INIT(security_hook_heads.sb_remount),
+	.sb_kern_mount =
+		LIST_HEAD_INIT(security_hook_heads.sb_kern_mount),
+	.sb_show_options =
+		LIST_HEAD_INIT(security_hook_heads.sb_show_options),
+	.sb_statfs =	LIST_HEAD_INIT(security_hook_heads.sb_statfs),
+	.sb_mount =	LIST_HEAD_INIT(security_hook_heads.sb_mount),
+	.sb_umount =	LIST_HEAD_INIT(security_hook_heads.sb_umount),
+	.sb_pivotroot =	LIST_HEAD_INIT(security_hook_heads.sb_pivotroot),
+	.sb_set_mnt_opts =
+		LIST_HEAD_INIT(security_hook_heads.sb_set_mnt_opts),
+	.sb_clone_mnt_opts =
+		LIST_HEAD_INIT(security_hook_heads.sb_clone_mnt_opts),
+	.sb_parse_opts_str =
+		LIST_HEAD_INIT(security_hook_heads.sb_parse_opts_str),
+	.dentry_init_security =
+		LIST_HEAD_INIT(security_hook_heads.dentry_init_security),
+#ifdef CONFIG_SECURITY_PATH
+	.path_unlink =	LIST_HEAD_INIT(security_hook_heads.path_unlink),
+	.path_mkdir =	LIST_HEAD_INIT(security_hook_heads.path_mkdir),
+	.path_rmdir =	LIST_HEAD_INIT(security_hook_heads.path_rmdir),
+	.path_mknod =	LIST_HEAD_INIT(security_hook_heads.path_mknod),
+	.path_truncate =
+		LIST_HEAD_INIT(security_hook_heads.path_truncate),
+	.path_symlink =	LIST_HEAD_INIT(security_hook_heads.path_symlink),
+	.path_link =	LIST_HEAD_INIT(security_hook_heads.path_link),
+	.path_rename =	LIST_HEAD_INIT(security_hook_heads.path_rename),
+	.path_chmod =	LIST_HEAD_INIT(security_hook_heads.path_chmod),
+	.path_chown =	LIST_HEAD_INIT(security_hook_heads.path_chown),
+	.path_chroot =	LIST_HEAD_INIT(security_hook_heads.path_chroot),
+#endif
+	.inode_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.inode_alloc_security),
+	.inode_free_security =
+		LIST_HEAD_INIT(security_hook_heads.inode_free_security),
+	.inode_init_security =
+		LIST_HEAD_INIT(security_hook_heads.inode_init_security),
+	.inode_create =	LIST_HEAD_INIT(security_hook_heads.inode_create),
+	.inode_link =	LIST_HEAD_INIT(security_hook_heads.inode_link),
+	.inode_unlink =	LIST_HEAD_INIT(security_hook_heads.inode_unlink),
+	.inode_symlink =
+		LIST_HEAD_INIT(security_hook_heads.inode_symlink),
+	.inode_mkdir =	LIST_HEAD_INIT(security_hook_heads.inode_mkdir),
+	.inode_rmdir =	LIST_HEAD_INIT(security_hook_heads.inode_rmdir),
+	.inode_mknod =	LIST_HEAD_INIT(security_hook_heads.inode_mknod),
+	.inode_rename =	LIST_HEAD_INIT(security_hook_heads.inode_rename),
+	.inode_readlink =
+		LIST_HEAD_INIT(security_hook_heads.inode_readlink),
+	.inode_follow_link =
+		LIST_HEAD_INIT(security_hook_heads.inode_follow_link),
+	.inode_permission =
+		LIST_HEAD_INIT(security_hook_heads.inode_permission),
+	.inode_setattr =
+		LIST_HEAD_INIT(security_hook_heads.inode_setattr),
+	.inode_getattr =
+		LIST_HEAD_INIT(security_hook_heads.inode_getattr),
+	.inode_setxattr =
+		LIST_HEAD_INIT(security_hook_heads.inode_setxattr),
+	.inode_post_setxattr =
+		LIST_HEAD_INIT(security_hook_heads.inode_post_setxattr),
+	.inode_getxattr =
+		LIST_HEAD_INIT(security_hook_heads.inode_getxattr),
+	.inode_listxattr =
+		LIST_HEAD_INIT(security_hook_heads.inode_listxattr),
+	.inode_removexattr =
+		LIST_HEAD_INIT(security_hook_heads.inode_removexattr),
+	.inode_need_killpriv =
+		LIST_HEAD_INIT(security_hook_heads.inode_need_killpriv),
+	.inode_killpriv =
+		LIST_HEAD_INIT(security_hook_heads.inode_killpriv),
+	.inode_getsecurity =
+		LIST_HEAD_INIT(security_hook_heads.inode_getsecurity),
+	.inode_setsecurity =
+		LIST_HEAD_INIT(security_hook_heads.inode_setsecurity),
+	.inode_listsecurity =
+		LIST_HEAD_INIT(security_hook_heads.inode_listsecurity),
+	.inode_getsecid =
+		LIST_HEAD_INIT(security_hook_heads.inode_getsecid),
+	.file_permission =
+		LIST_HEAD_INIT(security_hook_heads.file_permission),
+	.file_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.file_alloc_security),
+	.file_free_security =
+		LIST_HEAD_INIT(security_hook_heads.file_free_security),
+	.file_ioctl =	LIST_HEAD_INIT(security_hook_heads.file_ioctl),
+	.mmap_addr =	LIST_HEAD_INIT(security_hook_heads.mmap_addr),
+	.mmap_file =	LIST_HEAD_INIT(security_hook_heads.mmap_file),
+	.file_mprotect =
+		LIST_HEAD_INIT(security_hook_heads.file_mprotect),
+	.file_lock =	LIST_HEAD_INIT(security_hook_heads.file_lock),
+	.file_fcntl =	LIST_HEAD_INIT(security_hook_heads.file_fcntl),
+	.file_set_fowner =
+		LIST_HEAD_INIT(security_hook_heads.file_set_fowner),
+	.file_send_sigiotask =
+		LIST_HEAD_INIT(security_hook_heads.file_send_sigiotask),
+	.file_receive =	LIST_HEAD_INIT(security_hook_heads.file_receive),
+	.file_open =	LIST_HEAD_INIT(security_hook_heads.file_open),
+	.task_create =	LIST_HEAD_INIT(security_hook_heads.task_create),
+	.task_free =	LIST_HEAD_INIT(security_hook_heads.task_free),
+	.cred_alloc_blank =
+		LIST_HEAD_INIT(security_hook_heads.cred_alloc_blank),
+	.cred_free =	LIST_HEAD_INIT(security_hook_heads.cred_free),
+	.cred_prepare =	LIST_HEAD_INIT(security_hook_heads.cred_prepare),
+	.cred_transfer =
+		LIST_HEAD_INIT(security_hook_heads.cred_transfer),
+	.kernel_act_as =
+		LIST_HEAD_INIT(security_hook_heads.kernel_act_as),
+	.kernel_create_files_as =
+		LIST_HEAD_INIT(security_hook_heads.kernel_create_files_as),
+	.kernel_fw_from_file =
+		LIST_HEAD_INIT(security_hook_heads.kernel_fw_from_file),
+	.kernel_module_request =
+		LIST_HEAD_INIT(security_hook_heads.kernel_module_request),
+	.kernel_module_from_file =
+		LIST_HEAD_INIT(security_hook_heads.kernel_module_from_file),
+	.task_fix_setuid =
+		LIST_HEAD_INIT(security_hook_heads.task_fix_setuid),
+	.task_setpgid =	LIST_HEAD_INIT(security_hook_heads.task_setpgid),
+	.task_getpgid =	LIST_HEAD_INIT(security_hook_heads.task_getpgid),
+	.task_getsid =	LIST_HEAD_INIT(security_hook_heads.task_getsid),
+	.task_getsecid =
+		LIST_HEAD_INIT(security_hook_heads.task_getsecid),
+	.task_setnice =	LIST_HEAD_INIT(security_hook_heads.task_setnice),
+	.task_setioprio =
+		LIST_HEAD_INIT(security_hook_heads.task_setioprio),
+	.task_getioprio =
+		LIST_HEAD_INIT(security_hook_heads.task_getioprio),
+	.task_setrlimit =
+		LIST_HEAD_INIT(security_hook_heads.task_setrlimit),
+	.task_setscheduler =
+		LIST_HEAD_INIT(security_hook_heads.task_setscheduler),
+	.task_getscheduler =
+		LIST_HEAD_INIT(security_hook_heads.task_getscheduler),
+	.task_movememory =
+		LIST_HEAD_INIT(security_hook_heads.task_movememory),
+	.task_kill =	LIST_HEAD_INIT(security_hook_heads.task_kill),
+	.task_wait =	LIST_HEAD_INIT(security_hook_heads.task_wait),
+	.task_prctl =	LIST_HEAD_INIT(security_hook_heads.task_prctl),
+	.task_to_inode =
+		LIST_HEAD_INIT(security_hook_heads.task_to_inode),
+	.ipc_permission =
+		LIST_HEAD_INIT(security_hook_heads.ipc_permission),
+	.ipc_getsecid =	LIST_HEAD_INIT(security_hook_heads.ipc_getsecid),
+	.msg_msg_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.msg_msg_alloc_security),
+	.msg_msg_free_security =
+		LIST_HEAD_INIT(security_hook_heads.msg_msg_free_security),
+	.msg_queue_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.msg_queue_alloc_security),
+	.msg_queue_free_security =
+		LIST_HEAD_INIT(security_hook_heads.msg_queue_free_security),
+	.msg_queue_associate =
+		LIST_HEAD_INIT(security_hook_heads.msg_queue_associate),
+	.msg_queue_msgctl =
+		LIST_HEAD_INIT(security_hook_heads.msg_queue_msgctl),
+	.msg_queue_msgsnd =
+		LIST_HEAD_INIT(security_hook_heads.msg_queue_msgsnd),
+	.msg_queue_msgrcv =
+		LIST_HEAD_INIT(security_hook_heads.msg_queue_msgrcv),
+	.shm_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.shm_alloc_security),
+	.shm_free_security =
+		LIST_HEAD_INIT(security_hook_heads.shm_free_security),
+	.shm_associate =
+		LIST_HEAD_INIT(security_hook_heads.shm_associate),
+	.shm_shmctl =	LIST_HEAD_INIT(security_hook_heads.shm_shmctl),
+	.shm_shmat =	LIST_HEAD_INIT(security_hook_heads.shm_shmat),
+	.sem_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.sem_alloc_security),
+	.sem_free_security =
+		LIST_HEAD_INIT(security_hook_heads.sem_free_security),
+	.sem_associate =
+		LIST_HEAD_INIT(security_hook_heads.sem_associate),
+	.sem_semctl =	LIST_HEAD_INIT(security_hook_heads.sem_semctl),
+	.sem_semop =	LIST_HEAD_INIT(security_hook_heads.sem_semop),
+	.netlink_send =	LIST_HEAD_INIT(security_hook_heads.netlink_send),
+	.d_instantiate =
+		LIST_HEAD_INIT(security_hook_heads.d_instantiate),
+	.getprocattr =	LIST_HEAD_INIT(security_hook_heads.getprocattr),
+	.setprocattr =	LIST_HEAD_INIT(security_hook_heads.setprocattr),
+	.ismaclabel =	LIST_HEAD_INIT(security_hook_heads.ismaclabel),
+	.secid_to_secctx =
+		LIST_HEAD_INIT(security_hook_heads.secid_to_secctx),
+	.secctx_to_secid =
+		LIST_HEAD_INIT(security_hook_heads.secctx_to_secid),
+	.release_secctx =
+		LIST_HEAD_INIT(security_hook_heads.release_secctx),
+	.inode_notifysecctx =
+		LIST_HEAD_INIT(security_hook_heads.inode_notifysecctx),
+	.inode_setsecctx =
+		LIST_HEAD_INIT(security_hook_heads.inode_setsecctx),
+	.inode_getsecctx =
+		LIST_HEAD_INIT(security_hook_heads.inode_getsecctx),
+#ifdef CONFIG_SECURITY_NETWORK
+	.unix_stream_connect =
+		LIST_HEAD_INIT(security_hook_heads.unix_stream_connect),
+	.unix_may_send =
+		LIST_HEAD_INIT(security_hook_heads.unix_may_send),
+	.socket_create =
+		LIST_HEAD_INIT(security_hook_heads.socket_create),
+	.socket_post_create =
+		LIST_HEAD_INIT(security_hook_heads.socket_post_create),
+	.socket_bind =	LIST_HEAD_INIT(security_hook_heads.socket_bind),
+	.socket_connect =
+		LIST_HEAD_INIT(security_hook_heads.socket_connect),
+	.socket_listen =
+		LIST_HEAD_INIT(security_hook_heads.socket_listen),
+	.socket_accept =
+		LIST_HEAD_INIT(security_hook_heads.socket_accept),
+	.socket_sendmsg =
+		LIST_HEAD_INIT(security_hook_heads.socket_sendmsg),
+	.socket_recvmsg =
+		LIST_HEAD_INIT(security_hook_heads.socket_recvmsg),
+	.socket_getsockname =
+		LIST_HEAD_INIT(security_hook_heads.socket_getsockname),
+	.socket_getpeername =
+		LIST_HEAD_INIT(security_hook_heads.socket_getpeername),
+	.socket_getsockopt =
+		LIST_HEAD_INIT(security_hook_heads.socket_getsockopt),
+	.socket_setsockopt =
+		LIST_HEAD_INIT(security_hook_heads.socket_setsockopt),
+	.socket_shutdown =
+		LIST_HEAD_INIT(security_hook_heads.socket_shutdown),
+	.socket_sock_rcv_skb =
+		LIST_HEAD_INIT(security_hook_heads.socket_sock_rcv_skb),
+	.socket_getpeersec_stream =
+		LIST_HEAD_INIT(security_hook_heads.socket_getpeersec_stream),
+	.socket_getpeersec_dgram =
+		LIST_HEAD_INIT(security_hook_heads.socket_getpeersec_dgram),
+	.sk_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.sk_alloc_security),
+	.sk_free_security =
+		LIST_HEAD_INIT(security_hook_heads.sk_free_security),
+	.sk_clone_security =
+		LIST_HEAD_INIT(security_hook_heads.sk_clone_security),
+	.sk_getsecid =	LIST_HEAD_INIT(security_hook_heads.sk_getsecid),
+	.sock_graft =	LIST_HEAD_INIT(security_hook_heads.sock_graft),
+	.inet_conn_request =
+		LIST_HEAD_INIT(security_hook_heads.inet_conn_request),
+	.inet_csk_clone =
+		LIST_HEAD_INIT(security_hook_heads.inet_csk_clone),
+	.inet_conn_established =
+		LIST_HEAD_INIT(security_hook_heads.inet_conn_established),
+	.secmark_relabel_packet =
+		LIST_HEAD_INIT(security_hook_heads.secmark_relabel_packet),
+	.secmark_refcount_inc =
+		LIST_HEAD_INIT(security_hook_heads.secmark_refcount_inc),
+	.secmark_refcount_dec =
+		LIST_HEAD_INIT(security_hook_heads.secmark_refcount_dec),
+	.req_classify_flow =
+		LIST_HEAD_INIT(security_hook_heads.req_classify_flow),
+	.tun_dev_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.tun_dev_alloc_security),
+	.tun_dev_free_security =
+		LIST_HEAD_INIT(security_hook_heads.tun_dev_free_security),
+	.tun_dev_create =
+		LIST_HEAD_INIT(security_hook_heads.tun_dev_create),
+	.tun_dev_attach_queue =
+		LIST_HEAD_INIT(security_hook_heads.tun_dev_attach_queue),
+	.tun_dev_attach =
+		LIST_HEAD_INIT(security_hook_heads.tun_dev_attach),
+	.tun_dev_open =	LIST_HEAD_INIT(security_hook_heads.tun_dev_open),
+	.skb_owned_by =	LIST_HEAD_INIT(security_hook_heads.skb_owned_by),
+#endif	/* CONFIG_SECURITY_NETWORK */
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+	.xfrm_policy_alloc_security =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_policy_alloc_security),
+	.xfrm_policy_clone_security =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_policy_clone_security),
+	.xfrm_policy_free_security =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_policy_free_security),
+	.xfrm_policy_delete_security =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_policy_delete_security),
+	.xfrm_state_alloc =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_state_alloc),
+	.xfrm_state_alloc_acquire =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_state_alloc_acquire),
+	.xfrm_state_free_security =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_state_free_security),
+	.xfrm_state_delete_security =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_state_delete_security),
+	.xfrm_policy_lookup =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_policy_lookup),
+	.xfrm_state_pol_flow_match =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_state_pol_flow_match),
+	.xfrm_decode_session =
+		LIST_HEAD_INIT(security_hook_heads.xfrm_decode_session),
+#endif	/* CONFIG_SECURITY_NETWORK_XFRM */
+#ifdef CONFIG_KEYS
+	.key_alloc =	LIST_HEAD_INIT(security_hook_heads.key_alloc),
+	.key_free =	LIST_HEAD_INIT(security_hook_heads.key_free),
+	.key_permission =
+		LIST_HEAD_INIT(security_hook_heads.key_permission),
+	.key_getsecurity =
+		LIST_HEAD_INIT(security_hook_heads.key_getsecurity),
+#endif	/* CONFIG_KEYS */
+#ifdef CONFIG_AUDIT
+	.audit_rule_init =
+		LIST_HEAD_INIT(security_hook_heads.audit_rule_init),
+	.audit_rule_known =
+		LIST_HEAD_INIT(security_hook_heads.audit_rule_known),
+	.audit_rule_match =
+		LIST_HEAD_INIT(security_hook_heads.audit_rule_match),
+	.audit_rule_free =
+		LIST_HEAD_INIT(security_hook_heads.audit_rule_free),
 #endif /* CONFIG_AUDIT */
+};
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 0cf105f346d4..06c9dd962c3c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1990,12 +1990,6 @@ static int selinux_binder_transfer_file(struct task_struct *from,
 static int selinux_ptrace_access_check(struct task_struct *child,
 				     unsigned int mode)
 {
-	int rc;
-
-	rc = cap_ptrace_access_check(child, mode);
-	if (rc)
-		return rc;
-
 	if (mode & PTRACE_MODE_READ) {
 		u32 sid = current_sid();
 		u32 csid = task_sid(child);
@@ -2007,25 +2001,13 @@ static int selinux_ptrace_access_check(struct task_struct *child,
 
 static int selinux_ptrace_traceme(struct task_struct *parent)
 {
-	int rc;
-
-	rc = cap_ptrace_traceme(parent);
-	if (rc)
-		return rc;
-
 	return task_has_perm(parent, current, PROCESS__PTRACE);
 }
 
 static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
 			  kernel_cap_t *inheritable, kernel_cap_t *permitted)
 {
-	int error;
-
-	error = current_has_perm(target, PROCESS__GETCAP);
-	if (error)
-		return error;
-
-	return cap_capget(target, effective, inheritable, permitted);
+	return current_has_perm(target, PROCESS__GETCAP);
 }
 
 static int selinux_capset(struct cred *new, const struct cred *old,
@@ -2033,13 +2015,6 @@ static int selinux_capset(struct cred *new, const struct cred *old,
 			  const kernel_cap_t *inheritable,
 			  const kernel_cap_t *permitted)
 {
-	int error;
-
-	error = cap_capset(new, old,
-				      effective, inheritable, permitted);
-	if (error)
-		return error;
-
 	return cred_has_perm(old, new, PROCESS__SETCAP);
 }
 
@@ -2056,12 +2031,6 @@ static int selinux_capset(struct cred *new, const struct cred *old,
 static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
 			   int cap, int audit)
 {
-	int rc;
-
-	rc = cap_capable(cred, ns, cap, audit);
-	if (rc)
-		return rc;
-
 	return cred_has_capability(cred, cap, audit);
 }
 
@@ -2139,12 +2108,12 @@ static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
 {
 	int rc, cap_sys_admin = 0;
 
-	rc = selinux_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN,
-			     SECURITY_CAP_NOAUDIT);
+	rc = cred_has_capability(current_cred(), CAP_SYS_ADMIN,
+					SECURITY_CAP_NOAUDIT);
 	if (rc == 0)
 		cap_sys_admin = 1;
 
-	return __vm_enough_memory(mm, pages, cap_sys_admin);
+	return cap_sys_admin;
 }
 
 /* binprm security operations */
@@ -2193,10 +2162,6 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
 	struct inode *inode = file_inode(bprm->file);
 	int rc;
 
-	rc = cap_bprm_set_creds(bprm);
-	if (rc)
-		return rc;
-
 	/* SELinux context only depends on initial program or script and not
 	 * the script interpreter */
 	if (bprm->cred_prepared)
@@ -2320,7 +2285,7 @@ static int selinux_bprm_secureexec(struct linux_binprm *bprm)
 					PROCESS__NOATSECURE, NULL);
 	}
 
-	return (atsecure || cap_bprm_secureexec(bprm));
+	return !!atsecure;
 }
 
 static int match_file(const void *p, struct file *file, unsigned fd)
@@ -3132,8 +3097,11 @@ static int selinux_inode_getsecurity(const struct inode *inode, const char *name
 	 * and lack of permission just means that we fall back to the
 	 * in-core context value, not a denial.
 	 */
-	error = selinux_capable(current_cred(), &init_user_ns, CAP_MAC_ADMIN,
-				SECURITY_CAP_NOAUDIT);
+	error = cap_capable(current_cred(), &init_user_ns, CAP_MAC_ADMIN,
+			    SECURITY_CAP_NOAUDIT);
+	if (!error)
+		error = cred_has_capability(current_cred(), CAP_MAC_ADMIN,
+					    SECURITY_CAP_NOAUDIT);
 	if (!error)
 		error = security_sid_to_context_force(isec->sid, &context,
 						      &size);
@@ -3318,12 +3286,7 @@ error:
 
 static int selinux_mmap_addr(unsigned long addr)
 {
-	int rc;
-
-	/* do DAC check on address space usage */
-	rc = cap_mmap_addr(addr);
-	if (rc)
-		return rc;
+	int rc = 0;
 
 	if (addr < CONFIG_LSM_MMAP_MIN_ADDR) {
 		u32 sid = current_sid();
@@ -3639,23 +3602,11 @@ static void selinux_task_getsecid(struct task_struct *p, u32 *secid)
 
 static int selinux_task_setnice(struct task_struct *p, int nice)
 {
-	int rc;
-
-	rc = cap_task_setnice(p, nice);
-	if (rc)
-		return rc;
-
 	return current_has_perm(p, PROCESS__SETSCHED);
 }
 
 static int selinux_task_setioprio(struct task_struct *p, int ioprio)
 {
-	int rc;
-
-	rc = cap_task_setioprio(p, ioprio);
-	if (rc)
-		return rc;
-
 	return current_has_perm(p, PROCESS__SETSCHED);
 }
 
@@ -3681,12 +3632,6 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
 
 static int selinux_task_setscheduler(struct task_struct *p)
 {
-	int rc;
-
-	rc = cap_task_setscheduler(p);
-	if (rc)
-		return rc;
-
 	return current_has_perm(p, PROCESS__SETSCHED);
 }
 
@@ -5097,12 +5042,6 @@ static unsigned int selinux_ipv6_postroute(const struct nf_hook_ops *ops,
 
 static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
 {
-	int err;
-
-	err = cap_netlink_send(sk, skb);
-	if (err)
-		return err;
-
 	return selinux_nlmsg_perm(sk, skb);
 }
 
@@ -5840,9 +5779,7 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
 
 #endif
 
-static struct security_operations selinux_ops = {
-	LSM_HOOK_INIT(name, "selinux"),
-
+static struct security_hook_list selinux_hooks[] = {
 	LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
 	LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
 	LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder),
@@ -6055,7 +5992,7 @@ static struct security_operations selinux_ops = {
 
 static __init int selinux_init(void)
 {
-	if (!security_module_enable(&selinux_ops)) {
+	if (!security_module_enable("selinux")) {
 		selinux_enabled = 0;
 		return 0;
 	}
@@ -6077,8 +6014,7 @@ static __init int selinux_init(void)
 					    0, SLAB_PANIC, NULL);
 	avc_init();
 
-	if (register_security(&selinux_ops))
-		panic("SELinux: Unable to register with kernel.\n");
+	security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks));
 
 	if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
 		panic("SELinux: Unable to register AVC netcache callback\n");
@@ -6206,7 +6142,7 @@ int selinux_disable(void)
 	selinux_disabled = 1;
 	selinux_enabled = 0;
 
-	reset_security_ops();
+	security_delete_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks));
 
 	/* Try to destroy the avc node cache */
 	avc_disable();
diff --git a/security/smack/smack.h b/security/smack/smack.h
index 262dad8dfbc6..b8c1a869d85e 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -276,8 +276,6 @@ extern struct mutex	smack_known_lock;
 extern struct list_head smack_known_list;
 extern struct list_head smk_netlbladdr_list;
 
-extern struct security_operations smack_ops;
-
 #define SMACK_HASH_SLOTS 16
 extern struct hlist_head smack_known_hash[SMACK_HASH_SLOTS];
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 4313bf44c3f0..5eae42c8d0d5 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -436,17 +436,11 @@ static int smk_ptrace_rule_check(struct task_struct *tracer,
  */
 static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode)
 {
-	int rc;
 	struct smack_known *skp;
 
-	rc = cap_ptrace_access_check(ctp, mode);
-	if (rc != 0)
-		return rc;
-
 	skp = smk_of_task_struct(ctp);
 
-	rc = smk_ptrace_rule_check(current, skp, mode, __func__);
-	return rc;
+	return smk_ptrace_rule_check(current, skp, mode, __func__);
 }
 
 /**
@@ -462,10 +456,6 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
 	int rc;
 	struct smack_known *skp;
 
-	rc = cap_ptrace_traceme(ptp);
-	if (rc != 0)
-		return rc;
-
 	skp = smk_of_task(current_security());
 
 	rc = smk_ptrace_rule_check(ptp, skp, PTRACE_MODE_ATTACH, __func__);
@@ -721,10 +711,6 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm)
 	struct inode_smack *isp;
 	int rc;
 
-	rc = cap_bprm_set_creds(bprm);
-	if (rc != 0)
-		return rc;
-
 	if (bprm->cred_prepared)
 		return 0;
 
@@ -779,12 +765,11 @@ static void smack_bprm_committing_creds(struct linux_binprm *bprm)
 static int smack_bprm_secureexec(struct linux_binprm *bprm)
 {
 	struct task_smack *tsp = current_security();
-	int ret = cap_bprm_secureexec(bprm);
 
-	if (!ret && (tsp->smk_task != tsp->smk_forked))
-		ret = 1;
+	if (tsp->smk_task != tsp->smk_forked)
+		return 1;
 
-	return ret;
+	return 0;
 }
 
 /*
@@ -1934,12 +1919,7 @@ static void smack_task_getsecid(struct task_struct *p, u32 *secid)
  */
 static int smack_task_setnice(struct task_struct *p, int nice)
 {
-	int rc;
-
-	rc = cap_task_setnice(p, nice);
-	if (rc == 0)
-		rc = smk_curacc_on_task(p, MAY_WRITE, __func__);
-	return rc;
+	return smk_curacc_on_task(p, MAY_WRITE, __func__);
 }
 
 /**
@@ -1951,12 +1931,7 @@ static int smack_task_setnice(struct task_struct *p, int nice)
  */
 static int smack_task_setioprio(struct task_struct *p, int ioprio)
 {
-	int rc;
-
-	rc = cap_task_setioprio(p, ioprio);
-	if (rc == 0)
-		rc = smk_curacc_on_task(p, MAY_WRITE, __func__);
-	return rc;
+	return smk_curacc_on_task(p, MAY_WRITE, __func__);
 }
 
 /**
@@ -1980,12 +1955,7 @@ static int smack_task_getioprio(struct task_struct *p)
  */
 static int smack_task_setscheduler(struct task_struct *p)
 {
-	int rc;
-
-	rc = cap_task_setscheduler(p);
-	if (rc == 0)
-		rc = smk_curacc_on_task(p, MAY_WRITE, __func__);
-	return rc;
+	return smk_curacc_on_task(p, MAY_WRITE, __func__);
 }
 
 /**
@@ -4266,9 +4236,7 @@ static int smack_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen)
 	return 0;
 }
 
-struct security_operations smack_ops = {
-	LSM_HOOK_INIT(name, "smack"),
-
+struct security_hook_list smack_hooks[] = {
 	LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
 	LSM_HOOK_INIT(syslog, smack_syslog),
@@ -4451,7 +4419,7 @@ static __init int smack_init(void)
 	struct cred *cred;
 	struct task_smack *tsp;
 
-	if (!security_module_enable(&smack_ops))
+	if (!security_module_enable("smack"))
 		return 0;
 
 	smack_enabled = 1;
@@ -4481,8 +4449,7 @@ static __init int smack_init(void)
 	/*
 	 * Register with LSM
 	 */
-	if (register_security(&smack_ops))
-		panic("smack: Unable to register with kernel.\n");
+	security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks));
 
 	return 0;
 }
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index d9682985349e..4aa12c8d3c63 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -2547,7 +2547,7 @@ static int __init init_smk_fs(void)
 	int err;
 	int rc;
 
-	if (!security_module_enable(&smack_ops))
+	if (!security_module_enable("smack"))
 		return 0;
 
 	err = smk_init_sysfs();
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index bce13583efda..cbf3df422c87 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -72,12 +72,6 @@ static void tomoyo_cred_free(struct cred *cred)
  */
 static int tomoyo_bprm_set_creds(struct linux_binprm *bprm)
 {
-	int rc;
-
-	rc = cap_bprm_set_creds(bprm);
-	if (rc)
-		return rc;
-
 	/*
 	 * Do only if this function is called for the first time of an execve
 	 * operation.
@@ -502,8 +496,7 @@ static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg,
  * tomoyo_security_ops is a "struct security_operations" which is used for
  * registering TOMOYO.
  */
-static struct security_operations tomoyo_security_ops = {
-	LSM_HOOK_INIT(name, "tomoyo"),
+static struct security_hook_list tomoyo_hooks[] = {
 	LSM_HOOK_INIT(cred_alloc_blank, tomoyo_cred_alloc_blank),
 	LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare),
 	LSM_HOOK_INIT(cred_transfer, tomoyo_cred_transfer),
@@ -546,11 +539,10 @@ static int __init tomoyo_init(void)
 {
 	struct cred *cred = (struct cred *) current_cred();
 
-	if (!security_module_enable(&tomoyo_security_ops))
+	if (!security_module_enable("tomoyo"))
 		return 0;
 	/* register ourselves with the security framework */
-	if (register_security(&tomoyo_security_ops))
-		panic("Failure registering TOMOYO Linux");
+	security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks));
 	printk(KERN_INFO "TOMOYO Linux initialized\n");
 	cred->security = &tomoyo_kernel_domain;
 	tomoyo_mm_init();
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index 23dd4c6246b2..9ed32502470e 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -154,13 +154,9 @@ void yama_task_free(struct task_struct *task)
 int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			   unsigned long arg4, unsigned long arg5)
 {
-	int rc;
+	int rc = -ENOSYS;
 	struct task_struct *myself = current;
 
-	rc = cap_task_prctl(option, arg2, arg3, arg4, arg5);
-	if (rc != -ENOSYS)
-		return rc;
-
 	switch (option) {
 	case PR_SET_PTRACER:
 		/* Since a thread can call prctl(), find the group leader
@@ -279,17 +275,10 @@ static int ptracer_exception_found(struct task_struct *tracer,
  *
  * Returns 0 if following the ptrace is allowed, -ve on error.
  */
-int yama_ptrace_access_check(struct task_struct *child,
+static int yama_ptrace_access_check(struct task_struct *child,
 				    unsigned int mode)
 {
-	int rc;
-
-	/* If standard caps disallows it, so does Yama.  We should
-	 * only tighten restrictions further.
-	 */
-	rc = cap_ptrace_access_check(child, mode);
-	if (rc)
-		return rc;
+	int rc = 0;
 
 	/* require ptrace target be a child of ptracer on attach */
 	if (mode == PTRACE_MODE_ATTACH) {
@@ -335,14 +324,7 @@ int yama_ptrace_access_check(struct task_struct *child,
  */
 int yama_ptrace_traceme(struct task_struct *parent)
 {
-	int rc;
-
-	/* If standard caps disallows it, so does Yama.  We should
-	 * only tighten restrictions further.
-	 */
-	rc = cap_ptrace_traceme(parent);
-	if (rc)
-		return rc;
+	int rc = 0;
 
 	/* Only disallow PTRACE_TRACEME on more aggressive settings. */
 	switch (ptrace_scope) {
@@ -364,16 +346,17 @@ int yama_ptrace_traceme(struct task_struct *parent)
 	return rc;
 }
 
-#ifndef CONFIG_SECURITY_YAMA_STACKED
-static struct security_operations yama_ops = {
-	LSM_HOOK_INIT(name, "yama"),
-
+static struct security_hook_list yama_hooks[] = {
 	LSM_HOOK_INIT(ptrace_access_check, yama_ptrace_access_check),
 	LSM_HOOK_INIT(ptrace_traceme, yama_ptrace_traceme),
 	LSM_HOOK_INIT(task_prctl, yama_task_prctl),
 	LSM_HOOK_INIT(task_free, yama_task_free),
 };
-#endif
+
+void __init yama_add_hooks(void)
+{
+	security_add_hooks(yama_hooks, ARRAY_SIZE(yama_hooks));
+}
 
 #ifdef CONFIG_SYSCTL
 static int yama_dointvec_minmax(struct ctl_table *table, int write,
@@ -418,16 +401,13 @@ static struct ctl_table yama_sysctl_table[] = {
 static __init int yama_init(void)
 {
 #ifndef CONFIG_SECURITY_YAMA_STACKED
-	if (!security_module_enable(&yama_ops))
+	/*
+	 * If yama is being stacked this is already taken care of.
+	 */
+	if (!security_module_enable("yama"))
 		return 0;
 #endif
-
-	printk(KERN_INFO "Yama: becoming mindful.\n");
-
-#ifndef CONFIG_SECURITY_YAMA_STACKED
-	if (register_security(&yama_ops))
-		panic("Yama: kernel registration failed.\n");
-#endif
+	pr_info("Yama: becoming mindful.\n");
 
 #ifdef CONFIG_SYSCTL
 	if (!register_sysctl_paths(yama_sysctl_path, yama_sysctl_table))
-- 
cgit v1.2.3


From 6a4b6b0a3b55f23f4cc9ad85a1539c581bcb0874 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 4 May 2015 17:10:31 +0200
Subject: gpio: sysfs: clean up chip class-device handling

Clean gpio-chip class device registration and deregistration.

The class device is registered when a gpio-chip is added (or from
gpiolib_sysfs_init post-core init call), and deregistered when the chip
is removed.

Store the class device in struct gpio_chip directly rather than do a
class-device lookup on deregistration. This also removes the need for
the exported flag.

Signed-off-by: Johan Hovold <johan@kernel.org>
Reviewed-by: Alexandre Courbot <acourbot@nvidia.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-sysfs.c | 39 +++++++++++++--------------------------
 include/linux/gpio/driver.h  |  4 ++--
 2 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index d19bf234e878..767b79adb9a4 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -582,7 +582,7 @@ int gpiod_export(struct gpio_desc *desc, bool direction_may_change)
 	mutex_lock(&sysfs_lock);
 
 	/* check if chip is being removed */
-	if (!chip || !chip->exported) {
+	if (!chip || !chip->cdev) {
 		status = -ENODEV;
 		goto fail_unlock;
 	}
@@ -767,7 +767,6 @@ EXPORT_SYMBOL_GPL(gpiod_unexport);
 
 int gpiochip_export(struct gpio_chip *chip)
 {
-	int		status;
 	struct device	*dev;
 
 	/* Many systems register gpio chips for SOC support very early,
@@ -783,41 +782,29 @@ int gpiochip_export(struct gpio_chip *chip)
 					chip, gpiochip_groups,
 					"gpiochip%d", chip->base);
 	if (IS_ERR(dev))
-		status = PTR_ERR(dev);
-	else
-		status = 0;
+		return PTR_ERR(dev);
 
 	mutex_lock(&sysfs_lock);
-	chip->exported = (status == 0);
+	chip->cdev = dev;
 	mutex_unlock(&sysfs_lock);
 
-	if (status)
-		chip_dbg(chip, "%s: status %d\n", __func__, status);
-
-	return status;
+	return 0;
 }
 
 void gpiochip_unexport(struct gpio_chip *chip)
 {
-	int			status;
-	struct device		*dev;
 	struct gpio_desc *desc;
 	unsigned int i;
 
-	dev = class_find_device(&gpio_class, NULL, chip, match_export);
-	if (dev) {
-		put_device(dev);
-		device_unregister(dev);
-		/* prevent further gpiod exports */
-		mutex_lock(&sysfs_lock);
-		chip->exported = false;
-		mutex_unlock(&sysfs_lock);
-		status = 0;
-	} else
-		status = -ENODEV;
+	if (!chip->cdev)
+		return;
 
-	if (status)
-		chip_dbg(chip, "%s: status %d\n", __func__, status);
+	device_unregister(chip->cdev);
+
+	/* prevent further gpiod exports */
+	mutex_lock(&sysfs_lock);
+	chip->cdev = NULL;
+	mutex_unlock(&sysfs_lock);
 
 	/* unregister gpiod class devices owned by sysfs */
 	for (i = 0; i < chip->ngpio; i++) {
@@ -845,7 +832,7 @@ static int __init gpiolib_sysfs_init(void)
 	 */
 	spin_lock_irqsave(&gpio_lock, flags);
 	list_for_each_entry(chip, &gpio_chips, list) {
-		if (chip->exported)
+		if (chip->cdev)
 			continue;
 
 		/*
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index f1b36593ec9f..2c1e639f66bd 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -20,6 +20,7 @@ struct seq_file;
  * struct gpio_chip - abstract a GPIO controller
  * @label: for diagnostics
  * @dev: optional device providing the GPIOs
+ * @cdev: class device used by sysfs interface (may be NULL)
  * @owner: helps prevent removal of modules exporting active GPIOs
  * @list: links gpio_chips together for traversal
  * @request: optional hook for chip-specific activation, such as
@@ -57,7 +58,6 @@ struct seq_file;
  *	implies that if the chip supports IRQs, these IRQs need to be threaded
  *	as the chip access may sleep when e.g. reading out the IRQ status
  *	registers.
- * @exported: flags if the gpiochip is exported for use from sysfs. Private.
  * @irq_not_threaded: flag must be set if @can_sleep is set but the
  *	IRQs don't need to be threaded
  *
@@ -74,6 +74,7 @@ struct seq_file;
 struct gpio_chip {
 	const char		*label;
 	struct device		*dev;
+	struct device		*cdev;
 	struct module		*owner;
 	struct list_head        list;
 
@@ -109,7 +110,6 @@ struct gpio_chip {
 	const char		*const *names;
 	bool			can_sleep;
 	bool			irq_not_threaded;
-	bool			exported;
 
 #ifdef CONFIG_GPIOLIB_IRQCHIP
 	/*
-- 
cgit v1.2.3


From 166a85e44245d771bd7042f3ad72aa0e12bb53bd Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Mon, 4 May 2015 17:10:33 +0200
Subject: gpio: remove gpiod_sysfs_set_active_low

Remove gpiod_sysfs_set_active_low (and gpio_sysfs_set_active_low) which
allowed code to change the polarity of a gpio line even after it had
been exported through sysfs.

Drivers should not care, and generally does not know, about gpio-line
polarity which is a hardware feature that needs to be described by
firmware.

It is currently possible to define gpio-line polarity in device-tree and
acpi firmware or using platform data. Userspace can also change the
polarity through sysfs.

Note that drivers using the legacy gpio interface could still use
GPIOF_ACTIVE_LOW to change the polarity before exporting the gpio.

There are no in-kernel users of this interface.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Harry Wei <harryxiyou@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@zh-kernel.org
Cc: linux-arch@vger.kernel.org
Signed-off-by: Johan Hovold <johan@kernel.org>
Reviewed-by: Alexandre Courbot <acourbot@nvidia.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/gpio/gpio-legacy.txt |  9 -------
 Documentation/gpio/sysfs.txt       |  8 -------
 Documentation/zh_CN/gpio.txt       |  8 -------
 drivers/gpio/gpiolib-sysfs.c       | 48 ++------------------------------------
 include/asm-generic/gpio.h         |  5 ----
 include/linux/gpio.h               |  7 ------
 include/linux/gpio/consumer.h      |  6 -----
 7 files changed, 2 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/gpio/gpio-legacy.txt b/Documentation/gpio/gpio-legacy.txt
index 6f83fa965b4b..79ab5648d69b 100644
--- a/Documentation/gpio/gpio-legacy.txt
+++ b/Documentation/gpio/gpio-legacy.txt
@@ -751,9 +751,6 @@ requested using gpio_request():
 	int gpio_export_link(struct device *dev, const char *name,
 		unsigned gpio)
 
-	/* change the polarity of a GPIO node in sysfs */
-	int gpio_sysfs_set_active_low(unsigned gpio, int value);
-
 After a kernel driver requests a GPIO, it may only be made available in
 the sysfs interface by gpio_export().  The driver can control whether the
 signal direction may change.  This helps drivers prevent userspace code
@@ -767,9 +764,3 @@ After the GPIO has been exported, gpio_export_link() allows creating
 symlinks from elsewhere in sysfs to the GPIO sysfs node.  Drivers can
 use this to provide the interface under their own device in sysfs with
 a descriptive name.
-
-Drivers can use gpio_sysfs_set_active_low() to hide GPIO line polarity
-differences between boards from user space.  This only affects the
-sysfs interface.  Polarity change can be done both before and after
-gpio_export(), and previously enabled poll(2) support for either
-rising or falling edge will be reconfigured to follow this setting.
diff --git a/Documentation/gpio/sysfs.txt b/Documentation/gpio/sysfs.txt
index c2c3a97f8ff7..535b6a8a7a7c 100644
--- a/Documentation/gpio/sysfs.txt
+++ b/Documentation/gpio/sysfs.txt
@@ -132,9 +132,6 @@ requested using gpio_request():
 	int gpiod_export_link(struct device *dev, const char *name,
 		      struct gpio_desc *desc);
 
-	/* change the polarity of a GPIO node in sysfs */
-	int gpiod_sysfs_set_active_low(struct gpio_desc *desc, int value);
-
 After a kernel driver requests a GPIO, it may only be made available in
 the sysfs interface by gpiod_export(). The driver can control whether the
 signal direction may change. This helps drivers prevent userspace code
@@ -148,8 +145,3 @@ After the GPIO has been exported, gpiod_export_link() allows creating
 symlinks from elsewhere in sysfs to the GPIO sysfs node. Drivers can
 use this to provide the interface under their own device in sysfs with
 a descriptive name.
-
-Drivers can use gpiod_sysfs_set_active_low() to hide GPIO line polarity
-differences between boards from user space. Polarity change can be done both
-before and after gpiod_export(), and previously enabled poll(2) support for
-either rising or falling edge will be reconfigured to follow this setting.
diff --git a/Documentation/zh_CN/gpio.txt b/Documentation/zh_CN/gpio.txt
index d5b8f01833f4..bce972521065 100644
--- a/Documentation/zh_CN/gpio.txt
+++ b/Documentation/zh_CN/gpio.txt
@@ -638,9 +638,6 @@ GPIO 控制器的路径类似 /sys/class/gpio/gpiochip42/ (对于从#42 GPIO
 	int gpio_export_link(struct device *dev, const char *name,
 		unsigned gpio)
 
-	/* 改变 sysfs 中的一个 GPIO 节点的极性 */
-	int gpio_sysfs_set_active_low(unsigned gpio, int value);
-
 在一个内核驱动申请一个 GPIO 之后，它可以通过 gpio_export()使其在 sysfs
 接口中可见。该驱动可以控制信号方向是否可修改。这有助于防止用户空间代码无意间
 破坏重要的系统状态。
@@ -651,8 +648,3 @@ GPIO 控制器的路径类似 /sys/class/gpio/gpiochip42/ (对于从#42 GPIO
 在 GPIO 被导出之后，gpio_export_link()允许在 sysfs 文件系统的任何地方
 创建一个到这个 GPIO sysfs 节点的符号链接。这样驱动就可以通过一个描述性的
 名字，在 sysfs 中他们所拥有的设备下提供一个(到这个 GPIO sysfs 节点的)接口。
-
-驱动可以使用 gpio_sysfs_set_active_low() 来在用户空间隐藏电路板之间
-GPIO 线的极性差异。这个仅对 sysfs 接口起作用。极性的改变可以在 gpio_export()
-前后进行,且之前使能的轮询操作(poll(2))支持(上升或下降沿)将会被重新配置来遵循
-这个设置。
diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index aeb73ef2955e..9dcd346a20fb 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -308,8 +308,8 @@ static int sysfs_set_active_low(struct gpio_desc *desc, struct device *dev,
 		clear_bit(FLAG_ACTIVE_LOW, &desc->flags);
 
 	/* reconfigure poll(2) support if enabled on one edge only */
-	if (dev != NULL && (!!test_bit(FLAG_TRIG_RISE, &desc->flags) ^
-				!!test_bit(FLAG_TRIG_FALL, &desc->flags))) {
+	if (!!test_bit(FLAG_TRIG_RISE, &desc->flags) ^
+				!!test_bit(FLAG_TRIG_FALL, &desc->flags)) {
 		unsigned long trigger_flags = desc->flags & GPIO_TRIGGER_MASK;
 
 		gpio_setup_irq(desc, dev, 0);
@@ -680,50 +680,6 @@ int gpiod_export_link(struct device *dev, const char *name,
 }
 EXPORT_SYMBOL_GPL(gpiod_export_link);
 
-/**
- * gpiod_sysfs_set_active_low - set the polarity of gpio sysfs value
- * @gpio: gpio to change
- * @value: non-zero to use active low, i.e. inverted values
- *
- * Set the polarity of /sys/class/gpio/gpioN/value sysfs attribute.
- * The GPIO does not have to be exported yet.  If poll(2) support has
- * been enabled for either rising or falling edge, it will be
- * reconfigured to follow the new polarity.
- *
- * Returns zero on success, else an error.
- */
-int gpiod_sysfs_set_active_low(struct gpio_desc *desc, int value)
-{
-	struct device		*dev = NULL;
-	int			status = -EINVAL;
-
-	if (!desc) {
-		pr_warn("%s: invalid GPIO\n", __func__);
-		return -EINVAL;
-	}
-
-	mutex_lock(&sysfs_lock);
-
-	if (test_bit(FLAG_EXPORT, &desc->flags)) {
-		dev = class_find_device(&gpio_class, NULL, desc, match_export);
-		if (dev == NULL) {
-			status = -ENODEV;
-			goto unlock;
-		}
-	}
-
-	status = sysfs_set_active_low(desc, dev, value);
-	put_device(dev);
-unlock:
-	mutex_unlock(&sysfs_lock);
-
-	if (status)
-		gpiod_dbg(desc, "%s: status %d\n", __func__, status);
-
-	return status;
-}
-EXPORT_SYMBOL_GPL(gpiod_sysfs_set_active_low);
-
 /**
  * gpiod_unexport - reverse effect of gpio_export()
  * @gpio: gpio to make unavailable
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 9bb0d11729c9..40ec1433f05d 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -128,11 +128,6 @@ static inline int gpio_export_link(struct device *dev, const char *name,
 	return gpiod_export_link(dev, name, gpio_to_desc(gpio));
 }
 
-static inline int gpio_sysfs_set_active_low(unsigned gpio, int value)
-{
-	return gpiod_sysfs_set_active_low(gpio_to_desc(gpio), value);
-}
-
 static inline void gpio_unexport(unsigned gpio)
 {
 	gpiod_unexport(gpio_to_desc(gpio));
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index ab81339a8590..d12b5d566e4b 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -196,13 +196,6 @@ static inline int gpio_export_link(struct device *dev, const char *name,
 	return -EINVAL;
 }
 
-static inline int gpio_sysfs_set_active_low(unsigned gpio, int value)
-{
-	/* GPIO can never have been requested */
-	WARN_ON(1);
-	return -EINVAL;
-}
-
 static inline void gpio_unexport(unsigned gpio)
 {
 	/* GPIO can never have been exported */
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 3a7c9ffd5ab9..09a7fb0062a6 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -449,7 +449,6 @@ static inline int desc_to_gpio(const struct gpio_desc *desc)
 int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
 int gpiod_export_link(struct device *dev, const char *name,
 		      struct gpio_desc *desc);
-int gpiod_sysfs_set_active_low(struct gpio_desc *desc, int value);
 void gpiod_unexport(struct gpio_desc *desc);
 
 #else  /* CONFIG_GPIOLIB && CONFIG_GPIO_SYSFS */
@@ -466,11 +465,6 @@ static inline int gpiod_export_link(struct device *dev, const char *name,
 	return -ENOSYS;
 }
 
-static inline int gpiod_sysfs_set_active_low(struct gpio_desc *desc, int value)
-{
-	return -ENOSYS;
-}
-
 static inline void gpiod_unexport(struct gpio_desc *desc)
 {
 }
-- 
cgit v1.2.3


From 0e39250845c0f91acc64264709b25f7f9b85c2c3 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 6 May 2015 21:11:51 -0700
Subject: net: Store virtual address instead of page in netdev_alloc_cache

This change makes it so that we store the virtual address of the page
in the netdev_alloc_cache instead of the page pointer.  The idea behind
this is to avoid multiple calls to page_address since the virtual address
is required for every access, but the page pointer is only needed at
allocation or reset of the page.

While I was at it I also reordered the netdev_alloc_cache structure a bit
so that the size is always 16 bytes by dropping size in the case where
PAGE_SIZE is greater than or equal to 32KB.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  5 ++---
 net/core/skbuff.c      | 55 +++++++++++++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9c2f793573fa..8b9a2c35a9d7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2128,9 +2128,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 		kfree_skb(skb);
 }
 
-#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768)
-#define NETDEV_FRAG_PAGE_MAX_SIZE  (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER)
-#define NETDEV_PAGECNT_MAX_BIAS	   NETDEV_FRAG_PAGE_MAX_SIZE
+#define NETDEV_FRAG_PAGE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
+#define NETDEV_FRAG_PAGE_MAX_ORDER	get_order(NETDEV_FRAG_PAGE_MAX_SIZE)
 
 void *netdev_alloc_frag(unsigned int fragsz);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d6851ca32598..a3062ec341c3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -348,7 +348,13 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 EXPORT_SYMBOL(build_skb);
 
 struct netdev_alloc_cache {
-	struct page_frag	frag;
+	void * va;
+#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
+	__u16 offset;
+	__u16 size;
+#else
+	__u32 offset;
+#endif
 	/* we maintain a pagecount bias, so that we dont dirty cache line
 	 * containing page->_count every time we allocate a fragment.
 	 */
@@ -361,21 +367,20 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
 static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
 				       gfp_t gfp_mask)
 {
-	const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
 	struct page *page = NULL;
 	gfp_t gfp = gfp_mask;
 
-	if (order) {
-		gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-			    __GFP_NOMEMALLOC;
-		page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
-		nc->frag.size = PAGE_SIZE << (page ? order : 0);
-	}
-
+#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
+	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+		    __GFP_NOMEMALLOC;
+	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+				NETDEV_FRAG_PAGE_MAX_ORDER);
+	nc->size = page ? NETDEV_FRAG_PAGE_MAX_SIZE : PAGE_SIZE;
+#endif
 	if (unlikely(!page))
 		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 
-	nc->frag.page = page;
+	nc->va = page ? page_address(page) : NULL;
 
 	return page;
 }
@@ -383,19 +388,20 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
 static void *__alloc_page_frag(struct netdev_alloc_cache *nc,
 			       unsigned int fragsz, gfp_t gfp_mask)
 {
-	struct page *page = nc->frag.page;
-	unsigned int size;
+	unsigned int size = PAGE_SIZE;
+	struct page *page;
 	int offset;
 
-	if (unlikely(!page)) {
+	if (unlikely(!nc->va)) {
 refill:
 		page = __page_frag_refill(nc, gfp_mask);
 		if (!page)
 			return NULL;
 
-		/* if size can vary use frag.size else just use PAGE_SIZE */
-		size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
+#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
@@ -404,17 +410,20 @@ refill:
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page->pfmemalloc;
 		nc->pagecnt_bias = size;
-		nc->frag.offset = size;
+		nc->offset = size;
 	}
 
-	offset = nc->frag.offset - fragsz;
+	offset = nc->offset - fragsz;
 	if (unlikely(offset < 0)) {
+		page = virt_to_page(nc->va);
+
 		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
 			goto refill;
 
-		/* if size can vary use frag.size else just use PAGE_SIZE */
-		size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
+#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
 		/* OK, page count is 0, we can safely set it */
 		atomic_set(&page->_count, size);
 
@@ -424,9 +433,9 @@ refill:
 	}
 
 	nc->pagecnt_bias--;
-	nc->frag.offset = offset;
+	nc->offset = offset;
 
-	return page_address(page) + offset;
+	return nc->va + offset;
 }
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
-- 
cgit v1.2.3


From b63ae8ca096dfdbfeef6a209c30a93a966518853 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 6 May 2015 21:11:57 -0700
Subject: mm/net: Rename and move page fragment handling from net/ to mm/

This change moves the __alloc_page_frag functionality out of the networking
stack and into the page allocation portion of mm.  The idea it so help make
this maintainable by placing it with other page allocation functions.

Since we are moving it from skbuff.c to page_alloc.c I have also renamed
the basic defines and structure from netdev_alloc_cache to page_frag_cache
to reflect that this is now part of a different kernel subsystem.

I have also added a simple __free_page_frag function which can handle
freeing the frags based on the skb->head pointer.  The model for this is
based off of __free_pages since we don't actually need to deal with all of
the cases that put_page handles.  I incorporated the virt_to_head_page call
and compound_order into the function as it actually allows for a signficant
size reduction by reducing code duplication.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/gfp.h      |   5 +++
 include/linux/mm_types.h |  18 +++++++++
 include/linux/skbuff.h   |   3 --
 mm/page_alloc.c          |  98 ++++++++++++++++++++++++++++++++++++++++++++++
 net/core/skbuff.c        | 100 +++--------------------------------------------
 5 files changed, 127 insertions(+), 97 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a9373e61e8..70a7fee1efb3 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -366,6 +366,11 @@ extern void free_pages(unsigned long addr, unsigned int order);
 extern void free_hot_cold_page(struct page *page, bool cold);
 extern void free_hot_cold_page_list(struct list_head *list, bool cold);
 
+struct page_frag_cache;
+extern void *__alloc_page_frag(struct page_frag_cache *nc,
+			       unsigned int fragsz, gfp_t gfp_mask);
+extern void __free_page_frag(void *addr);
+
 extern void __free_kmem_pages(struct page *page, unsigned int order);
 extern void free_kmem_pages(unsigned long addr, unsigned int order);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8d37e26a1007..0038ac7466fd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -226,6 +226,24 @@ struct page_frag {
 #endif
 };
 
+#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
+#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
+
+struct page_frag_cache {
+	void * va;
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	__u16 offset;
+	__u16 size;
+#else
+	__u32 offset;
+#endif
+	/* we maintain a pagecount bias, so that we dont dirty cache line
+	 * containing page->_count every time we allocate a fragment.
+	 */
+	unsigned int		pagecnt_bias;
+	bool pfmemalloc;
+};
+
 typedef unsigned long __nocast vm_flags_t;
 
 /*
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8b9a2c35a9d7..0039fcc45b3b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2128,9 +2128,6 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 		kfree_skb(skb);
 }
 
-#define NETDEV_FRAG_PAGE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
-#define NETDEV_FRAG_PAGE_MAX_ORDER	get_order(NETDEV_FRAG_PAGE_MAX_SIZE)
-
 void *netdev_alloc_frag(unsigned int fragsz);
 
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ebffa0e4a9c0..2fd31aebef30 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2966,6 +2966,104 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
+/*
+ * Page Fragment:
+ *  An arbitrary-length arbitrary-offset area of memory which resides
+ *  within a 0 or higher order page.  Multiple fragments within that page
+ *  are individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions below provide a simple allocation framework for
+ * page fragments.  This is used by the network stack and network device
+ * drivers to provide a backing region of memory for use as either an
+ * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ */
+static struct page *__page_frag_refill(struct page_frag_cache *nc,
+				       gfp_t gfp_mask)
+{
+	struct page *page = NULL;
+	gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+		    __GFP_NOMEMALLOC;
+	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+				PAGE_FRAG_CACHE_MAX_ORDER);
+	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+	nc->va = page ? page_address(page) : NULL;
+
+	return page;
+}
+
+void *__alloc_page_frag(struct page_frag_cache *nc,
+			unsigned int fragsz, gfp_t gfp_mask)
+{
+	unsigned int size = PAGE_SIZE;
+	struct page *page;
+	int offset;
+
+	if (unlikely(!nc->va)) {
+refill:
+		page = __page_frag_refill(nc, gfp_mask);
+		if (!page)
+			return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* Even if we own the page, we do not use atomic_set().
+		 * This would break get_page_unless_zero() users.
+		 */
+		atomic_add(size - 1, &page->_count);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pfmemalloc = page->pfmemalloc;
+		nc->pagecnt_bias = size;
+		nc->offset = size;
+	}
+
+	offset = nc->offset - fragsz;
+	if (unlikely(offset < 0)) {
+		page = virt_to_page(nc->va);
+
+		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+			goto refill;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+		/* if size can vary use size else just use PAGE_SIZE */
+		size = nc->size;
+#endif
+		/* OK, page count is 0, we can safely set it */
+		atomic_set(&page->_count, size);
+
+		/* reset page count bias and offset to start of new frag */
+		nc->pagecnt_bias = size;
+		offset = size - fragsz;
+	}
+
+	nc->pagecnt_bias--;
+	nc->offset = offset;
+
+	return nc->va + offset;
+}
+EXPORT_SYMBOL(__alloc_page_frag);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void __free_page_frag(void *addr)
+{
+	struct page *page = virt_to_head_page(addr);
+
+	if (unlikely(put_page_testzero(page)))
+		__free_pages_ok(page, compound_order(page));
+}
+EXPORT_SYMBOL(__free_page_frag);
+
 /*
  * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
  * of the current memory cgroup.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a3062ec341c3..dcc0e07abf47 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -347,100 +347,12 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 }
 EXPORT_SYMBOL(build_skb);
 
-struct netdev_alloc_cache {
-	void * va;
-#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
-#else
-	__u32 offset;
-#endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_count every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
-};
-static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
-
-static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
-				       gfp_t gfp_mask)
-{
-	struct page *page = NULL;
-	gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
-	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-		    __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				NETDEV_FRAG_PAGE_MAX_ORDER);
-	nc->size = page ? NETDEV_FRAG_PAGE_MAX_SIZE : PAGE_SIZE;
-#endif
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
-	nc->va = page ? page_address(page) : NULL;
-
-	return page;
-}
-
-static void *__alloc_page_frag(struct netdev_alloc_cache *nc,
-			       unsigned int fragsz, gfp_t gfp_mask)
-{
-	unsigned int size = PAGE_SIZE;
-	struct page *page;
-	int offset;
-
-	if (unlikely(!nc->va)) {
-refill:
-		page = __page_frag_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		atomic_add(size - 1, &page->_count);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page->pfmemalloc;
-		nc->pagecnt_bias = size;
-		nc->offset = size;
-	}
-
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
-			goto refill;
-
-#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* OK, page count is 0, we can safely set it */
-		atomic_set(&page->_count, size);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = size;
-		offset = size - fragsz;
-	}
-
-	nc->pagecnt_bias--;
-	nc->offset = offset;
-
-	return nc->va + offset;
-}
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
-	struct netdev_alloc_cache *nc;
+	struct page_frag_cache *nc;
 	unsigned long flags;
 	void *data;
 
@@ -466,7 +378,7 @@ EXPORT_SYMBOL(netdev_alloc_frag);
 
 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
-	struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
 	return __alloc_page_frag(nc, fragsz, gfp_mask);
 }
@@ -493,7 +405,7 @@ EXPORT_SYMBOL(napi_alloc_frag);
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 				   gfp_t gfp_mask)
 {
-	struct netdev_alloc_cache *nc;
+	struct page_frag_cache *nc;
 	unsigned long flags;
 	struct sk_buff *skb;
 	bool pfmemalloc;
@@ -556,7 +468,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 				 gfp_t gfp_mask)
 {
-	struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 	struct sk_buff *skb;
 	void *data;
 
-- 
cgit v1.2.3


From 181edb2bfa22b50817684135ab6430ed2808abf0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 6 May 2015 21:12:03 -0700
Subject: net: Add skb_free_frag to replace use of put_page in freeing
 skb->head

This change adds a function called skb_free_frag which is meant to
compliment the function netdev_alloc_frag.  The general idea is to enable a
more lightweight version of page freeing since we don't actually need all
the overhead of a put_page, and we don't quite fit the model of __free_pages.

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  5 +++++
 net/core/skbuff.c      | 10 ++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0039fcc45b3b..c0b574a414e7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2182,6 +2182,11 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 	return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
 }
 
+static inline void skb_free_frag(void *addr)
+{
+	__free_page_frag(addr);
+}
+
 void *napi_alloc_frag(unsigned int fragsz);
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
 				 unsigned int length, gfp_t gfp_mask);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dcc0e07abf47..d67e612bf0ef 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -436,7 +436,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 
 	skb = __build_skb(data, len);
 	if (unlikely(!skb)) {
-		put_page(virt_to_head_page(data));
+		skb_free_frag(data);
 		return NULL;
 	}
 
@@ -490,7 +490,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 
 	skb = __build_skb(data, len);
 	if (unlikely(!skb)) {
-		put_page(virt_to_head_page(data));
+		skb_free_frag(data);
 		return NULL;
 	}
 
@@ -549,10 +549,12 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_free_head(struct sk_buff *skb)
 {
+	unsigned char *head = skb->head;
+
 	if (skb->head_frag)
-		put_page(virt_to_head_page(skb->head));
+		skb_free_frag(head);
 	else
-		kfree(skb->head);
+		kfree(head);
 }
 
 static void skb_release_data(struct sk_buff *skb)
-- 
cgit v1.2.3


From 755a27e7e4c817dd51ade41668b380f26026899c Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Sun, 3 May 2015 18:18:02 +0800
Subject: tracing: remove unused ftrace_output_event() prototype

The prototype of ftrace_output_event was added by commit 1d6bae966e90
("tracing: Move raw output code from macro to standalone function")
but this function was not defined anywhere, and is still nowhere to be
found.

Link: http://lkml.kernel.org/r/1430648282-25792-1-git-send-email-nicolas.iooss_linux@m4x.org

Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index f9ecf63d47f1..65ce6de91307 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -219,9 +219,6 @@ struct ftrace_event_class {
 extern int ftrace_event_reg(struct ftrace_event_call *event,
 			    enum trace_reg type, void *data);
 
-int ftrace_output_event(struct trace_iterator *iter, struct ftrace_event_call *event,
-			char *fmt, ...);
-
 int ftrace_event_define_field(struct ftrace_event_call *call,
 			      char *type, int len, char *item, int offset,
 			      int field_size, int sign, int filter);
-- 
cgit v1.2.3


From 020af89a41c41fd2c92d0da524968dfaba6269f0 Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <Grygorii.Strashko@linaro.org>
Date: Mon, 27 Apr 2015 21:24:30 +0300
Subject: PM / sleep: Add macro to define common noirq system PM callbacks

The same approach is used as for the existing SET_SYSTEM_SLEEP_PM_OPS,
but for noirq callbacks.

New SET_NOIRQ_SYSTEM_SLEEP_PM_OPS, defined for CONFIG_PM_SLEEP, will
point ->suspend_noirq, ->freeze_noirq and ->poweroff_noirq to the same
function. Vice versa happens for ->resume_noirq, ->thaw_noirq and
->restore_noirq.

Signed-off-by: Grygorii Strashko <Grygorii.Strashko@linaro.org>
Acked-by: Santosh Shilimkar <ssantosh@kernel.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 2d29c64f8fb1..4890743892ef 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -342,6 +342,18 @@ struct dev_pm_ops {
 #define SET_LATE_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn)
 #endif
 
+#ifdef CONFIG_PM_SLEEP
+#define SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn) \
+	.suspend_noirq = suspend_fn, \
+	.resume_noirq = resume_fn, \
+	.freeze_noirq = suspend_fn, \
+	.thaw_noirq = resume_fn, \
+	.poweroff_noirq = suspend_fn, \
+	.restore_noirq = resume_fn,
+#else
+#define SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(suspend_fn, resume_fn)
+#endif
+
 #ifdef CONFIG_PM
 #define SET_RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \
 	.runtime_suspend = suspend_fn, \
-- 
cgit v1.2.3


From 75f504004ab866c8f84749303b0f70953724e259 Mon Sep 17 00:00:00 2001
From: Rajendra Nayak <rnayak@codeaurora.org>
Date: Thu, 23 Apr 2015 14:03:09 +0530
Subject: PM / clock_ops: Provide default runtime ops to users

Most users of PM clocks do the extact same things in the runtime
suspend/resume callbacks. Provide them USE_PM_CLK_RUNTIME_OPS so
as to avoid/remove boilerplate code.

Signed-off-by: Rajendra Nayak <rnayak@codeaurora.org>
Reviewed-by: Kevin Hilman <khilman@linaro.org>
Acked-by: Santosh Shilimkar <ssantosh@kernel.org>
Acked-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/clock_ops.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/pm_clock.h       | 10 ++++++++++
 2 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c
index 7fdd0172605a..8abea669c82c 100644
--- a/drivers/base/power/clock_ops.c
+++ b/drivers/base/power/clock_ops.c
@@ -15,6 +15,7 @@
 #include <linux/clkdev.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/pm_runtime.h>
 
 #ifdef CONFIG_PM
 
@@ -367,6 +368,43 @@ static int pm_clk_notify(struct notifier_block *nb,
 	return 0;
 }
 
+int pm_clk_runtime_suspend(struct device *dev)
+{
+	int ret;
+
+	dev_dbg(dev, "%s\n", __func__);
+
+	ret = pm_generic_runtime_suspend(dev);
+	if (ret) {
+		dev_err(dev, "failed to suspend device\n");
+		return ret;
+	}
+
+	ret = pm_clk_suspend(dev);
+	if (ret) {
+		dev_err(dev, "failed to suspend clock\n");
+		pm_generic_runtime_resume(dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+int pm_clk_runtime_resume(struct device *dev)
+{
+	int ret;
+
+	dev_dbg(dev, "%s\n", __func__);
+
+	ret = pm_clk_resume(dev);
+	if (ret) {
+		dev_err(dev, "failed to resume clock\n");
+		return ret;
+	}
+
+	return pm_generic_runtime_resume(dev);
+}
+
 #else /* !CONFIG_PM */
 
 /**
diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h
index 0b0039634410..25266c600021 100644
--- a/include/linux/pm_clock.h
+++ b/include/linux/pm_clock.h
@@ -20,6 +20,16 @@ struct pm_clk_notifier_block {
 
 struct clk;
 
+#ifdef CONFIG_PM
+extern int pm_clk_runtime_suspend(struct device *dev);
+extern int pm_clk_runtime_resume(struct device *dev);
+#define USE_PM_CLK_RUNTIME_OPS \
+	.runtime_suspend = pm_clk_runtime_suspend, \
+	.runtime_resume = pm_clk_runtime_resume,
+#else
+#define USE_PM_CLK_RUNTIME_OPS
+#endif
+
 #ifdef CONFIG_PM_CLK
 static inline bool pm_clk_no_clocks(struct device *dev)
 {
-- 
cgit v1.2.3


From 9d47c0a2d958e06322c88245749278633d333cca Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Sun, 10 May 2015 09:47:47 -0700
Subject: switchdev: s/swdev_/switchdev_/

Turned out that "switchdev" sticks. So just unify all related terms to use
this prefix.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 37 +++++++++++++++++----------------
 include/linux/netdevice.h            |  2 +-
 include/net/switchdev.h              | 30 +++++++++++++--------------
 net/dsa/slave.c                      |  8 ++++----
 net/switchdev/switchdev.c            | 40 ++++++++++++++++++------------------
 5 files changed, 59 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 418f62b19545..7473874daf1f 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4221,8 +4221,8 @@ static const struct net_device_ops rocker_port_netdev_ops = {
  * swdev interface
  ********************/
 
-static int rocker_port_swdev_parent_id_get(struct net_device *dev,
-					   struct netdev_phys_item_id *psid)
+static int rocker_port_switchdev_parent_id_get(struct net_device *dev,
+					       struct netdev_phys_item_id *psid)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 	struct rocker *rocker = rocker_port->rocker;
@@ -4232,18 +4232,19 @@ static int rocker_port_swdev_parent_id_get(struct net_device *dev,
 	return 0;
 }
 
-static int rocker_port_swdev_port_stp_update(struct net_device *dev, u8 state)
+static int rocker_port_switchdev_port_stp_update(struct net_device *dev,
+						 u8 state)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 
 	return rocker_port_stp_update(rocker_port, state);
 }
 
-static int rocker_port_swdev_fib_ipv4_add(struct net_device *dev,
-					  __be32 dst, int dst_len,
-					  struct fib_info *fi,
-					  u8 tos, u8 type,
-					  u32 nlflags, u32 tb_id)
+static int rocker_port_switchdev_fib_ipv4_add(struct net_device *dev,
+					      __be32 dst, int dst_len,
+					      struct fib_info *fi,
+					      u8 tos, u8 type,
+					      u32 nlflags, u32 tb_id)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 	int flags = 0;
@@ -4252,10 +4253,10 @@ static int rocker_port_swdev_fib_ipv4_add(struct net_device *dev,
 				    fi, tb_id, flags);
 }
 
-static int rocker_port_swdev_fib_ipv4_del(struct net_device *dev,
-					  __be32 dst, int dst_len,
-					  struct fib_info *fi,
-					  u8 tos, u8 type, u32 tb_id)
+static int rocker_port_switchdev_fib_ipv4_del(struct net_device *dev,
+					      __be32 dst, int dst_len,
+					      struct fib_info *fi,
+					      u8 tos, u8 type, u32 tb_id)
 {
 	struct rocker_port *rocker_port = netdev_priv(dev);
 	int flags = ROCKER_OP_FLAG_REMOVE;
@@ -4264,11 +4265,11 @@ static int rocker_port_swdev_fib_ipv4_del(struct net_device *dev,
 				    fi, tb_id, flags);
 }
 
-static const struct swdev_ops rocker_port_swdev_ops = {
-	.swdev_parent_id_get		= rocker_port_swdev_parent_id_get,
-	.swdev_port_stp_update		= rocker_port_swdev_port_stp_update,
-	.swdev_fib_ipv4_add		= rocker_port_swdev_fib_ipv4_add,
-	.swdev_fib_ipv4_del		= rocker_port_swdev_fib_ipv4_del,
+static const struct switchdev_ops rocker_port_switchdev_ops = {
+	.switchdev_parent_id_get	= rocker_port_switchdev_parent_id_get,
+	.switchdev_port_stp_update	= rocker_port_switchdev_port_stp_update,
+	.switchdev_fib_ipv4_add		= rocker_port_switchdev_fib_ipv4_add,
+	.switchdev_fib_ipv4_del		= rocker_port_switchdev_fib_ipv4_del,
 };
 
 /********************
@@ -4623,7 +4624,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
 	rocker_port_dev_addr_init(rocker, rocker_port);
 	dev->netdev_ops = &rocker_port_netdev_ops;
 	dev->ethtool_ops = &rocker_port_ethtool_ops;
-	dev->swdev_ops = &rocker_port_swdev_ops;
+	dev->switchdev_ops = &rocker_port_switchdev_ops;
 	netif_napi_add(dev, &rocker_port->napi_tx, rocker_port_poll_tx,
 		       NAPI_POLL_WEIGHT);
 	netif_napi_add(dev, &rocker_port->napi_rx, rocker_port_poll_rx,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a6d706b2a947..2b39235b9f13 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1567,7 +1567,7 @@ struct net_device {
 	const struct net_device_ops *netdev_ops;
 	const struct ethtool_ops *ethtool_ops;
 #ifdef CONFIG_NET_SWITCHDEV
-	const struct swdev_ops *swdev_ops;
+	const struct switchdev_ops *switchdev_ops;
 #endif
 
 	const struct header_ops *header_ops;
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index cd921fa0d63a..97b556daeef7 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -19,28 +19,28 @@ struct fib_info;
 /**
  * struct switchdev_ops - switchdev operations
  *
- * @swdev_parent_id_get: Called to get an ID of the switch chip this port
+ * @switchdev_parent_id_get: Called to get an ID of the switch chip this port
  *   is part of.  If driver implements this, it indicates that it
  *   represents a port of a switch chip.
  *
- * @swdev_port_stp_update: Called to notify switch device port of bridge
+ * @switchdev_port_stp_update: Called to notify switch device port of bridge
  *   port STP state change.
  *
- * @swdev_fib_ipv4_add: Called to add/modify IPv4 route to switch device.
+ * @switchdev_fib_ipv4_add: Called to add/modify IPv4 route to switch device.
  *
- * @swdev_fib_ipv4_del: Called to delete IPv4 route from switch device.
+ * @switchdev_fib_ipv4_del: Called to delete IPv4 route from switch device.
  */
-struct swdev_ops {
-	int	(*swdev_parent_id_get)(struct net_device *dev,
-				       struct netdev_phys_item_id *psid);
-	int	(*swdev_port_stp_update)(struct net_device *dev, u8 state);
-	int	(*swdev_fib_ipv4_add)(struct net_device *dev, __be32 dst,
-				      int dst_len, struct fib_info *fi,
-				      u8 tos, u8 type, u32 nlflags,
-				      u32 tb_id);
-	int	(*swdev_fib_ipv4_del)(struct net_device *dev, __be32 dst,
-				      int dst_len, struct fib_info *fi,
-				      u8 tos, u8 type, u32 tb_id);
+struct switchdev_ops {
+	int	(*switchdev_parent_id_get)(struct net_device *dev,
+					   struct netdev_phys_item_id *psid);
+	int	(*switchdev_port_stp_update)(struct net_device *dev, u8 state);
+	int	(*switchdev_fib_ipv4_add)(struct net_device *dev, __be32 dst,
+					  int dst_len, struct fib_info *fi,
+					  u8 tos, u8 type, u32 nlflags,
+					  u32 tb_id);
+	int	(*switchdev_fib_ipv4_del)(struct net_device *dev, __be32 dst,
+					  int dst_len, struct fib_info *fi,
+					  u8 tos, u8 type, u32 tb_id);
 };
 
 enum switchdev_notifier_type {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 03e041addea3..1546acf6ebd3 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -675,9 +675,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
 	.ndo_get_iflink		= dsa_slave_get_iflink,
 };
 
-static const struct swdev_ops dsa_slave_swdev_ops = {
-	.swdev_parent_id_get = dsa_slave_parent_id_get,
-	.swdev_port_stp_update = dsa_slave_stp_update,
+static const struct switchdev_ops dsa_slave_switchdev_ops = {
+	.switchdev_parent_id_get	= dsa_slave_parent_id_get,
+	.switchdev_port_stp_update	= dsa_slave_stp_update,
 };
 
 static void dsa_slave_adjust_link(struct net_device *dev)
@@ -866,7 +866,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	eth_hw_addr_inherit(slave_dev, master);
 	slave_dev->tx_queue_len = 0;
 	slave_dev->netdev_ops = &dsa_slave_netdev_ops;
-	slave_dev->swdev_ops = &dsa_slave_swdev_ops;
+	slave_dev->switchdev_ops = &dsa_slave_switchdev_ops;
 
 	netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
 				 NULL);
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 52613ed49a8c..b7f44a23def5 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -28,11 +28,11 @@
 int switchdev_parent_id_get(struct net_device *dev,
 			    struct netdev_phys_item_id *psid)
 {
-	const struct swdev_ops *ops = dev->swdev_ops;
+	const struct switchdev_ops *ops = dev->switchdev_ops;
 
-	if (!ops || !ops->swdev_parent_id_get)
+	if (!ops || !ops->switchdev_parent_id_get)
 		return -EOPNOTSUPP;
-	return ops->swdev_parent_id_get(dev, psid);
+	return ops->switchdev_parent_id_get(dev, psid);
 }
 EXPORT_SYMBOL_GPL(switchdev_parent_id_get);
 
@@ -46,13 +46,13 @@ EXPORT_SYMBOL_GPL(switchdev_parent_id_get);
  */
 int switchdev_port_stp_update(struct net_device *dev, u8 state)
 {
-	const struct swdev_ops *ops = dev->swdev_ops;
+	const struct switchdev_ops *ops = dev->switchdev_ops;
 	struct net_device *lower_dev;
 	struct list_head *iter;
 	int err = -EOPNOTSUPP;
 
-	if (ops && ops->swdev_port_stp_update)
-		return ops->swdev_port_stp_update(dev, state);
+	if (ops && ops->switchdev_port_stp_update)
+		return ops->switchdev_port_stp_update(dev, state);
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
 		err = switchdev_port_stp_update(lower_dev, state);
@@ -239,17 +239,17 @@ EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink);
 
 static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
 {
-	const struct swdev_ops *ops = dev->swdev_ops;
+	const struct switchdev_ops *ops = dev->switchdev_ops;
 	struct net_device *lower_dev;
 	struct net_device *port_dev;
 	struct list_head *iter;
 
 	/* Recusively search down until we find a sw port dev.
-	 * (A sw port dev supports swdev_parent_id_get).
+	 * (A sw port dev supports switchdev_parent_id_get).
 	 */
 
 	if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD &&
-	    ops && ops->swdev_parent_id_get)
+	    ops && ops->switchdev_parent_id_get)
 		return dev;
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
@@ -313,7 +313,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 nlflags, u32 tb_id)
 {
 	struct net_device *dev;
-	const struct swdev_ops *ops;
+	const struct switchdev_ops *ops;
 	int err = 0;
 
 	/* Don't offload route if using custom ip rules or if
@@ -331,12 +331,12 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 	dev = switchdev_get_dev_by_nhs(fi);
 	if (!dev)
 		return 0;
-	ops = dev->swdev_ops;
+	ops = dev->switchdev_ops;
 
-	if (ops->swdev_fib_ipv4_add) {
-		err = ops->swdev_fib_ipv4_add(dev, htonl(dst), dst_len,
-					      fi, tos, type, nlflags,
-					      tb_id);
+	if (ops->switchdev_fib_ipv4_add) {
+		err = ops->switchdev_fib_ipv4_add(dev, htonl(dst), dst_len,
+						  fi, tos, type, nlflags,
+						  tb_id);
 		if (!err)
 			fi->fib_flags |= RTNH_F_EXTERNAL;
 	}
@@ -361,7 +361,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 			   u8 tos, u8 type, u32 tb_id)
 {
 	struct net_device *dev;
-	const struct swdev_ops *ops;
+	const struct switchdev_ops *ops;
 	int err = 0;
 
 	if (!(fi->fib_flags & RTNH_F_EXTERNAL))
@@ -370,11 +370,11 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 	dev = switchdev_get_dev_by_nhs(fi);
 	if (!dev)
 		return 0;
-	ops = dev->swdev_ops;
+	ops = dev->switchdev_ops;
 
-	if (ops->swdev_fib_ipv4_del) {
-		err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len,
-					      fi, tos, type, tb_id);
+	if (ops->switchdev_fib_ipv4_del) {
+		err = ops->switchdev_fib_ipv4_del(dev, htonl(dst), dst_len,
+						  fi, tos, type, tb_id);
 		if (!err)
 			fi->fib_flags &= ~RTNH_F_EXTERNAL;
 	}
-- 
cgit v1.2.3


From 7889cbee8357aaed85898d028829dfb4f75bae2c Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sun, 10 May 2015 09:48:07 -0700
Subject: switchdev: remove NETIF_F_HW_SWITCH_OFFLOAD feature flag

Roopa said remove the feature flag for this series and she'll work on
bringing it back if needed at a later date.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c      | 5 +----
 drivers/net/ethernet/rocker/rocker.c | 3 +--
 drivers/net/team/team.c              | 2 +-
 include/linux/netdev_features.h      | 5 +----
 net/core/ethtool.c                   | 1 -
 5 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 147642dae3d0..a2e25de98bde 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1015,10 +1015,7 @@ static netdev_features_t bond_fix_features(struct net_device *dev,
 	netdev_features_t mask;
 	struct slave *slave;
 
-	/* If any slave has the offload feature flag set,
-	 * set the offload flag on the bond.
-	 */
-	mask = features | NETIF_F_HW_SWITCH_OFFLOAD;
+	mask = features;
 
 	features &= ~NETIF_F_ONE_FOR_ALL;
 	features |= NETIF_F_ALL_FOR_ALL;
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 1c906504cfe2..9715e3322706 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4906,8 +4906,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number)
 	rocker_carrier_init(rocker_port);
 
 	dev->features |= NETIF_F_NETNS_LOCAL |
-			 NETIF_F_HW_VLAN_CTAG_FILTER |
-			 NETIF_F_HW_SWITCH_OFFLOAD;
+			 NETIF_F_HW_VLAN_CTAG_FILTER;
 
 	err = register_netdev(dev);
 	if (err) {
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index bda32be596ef..1ec035a53c3d 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1924,7 +1924,7 @@ static netdev_features_t team_fix_features(struct net_device *dev,
 	struct team *team = netdev_priv(dev);
 	netdev_features_t mask;
 
-	mask = features | NETIF_F_HW_SWITCH_OFFLOAD;
+	mask = features;
 	features &= ~NETIF_F_ONE_FOR_ALL;
 	features |= NETIF_F_ALL_FOR_ALL;
 
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 7d59dc6ab789..9672781c593d 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -66,7 +66,6 @@ enum {
 	NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 	NETIF_F_BUSY_POLL_BIT,		/* Busy poll */
-	NETIF_F_HW_SWITCH_OFFLOAD_BIT,  /* HW switch offload */
 
 	/*
 	 * Add your fresh new feature above and remember to update
@@ -125,7 +124,6 @@ enum {
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 #define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
-#define NETIF_F_HW_SWITCH_OFFLOAD	__NETIF_F(HW_SWITCH_OFFLOAD)
 
 /* Features valid for ethtool to change */
 /* = all defined minus driver/device-class-related */
@@ -161,8 +159,7 @@ enum {
  */
 #define NETIF_F_ONE_FOR_ALL	(NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
 				 NETIF_F_SG | NETIF_F_HIGHDMA |		\
-				 NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED | \
-				 NETIF_F_HW_SWITCH_OFFLOAD)
+				 NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED)
 
 /*
  * If one device doesn't support one of these features, then disable it
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 1d00b8922902..eb0c3ace7458 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -98,7 +98,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_RXALL_BIT] =            "rx-all",
 	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
 	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",
-	[NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload",
 };
 
 static const char
-- 
cgit v1.2.3


From 5d1d65f8bea6de3d9c2c60fdfdd2da02da5ea672 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 11 May 2015 17:48:12 +0800
Subject: crypto: aead - Convert top level interface to new style

This patch converts the top-level aead interface to the new style.
All user-level AEAD interface code have been moved into crypto/aead.h.

The allocation/free functions have switched over to the new way of
allocating tfms.

This patch also removes the double indrection on setkey so the
indirection now exists only at the alg level.

Apart from these there are no user-visible changes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/aead.c                  | 161 +++++----------
 include/crypto/aead.h          | 436 +++++++++++++++++++++++++++++++++++++++-
 include/crypto/algapi.h        |  33 +--
 include/crypto/internal/aead.h |  38 +++-
 include/linux/crypto.h         | 442 +----------------------------------------
 5 files changed, 516 insertions(+), 594 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/aead.c b/crypto/aead.c
index d6ad0c66ee83..717b2f6ec9bb 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -26,6 +26,9 @@
 
 #include "internal.h"
 
+static int aead_null_givencrypt(struct aead_givcrypt_request *req);
+static int aead_null_givdecrypt(struct aead_givcrypt_request *req);
+
 static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 			    unsigned int keylen)
 {
@@ -48,63 +51,63 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 	return ret;
 }
 
-static int setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen)
+int crypto_aead_setkey(struct crypto_aead *tfm,
+		       const u8 *key, unsigned int keylen)
 {
 	struct aead_alg *aead = crypto_aead_alg(tfm);
 	unsigned long alignmask = crypto_aead_alignmask(tfm);
 
+	tfm = tfm->child;
+
 	if ((unsigned long)key & alignmask)
 		return setkey_unaligned(tfm, key, keylen);
 
 	return aead->setkey(tfm, key, keylen);
 }
+EXPORT_SYMBOL_GPL(crypto_aead_setkey);
 
 int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
 {
-	struct aead_tfm *crt = crypto_aead_crt(tfm);
 	int err;
 
 	if (authsize > crypto_aead_alg(tfm)->maxauthsize)
 		return -EINVAL;
 
 	if (crypto_aead_alg(tfm)->setauthsize) {
-		err = crypto_aead_alg(tfm)->setauthsize(crt->base, authsize);
+		err = crypto_aead_alg(tfm)->setauthsize(tfm->child, authsize);
 		if (err)
 			return err;
 	}
 
-	crypto_aead_crt(crt->base)->authsize = authsize;
-	crt->authsize = authsize;
+	tfm->child->authsize = authsize;
+	tfm->authsize = authsize;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(crypto_aead_setauthsize);
 
-static unsigned int crypto_aead_ctxsize(struct crypto_alg *alg, u32 type,
-					u32 mask)
-{
-	return alg->cra_ctxsize;
-}
-
 static int no_givcrypt(struct aead_givcrypt_request *req)
 {
 	return -ENOSYS;
 }
 
-static int crypto_init_aead_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
+static int crypto_aead_init_tfm(struct crypto_tfm *tfm)
 {
 	struct aead_alg *alg = &tfm->__crt_alg->cra_aead;
-	struct aead_tfm *crt = &tfm->crt_aead;
+	struct crypto_aead *crt = __crypto_aead_cast(tfm);
 
 	if (max(alg->maxauthsize, alg->ivsize) > PAGE_SIZE / 8)
 		return -EINVAL;
 
-	crt->setkey = tfm->__crt_alg->cra_flags & CRYPTO_ALG_GENIV ?
-		      alg->setkey : setkey;
 	crt->encrypt = alg->encrypt;
 	crt->decrypt = alg->decrypt;
-	crt->givencrypt = alg->givencrypt ?: no_givcrypt;
-	crt->givdecrypt = alg->givdecrypt ?: no_givcrypt;
-	crt->base = __crypto_aead_cast(tfm);
+	if (alg->ivsize) {
+		crt->givencrypt = alg->givencrypt ?: no_givcrypt;
+		crt->givdecrypt = alg->givdecrypt ?: no_givcrypt;
+	} else {
+		crt->givencrypt = aead_null_givencrypt;
+		crt->givdecrypt = aead_null_givdecrypt;
+	}
+	crt->child = __crypto_aead_cast(tfm);
 	crt->ivsize = alg->ivsize;
 	crt->authsize = alg->maxauthsize;
 
@@ -155,12 +158,17 @@ static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
 }
 
 const struct crypto_type crypto_aead_type = {
-	.ctxsize = crypto_aead_ctxsize,
-	.init = crypto_init_aead_ops,
+	.extsize = crypto_alg_extsize,
+	.init_tfm = crypto_aead_init_tfm,
 #ifdef CONFIG_PROC_FS
 	.show = crypto_aead_show,
 #endif
 	.report = crypto_aead_report,
+	.lookup = crypto_lookup_aead,
+	.maskclear = ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV),
+	.maskset = CRYPTO_ALG_TYPE_MASK,
+	.type = CRYPTO_ALG_TYPE_AEAD,
+	.tfmsize = offsetof(struct crypto_aead, base),
 };
 EXPORT_SYMBOL_GPL(crypto_aead_type);
 
@@ -174,28 +182,6 @@ static int aead_null_givdecrypt(struct aead_givcrypt_request *req)
 	return crypto_aead_decrypt(&req->areq);
 }
 
-static int crypto_init_nivaead_ops(struct crypto_tfm *tfm, u32 type, u32 mask)
-{
-	struct aead_alg *alg = &tfm->__crt_alg->cra_aead;
-	struct aead_tfm *crt = &tfm->crt_aead;
-
-	if (max(alg->maxauthsize, alg->ivsize) > PAGE_SIZE / 8)
-		return -EINVAL;
-
-	crt->setkey = setkey;
-	crt->encrypt = alg->encrypt;
-	crt->decrypt = alg->decrypt;
-	if (!alg->ivsize) {
-		crt->givencrypt = aead_null_givencrypt;
-		crt->givdecrypt = aead_null_givdecrypt;
-	}
-	crt->base = __crypto_aead_cast(tfm);
-	crt->ivsize = alg->ivsize;
-	crt->authsize = alg->maxauthsize;
-
-	return 0;
-}
-
 #ifdef CONFIG_NET
 static int crypto_nivaead_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
@@ -241,32 +227,24 @@ static void crypto_nivaead_show(struct seq_file *m, struct crypto_alg *alg)
 }
 
 const struct crypto_type crypto_nivaead_type = {
-	.ctxsize = crypto_aead_ctxsize,
-	.init = crypto_init_nivaead_ops,
+	.extsize = crypto_alg_extsize,
+	.init_tfm = crypto_aead_init_tfm,
 #ifdef CONFIG_PROC_FS
 	.show = crypto_nivaead_show,
 #endif
 	.report = crypto_nivaead_report,
+	.maskclear = ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV),
+	.maskset = CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV,
+	.type = CRYPTO_ALG_TYPE_AEAD,
+	.tfmsize = offsetof(struct crypto_aead, base),
 };
 EXPORT_SYMBOL_GPL(crypto_nivaead_type);
 
 static int crypto_grab_nivaead(struct crypto_aead_spawn *spawn,
 			       const char *name, u32 type, u32 mask)
 {
-	struct crypto_alg *alg;
-	int err;
-
-	type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
-	type |= CRYPTO_ALG_TYPE_AEAD;
-	mask |= CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV;
-
-	alg = crypto_alg_mod_lookup(name, type, mask);
-	if (IS_ERR(alg))
-		return PTR_ERR(alg);
-
-	err = crypto_init_spawn(&spawn->base, alg, spawn->base.inst, mask);
-	crypto_mod_put(alg);
-	return err;
+	spawn->base.frontend = &crypto_nivaead_type;
+	return crypto_grab_spawn(&spawn->base, name, type, mask);
 }
 
 struct crypto_instance *aead_geniv_alloc(struct crypto_template *tmpl,
@@ -374,14 +352,17 @@ EXPORT_SYMBOL_GPL(aead_geniv_free);
 int aead_geniv_init(struct crypto_tfm *tfm)
 {
 	struct crypto_instance *inst = (void *)tfm->__crt_alg;
+	struct crypto_aead *child;
 	struct crypto_aead *aead;
 
-	aead = crypto_spawn_aead(crypto_instance_ctx(inst));
-	if (IS_ERR(aead))
-		return PTR_ERR(aead);
+	aead = __crypto_aead_cast(tfm);
 
-	tfm->crt_aead.base = aead;
-	tfm->crt_aead.reqsize += crypto_aead_reqsize(aead);
+	child = crypto_spawn_aead(crypto_instance_ctx(inst));
+	if (IS_ERR(child))
+		return PTR_ERR(child);
+
+	aead->child = child;
+	aead->reqsize += crypto_aead_reqsize(child);
 
 	return 0;
 }
@@ -389,7 +370,7 @@ EXPORT_SYMBOL_GPL(aead_geniv_init);
 
 void aead_geniv_exit(struct crypto_tfm *tfm)
 {
-	crypto_free_aead(tfm->crt_aead.base);
+	crypto_free_aead(__crypto_aead_cast(tfm)->child);
 }
 EXPORT_SYMBOL_GPL(aead_geniv_exit);
 
@@ -505,60 +486,14 @@ EXPORT_SYMBOL_GPL(crypto_lookup_aead);
 int crypto_grab_aead(struct crypto_aead_spawn *spawn, const char *name,
 		     u32 type, u32 mask)
 {
-	struct crypto_alg *alg;
-	int err;
-
-	type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
-	type |= CRYPTO_ALG_TYPE_AEAD;
-	mask &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
-	mask |= CRYPTO_ALG_TYPE_MASK;
-
-	alg = crypto_lookup_aead(name, type, mask);
-	if (IS_ERR(alg))
-		return PTR_ERR(alg);
-
-	err = crypto_init_spawn(&spawn->base, alg, spawn->base.inst, mask);
-	crypto_mod_put(alg);
-	return err;
+	spawn->base.frontend = &crypto_aead_type;
+	return crypto_grab_spawn(&spawn->base, name, type, mask);
 }
 EXPORT_SYMBOL_GPL(crypto_grab_aead);
 
 struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask)
 {
-	struct crypto_tfm *tfm;
-	int err;
-
-	type &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
-	type |= CRYPTO_ALG_TYPE_AEAD;
-	mask &= ~(CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_GENIV);
-	mask |= CRYPTO_ALG_TYPE_MASK;
-
-	for (;;) {
-		struct crypto_alg *alg;
-
-		alg = crypto_lookup_aead(alg_name, type, mask);
-		if (IS_ERR(alg)) {
-			err = PTR_ERR(alg);
-			goto err;
-		}
-
-		tfm = __crypto_alloc_tfm(alg, type, mask);
-		if (!IS_ERR(tfm))
-			return __crypto_aead_cast(tfm);
-
-		crypto_mod_put(alg);
-		err = PTR_ERR(tfm);
-
-err:
-		if (err != -EAGAIN)
-			break;
-		if (signal_pending(current)) {
-			err = -EINTR;
-			break;
-		}
-	}
-
-	return ERR_PTR(err);
+	return crypto_alloc_tfm(alg_name, &crypto_aead_type, type, mask);
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_aead);
 
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 94b19be67574..dbcad08f4891 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -17,6 +17,62 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 
+/**
+ * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API
+ *
+ * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD
+ * (listed as type "aead" in /proc/crypto)
+ *
+ * The most prominent examples for this type of encryption is GCM and CCM.
+ * However, the kernel supports other types of AEAD ciphers which are defined
+ * with the following cipher string:
+ *
+ *	authenc(keyed message digest, block cipher)
+ *
+ * For example: authenc(hmac(sha256), cbc(aes))
+ *
+ * The example code provided for the asynchronous block cipher operation
+ * applies here as well. Naturally all *ablkcipher* symbols must be exchanged
+ * the *aead* pendants discussed in the following. In addtion, for the AEAD
+ * operation, the aead_request_set_assoc function must be used to set the
+ * pointer to the associated data memory location before performing the
+ * encryption or decryption operation. In case of an encryption, the associated
+ * data memory is filled during the encryption operation. For decryption, the
+ * associated data memory must contain data that is used to verify the integrity
+ * of the decrypted data. Another deviation from the asynchronous block cipher
+ * operation is that the caller should explicitly check for -EBADMSG of the
+ * crypto_aead_decrypt. That error indicates an authentication error, i.e.
+ * a breach in the integrity of the message. In essence, that -EBADMSG error
+ * code is the key bonus an AEAD cipher has over "standard" block chaining
+ * modes.
+ */
+
+/**
+ *	struct aead_request - AEAD request
+ *	@base: Common attributes for async crypto requests
+ *	@assoclen: Length in bytes of associated data for authentication
+ *	@cryptlen: Length of data to be encrypted or decrypted
+ *	@iv: Initialisation vector
+ *	@assoc: Associated data
+ *	@src: Source data
+ *	@dst: Destination data
+ *	@__ctx: Start of private context data
+ */
+struct aead_request {
+	struct crypto_async_request base;
+
+	unsigned int assoclen;
+	unsigned int cryptlen;
+
+	u8 *iv;
+
+	struct scatterlist *assoc;
+	struct scatterlist *src;
+	struct scatterlist *dst;
+
+	void *__ctx[] CRYPTO_MINALIGN_ATTR;
+};
+
 /**
  *	struct aead_givcrypt_request - AEAD request with IV generation
  *	@seq: Sequence number for IV generation
@@ -30,6 +86,380 @@ struct aead_givcrypt_request {
 	struct aead_request areq;
 };
 
+struct crypto_aead {
+	int (*encrypt)(struct aead_request *req);
+	int (*decrypt)(struct aead_request *req);
+	int (*givencrypt)(struct aead_givcrypt_request *req);
+	int (*givdecrypt)(struct aead_givcrypt_request *req);
+
+	struct crypto_aead *child;
+
+	unsigned int ivsize;
+	unsigned int authsize;
+	unsigned int reqsize;
+
+	struct crypto_tfm base;
+};
+
+static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm)
+{
+	return container_of(tfm, struct crypto_aead, base);
+}
+
+/**
+ * crypto_alloc_aead() - allocate AEAD cipher handle
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ *	     AEAD cipher
+ * @type: specifies the type of the cipher
+ * @mask: specifies the mask for the cipher
+ *
+ * Allocate a cipher handle for an AEAD. The returned struct
+ * crypto_aead is the cipher handle that is required for any subsequent
+ * API invocation for that AEAD.
+ *
+ * Return: allocated cipher handle in case of success; IS_ERR() is true in case
+ *	   of an error, PTR_ERR() returns the error code.
+ */
+struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask);
+
+static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm)
+{
+	return &tfm->base;
+}
+
+/**
+ * crypto_free_aead() - zeroize and free aead handle
+ * @tfm: cipher handle to be freed
+ */
+static inline void crypto_free_aead(struct crypto_aead *tfm)
+{
+	crypto_destroy_tfm(tfm, crypto_aead_tfm(tfm));
+}
+
+static inline struct crypto_aead *crypto_aead_crt(struct crypto_aead *tfm)
+{
+	return tfm;
+}
+
+/**
+ * crypto_aead_ivsize() - obtain IV size
+ * @tfm: cipher handle
+ *
+ * The size of the IV for the aead referenced by the cipher handle is
+ * returned. This IV size may be zero if the cipher does not need an IV.
+ *
+ * Return: IV size in bytes
+ */
+static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm)
+{
+	return tfm->ivsize;
+}
+
+/**
+ * crypto_aead_authsize() - obtain maximum authentication data size
+ * @tfm: cipher handle
+ *
+ * The maximum size of the authentication data for the AEAD cipher referenced
+ * by the AEAD cipher handle is returned. The authentication data size may be
+ * zero if the cipher implements a hard-coded maximum.
+ *
+ * The authentication data may also be known as "tag value".
+ *
+ * Return: authentication data size / tag size in bytes
+ */
+static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm)
+{
+	return tfm->authsize;
+}
+
+/**
+ * crypto_aead_blocksize() - obtain block size of cipher
+ * @tfm: cipher handle
+ *
+ * The block size for the AEAD referenced with the cipher handle is returned.
+ * The caller may use that information to allocate appropriate memory for the
+ * data returned by the encryption or decryption operation
+ *
+ * Return: block size of cipher
+ */
+static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm)
+{
+	return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm));
+}
+
+static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm)
+{
+	return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm));
+}
+
+static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm)
+{
+	return crypto_tfm_get_flags(crypto_aead_tfm(tfm));
+}
+
+static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags)
+{
+	crypto_tfm_set_flags(crypto_aead_tfm(tfm), flags);
+}
+
+static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags)
+{
+	crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags);
+}
+
+/**
+ * crypto_aead_setkey() - set key for cipher
+ * @tfm: cipher handle
+ * @key: buffer holding the key
+ * @keylen: length of the key in bytes
+ *
+ * The caller provided key is set for the AEAD referenced by the cipher
+ * handle.
+ *
+ * Note, the key length determines the cipher type. Many block ciphers implement
+ * different cipher modes depending on the key size, such as AES-128 vs AES-192
+ * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
+ * is performed.
+ *
+ * Return: 0 if the setting of the key was successful; < 0 if an error occurred
+ */
+int crypto_aead_setkey(struct crypto_aead *tfm,
+		       const u8 *key, unsigned int keylen);
+
+/**
+ * crypto_aead_setauthsize() - set authentication data size
+ * @tfm: cipher handle
+ * @authsize: size of the authentication data / tag in bytes
+ *
+ * Set the authentication data size / tag size. AEAD requires an authentication
+ * tag (or MAC) in addition to the associated data.
+ *
+ * Return: 0 if the setting of the key was successful; < 0 if an error occurred
+ */
+int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize);
+
+static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req)
+{
+	return __crypto_aead_cast(req->base.tfm);
+}
+
+/**
+ * crypto_aead_encrypt() - encrypt plaintext
+ * @req: reference to the aead_request handle that holds all information
+ *	 needed to perform the cipher operation
+ *
+ * Encrypt plaintext data using the aead_request handle. That data structure
+ * and how it is filled with data is discussed with the aead_request_*
+ * functions.
+ *
+ * IMPORTANT NOTE The encryption operation creates the authentication data /
+ *		  tag. That data is concatenated with the created ciphertext.
+ *		  The ciphertext memory size is therefore the given number of
+ *		  block cipher blocks + the size defined by the
+ *		  crypto_aead_setauthsize invocation. The caller must ensure
+ *		  that sufficient memory is available for the ciphertext and
+ *		  the authentication tag.
+ *
+ * Return: 0 if the cipher operation was successful; < 0 if an error occurred
+ */
+static inline int crypto_aead_encrypt(struct aead_request *req)
+{
+	return crypto_aead_reqtfm(req)->encrypt(req);
+}
+
+/**
+ * crypto_aead_decrypt() - decrypt ciphertext
+ * @req: reference to the ablkcipher_request handle that holds all information
+ *	 needed to perform the cipher operation
+ *
+ * Decrypt ciphertext data using the aead_request handle. That data structure
+ * and how it is filled with data is discussed with the aead_request_*
+ * functions.
+ *
+ * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the
+ *		  authentication data / tag. That authentication data / tag
+ *		  must have the size defined by the crypto_aead_setauthsize
+ *		  invocation.
+ *
+ *
+ * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD
+ *	   cipher operation performs the authentication of the data during the
+ *	   decryption operation. Therefore, the function returns this error if
+ *	   the authentication of the ciphertext was unsuccessful (i.e. the
+ *	   integrity of the ciphertext or the associated data was violated);
+ *	   < 0 if an error occurred.
+ */
+static inline int crypto_aead_decrypt(struct aead_request *req)
+{
+	if (req->cryptlen < crypto_aead_authsize(crypto_aead_reqtfm(req)))
+		return -EINVAL;
+
+	return crypto_aead_reqtfm(req)->decrypt(req);
+}
+
+/**
+ * DOC: Asynchronous AEAD Request Handle
+ *
+ * The aead_request data structure contains all pointers to data required for
+ * the AEAD cipher operation. This includes the cipher handle (which can be
+ * used by multiple aead_request instances), pointer to plaintext and
+ * ciphertext, asynchronous callback function, etc. It acts as a handle to the
+ * aead_request_* API calls in a similar way as AEAD handle to the
+ * crypto_aead_* API calls.
+ */
+
+/**
+ * crypto_aead_reqsize() - obtain size of the request data structure
+ * @tfm: cipher handle
+ *
+ * Return: number of bytes
+ */
+static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm)
+{
+	return tfm->reqsize;
+}
+
+/**
+ * aead_request_set_tfm() - update cipher handle reference in request
+ * @req: request handle to be modified
+ * @tfm: cipher handle that shall be added to the request handle
+ *
+ * Allow the caller to replace the existing aead handle in the request
+ * data structure with a different one.
+ */
+static inline void aead_request_set_tfm(struct aead_request *req,
+					struct crypto_aead *tfm)
+{
+	req->base.tfm = crypto_aead_tfm(tfm->child);
+}
+
+/**
+ * aead_request_alloc() - allocate request data structure
+ * @tfm: cipher handle to be registered with the request
+ * @gfp: memory allocation flag that is handed to kmalloc by the API call.
+ *
+ * Allocate the request data structure that must be used with the AEAD
+ * encrypt and decrypt API calls. During the allocation, the provided aead
+ * handle is registered in the request data structure.
+ *
+ * Return: allocated request handle in case of success; IS_ERR() is true in case
+ *	   of an error, PTR_ERR() returns the error code.
+ */
+static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm,
+						      gfp_t gfp)
+{
+	struct aead_request *req;
+
+	req = kmalloc(sizeof(*req) + crypto_aead_reqsize(tfm), gfp);
+
+	if (likely(req))
+		aead_request_set_tfm(req, tfm);
+
+	return req;
+}
+
+/**
+ * aead_request_free() - zeroize and free request data structure
+ * @req: request data structure cipher handle to be freed
+ */
+static inline void aead_request_free(struct aead_request *req)
+{
+	kzfree(req);
+}
+
+/**
+ * aead_request_set_callback() - set asynchronous callback function
+ * @req: request handle
+ * @flags: specify zero or an ORing of the flags
+ *	   CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
+ *	   increase the wait queue beyond the initial maximum size;
+ *	   CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
+ * @compl: callback function pointer to be registered with the request handle
+ * @data: The data pointer refers to memory that is not used by the kernel
+ *	  crypto API, but provided to the callback function for it to use. Here,
+ *	  the caller can provide a reference to memory the callback function can
+ *	  operate on. As the callback function is invoked asynchronously to the
+ *	  related functionality, it may need to access data structures of the
+ *	  related functionality which can be referenced using this pointer. The
+ *	  callback function can access the memory via the "data" field in the
+ *	  crypto_async_request data structure provided to the callback function.
+ *
+ * Setting the callback function that is triggered once the cipher operation
+ * completes
+ *
+ * The callback function is registered with the aead_request handle and
+ * must comply with the following template
+ *
+ *	void callback_function(struct crypto_async_request *req, int error)
+ */
+static inline void aead_request_set_callback(struct aead_request *req,
+					     u32 flags,
+					     crypto_completion_t compl,
+					     void *data)
+{
+	req->base.complete = compl;
+	req->base.data = data;
+	req->base.flags = flags;
+}
+
+/**
+ * aead_request_set_crypt - set data buffers
+ * @req: request handle
+ * @src: source scatter / gather list
+ * @dst: destination scatter / gather list
+ * @cryptlen: number of bytes to process from @src
+ * @iv: IV for the cipher operation which must comply with the IV size defined
+ *      by crypto_aead_ivsize()
+ *
+ * Setting the source data and destination data scatter / gather lists.
+ *
+ * For encryption, the source is treated as the plaintext and the
+ * destination is the ciphertext. For a decryption operation, the use is
+ * reversed - the source is the ciphertext and the destination is the plaintext.
+ *
+ * IMPORTANT NOTE AEAD requires an authentication tag (MAC). For decryption,
+ *		  the caller must concatenate the ciphertext followed by the
+ *		  authentication tag and provide the entire data stream to the
+ *		  decryption operation (i.e. the data length used for the
+ *		  initialization of the scatterlist and the data length for the
+ *		  decryption operation is identical). For encryption, however,
+ *		  the authentication tag is created while encrypting the data.
+ *		  The destination buffer must hold sufficient space for the
+ *		  ciphertext and the authentication tag while the encryption
+ *		  invocation must only point to the plaintext data size. The
+ *		  following code snippet illustrates the memory usage
+ *		  buffer = kmalloc(ptbuflen + (enc ? authsize : 0));
+ *		  sg_init_one(&sg, buffer, ptbuflen + (enc ? authsize : 0));
+ *		  aead_request_set_crypt(req, &sg, &sg, ptbuflen, iv);
+ */
+static inline void aead_request_set_crypt(struct aead_request *req,
+					  struct scatterlist *src,
+					  struct scatterlist *dst,
+					  unsigned int cryptlen, u8 *iv)
+{
+	req->src = src;
+	req->dst = dst;
+	req->cryptlen = cryptlen;
+	req->iv = iv;
+}
+
+/**
+ * aead_request_set_assoc() - set the associated data scatter / gather list
+ * @req: request handle
+ * @assoc: associated data scatter / gather list
+ * @assoclen: number of bytes to process from @assoc
+ *
+ * For encryption, the memory is filled with the associated data. For
+ * decryption, the memory must point to the associated data.
+ */
+static inline void aead_request_set_assoc(struct aead_request *req,
+					  struct scatterlist *assoc,
+					  unsigned int assoclen)
+{
+	req->assoc = assoc;
+	req->assoclen = assoclen;
+}
+
 static inline struct crypto_aead *aead_givcrypt_reqtfm(
 	struct aead_givcrypt_request *req)
 {
@@ -38,14 +468,12 @@ static inline struct crypto_aead *aead_givcrypt_reqtfm(
 
 static inline int crypto_aead_givencrypt(struct aead_givcrypt_request *req)
 {
-	struct aead_tfm *crt = crypto_aead_crt(aead_givcrypt_reqtfm(req));
-	return crt->givencrypt(req);
+	return aead_givcrypt_reqtfm(req)->givencrypt(req);
 };
 
 static inline int crypto_aead_givdecrypt(struct aead_givcrypt_request *req)
 {
-	struct aead_tfm *crt = crypto_aead_crt(aead_givcrypt_reqtfm(req));
-	return crt->givdecrypt(req);
+	return aead_givcrypt_reqtfm(req)->givdecrypt(req);
 };
 
 static inline void aead_givcrypt_set_tfm(struct aead_givcrypt_request *req,
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index a949bf70983b..d4ebf6e9af6a 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 
+struct crypto_aead;
 struct module;
 struct rtattr;
 struct seq_file;
@@ -126,7 +127,6 @@ struct ablkcipher_walk {
 };
 
 extern const struct crypto_type crypto_ablkcipher_type;
-extern const struct crypto_type crypto_aead_type;
 extern const struct crypto_type crypto_blkcipher_type;
 
 void crypto_mod_put(struct crypto_alg *alg);
@@ -241,22 +241,6 @@ static inline void *crypto_ablkcipher_ctx_aligned(struct crypto_ablkcipher *tfm)
 	return crypto_tfm_ctx_aligned(&tfm->base);
 }
 
-static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm)
-{
-	return &crypto_aead_tfm(tfm)->__crt_alg->cra_aead;
-}
-
-static inline void *crypto_aead_ctx(struct crypto_aead *tfm)
-{
-	return crypto_tfm_ctx(&tfm->base);
-}
-
-static inline struct crypto_instance *crypto_aead_alg_instance(
-	struct crypto_aead *aead)
-{
-	return crypto_tfm_alg_instance(&aead->base);
-}
-
 static inline struct crypto_blkcipher *crypto_spawn_blkcipher(
 	struct crypto_spawn *spawn)
 {
@@ -365,21 +349,6 @@ static inline int ablkcipher_tfm_in_queue(struct crypto_queue *queue,
 	return crypto_tfm_in_queue(queue, crypto_ablkcipher_tfm(tfm));
 }
 
-static inline void *aead_request_ctx(struct aead_request *req)
-{
-	return req->__ctx;
-}
-
-static inline void aead_request_complete(struct aead_request *req, int err)
-{
-	req->base.complete(&req->base, err);
-}
-
-static inline u32 aead_request_flags(struct aead_request *req)
-{
-	return req->base.flags;
-}
-
 static inline struct crypto_alg *crypto_get_attr_alg(struct rtattr **tb,
 						     u32 type, u32 mask)
 {
diff --git a/include/crypto/internal/aead.h b/include/crypto/internal/aead.h
index 750948cf4621..a2d104aa3430 100644
--- a/include/crypto/internal/aead.h
+++ b/include/crypto/internal/aead.h
@@ -23,8 +23,40 @@ struct crypto_aead_spawn {
 	struct crypto_spawn base;
 };
 
+extern const struct crypto_type crypto_aead_type;
 extern const struct crypto_type crypto_nivaead_type;
 
+static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm)
+{
+	return &crypto_aead_tfm(tfm)->__crt_alg->cra_aead;
+}
+
+static inline void *crypto_aead_ctx(struct crypto_aead *tfm)
+{
+	return crypto_tfm_ctx(&tfm->base);
+}
+
+static inline struct crypto_instance *crypto_aead_alg_instance(
+	struct crypto_aead *aead)
+{
+	return crypto_tfm_alg_instance(&aead->base);
+}
+
+static inline void *aead_request_ctx(struct aead_request *req)
+{
+	return req->__ctx;
+}
+
+static inline void aead_request_complete(struct aead_request *req, int err)
+{
+	req->base.complete(&req->base, err);
+}
+
+static inline u32 aead_request_flags(struct aead_request *req)
+{
+	return req->base.flags;
+}
+
 static inline void crypto_set_aead_spawn(
 	struct crypto_aead_spawn *spawn, struct crypto_instance *inst)
 {
@@ -50,9 +82,7 @@ static inline struct crypto_alg *crypto_aead_spawn_alg(
 static inline struct crypto_aead *crypto_spawn_aead(
 	struct crypto_aead_spawn *spawn)
 {
-	return __crypto_aead_cast(
-		crypto_spawn_tfm(&spawn->base, CRYPTO_ALG_TYPE_AEAD,
-				 CRYPTO_ALG_TYPE_MASK));
+	return crypto_spawn_tfm2(&spawn->base);
 }
 
 struct crypto_instance *aead_geniv_alloc(struct crypto_template *tmpl,
@@ -64,7 +94,7 @@ void aead_geniv_exit(struct crypto_tfm *tfm);
 
 static inline struct crypto_aead *aead_geniv_base(struct crypto_aead *geniv)
 {
-	return crypto_aead_crt(geniv)->base;
+	return geniv->child;
 }
 
 static inline void *aead_givcrypt_reqctx(struct aead_givcrypt_request *req)
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index ee14140f8893..59ca4086ce6a 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -140,6 +140,7 @@ struct crypto_blkcipher;
 struct crypto_hash;
 struct crypto_tfm;
 struct crypto_type;
+struct aead_request;
 struct aead_givcrypt_request;
 struct skcipher_givcrypt_request;
 
@@ -174,32 +175,6 @@ struct ablkcipher_request {
 	void *__ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
-/**
- *	struct aead_request - AEAD request
- *	@base: Common attributes for async crypto requests
- *	@assoclen: Length in bytes of associated data for authentication
- *	@cryptlen: Length of data to be encrypted or decrypted
- *	@iv: Initialisation vector
- *	@assoc: Associated data
- *	@src: Source data
- *	@dst: Destination data
- *	@__ctx: Start of private context data
- */
-struct aead_request {
-	struct crypto_async_request base;
-
-	unsigned int assoclen;
-	unsigned int cryptlen;
-
-	u8 *iv;
-
-	struct scatterlist *assoc;
-	struct scatterlist *src;
-	struct scatterlist *dst;
-
-	void *__ctx[] CRYPTO_MINALIGN_ATTR;
-};
-
 struct blkcipher_desc {
 	struct crypto_blkcipher *tfm;
 	void *info;
@@ -572,21 +547,6 @@ struct ablkcipher_tfm {
 	unsigned int reqsize;
 };
 
-struct aead_tfm {
-	int (*setkey)(struct crypto_aead *tfm, const u8 *key,
-	              unsigned int keylen);
-	int (*encrypt)(struct aead_request *req);
-	int (*decrypt)(struct aead_request *req);
-	int (*givencrypt)(struct aead_givcrypt_request *req);
-	int (*givdecrypt)(struct aead_givcrypt_request *req);
-
-	struct crypto_aead *base;
-
-	unsigned int ivsize;
-	unsigned int authsize;
-	unsigned int reqsize;
-};
-
 struct blkcipher_tfm {
 	void *iv;
 	int (*setkey)(struct crypto_tfm *tfm, const u8 *key,
@@ -626,7 +586,6 @@ struct compress_tfm {
 };
 
 #define crt_ablkcipher	crt_u.ablkcipher
-#define crt_aead	crt_u.aead
 #define crt_blkcipher	crt_u.blkcipher
 #define crt_cipher	crt_u.cipher
 #define crt_hash	crt_u.hash
@@ -638,7 +597,6 @@ struct crypto_tfm {
 	
 	union {
 		struct ablkcipher_tfm ablkcipher;
-		struct aead_tfm aead;
 		struct blkcipher_tfm blkcipher;
 		struct cipher_tfm cipher;
 		struct hash_tfm hash;
@@ -656,10 +614,6 @@ struct crypto_ablkcipher {
 	struct crypto_tfm base;
 };
 
-struct crypto_aead {
-	struct crypto_tfm base;
-};
-
 struct crypto_blkcipher {
 	struct crypto_tfm base;
 };
@@ -1151,400 +1105,6 @@ static inline void ablkcipher_request_set_crypt(
 	req->info = iv;
 }
 
-/**
- * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API
- *
- * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD
- * (listed as type "aead" in /proc/crypto)
- *
- * The most prominent examples for this type of encryption is GCM and CCM.
- * However, the kernel supports other types of AEAD ciphers which are defined
- * with the following cipher string:
- *
- *	authenc(keyed message digest, block cipher)
- *
- * For example: authenc(hmac(sha256), cbc(aes))
- *
- * The example code provided for the asynchronous block cipher operation
- * applies here as well. Naturally all *ablkcipher* symbols must be exchanged
- * the *aead* pendants discussed in the following. In addtion, for the AEAD
- * operation, the aead_request_set_assoc function must be used to set the
- * pointer to the associated data memory location before performing the
- * encryption or decryption operation. In case of an encryption, the associated
- * data memory is filled during the encryption operation. For decryption, the
- * associated data memory must contain data that is used to verify the integrity
- * of the decrypted data. Another deviation from the asynchronous block cipher
- * operation is that the caller should explicitly check for -EBADMSG of the
- * crypto_aead_decrypt. That error indicates an authentication error, i.e.
- * a breach in the integrity of the message. In essence, that -EBADMSG error
- * code is the key bonus an AEAD cipher has over "standard" block chaining
- * modes.
- */
-
-static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm)
-{
-	return (struct crypto_aead *)tfm;
-}
-
-/**
- * crypto_alloc_aead() - allocate AEAD cipher handle
- * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
- *	     AEAD cipher
- * @type: specifies the type of the cipher
- * @mask: specifies the mask for the cipher
- *
- * Allocate a cipher handle for an AEAD. The returned struct
- * crypto_aead is the cipher handle that is required for any subsequent
- * API invocation for that AEAD.
- *
- * Return: allocated cipher handle in case of success; IS_ERR() is true in case
- *	   of an error, PTR_ERR() returns the error code.
- */
-struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask);
-
-static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm)
-{
-	return &tfm->base;
-}
-
-/**
- * crypto_free_aead() - zeroize and free aead handle
- * @tfm: cipher handle to be freed
- */
-static inline void crypto_free_aead(struct crypto_aead *tfm)
-{
-	crypto_free_tfm(crypto_aead_tfm(tfm));
-}
-
-static inline struct aead_tfm *crypto_aead_crt(struct crypto_aead *tfm)
-{
-	return &crypto_aead_tfm(tfm)->crt_aead;
-}
-
-/**
- * crypto_aead_ivsize() - obtain IV size
- * @tfm: cipher handle
- *
- * The size of the IV for the aead referenced by the cipher handle is
- * returned. This IV size may be zero if the cipher does not need an IV.
- *
- * Return: IV size in bytes
- */
-static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm)
-{
-	return crypto_aead_crt(tfm)->ivsize;
-}
-
-/**
- * crypto_aead_authsize() - obtain maximum authentication data size
- * @tfm: cipher handle
- *
- * The maximum size of the authentication data for the AEAD cipher referenced
- * by the AEAD cipher handle is returned. The authentication data size may be
- * zero if the cipher implements a hard-coded maximum.
- *
- * The authentication data may also be known as "tag value".
- *
- * Return: authentication data size / tag size in bytes
- */
-static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm)
-{
-	return crypto_aead_crt(tfm)->authsize;
-}
-
-/**
- * crypto_aead_blocksize() - obtain block size of cipher
- * @tfm: cipher handle
- *
- * The block size for the AEAD referenced with the cipher handle is returned.
- * The caller may use that information to allocate appropriate memory for the
- * data returned by the encryption or decryption operation
- *
- * Return: block size of cipher
- */
-static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm)
-{
-	return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm));
-}
-
-static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm)
-{
-	return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm));
-}
-
-static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm)
-{
-	return crypto_tfm_get_flags(crypto_aead_tfm(tfm));
-}
-
-static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags)
-{
-	crypto_tfm_set_flags(crypto_aead_tfm(tfm), flags);
-}
-
-static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags)
-{
-	crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags);
-}
-
-/**
- * crypto_aead_setkey() - set key for cipher
- * @tfm: cipher handle
- * @key: buffer holding the key
- * @keylen: length of the key in bytes
- *
- * The caller provided key is set for the AEAD referenced by the cipher
- * handle.
- *
- * Note, the key length determines the cipher type. Many block ciphers implement
- * different cipher modes depending on the key size, such as AES-128 vs AES-192
- * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128
- * is performed.
- *
- * Return: 0 if the setting of the key was successful; < 0 if an error occurred
- */
-static inline int crypto_aead_setkey(struct crypto_aead *tfm, const u8 *key,
-				     unsigned int keylen)
-{
-	struct aead_tfm *crt = crypto_aead_crt(tfm);
-
-	return crt->setkey(crt->base, key, keylen);
-}
-
-/**
- * crypto_aead_setauthsize() - set authentication data size
- * @tfm: cipher handle
- * @authsize: size of the authentication data / tag in bytes
- *
- * Set the authentication data size / tag size. AEAD requires an authentication
- * tag (or MAC) in addition to the associated data.
- *
- * Return: 0 if the setting of the key was successful; < 0 if an error occurred
- */
-int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize);
-
-static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req)
-{
-	return __crypto_aead_cast(req->base.tfm);
-}
-
-/**
- * crypto_aead_encrypt() - encrypt plaintext
- * @req: reference to the aead_request handle that holds all information
- *	 needed to perform the cipher operation
- *
- * Encrypt plaintext data using the aead_request handle. That data structure
- * and how it is filled with data is discussed with the aead_request_*
- * functions.
- *
- * IMPORTANT NOTE The encryption operation creates the authentication data /
- *		  tag. That data is concatenated with the created ciphertext.
- *		  The ciphertext memory size is therefore the given number of
- *		  block cipher blocks + the size defined by the
- *		  crypto_aead_setauthsize invocation. The caller must ensure
- *		  that sufficient memory is available for the ciphertext and
- *		  the authentication tag.
- *
- * Return: 0 if the cipher operation was successful; < 0 if an error occurred
- */
-static inline int crypto_aead_encrypt(struct aead_request *req)
-{
-	return crypto_aead_crt(crypto_aead_reqtfm(req))->encrypt(req);
-}
-
-/**
- * crypto_aead_decrypt() - decrypt ciphertext
- * @req: reference to the ablkcipher_request handle that holds all information
- *	 needed to perform the cipher operation
- *
- * Decrypt ciphertext data using the aead_request handle. That data structure
- * and how it is filled with data is discussed with the aead_request_*
- * functions.
- *
- * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the
- *		  authentication data / tag. That authentication data / tag
- *		  must have the size defined by the crypto_aead_setauthsize
- *		  invocation.
- *
- *
- * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD
- *	   cipher operation performs the authentication of the data during the
- *	   decryption operation. Therefore, the function returns this error if
- *	   the authentication of the ciphertext was unsuccessful (i.e. the
- *	   integrity of the ciphertext or the associated data was violated);
- *	   < 0 if an error occurred.
- */
-static inline int crypto_aead_decrypt(struct aead_request *req)
-{
-	if (req->cryptlen < crypto_aead_authsize(crypto_aead_reqtfm(req)))
-		return -EINVAL;
-
-	return crypto_aead_crt(crypto_aead_reqtfm(req))->decrypt(req);
-}
-
-/**
- * DOC: Asynchronous AEAD Request Handle
- *
- * The aead_request data structure contains all pointers to data required for
- * the AEAD cipher operation. This includes the cipher handle (which can be
- * used by multiple aead_request instances), pointer to plaintext and
- * ciphertext, asynchronous callback function, etc. It acts as a handle to the
- * aead_request_* API calls in a similar way as AEAD handle to the
- * crypto_aead_* API calls.
- */
-
-/**
- * crypto_aead_reqsize() - obtain size of the request data structure
- * @tfm: cipher handle
- *
- * Return: number of bytes
- */
-static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm)
-{
-	return crypto_aead_crt(tfm)->reqsize;
-}
-
-/**
- * aead_request_set_tfm() - update cipher handle reference in request
- * @req: request handle to be modified
- * @tfm: cipher handle that shall be added to the request handle
- *
- * Allow the caller to replace the existing aead handle in the request
- * data structure with a different one.
- */
-static inline void aead_request_set_tfm(struct aead_request *req,
-					struct crypto_aead *tfm)
-{
-	req->base.tfm = crypto_aead_tfm(crypto_aead_crt(tfm)->base);
-}
-
-/**
- * aead_request_alloc() - allocate request data structure
- * @tfm: cipher handle to be registered with the request
- * @gfp: memory allocation flag that is handed to kmalloc by the API call.
- *
- * Allocate the request data structure that must be used with the AEAD
- * encrypt and decrypt API calls. During the allocation, the provided aead
- * handle is registered in the request data structure.
- *
- * Return: allocated request handle in case of success; IS_ERR() is true in case
- *	   of an error, PTR_ERR() returns the error code.
- */
-static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm,
-						      gfp_t gfp)
-{
-	struct aead_request *req;
-
-	req = kmalloc(sizeof(*req) + crypto_aead_reqsize(tfm), gfp);
-
-	if (likely(req))
-		aead_request_set_tfm(req, tfm);
-
-	return req;
-}
-
-/**
- * aead_request_free() - zeroize and free request data structure
- * @req: request data structure cipher handle to be freed
- */
-static inline void aead_request_free(struct aead_request *req)
-{
-	kzfree(req);
-}
-
-/**
- * aead_request_set_callback() - set asynchronous callback function
- * @req: request handle
- * @flags: specify zero or an ORing of the flags
- *	   CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and
- *	   increase the wait queue beyond the initial maximum size;
- *	   CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep
- * @compl: callback function pointer to be registered with the request handle
- * @data: The data pointer refers to memory that is not used by the kernel
- *	  crypto API, but provided to the callback function for it to use. Here,
- *	  the caller can provide a reference to memory the callback function can
- *	  operate on. As the callback function is invoked asynchronously to the
- *	  related functionality, it may need to access data structures of the
- *	  related functionality which can be referenced using this pointer. The
- *	  callback function can access the memory via the "data" field in the
- *	  crypto_async_request data structure provided to the callback function.
- *
- * Setting the callback function that is triggered once the cipher operation
- * completes
- *
- * The callback function is registered with the aead_request handle and
- * must comply with the following template
- *
- *	void callback_function(struct crypto_async_request *req, int error)
- */
-static inline void aead_request_set_callback(struct aead_request *req,
-					     u32 flags,
-					     crypto_completion_t compl,
-					     void *data)
-{
-	req->base.complete = compl;
-	req->base.data = data;
-	req->base.flags = flags;
-}
-
-/**
- * aead_request_set_crypt - set data buffers
- * @req: request handle
- * @src: source scatter / gather list
- * @dst: destination scatter / gather list
- * @cryptlen: number of bytes to process from @src
- * @iv: IV for the cipher operation which must comply with the IV size defined
- *      by crypto_aead_ivsize()
- *
- * Setting the source data and destination data scatter / gather lists.
- *
- * For encryption, the source is treated as the plaintext and the
- * destination is the ciphertext. For a decryption operation, the use is
- * reversed - the source is the ciphertext and the destination is the plaintext.
- *
- * IMPORTANT NOTE AEAD requires an authentication tag (MAC). For decryption,
- *		  the caller must concatenate the ciphertext followed by the
- *		  authentication tag and provide the entire data stream to the
- *		  decryption operation (i.e. the data length used for the
- *		  initialization of the scatterlist and the data length for the
- *		  decryption operation is identical). For encryption, however,
- *		  the authentication tag is created while encrypting the data.
- *		  The destination buffer must hold sufficient space for the
- *		  ciphertext and the authentication tag while the encryption
- *		  invocation must only point to the plaintext data size. The
- *		  following code snippet illustrates the memory usage
- *		  buffer = kmalloc(ptbuflen + (enc ? authsize : 0));
- *		  sg_init_one(&sg, buffer, ptbuflen + (enc ? authsize : 0));
- *		  aead_request_set_crypt(req, &sg, &sg, ptbuflen, iv);
- */
-static inline void aead_request_set_crypt(struct aead_request *req,
-					  struct scatterlist *src,
-					  struct scatterlist *dst,
-					  unsigned int cryptlen, u8 *iv)
-{
-	req->src = src;
-	req->dst = dst;
-	req->cryptlen = cryptlen;
-	req->iv = iv;
-}
-
-/**
- * aead_request_set_assoc() - set the associated data scatter / gather list
- * @req: request handle
- * @assoc: associated data scatter / gather list
- * @assoclen: number of bytes to process from @assoc
- *
- * For encryption, the memory is filled with the associated data. For
- * decryption, the memory must point to the associated data.
- */
-static inline void aead_request_set_assoc(struct aead_request *req,
-					  struct scatterlist *assoc,
-					  unsigned int assoclen)
-{
-	req->assoc = assoc;
-	req->assoclen = assoclen;
-}
-
 /**
  * DOC: Synchronous Block Cipher API
  *
-- 
cgit v1.2.3


From a2029240e5836e73ebcc1a8ddb8c22d636f89c9a Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Mon, 11 May 2015 21:17:53 +0200
Subject: net: deinline netif_tx_stop_all_queues(), remove WARN_ON in
 netif_tx_stop_queue()

These functions compile to 60 bytes of machine code each.
With this .config: http://busybox.net/~vda/kernel_config
there are 617 calls of netif_tx_stop_queue()
and 49 calls of netif_tx_stop_all_queues() in vmlinux.

To fix this, remove WARN_ON in netif_tx_stop_queue()
as suggested by davem, and deinline netif_tx_stop_all_queues().

Change in code size is about 20k:

   text      data      bss       dec     hex filename
82426986 22255416 20627456 125309858 77813a2 vmlinux.before
82406248 22255416 20627456 125289120 777c2a0 vmlinux

gcc-4.7.2 still creates deinlined version of netif_tx_stop_queue
sometimes:

$ nm --size-sort vmlinux | grep netif_tx_stop_queue | wc -l
190

ffffffff81b558a8 <netif_tx_stop_queue>:
ffffffff81b558a8:       55                      push   %rbp
ffffffff81b558a9:       48 89 e5                mov    %rsp,%rbp
ffffffff81b558ac:       f0 80 8f e0 01 00 00    lock orb $0x1,0x1e0(%rdi)
ffffffff81b558b3:       01
ffffffff81b558b4:       5d                      pop    %rbp
ffffffff81b558b5:       c3                      retq

This needs additional fixing.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Alexei Starovoitov <alexei.starovoitov@gmail.com>
CC: Alexander Duyck <alexander.duyck@gmail.com>
CC: Joe Perches <joe@perches.com>
CC: David S. Miller <davem@davemloft.net>
CC: Jiri Pirko <jpirko@redhat.com>
CC: linux-kernel@vger.kernel.org
CC: netdev@vger.kernel.org
CC: netfilter-devel@vger.kernel.org
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 14 +-------------
 net/core/dev.c            | 11 +++++++++++
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2b39235b9f13..fa57915f440c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2559,10 +2559,6 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
 
 static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
 {
-	if (WARN_ON(!dev_queue)) {
-		pr_info("netif_stop_queue() cannot be called before register_netdev()\n");
-		return;
-	}
 	set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
 }
 
@@ -2578,15 +2574,7 @@ static inline void netif_stop_queue(struct net_device *dev)
 	netif_tx_stop_queue(netdev_get_tx_queue(dev, 0));
 }
 
-static inline void netif_tx_stop_all_queues(struct net_device *dev)
-{
-	unsigned int i;
-
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-		netif_tx_stop_queue(txq);
-	}
-}
+void netif_tx_stop_all_queues(struct net_device *dev);
 
 static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index e5f77c40bbd1..fd012bbe0486 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6301,6 +6301,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 	return 0;
 }
 
+void netif_tx_stop_all_queues(struct net_device *dev)
+{
+	unsigned int i;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+		netif_tx_stop_queue(txq);
+	}
+}
+EXPORT_SYMBOL(netif_tx_stop_all_queues);
+
 /**
  *	register_netdevice	- register a network device
  *	@dev: device to register
-- 
cgit v1.2.3


From cffc642d93f9324a06dfbd7da9af29652952a248 Mon Sep 17 00:00:00 2001
From: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Date: Mon, 11 May 2015 22:22:44 -0700
Subject: test_bpf: add 173 new testcases for eBPF

add an exhaustive set of eBPF tests bringing total to:
test_bpf: Summary: 233 PASSED, 0 FAILED, [0/226 JIT'ed]

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |   10 +
 lib/test_bpf.c         | 2192 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2202 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3c03a6085b82..ce1d72d34382 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -207,6 +207,16 @@ struct bpf_prog_aux;
 		.off   = OFF,					\
 		.imm   = 0 })
 
+/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
+
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
 /* Memory store, *(uint *) (dst_reg + off16) = imm32 */
 
 #define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 3c41049d72d8..8bca780e3613 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -1806,6 +1806,2198 @@ static struct bpf_test tests[] = {
 		  0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6},
 		{ { 38, 256 } }
 	},
+	/* BPF_ALU | BPF_MOV | BPF_X */
+	{
+		"ALU_MOV_X: dst = 2",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG(BPF_MOV, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU_MOV_X: dst = 4294967295",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R1, 4294967295),
+			BPF_ALU32_REG(BPF_MOV, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 4294967295 } },
+	},
+	{
+		"ALU64_MOV_X: dst = 2",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG(BPF_MOV, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_MOV_X: dst = 4294967295",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R1, 4294967295),
+			BPF_ALU64_REG(BPF_MOV, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 4294967295 } },
+	},
+	/* BPF_ALU | BPF_MOV | BPF_K */
+	{
+		"ALU_MOV_K: dst = 2",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU_MOV_K: dst = 4294967295",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 4294967295),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 4294967295 } },
+	},
+	{
+		"ALU_MOV_K: 0x0000ffffffff0000 = 0x00000000ffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0x00000000ffffffff),
+			BPF_ALU32_IMM(BPF_MOV, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_MOV_K: dst = 2",
+		.u.insns_int = {
+			BPF_ALU64_IMM(BPF_MOV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_MOV_K: dst = 2147483647",
+		.u.insns_int = {
+			BPF_ALU64_IMM(BPF_MOV, R0, 2147483647),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2147483647 } },
+	},
+	{
+		"ALU64_OR_K: dst = 0x0",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0x0),
+			BPF_ALU64_IMM(BPF_MOV, R2, 0x0),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_MOV_K: dst = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0xffffffffffffffff),
+			BPF_ALU64_IMM(BPF_MOV, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* BPF_ALU | BPF_ADD | BPF_X */
+	{
+		"ALU_ADD_X: 1 + 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG(BPF_ADD, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_ADD_X: 1 + 4294967294 = 4294967295",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 4294967294),
+			BPF_ALU32_REG(BPF_ADD, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 4294967295 } },
+	},
+	{
+		"ALU64_ADD_X: 1 + 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG(BPF_ADD, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_ADD_X: 1 + 4294967294 = 4294967295",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 4294967294),
+			BPF_ALU64_REG(BPF_ADD, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 4294967295 } },
+	},
+	/* BPF_ALU | BPF_ADD | BPF_K */
+	{
+		"ALU_ADD_K: 1 + 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_ADD, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_ADD_K: 3 + 0 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_ADD, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_ADD_K: 1 + 4294967294 = 4294967295",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_ADD, R0, 4294967294),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 4294967295 } },
+	},
+	{
+		"ALU_ADD_K: 0 + (-1) = 0x00000000ffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0),
+			BPF_LD_IMM64(R3, 0x00000000ffffffff),
+			BPF_ALU32_IMM(BPF_ADD, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_ADD_K: 1 + 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU64_IMM(BPF_ADD, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_ADD_K: 3 + 0 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_ADD, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_ADD_K: 1 + 2147483646 = 2147483647",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU64_IMM(BPF_ADD, R0, 2147483646),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2147483647 } },
+	},
+	{
+		"ALU64_ADD_K: 2147483646 + -2147483647 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2147483646),
+			BPF_ALU64_IMM(BPF_ADD, R0, -2147483647),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	{
+		"ALU64_ADD_K: 1 + 0 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x1),
+			BPF_LD_IMM64(R3, 0x1),
+			BPF_ALU64_IMM(BPF_ADD, R2, 0x0),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_ADD_K: 0 + (-1) = 0xffffffffffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0),
+			BPF_LD_IMM64(R3, 0xffffffffffffffff),
+			BPF_ALU64_IMM(BPF_ADD, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* BPF_ALU | BPF_SUB | BPF_X */
+	{
+		"ALU_SUB_X: 3 - 1 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MOV, R1, 1),
+			BPF_ALU32_REG(BPF_SUB, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU_SUB_X: 4294967295 - 4294967294 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 4294967295),
+			BPF_ALU32_IMM(BPF_MOV, R1, 4294967294),
+			BPF_ALU32_REG(BPF_SUB, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_SUB_X: 3 - 1 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MOV, R1, 1),
+			BPF_ALU64_REG(BPF_SUB, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_SUB_X: 4294967295 - 4294967294 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 4294967295),
+			BPF_ALU32_IMM(BPF_MOV, R1, 4294967294),
+			BPF_ALU64_REG(BPF_SUB, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_ALU | BPF_SUB | BPF_K */
+	{
+		"ALU_SUB_K: 3 - 1 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_SUB, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU_SUB_K: 3 - 0 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_SUB, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_SUB_K: 4294967295 - 4294967294 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 4294967295),
+			BPF_ALU32_IMM(BPF_SUB, R0, 4294967294),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_SUB_K: 3 - 1 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_SUB, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_SUB_K: 3 - 0 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_SUB, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_SUB_K: 4294967294 - 4294967295 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 4294967294),
+			BPF_ALU64_IMM(BPF_SUB, R0, 4294967295),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	{
+		"ALU64_ADD_K: 2147483646 - 2147483647 = -1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2147483646),
+			BPF_ALU64_IMM(BPF_SUB, R0, 2147483647),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -1 } },
+	},
+	/* BPF_ALU | BPF_MUL | BPF_X */
+	{
+		"ALU_MUL_X: 2 * 3 = 6",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU32_IMM(BPF_MOV, R1, 3),
+			BPF_ALU32_REG(BPF_MUL, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 6 } },
+	},
+	{
+		"ALU_MUL_X: 2 * 0x7FFFFFF8 = 0xFFFFFFF0",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU32_IMM(BPF_MOV, R1, 0x7FFFFFF8),
+			BPF_ALU32_REG(BPF_MUL, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xFFFFFFF0 } },
+	},
+	{
+		"ALU_MUL_X: -1 * -1 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -1),
+			BPF_ALU32_IMM(BPF_MOV, R1, -1),
+			BPF_ALU32_REG(BPF_MUL, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_MUL_X: 2 * 3 = 6",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU32_IMM(BPF_MOV, R1, 3),
+			BPF_ALU64_REG(BPF_MUL, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 6 } },
+	},
+	{
+		"ALU64_MUL_X: 1 * 2147483647 = 2147483647",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2147483647),
+			BPF_ALU64_REG(BPF_MUL, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2147483647 } },
+	},
+	/* BPF_ALU | BPF_MUL | BPF_K */
+	{
+		"ALU_MUL_K: 2 * 3 = 6",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU32_IMM(BPF_MUL, R0, 3),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 6 } },
+	},
+	{
+		"ALU_MUL_K: 3 * 1 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MUL, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_MUL_K: 2 * 0x7FFFFFF8 = 0xFFFFFFF0",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU32_IMM(BPF_MUL, R0, 0x7FFFFFF8),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xFFFFFFF0 } },
+	},
+	{
+		"ALU_MUL_K: 1 * (-1) = 0x00000000ffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x1),
+			BPF_LD_IMM64(R3, 0x00000000ffffffff),
+			BPF_ALU32_IMM(BPF_MUL, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_MUL_K: 2 * 3 = 6",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU64_IMM(BPF_MUL, R0, 3),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 6 } },
+	},
+	{
+		"ALU64_MUL_K: 3 * 1 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_MUL, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_MUL_K: 1 * 2147483647 = 2147483647",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU64_IMM(BPF_MUL, R0, 2147483647),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2147483647 } },
+	},
+	{
+		"ALU64_MUL_K: 1 * -2147483647 = -2147483647",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU64_IMM(BPF_MUL, R0, -2147483647),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -2147483647 } },
+	},
+	{
+		"ALU64_MUL_K: 1 * (-1) = 0xffffffffffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x1),
+			BPF_LD_IMM64(R3, 0xffffffffffffffff),
+			BPF_ALU64_IMM(BPF_MUL, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* BPF_ALU | BPF_DIV | BPF_X */
+	{
+		"ALU_DIV_X: 6 / 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 6),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG(BPF_DIV, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_DIV_X: 4294967295 / 4294967295 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 4294967295),
+			BPF_ALU32_IMM(BPF_MOV, R1, 4294967295),
+			BPF_ALU32_REG(BPF_DIV, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_DIV_X: 6 / 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 6),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG(BPF_DIV, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_DIV_X: 2147483647 / 2147483647 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2147483647),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2147483647),
+			BPF_ALU64_REG(BPF_DIV, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_DIV_X: 0xffffffffffffffff / (-1) = 0x0000000000000001",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffffffffffUL),
+			BPF_LD_IMM64(R4, 0xffffffffffffffffUL),
+			BPF_LD_IMM64(R3, 0x0000000000000001UL),
+			BPF_ALU64_REG(BPF_DIV, R2, R4),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* BPF_ALU | BPF_DIV | BPF_K */
+	{
+		"ALU_DIV_K: 6 / 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 6),
+			BPF_ALU32_IMM(BPF_DIV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_DIV_K: 3 / 1 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_DIV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_DIV_K: 4294967295 / 4294967295 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 4294967295),
+			BPF_ALU32_IMM(BPF_DIV, R0, 4294967295),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU_DIV_K: 0xffffffffffffffff / (-1) = 0x1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffffffffffUL),
+			BPF_LD_IMM64(R3, 0x1UL),
+			BPF_ALU32_IMM(BPF_DIV, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_DIV_K: 6 / 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 6),
+			BPF_ALU64_IMM(BPF_DIV, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_DIV_K: 3 / 1 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_DIV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_DIV_K: 2147483647 / 2147483647 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2147483647),
+			BPF_ALU64_IMM(BPF_DIV, R0, 2147483647),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_DIV_K: 0xffffffffffffffff / (-1) = 0x0000000000000001",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffffffffffUL),
+			BPF_LD_IMM64(R3, 0x0000000000000001UL),
+			BPF_ALU64_IMM(BPF_DIV, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* BPF_ALU | BPF_MOD | BPF_X */
+	{
+		"ALU_MOD_X: 3 % 2 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG(BPF_MOD, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU_MOD_X: 4294967295 % 4294967293 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 4294967295),
+			BPF_ALU32_IMM(BPF_MOV, R1, 4294967293),
+			BPF_ALU32_REG(BPF_MOD, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_MOD_X: 3 % 2 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG(BPF_MOD, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_MOD_X: 2147483647 % 2147483645 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2147483647),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2147483645),
+			BPF_ALU64_REG(BPF_MOD, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	/* BPF_ALU | BPF_MOD | BPF_K */
+	{
+		"ALU_MOD_K: 3 % 2 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MOD, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU_MOD_K: 3 % 1 = 0",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MOD, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0 } },
+	},
+	{
+		"ALU_MOD_K: 4294967295 % 4294967293 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 4294967295),
+			BPF_ALU32_IMM(BPF_MOD, R0, 4294967293),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_MOD_K: 3 % 2 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_MOD, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_MOD_K: 3 % 1 = 0",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_MOD, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0 } },
+	},
+	{
+		"ALU64_MOD_K: 2147483647 % 2147483645 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2147483647),
+			BPF_ALU64_IMM(BPF_MOD, R0, 2147483645),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	/* BPF_ALU | BPF_AND | BPF_X */
+	{
+		"ALU_AND_X: 3 & 2 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG(BPF_AND, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU_AND_X: 0xffffffff & 0xffffffff = 0xffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xffffffff),
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xffffffff),
+			BPF_ALU32_REG(BPF_AND, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ALU64_AND_X: 3 & 2 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG(BPF_AND, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_AND_X: 0xffffffff & 0xffffffff = 0xffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xffffffff),
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xffffffff),
+			BPF_ALU64_REG(BPF_AND, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	/* BPF_ALU | BPF_AND | BPF_K */
+	{
+		"ALU_AND_K: 3 & 2 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU32_IMM(BPF_AND, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU_AND_K: 0xffffffff & 0xffffffff = 0xffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xffffffff),
+			BPF_ALU32_IMM(BPF_AND, R0, 0xffffffff),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ALU64_AND_K: 3 & 2 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_AND, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_AND_K: 0xffffffff & 0xffffffff = 0xffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xffffffff),
+			BPF_ALU64_IMM(BPF_AND, R0, 0xffffffff),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ALU64_AND_K: 0x0000ffffffff0000 & 0x0 = 0x0000ffff00000000",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0x0000000000000000),
+			BPF_ALU64_IMM(BPF_AND, R2, 0x0),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_AND_K: 0x0000ffffffff0000 & -1 = 0x0000ffffffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0x0000ffffffff0000),
+			BPF_ALU64_IMM(BPF_AND, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_AND_K: 0xffffffffffffffff & -1 = 0xffffffffffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffffffffffffffff),
+			BPF_LD_IMM64(R3, 0xffffffffffffffff),
+			BPF_ALU64_IMM(BPF_AND, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* BPF_ALU | BPF_OR | BPF_X */
+	{
+		"ALU_OR_X: 1 | 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU32_REG(BPF_OR, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_OR_X: 0x0 | 0xffffffff = 0xffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0),
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xffffffff),
+			BPF_ALU32_REG(BPF_OR, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ALU64_OR_X: 1 | 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 2),
+			BPF_ALU64_REG(BPF_OR, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_OR_X: 0 | 0xffffffff = 0xffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0),
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xffffffff),
+			BPF_ALU64_REG(BPF_OR, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	/* BPF_ALU | BPF_OR | BPF_K */
+	{
+		"ALU_OR_K: 1 | 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_OR, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_OR_K: 0 & 0xffffffff = 0xffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0),
+			BPF_ALU32_IMM(BPF_OR, R0, 0xffffffff),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ALU64_OR_K: 1 | 2 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU64_IMM(BPF_OR, R0, 2),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_OR_K: 0 & 0xffffffff = 0xffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0),
+			BPF_ALU64_IMM(BPF_OR, R0, 0xffffffff),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ALU64_OR_K: 0x0000ffffffff0000 | 0x0 = 0x0000ffff00000000",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0x0000ffffffff0000),
+			BPF_ALU64_IMM(BPF_OR, R2, 0x0),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_OR_K: 0x0000ffffffff0000 | -1 = 0xffffffffffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0xffffffffffffffff),
+			BPF_ALU64_IMM(BPF_OR, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_OR_K: 0x000000000000000 | -1 = 0xffffffffffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000000000000000),
+			BPF_LD_IMM64(R3, 0xffffffffffffffff),
+			BPF_ALU64_IMM(BPF_OR, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* BPF_ALU | BPF_XOR | BPF_X */
+	{
+		"ALU_XOR_X: 5 ^ 6 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 5),
+			BPF_ALU32_IMM(BPF_MOV, R1, 6),
+			BPF_ALU32_REG(BPF_XOR, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_XOR_X: 0x1 ^ 0xffffffff = 0xfffffffe",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xffffffff),
+			BPF_ALU32_REG(BPF_XOR, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xfffffffe } },
+	},
+	{
+		"ALU64_XOR_X: 5 ^ 6 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 5),
+			BPF_ALU32_IMM(BPF_MOV, R1, 6),
+			BPF_ALU64_REG(BPF_XOR, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_XOR_X: 1 ^ 0xffffffff = 0xfffffffe",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 0xffffffff),
+			BPF_ALU64_REG(BPF_XOR, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xfffffffe } },
+	},
+	/* BPF_ALU | BPF_XOR | BPF_K */
+	{
+		"ALU_XOR_K: 5 ^ 6 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 5),
+			BPF_ALU32_IMM(BPF_XOR, R0, 6),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU_XOR_K: 1 ^ 0xffffffff = 0xfffffffe",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_XOR, R0, 0xffffffff),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xfffffffe } },
+	},
+	{
+		"ALU64_XOR_K: 5 ^ 6 = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 5),
+			BPF_ALU64_IMM(BPF_XOR, R0, 6),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_XOR_K: 1 & 0xffffffff = 0xfffffffe",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU64_IMM(BPF_XOR, R0, 0xffffffff),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xfffffffe } },
+	},
+	{
+		"ALU64_XOR_K: 0x0000ffffffff0000 ^ 0x0 = 0x0000ffffffff0000",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0x0000ffffffff0000),
+			BPF_ALU64_IMM(BPF_XOR, R2, 0x0),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_XOR_K: 0x0000ffffffff0000 ^ -1 = 0xffff00000000ffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000ffffffff0000),
+			BPF_LD_IMM64(R3, 0xffff00000000ffff),
+			BPF_ALU64_IMM(BPF_XOR, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ALU64_XOR_K: 0x000000000000000 ^ -1 = 0xffffffffffffffff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0x0000000000000000),
+			BPF_LD_IMM64(R3, 0xffffffffffffffff),
+			BPF_ALU64_IMM(BPF_XOR, R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	/* BPF_ALU | BPF_LSH | BPF_X */
+	{
+		"ALU_LSH_X: 1 << 1 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 1),
+			BPF_ALU32_REG(BPF_LSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU_LSH_X: 1 << 31 = 0x80000000",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 31),
+			BPF_ALU32_REG(BPF_LSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x80000000 } },
+	},
+	{
+		"ALU64_LSH_X: 1 << 1 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 1),
+			BPF_ALU64_REG(BPF_LSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_LSH_X: 1 << 31 = 0x80000000",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_MOV, R1, 31),
+			BPF_ALU64_REG(BPF_LSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x80000000 } },
+	},
+	/* BPF_ALU | BPF_LSH | BPF_K */
+	{
+		"ALU_LSH_K: 1 << 1 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_LSH, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU_LSH_K: 1 << 31 = 0x80000000",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU32_IMM(BPF_LSH, R0, 31),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x80000000 } },
+	},
+	{
+		"ALU64_LSH_K: 1 << 1 = 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU64_IMM(BPF_LSH, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 2 } },
+	},
+	{
+		"ALU64_LSH_K: 1 << 31 = 0x80000000",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 1),
+			BPF_ALU64_IMM(BPF_LSH, R0, 31),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x80000000 } },
+	},
+	/* BPF_ALU | BPF_RSH | BPF_X */
+	{
+		"ALU_RSH_X: 2 >> 1 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU32_IMM(BPF_MOV, R1, 1),
+			BPF_ALU32_REG(BPF_RSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU_RSH_X: 0x80000000 >> 31 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x80000000),
+			BPF_ALU32_IMM(BPF_MOV, R1, 31),
+			BPF_ALU32_REG(BPF_RSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_RSH_X: 2 >> 1 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU32_IMM(BPF_MOV, R1, 1),
+			BPF_ALU64_REG(BPF_RSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_RSH_X: 0x80000000 >> 31 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x80000000),
+			BPF_ALU32_IMM(BPF_MOV, R1, 31),
+			BPF_ALU64_REG(BPF_RSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_ALU | BPF_RSH | BPF_K */
+	{
+		"ALU_RSH_K: 2 >> 1 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU32_IMM(BPF_RSH, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU_RSH_K: 0x80000000 >> 31 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x80000000),
+			BPF_ALU32_IMM(BPF_RSH, R0, 31),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_RSH_K: 2 >> 1 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 2),
+			BPF_ALU64_IMM(BPF_RSH, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"ALU64_RSH_K: 0x80000000 >> 31 = 1",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x80000000),
+			BPF_ALU64_IMM(BPF_RSH, R0, 31),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_ALU | BPF_ARSH | BPF_X */
+	{
+		"ALU_ARSH_X: 0xff00ff0000000000 >> 40 = 0xffffffffffff00ff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xff00ff0000000000LL),
+			BPF_ALU32_IMM(BPF_MOV, R1, 40),
+			BPF_ALU64_REG(BPF_ARSH, R0, R1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffff00ff } },
+	},
+	/* BPF_ALU | BPF_ARSH | BPF_K */
+	{
+		"ALU_ARSH_K: 0xff00ff0000000000 >> 40 = 0xffffffffffff00ff",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0xff00ff0000000000LL),
+			BPF_ALU64_IMM(BPF_ARSH, R0, 40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffff00ff } },
+	},
+	/* BPF_ALU | BPF_NEG */
+	{
+		"ALU_NEG: -(3) = -3",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 3),
+			BPF_ALU32_IMM(BPF_NEG, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	{
+		"ALU_NEG: -(-3) = 3",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, -3),
+			BPF_ALU32_IMM(BPF_NEG, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	{
+		"ALU64_NEG: -(3) = -3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 3),
+			BPF_ALU64_IMM(BPF_NEG, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, -3 } },
+	},
+	{
+		"ALU64_NEG: -(-3) = 3",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, -3),
+			BPF_ALU64_IMM(BPF_NEG, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 3 } },
+	},
+	/* BPF_ALU | BPF_END | BPF_FROM_BE */
+	{
+		"ALU_END_FROM_BE 16: 0x0123456789abcdef -> 0xcdef",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_ENDIAN(BPF_FROM_BE, R0, 16),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0,  cpu_to_be16(0xcdef) } },
+	},
+	{
+		"ALU_END_FROM_BE 32: 0x0123456789abcdef -> 0x89abcdef",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_ENDIAN(BPF_FROM_BE, R0, 32),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, cpu_to_be32(0x89abcdef) } },
+	},
+	{
+		"ALU_END_FROM_BE 64: 0x0123456789abcdef -> 0x89abcdef",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_ENDIAN(BPF_FROM_BE, R0, 64),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, (u32) cpu_to_be64(0x0123456789abcdefLL) } },
+	},
+	/* BPF_ALU | BPF_END | BPF_FROM_LE */
+	{
+		"ALU_END_FROM_LE 16: 0x0123456789abcdef -> 0xefcd",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_ENDIAN(BPF_FROM_LE, R0, 16),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, cpu_to_le16(0xcdef) } },
+	},
+	{
+		"ALU_END_FROM_LE 32: 0x0123456789abcdef -> 0xefcdab89",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_ENDIAN(BPF_FROM_LE, R0, 32),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, cpu_to_le32(0x89abcdef) } },
+	},
+	{
+		"ALU_END_FROM_LE 64: 0x0123456789abcdef -> 0x67452301",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0x0123456789abcdefLL),
+			BPF_ENDIAN(BPF_FROM_LE, R0, 64),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, (u32) cpu_to_le64(0x0123456789abcdefLL) } },
+	},
+	/* BPF_ST(X) | BPF_MEM | BPF_B/H/W/DW */
+	{
+		"ST_MEM_B: Store/Load byte: max negative",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_ST_MEM(BPF_B, R10, -40, 0xff),
+			BPF_LDX_MEM(BPF_B, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xff } },
+	},
+	{
+		"ST_MEM_B: Store/Load byte: max positive",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_ST_MEM(BPF_H, R10, -40, 0x7f),
+			BPF_LDX_MEM(BPF_H, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x7f } },
+	},
+	{
+		"STX_MEM_B: Store/Load byte: max negative",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0),
+			BPF_LD_IMM64(R1, 0xffLL),
+			BPF_STX_MEM(BPF_B, R10, R1, -40),
+			BPF_LDX_MEM(BPF_B, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xff } },
+	},
+	{
+		"ST_MEM_H: Store/Load half word: max negative",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_ST_MEM(BPF_H, R10, -40, 0xffff),
+			BPF_LDX_MEM(BPF_H, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffff } },
+	},
+	{
+		"ST_MEM_H: Store/Load half word: max positive",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_ST_MEM(BPF_H, R10, -40, 0x7fff),
+			BPF_LDX_MEM(BPF_H, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x7fff } },
+	},
+	{
+		"STX_MEM_H: Store/Load half word: max negative",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0),
+			BPF_LD_IMM64(R1, 0xffffLL),
+			BPF_STX_MEM(BPF_H, R10, R1, -40),
+			BPF_LDX_MEM(BPF_H, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffff } },
+	},
+	{
+		"ST_MEM_W: Store/Load word: max negative",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_ST_MEM(BPF_W, R10, -40, 0xffffffff),
+			BPF_LDX_MEM(BPF_W, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ST_MEM_W: Store/Load word: max positive",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_ST_MEM(BPF_W, R10, -40, 0x7fffffff),
+			BPF_LDX_MEM(BPF_W, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x7fffffff } },
+	},
+	{
+		"STX_MEM_W: Store/Load word: max negative",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0),
+			BPF_LD_IMM64(R1, 0xffffffffLL),
+			BPF_STX_MEM(BPF_W, R10, R1, -40),
+			BPF_LDX_MEM(BPF_W, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ST_MEM_DW: Store/Load double word: max negative",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_ST_MEM(BPF_DW, R10, -40, 0xffffffff),
+			BPF_LDX_MEM(BPF_DW, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	{
+		"ST_MEM_DW: Store/Load double word: max negative 2",
+		.u.insns_int = {
+			BPF_LD_IMM64(R2, 0xffff00000000ffff),
+			BPF_LD_IMM64(R3, 0xffffffffffffffff),
+			BPF_ST_MEM(BPF_DW, R10, -40, 0xffffffff),
+			BPF_LDX_MEM(BPF_DW, R2, R10, -40),
+			BPF_JMP_REG(BPF_JEQ, R2, R3, 2),
+			BPF_MOV32_IMM(R0, 2),
+			BPF_EXIT_INSN(),
+			BPF_MOV32_IMM(R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x1 } },
+	},
+	{
+		"ST_MEM_DW: Store/Load double word: max positive",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_ST_MEM(BPF_DW, R10, -40, 0x7fffffff),
+			BPF_LDX_MEM(BPF_DW, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x7fffffff } },
+	},
+	{
+		"STX_MEM_DW: Store/Load double word: max negative",
+		.u.insns_int = {
+			BPF_LD_IMM64(R0, 0),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_STX_MEM(BPF_W, R10, R1, -40),
+			BPF_LDX_MEM(BPF_W, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0xffffffff } },
+	},
+	/* BPF_STX | BPF_XADD | BPF_W/DW */
+	{
+		"STX_XADD_W: Test: 0x12 + 0x10 = 0x22",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
+			BPF_ST_MEM(BPF_W, R10, -40, 0x10),
+			BPF_STX_XADD(BPF_W, R10, R0, -40),
+			BPF_LDX_MEM(BPF_W, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x22 } },
+	},
+	{
+		"STX_XADD_DW: Test: 0x12 + 0x10 = 0x22",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0x12),
+			BPF_ST_MEM(BPF_DW, R10, -40, 0x10),
+			BPF_STX_XADD(BPF_DW, R10, R0, -40),
+			BPF_LDX_MEM(BPF_DW, R0, R10, -40),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x22 } },
+	},
+	/* BPF_JMP | BPF_EXIT */
+	{
+		"JMP_EXIT",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0x4711),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0x4712),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 0x4711 } },
+	},
+	/* BPF_JMP | BPF_JA */
+	{
+		"JMP_JA: Unconditional jump: if (true) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JSGT | BPF_K */
+	{
+		"JMP_JSGT_K: Signed jump: if (-1 > -2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_JMP_IMM(BPF_JSGT, R1, -2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSGT_K: Signed jump: if (-1 > -1) return 0",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_JMP_IMM(BPF_JSGT, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JSGE | BPF_K */
+	{
+		"JMP_JSGE_K: Signed jump: if (-1 >= -2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_JMP_IMM(BPF_JSGE, R1, -2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSGE_K: Signed jump: if (-1 >= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 0xffffffffffffffffLL),
+			BPF_JMP_IMM(BPF_JSGE, R1, -1, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JGT | BPF_K */
+	{
+		"JMP_JGT_K: if (3 > 2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JGT, R1, 2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JGE | BPF_K */
+	{
+		"JMP_JGE_K: if (3 >= 2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JGE, R1, 2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JGE_K: if (3 >= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JGE, R1, 3, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JNE | BPF_K */
+	{
+		"JMP_JNE_K: if (3 != 2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JNE, R1, 2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JEQ | BPF_K */
+	{
+		"JMP_JEQ_K: if (3 == 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JEQ, R1, 3, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JSET | BPF_K */
+	{
+		"JMP_JSET_K: if (0x3 & 0x2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JNE, R1, 2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSET_K: if (0x3 & 0xffffffff) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_JMP_IMM(BPF_JNE, R1, 0xffffffff, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JSGT | BPF_X */
+	{
+		"JMP_JSGT_X: Signed jump: if (-1 > -2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -2),
+			BPF_JMP_REG(BPF_JSGT, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSGT_X: Signed jump: if (-1 > -1) return 0",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -1),
+			BPF_JMP_REG(BPF_JSGT, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JSGE | BPF_X */
+	{
+		"JMP_JSGE_X: Signed jump: if (-1 >= -2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -2),
+			BPF_JMP_REG(BPF_JSGE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSGE_X: Signed jump: if (-1 >= -1) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, -1),
+			BPF_LD_IMM64(R2, -1),
+			BPF_JMP_REG(BPF_JSGE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JGT | BPF_X */
+	{
+		"JMP_JGT_X: if (3 > 2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JGT, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JGE | BPF_X */
+	{
+		"JMP_JGE_X: if (3 >= 2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JGE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JGE_X: if (3 >= 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 3),
+			BPF_JMP_REG(BPF_JGE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JNE | BPF_X */
+	{
+		"JMP_JNE_X: if (3 != 2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JNE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JEQ | BPF_X */
+	{
+		"JMP_JEQ_X: if (3 == 3) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 3),
+			BPF_JMP_REG(BPF_JEQ, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	/* BPF_JMP | BPF_JSET | BPF_X */
+	{
+		"JMP_JSET_X: if (0x3 & 0x2) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 2),
+			BPF_JMP_REG(BPF_JNE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
+	{
+		"JMP_JSET_X: if (0x3 & 0xffffffff) return 1",
+		.u.insns_int = {
+			BPF_ALU32_IMM(BPF_MOV, R0, 0),
+			BPF_LD_IMM64(R1, 3),
+			BPF_LD_IMM64(R2, 0xffffffff),
+			BPF_JMP_REG(BPF_JNE, R1, R2, 1),
+			BPF_EXIT_INSN(),
+			BPF_ALU32_IMM(BPF_MOV, R0, 1),
+			BPF_EXIT_INSN(),
+		},
+		INTERNAL,
+		{ },
+		{ { 0, 1 } },
+	},
 };
 
 static struct net_device dev;
-- 
cgit v1.2.3


From 9abdffe286c1532a54d5aee31571d3029be4026c Mon Sep 17 00:00:00 2001
From: Sumit Semwal <sumit.semwal@linaro.org>
Date: Tue, 5 May 2015 14:56:15 +0530
Subject: dma-buf: add ref counting for module as exporter

Add reference counting on a kernel module that exports dma-buf and
implements its operations. This prevents the module from being unloaded
while DMABUF file is in use.

The original patch [1] was submitted by Tomasz Stanislawski, but this
is a simpler way to do it.

v3: call module_put() as late as possible, per gregkh's comment.
v2: move owner to struct dma_buf, and use DEFINE_DMA_BUF_EXPORT_INFO
    macro to simplify the change.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sumit Semwal <sumit.semwal@linaro.org>

[1]: https://lkml.org/lkml/2012/8/8/163
---
 drivers/dma-buf/dma-buf.c | 10 +++++++++-
 include/linux/dma-buf.h   | 10 ++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index c5a9138a6a8d..63a9914e42b8 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -29,6 +29,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/export.h>
 #include <linux/debugfs.h>
+#include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
 #include <linux/reservation.h>
@@ -72,6 +73,7 @@ static int dma_buf_release(struct inode *inode, struct file *file)
 	if (dmabuf->resv == (struct reservation_object *)&dmabuf[1])
 		reservation_object_fini(dmabuf->resv);
 
+	module_put(dmabuf->owner);
 	kfree(dmabuf);
 	return 0;
 }
@@ -302,14 +304,20 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (!try_module_get(exp_info->owner))
+		return ERR_PTR(-ENOENT);
+
 	dmabuf = kzalloc(alloc_size, GFP_KERNEL);
-	if (dmabuf == NULL)
+	if (!dmabuf) {
+		module_put(exp_info->owner);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	dmabuf->priv = exp_info->priv;
 	dmabuf->ops = exp_info->ops;
 	dmabuf->size = exp_info->size;
 	dmabuf->exp_name = exp_info->exp_name;
+	dmabuf->owner = exp_info->owner;
 	init_waitqueue_head(&dmabuf->poll);
 	dmabuf->cb_excl.poll = dmabuf->cb_shared.poll = &dmabuf->poll;
 	dmabuf->cb_excl.active = dmabuf->cb_shared.active = 0;
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 2f0b431b73e0..f98bd7068d55 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -115,6 +115,8 @@ struct dma_buf_ops {
  * @attachments: list of dma_buf_attachment that denotes all devices attached.
  * @ops: dma_buf_ops associated with this buffer object.
  * @exp_name: name of the exporter; useful for debugging.
+ * @owner: pointer to exporter module; used for refcounting when exporter is a
+ *         kernel module.
  * @list_node: node for dma_buf accounting and debugging.
  * @priv: exporter specific private data for this buffer object.
  * @resv: reservation object linked to this dma-buf
@@ -129,6 +131,7 @@ struct dma_buf {
 	unsigned vmapping_counter;
 	void *vmap_ptr;
 	const char *exp_name;
+	struct module *owner;
 	struct list_head list_node;
 	void *priv;
 	struct reservation_object *resv;
@@ -164,7 +167,8 @@ struct dma_buf_attachment {
 
 /**
  * struct dma_buf_export_info - holds information needed to export a dma_buf
- * @exp_name:	name of the exporting module - useful for debugging.
+ * @exp_name:	name of the exporter - useful for debugging.
+ * @owner:	pointer to exporter module - used for refcounting kernel module
  * @ops:	Attach allocator-defined dma buf ops to the new buffer
  * @size:	Size of the buffer
  * @flags:	mode flags for the file
@@ -176,6 +180,7 @@ struct dma_buf_attachment {
  */
 struct dma_buf_export_info {
 	const char *exp_name;
+	struct module *owner;
 	const struct dma_buf_ops *ops;
 	size_t size;
 	int flags;
@@ -187,7 +192,8 @@ struct dma_buf_export_info {
  * helper macro for exporters; zeros and fills in most common values
  */
 #define DEFINE_DMA_BUF_EXPORT_INFO(a)	\
-	struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME }
+	struct dma_buf_export_info a = { .exp_name = KBUILD_MODNAME, \
+					 .owner = THIS_MODULE }
 
 /**
  * get_dma_buf - convenience wrapper for get_file.
-- 
cgit v1.2.3


From 25e4fe92a20bbffde87500615250f1d54bfb832f Mon Sep 17 00:00:00 2001
From: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Date: Tue, 12 May 2015 20:12:23 +0300
Subject: gpiolib: cleanup chained handler and data

Clean up chained handler and handler data if they were set by
gpiochip_set_chained_irqchip().

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 7 +++++++
 include/linux/gpio/driver.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 2ce3df3504e6..59cb4303e251 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -443,6 +443,8 @@ void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip,
 		 */
 		irq_set_handler_data(parent_irq, gpiochip);
 		irq_set_chained_handler(parent_irq, parent_handler);
+
+		gpiochip->irq_parent = parent_irq;
 	}
 
 	/* Set the parent IRQ for all affected IRQs */
@@ -551,6 +553,11 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gpiochip)
 
 	acpi_gpiochip_free_interrupts(gpiochip);
 
+	if (gpiochip->irq_parent) {
+		irq_set_chained_handler(gpiochip->irq_parent, NULL);
+		irq_set_handler_data(gpiochip->irq_parent, NULL);
+	}
+
 	/* Remove all IRQ mappings and delete the domain */
 	if (gpiochip->irqdomain) {
 		for (offset = 0; offset < gpiochip->ngpio; offset++)
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 2c1e639f66bd..96a678842cde 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -121,6 +121,7 @@ struct gpio_chip {
 	unsigned int		irq_base;
 	irq_flow_handler_t	irq_handler;
 	unsigned int		irq_default_type;
+	int			irq_parent;
 #endif
 
 #if defined(CONFIG_OF_GPIO)
-- 
cgit v1.2.3


From 42c86547f4e5c2e81616c76ce9a2badce515c41f Mon Sep 17 00:00:00 2001
From: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Date: Wed, 11 Mar 2015 11:34:25 +0100
Subject: clk: Expose clk_hw_reparent() to providers

To be used by clock implementations for switching to a new parent during
rate change.

Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/clk/clk.c            | 8 ++++++++
 include/linux/clk-provider.h | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 459ce9da13e0..5315a273eae9 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -2020,6 +2020,14 @@ static void clk_core_reparent(struct clk_core *clk,
 	__clk_recalc_rates(clk, POST_RATE_CHANGE);
 }
 
+void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent)
+{
+	if (!hw)
+		return;
+
+	clk_core_reparent(hw->core, !new_parent ? NULL : new_parent->core);
+}
+
 /**
  * clk_has_parent - check if a clock is a possible parent for another
  * @clk: clock source
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index df695313f975..51efb9ec3e37 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -589,6 +589,7 @@ long __clk_mux_determine_rate_closest(struct clk_hw *hw, unsigned long rate,
 			      unsigned long max_rate,
 			      unsigned long *best_parent_rate,
 			      struct clk_hw **best_parent_p);
+void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent);
 
 static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src)
 {
-- 
cgit v1.2.3


From f05be589ff32e87821b86845625ed3d402d37dc7 Mon Sep 17 00:00:00 2001
From: Boris BREZILLON <boris.brezillon@free-electrons.com>
Date: Fri, 10 Apr 2015 12:09:01 +0800
Subject: mfd: axp20x: Add AXP22x PMIC support

Add support for the AXP22x PMIC devices to the existing AXP20x driver.
This includes the AXP221 and AXP223, which are identical except for
the external data bus. Only AXP221 is added for now. AXP223 will be
added after it's Reduced Serial Bus (RSB) interface is supported.

AXP22x defines a new set of registers, power supplies and regulators,
but most of the API is similar to the AXP20x ones.

A new irq chip definition is used, even though the available interrupts
on AXP22x is a subset of those on AXP20x. This is done so the interrupt
numbers match those on the datasheet.

This patch only enables the interrupts, system power-off function, and PEK
sub-device. The regulator driver must first support different variants
before we enable it from the mfd driver.

Signed-off-by: Boris BREZILLON <boris.brezillon@free-electrons.com>
[wens@csie.org: fix interrupts and move regulators to separate patch]
Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/axp20x.c       | 98 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h | 86 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 184 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index d18029be6a78..cfbb7d7aead6 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -32,6 +32,7 @@
 static const char * const axp20x_model_names[] = {
 	"AXP202",
 	"AXP209",
+	"AXP221",
 	"AXP288",
 };
 
@@ -54,6 +55,25 @@ static const struct regmap_access_table axp20x_volatile_table = {
 	.n_yes_ranges	= ARRAY_SIZE(axp20x_volatile_ranges),
 };
 
+static const struct regmap_range axp22x_writeable_ranges[] = {
+	regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_IRQ5_STATE),
+	regmap_reg_range(AXP20X_DCDC_MODE, AXP22X_BATLOW_THRES1),
+};
+
+static const struct regmap_range axp22x_volatile_ranges[] = {
+	regmap_reg_range(AXP20X_IRQ1_EN, AXP20X_IRQ5_STATE),
+};
+
+static const struct regmap_access_table axp22x_writeable_table = {
+	.yes_ranges	= axp22x_writeable_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(axp22x_writeable_ranges),
+};
+
+static const struct regmap_access_table axp22x_volatile_table = {
+	.yes_ranges	= axp22x_volatile_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(axp22x_volatile_ranges),
+};
+
 static const struct regmap_range axp288_writeable_ranges[] = {
 	regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_IRQ6_STATE),
 	regmap_reg_range(AXP20X_DCDC_MODE, AXP288_FG_TUNE5),
@@ -87,6 +107,20 @@ static struct resource axp20x_pek_resources[] = {
 	},
 };
 
+static struct resource axp22x_pek_resources[] = {
+	{
+		.name   = "PEK_DBR",
+		.start  = AXP22X_IRQ_PEK_RIS_EDGE,
+		.end    = AXP22X_IRQ_PEK_RIS_EDGE,
+		.flags  = IORESOURCE_IRQ,
+	}, {
+		.name   = "PEK_DBF",
+		.start  = AXP22X_IRQ_PEK_FAL_EDGE,
+		.end    = AXP22X_IRQ_PEK_FAL_EDGE,
+		.flags  = IORESOURCE_IRQ,
+	},
+};
+
 static struct resource axp288_fuel_gauge_resources[] = {
 	{
 		.start = AXP288_IRQ_QWBTU,
@@ -129,6 +163,15 @@ static const struct regmap_config axp20x_regmap_config = {
 	.cache_type	= REGCACHE_RBTREE,
 };
 
+static const struct regmap_config axp22x_regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.wr_table	= &axp22x_writeable_table,
+	.volatile_table	= &axp22x_volatile_table,
+	.max_register	= AXP22X_BATLOW_THRES1,
+	.cache_type	= REGCACHE_RBTREE,
+};
+
 static const struct regmap_config axp288_regmap_config = {
 	.reg_bits	= 8,
 	.val_bits	= 8,
@@ -181,6 +224,34 @@ static const struct regmap_irq axp20x_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP20X, GPIO0_INPUT,		4, 0),
 };
 
+static const struct regmap_irq axp22x_regmap_irqs[] = {
+	INIT_REGMAP_IRQ(AXP22X, ACIN_OVER_V,		0, 7),
+	INIT_REGMAP_IRQ(AXP22X, ACIN_PLUGIN,		0, 6),
+	INIT_REGMAP_IRQ(AXP22X, ACIN_REMOVAL,	        0, 5),
+	INIT_REGMAP_IRQ(AXP22X, VBUS_OVER_V,		0, 4),
+	INIT_REGMAP_IRQ(AXP22X, VBUS_PLUGIN,		0, 3),
+	INIT_REGMAP_IRQ(AXP22X, VBUS_REMOVAL,	        0, 2),
+	INIT_REGMAP_IRQ(AXP22X, VBUS_V_LOW,		0, 1),
+	INIT_REGMAP_IRQ(AXP22X, BATT_PLUGIN,		1, 7),
+	INIT_REGMAP_IRQ(AXP22X, BATT_REMOVAL,	        1, 6),
+	INIT_REGMAP_IRQ(AXP22X, BATT_ENT_ACT_MODE,	1, 5),
+	INIT_REGMAP_IRQ(AXP22X, BATT_EXIT_ACT_MODE,	1, 4),
+	INIT_REGMAP_IRQ(AXP22X, CHARG,		        1, 3),
+	INIT_REGMAP_IRQ(AXP22X, CHARG_DONE,		1, 2),
+	INIT_REGMAP_IRQ(AXP22X, BATT_TEMP_HIGH,	        1, 1),
+	INIT_REGMAP_IRQ(AXP22X, BATT_TEMP_LOW,	        1, 0),
+	INIT_REGMAP_IRQ(AXP22X, DIE_TEMP_HIGH,	        2, 7),
+	INIT_REGMAP_IRQ(AXP22X, PEK_SHORT,		2, 1),
+	INIT_REGMAP_IRQ(AXP22X, PEK_LONG,		2, 0),
+	INIT_REGMAP_IRQ(AXP22X, LOW_PWR_LVL1,	        3, 1),
+	INIT_REGMAP_IRQ(AXP22X, LOW_PWR_LVL2,	        3, 0),
+	INIT_REGMAP_IRQ(AXP22X, TIMER,		        4, 7),
+	INIT_REGMAP_IRQ(AXP22X, PEK_RIS_EDGE,	        4, 6),
+	INIT_REGMAP_IRQ(AXP22X, PEK_FAL_EDGE,	        4, 5),
+	INIT_REGMAP_IRQ(AXP22X, GPIO1_INPUT,		4, 1),
+	INIT_REGMAP_IRQ(AXP22X, GPIO0_INPUT,		4, 0),
+};
+
 /* some IRQs are compatible with axp20x models */
 static const struct regmap_irq axp288_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP288, VBUS_FALL,              0, 2),
@@ -224,6 +295,7 @@ static const struct regmap_irq axp288_regmap_irqs[] = {
 static const struct of_device_id axp20x_of_match[] = {
 	{ .compatible = "x-powers,axp202", .data = (void *) AXP202_ID },
 	{ .compatible = "x-powers,axp209", .data = (void *) AXP209_ID },
+	{ .compatible = "x-powers,axp221", .data = (void *) AXP221_ID },
 	{ },
 };
 MODULE_DEVICE_TABLE(of, axp20x_of_match);
@@ -258,6 +330,18 @@ static const struct regmap_irq_chip axp20x_regmap_irq_chip = {
 
 };
 
+static const struct regmap_irq_chip axp22x_regmap_irq_chip = {
+	.name			= "axp22x_irq_chip",
+	.status_base		= AXP20X_IRQ1_STATE,
+	.ack_base		= AXP20X_IRQ1_STATE,
+	.mask_base		= AXP20X_IRQ1_EN,
+	.mask_invert		= true,
+	.init_ack_masked	= true,
+	.irqs			= axp22x_regmap_irqs,
+	.num_irqs		= ARRAY_SIZE(axp22x_regmap_irqs),
+	.num_regs		= 5,
+};
+
 static const struct regmap_irq_chip axp288_regmap_irq_chip = {
 	.name			= "axp288_irq_chip",
 	.status_base		= AXP20X_IRQ1_STATE,
@@ -281,6 +365,14 @@ static struct mfd_cell axp20x_cells[] = {
 	},
 };
 
+static struct mfd_cell axp22x_cells[] = {
+	{
+		.name			= "axp20x-pek",
+		.num_resources		= ARRAY_SIZE(axp22x_pek_resources),
+		.resources		= axp22x_pek_resources,
+	},
+};
+
 static struct resource axp288_adc_resources[] = {
 	{
 		.name  = "GPADC",
@@ -426,6 +518,12 @@ static int axp20x_match_device(struct axp20x_dev *axp20x, struct device *dev)
 		axp20x->regmap_cfg = &axp20x_regmap_config;
 		axp20x->regmap_irq_chip = &axp20x_regmap_irq_chip;
 		break;
+	case AXP221_ID:
+		axp20x->nr_cells = ARRAY_SIZE(axp22x_cells);
+		axp20x->cells = axp22x_cells;
+		axp20x->regmap_cfg = &axp22x_regmap_config;
+		axp20x->regmap_irq_chip = &axp22x_regmap_irq_chip;
+		break;
 	case AXP288_ID:
 		axp20x->cells = axp288_cells;
 		axp20x->nr_cells = ARRAY_SIZE(axp288_cells);
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index dfabd6db7ddf..95568eb798c3 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -14,6 +14,7 @@
 enum {
 	AXP202_ID = 0,
 	AXP209_ID,
+	AXP221_ID,
 	AXP288_ID,
 	NR_AXP20X_VARIANTS,
 };
@@ -45,6 +46,28 @@ enum {
 #define AXP20X_V_LTF_DISCHRG		0x3c
 #define AXP20X_V_HTF_DISCHRG		0x3d
 
+#define AXP22X_PWR_OUT_CTRL1		0x10
+#define AXP22X_PWR_OUT_CTRL2		0x12
+#define AXP22X_PWR_OUT_CTRL3		0x13
+#define AXP22X_DLDO1_V_OUT		0x15
+#define AXP22X_DLDO2_V_OUT		0x16
+#define AXP22X_DLDO3_V_OUT		0x17
+#define AXP22X_DLDO4_V_OUT		0x18
+#define AXP22X_ELDO1_V_OUT		0x19
+#define AXP22X_ELDO2_V_OUT		0x1a
+#define AXP22X_ELDO3_V_OUT		0x1b
+#define AXP22X_DC5LDO_V_OUT		0x1c
+#define AXP22X_DCDC1_V_OUT		0x21
+#define AXP22X_DCDC2_V_OUT		0x22
+#define AXP22X_DCDC3_V_OUT		0x23
+#define AXP22X_DCDC4_V_OUT		0x24
+#define AXP22X_DCDC5_V_OUT		0x25
+#define AXP22X_DCDC23_V_RAMP_CTRL	0x27
+#define AXP22X_ALDO1_V_OUT		0x28
+#define AXP22X_ALDO2_V_OUT		0x29
+#define AXP22X_ALDO3_V_OUT		0x2a
+#define AXP22X_CHRG_CTRL3		0x35
+
 /* Interrupt */
 #define AXP20X_IRQ1_EN			0x40
 #define AXP20X_IRQ2_EN			0x41
@@ -100,6 +123,9 @@ enum {
 #define AXP20X_VBUS_MON			0x8b
 #define AXP20X_OVER_TMP			0x8f
 
+#define AXP22X_PWREN_CTRL1		0x8c
+#define AXP22X_PWREN_CTRL2		0x8d
+
 /* GPIO */
 #define AXP20X_GPIO0_CTRL		0x90
 #define AXP20X_LDO5_V_OUT		0x91
@@ -108,6 +134,11 @@ enum {
 #define AXP20X_GPIO20_SS		0x94
 #define AXP20X_GPIO3_CTRL		0x95
 
+#define AXP22X_LDO_IO0_V_OUT		0x91
+#define AXP22X_LDO_IO1_V_OUT		0x93
+#define AXP22X_GPIO_STATE		0x94
+#define AXP22X_GPIO_PULL_DOWN		0x95
+
 /* Battery */
 #define AXP20X_CHRG_CC_31_24		0xb0
 #define AXP20X_CHRG_CC_23_16		0xb1
@@ -120,6 +151,9 @@ enum {
 #define AXP20X_CC_CTRL			0xb8
 #define AXP20X_FG_RES			0xb9
 
+/* AXP22X specific registers */
+#define AXP22X_BATLOW_THRES1		0xe6
+
 /* AXP288 specific registers */
 #define AXP288_PMIC_ADC_H               0x56
 #define AXP288_PMIC_ADC_L               0x57
@@ -158,6 +192,30 @@ enum {
 	AXP20X_REG_ID_MAX,
 };
 
+enum {
+	AXP22X_DCDC1 = 0,
+	AXP22X_DCDC2,
+	AXP22X_DCDC3,
+	AXP22X_DCDC4,
+	AXP22X_DCDC5,
+	AXP22X_DC1SW,
+	AXP22X_DC5LDO,
+	AXP22X_ALDO1,
+	AXP22X_ALDO2,
+	AXP22X_ALDO3,
+	AXP22X_ELDO1,
+	AXP22X_ELDO2,
+	AXP22X_ELDO3,
+	AXP22X_DLDO1,
+	AXP22X_DLDO2,
+	AXP22X_DLDO3,
+	AXP22X_DLDO4,
+	AXP22X_RTC_LDO,
+	AXP22X_LDO_IO0,
+	AXP22X_LDO_IO1,
+	AXP22X_REG_ID_MAX,
+};
+
 /* IRQs */
 enum {
 	AXP20X_IRQ_ACIN_OVER_V = 1,
@@ -199,6 +257,34 @@ enum {
 	AXP20X_IRQ_GPIO0_INPUT,
 };
 
+enum axp22x_irqs {
+	AXP22X_IRQ_ACIN_OVER_V = 1,
+	AXP22X_IRQ_ACIN_PLUGIN,
+	AXP22X_IRQ_ACIN_REMOVAL,
+	AXP22X_IRQ_VBUS_OVER_V,
+	AXP22X_IRQ_VBUS_PLUGIN,
+	AXP22X_IRQ_VBUS_REMOVAL,
+	AXP22X_IRQ_VBUS_V_LOW,
+	AXP22X_IRQ_BATT_PLUGIN,
+	AXP22X_IRQ_BATT_REMOVAL,
+	AXP22X_IRQ_BATT_ENT_ACT_MODE,
+	AXP22X_IRQ_BATT_EXIT_ACT_MODE,
+	AXP22X_IRQ_CHARG,
+	AXP22X_IRQ_CHARG_DONE,
+	AXP22X_IRQ_BATT_TEMP_HIGH,
+	AXP22X_IRQ_BATT_TEMP_LOW,
+	AXP22X_IRQ_DIE_TEMP_HIGH,
+	AXP22X_IRQ_PEK_SHORT,
+	AXP22X_IRQ_PEK_LONG,
+	AXP22X_IRQ_LOW_PWR_LVL1,
+	AXP22X_IRQ_LOW_PWR_LVL2,
+	AXP22X_IRQ_TIMER,
+	AXP22X_IRQ_PEK_RIS_EDGE,
+	AXP22X_IRQ_PEK_FAL_EDGE,
+	AXP22X_IRQ_GPIO1_INPUT,
+	AXP22X_IRQ_GPIO0_INPUT,
+};
+
 enum axp288_irqs {
 	AXP288_IRQ_VBUS_FALL     = 2,
 	AXP288_IRQ_VBUS_RISE,
-- 
cgit v1.2.3


From 275e2bc0f25d5eb99c99ebb7293fc3722533124b Mon Sep 17 00:00:00 2001
From: Sergey Popovich <popovich_sergei@mail.ua>
Date: Sat, 2 May 2015 19:28:17 +0200
Subject: netfilter: ipset: Fix ext_*() macros

So pointers returned by these macros could be
referenced with -> directly.

Signed-off-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 34b172301558..f88be7258e5f 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -122,13 +122,13 @@ struct ip_set_skbinfo {
 struct ip_set;
 
 #define ext_timeout(e, s)	\
-(unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT])
+((unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT]))
 #define ext_counter(e, s)	\
-(struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER])
+((struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER]))
 #define ext_comment(e, s)	\
-(struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT])
+((struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT]))
 #define ext_skbinfo(e, s)	\
-(struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO])
+((struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO]))
 
 typedef int (*ipset_adtfn)(struct ip_set *set, void *value,
 			   const struct ip_set_ext *ext,
-- 
cgit v1.2.3


From 289fcff4bcdb1dcc0ce8788b7ea0f58a9e4a495f Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 13 May 2015 15:26:42 +0300
Subject: usb: add bus type for USB ULPI

UTMI+ Low Pin Interface (ULPI) is a commonly used PHY
interface for USB 2.0. The ULPI specification describes a
standard set of registers which the vendors can extend for
their specific needs. ULPI PHYs provide often functions
such as charger detection and ADP sensing and probing.

There are two major issues that the bus type is meant to
tackle:

Firstly, ULPI registers are accessed from the controller.
The bus provides convenient method for the controller
drivers to share that access with the actual PHY drivers.

Secondly, there are already platforms that assume ULPI PHYs
are runtime detected, such as many Intel Baytrail based
platforms. They do not provide any kind of hardware
description for the ULPI PHYs like separate ACPI device
object that could be used to enumerate a device from.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Acked-by: David Cohen <david.a.cohen@linux.intel.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 MAINTAINERS                       |   7 ++
 drivers/usb/common/Makefile       |   1 +
 drivers/usb/common/ulpi.c         | 255 ++++++++++++++++++++++++++++++++++++++
 drivers/usb/core/Kconfig          |  20 +++
 include/linux/mod_devicetable.h   |   6 +
 include/linux/ulpi/driver.h       |  60 +++++++++
 include/linux/ulpi/interface.h    |  23 ++++
 include/linux/ulpi/regs.h         | 130 +++++++++++++++++++
 include/linux/usb/ulpi.h          | 134 +-------------------
 scripts/mod/devicetable-offsets.c |   4 +
 scripts/mod/file2alias.c          |  13 ++
 11 files changed, 521 insertions(+), 132 deletions(-)
 create mode 100644 drivers/usb/common/ulpi.c
 create mode 100644 include/linux/ulpi/driver.h
 create mode 100644 include/linux/ulpi/interface.h
 create mode 100644 include/linux/ulpi/regs.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 2e5bbc0d68b2..2202e9194d83 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10453,6 +10453,13 @@ S:	Maintained
 F:	Documentation/video4linux/zr364xx.txt
 F:	drivers/media/usb/zr364xx/
 
+ULPI BUS
+M:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
+L:	linux-usb@vger.kernel.org
+S:	Maintained
+F:	drivers/usb/common/ulpi.c
+F:	include/linux/ulpi/
+
 USER-MODE LINUX (UML)
 M:	Jeff Dike <jdike@addtoit.com>
 M:	Richard Weinberger <richard@nod.at>
diff --git a/drivers/usb/common/Makefile b/drivers/usb/common/Makefile
index ca2f8bd0e431..6bbb3ec17018 100644
--- a/drivers/usb/common/Makefile
+++ b/drivers/usb/common/Makefile
@@ -7,3 +7,4 @@ usb-common-y			  += common.o
 usb-common-$(CONFIG_USB_LED_TRIG) += led.o
 
 obj-$(CONFIG_USB_OTG_FSM) += usb-otg-fsm.o
+obj-$(CONFIG_USB_ULPI_BUS)	+= ulpi.o
diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c
new file mode 100644
index 000000000000..0e6f968e93fe
--- /dev/null
+++ b/drivers/usb/common/ulpi.c
@@ -0,0 +1,255 @@
+/**
+ * ulpi.c - USB ULPI PHY bus
+ *
+ * Copyright (C) 2015 Intel Corporation
+ *
+ * Author: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/ulpi/interface.h>
+#include <linux/ulpi/driver.h>
+#include <linux/ulpi/regs.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/acpi.h>
+
+/* -------------------------------------------------------------------------- */
+
+int ulpi_read(struct ulpi *ulpi, u8 addr)
+{
+	return ulpi->ops->read(ulpi->ops, addr);
+}
+EXPORT_SYMBOL_GPL(ulpi_read);
+
+int ulpi_write(struct ulpi *ulpi, u8 addr, u8 val)
+{
+	return ulpi->ops->write(ulpi->ops, addr, val);
+}
+EXPORT_SYMBOL_GPL(ulpi_write);
+
+/* -------------------------------------------------------------------------- */
+
+static int ulpi_match(struct device *dev, struct device_driver *driver)
+{
+	struct ulpi_driver *drv = to_ulpi_driver(driver);
+	struct ulpi *ulpi = to_ulpi_dev(dev);
+	const struct ulpi_device_id *id;
+
+	for (id = drv->id_table; id->vendor; id++)
+		if (id->vendor == ulpi->id.vendor &&
+		    id->product == ulpi->id.product)
+			return 1;
+
+	return 0;
+}
+
+static int ulpi_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct ulpi *ulpi = to_ulpi_dev(dev);
+
+	if (add_uevent_var(env, "MODALIAS=ulpi:v%04xp%04x",
+			   ulpi->id.vendor, ulpi->id.product))
+		return -ENOMEM;
+	return 0;
+}
+
+static int ulpi_probe(struct device *dev)
+{
+	struct ulpi_driver *drv = to_ulpi_driver(dev->driver);
+
+	return drv->probe(to_ulpi_dev(dev));
+}
+
+static int ulpi_remove(struct device *dev)
+{
+	struct ulpi_driver *drv = to_ulpi_driver(dev->driver);
+
+	if (drv->remove)
+		drv->remove(to_ulpi_dev(dev));
+
+	return 0;
+}
+
+static struct bus_type ulpi_bus = {
+	.name = "ulpi",
+	.match = ulpi_match,
+	.uevent = ulpi_uevent,
+	.probe = ulpi_probe,
+	.remove = ulpi_remove,
+};
+
+/* -------------------------------------------------------------------------- */
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct ulpi *ulpi = to_ulpi_dev(dev);
+
+	return sprintf(buf, "ulpi:v%04xp%04x\n",
+		       ulpi->id.vendor, ulpi->id.product);
+}
+static DEVICE_ATTR_RO(modalias);
+
+static struct attribute *ulpi_dev_attrs[] = {
+	&dev_attr_modalias.attr,
+	NULL
+};
+
+static struct attribute_group ulpi_dev_attr_group = {
+	.attrs = ulpi_dev_attrs,
+};
+
+static const struct attribute_group *ulpi_dev_attr_groups[] = {
+	&ulpi_dev_attr_group,
+	NULL
+};
+
+static void ulpi_dev_release(struct device *dev)
+{
+	kfree(to_ulpi_dev(dev));
+}
+
+static struct device_type ulpi_dev_type = {
+	.name = "ulpi_device",
+	.groups = ulpi_dev_attr_groups,
+	.release = ulpi_dev_release,
+};
+
+/* -------------------------------------------------------------------------- */
+
+/**
+ * ulpi_register_driver - register a driver with the ULPI bus
+ * @drv: driver being registered
+ *
+ * Registers a driver with the ULPI bus.
+ */
+int ulpi_register_driver(struct ulpi_driver *drv)
+{
+	if (!drv->probe)
+		return -EINVAL;
+
+	drv->driver.bus = &ulpi_bus;
+
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(ulpi_register_driver);
+
+/**
+ * ulpi_unregister_driver - unregister a driver with the ULPI bus
+ * @drv: driver to unregister
+ *
+ * Unregisters a driver with the ULPI bus.
+ */
+void ulpi_unregister_driver(struct ulpi_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL_GPL(ulpi_unregister_driver);
+
+/* -------------------------------------------------------------------------- */
+
+static int ulpi_register(struct device *dev, struct ulpi *ulpi)
+{
+	int ret;
+
+	/* Test the interface */
+	ret = ulpi_write(ulpi, ULPI_SCRATCH, 0xaa);
+	if (ret < 0)
+		return ret;
+
+	ret = ulpi_read(ulpi, ULPI_SCRATCH);
+	if (ret < 0)
+		return ret;
+
+	if (ret != 0xaa)
+		return -ENODEV;
+
+	ulpi->id.vendor = ulpi_read(ulpi, ULPI_VENDOR_ID_LOW);
+	ulpi->id.vendor |= ulpi_read(ulpi, ULPI_VENDOR_ID_HIGH) << 8;
+
+	ulpi->id.product = ulpi_read(ulpi, ULPI_PRODUCT_ID_LOW);
+	ulpi->id.product |= ulpi_read(ulpi, ULPI_PRODUCT_ID_HIGH) << 8;
+
+	ulpi->dev.parent = dev;
+	ulpi->dev.bus = &ulpi_bus;
+	ulpi->dev.type = &ulpi_dev_type;
+	dev_set_name(&ulpi->dev, "%s.ulpi", dev_name(dev));
+
+	ACPI_COMPANION_SET(&ulpi->dev, ACPI_COMPANION(dev));
+
+	request_module("ulpi:v%04xp%04x", ulpi->id.vendor, ulpi->id.product);
+
+	ret = device_register(&ulpi->dev);
+	if (ret)
+		return ret;
+
+	dev_dbg(&ulpi->dev, "registered ULPI PHY: vendor %04x, product %04x\n",
+		ulpi->id.vendor, ulpi->id.product);
+
+	return 0;
+}
+
+/**
+ * ulpi_register_interface - instantiate new ULPI device
+ * @dev: USB controller's device interface
+ * @ops: ULPI register access
+ *
+ * Allocates and registers a ULPI device and an interface for it. Called from
+ * the USB controller that provides the ULPI interface.
+ */
+struct ulpi *ulpi_register_interface(struct device *dev, struct ulpi_ops *ops)
+{
+	struct ulpi *ulpi;
+	int ret;
+
+	ulpi = kzalloc(sizeof(*ulpi), GFP_KERNEL);
+	if (!ulpi)
+		return ERR_PTR(-ENOMEM);
+
+	ulpi->ops = ops;
+	ops->dev = dev;
+
+	ret = ulpi_register(dev, ulpi);
+	if (ret) {
+		kfree(ulpi);
+		return ERR_PTR(ret);
+	}
+
+	return ulpi;
+}
+EXPORT_SYMBOL_GPL(ulpi_register_interface);
+
+/**
+ * ulpi_unregister_interface - unregister ULPI interface
+ * @intrf: struct ulpi_interface
+ *
+ * Unregisters a ULPI device and it's interface that was created with
+ * ulpi_create_interface().
+ */
+void ulpi_unregister_interface(struct ulpi *ulpi)
+{
+	device_unregister(&ulpi->dev);
+}
+EXPORT_SYMBOL_GPL(ulpi_unregister_interface);
+
+/* -------------------------------------------------------------------------- */
+
+static int __init ulpi_init(void)
+{
+	return bus_register(&ulpi_bus);
+}
+module_init(ulpi_init);
+
+static void __exit ulpi_exit(void)
+{
+	bus_unregister(&ulpi_bus);
+}
+module_exit(ulpi_exit);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("USB ULPI PHY bus");
diff --git a/drivers/usb/core/Kconfig b/drivers/usb/core/Kconfig
index cc0ced08bae2..a99c89e78126 100644
--- a/drivers/usb/core/Kconfig
+++ b/drivers/usb/core/Kconfig
@@ -84,3 +84,23 @@ config USB_OTG_FSM
 	  Implements OTG Finite State Machine as specified in On-The-Go
 	  and Embedded Host Supplement to the USB Revision 2.0 Specification.
 
+config USB_ULPI_BUS
+	tristate "USB ULPI PHY interface support"
+	depends on USB_SUPPORT
+	help
+	  UTMI+ Low Pin Interface (ULPI) is specification for a commonly used
+	  USB 2.0 PHY interface. The ULPI specification defines a standard set
+	  of registers that can be used to detect the vendor and product which
+	  allows ULPI to be handled as a bus. This module is the driver for that
+	  bus.
+
+	  The ULPI interfaces (the buses) are registered by the drivers for USB
+	  controllers which support ULPI register access and have ULPI PHY
+	  attached to them. The ULPI PHY drivers themselves are normal PHY
+	  drivers.
+
+	  ULPI PHYs provide often functions such as ADP sensing/probing (OTG
+	  protocol) and USB charger detection.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called ulpi.
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 3bfd56778c29..7ab00d61d30a 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -629,4 +629,10 @@ struct mcb_device_id {
 	kernel_ulong_t driver_data;
 };
 
+struct ulpi_device_id {
+	__u16 vendor;
+	__u16 product;
+	kernel_ulong_t driver_data;
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/ulpi/driver.h b/include/linux/ulpi/driver.h
new file mode 100644
index 000000000000..388f6e08b9d4
--- /dev/null
+++ b/include/linux/ulpi/driver.h
@@ -0,0 +1,60 @@
+#ifndef __LINUX_ULPI_DRIVER_H
+#define __LINUX_ULPI_DRIVER_H
+
+#include <linux/mod_devicetable.h>
+
+#include <linux/device.h>
+
+struct ulpi_ops;
+
+/**
+ * struct ulpi - describes ULPI PHY device
+ * @id: vendor and product ids for ULPI device
+ * @ops: I/O access
+ * @dev: device interface
+ */
+struct ulpi {
+	struct ulpi_device_id id;
+	struct ulpi_ops *ops;
+	struct device dev;
+};
+
+#define to_ulpi_dev(d) container_of(d, struct ulpi, dev)
+
+static inline void ulpi_set_drvdata(struct ulpi *ulpi, void *data)
+{
+	dev_set_drvdata(&ulpi->dev, data);
+}
+
+static inline void *ulpi_get_drvdata(struct ulpi *ulpi)
+{
+	return dev_get_drvdata(&ulpi->dev);
+}
+
+/**
+ * struct ulpi_driver - describes a ULPI PHY driver
+ * @id_table: array of device identifiers supported by this driver
+ * @probe: binds this driver to ULPI device
+ * @remove: unbinds this driver from ULPI device
+ * @driver: the name and owner members must be initialized by the drivers
+ */
+struct ulpi_driver {
+	const struct ulpi_device_id *id_table;
+	int (*probe)(struct ulpi *ulpi);
+	void (*remove)(struct ulpi *ulpi);
+	struct device_driver driver;
+};
+
+#define to_ulpi_driver(d) container_of(d, struct ulpi_driver, driver)
+
+int ulpi_register_driver(struct ulpi_driver *drv);
+void ulpi_unregister_driver(struct ulpi_driver *drv);
+
+#define module_ulpi_driver(__ulpi_driver) \
+	module_driver(__ulpi_driver, ulpi_register_driver, \
+		      ulpi_unregister_driver)
+
+int ulpi_read(struct ulpi *ulpi, u8 addr);
+int ulpi_write(struct ulpi *ulpi, u8 addr, u8 val);
+
+#endif /* __LINUX_ULPI_DRIVER_H */
diff --git a/include/linux/ulpi/interface.h b/include/linux/ulpi/interface.h
new file mode 100644
index 000000000000..4de8ab491038
--- /dev/null
+++ b/include/linux/ulpi/interface.h
@@ -0,0 +1,23 @@
+#ifndef __LINUX_ULPI_INTERFACE_H
+#define __LINUX_ULPI_INTERFACE_H
+
+#include <linux/types.h>
+
+struct ulpi;
+
+/**
+ * struct ulpi_ops - ULPI register access
+ * @dev: the interface provider
+ * @read: read operation for ULPI register access
+ * @write: write operation for ULPI register access
+ */
+struct ulpi_ops {
+	struct device *dev;
+	int (*read)(struct ulpi_ops *ops, u8 addr);
+	int (*write)(struct ulpi_ops *ops, u8 addr, u8 val);
+};
+
+struct ulpi *ulpi_register_interface(struct device *, struct ulpi_ops *);
+void ulpi_unregister_interface(struct ulpi *);
+
+#endif /* __LINUX_ULPI_INTERFACE_H */
diff --git a/include/linux/ulpi/regs.h b/include/linux/ulpi/regs.h
new file mode 100644
index 000000000000..b5b8b8804560
--- /dev/null
+++ b/include/linux/ulpi/regs.h
@@ -0,0 +1,130 @@
+#ifndef __LINUX_ULPI_REGS_H
+#define __LINUX_ULPI_REGS_H
+
+/*
+ * Macros for Set and Clear
+ * See ULPI 1.1 specification to find the registers with Set and Clear offsets
+ */
+#define ULPI_SET(a)				(a + 1)
+#define ULPI_CLR(a)				(a + 2)
+
+/*
+ * Register Map
+ */
+#define ULPI_VENDOR_ID_LOW			0x00
+#define ULPI_VENDOR_ID_HIGH			0x01
+#define ULPI_PRODUCT_ID_LOW			0x02
+#define ULPI_PRODUCT_ID_HIGH			0x03
+#define ULPI_FUNC_CTRL				0x04
+#define ULPI_IFC_CTRL				0x07
+#define ULPI_OTG_CTRL				0x0a
+#define ULPI_USB_INT_EN_RISE			0x0d
+#define ULPI_USB_INT_EN_FALL			0x10
+#define ULPI_USB_INT_STS			0x13
+#define ULPI_USB_INT_LATCH			0x14
+#define ULPI_DEBUG				0x15
+#define ULPI_SCRATCH				0x16
+/* Optional Carkit Registers */
+#define ULPI_CARKIT_CTRL			0x19
+#define ULPI_CARKIT_INT_DELAY			0x1c
+#define ULPI_CARKIT_INT_EN			0x1d
+#define ULPI_CARKIT_INT_STS			0x20
+#define ULPI_CARKIT_INT_LATCH			0x21
+#define ULPI_CARKIT_PLS_CTRL			0x22
+/* Other Optional Registers */
+#define ULPI_TX_POS_WIDTH			0x25
+#define ULPI_TX_NEG_WIDTH			0x26
+#define ULPI_POLARITY_RECOVERY			0x27
+/* Access Extended Register Set */
+#define ULPI_ACCESS_EXTENDED			0x2f
+/* Vendor Specific */
+#define ULPI_VENDOR_SPECIFIC			0x30
+/* Extended Registers */
+#define ULPI_EXT_VENDOR_SPECIFIC		0x80
+
+/*
+ * Register Bits
+ */
+
+/* Function Control */
+#define ULPI_FUNC_CTRL_XCVRSEL			BIT(0)
+#define  ULPI_FUNC_CTRL_XCVRSEL_MASK		0x3
+#define  ULPI_FUNC_CTRL_HIGH_SPEED		0x0
+#define  ULPI_FUNC_CTRL_FULL_SPEED		0x1
+#define  ULPI_FUNC_CTRL_LOW_SPEED		0x2
+#define  ULPI_FUNC_CTRL_FS4LS			0x3
+#define ULPI_FUNC_CTRL_TERMSELECT		BIT(2)
+#define ULPI_FUNC_CTRL_OPMODE			BIT(3)
+#define  ULPI_FUNC_CTRL_OPMODE_MASK		(0x3 << 3)
+#define  ULPI_FUNC_CTRL_OPMODE_NORMAL		(0x0 << 3)
+#define  ULPI_FUNC_CTRL_OPMODE_NONDRIVING	(0x1 << 3)
+#define  ULPI_FUNC_CTRL_OPMODE_DISABLE_NRZI	(0x2 << 3)
+#define  ULPI_FUNC_CTRL_OPMODE_NOSYNC_NOEOP	(0x3 << 3)
+#define ULPI_FUNC_CTRL_RESET			BIT(5)
+#define ULPI_FUNC_CTRL_SUSPENDM			BIT(6)
+
+/* Interface Control */
+#define ULPI_IFC_CTRL_6_PIN_SERIAL_MODE		BIT(0)
+#define ULPI_IFC_CTRL_3_PIN_SERIAL_MODE		BIT(1)
+#define ULPI_IFC_CTRL_CARKITMODE		BIT(2)
+#define ULPI_IFC_CTRL_CLOCKSUSPENDM		BIT(3)
+#define ULPI_IFC_CTRL_AUTORESUME		BIT(4)
+#define ULPI_IFC_CTRL_EXTERNAL_VBUS		BIT(5)
+#define ULPI_IFC_CTRL_PASSTHRU			BIT(6)
+#define ULPI_IFC_CTRL_PROTECT_IFC_DISABLE	BIT(7)
+
+/* OTG Control */
+#define ULPI_OTG_CTRL_ID_PULLUP			BIT(0)
+#define ULPI_OTG_CTRL_DP_PULLDOWN		BIT(1)
+#define ULPI_OTG_CTRL_DM_PULLDOWN		BIT(2)
+#define ULPI_OTG_CTRL_DISCHRGVBUS		BIT(3)
+#define ULPI_OTG_CTRL_CHRGVBUS			BIT(4)
+#define ULPI_OTG_CTRL_DRVVBUS			BIT(5)
+#define ULPI_OTG_CTRL_DRVVBUS_EXT		BIT(6)
+#define ULPI_OTG_CTRL_EXTVBUSIND		BIT(7)
+
+/* USB Interrupt Enable Rising,
+ * USB Interrupt Enable Falling,
+ * USB Interrupt Status and
+ * USB Interrupt Latch
+ */
+#define ULPI_INT_HOST_DISCONNECT		BIT(0)
+#define ULPI_INT_VBUS_VALID			BIT(1)
+#define ULPI_INT_SESS_VALID			BIT(2)
+#define ULPI_INT_SESS_END			BIT(3)
+#define ULPI_INT_IDGRD				BIT(4)
+
+/* Debug */
+#define ULPI_DEBUG_LINESTATE0			BIT(0)
+#define ULPI_DEBUG_LINESTATE1			BIT(1)
+
+/* Carkit Control */
+#define ULPI_CARKIT_CTRL_CARKITPWR		BIT(0)
+#define ULPI_CARKIT_CTRL_IDGNDDRV		BIT(1)
+#define ULPI_CARKIT_CTRL_TXDEN			BIT(2)
+#define ULPI_CARKIT_CTRL_RXDEN			BIT(3)
+#define ULPI_CARKIT_CTRL_SPKLEFTEN		BIT(4)
+#define ULPI_CARKIT_CTRL_SPKRIGHTEN		BIT(5)
+#define ULPI_CARKIT_CTRL_MICEN			BIT(6)
+
+/* Carkit Interrupt Enable */
+#define ULPI_CARKIT_INT_EN_IDFLOAT_RISE		BIT(0)
+#define ULPI_CARKIT_INT_EN_IDFLOAT_FALL		BIT(1)
+#define ULPI_CARKIT_INT_EN_CARINTDET		BIT(2)
+#define ULPI_CARKIT_INT_EN_DP_RISE		BIT(3)
+#define ULPI_CARKIT_INT_EN_DP_FALL		BIT(4)
+
+/* Carkit Interrupt Status and
+ * Carkit Interrupt Latch
+ */
+#define ULPI_CARKIT_INT_IDFLOAT			BIT(0)
+#define ULPI_CARKIT_INT_CARINTDET		BIT(1)
+#define ULPI_CARKIT_INT_DP			BIT(2)
+
+/* Carkit Pulse Control*/
+#define ULPI_CARKIT_PLS_CTRL_TXPLSEN		BIT(0)
+#define ULPI_CARKIT_PLS_CTRL_RXPLSEN		BIT(1)
+#define ULPI_CARKIT_PLS_CTRL_SPKRLEFT_BIASEN	BIT(2)
+#define ULPI_CARKIT_PLS_CTRL_SPKRRIGHT_BIASEN	BIT(3)
+
+#endif /* __LINUX_ULPI_REGS_H */
diff --git a/include/linux/usb/ulpi.h b/include/linux/usb/ulpi.h
index 5c295c26ad37..5f07407a367a 100644
--- a/include/linux/usb/ulpi.h
+++ b/include/linux/usb/ulpi.h
@@ -12,6 +12,8 @@
 #define __LINUX_USB_ULPI_H
 
 #include <linux/usb/otg.h>
+#include <linux/ulpi/regs.h>
+
 /*-------------------------------------------------------------------------*/
 
 /*
@@ -49,138 +51,6 @@
 
 /*-------------------------------------------------------------------------*/
 
-/*
- * Macros for Set and Clear
- * See ULPI 1.1 specification to find the registers with Set and Clear offsets
- */
-#define ULPI_SET(a)				(a + 1)
-#define ULPI_CLR(a)				(a + 2)
-
-/*-------------------------------------------------------------------------*/
-
-/*
- * Register Map
- */
-#define ULPI_VENDOR_ID_LOW			0x00
-#define ULPI_VENDOR_ID_HIGH			0x01
-#define ULPI_PRODUCT_ID_LOW			0x02
-#define ULPI_PRODUCT_ID_HIGH			0x03
-#define ULPI_FUNC_CTRL				0x04
-#define ULPI_IFC_CTRL				0x07
-#define ULPI_OTG_CTRL				0x0a
-#define ULPI_USB_INT_EN_RISE			0x0d
-#define ULPI_USB_INT_EN_FALL			0x10
-#define ULPI_USB_INT_STS			0x13
-#define ULPI_USB_INT_LATCH			0x14
-#define ULPI_DEBUG				0x15
-#define ULPI_SCRATCH				0x16
-/* Optional Carkit Registers */
-#define ULPI_CARCIT_CTRL			0x19
-#define ULPI_CARCIT_INT_DELAY			0x1c
-#define ULPI_CARCIT_INT_EN			0x1d
-#define ULPI_CARCIT_INT_STS			0x20
-#define ULPI_CARCIT_INT_LATCH			0x21
-#define ULPI_CARCIT_PLS_CTRL			0x22
-/* Other Optional Registers */
-#define ULPI_TX_POS_WIDTH			0x25
-#define ULPI_TX_NEG_WIDTH			0x26
-#define ULPI_POLARITY_RECOVERY			0x27
-/* Access Extended Register Set */
-#define ULPI_ACCESS_EXTENDED			0x2f
-/* Vendor Specific */
-#define ULPI_VENDOR_SPECIFIC			0x30
-/* Extended Registers */
-#define ULPI_EXT_VENDOR_SPECIFIC		0x80
-
-/*-------------------------------------------------------------------------*/
-
-/*
- * Register Bits
- */
-
-/* Function Control */
-#define ULPI_FUNC_CTRL_XCVRSEL			(1 << 0)
-#define  ULPI_FUNC_CTRL_XCVRSEL_MASK		(3 << 0)
-#define  ULPI_FUNC_CTRL_HIGH_SPEED		(0 << 0)
-#define  ULPI_FUNC_CTRL_FULL_SPEED		(1 << 0)
-#define  ULPI_FUNC_CTRL_LOW_SPEED		(2 << 0)
-#define  ULPI_FUNC_CTRL_FS4LS			(3 << 0)
-#define ULPI_FUNC_CTRL_TERMSELECT		(1 << 2)
-#define ULPI_FUNC_CTRL_OPMODE			(1 << 3)
-#define  ULPI_FUNC_CTRL_OPMODE_MASK		(3 << 3)
-#define  ULPI_FUNC_CTRL_OPMODE_NORMAL		(0 << 3)
-#define  ULPI_FUNC_CTRL_OPMODE_NONDRIVING	(1 << 3)
-#define  ULPI_FUNC_CTRL_OPMODE_DISABLE_NRZI	(2 << 3)
-#define  ULPI_FUNC_CTRL_OPMODE_NOSYNC_NOEOP	(3 << 3)
-#define ULPI_FUNC_CTRL_RESET			(1 << 5)
-#define ULPI_FUNC_CTRL_SUSPENDM			(1 << 6)
-
-/* Interface Control */
-#define ULPI_IFC_CTRL_6_PIN_SERIAL_MODE		(1 << 0)
-#define ULPI_IFC_CTRL_3_PIN_SERIAL_MODE		(1 << 1)
-#define ULPI_IFC_CTRL_CARKITMODE		(1 << 2)
-#define ULPI_IFC_CTRL_CLOCKSUSPENDM		(1 << 3)
-#define ULPI_IFC_CTRL_AUTORESUME		(1 << 4)
-#define ULPI_IFC_CTRL_EXTERNAL_VBUS		(1 << 5)
-#define ULPI_IFC_CTRL_PASSTHRU			(1 << 6)
-#define ULPI_IFC_CTRL_PROTECT_IFC_DISABLE	(1 << 7)
-
-/* OTG Control */
-#define ULPI_OTG_CTRL_ID_PULLUP			(1 << 0)
-#define ULPI_OTG_CTRL_DP_PULLDOWN		(1 << 1)
-#define ULPI_OTG_CTRL_DM_PULLDOWN		(1 << 2)
-#define ULPI_OTG_CTRL_DISCHRGVBUS		(1 << 3)
-#define ULPI_OTG_CTRL_CHRGVBUS			(1 << 4)
-#define ULPI_OTG_CTRL_DRVVBUS			(1 << 5)
-#define ULPI_OTG_CTRL_DRVVBUS_EXT		(1 << 6)
-#define ULPI_OTG_CTRL_EXTVBUSIND		(1 << 7)
-
-/* USB Interrupt Enable Rising,
- * USB Interrupt Enable Falling,
- * USB Interrupt Status and
- * USB Interrupt Latch
- */
-#define ULPI_INT_HOST_DISCONNECT		(1 << 0)
-#define ULPI_INT_VBUS_VALID			(1 << 1)
-#define ULPI_INT_SESS_VALID			(1 << 2)
-#define ULPI_INT_SESS_END			(1 << 3)
-#define ULPI_INT_IDGRD				(1 << 4)
-
-/* Debug */
-#define ULPI_DEBUG_LINESTATE0			(1 << 0)
-#define ULPI_DEBUG_LINESTATE1			(1 << 1)
-
-/* Carkit Control */
-#define ULPI_CARKIT_CTRL_CARKITPWR		(1 << 0)
-#define ULPI_CARKIT_CTRL_IDGNDDRV		(1 << 1)
-#define ULPI_CARKIT_CTRL_TXDEN			(1 << 2)
-#define ULPI_CARKIT_CTRL_RXDEN			(1 << 3)
-#define ULPI_CARKIT_CTRL_SPKLEFTEN		(1 << 4)
-#define ULPI_CARKIT_CTRL_SPKRIGHTEN		(1 << 5)
-#define ULPI_CARKIT_CTRL_MICEN			(1 << 6)
-
-/* Carkit Interrupt Enable */
-#define ULPI_CARKIT_INT_EN_IDFLOAT_RISE		(1 << 0)
-#define ULPI_CARKIT_INT_EN_IDFLOAT_FALL		(1 << 1)
-#define ULPI_CARKIT_INT_EN_CARINTDET		(1 << 2)
-#define ULPI_CARKIT_INT_EN_DP_RISE		(1 << 3)
-#define ULPI_CARKIT_INT_EN_DP_FALL		(1 << 4)
-
-/* Carkit Interrupt Status and
- * Carkit Interrupt Latch
- */
-#define ULPI_CARKIT_INT_IDFLOAT			(1 << 0)
-#define ULPI_CARKIT_INT_CARINTDET		(1 << 1)
-#define ULPI_CARKIT_INT_DP			(1 << 2)
-
-/* Carkit Pulse Control*/
-#define ULPI_CARKIT_PLS_CTRL_TXPLSEN		(1 << 0)
-#define ULPI_CARKIT_PLS_CTRL_RXPLSEN		(1 << 1)
-#define ULPI_CARKIT_PLS_CTRL_SPKRLEFT_BIASEN	(1 << 2)
-#define ULPI_CARKIT_PLS_CTRL_SPKRRIGHT_BIASEN	(1 << 3)
-
-/*-------------------------------------------------------------------------*/
-
 #if IS_ENABLED(CONFIG_USB_ULPI)
 struct usb_phy *otg_ulpi_create(struct usb_phy_io_ops *ops,
 					unsigned int flags);
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index fce36d0f6898..ada8417362c7 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -189,5 +189,9 @@ int main(void)
 	DEVID_FIELD(rio_device_id, asm_did);
 	DEVID_FIELD(rio_device_id, asm_vid);
 
+	DEVID(ulpi_device_id);
+	DEVID_FIELD(ulpi_device_id, vendor);
+	DEVID_FIELD(ulpi_device_id, product);
+
 	return 0;
 }
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 78691d51a479..a7a8560db44d 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1192,6 +1192,19 @@ static int do_rio_entry(const char *filename,
 }
 ADD_TO_DEVTABLE("rapidio", rio_device_id, do_rio_entry);
 
+/* Looks like: ulpi:vNpN */
+static int do_ulpi_entry(const char *filename, void *symval,
+			 char *alias)
+{
+	DEF_FIELD(symval, ulpi_device_id, vendor);
+	DEF_FIELD(symval, ulpi_device_id, product);
+
+	sprintf(alias, "ulpi:v%04xp%04x", vendor, product);
+
+	return 1;
+}
+ADD_TO_DEVTABLE("ulpi", ulpi_device_id, do_ulpi_entry);
+
 /* Does namelen bytes of name exactly match the symbol? */
 static bool sym_is(const char *name, unsigned namelen, const char *symbol)
 {
-- 
cgit v1.2.3


From f267caab44451baa70d25fa3191a68bb79ad1b08 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 May 2015 14:03:51 -0400
Subject: tracing: Remove unused prototype ftrace_event_define_field()

ftrace_event_define_field() has a prototype defined but never used.
Remove it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 65ce6de91307..f8465d65f3c7 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -219,10 +219,6 @@ struct ftrace_event_class {
 extern int ftrace_event_reg(struct ftrace_event_call *event,
 			    enum trace_reg type, void *data);
 
-int ftrace_event_define_field(struct ftrace_event_call *call,
-			      char *type, int len, char *item, int offset,
-			      int field_size, int sign, int filter);
-
 struct ftrace_event_buffer {
 	struct ring_buffer		*buffer;
 	struct ring_buffer_event	*event;
@@ -238,10 +234,6 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
 
 void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer);
 
-int ftrace_event_define_field(struct ftrace_event_call *call,
-			      char *type, int len, char *item, int offset,
-			      int field_size, int sign, int filter);
-
 enum {
 	TRACE_EVENT_FL_FILTERED_BIT,
 	TRACE_EVENT_FL_CAP_ANY_BIT,
-- 
cgit v1.2.3


From af658dca221207174fc0a7bcdcd4cff7c589fdd8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 29 Apr 2015 14:36:05 -0400
Subject: tracing: Rename ftrace_event.h to trace_events.h

The term "ftrace" is really the infrastructure of the function hooks,
and not the trace events. Rename ftrace_event.h to trace_events.h to
represent the trace_event infrastructure and decouple the term ftrace
from it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kvm/mmutrace.h      |   2 +-
 arch/x86/kvm/svm.c           |   2 +-
 arch/x86/kvm/vmx.c           |   2 +-
 include/linux/ftrace_event.h | 616 -------------------------------------------
 include/linux/trace_events.h | 616 +++++++++++++++++++++++++++++++++++++++++++
 include/trace/events/power.h |   2 +-
 include/trace/syscall.h      |   2 +-
 include/trace/trace_events.h |   2 +-
 kernel/events/core.c         |   2 +-
 kernel/module.c              |   2 +-
 kernel/rcu/tiny.c            |   2 +-
 kernel/rcu/tree.c            |   2 +-
 kernel/trace/ring_buffer.c   |   2 +-
 kernel/trace/trace.h         |   4 +-
 mm/debug.c                   |   2 +-
 15 files changed, 630 insertions(+), 630 deletions(-)
 delete mode 100644 include/linux/ftrace_event.h
 create mode 100644 include/linux/trace_events.h

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index ce463a9cc8fb..5a24b846a1cb 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -2,7 +2,7 @@
 #define _TRACE_KVMMMU_H
 
 #include <linux/tracepoint.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce741b8650f6..19788d5b1109 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,7 +28,7 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/sched.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/slab.h>
 
 #include <asm/perf_event.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b61687bd79..9b5adf4860c2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -28,7 +28,7 @@
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
 #include <linux/mod_devicetable.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
 #include <linux/hrtimer.h>
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
deleted file mode 100644
index f8465d65f3c7..000000000000
--- a/include/linux/ftrace_event.h
+++ /dev/null
@@ -1,616 +0,0 @@
-
-#ifndef _LINUX_FTRACE_EVENT_H
-#define _LINUX_FTRACE_EVENT_H
-
-#include <linux/ring_buffer.h>
-#include <linux/trace_seq.h>
-#include <linux/percpu.h>
-#include <linux/hardirq.h>
-#include <linux/perf_event.h>
-#include <linux/tracepoint.h>
-
-struct trace_array;
-struct trace_buffer;
-struct tracer;
-struct dentry;
-struct bpf_prog;
-
-struct trace_print_flags {
-	unsigned long		mask;
-	const char		*name;
-};
-
-struct trace_print_flags_u64 {
-	unsigned long long	mask;
-	const char		*name;
-};
-
-const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
-				   unsigned long flags,
-				   const struct trace_print_flags *flag_array);
-
-const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
-				     const struct trace_print_flags *symbol_array);
-
-#if BITS_PER_LONG == 32
-const char *ftrace_print_symbols_seq_u64(struct trace_seq *p,
-					 unsigned long long val,
-					 const struct trace_print_flags_u64
-								 *symbol_array);
-#endif
-
-const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
-				     unsigned int bitmask_size);
-
-const char *ftrace_print_hex_seq(struct trace_seq *p,
-				 const unsigned char *buf, int len);
-
-const char *ftrace_print_array_seq(struct trace_seq *p,
-				   const void *buf, int count,
-				   size_t el_size);
-
-struct trace_iterator;
-struct trace_event;
-
-int ftrace_raw_output_prep(struct trace_iterator *iter,
-			   struct trace_event *event);
-
-/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
-	unsigned short		type;
-	unsigned char		flags;
-	unsigned char		preempt_count;
-	int			pid;
-};
-
-#define FTRACE_MAX_EVENT						\
-	((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)
-
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
-	struct trace_array	*tr;
-	struct tracer		*trace;
-	struct trace_buffer	*trace_buffer;
-	void			*private;
-	int			cpu_file;
-	struct mutex		mutex;
-	struct ring_buffer_iter	**buffer_iter;
-	unsigned long		iter_flags;
-
-	/* trace_seq for __print_flags() and __print_symbolic() etc. */
-	struct trace_seq	tmp_seq;
-
-	cpumask_var_t		started;
-
-	/* it's true when current open file is snapshot */
-	bool			snapshot;
-
-	/* The below is zeroed out in pipe_read */
-	struct trace_seq	seq;
-	struct trace_entry	*ent;
-	unsigned long		lost_events;
-	int			leftover;
-	int			ent_size;
-	int			cpu;
-	u64			ts;
-
-	loff_t			pos;
-	long			idx;
-
-	/* All new field here will be zeroed out in pipe_read */
-};
-
-enum trace_iter_flags {
-	TRACE_FILE_LAT_FMT	= 1,
-	TRACE_FILE_ANNOTATE	= 2,
-	TRACE_FILE_TIME_IN_NS	= 4,
-};
-
-
-typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
-				      int flags, struct trace_event *event);
-
-struct trace_event_functions {
-	trace_print_func	trace;
-	trace_print_func	raw;
-	trace_print_func	hex;
-	trace_print_func	binary;
-};
-
-struct trace_event {
-	struct hlist_node		node;
-	struct list_head		list;
-	int				type;
-	struct trace_event_functions	*funcs;
-};
-
-extern int register_ftrace_event(struct trace_event *event);
-extern int unregister_ftrace_event(struct trace_event *event);
-
-/* Return values for print_line callback */
-enum print_line_t {
-	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
-	TRACE_TYPE_HANDLED	= 1,
-	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
-	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
-};
-
-/*
- * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
- * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function
- * simplifies those functions and keeps them in sync.
- */
-static inline enum print_line_t trace_handle_return(struct trace_seq *s)
-{
-	return trace_seq_has_overflowed(s) ?
-		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
-}
-
-void tracing_generic_entry_update(struct trace_entry *entry,
-				  unsigned long flags,
-				  int pc);
-struct ftrace_event_file;
-
-struct ring_buffer_event *
-trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer,
-				struct ftrace_event_file *ftrace_file,
-				int type, unsigned long len,
-				unsigned long flags, int pc);
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
-				  int type, unsigned long len,
-				  unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
-					struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-void trace_buffer_unlock_commit(struct ring_buffer *buffer,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc);
-void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
-				     struct ring_buffer_event *event,
-				     unsigned long flags, int pc,
-				     struct pt_regs *regs);
-void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
-					 struct ring_buffer_event *event);
-
-void tracing_record_cmdline(struct task_struct *tsk);
-
-int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...);
-
-struct event_filter;
-
-enum trace_reg {
-	TRACE_REG_REGISTER,
-	TRACE_REG_UNREGISTER,
-#ifdef CONFIG_PERF_EVENTS
-	TRACE_REG_PERF_REGISTER,
-	TRACE_REG_PERF_UNREGISTER,
-	TRACE_REG_PERF_OPEN,
-	TRACE_REG_PERF_CLOSE,
-	TRACE_REG_PERF_ADD,
-	TRACE_REG_PERF_DEL,
-#endif
-};
-
-struct ftrace_event_call;
-
-struct ftrace_event_class {
-	const char		*system;
-	void			*probe;
-#ifdef CONFIG_PERF_EVENTS
-	void			*perf_probe;
-#endif
-	int			(*reg)(struct ftrace_event_call *event,
-				       enum trace_reg type, void *data);
-	int			(*define_fields)(struct ftrace_event_call *);
-	struct list_head	*(*get_fields)(struct ftrace_event_call *);
-	struct list_head	fields;
-	int			(*raw_init)(struct ftrace_event_call *);
-};
-
-extern int ftrace_event_reg(struct ftrace_event_call *event,
-			    enum trace_reg type, void *data);
-
-struct ftrace_event_buffer {
-	struct ring_buffer		*buffer;
-	struct ring_buffer_event	*event;
-	struct ftrace_event_file	*ftrace_file;
-	void				*entry;
-	unsigned long			flags;
-	int				pc;
-};
-
-void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
-				  struct ftrace_event_file *ftrace_file,
-				  unsigned long len);
-
-void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer);
-
-enum {
-	TRACE_EVENT_FL_FILTERED_BIT,
-	TRACE_EVENT_FL_CAP_ANY_BIT,
-	TRACE_EVENT_FL_NO_SET_FILTER_BIT,
-	TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
-	TRACE_EVENT_FL_WAS_ENABLED_BIT,
-	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
-	TRACE_EVENT_FL_TRACEPOINT_BIT,
-	TRACE_EVENT_FL_KPROBE_BIT,
-};
-
-/*
- * Event flags:
- *  FILTERED	  - The event has a filter attached
- *  CAP_ANY	  - Any user can enable for perf
- *  NO_SET_FILTER - Set when filter has error and is to be ignored
- *  IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file
- *  WAS_ENABLED   - Set and stays set when an event was ever enabled
- *                    (used for module unloading, if a module event is enabled,
- *                     it is best to clear the buffers that used it).
- *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
- *  TRACEPOINT    - Event is a tracepoint
- *  KPROBE        - Event is a kprobe
- */
-enum {
-	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
-	TRACE_EVENT_FL_CAP_ANY		= (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
-	TRACE_EVENT_FL_NO_SET_FILTER	= (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
-	TRACE_EVENT_FL_IGNORE_ENABLE	= (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
-	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
-	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
-	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
-	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
-};
-
-struct ftrace_event_call {
-	struct list_head	list;
-	struct ftrace_event_class *class;
-	union {
-		char			*name;
-		/* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */
-		struct tracepoint	*tp;
-	};
-	struct trace_event	event;
-	char			*print_fmt;
-	struct event_filter	*filter;
-	void			*mod;
-	void			*data;
-	/*
-	 *   bit 0:		filter_active
-	 *   bit 1:		allow trace by non root (cap any)
-	 *   bit 2:		failed to apply filter
-	 *   bit 3:		ftrace internal event (do not enable)
-	 *   bit 4:		Event was enabled by module
-	 *   bit 5:		use call filter rather than file filter
-	 *   bit 6:		Event is a tracepoint
-	 */
-	int			flags; /* static flags of different events */
-
-#ifdef CONFIG_PERF_EVENTS
-	int				perf_refcount;
-	struct hlist_head __percpu	*perf_events;
-	struct bpf_prog			*prog;
-
-	int	(*perf_perm)(struct ftrace_event_call *,
-			     struct perf_event *);
-#endif
-};
-
-static inline const char *
-ftrace_event_name(struct ftrace_event_call *call)
-{
-	if (call->flags & TRACE_EVENT_FL_TRACEPOINT)
-		return call->tp ? call->tp->name : NULL;
-	else
-		return call->name;
-}
-
-struct trace_array;
-struct ftrace_subsystem_dir;
-
-enum {
-	FTRACE_EVENT_FL_ENABLED_BIT,
-	FTRACE_EVENT_FL_RECORDED_CMD_BIT,
-	FTRACE_EVENT_FL_FILTERED_BIT,
-	FTRACE_EVENT_FL_NO_SET_FILTER_BIT,
-	FTRACE_EVENT_FL_SOFT_MODE_BIT,
-	FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
-	FTRACE_EVENT_FL_TRIGGER_MODE_BIT,
-	FTRACE_EVENT_FL_TRIGGER_COND_BIT,
-};
-
-/*
- * Ftrace event file flags:
- *  ENABLED	  - The event is enabled
- *  RECORDED_CMD  - The comms should be recorded at sched_switch
- *  FILTERED	  - The event has a filter attached
- *  NO_SET_FILTER - Set when filter has error and is to be ignored
- *  SOFT_MODE     - The event is enabled/disabled by SOFT_DISABLED
- *  SOFT_DISABLED - When set, do not trace the event (even though its
- *                   tracepoint may be enabled)
- *  TRIGGER_MODE  - When set, invoke the triggers associated with the event
- *  TRIGGER_COND  - When set, one or more triggers has an associated filter
- */
-enum {
-	FTRACE_EVENT_FL_ENABLED		= (1 << FTRACE_EVENT_FL_ENABLED_BIT),
-	FTRACE_EVENT_FL_RECORDED_CMD	= (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT),
-	FTRACE_EVENT_FL_FILTERED	= (1 << FTRACE_EVENT_FL_FILTERED_BIT),
-	FTRACE_EVENT_FL_NO_SET_FILTER	= (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT),
-	FTRACE_EVENT_FL_SOFT_MODE	= (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT),
-	FTRACE_EVENT_FL_SOFT_DISABLED	= (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT),
-	FTRACE_EVENT_FL_TRIGGER_MODE	= (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT),
-	FTRACE_EVENT_FL_TRIGGER_COND	= (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT),
-};
-
-struct ftrace_event_file {
-	struct list_head		list;
-	struct ftrace_event_call	*event_call;
-	struct event_filter		*filter;
-	struct dentry			*dir;
-	struct trace_array		*tr;
-	struct ftrace_subsystem_dir	*system;
-	struct list_head		triggers;
-
-	/*
-	 * 32 bit flags:
-	 *   bit 0:		enabled
-	 *   bit 1:		enabled cmd record
-	 *   bit 2:		enable/disable with the soft disable bit
-	 *   bit 3:		soft disabled
-	 *   bit 4:		trigger enabled
-	 *
-	 * Note: The bits must be set atomically to prevent races
-	 * from other writers. Reads of flags do not need to be in
-	 * sync as they occur in critical sections. But the way flags
-	 * is currently used, these changes do not affect the code
-	 * except that when a change is made, it may have a slight
-	 * delay in propagating the changes to other CPUs due to
-	 * caching and such. Which is mostly OK ;-)
-	 */
-	unsigned long		flags;
-	atomic_t		sm_ref;	/* soft-mode reference counter */
-	atomic_t		tm_ref;	/* trigger-mode reference counter */
-};
-
-#define __TRACE_EVENT_FLAGS(name, value)				\
-	static int __init trace_init_flags_##name(void)			\
-	{								\
-		event_##name.flags |= value;				\
-		return 0;						\
-	}								\
-	early_initcall(trace_init_flags_##name);
-
-#define __TRACE_EVENT_PERF_PERM(name, expr...)				\
-	static int perf_perm_##name(struct ftrace_event_call *tp_event, \
-				    struct perf_event *p_event)		\
-	{								\
-		return ({ expr; });					\
-	}								\
-	static int __init trace_init_perf_perm_##name(void)		\
-	{								\
-		event_##name.perf_perm = &perf_perm_##name;		\
-		return 0;						\
-	}								\
-	early_initcall(trace_init_perf_perm_##name);
-
-#define PERF_MAX_TRACE_SIZE	2048
-
-#define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
-
-enum event_trigger_type {
-	ETT_NONE		= (0),
-	ETT_TRACE_ONOFF		= (1 << 0),
-	ETT_SNAPSHOT		= (1 << 1),
-	ETT_STACKTRACE		= (1 << 2),
-	ETT_EVENT_ENABLE	= (1 << 3),
-};
-
-extern int filter_match_preds(struct event_filter *filter, void *rec);
-
-extern int filter_check_discard(struct ftrace_event_file *file, void *rec,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event);
-extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
-				     struct ring_buffer *buffer,
-				     struct ring_buffer_event *event);
-extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file,
-						   void *rec);
-extern void event_triggers_post_call(struct ftrace_event_file *file,
-				     enum event_trigger_type tt);
-
-/**
- * ftrace_trigger_soft_disabled - do triggers and test if soft disabled
- * @file: The file pointer of the event to test
- *
- * If any triggers without filters are attached to this event, they
- * will be called here. If the event is soft disabled and has no
- * triggers that require testing the fields, it will return true,
- * otherwise false.
- */
-static inline bool
-ftrace_trigger_soft_disabled(struct ftrace_event_file *file)
-{
-	unsigned long eflags = file->flags;
-
-	if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
-		if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
-			event_triggers_call(file, NULL);
-		if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
-			return true;
-	}
-	return false;
-}
-
-/*
- * Helper function for event_trigger_unlock_commit{_regs}().
- * If there are event triggers attached to this event that requires
- * filtering against its fields, then they wil be called as the
- * entry already holds the field information of the current event.
- *
- * It also checks if the event should be discarded or not.
- * It is to be discarded if the event is soft disabled and the
- * event was only recorded to process triggers, or if the event
- * filter is active and this event did not match the filters.
- *
- * Returns true if the event is discarded, false otherwise.
- */
-static inline bool
-__event_trigger_test_discard(struct ftrace_event_file *file,
-			     struct ring_buffer *buffer,
-			     struct ring_buffer_event *event,
-			     void *entry,
-			     enum event_trigger_type *tt)
-{
-	unsigned long eflags = file->flags;
-
-	if (eflags & FTRACE_EVENT_FL_TRIGGER_COND)
-		*tt = event_triggers_call(file, entry);
-
-	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags))
-		ring_buffer_discard_commit(buffer, event);
-	else if (!filter_check_discard(file, entry, buffer, event))
-		return false;
-
-	return true;
-}
-
-/**
- * event_trigger_unlock_commit - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- */
-static inline void
-event_trigger_unlock_commit(struct ftrace_event_file *file,
-			    struct ring_buffer *buffer,
-			    struct ring_buffer_event *event,
-			    void *entry, unsigned long irq_flags, int pc)
-{
-	enum event_trigger_type tt = ETT_NONE;
-
-	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
-		trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
-
-	if (tt)
-		event_triggers_post_call(file, tt);
-}
-
-/**
- * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- *
- * Same as event_trigger_unlock_commit() but calls
- * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
- */
-static inline void
-event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
-				 struct ring_buffer *buffer,
-				 struct ring_buffer_event *event,
-				 void *entry, unsigned long irq_flags, int pc,
-				 struct pt_regs *regs)
-{
-	enum event_trigger_type tt = ETT_NONE;
-
-	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
-		trace_buffer_unlock_commit_regs(buffer, event,
-						irq_flags, pc, regs);
-
-	if (tt)
-		event_triggers_post_call(file, tt);
-}
-
-#ifdef CONFIG_BPF_SYSCALL
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
-#else
-static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
-{
-	return 1;
-}
-#endif
-
-enum {
-	FILTER_OTHER = 0,
-	FILTER_STATIC_STRING,
-	FILTER_DYN_STRING,
-	FILTER_PTR_STRING,
-	FILTER_TRACE_FN,
-};
-
-extern int trace_event_raw_init(struct ftrace_event_call *call);
-extern int trace_define_field(struct ftrace_event_call *call, const char *type,
-			      const char *name, int offset, int size,
-			      int is_signed, int filter_type);
-extern int trace_add_event_call(struct ftrace_event_call *call);
-extern int trace_remove_event_call(struct ftrace_event_call *call);
-
-#define is_signed_type(type)	(((type)(-1)) < (type)1)
-
-int trace_set_clr_event(const char *system, const char *event, int set);
-
-/*
- * The double __builtin_constant_p is because gcc will give us an error
- * if we try to allocate the static variable to fmt if it is not a
- * constant. Even with the outer if statement optimizing out.
- */
-#define event_trace_printk(ip, fmt, args...)				\
-do {									\
-	__trace_printk_check_format(fmt, ##args);			\
-	tracing_record_cmdline(current);				\
-	if (__builtin_constant_p(fmt)) {				\
-		static const char *trace_printk_fmt			\
-		  __attribute__((section("__trace_printk_fmt"))) =	\
-			__builtin_constant_p(fmt) ? fmt : NULL;		\
-									\
-		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
-	} else								\
-		__trace_printk(ip, fmt, ##args);			\
-} while (0)
-
-#ifdef CONFIG_PERF_EVENTS
-struct perf_event;
-
-DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
-
-extern int  perf_trace_init(struct perf_event *event);
-extern void perf_trace_destroy(struct perf_event *event);
-extern int  perf_trace_add(struct perf_event *event, int flags);
-extern void perf_trace_del(struct perf_event *event, int flags);
-extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
-				     char *filter_str);
-extern void ftrace_profile_free_filter(struct perf_event *event);
-extern void *perf_trace_buf_prepare(int size, unsigned short type,
-				    struct pt_regs **regs, int *rctxp);
-
-static inline void
-perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
-		       u64 count, struct pt_regs *regs, void *head,
-		       struct task_struct *task)
-{
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
-}
-#endif
-
-#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
new file mode 100644
index 000000000000..f8465d65f3c7
--- /dev/null
+++ b/include/linux/trace_events.h
@@ -0,0 +1,616 @@
+
+#ifndef _LINUX_FTRACE_EVENT_H
+#define _LINUX_FTRACE_EVENT_H
+
+#include <linux/ring_buffer.h>
+#include <linux/trace_seq.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/perf_event.h>
+#include <linux/tracepoint.h>
+
+struct trace_array;
+struct trace_buffer;
+struct tracer;
+struct dentry;
+struct bpf_prog;
+
+struct trace_print_flags {
+	unsigned long		mask;
+	const char		*name;
+};
+
+struct trace_print_flags_u64 {
+	unsigned long long	mask;
+	const char		*name;
+};
+
+const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+				   unsigned long flags,
+				   const struct trace_print_flags *flag_array);
+
+const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+				     const struct trace_print_flags *symbol_array);
+
+#if BITS_PER_LONG == 32
+const char *ftrace_print_symbols_seq_u64(struct trace_seq *p,
+					 unsigned long long val,
+					 const struct trace_print_flags_u64
+								 *symbol_array);
+#endif
+
+const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+				     unsigned int bitmask_size);
+
+const char *ftrace_print_hex_seq(struct trace_seq *p,
+				 const unsigned char *buf, int len);
+
+const char *ftrace_print_array_seq(struct trace_seq *p,
+				   const void *buf, int count,
+				   size_t el_size);
+
+struct trace_iterator;
+struct trace_event;
+
+int ftrace_raw_output_prep(struct trace_iterator *iter,
+			   struct trace_event *event);
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+	unsigned short		type;
+	unsigned char		flags;
+	unsigned char		preempt_count;
+	int			pid;
+};
+
+#define FTRACE_MAX_EVENT						\
+	((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+	struct trace_array	*tr;
+	struct tracer		*trace;
+	struct trace_buffer	*trace_buffer;
+	void			*private;
+	int			cpu_file;
+	struct mutex		mutex;
+	struct ring_buffer_iter	**buffer_iter;
+	unsigned long		iter_flags;
+
+	/* trace_seq for __print_flags() and __print_symbolic() etc. */
+	struct trace_seq	tmp_seq;
+
+	cpumask_var_t		started;
+
+	/* it's true when current open file is snapshot */
+	bool			snapshot;
+
+	/* The below is zeroed out in pipe_read */
+	struct trace_seq	seq;
+	struct trace_entry	*ent;
+	unsigned long		lost_events;
+	int			leftover;
+	int			ent_size;
+	int			cpu;
+	u64			ts;
+
+	loff_t			pos;
+	long			idx;
+
+	/* All new field here will be zeroed out in pipe_read */
+};
+
+enum trace_iter_flags {
+	TRACE_FILE_LAT_FMT	= 1,
+	TRACE_FILE_ANNOTATE	= 2,
+	TRACE_FILE_TIME_IN_NS	= 4,
+};
+
+
+typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
+				      int flags, struct trace_event *event);
+
+struct trace_event_functions {
+	trace_print_func	trace;
+	trace_print_func	raw;
+	trace_print_func	hex;
+	trace_print_func	binary;
+};
+
+struct trace_event {
+	struct hlist_node		node;
+	struct list_head		list;
+	int				type;
+	struct trace_event_functions	*funcs;
+};
+
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
+
+/* Return values for print_line callback */
+enum print_line_t {
+	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
+	TRACE_TYPE_HANDLED	= 1,
+	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
+	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
+};
+
+/*
+ * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq
+ * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function
+ * simplifies those functions and keeps them in sync.
+ */
+static inline enum print_line_t trace_handle_return(struct trace_seq *s)
+{
+	return trace_seq_has_overflowed(s) ?
+		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
+}
+
+void tracing_generic_entry_update(struct trace_entry *entry,
+				  unsigned long flags,
+				  int pc);
+struct ftrace_event_file;
+
+struct ring_buffer_event *
+trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer,
+				struct ftrace_event_file *ftrace_file,
+				int type, unsigned long len,
+				unsigned long flags, int pc);
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
+				  int type, unsigned long len,
+				  unsigned long flags, int pc);
+void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
+					struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+void trace_buffer_unlock_commit(struct ring_buffer *buffer,
+				struct ring_buffer_event *event,
+				unsigned long flags, int pc);
+void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
+				     struct ring_buffer_event *event,
+				     unsigned long flags, int pc,
+				     struct pt_regs *regs);
+void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
+					 struct ring_buffer_event *event);
+
+void tracing_record_cmdline(struct task_struct *tsk);
+
+int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...);
+
+struct event_filter;
+
+enum trace_reg {
+	TRACE_REG_REGISTER,
+	TRACE_REG_UNREGISTER,
+#ifdef CONFIG_PERF_EVENTS
+	TRACE_REG_PERF_REGISTER,
+	TRACE_REG_PERF_UNREGISTER,
+	TRACE_REG_PERF_OPEN,
+	TRACE_REG_PERF_CLOSE,
+	TRACE_REG_PERF_ADD,
+	TRACE_REG_PERF_DEL,
+#endif
+};
+
+struct ftrace_event_call;
+
+struct ftrace_event_class {
+	const char		*system;
+	void			*probe;
+#ifdef CONFIG_PERF_EVENTS
+	void			*perf_probe;
+#endif
+	int			(*reg)(struct ftrace_event_call *event,
+				       enum trace_reg type, void *data);
+	int			(*define_fields)(struct ftrace_event_call *);
+	struct list_head	*(*get_fields)(struct ftrace_event_call *);
+	struct list_head	fields;
+	int			(*raw_init)(struct ftrace_event_call *);
+};
+
+extern int ftrace_event_reg(struct ftrace_event_call *event,
+			    enum trace_reg type, void *data);
+
+struct ftrace_event_buffer {
+	struct ring_buffer		*buffer;
+	struct ring_buffer_event	*event;
+	struct ftrace_event_file	*ftrace_file;
+	void				*entry;
+	unsigned long			flags;
+	int				pc;
+};
+
+void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
+				  struct ftrace_event_file *ftrace_file,
+				  unsigned long len);
+
+void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer);
+
+enum {
+	TRACE_EVENT_FL_FILTERED_BIT,
+	TRACE_EVENT_FL_CAP_ANY_BIT,
+	TRACE_EVENT_FL_NO_SET_FILTER_BIT,
+	TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
+	TRACE_EVENT_FL_WAS_ENABLED_BIT,
+	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
+	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_KPROBE_BIT,
+};
+
+/*
+ * Event flags:
+ *  FILTERED	  - The event has a filter attached
+ *  CAP_ANY	  - Any user can enable for perf
+ *  NO_SET_FILTER - Set when filter has error and is to be ignored
+ *  IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file
+ *  WAS_ENABLED   - Set and stays set when an event was ever enabled
+ *                    (used for module unloading, if a module event is enabled,
+ *                     it is best to clear the buffers that used it).
+ *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
+ *  TRACEPOINT    - Event is a tracepoint
+ *  KPROBE        - Event is a kprobe
+ */
+enum {
+	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
+	TRACE_EVENT_FL_CAP_ANY		= (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
+	TRACE_EVENT_FL_NO_SET_FILTER	= (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
+	TRACE_EVENT_FL_IGNORE_ENABLE	= (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
+	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
+	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
+	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
+};
+
+struct ftrace_event_call {
+	struct list_head	list;
+	struct ftrace_event_class *class;
+	union {
+		char			*name;
+		/* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */
+		struct tracepoint	*tp;
+	};
+	struct trace_event	event;
+	char			*print_fmt;
+	struct event_filter	*filter;
+	void			*mod;
+	void			*data;
+	/*
+	 *   bit 0:		filter_active
+	 *   bit 1:		allow trace by non root (cap any)
+	 *   bit 2:		failed to apply filter
+	 *   bit 3:		ftrace internal event (do not enable)
+	 *   bit 4:		Event was enabled by module
+	 *   bit 5:		use call filter rather than file filter
+	 *   bit 6:		Event is a tracepoint
+	 */
+	int			flags; /* static flags of different events */
+
+#ifdef CONFIG_PERF_EVENTS
+	int				perf_refcount;
+	struct hlist_head __percpu	*perf_events;
+	struct bpf_prog			*prog;
+
+	int	(*perf_perm)(struct ftrace_event_call *,
+			     struct perf_event *);
+#endif
+};
+
+static inline const char *
+ftrace_event_name(struct ftrace_event_call *call)
+{
+	if (call->flags & TRACE_EVENT_FL_TRACEPOINT)
+		return call->tp ? call->tp->name : NULL;
+	else
+		return call->name;
+}
+
+struct trace_array;
+struct ftrace_subsystem_dir;
+
+enum {
+	FTRACE_EVENT_FL_ENABLED_BIT,
+	FTRACE_EVENT_FL_RECORDED_CMD_BIT,
+	FTRACE_EVENT_FL_FILTERED_BIT,
+	FTRACE_EVENT_FL_NO_SET_FILTER_BIT,
+	FTRACE_EVENT_FL_SOFT_MODE_BIT,
+	FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
+	FTRACE_EVENT_FL_TRIGGER_MODE_BIT,
+	FTRACE_EVENT_FL_TRIGGER_COND_BIT,
+};
+
+/*
+ * Ftrace event file flags:
+ *  ENABLED	  - The event is enabled
+ *  RECORDED_CMD  - The comms should be recorded at sched_switch
+ *  FILTERED	  - The event has a filter attached
+ *  NO_SET_FILTER - Set when filter has error and is to be ignored
+ *  SOFT_MODE     - The event is enabled/disabled by SOFT_DISABLED
+ *  SOFT_DISABLED - When set, do not trace the event (even though its
+ *                   tracepoint may be enabled)
+ *  TRIGGER_MODE  - When set, invoke the triggers associated with the event
+ *  TRIGGER_COND  - When set, one or more triggers has an associated filter
+ */
+enum {
+	FTRACE_EVENT_FL_ENABLED		= (1 << FTRACE_EVENT_FL_ENABLED_BIT),
+	FTRACE_EVENT_FL_RECORDED_CMD	= (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT),
+	FTRACE_EVENT_FL_FILTERED	= (1 << FTRACE_EVENT_FL_FILTERED_BIT),
+	FTRACE_EVENT_FL_NO_SET_FILTER	= (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT),
+	FTRACE_EVENT_FL_SOFT_MODE	= (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT),
+	FTRACE_EVENT_FL_SOFT_DISABLED	= (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT),
+	FTRACE_EVENT_FL_TRIGGER_MODE	= (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT),
+	FTRACE_EVENT_FL_TRIGGER_COND	= (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT),
+};
+
+struct ftrace_event_file {
+	struct list_head		list;
+	struct ftrace_event_call	*event_call;
+	struct event_filter		*filter;
+	struct dentry			*dir;
+	struct trace_array		*tr;
+	struct ftrace_subsystem_dir	*system;
+	struct list_head		triggers;
+
+	/*
+	 * 32 bit flags:
+	 *   bit 0:		enabled
+	 *   bit 1:		enabled cmd record
+	 *   bit 2:		enable/disable with the soft disable bit
+	 *   bit 3:		soft disabled
+	 *   bit 4:		trigger enabled
+	 *
+	 * Note: The bits must be set atomically to prevent races
+	 * from other writers. Reads of flags do not need to be in
+	 * sync as they occur in critical sections. But the way flags
+	 * is currently used, these changes do not affect the code
+	 * except that when a change is made, it may have a slight
+	 * delay in propagating the changes to other CPUs due to
+	 * caching and such. Which is mostly OK ;-)
+	 */
+	unsigned long		flags;
+	atomic_t		sm_ref;	/* soft-mode reference counter */
+	atomic_t		tm_ref;	/* trigger-mode reference counter */
+};
+
+#define __TRACE_EVENT_FLAGS(name, value)				\
+	static int __init trace_init_flags_##name(void)			\
+	{								\
+		event_##name.flags |= value;				\
+		return 0;						\
+	}								\
+	early_initcall(trace_init_flags_##name);
+
+#define __TRACE_EVENT_PERF_PERM(name, expr...)				\
+	static int perf_perm_##name(struct ftrace_event_call *tp_event, \
+				    struct perf_event *p_event)		\
+	{								\
+		return ({ expr; });					\
+	}								\
+	static int __init trace_init_perf_perm_##name(void)		\
+	{								\
+		event_##name.perf_perm = &perf_perm_##name;		\
+		return 0;						\
+	}								\
+	early_initcall(trace_init_perf_perm_##name);
+
+#define PERF_MAX_TRACE_SIZE	2048
+
+#define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
+
+enum event_trigger_type {
+	ETT_NONE		= (0),
+	ETT_TRACE_ONOFF		= (1 << 0),
+	ETT_SNAPSHOT		= (1 << 1),
+	ETT_STACKTRACE		= (1 << 2),
+	ETT_EVENT_ENABLE	= (1 << 3),
+};
+
+extern int filter_match_preds(struct event_filter *filter, void *rec);
+
+extern int filter_check_discard(struct ftrace_event_file *file, void *rec,
+				struct ring_buffer *buffer,
+				struct ring_buffer_event *event);
+extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
+				     struct ring_buffer *buffer,
+				     struct ring_buffer_event *event);
+extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file,
+						   void *rec);
+extern void event_triggers_post_call(struct ftrace_event_file *file,
+				     enum event_trigger_type tt);
+
+/**
+ * ftrace_trigger_soft_disabled - do triggers and test if soft disabled
+ * @file: The file pointer of the event to test
+ *
+ * If any triggers without filters are attached to this event, they
+ * will be called here. If the event is soft disabled and has no
+ * triggers that require testing the fields, it will return true,
+ * otherwise false.
+ */
+static inline bool
+ftrace_trigger_soft_disabled(struct ftrace_event_file *file)
+{
+	unsigned long eflags = file->flags;
+
+	if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
+		if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
+			event_triggers_call(file, NULL);
+		if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Helper function for event_trigger_unlock_commit{_regs}().
+ * If there are event triggers attached to this event that requires
+ * filtering against its fields, then they wil be called as the
+ * entry already holds the field information of the current event.
+ *
+ * It also checks if the event should be discarded or not.
+ * It is to be discarded if the event is soft disabled and the
+ * event was only recorded to process triggers, or if the event
+ * filter is active and this event did not match the filters.
+ *
+ * Returns true if the event is discarded, false otherwise.
+ */
+static inline bool
+__event_trigger_test_discard(struct ftrace_event_file *file,
+			     struct ring_buffer *buffer,
+			     struct ring_buffer_event *event,
+			     void *entry,
+			     enum event_trigger_type *tt)
+{
+	unsigned long eflags = file->flags;
+
+	if (eflags & FTRACE_EVENT_FL_TRIGGER_COND)
+		*tt = event_triggers_call(file, entry);
+
+	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags))
+		ring_buffer_discard_commit(buffer, event);
+	else if (!filter_check_discard(file, entry, buffer, event))
+		return false;
+
+	return true;
+}
+
+/**
+ * event_trigger_unlock_commit - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ */
+static inline void
+event_trigger_unlock_commit(struct ftrace_event_file *file,
+			    struct ring_buffer *buffer,
+			    struct ring_buffer_event *event,
+			    void *entry, unsigned long irq_flags, int pc)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+	if (tt)
+		event_triggers_post_call(file, tt);
+}
+
+/**
+ * event_trigger_unlock_commit_regs - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ *
+ * Same as event_trigger_unlock_commit() but calls
+ * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
+ */
+static inline void
+event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
+				 struct ring_buffer *buffer,
+				 struct ring_buffer_event *event,
+				 void *entry, unsigned long irq_flags, int pc,
+				 struct pt_regs *regs)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit_regs(buffer, event,
+						irq_flags, pc, regs);
+
+	if (tt)
+		event_triggers_post_call(file, tt);
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+#else
+static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+{
+	return 1;
+}
+#endif
+
+enum {
+	FILTER_OTHER = 0,
+	FILTER_STATIC_STRING,
+	FILTER_DYN_STRING,
+	FILTER_PTR_STRING,
+	FILTER_TRACE_FN,
+};
+
+extern int trace_event_raw_init(struct ftrace_event_call *call);
+extern int trace_define_field(struct ftrace_event_call *call, const char *type,
+			      const char *name, int offset, int size,
+			      int is_signed, int filter_type);
+extern int trace_add_event_call(struct ftrace_event_call *call);
+extern int trace_remove_event_call(struct ftrace_event_call *call);
+
+#define is_signed_type(type)	(((type)(-1)) < (type)1)
+
+int trace_set_clr_event(const char *system, const char *event, int set);
+
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to allocate the static variable to fmt if it is not a
+ * constant. Even with the outer if statement optimizing out.
+ */
+#define event_trace_printk(ip, fmt, args...)				\
+do {									\
+	__trace_printk_check_format(fmt, ##args);			\
+	tracing_record_cmdline(current);				\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		  __attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
+									\
+		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
+	} else								\
+		__trace_printk(ip, fmt, ##args);			\
+} while (0)
+
+#ifdef CONFIG_PERF_EVENTS
+struct perf_event;
+
+DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
+
+extern int  perf_trace_init(struct perf_event *event);
+extern void perf_trace_destroy(struct perf_event *event);
+extern int  perf_trace_add(struct perf_event *event, int flags);
+extern void perf_trace_del(struct perf_event *event, int flags);
+extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
+				     char *filter_str);
+extern void ftrace_profile_free_filter(struct perf_event *event);
+extern void *perf_trace_buf_prepare(int size, unsigned short type,
+				    struct pt_regs **regs, int *rctxp);
+
+static inline void
+perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+		       u64 count, struct pt_regs *regs, void *head,
+		       struct task_struct *task)
+{
+	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
+}
+#endif
+
+#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index d19840b0cac8..a9c88d0a1ca8 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -7,7 +7,7 @@
 #include <linux/ktime.h>
 #include <linux/pm_qos.h>
 #include <linux/tracepoint.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 
 #define TPS(x)  tracepoint_string(x)
 
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 9674145e2f6a..58df48c9d04e 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -3,7 +3,7 @@
 
 #include <linux/tracepoint.h>
 #include <linux/unistd.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/thread_info.h>
 
 #include <asm/ptrace.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 19af616e9dbd..17d85ca992ee 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -16,7 +16,7 @@
  * in the structure.
  */
 
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 
 #ifndef TRACE_SYSTEM_VAR
 #define TRACE_SYSTEM_VAR TRACE_SYSTEM
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81aa3a4ece9f..e318b1aa8647 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,7 +36,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/cgroup.h>
 #include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
 #include <linux/module.h>
diff --git a/kernel/module.c b/kernel/module.c
index 42a1d2afb217..4db0dbc14031 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,7 +18,7 @@
 */
 #include <linux/export.h>
 #include <linux/moduleloader.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
 #include <linux/file.h>
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 069742d61c68..665a4f4d265f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,7 +35,7 @@
 #include <linux/time.h>
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 
 #include "rcu.h"
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8cf7304b2867..45a6141e19be 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -54,7 +54,7 @@
 #include <linux/delay.h>
 #include <linux/stop_machine.h>
 #include <linux/random.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/suspend.h>
 
 #include "tree.h"
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0315d43176d8..a4bdd63219be 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
 #include <linux/trace_seq.h>
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d2612016de94..c09ecfed57db 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -12,7 +12,7 @@
 #include <linux/ftrace.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/trace_seq.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/compiler.h>
 #include <linux/trace_seq.h>
 
@@ -1180,7 +1180,7 @@ struct event_trigger_ops {
  *	commands need to do this if they themselves log to the trace
  *	buffer (see the @post_trigger() member below).  @trigger_type
  *	values are defined by adding new values to the trigger_type
- *	enum in include/linux/ftrace_event.h.
+ *	enum in include/linux/trace_events.h.
  *
  * @post_trigger: A flag that says whether or not this command needs
  *	to have its action delayed until after the current event has
diff --git a/mm/debug.c b/mm/debug.c
index 3eb3ac2fcee7..76089ddf99ea 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -7,7 +7,7 @@
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/memcontrol.h>
 
 static const struct trace_print_flags pageflag_names[] = {
-- 
cgit v1.2.3


From 645df987f7c1740bb1ba783ab907001720a20cf7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Mon, 4 May 2015 18:12:44 -0400
Subject: tracing: Rename ftrace_print_*() functions ta trace_print_*()

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The functions ftrace_print_*() are not part of
the function infrastructure, and the names can be confusing. Rename them
to be trace_print_*().

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 30 +++++++++++++++---------------
 include/trace/trace_events.h | 12 ++++++------
 kernel/trace/trace_output.c  | 34 +++++++++++++++++-----------------
 3 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index f8465d65f3c7..29627cbafdea 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -1,6 +1,6 @@
 
-#ifndef _LINUX_FTRACE_EVENT_H
-#define _LINUX_FTRACE_EVENT_H
+#ifndef _LINUX_TRACE_EVENT_H
+#define _LINUX_TRACE_EVENT_H
 
 #include <linux/ring_buffer.h>
 #include <linux/trace_seq.h>
@@ -25,27 +25,27 @@ struct trace_print_flags_u64 {
 	const char		*name;
 };
 
-const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
-				   unsigned long flags,
-				   const struct trace_print_flags *flag_array);
+const char *trace_print_flags_seq(struct trace_seq *p, const char *delim,
+				  unsigned long flags,
+				  const struct trace_print_flags *flag_array);
 
-const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
-				     const struct trace_print_flags *symbol_array);
+const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+				    const struct trace_print_flags *symbol_array);
 
 #if BITS_PER_LONG == 32
-const char *ftrace_print_symbols_seq_u64(struct trace_seq *p,
-					 unsigned long long val,
-					 const struct trace_print_flags_u64
+const char *trace_print_symbols_seq_u64(struct trace_seq *p,
+					unsigned long long val,
+					const struct trace_print_flags_u64
 								 *symbol_array);
 #endif
 
-const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
-				     unsigned int bitmask_size);
+const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+				    unsigned int bitmask_size);
 
-const char *ftrace_print_hex_seq(struct trace_seq *p,
-				 const unsigned char *buf, int len);
+const char *trace_print_hex_seq(struct trace_seq *p,
+				const unsigned char *buf, int len);
 
-const char *ftrace_print_array_seq(struct trace_seq *p,
+const char *trace_print_array_seq(struct trace_seq *p,
 				   const void *buf, int count,
 				   size_t el_size);
 
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 17d85ca992ee..c0b94728758b 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -258,7 +258,7 @@ TRACE_MAKE_SYSTEM_STR();
 		void *__bitmask = __get_dynamic_array(field);		\
 		unsigned int __bitmask_size;				\
 		__bitmask_size = __get_dynamic_array_len(field);	\
-		ftrace_print_bitmask_seq(p, __bitmask, __bitmask_size);	\
+		trace_print_bitmask_seq(p, __bitmask, __bitmask_size);	\
 	})
 
 #undef __print_flags
@@ -266,7 +266,7 @@ TRACE_MAKE_SYSTEM_STR();
 	({								\
 		static const struct trace_print_flags __flags[] =	\
 			{ flag_array, { -1, NULL }};			\
-		ftrace_print_flags_seq(p, delim, flag, __flags);	\
+		trace_print_flags_seq(p, delim, flag, __flags);	\
 	})
 
 #undef __print_symbolic
@@ -274,7 +274,7 @@ TRACE_MAKE_SYSTEM_STR();
 	({								\
 		static const struct trace_print_flags symbols[] =	\
 			{ symbol_array, { -1, NULL }};			\
-		ftrace_print_symbols_seq(p, value, symbols);		\
+		trace_print_symbols_seq(p, value, symbols);		\
 	})
 
 #undef __print_symbolic_u64
@@ -283,7 +283,7 @@ TRACE_MAKE_SYSTEM_STR();
 	({								\
 		static const struct trace_print_flags_u64 symbols[] =	\
 			{ symbol_array, { -1, NULL } };			\
-		ftrace_print_symbols_seq_u64(p, value, symbols);	\
+		trace_print_symbols_seq_u64(p, value, symbols);	\
 	})
 #else
 #define __print_symbolic_u64(value, symbol_array...)			\
@@ -291,14 +291,14 @@ TRACE_MAKE_SYSTEM_STR();
 #endif
 
 #undef __print_hex
-#define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len)
+#define __print_hex(buf, buf_len) trace_print_hex_seq(p, buf, buf_len)
 
 #undef __print_array
 #define __print_array(array, count, el_size)				\
 	({								\
 		BUILD_BUG_ON(el_size != 1 && el_size != 2 &&		\
 			     el_size != 4 && el_size != 8);		\
-		ftrace_print_array_seq(p, array, count, el_size);	\
+		trace_print_array_seq(p, array, count, el_size);	\
 	})
 
 #undef DECLARE_EVENT_CLASS
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 25a086bcb700..4243bf620a27 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -60,9 +60,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
 }
 
 const char *
-ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
-		       unsigned long flags,
-		       const struct trace_print_flags *flag_array)
+trace_print_flags_seq(struct trace_seq *p, const char *delim,
+		      unsigned long flags,
+		      const struct trace_print_flags *flag_array)
 {
 	unsigned long mask;
 	const char *str;
@@ -95,11 +95,11 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_flags_seq);
+EXPORT_SYMBOL(trace_print_flags_seq);
 
 const char *
-ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
-			 const struct trace_print_flags *symbol_array)
+trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+			const struct trace_print_flags *symbol_array)
 {
 	int i;
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -120,11 +120,11 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_symbols_seq);
+EXPORT_SYMBOL(trace_print_symbols_seq);
 
 #if BITS_PER_LONG == 32
 const char *
-ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
 			 const struct trace_print_flags_u64 *symbol_array)
 {
 	int i;
@@ -146,12 +146,12 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+EXPORT_SYMBOL(trace_print_symbols_seq_u64);
 #endif
 
 const char *
-ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
-			 unsigned int bitmask_size)
+trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
+			unsigned int bitmask_size)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
 
@@ -160,10 +160,10 @@ ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
+EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
 
 const char *
-ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 {
 	int i;
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -175,11 +175,11 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_hex_seq);
+EXPORT_SYMBOL(trace_print_hex_seq);
 
 const char *
-ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
-		       size_t el_size)
+trace_print_array_seq(struct trace_seq *p, const void *buf, int count,
+		      size_t el_size)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
 	const char *prefix = "";
@@ -220,7 +220,7 @@ ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
 
 	return ret;
 }
-EXPORT_SYMBOL(ftrace_print_array_seq);
+EXPORT_SYMBOL(trace_print_array_seq);
 
 int ftrace_raw_output_prep(struct trace_iterator *iter,
 			   struct trace_event *trace_event)
-- 
cgit v1.2.3


From 9023c930902fbbcf0cebf6110828700f792989a4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 5 May 2015 09:39:12 -0400
Subject: tracing: Rename (un)register_ftrace_event() to
 (un)register_trace_event()

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The functions (un)register_ftrace_event() is
really about trace_events, and the name should be register_trace_event()
instead.

Also renamed ftrace_event_reg() to trace_event_reg() for the same reason.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h         |  6 +++---
 include/trace/perf.h                 |  4 ++--
 kernel/trace/blktrace.c              |  4 ++--
 kernel/trace/trace_branch.c          |  2 +-
 kernel/trace/trace_events.c          | 10 +++++-----
 kernel/trace/trace_functions_graph.c |  4 ++--
 kernel/trace/trace_kprobe.c          |  4 ++--
 kernel/trace/trace_output.c          | 18 +++++++++---------
 kernel/trace/trace_output.h          |  2 +-
 kernel/trace/trace_uprobe.c          |  4 ++--
 10 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 29627cbafdea..99924c07a042 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -132,8 +132,8 @@ struct trace_event {
 	struct trace_event_functions	*funcs;
 };
 
-extern int register_ftrace_event(struct trace_event *event);
-extern int unregister_ftrace_event(struct trace_event *event);
+extern int register_trace_event(struct trace_event *event);
+extern int unregister_trace_event(struct trace_event *event);
 
 /* Return values for print_line callback */
 enum print_line_t {
@@ -216,7 +216,7 @@ struct ftrace_event_class {
 	int			(*raw_init)(struct ftrace_event_call *);
 };
 
-extern int ftrace_event_reg(struct ftrace_event_call *event,
+extern int trace_event_reg(struct ftrace_event_call *event,
 			    enum trace_reg type, void *data);
 
 struct ftrace_event_buffer {
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 5e82add1647b..bb751a5975dd 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -69,7 +69,7 @@
  *	.fields			= LIST_HEAD_INIT(event_class_##call.fields),
  *	.raw_init		= trace_event_raw_init,
  *	.probe			= ftrace_raw_event_##call,
- *	.reg			= ftrace_event_reg,
+ *	.reg			= trace_event_reg,
  * };
  *
  * static struct ftrace_event_call event_<call> = {
@@ -219,7 +219,7 @@ static struct ftrace_event_class __used __refdata event_class_##call = { \
 	.fields			= LIST_HEAD_INIT(event_class_##call.fields),\
 	.raw_init		= trace_event_raw_init,			\
 	.probe			= ftrace_raw_event_##call,		\
-	.reg			= ftrace_event_reg,			\
+	.reg			= trace_event_reg,			\
 	_TRACE_PERF_INIT(call)						\
 };
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 483cecfa5c17..4ba2b8ecc81c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1450,14 +1450,14 @@ static struct trace_event trace_blk_event = {
 
 static int __init init_blk_tracer(void)
 {
-	if (!register_ftrace_event(&trace_blk_event)) {
+	if (!register_trace_event(&trace_blk_event)) {
 		pr_warning("Warning: could not register block events\n");
 		return 1;
 	}
 
 	if (register_tracer(&blk_tracer) != 0) {
 		pr_warning("Warning: could not register the block tracer\n");
-		unregister_ftrace_event(&trace_blk_event);
+		unregister_trace_event(&trace_blk_event);
 		return 1;
 	}
 
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 57cbf1efdd44..bdfcb44d5d4a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -191,7 +191,7 @@ __init static int init_branch_tracer(void)
 {
 	int ret;
 
-	ret = register_ftrace_event(&trace_branch_event);
+	ret = register_trace_event(&trace_branch_event);
 	if (!ret) {
 		printk(KERN_WARNING "Warning: could not register "
 				    "branch events\n");
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c4de47fc5cca..5fbb06c6c3ec 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -182,7 +182,7 @@ int trace_event_raw_init(struct ftrace_event_call *call)
 {
 	int id;
 
-	id = register_ftrace_event(&call->event);
+	id = register_trace_event(&call->event);
 	if (!id)
 		return -ENODEV;
 
@@ -252,8 +252,8 @@ void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
 }
 EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
 
-int ftrace_event_reg(struct ftrace_event_call *call,
-		     enum trace_reg type, void *data)
+int trace_event_reg(struct ftrace_event_call *call,
+		    enum trace_reg type, void *data)
 {
 	struct ftrace_event_file *file = data;
 
@@ -288,7 +288,7 @@ int ftrace_event_reg(struct ftrace_event_call *call,
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(ftrace_event_reg);
+EXPORT_SYMBOL_GPL(trace_event_reg);
 
 void trace_event_enable_cmd_record(bool enable)
 {
@@ -1673,7 +1673,7 @@ static void event_remove(struct ftrace_event_call *call)
 	} while_for_each_event_file();
 
 	if (call->event.funcs)
-		__unregister_ftrace_event(&call->event);
+		__unregister_trace_event(&call->event);
 	remove_event_from_tracers(call);
 	list_del(&call->list);
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a51e79688455..32e76a21b8d4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1454,12 +1454,12 @@ static __init int init_graph_trace(void)
 {
 	max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
 
-	if (!register_ftrace_event(&graph_trace_entry_event)) {
+	if (!register_trace_event(&graph_trace_entry_event)) {
 		pr_warning("Warning: could not register graph trace events\n");
 		return 1;
 	}
 
-	if (!register_ftrace_event(&graph_trace_ret_event)) {
+	if (!register_trace_event(&graph_trace_ret_event)) {
 		pr_warning("Warning: could not register graph trace events\n");
 		return 1;
 	}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d0ce590f06e1..59c35210a7e7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1290,7 +1290,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 	}
 	if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0)
 		return -ENOMEM;
-	ret = register_ftrace_event(&call->event);
+	ret = register_trace_event(&call->event);
 	if (!ret) {
 		kfree(call->print_fmt);
 		return -ENODEV;
@@ -1303,7 +1303,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 		pr_info("Failed to register kprobe event: %s\n",
 			ftrace_event_name(call));
 		kfree(call->print_fmt);
-		unregister_ftrace_event(&call->event);
+		unregister_trace_event(&call->event);
 	}
 	return ret;
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 4243bf620a27..6469906e890d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -693,7 +693,7 @@ void trace_event_read_unlock(void)
 }
 
 /**
- * register_ftrace_event - register output for an event type
+ * register_trace_event - register output for an event type
  * @event: the event type to register
  *
  * Event types are stored in a hash and this hash is used to
@@ -707,7 +707,7 @@ void trace_event_read_unlock(void)
  *
  * Returns the event type number or zero on error.
  */
-int register_ftrace_event(struct trace_event *event)
+int register_trace_event(struct trace_event *event)
 {
 	unsigned key;
 	int ret = 0;
@@ -771,12 +771,12 @@ int register_ftrace_event(struct trace_event *event)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(register_ftrace_event);
+EXPORT_SYMBOL_GPL(register_trace_event);
 
 /*
  * Used by module code with the trace_event_sem held for write.
  */
-int __unregister_ftrace_event(struct trace_event *event)
+int __unregister_trace_event(struct trace_event *event)
 {
 	hlist_del(&event->node);
 	list_del(&event->list);
@@ -784,18 +784,18 @@ int __unregister_ftrace_event(struct trace_event *event)
 }
 
 /**
- * unregister_ftrace_event - remove a no longer used event
+ * unregister_trace_event - remove a no longer used event
  * @event: the event to remove
  */
-int unregister_ftrace_event(struct trace_event *event)
+int unregister_trace_event(struct trace_event *event)
 {
 	down_write(&trace_event_sem);
-	__unregister_ftrace_event(event);
+	__unregister_trace_event(event);
 	up_write(&trace_event_sem);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(unregister_ftrace_event);
+EXPORT_SYMBOL_GPL(unregister_trace_event);
 
 /*
  * Standard events
@@ -1243,7 +1243,7 @@ __init static int init_events(void)
 	for (i = 0; events[i]; i++) {
 		event = events[i];
 
-		ret = register_ftrace_event(event);
+		ret = register_trace_event(event);
 		if (!ret) {
 			printk(KERN_WARNING "event %d failed to register\n",
 			       event->type);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 8ef2c40efb3c..4cbfe85b99c8 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -32,7 +32,7 @@ extern int
 trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
 
 /* used by module unregistering */
-extern int __unregister_ftrace_event(struct trace_event *event);
+extern int __unregister_trace_event(struct trace_event *event);
 extern struct rw_semaphore trace_event_sem;
 
 #define SEQ_PUT_FIELD(s, x)				\
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 6dd022c7b5bc..c8e45d8d6a92 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1283,7 +1283,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 	if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0)
 		return -ENOMEM;
 
-	ret = register_ftrace_event(&call->event);
+	ret = register_trace_event(&call->event);
 	if (!ret) {
 		kfree(call->print_fmt);
 		return -ENODEV;
@@ -1297,7 +1297,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 		pr_info("Failed to register uprobe event: %s\n",
 			ftrace_event_name(call));
 		kfree(call->print_fmt);
-		unregister_ftrace_event(&call->event);
+		unregister_trace_event(&call->event);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 7f1d2f8210195c8c309d424a77dbf06a6d2186f4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 5 May 2015 10:09:53 -0400
Subject: tracing: Rename ftrace_event_file to trace_event_file

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The structure ftrace_event_file is really
about trace events and not "ftrace". Rename it to trace_event_file.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h        | 24 +++++-----
 include/trace/perf.h                | 24 +++++-----
 kernel/trace/trace.c                |  8 ++--
 kernel/trace/trace.h                | 24 +++++-----
 kernel/trace/trace_events.c         | 92 ++++++++++++++++++-------------------
 kernel/trace/trace_events_filter.c  | 30 ++++++------
 kernel/trace/trace_events_trigger.c | 52 ++++++++++-----------
 kernel/trace/trace_kprobe.c         | 32 ++++++-------
 kernel/trace/trace_probe.h          |  4 +-
 kernel/trace/trace_syscalls.c       | 32 ++++++-------
 kernel/trace/trace_uprobe.c         | 16 +++----
 11 files changed, 169 insertions(+), 169 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 99924c07a042..ae19233c7dd8 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -157,11 +157,11 @@ static inline enum print_line_t trace_handle_return(struct trace_seq *s)
 void tracing_generic_entry_update(struct trace_entry *entry,
 				  unsigned long flags,
 				  int pc);
-struct ftrace_event_file;
+struct trace_event_file;
 
 struct ring_buffer_event *
 trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer,
-				struct ftrace_event_file *ftrace_file,
+				struct trace_event_file *trace_file,
 				int type, unsigned long len,
 				unsigned long flags, int pc);
 struct ring_buffer_event *
@@ -222,14 +222,14 @@ extern int trace_event_reg(struct ftrace_event_call *event,
 struct ftrace_event_buffer {
 	struct ring_buffer		*buffer;
 	struct ring_buffer_event	*event;
-	struct ftrace_event_file	*ftrace_file;
+	struct trace_event_file		*trace_file;
 	void				*entry;
 	unsigned long			flags;
 	int				pc;
 };
 
 void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
-				  struct ftrace_event_file *ftrace_file,
+				  struct trace_event_file *trace_file,
 				  unsigned long len);
 
 void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer);
@@ -349,7 +349,7 @@ enum {
 	FTRACE_EVENT_FL_TRIGGER_COND	= (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT),
 };
 
-struct ftrace_event_file {
+struct trace_event_file {
 	struct list_head		list;
 	struct ftrace_event_call	*event_call;
 	struct event_filter		*filter;
@@ -414,15 +414,15 @@ enum event_trigger_type {
 
 extern int filter_match_preds(struct event_filter *filter, void *rec);
 
-extern int filter_check_discard(struct ftrace_event_file *file, void *rec,
+extern int filter_check_discard(struct trace_event_file *file, void *rec,
 				struct ring_buffer *buffer,
 				struct ring_buffer_event *event);
 extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
-extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file,
+extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
 						   void *rec);
-extern void event_triggers_post_call(struct ftrace_event_file *file,
+extern void event_triggers_post_call(struct trace_event_file *file,
 				     enum event_trigger_type tt);
 
 /**
@@ -435,7 +435,7 @@ extern void event_triggers_post_call(struct ftrace_event_file *file,
  * otherwise false.
  */
 static inline bool
-ftrace_trigger_soft_disabled(struct ftrace_event_file *file)
+ftrace_trigger_soft_disabled(struct trace_event_file *file)
 {
 	unsigned long eflags = file->flags;
 
@@ -462,7 +462,7 @@ ftrace_trigger_soft_disabled(struct ftrace_event_file *file)
  * Returns true if the event is discarded, false otherwise.
  */
 static inline bool
-__event_trigger_test_discard(struct ftrace_event_file *file,
+__event_trigger_test_discard(struct trace_event_file *file,
 			     struct ring_buffer *buffer,
 			     struct ring_buffer_event *event,
 			     void *entry,
@@ -495,7 +495,7 @@ __event_trigger_test_discard(struct ftrace_event_file *file,
  * if the event is soft disabled and should be discarded.
  */
 static inline void
-event_trigger_unlock_commit(struct ftrace_event_file *file,
+event_trigger_unlock_commit(struct trace_event_file *file,
 			    struct ring_buffer *buffer,
 			    struct ring_buffer_event *event,
 			    void *entry, unsigned long irq_flags, int pc)
@@ -526,7 +526,7 @@ event_trigger_unlock_commit(struct ftrace_event_file *file,
  * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
  */
 static inline void
-event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
+event_trigger_unlock_commit_regs(struct trace_event_file *file,
 				 struct ring_buffer *buffer,
 				 struct ring_buffer_event *event,
 				 void *entry, unsigned long irq_flags, int pc,
diff --git a/include/trace/perf.h b/include/trace/perf.h
index bb751a5975dd..ccc5cc1381e4 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -9,10 +9,10 @@
  *
  * static void ftrace_raw_event_<call>(void *__data, proto)
  * {
- *	struct ftrace_event_file *ftrace_file = __data;
- *	struct ftrace_event_call *event_call = ftrace_file->event_call;
+ *	struct trace_event_file *trace_file = __data;
+ *	struct ftrace_event_call *event_call = trace_file->event_call;
  *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
- *	unsigned long eflags = ftrace_file->flags;
+ *	unsigned long eflags = trace_file->flags;
  *	enum event_trigger_type __tt = ETT_NONE;
  *	struct ring_buffer_event *event;
  *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
@@ -23,7 +23,7 @@
  *
  *	if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
  *		if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
- *			event_triggers_call(ftrace_file, NULL);
+ *			event_triggers_call(trace_file, NULL);
  *		if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
  *			return;
  *	}
@@ -33,7 +33,7 @@
  *
  *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
  *
- *	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+ *	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
  *				  event_<call>->event.type,
  *				  sizeof(*entry) + __data_size,
  *				  irq_flags, pc);
@@ -45,16 +45,16 @@
  *			   __array macros.
  *
  *	if (eflags & FTRACE_EVENT_FL_TRIGGER_COND)
- *		__tt = event_triggers_call(ftrace_file, entry);
+ *		__tt = event_triggers_call(trace_file, entry);
  *
  *	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
- *		     &ftrace_file->flags))
+ *		     &trace_file->flags))
  *		ring_buffer_discard_commit(buffer, event);
- *	else if (!filter_check_discard(ftrace_file, entry, buffer, event))
+ *	else if (!filter_check_discard(trace_file, entry, buffer, event))
  *		trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
  *
  *	if (__tt)
- *		event_triggers_post_call(ftrace_file, __tt);
+ *		event_triggers_post_call(trace_file, __tt);
  * }
  *
  * static struct trace_event ftrace_event_type_<call> = {
@@ -153,18 +153,18 @@
 static notrace void							\
 ftrace_raw_event_##call(void *__data, proto)				\
 {									\
-	struct ftrace_event_file *ftrace_file = __data;			\
+	struct trace_event_file *trace_file = __data;			\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_event_buffer fbuffer;				\
 	struct ftrace_raw_##call *entry;				\
 	int __data_size;						\
 									\
-	if (ftrace_trigger_soft_disabled(ftrace_file))			\
+	if (ftrace_trigger_soft_disabled(trace_file))			\
 		return;							\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
-	entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file,	\
+	entry = ftrace_event_buffer_reserve(&fbuffer, trace_file,	\
 				 sizeof(*entry) + __data_size);		\
 									\
 	if (!entry)							\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 458031c31a37..dd29e9b6b30e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -297,7 +297,7 @@ void trace_array_put(struct trace_array *this_tr)
 	mutex_unlock(&trace_types_lock);
 }
 
-int filter_check_discard(struct ftrace_event_file *file, void *rec,
+int filter_check_discard(struct trace_event_file *file, void *rec,
 			 struct ring_buffer *buffer,
 			 struct ring_buffer_event *event)
 {
@@ -1694,13 +1694,13 @@ static struct ring_buffer *temp_buffer;
 
 struct ring_buffer_event *
 trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
-			  struct ftrace_event_file *ftrace_file,
+			  struct trace_event_file *trace_file,
 			  int type, unsigned long len,
 			  unsigned long flags, int pc)
 {
 	struct ring_buffer_event *entry;
 
-	*current_rb = ftrace_file->tr->trace_buffer.buffer;
+	*current_rb = trace_file->tr->trace_buffer.buffer;
 	entry = trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 	/*
@@ -1709,7 +1709,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 	 * to store the trace event for the tigger to use. It's recusive
 	 * safe and will not be recorded anywhere.
 	 */
-	if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+	if (!entry && trace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
 		*current_rb = temp_buffer;
 		entry = trace_buffer_lock_reserve(*current_rb,
 						  type, len, flags, pc);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c09ecfed57db..4e1715e55b38 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -211,8 +211,8 @@ struct trace_array {
 #ifdef CONFIG_FTRACE_SYSCALLS
 	int			sys_refcount_enter;
 	int			sys_refcount_exit;
-	struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls];
-	struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];
+	struct trace_event_file __rcu *enter_syscall_files[NR_syscalls];
+	struct trace_event_file __rcu *exit_syscall_files[NR_syscalls];
 #endif
 	int			stop_count;
 	int			clock_id;
@@ -1052,9 +1052,9 @@ struct filter_pred {
 
 extern enum regex_type
 filter_parse_regex(char *buff, int len, char **search, int *not);
-extern void print_event_filter(struct ftrace_event_file *file,
+extern void print_event_filter(struct trace_event_file *file,
 			       struct trace_seq *s);
-extern int apply_event_filter(struct ftrace_event_file *file,
+extern int apply_event_filter(struct trace_event_file *file,
 			      char *filter_string);
 extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
 					char *filter_string);
@@ -1073,9 +1073,9 @@ extern void trace_event_enable_cmd_record(bool enable);
 extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
 extern int event_trace_del_tracer(struct trace_array *tr);
 
-extern struct ftrace_event_file *find_event_file(struct trace_array *tr,
-						 const char *system,
-						 const char *event);
+extern struct trace_event_file *find_event_file(struct trace_array *tr,
+						const char *system,
+						const char *event);
 
 static inline void *event_file_data(struct file *filp)
 {
@@ -1242,23 +1242,23 @@ struct event_command {
 	enum event_trigger_type	trigger_type;
 	bool			post_trigger;
 	int			(*func)(struct event_command *cmd_ops,
-					struct ftrace_event_file *file,
+					struct trace_event_file *file,
 					char *glob, char *cmd, char *params);
 	int			(*reg)(char *glob,
 				       struct event_trigger_ops *ops,
 				       struct event_trigger_data *data,
-				       struct ftrace_event_file *file);
+				       struct trace_event_file *file);
 	void			(*unreg)(char *glob,
 					 struct event_trigger_ops *ops,
 					 struct event_trigger_data *data,
-					 struct ftrace_event_file *file);
+					 struct trace_event_file *file);
 	int			(*set_filter)(char *filter_str,
 					      struct event_trigger_data *data,
-					      struct ftrace_event_file *file);
+					      struct trace_event_file *file);
 	struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
 };
 
-extern int trace_event_enable_disable(struct ftrace_event_file *file,
+extern int trace_event_enable_disable(struct trace_event_file *file,
 				      int enable, int soft_disable);
 extern int tracing_alloc_snapshot(void);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5fbb06c6c3ec..4a7cc4630ced 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -61,7 +61,7 @@ static int system_refcount_dec(struct event_subsystem *system)
 
 #define do_for_each_event_file_safe(tr, file)			\
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\
-		struct ftrace_event_file *___n;				\
+		struct trace_event_file *___n;				\
 		list_for_each_entry_safe(file, ___n, &tr->events, list)
 
 #define while_for_each_event_file()		\
@@ -191,17 +191,17 @@ int trace_event_raw_init(struct ftrace_event_call *call)
 EXPORT_SYMBOL_GPL(trace_event_raw_init);
 
 void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
-				  struct ftrace_event_file *ftrace_file,
+				  struct trace_event_file *trace_file,
 				  unsigned long len)
 {
-	struct ftrace_event_call *event_call = ftrace_file->event_call;
+	struct ftrace_event_call *event_call = trace_file->event_call;
 
 	local_save_flags(fbuffer->flags);
 	fbuffer->pc = preempt_count();
-	fbuffer->ftrace_file = ftrace_file;
+	fbuffer->trace_file = trace_file;
 
 	fbuffer->event =
-		trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file,
+		trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file,
 						event_call->event.type, len,
 						fbuffer->flags, fbuffer->pc);
 	if (!fbuffer->event)
@@ -224,12 +224,12 @@ static void output_printk(struct ftrace_event_buffer *fbuffer)
 	if (!iter)
 		return;
 
-	event_call = fbuffer->ftrace_file->event_call;
+	event_call = fbuffer->trace_file->event_call;
 	if (!event_call || !event_call->event.funcs ||
 	    !event_call->event.funcs->trace)
 		return;
 
-	event = &fbuffer->ftrace_file->event_call->event;
+	event = &fbuffer->trace_file->event_call->event;
 
 	spin_lock_irqsave(&tracepoint_iter_lock, flags);
 	trace_seq_init(&iter->seq);
@@ -246,7 +246,7 @@ void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
 	if (tracepoint_printk)
 		output_printk(fbuffer);
 
-	event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer,
+	event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer,
 				    fbuffer->event, fbuffer->entry,
 				    fbuffer->flags, fbuffer->pc);
 }
@@ -255,7 +255,7 @@ EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
 int trace_event_reg(struct ftrace_event_call *call,
 		    enum trace_reg type, void *data)
 {
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
 	switch (type) {
@@ -292,7 +292,7 @@ EXPORT_SYMBOL_GPL(trace_event_reg);
 
 void trace_event_enable_cmd_record(bool enable)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr;
 
 	mutex_lock(&event_mutex);
@@ -312,7 +312,7 @@ void trace_event_enable_cmd_record(bool enable)
 	mutex_unlock(&event_mutex);
 }
 
-static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
+static int __ftrace_event_enable_disable(struct trace_event_file *file,
 					 int enable, int soft_disable)
 {
 	struct ftrace_event_call *call = file->event_call;
@@ -401,13 +401,13 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
 	return ret;
 }
 
-int trace_event_enable_disable(struct ftrace_event_file *file,
+int trace_event_enable_disable(struct trace_event_file *file,
 			       int enable, int soft_disable)
 {
 	return __ftrace_event_enable_disable(file, enable, soft_disable);
 }
 
-static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+static int ftrace_event_enable_disable(struct trace_event_file *file,
 				       int enable)
 {
 	return __ftrace_event_enable_disable(file, enable, 0);
@@ -415,7 +415,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file,
 
 static void ftrace_clear_events(struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(file, &tr->events, list) {
@@ -486,7 +486,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir)
 	}
 }
 
-static void remove_event_file_dir(struct ftrace_event_file *file)
+static void remove_event_file_dir(struct trace_event_file *file)
 {
 	struct dentry *dir = file->dir;
 	struct dentry *child;
@@ -515,7 +515,7 @@ static int
 __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
 			      const char *sub, const char *event, int set)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct ftrace_event_call *call;
 	const char *name;
 	int ret = -EINVAL;
@@ -671,7 +671,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_file *file = v;
+	struct trace_event_file *file = v;
 	struct ftrace_event_call *call;
 	struct trace_array *tr = m->private;
 
@@ -692,13 +692,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr = m->private;
 	loff_t l;
 
 	mutex_lock(&event_mutex);
 
-	file = list_entry(&tr->events, struct ftrace_event_file, list);
+	file = list_entry(&tr->events, struct trace_event_file, list);
 	for (l = 0; l <= *pos; ) {
 		file = t_next(m, file, &l);
 		if (!file)
@@ -710,7 +710,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_file *file = v;
+	struct trace_event_file *file = v;
 	struct trace_array *tr = m->private;
 
 	(*pos)++;
@@ -725,13 +725,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr = m->private;
 	loff_t l;
 
 	mutex_lock(&event_mutex);
 
-	file = list_entry(&tr->events, struct ftrace_event_file, list);
+	file = list_entry(&tr->events, struct trace_event_file, list);
 	for (l = 0; l <= *pos; ) {
 		file = s_next(m, file, &l);
 		if (!file)
@@ -742,7 +742,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
 static int t_show(struct seq_file *m, void *v)
 {
-	struct ftrace_event_file *file = v;
+	struct trace_event_file *file = v;
 	struct ftrace_event_call *call = file->event_call;
 
 	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
@@ -761,7 +761,7 @@ static ssize_t
 event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	unsigned long flags;
 	char buf[4] = "0";
 
@@ -791,7 +791,7 @@ static ssize_t
 event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	unsigned long val;
 	int ret;
 
@@ -831,7 +831,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_subsystem_dir *dir = filp->private_data;
 	struct event_subsystem *system = dir->subsystem;
 	struct ftrace_event_call *call;
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr = dir->tr;
 	char buf[2];
 	int set = 0;
@@ -1062,7 +1062,7 @@ static ssize_t
 event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_seq *s;
 	int r = -ENODEV;
 
@@ -1095,7 +1095,7 @@ static ssize_t
 event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	char *buf;
 	int err = -ENODEV;
 
@@ -1497,7 +1497,7 @@ create_new_subsystem(const char *name)
 
 static struct dentry *
 event_subsystem_dir(struct trace_array *tr, const char *name,
-		    struct ftrace_event_file *file, struct dentry *parent)
+		    struct trace_event_file *file, struct dentry *parent)
 {
 	struct ftrace_subsystem_dir *dir;
 	struct event_subsystem *system;
@@ -1571,7 +1571,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 }
 
 static int
-event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
+event_create_dir(struct dentry *parent, struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 	struct trace_array *tr = file->tr;
@@ -1636,7 +1636,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
 
 static void remove_event_from_tracers(struct ftrace_event_call *call)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct trace_array *tr;
 
 	do_for_each_event_file_safe(tr, file) {
@@ -1657,7 +1657,7 @@ static void remove_event_from_tracers(struct ftrace_event_call *call)
 static void event_remove(struct ftrace_event_call *call)
 {
 	struct trace_array *tr;
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	do_for_each_event_file(tr, file) {
 		if (file->event_call != call)
@@ -1836,11 +1836,11 @@ void trace_event_enum_update(struct trace_enum_map **map, int len)
 	up_write(&trace_event_sem);
 }
 
-static struct ftrace_event_file *
+static struct trace_event_file *
 trace_create_new_event(struct ftrace_event_call *call,
 		       struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	file = kmem_cache_alloc(file_cachep, GFP_TRACE);
 	if (!file)
@@ -1860,7 +1860,7 @@ trace_create_new_event(struct ftrace_event_call *call,
 static int
 __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	file = trace_create_new_event(call, tr);
 	if (!file)
@@ -1878,7 +1878,7 @@ static __init int
 __trace_early_add_new_event(struct ftrace_event_call *call,
 			    struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	file = trace_create_new_event(call, tr);
 	if (!file)
@@ -1921,7 +1921,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
 static int probe_remove_event_call(struct ftrace_event_call *call)
 {
 	struct trace_array *tr;
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 #ifdef CONFIG_PERF_EVENTS
 	if (call->perf_refcount)
@@ -2066,10 +2066,10 @@ __trace_add_event_dirs(struct trace_array *tr)
 	}
 }
 
-struct ftrace_event_file *
+struct trace_event_file *
 find_event_file(struct trace_array *tr, const char *system,  const char *event)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct ftrace_event_call *call;
 	const char *name;
 
@@ -2098,7 +2098,7 @@ find_event_file(struct trace_array *tr, const char *system,  const char *event)
 #define DISABLE_EVENT_STR	"disable_event"
 
 struct event_probe_data {
-	struct ftrace_event_file	*file;
+	struct trace_event_file	*file;
 	unsigned long			count;
 	int				ref;
 	bool				enable;
@@ -2226,7 +2226,7 @@ event_enable_func(struct ftrace_hash *hash,
 		  char *glob, char *cmd, char *param, int enabled)
 {
 	struct trace_array *tr = top_trace_array();
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct ftrace_probe_ops *ops;
 	struct event_probe_data *data;
 	const char *system;
@@ -2358,7 +2358,7 @@ static inline int register_event_cmds(void) { return 0; }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /*
- * The top level array has already had its ftrace_event_file
+ * The top level array has already had its trace_event_file
  * descriptors created in order to allow for early events to
  * be recorded. This function is called after the tracefs has been
  * initialized, and we now have to create the files associated
@@ -2367,7 +2367,7 @@ static inline int register_event_cmds(void) { return 0; }
 static __init void
 __trace_early_add_event_dirs(struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	int ret;
 
 
@@ -2407,7 +2407,7 @@ __trace_early_add_events(struct trace_array *tr)
 static void
 __trace_remove_event_dirs(struct trace_array *tr)
 {
-	struct ftrace_event_file *file, *next;
+	struct trace_event_file *file, *next;
 
 	list_for_each_entry_safe(file, next, &tr->events, list)
 		remove_event_file_dir(file);
@@ -2557,7 +2557,7 @@ int event_trace_del_tracer(struct trace_array *tr)
 static __init int event_trace_memsetup(void)
 {
 	field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
-	file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+	file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC);
 	return 0;
 }
 
@@ -2755,7 +2755,7 @@ static __init void event_test_stuff(void)
 static __init void event_trace_self_tests(void)
 {
 	struct ftrace_subsystem_dir *dir;
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct ftrace_event_call *call;
 	struct event_subsystem *system;
 	struct trace_array *tr;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ced69da0ff55..13ad0a87d31e 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -643,7 +643,7 @@ static void append_filter_err(struct filter_parse_state *ps,
 	free_page((unsigned long) buf);
 }
 
-static inline struct event_filter *event_filter(struct ftrace_event_file *file)
+static inline struct event_filter *event_filter(struct trace_event_file *file)
 {
 	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		return file->event_call->filter;
@@ -652,7 +652,7 @@ static inline struct event_filter *event_filter(struct ftrace_event_file *file)
 }
 
 /* caller must hold event_mutex */
-void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)
+void print_event_filter(struct trace_event_file *file, struct trace_seq *s)
 {
 	struct event_filter *filter = event_filter(file);
 
@@ -780,7 +780,7 @@ static void __free_preds(struct event_filter *filter)
 	filter->n_preds = 0;
 }
 
-static void filter_disable(struct ftrace_event_file *file)
+static void filter_disable(struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 
@@ -837,7 +837,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 	return 0;
 }
 
-static inline void __remove_filter(struct ftrace_event_file *file)
+static inline void __remove_filter(struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 
@@ -851,7 +851,7 @@ static inline void __remove_filter(struct ftrace_event_file *file)
 static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
 					struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list) {
 		if (file->system != dir)
@@ -860,7 +860,7 @@ static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
 	}
 }
 
-static inline void __free_subsystem_filter(struct ftrace_event_file *file)
+static inline void __free_subsystem_filter(struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 
@@ -876,7 +876,7 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
 static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
 					  struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list) {
 		if (file->system != dir)
@@ -1662,7 +1662,7 @@ fail:
 	return err;
 }
 
-static inline void event_set_filtered_flag(struct ftrace_event_file *file)
+static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 
@@ -1672,7 +1672,7 @@ static inline void event_set_filtered_flag(struct ftrace_event_file *file)
 		file->flags |= FTRACE_EVENT_FL_FILTERED;
 }
 
-static inline void event_set_filter(struct ftrace_event_file *file,
+static inline void event_set_filter(struct trace_event_file *file,
 				    struct event_filter *filter)
 {
 	struct ftrace_event_call *call = file->event_call;
@@ -1683,7 +1683,7 @@ static inline void event_set_filter(struct ftrace_event_file *file,
 		rcu_assign_pointer(file->filter, filter);
 }
 
-static inline void event_clear_filter(struct ftrace_event_file *file)
+static inline void event_clear_filter(struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 
@@ -1694,7 +1694,7 @@ static inline void event_clear_filter(struct ftrace_event_file *file)
 }
 
 static inline void
-event_set_no_set_filter_flag(struct ftrace_event_file *file)
+event_set_no_set_filter_flag(struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 
@@ -1705,7 +1705,7 @@ event_set_no_set_filter_flag(struct ftrace_event_file *file)
 }
 
 static inline void
-event_clear_no_set_filter_flag(struct ftrace_event_file *file)
+event_clear_no_set_filter_flag(struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 
@@ -1716,7 +1716,7 @@ event_clear_no_set_filter_flag(struct ftrace_event_file *file)
 }
 
 static inline bool
-event_no_set_filter_flag(struct ftrace_event_file *file)
+event_no_set_filter_flag(struct trace_event_file *file)
 {
 	struct ftrace_event_call *call = file->event_call;
 
@@ -1740,7 +1740,7 @@ static int replace_system_preds(struct ftrace_subsystem_dir *dir,
 				struct filter_parse_state *ps,
 				char *filter_string)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 	struct filter_list *filter_item;
 	struct filter_list *tmp;
 	LIST_HEAD(filter_list);
@@ -1961,7 +1961,7 @@ static int create_system_filter(struct ftrace_subsystem_dir *dir,
 }
 
 /* caller must hold event_mutex */
-int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
+int apply_event_filter(struct trace_event_file *file, char *filter_string)
 {
 	struct ftrace_event_call *call = file->event_call;
 	struct event_filter *filter;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 8712df9decb4..bb6f2ff52ad2 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -40,7 +40,7 @@ trigger_data_free(struct event_trigger_data *data)
 
 /**
  * event_triggers_call - Call triggers associated with a trace event
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  * @rec: The trace entry for the event, NULL for unconditional invocation
  *
  * For each trigger associated with an event, invoke the trigger
@@ -63,7 +63,7 @@ trigger_data_free(struct event_trigger_data *data)
  * any trigger that should be deferred, ETT_NONE if nothing to defer.
  */
 enum event_trigger_type
-event_triggers_call(struct ftrace_event_file *file, void *rec)
+event_triggers_call(struct trace_event_file *file, void *rec)
 {
 	struct event_trigger_data *data;
 	enum event_trigger_type tt = ETT_NONE;
@@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
 
 /**
  * event_triggers_post_call - Call 'post_triggers' for a trace event
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
  *
  * For each trigger associated with an event, invoke the trigger
@@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
  * Called from tracepoint handlers (with rcu_read_lock_sched() held).
  */
 void
-event_triggers_post_call(struct ftrace_event_file *file,
+event_triggers_post_call(struct trace_event_file *file,
 			 enum event_trigger_type tt)
 {
 	struct event_trigger_data *data;
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(event_triggers_post_call);
 
 static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
 {
-	struct ftrace_event_file *event_file = event_file_data(m->private);
+	struct trace_event_file *event_file = event_file_data(m->private);
 
 	if (t == SHOW_AVAILABLE_TRIGGERS)
 		return NULL;
@@ -129,7 +129,7 @@ static void *trigger_next(struct seq_file *m, void *t, loff_t *pos)
 
 static void *trigger_start(struct seq_file *m, loff_t *pos)
 {
-	struct ftrace_event_file *event_file;
+	struct trace_event_file *event_file;
 
 	/* ->stop() is called even if ->start() fails */
 	mutex_lock(&event_mutex);
@@ -201,7 +201,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static int trigger_process_regex(struct ftrace_event_file *file, char *buff)
+static int trigger_process_regex(struct trace_event_file *file, char *buff)
 {
 	char *command, *next = buff;
 	struct event_command *p;
@@ -227,7 +227,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
 					 const char __user *ubuf,
 					 size_t cnt, loff_t *ppos)
 {
-	struct ftrace_event_file *event_file;
+	struct trace_event_file *event_file;
 	ssize_t ret;
 	char *buf;
 
@@ -430,7 +430,7 @@ event_trigger_free(struct event_trigger_ops *ops,
 		trigger_data_free(data);
 }
 
-static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
+static int trace_event_trigger_enable_disable(struct trace_event_file *file,
 					      int trigger_enable)
 {
 	int ret = 0;
@@ -466,7 +466,7 @@ static int trace_event_trigger_enable_disable(struct ftrace_event_file *file,
 void
 clear_event_triggers(struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list) {
 		struct event_trigger_data *data;
@@ -480,7 +480,7 @@ clear_event_triggers(struct trace_array *tr)
 
 /**
  * update_cond_flag - Set or reset the TRIGGER_COND bit
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  *
  * If an event has triggers and any of those triggers has a filter or
  * a post_trigger, trigger invocation needs to be deferred until after
@@ -488,7 +488,7 @@ clear_event_triggers(struct trace_array *tr)
  * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
  * cleared.
  */
-static void update_cond_flag(struct ftrace_event_file *file)
+static void update_cond_flag(struct trace_event_file *file)
 {
 	struct event_trigger_data *data;
 	bool set_cond = false;
@@ -511,7 +511,7 @@ static void update_cond_flag(struct ftrace_event_file *file)
  * @glob: The raw string used to register the trigger
  * @ops: The trigger ops associated with the trigger
  * @data: Trigger-specific data to associate with the trigger
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  *
  * Common implementation for event trigger registration.
  *
@@ -522,7 +522,7 @@ static void update_cond_flag(struct ftrace_event_file *file)
  */
 static int register_trigger(char *glob, struct event_trigger_ops *ops,
 			    struct event_trigger_data *data,
-			    struct ftrace_event_file *file)
+			    struct trace_event_file *file)
 {
 	struct event_trigger_data *test;
 	int ret = 0;
@@ -557,7 +557,7 @@ out:
  * @glob: The raw string used to register the trigger
  * @ops: The trigger ops associated with the trigger
  * @test: Trigger-specific data used to find the trigger to remove
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  *
  * Common implementation for event trigger unregistration.
  *
@@ -566,7 +566,7 @@ out:
  */
 static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
 			       struct event_trigger_data *test,
-			       struct ftrace_event_file *file)
+			       struct trace_event_file *file)
 {
 	struct event_trigger_data *data;
 	bool unregistered = false;
@@ -588,7 +588,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
 /**
  * event_trigger_callback - Generic event_command @func implementation
  * @cmd_ops: The command ops, used for trigger registration
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  * @glob: The raw string used to register the trigger
  * @cmd: The cmd portion of the string used to register the trigger
  * @param: The params portion of the string used to register the trigger
@@ -603,7 +603,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
  */
 static int
 event_trigger_callback(struct event_command *cmd_ops,
-		       struct ftrace_event_file *file,
+		       struct trace_event_file *file,
 		       char *glob, char *cmd, char *param)
 {
 	struct event_trigger_data *trigger_data;
@@ -688,7 +688,7 @@ event_trigger_callback(struct event_command *cmd_ops,
  * set_trigger_filter - Generic event_command @set_filter implementation
  * @filter_str: The filter string for the trigger, NULL to remove filter
  * @trigger_data: Trigger-specific data
- * @file: The ftrace_event_file associated with the event
+ * @file: The trace_event_file associated with the event
  *
  * Common implementation for event command filter parsing and filter
  * instantiation.
@@ -702,7 +702,7 @@ event_trigger_callback(struct event_command *cmd_ops,
  */
 static int set_trigger_filter(char *filter_str,
 			      struct event_trigger_data *trigger_data,
-			      struct ftrace_event_file *file)
+			      struct trace_event_file *file)
 {
 	struct event_trigger_data *data = trigger_data;
 	struct event_filter *filter = NULL, *tmp;
@@ -900,7 +900,7 @@ snapshot_count_trigger(struct event_trigger_data *data)
 static int
 register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
 			  struct event_trigger_data *data,
-			  struct ftrace_event_file *file)
+			  struct trace_event_file *file)
 {
 	int ret = register_trigger(glob, ops, data, file);
 
@@ -1053,7 +1053,7 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
 #define DISABLE_EVENT_STR	"disable_event"
 
 struct enable_trigger_data {
-	struct ftrace_event_file	*file;
+	struct trace_event_file		*file;
 	bool				enable;
 };
 
@@ -1159,10 +1159,10 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
 
 static int
 event_enable_trigger_func(struct event_command *cmd_ops,
-			  struct ftrace_event_file *file,
+			  struct trace_event_file *file,
 			  char *glob, char *cmd, char *param)
 {
-	struct ftrace_event_file *event_enable_file;
+	struct trace_event_file *event_enable_file;
 	struct enable_trigger_data *enable_data;
 	struct event_trigger_data *trigger_data;
 	struct event_trigger_ops *trigger_ops;
@@ -1294,7 +1294,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 static int event_enable_register_trigger(char *glob,
 					 struct event_trigger_ops *ops,
 					 struct event_trigger_data *data,
-					 struct ftrace_event_file *file)
+					 struct trace_event_file *file)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 	struct enable_trigger_data *test_enable_data;
@@ -1331,7 +1331,7 @@ out:
 static void event_enable_unregister_trigger(char *glob,
 					    struct event_trigger_ops *ops,
 					    struct event_trigger_data *test,
-					    struct ftrace_event_file *file)
+					    struct trace_event_file *file)
 {
 	struct enable_trigger_data *test_enable_data = test->private_data;
 	struct enable_trigger_data *enable_data;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 59c35210a7e7..87c63d039b9d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -359,7 +359,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
  * if the file is NULL, enable "perf" handler, or enable "trace" handler.
  */
 static int
-enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 {
 	int ret = 0;
 
@@ -394,7 +394,7 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
  * if the file is NULL, disable "perf" handler, or disable "trace" handler.
  */
 static int
-disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file)
+disable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
 {
 	struct event_file_link *link = NULL;
 	int wait = 0;
@@ -917,7 +917,7 @@ static const struct file_operations kprobe_profile_ops = {
 /* Kprobe handler */
 static nokprobe_inline void
 __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
-		    struct ftrace_event_file *ftrace_file)
+		    struct trace_event_file *trace_file)
 {
 	struct kprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
@@ -926,9 +926,9 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tk->tp.call;
 
-	WARN_ON(call != ftrace_file->event_call);
+	WARN_ON(call != trace_file->event_call);
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (ftrace_trigger_soft_disabled(trace_file))
 		return;
 
 	local_save_flags(irq_flags);
@@ -937,7 +937,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 	dsize = __get_data_size(&tk->tp, regs);
 	size = sizeof(*entry) + tk->tp.size + dsize;
 
-	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
 						call->event.type,
 						size, irq_flags, pc);
 	if (!event)
@@ -947,7 +947,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 
-	event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+	event_trigger_unlock_commit_regs(trace_file, buffer, event,
 					 entry, irq_flags, pc, regs);
 }
 
@@ -965,7 +965,7 @@ NOKPROBE_SYMBOL(kprobe_trace_func);
 static nokprobe_inline void
 __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		       struct pt_regs *regs,
-		       struct ftrace_event_file *ftrace_file)
+		       struct trace_event_file *trace_file)
 {
 	struct kretprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
@@ -974,9 +974,9 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tk->tp.call;
 
-	WARN_ON(call != ftrace_file->event_call);
+	WARN_ON(call != trace_file->event_call);
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (ftrace_trigger_soft_disabled(trace_file))
 		return;
 
 	local_save_flags(irq_flags);
@@ -985,7 +985,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	dsize = __get_data_size(&tk->tp, regs);
 	size = sizeof(*entry) + tk->tp.size + dsize;
 
-	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
 						call->event.type,
 						size, irq_flags, pc);
 	if (!event)
@@ -996,7 +996,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 
-	event_trigger_unlock_commit_regs(ftrace_file, buffer, event,
+	event_trigger_unlock_commit_regs(trace_file, buffer, event,
 					 entry, irq_flags, pc, regs);
 }
 
@@ -1210,7 +1210,7 @@ static int kprobe_register(struct ftrace_event_call *event,
 			   enum trace_reg type, void *data)
 {
 	struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
@@ -1364,10 +1364,10 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3,
 	return a1 + a2 + a3 + a4 + a5 + a6;
 }
 
-static struct ftrace_event_file *
+static struct trace_event_file *
 find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr)
 {
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list)
 		if (file->event_call == &tk->tp.call)
@@ -1385,7 +1385,7 @@ static __init int kprobe_trace_self_tests_init(void)
 	int ret, warn = 0;
 	int (*target)(int, int, int, int, int, int);
 	struct trace_kprobe *tk;
-	struct ftrace_event_file *file;
+	struct trace_event_file *file;
 
 	if (tracing_is_disabled())
 		return -ENODEV;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index ab283e146b70..9fcdfbbcd8b5 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -281,7 +281,7 @@ struct trace_probe {
 };
 
 struct event_file_link {
-	struct ftrace_event_file	*file;
+	struct trace_event_file		*file;
 	struct list_head		list;
 };
 
@@ -314,7 +314,7 @@ static inline int is_good_name(const char *name)
 }
 
 static inline struct event_file_link *
-find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
+find_event_file_link(struct trace_probe *tp, struct trace_event_file *file)
 {
 	struct event_file_link *link;
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f97f6e3a676c..ee4525261e82 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -293,7 +293,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 {
 	struct trace_array *tr = data;
-	struct ftrace_event_file *ftrace_file;
+	struct trace_event_file *trace_file;
 	struct syscall_trace_enter *entry;
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
@@ -308,11 +308,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 		return;
 
 	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
-	ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
-	if (!ftrace_file)
+	trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]);
+	if (!trace_file)
 		return;
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (ftrace_trigger_soft_disabled(trace_file))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -334,14 +334,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	entry->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
 
-	event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+	event_trigger_unlock_commit(trace_file, buffer, event, entry,
 				    irq_flags, pc);
 }
 
 static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 {
 	struct trace_array *tr = data;
-	struct ftrace_event_file *ftrace_file;
+	struct trace_event_file *trace_file;
 	struct syscall_trace_exit *entry;
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
@@ -355,11 +355,11 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 		return;
 
 	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
-	ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
-	if (!ftrace_file)
+	trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]);
+	if (!trace_file)
 		return;
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (ftrace_trigger_soft_disabled(trace_file))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -380,11 +380,11 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	entry->nr = syscall_nr;
 	entry->ret = syscall_get_return_value(current, regs);
 
-	event_trigger_unlock_commit(ftrace_file, buffer, event, entry,
+	event_trigger_unlock_commit(trace_file, buffer, event, entry,
 				    irq_flags, pc);
 }
 
-static int reg_event_syscall_enter(struct ftrace_event_file *file,
+static int reg_event_syscall_enter(struct trace_event_file *file,
 				   struct ftrace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
@@ -405,7 +405,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,
 	return ret;
 }
 
-static void unreg_event_syscall_enter(struct ftrace_event_file *file,
+static void unreg_event_syscall_enter(struct trace_event_file *file,
 				      struct ftrace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
@@ -422,7 +422,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static int reg_event_syscall_exit(struct ftrace_event_file *file,
+static int reg_event_syscall_exit(struct trace_event_file *file,
 				  struct ftrace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
@@ -443,7 +443,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,
 	return ret;
 }
 
-static void unreg_event_syscall_exit(struct ftrace_event_file *file,
+static void unreg_event_syscall_exit(struct trace_event_file *file,
 				     struct ftrace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
@@ -696,7 +696,7 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
 static int syscall_enter_register(struct ftrace_event_call *event,
 				 enum trace_reg type, void *data)
 {
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
@@ -724,7 +724,7 @@ static int syscall_enter_register(struct ftrace_event_call *event,
 static int syscall_exit_register(struct ftrace_event_call *event,
 				 enum trace_reg type, void *data)
 {
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c8e45d8d6a92..3f61ec4a3164 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -770,7 +770,7 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
 static void __uprobe_trace_func(struct trace_uprobe *tu,
 				unsigned long func, struct pt_regs *regs,
 				struct uprobe_cpu_buffer *ucb, int dsize,
-				struct ftrace_event_file *ftrace_file)
+				struct trace_event_file *trace_file)
 {
 	struct uprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
@@ -779,17 +779,17 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 	int size, esize;
 	struct ftrace_event_call *call = &tu->tp.call;
 
-	WARN_ON(call != ftrace_file->event_call);
+	WARN_ON(call != trace_file->event_call);
 
 	if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
 		return;
 
-	if (ftrace_trigger_soft_disabled(ftrace_file))
+	if (ftrace_trigger_soft_disabled(trace_file))
 		return;
 
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	size = esize + tu->tp.size + dsize;
-	event = trace_event_buffer_lock_reserve(&buffer, ftrace_file,
+	event = trace_event_buffer_lock_reserve(&buffer, trace_file,
 						call->event.type, size, 0, 0);
 	if (!event)
 		return;
@@ -806,7 +806,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 
 	memcpy(data, ucb->buf, tu->tp.size + dsize);
 
-	event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);
+	event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0);
 }
 
 /* uprobe handler */
@@ -881,7 +881,7 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self,
 				struct mm_struct *mm);
 
 static int
-probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
+probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
 		   filter_func_t filter)
 {
 	bool enabled = trace_probe_is_enabled(&tu->tp);
@@ -938,7 +938,7 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
 }
 
 static void
-probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file)
+probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
 {
 	if (!trace_probe_is_enabled(&tu->tp))
 		return;
@@ -1163,7 +1163,7 @@ trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
 		      void *data)
 {
 	struct trace_uprobe *tu = event->data;
-	struct ftrace_event_file *file = data;
+	struct trace_event_file *file = data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
-- 
cgit v1.2.3


From 2425bcb9240f8c97d793cb31c8e8d8d0a843fa29 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 5 May 2015 11:45:27 -0400
Subject: tracing: Rename ftrace_event_{call,class} to trace_event_{call,class}

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The structures ftrace_event_call and
ftrace_event_class have nothing to do with the function hooks, and are
really trace_event structures. Rename ftrace_event_* to trace_event_*.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/module.h               |  2 +-
 include/linux/perf_event.h           |  2 +-
 include/linux/syscalls.h             | 12 +++---
 include/linux/trace_events.h         | 38 ++++++++---------
 include/trace/perf.h                 | 22 +++++-----
 include/trace/syscall.h              |  4 +-
 include/trace/trace_events.h         |  6 +--
 kernel/trace/trace.c                 | 12 +++---
 kernel/trace/trace.h                 | 10 ++---
 kernel/trace/trace_branch.c          |  2 +-
 kernel/trace/trace_event_perf.c      | 20 ++++-----
 kernel/trace/trace_events.c          | 80 ++++++++++++++++++------------------
 kernel/trace/trace_events_filter.c   | 34 +++++++--------
 kernel/trace/trace_export.c          | 10 ++---
 kernel/trace/trace_functions_graph.c |  4 +-
 kernel/trace/trace_kprobe.c          | 18 ++++----
 kernel/trace/trace_mmiotrace.c       |  4 +-
 kernel/trace/trace_output.c          |  4 +-
 kernel/trace/trace_probe.h           |  4 +-
 kernel/trace/trace_sched_wakeup.c    |  4 +-
 kernel/trace/trace_syscalls.c        | 40 +++++++++---------
 kernel/trace/trace_uprobe.c          | 12 +++---
 22 files changed, 172 insertions(+), 172 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index c883b86ea964..3e0d492682bb 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -336,7 +336,7 @@ struct module {
 	const char **trace_bprintk_fmt_start;
 #endif
 #ifdef CONFIG_EVENT_TRACING
-	struct ftrace_event_call **trace_events;
+	struct trace_event_call **trace_events;
 	unsigned int num_trace_events;
 	struct trace_enum_map **trace_enums;
 	unsigned int num_trace_enums;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61992cf2e977..d089d6d58ae0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -483,7 +483,7 @@ struct perf_event {
 	void				*overflow_handler_context;
 
 #ifdef CONFIG_EVENT_TRACING
-	struct ftrace_event_call	*tp_event;
+	struct trace_event_call		*tp_event;
 	struct event_filter		*filter;
 #ifdef CONFIG_FUNCTION_TRACER
 	struct ftrace_ops               ftrace_ops;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 76d1e38aabe1..d8b06abb264f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -111,14 +111,14 @@ union bpf_attr;
 #define __SC_STR_ADECL(t, a)	#a
 #define __SC_STR_TDECL(t, a)	#t
 
-extern struct ftrace_event_class event_class_syscall_enter;
-extern struct ftrace_event_class event_class_syscall_exit;
+extern struct trace_event_class event_class_syscall_enter;
+extern struct trace_event_class event_class_syscall_exit;
 extern struct trace_event_functions enter_syscall_print_funcs;
 extern struct trace_event_functions exit_syscall_print_funcs;
 
 #define SYSCALL_TRACE_ENTER_EVENT(sname)				\
 	static struct syscall_metadata __syscall_meta_##sname;		\
-	static struct ftrace_event_call __used				\
+	static struct trace_event_call __used				\
 	  event_enter_##sname = {					\
 		.class			= &event_class_syscall_enter,	\
 		{							\
@@ -128,13 +128,13 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.data			= (void *)&__syscall_meta_##sname,\
 		.flags                  = TRACE_EVENT_FL_CAP_ANY,	\
 	};								\
-	static struct ftrace_event_call __used				\
+	static struct trace_event_call __used				\
 	  __attribute__((section("_ftrace_events")))			\
 	 *__event_enter_##sname = &event_enter_##sname;
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
 	static struct syscall_metadata __syscall_meta_##sname;		\
-	static struct ftrace_event_call __used				\
+	static struct trace_event_call __used				\
 	  event_exit_##sname = {					\
 		.class			= &event_class_syscall_exit,	\
 		{							\
@@ -144,7 +144,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.data			= (void *)&__syscall_meta_##sname,\
 		.flags                  = TRACE_EVENT_FL_CAP_ANY,	\
 	};								\
-	static struct ftrace_event_call __used				\
+	static struct trace_event_call __used				\
 	  __attribute__((section("_ftrace_events")))			\
 	*__event_exit_##sname = &event_exit_##sname;
 
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index ae19233c7dd8..d10ab04a17b2 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -200,23 +200,23 @@ enum trace_reg {
 #endif
 };
 
-struct ftrace_event_call;
+struct trace_event_call;
 
-struct ftrace_event_class {
+struct trace_event_class {
 	const char		*system;
 	void			*probe;
 #ifdef CONFIG_PERF_EVENTS
 	void			*perf_probe;
 #endif
-	int			(*reg)(struct ftrace_event_call *event,
+	int			(*reg)(struct trace_event_call *event,
 				       enum trace_reg type, void *data);
-	int			(*define_fields)(struct ftrace_event_call *);
-	struct list_head	*(*get_fields)(struct ftrace_event_call *);
+	int			(*define_fields)(struct trace_event_call *);
+	struct list_head	*(*get_fields)(struct trace_event_call *);
 	struct list_head	fields;
-	int			(*raw_init)(struct ftrace_event_call *);
+	int			(*raw_init)(struct trace_event_call *);
 };
 
-extern int trace_event_reg(struct ftrace_event_call *event,
+extern int trace_event_reg(struct trace_event_call *event,
 			    enum trace_reg type, void *data);
 
 struct ftrace_event_buffer {
@@ -269,9 +269,9 @@ enum {
 	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 };
 
-struct ftrace_event_call {
+struct trace_event_call {
 	struct list_head	list;
-	struct ftrace_event_class *class;
+	struct trace_event_class *class;
 	union {
 		char			*name;
 		/* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */
@@ -298,13 +298,13 @@ struct ftrace_event_call {
 	struct hlist_head __percpu	*perf_events;
 	struct bpf_prog			*prog;
 
-	int	(*perf_perm)(struct ftrace_event_call *,
+	int	(*perf_perm)(struct trace_event_call *,
 			     struct perf_event *);
 #endif
 };
 
 static inline const char *
-ftrace_event_name(struct ftrace_event_call *call)
+ftrace_event_name(struct trace_event_call *call)
 {
 	if (call->flags & TRACE_EVENT_FL_TRACEPOINT)
 		return call->tp ? call->tp->name : NULL;
@@ -351,7 +351,7 @@ enum {
 
 struct trace_event_file {
 	struct list_head		list;
-	struct ftrace_event_call	*event_call;
+	struct trace_event_call		*event_call;
 	struct event_filter		*filter;
 	struct dentry			*dir;
 	struct trace_array		*tr;
@@ -388,7 +388,7 @@ struct trace_event_file {
 	early_initcall(trace_init_flags_##name);
 
 #define __TRACE_EVENT_PERF_PERM(name, expr...)				\
-	static int perf_perm_##name(struct ftrace_event_call *tp_event, \
+	static int perf_perm_##name(struct trace_event_call *tp_event, \
 				    struct perf_event *p_event)		\
 	{								\
 		return ({ expr; });					\
@@ -417,7 +417,7 @@ extern int filter_match_preds(struct event_filter *filter, void *rec);
 extern int filter_check_discard(struct trace_event_file *file, void *rec,
 				struct ring_buffer *buffer,
 				struct ring_buffer_event *event);
-extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
+extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
 extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
@@ -559,12 +559,12 @@ enum {
 	FILTER_TRACE_FN,
 };
 
-extern int trace_event_raw_init(struct ftrace_event_call *call);
-extern int trace_define_field(struct ftrace_event_call *call, const char *type,
+extern int trace_event_raw_init(struct trace_event_call *call);
+extern int trace_define_field(struct trace_event_call *call, const char *type,
 			      const char *name, int offset, int size,
 			      int is_signed, int filter_type);
-extern int trace_add_event_call(struct ftrace_event_call *call);
-extern int trace_remove_event_call(struct ftrace_event_call *call);
+extern int trace_add_event_call(struct trace_event_call *call);
+extern int trace_remove_event_call(struct trace_event_call *call);
 
 #define is_signed_type(type)	(((type)(-1)) < (type)1)
 
@@ -613,4 +613,4 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
 }
 #endif
 
-#endif /* _LINUX_FTRACE_EVENT_H */
+#endif /* _LINUX_TRACE_EVENT_H */
diff --git a/include/trace/perf.h b/include/trace/perf.h
index ccc5cc1381e4..1d10c2d2b2c4 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -5,12 +5,12 @@
  *
  * For those macros defined with TRACE_EVENT:
  *
- * static struct ftrace_event_call event_<call>;
+ * static struct trace_event_call event_<call>;
  *
  * static void ftrace_raw_event_<call>(void *__data, proto)
  * {
  *	struct trace_event_file *trace_file = __data;
- *	struct ftrace_event_call *event_call = trace_file->event_call;
+ *	struct trace_event_call *event_call = trace_file->event_call;
  *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
  *	unsigned long eflags = trace_file->flags;
  *	enum event_trigger_type __tt = ETT_NONE;
@@ -63,7 +63,7 @@
  *
  * static char print_fmt_<call>[] = <TP_printk>;
  *
- * static struct ftrace_event_class __used event_class_<template> = {
+ * static struct trace_event_class __used event_class_<template> = {
  *	.system			= "<system>",
  *	.define_fields		= ftrace_define_fields_<call>,
  *	.fields			= LIST_HEAD_INIT(event_class_##call.fields),
@@ -72,7 +72,7 @@
  *	.reg			= trace_event_reg,
  * };
  *
- * static struct ftrace_event_call event_<call> = {
+ * static struct trace_event_call event_<call> = {
  *	.class			= event_class_<template>,
  *	{
  *		.tp			= &__tracepoint_<call>,
@@ -83,7 +83,7 @@
  * };
  * // its only safe to use pointers when doing linker tricks to
  * // create an array.
- * static struct ftrace_event_call __used
+ * static struct trace_event_call __used
  * __attribute__((section("_ftrace_events"))) *__event_<call> = &event_<call>;
  *
  */
@@ -213,7 +213,7 @@ static inline void ftrace_test_probe_##call(void)			\
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 _TRACE_PERF_PROTO(call, PARAMS(proto));					\
 static char print_fmt_##call[] = print;					\
-static struct ftrace_event_class __used __refdata event_class_##call = { \
+static struct trace_event_class __used __refdata event_class_##call = { \
 	.system			= TRACE_SYSTEM_STRING,			\
 	.define_fields		= ftrace_define_fields_##call,		\
 	.fields			= LIST_HEAD_INIT(event_class_##call.fields),\
@@ -226,7 +226,7 @@ static struct ftrace_event_class __used __refdata event_class_##call = { \
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)			\
 									\
-static struct ftrace_event_call __used event_##call = {			\
+static struct trace_event_call __used event_##call = {			\
 	.class			= &event_class_##template,		\
 	{								\
 		.tp			= &__tracepoint_##call,		\
@@ -235,7 +235,7 @@ static struct ftrace_event_call __used event_##call = {			\
 	.print_fmt		= print_fmt_##template,			\
 	.flags			= TRACE_EVENT_FL_TRACEPOINT,		\
 };									\
-static struct ftrace_event_call __used					\
+static struct trace_event_call __used					\
 __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 
 #undef DEFINE_EVENT_PRINT
@@ -243,7 +243,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 									\
 static char print_fmt_##call[] = print;					\
 									\
-static struct ftrace_event_call __used event_##call = {			\
+static struct trace_event_call __used event_##call = {			\
 	.class			= &event_class_##template,		\
 	{								\
 		.tp			= &__tracepoint_##call,		\
@@ -252,7 +252,7 @@ static struct ftrace_event_call __used event_##call = {			\
 	.print_fmt		= print_fmt_##call,			\
 	.flags			= TRACE_EVENT_FL_TRACEPOINT,		\
 };									\
-static struct ftrace_event_call __used					\
+static struct trace_event_call __used					\
 __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -292,7 +292,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 static notrace void							\
 perf_trace_##call(void *__data, proto)					\
 {									\
-	struct ftrace_event_call *event_call = __data;			\
+	struct trace_event_call *event_call = __data;			\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_raw_##call *entry;				\
 	struct pt_regs *__regs;						\
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 58df48c9d04e..7434f0f5d3f6 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -29,8 +29,8 @@ struct syscall_metadata {
 	const char	**args;
 	struct list_head enter_fields;
 
-	struct ftrace_event_call *enter_event;
-	struct ftrace_event_call *exit_event;
+	struct trace_event_call *enter_event;
+	struct trace_event_call *exit_event;
 };
 
 #if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS)
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index c0b94728758b..c6f826136b8c 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -101,11 +101,11 @@ TRACE_MAKE_SYSTEM_STR();
 		char			__data[0];			\
 	};								\
 									\
-	static struct ftrace_event_class event_class_##name;
+	static struct trace_event_class event_class_##name;
 
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, name, proto, args)	\
-	static struct ftrace_event_call	__used		\
+	static struct trace_event_call	__used		\
 	__attribute__((__aligned__(4))) event_##name
 
 #undef DEFINE_EVENT_FN
@@ -407,7 +407,7 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = {	\
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print)	\
 static int notrace __init						\
-ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\
+ftrace_define_fields_##call(struct trace_event_call *event_call)	\
 {									\
 	struct ftrace_raw_##call field;					\
 	int ret;							\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd29e9b6b30e..07ff08661167 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -311,7 +311,7 @@ int filter_check_discard(struct trace_event_file *file, void *rec,
 }
 EXPORT_SYMBOL_GPL(filter_check_discard);
 
-int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
+int call_filter_check_discard(struct trace_event_call *call, void *rec,
 			      struct ring_buffer *buffer,
 			      struct ring_buffer_event *event)
 {
@@ -1761,7 +1761,7 @@ trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       int pc)
 {
-	struct ftrace_event_call *call = &event_function;
+	struct trace_event_call *call = &event_function;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
@@ -1796,7 +1796,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 				 unsigned long flags,
 				 int skip, int pc, struct pt_regs *regs)
 {
-	struct ftrace_event_call *call = &event_kernel_stack;
+	struct trace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
@@ -1924,7 +1924,7 @@ static DEFINE_PER_CPU(int, user_stack_count);
 void
 ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 {
-	struct ftrace_event_call *call = &event_user_stack;
+	struct trace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
@@ -2130,7 +2130,7 @@ static void trace_printk_start_stop_comm(int enabled)
  */
 int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 {
-	struct ftrace_event_call *call = &event_bprint;
+	struct trace_event_call *call = &event_bprint;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
 	struct trace_array *tr = &global_trace;
@@ -2188,7 +2188,7 @@ static int
 __trace_array_vprintk(struct ring_buffer *buffer,
 		      unsigned long ip, const char *fmt, va_list args)
 {
-	struct ftrace_event_call *call = &event_print;
+	struct trace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
 	int len = 0, size, pc;
 	struct print_entry *entry;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4e1715e55b38..64de3837c383 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -858,7 +858,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops);
 #define ftrace_destroy_filter_files(ops) do { } while (0)
 #endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
 
-int ftrace_event_is_function(struct ftrace_event_call *call);
+int ftrace_event_is_function(struct trace_event_call *call);
 
 /*
  * struct trace_parser - servers for reading the user input separated by spaces
@@ -1061,13 +1061,13 @@ extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
 extern void print_subsystem_event_filter(struct event_subsystem *system,
 					 struct trace_seq *s);
 extern int filter_assign_type(const char *type);
-extern int create_event_filter(struct ftrace_event_call *call,
+extern int create_event_filter(struct trace_event_call *call,
 			       char *filter_str, bool set_str,
 			       struct event_filter **filterp);
 extern void free_event_filter(struct event_filter *filter);
 
 struct ftrace_event_field *
-trace_find_event_field(struct ftrace_event_call *call, char *name);
+trace_find_event_field(struct trace_event_call *call, char *name);
 
 extern void trace_event_enable_cmd_record(bool enable);
 extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
@@ -1286,7 +1286,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter)	\
-	extern struct ftrace_event_call					\
+	extern struct trace_event_call					\
 	__aligned(4) event_##call;
 #undef FTRACE_ENTRY_DUP
 #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter)	\
@@ -1295,7 +1295,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 #include "trace_entries.h"
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER)
-int perf_ftrace_event_register(struct ftrace_event_call *call,
+int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data);
 #else
 #define perf_ftrace_event_register NULL
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index bdfcb44d5d4a..a87b43f49eb4 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -29,7 +29,7 @@ static struct trace_array *branch_tracer;
 static void
 probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
-	struct ftrace_event_call *call = &event_branch;
+	struct trace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
 	struct trace_array_cpu *data;
 	struct ring_buffer_event *event;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 6fa484de2ba1..abfc903e741e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,7 +21,7 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
 /* Count the events in use (per event id, not per instance) */
 static int	total_ref_count;
 
-static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+static int perf_trace_event_perm(struct trace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
 	if (tp_event->perf_perm) {
@@ -83,7 +83,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
 	return 0;
 }
 
-static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
+static int perf_trace_event_reg(struct trace_event_call *tp_event,
 				struct perf_event *p_event)
 {
 	struct hlist_head __percpu *list;
@@ -143,7 +143,7 @@ fail:
 
 static void perf_trace_event_unreg(struct perf_event *p_event)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	int i;
 
 	if (--tp_event->perf_refcount > 0)
@@ -172,17 +172,17 @@ out:
 
 static int perf_trace_event_open(struct perf_event *p_event)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
 }
 
 static void perf_trace_event_close(struct perf_event *p_event)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
 }
 
-static int perf_trace_event_init(struct ftrace_event_call *tp_event,
+static int perf_trace_event_init(struct trace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
 	int ret;
@@ -206,7 +206,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
 
 int perf_trace_init(struct perf_event *p_event)
 {
-	struct ftrace_event_call *tp_event;
+	struct trace_event_call *tp_event;
 	u64 event_id = p_event->attr.config;
 	int ret = -EINVAL;
 
@@ -236,7 +236,7 @@ void perf_trace_destroy(struct perf_event *p_event)
 
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	struct hlist_head __percpu *pcpu_list;
 	struct hlist_head *list;
 
@@ -255,7 +255,7 @@ int perf_trace_add(struct perf_event *p_event, int flags)
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
-	struct ftrace_event_call *tp_event = p_event->tp_event;
+	struct trace_event_call *tp_event = p_event->tp_event;
 	hlist_del_rcu(&p_event->hlist_entry);
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
@@ -357,7 +357,7 @@ static void perf_ftrace_function_disable(struct perf_event *event)
 	ftrace_function_local_disable(&event->ftrace_ops);
 }
 
-int perf_ftrace_event_register(struct ftrace_event_call *call,
+int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data)
 {
 	switch (type) {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4a7cc4630ced..8df615ce3dc4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -68,7 +68,7 @@ static int system_refcount_dec(struct event_subsystem *system)
 	}
 
 static struct list_head *
-trace_get_fields(struct ftrace_event_call *event_call)
+trace_get_fields(struct trace_event_call *event_call)
 {
 	if (!event_call->class->get_fields)
 		return &event_call->class->fields;
@@ -89,7 +89,7 @@ __find_event_field(struct list_head *head, char *name)
 }
 
 struct ftrace_event_field *
-trace_find_event_field(struct ftrace_event_call *call, char *name)
+trace_find_event_field(struct trace_event_call *call, char *name)
 {
 	struct ftrace_event_field *field;
 	struct list_head *head;
@@ -129,7 +129,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
 	return 0;
 }
 
-int trace_define_field(struct ftrace_event_call *call, const char *type,
+int trace_define_field(struct trace_event_call *call, const char *type,
 		       const char *name, int offset, int size, int is_signed,
 		       int filter_type)
 {
@@ -166,7 +166,7 @@ static int trace_define_common_fields(void)
 	return ret;
 }
 
-static void trace_destroy_fields(struct ftrace_event_call *call)
+static void trace_destroy_fields(struct trace_event_call *call)
 {
 	struct ftrace_event_field *field, *next;
 	struct list_head *head;
@@ -178,7 +178,7 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
 	}
 }
 
-int trace_event_raw_init(struct ftrace_event_call *call)
+int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;
 
@@ -194,7 +194,7 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
 				  struct trace_event_file *trace_file,
 				  unsigned long len)
 {
-	struct ftrace_event_call *event_call = trace_file->event_call;
+	struct trace_event_call *event_call = trace_file->event_call;
 
 	local_save_flags(fbuffer->flags);
 	fbuffer->pc = preempt_count();
@@ -216,7 +216,7 @@ static DEFINE_SPINLOCK(tracepoint_iter_lock);
 
 static void output_printk(struct ftrace_event_buffer *fbuffer)
 {
-	struct ftrace_event_call *event_call;
+	struct trace_event_call *event_call;
 	struct trace_event *event;
 	unsigned long flags;
 	struct trace_iterator *iter = tracepoint_print_iter;
@@ -252,7 +252,7 @@ void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
 }
 EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
 
-int trace_event_reg(struct ftrace_event_call *call,
+int trace_event_reg(struct trace_event_call *call,
 		    enum trace_reg type, void *data)
 {
 	struct trace_event_file *file = data;
@@ -315,7 +315,7 @@ void trace_event_enable_cmd_record(bool enable)
 static int __ftrace_event_enable_disable(struct trace_event_file *file,
 					 int enable, int soft_disable)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 	int ret = 0;
 	int disable;
 
@@ -516,7 +516,7 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
 			      const char *sub, const char *event, int set)
 {
 	struct trace_event_file *file;
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	const char *name;
 	int ret = -EINVAL;
 
@@ -672,7 +672,7 @@ static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct trace_event_file *file = v;
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	struct trace_array *tr = m->private;
 
 	(*pos)++;
@@ -743,7 +743,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 static int t_show(struct seq_file *m, void *v)
 {
 	struct trace_event_file *file = v;
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
 		seq_printf(m, "%s:", call->class->system);
@@ -830,7 +830,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	const char set_to_char[4] = { '?', '0', '1', 'X' };
 	struct ftrace_subsystem_dir *dir = filp->private_data;
 	struct event_subsystem *system = dir->subsystem;
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	struct trace_event_file *file;
 	struct trace_array *tr = dir->tr;
 	char buf[2];
@@ -917,7 +917,7 @@ enum {
 
 static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = event_file_data(m->private);
+	struct trace_event_call *call = event_file_data(m->private);
 	struct list_head *common_head = &ftrace_common_fields;
 	struct list_head *head = trace_get_fields(call);
 	struct list_head *node = v;
@@ -949,7 +949,7 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 
 static int f_show(struct seq_file *m, void *v)
 {
-	struct ftrace_event_call *call = event_file_data(m->private);
+	struct trace_event_call *call = event_file_data(m->private);
 	struct ftrace_event_field *field;
 	const char *array_descriptor;
 
@@ -1573,7 +1573,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 static int
 event_create_dir(struct dentry *parent, struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 	struct trace_array *tr = file->tr;
 	struct list_head *head;
 	struct dentry *d_events;
@@ -1634,7 +1634,7 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	return 0;
 }
 
-static void remove_event_from_tracers(struct ftrace_event_call *call)
+static void remove_event_from_tracers(struct trace_event_call *call)
 {
 	struct trace_event_file *file;
 	struct trace_array *tr;
@@ -1654,7 +1654,7 @@ static void remove_event_from_tracers(struct ftrace_event_call *call)
 	} while_for_each_event_file();
 }
 
-static void event_remove(struct ftrace_event_call *call)
+static void event_remove(struct trace_event_call *call)
 {
 	struct trace_array *tr;
 	struct trace_event_file *file;
@@ -1678,7 +1678,7 @@ static void event_remove(struct ftrace_event_call *call)
 	list_del(&call->list);
 }
 
-static int event_init(struct ftrace_event_call *call)
+static int event_init(struct trace_event_call *call)
 {
 	int ret = 0;
 	const char *name;
@@ -1697,7 +1697,7 @@ static int event_init(struct ftrace_event_call *call)
 }
 
 static int
-__register_event(struct ftrace_event_call *call, struct module *mod)
+__register_event(struct trace_event_call *call, struct module *mod)
 {
 	int ret;
 
@@ -1733,7 +1733,7 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
 	return ptr + elen;
 }
 
-static void update_event_printk(struct ftrace_event_call *call,
+static void update_event_printk(struct trace_event_call *call,
 				struct trace_enum_map *map)
 {
 	char *ptr;
@@ -1811,7 +1811,7 @@ static void update_event_printk(struct ftrace_event_call *call,
 
 void trace_event_enum_update(struct trace_enum_map **map, int len)
 {
-	struct ftrace_event_call *call, *p;
+	struct trace_event_call *call, *p;
 	const char *last_system = NULL;
 	int last_i;
 	int i;
@@ -1837,7 +1837,7 @@ void trace_event_enum_update(struct trace_enum_map **map, int len)
 }
 
 static struct trace_event_file *
-trace_create_new_event(struct ftrace_event_call *call,
+trace_create_new_event(struct trace_event_call *call,
 		       struct trace_array *tr)
 {
 	struct trace_event_file *file;
@@ -1858,7 +1858,7 @@ trace_create_new_event(struct ftrace_event_call *call,
 
 /* Add an event to a trace directory */
 static int
-__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
+__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
 {
 	struct trace_event_file *file;
 
@@ -1875,7 +1875,7 @@ __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
  * the filesystem is initialized.
  */
 static __init int
-__trace_early_add_new_event(struct ftrace_event_call *call,
+__trace_early_add_new_event(struct trace_event_call *call,
 			    struct trace_array *tr)
 {
 	struct trace_event_file *file;
@@ -1888,10 +1888,10 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
 }
 
 struct ftrace_module_file_ops;
-static void __add_event_to_tracers(struct ftrace_event_call *call);
+static void __add_event_to_tracers(struct trace_event_call *call);
 
 /* Add an additional event_call dynamically */
-int trace_add_event_call(struct ftrace_event_call *call)
+int trace_add_event_call(struct trace_event_call *call)
 {
 	int ret;
 	mutex_lock(&trace_types_lock);
@@ -1910,7 +1910,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
  * Must be called under locking of trace_types_lock, event_mutex and
  * trace_event_sem.
  */
-static void __trace_remove_event_call(struct ftrace_event_call *call)
+static void __trace_remove_event_call(struct trace_event_call *call)
 {
 	event_remove(call);
 	trace_destroy_fields(call);
@@ -1918,7 +1918,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
 	call->filter = NULL;
 }
 
-static int probe_remove_event_call(struct ftrace_event_call *call)
+static int probe_remove_event_call(struct trace_event_call *call)
 {
 	struct trace_array *tr;
 	struct trace_event_file *file;
@@ -1952,7 +1952,7 @@ static int probe_remove_event_call(struct ftrace_event_call *call)
 }
 
 /* Remove an event_call */
-int trace_remove_event_call(struct ftrace_event_call *call)
+int trace_remove_event_call(struct trace_event_call *call)
 {
 	int ret;
 
@@ -1976,7 +1976,7 @@ int trace_remove_event_call(struct ftrace_event_call *call)
 
 static void trace_module_add_events(struct module *mod)
 {
-	struct ftrace_event_call **call, **start, **end;
+	struct trace_event_call **call, **start, **end;
 
 	if (!mod->num_trace_events)
 		return;
@@ -1999,7 +1999,7 @@ static void trace_module_add_events(struct module *mod)
 
 static void trace_module_remove_events(struct module *mod)
 {
-	struct ftrace_event_call *call, *p;
+	struct trace_event_call *call, *p;
 	bool clear_trace = false;
 
 	down_write(&trace_event_sem);
@@ -2055,7 +2055,7 @@ static struct notifier_block trace_module_nb = {
 static void
 __trace_add_event_dirs(struct trace_array *tr)
 {
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	int ret;
 
 	list_for_each_entry(call, &ftrace_events, list) {
@@ -2070,7 +2070,7 @@ struct trace_event_file *
 find_event_file(struct trace_array *tr, const char *system,  const char *event)
 {
 	struct trace_event_file *file;
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	const char *name;
 
 	list_for_each_entry(file, &tr->events, list) {
@@ -2388,7 +2388,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
 static __init void
 __trace_early_add_events(struct trace_array *tr)
 {
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	int ret;
 
 	list_for_each_entry(call, &ftrace_events, list) {
@@ -2413,7 +2413,7 @@ __trace_remove_event_dirs(struct trace_array *tr)
 		remove_event_file_dir(file);
 }
 
-static void __add_event_to_tracers(struct ftrace_event_call *call)
+static void __add_event_to_tracers(struct trace_event_call *call)
 {
 	struct trace_array *tr;
 
@@ -2421,8 +2421,8 @@ static void __add_event_to_tracers(struct ftrace_event_call *call)
 		__trace_add_new_event(call, tr);
 }
 
-extern struct ftrace_event_call *__start_ftrace_events[];
-extern struct ftrace_event_call *__stop_ftrace_events[];
+extern struct trace_event_call *__start_ftrace_events[];
+extern struct trace_event_call *__stop_ftrace_events[];
 
 static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
 
@@ -2593,7 +2593,7 @@ early_enable_events(struct trace_array *tr, bool disable_first)
 static __init int event_trace_enable(void)
 {
 	struct trace_array *tr = top_trace_array();
-	struct ftrace_event_call **iter, *call;
+	struct trace_event_call **iter, *call;
 	int ret;
 
 	if (!tr)
@@ -2756,7 +2756,7 @@ static __init void event_trace_self_tests(void)
 {
 	struct ftrace_subsystem_dir *dir;
 	struct trace_event_file *file;
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 	struct event_subsystem *system;
 	struct trace_array *tr;
 	int ret;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 13ad0a87d31e..d535f3bf2aa2 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -782,7 +782,7 @@ static void __free_preds(struct event_filter *filter)
 
 static void filter_disable(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags &= ~TRACE_EVENT_FL_FILTERED;
@@ -839,7 +839,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 
 static inline void __remove_filter(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	filter_disable(file);
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
@@ -862,7 +862,7 @@ static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
 
 static inline void __free_subsystem_filter(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
 		__free_filter(call->filter);
@@ -1336,7 +1336,7 @@ parse_operand:
 }
 
 static struct filter_pred *create_pred(struct filter_parse_state *ps,
-				       struct ftrace_event_call *call,
+				       struct trace_event_call *call,
 				       int op, char *operand1, char *operand2)
 {
 	struct ftrace_event_field *field;
@@ -1549,7 +1549,7 @@ static int fold_pred_tree(struct event_filter *filter,
 			      filter->preds);
 }
 
-static int replace_preds(struct ftrace_event_call *call,
+static int replace_preds(struct trace_event_call *call,
 			 struct event_filter *filter,
 			 struct filter_parse_state *ps,
 			 bool dry_run)
@@ -1664,7 +1664,7 @@ fail:
 
 static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags |= TRACE_EVENT_FL_FILTERED;
@@ -1675,7 +1675,7 @@ static inline void event_set_filtered_flag(struct trace_event_file *file)
 static inline void event_set_filter(struct trace_event_file *file,
 				    struct event_filter *filter)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		rcu_assign_pointer(call->filter, filter);
@@ -1685,7 +1685,7 @@ static inline void event_set_filter(struct trace_event_file *file,
 
 static inline void event_clear_filter(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		RCU_INIT_POINTER(call->filter, NULL);
@@ -1696,7 +1696,7 @@ static inline void event_clear_filter(struct trace_event_file *file)
 static inline void
 event_set_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
@@ -1707,7 +1707,7 @@ event_set_no_set_filter_flag(struct trace_event_file *file)
 static inline void
 event_clear_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
@@ -1718,7 +1718,7 @@ event_clear_no_set_filter_flag(struct trace_event_file *file)
 static inline bool
 event_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 
 	if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
 		return true;
@@ -1884,8 +1884,8 @@ static void create_filter_finish(struct filter_parse_state *ps)
 }
 
 /**
- * create_filter - create a filter for a ftrace_event_call
- * @call: ftrace_event_call to create a filter for
+ * create_filter - create a filter for a trace_event_call
+ * @call: trace_event_call to create a filter for
  * @filter_str: filter string
  * @set_str: remember @filter_str and enable detailed error in filter
  * @filterp: out param for created filter (always updated on return)
@@ -1899,7 +1899,7 @@ static void create_filter_finish(struct filter_parse_state *ps)
  * information if @set_str is %true and the caller is responsible for
  * freeing it.
  */
-static int create_filter(struct ftrace_event_call *call,
+static int create_filter(struct trace_event_call *call,
 			 char *filter_str, bool set_str,
 			 struct event_filter **filterp)
 {
@@ -1919,7 +1919,7 @@ static int create_filter(struct ftrace_event_call *call,
 	return err;
 }
 
-int create_event_filter(struct ftrace_event_call *call,
+int create_event_filter(struct trace_event_call *call,
 			char *filter_str, bool set_str,
 			struct event_filter **filterp)
 {
@@ -1963,7 +1963,7 @@ static int create_system_filter(struct ftrace_subsystem_dir *dir,
 /* caller must hold event_mutex */
 int apply_event_filter(struct trace_event_file *file, char *filter_string)
 {
-	struct ftrace_event_call *call = file->event_call;
+	struct trace_event_call *call = file->event_call;
 	struct event_filter *filter;
 	int err;
 
@@ -2212,7 +2212,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 {
 	int err;
 	struct event_filter *filter;
-	struct ftrace_event_call *call;
+	struct trace_event_call *call;
 
 	mutex_lock(&event_mutex);
 
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 174a6a71146c..adabf7da9113 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -125,7 +125,7 @@ static void __always_unused ____ftrace_check_##name(void)		\
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter)	\
 static int __init							\
-ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
+ftrace_define_fields_##name(struct trace_event_call *event_call)	\
 {									\
 	struct struct_name field;					\
 	int ret;							\
@@ -163,14 +163,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
 			 regfn)						\
 									\
-struct ftrace_event_class __refdata event_class_ftrace_##call = {	\
+struct trace_event_class __refdata event_class_ftrace_##call = {	\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.define_fields		= ftrace_define_fields_##call,		\
 	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
 	.reg			= regfn,				\
 };									\
 									\
-struct ftrace_event_call __used event_##call = {			\
+struct trace_event_call __used event_##call = {				\
 	.class			= &event_class_ftrace_##call,		\
 	{								\
 		.name			= #call,			\
@@ -179,7 +179,7 @@ struct ftrace_event_call __used event_##call = {			\
 	.print_fmt		= print,				\
 	.flags			= TRACE_EVENT_FL_IGNORE_ENABLE,		\
 };									\
-struct ftrace_event_call __used						\
+struct trace_event_call __used						\
 __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
 
 #undef FTRACE_ENTRY
@@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
 	FTRACE_ENTRY_REG(call, struct_name, etype,			\
 			 PARAMS(tstruct), PARAMS(print), filter, NULL)
 
-int ftrace_event_is_function(struct ftrace_event_call *call)
+int ftrace_event_is_function(struct trace_event_call *call)
 {
 	return call == &event_function;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 32e76a21b8d4..8968bf720c12 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -278,7 +278,7 @@ int __trace_graph_entry(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
-	struct ftrace_event_call *call = &event_funcgraph_entry;
+	struct trace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ftrace_graph_ent_entry *entry;
@@ -393,7 +393,7 @@ void __trace_graph_return(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
-	struct ftrace_event_call *call = &event_funcgraph_exit;
+	struct trace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ftrace_graph_ret_entry *entry;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 87c63d039b9d..6ecb1a49af7b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -924,7 +924,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 	struct ring_buffer *buffer;
 	int size, dsize, pc;
 	unsigned long irq_flags;
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 
 	WARN_ON(call != trace_file->event_call);
 
@@ -972,7 +972,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	struct ring_buffer *buffer;
 	int size, pc, dsize;
 	unsigned long irq_flags;
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 
 	WARN_ON(call != trace_file->event_call);
 
@@ -1081,7 +1081,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
 }
 
 
-static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int kprobe_event_define_fields(struct trace_event_call *event_call)
 {
 	int ret, i;
 	struct kprobe_trace_entry_head field;
@@ -1104,7 +1104,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
 	return 0;
 }
 
-static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 {
 	int ret, i;
 	struct kretprobe_trace_entry_head field;
@@ -1134,7 +1134,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
 static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
@@ -1169,7 +1169,7 @@ static void
 kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
@@ -1206,7 +1206,7 @@ NOKPROBE_SYMBOL(kretprobe_perf_func);
  * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
  * lockless, but we can't race with this __init function.
  */
-static int kprobe_register(struct ftrace_event_call *event,
+static int kprobe_register(struct trace_event_call *event,
 			   enum trace_reg type, void *data)
 {
 	struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
@@ -1276,10 +1276,10 @@ static struct trace_event_functions kprobe_funcs = {
 
 static int register_kprobe_event(struct trace_kprobe *tk)
 {
-	struct ftrace_event_call *call = &tk->tp.call;
+	struct trace_event_call *call = &tk->tp.call;
 	int ret;
 
-	/* Initialize ftrace_event_call */
+	/* Initialize trace_event_call */
 	INIT_LIST_HEAD(&call->class->fields);
 	if (trace_kprobe_is_return(tk)) {
 		call->event.funcs = &kretprobe_funcs;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 7a9ba62e9fef..638e110c5bfd 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -298,7 +298,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_rw *rw)
 {
-	struct ftrace_event_call *call = &event_mmiotrace_rw;
+	struct trace_event_call *call = &event_mmiotrace_rw;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
@@ -328,7 +328,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_map *map)
 {
-	struct ftrace_event_call *call = &event_mmiotrace_map;
+	struct trace_event_call *call = &event_mmiotrace_map;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6469906e890d..21c6525efcef 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -225,12 +225,12 @@ EXPORT_SYMBOL(trace_print_array_seq);
 int ftrace_raw_output_prep(struct trace_iterator *iter,
 			   struct trace_event *trace_event)
 {
-	struct ftrace_event_call *event;
+	struct trace_event_call *event;
 	struct trace_seq *s = &iter->seq;
 	struct trace_seq *p = &iter->tmp_seq;
 	struct trace_entry *entry;
 
-	event = container_of(trace_event, struct ftrace_event_call, event);
+	event = container_of(trace_event, struct trace_event_call, event);
 	entry = iter->ent;
 
 	if (entry->type != event->event.type) {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 9fcdfbbcd8b5..b98dee914542 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -272,8 +272,8 @@ struct probe_arg {
 
 struct trace_probe {
 	unsigned int			flags;	/* For TP_FLAG_* */
-	struct ftrace_event_class	class;
-	struct ftrace_event_call	call;
+	struct trace_event_class	class;
+	struct trace_event_call		call;
 	struct list_head 		files;
 	ssize_t				size;	/* trace entry size */
 	unsigned int			nr_args;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index d6e1003724e9..9b33dd117f3f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -369,7 +369,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
 {
-	struct ftrace_event_call *call = &event_context_switch;
+	struct trace_event_call *call = &event_context_switch;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
@@ -397,7 +397,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *curr,
 			   unsigned long flags, int pc)
 {
-	struct ftrace_event_call *call = &event_wakeup;
+	struct trace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 	struct ring_buffer *buffer = tr->trace_buffer.buffer;
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee4525261e82..504f582b15db 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -13,13 +13,13 @@
 
 static DEFINE_MUTEX(syscall_trace_lock);
 
-static int syscall_enter_register(struct ftrace_event_call *event,
+static int syscall_enter_register(struct trace_event_call *event,
 				 enum trace_reg type, void *data);
-static int syscall_exit_register(struct ftrace_event_call *event,
+static int syscall_exit_register(struct trace_event_call *event,
 				 enum trace_reg type, void *data);
 
 static struct list_head *
-syscall_get_enter_fields(struct ftrace_event_call *call)
+syscall_get_enter_fields(struct trace_event_call *call)
 {
 	struct syscall_metadata *entry = call->data;
 
@@ -219,7 +219,7 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	return pos;
 }
 
-static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
+static int __init set_syscall_print_fmt(struct trace_event_call *call)
 {
 	char *print_fmt;
 	int len;
@@ -244,7 +244,7 @@ static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
 	return 0;
 }
 
-static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
+static void __init free_syscall_print_fmt(struct trace_event_call *call)
 {
 	struct syscall_metadata *entry = call->data;
 
@@ -252,7 +252,7 @@ static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
 		kfree(call->print_fmt);
 }
 
-static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
+static int __init syscall_enter_define_fields(struct trace_event_call *call)
 {
 	struct syscall_trace_enter trace;
 	struct syscall_metadata *meta = call->data;
@@ -275,7 +275,7 @@ static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
 	return ret;
 }
 
-static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
+static int __init syscall_exit_define_fields(struct trace_event_call *call)
 {
 	struct syscall_trace_exit trace;
 	int ret;
@@ -385,7 +385,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 }
 
 static int reg_event_syscall_enter(struct trace_event_file *file,
-				   struct ftrace_event_call *call)
+				   struct trace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
 	int ret = 0;
@@ -406,7 +406,7 @@ static int reg_event_syscall_enter(struct trace_event_file *file,
 }
 
 static void unreg_event_syscall_enter(struct trace_event_file *file,
-				      struct ftrace_event_call *call)
+				      struct trace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
 	int num;
@@ -423,7 +423,7 @@ static void unreg_event_syscall_enter(struct trace_event_file *file,
 }
 
 static int reg_event_syscall_exit(struct trace_event_file *file,
-				  struct ftrace_event_call *call)
+				  struct trace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
 	int ret = 0;
@@ -444,7 +444,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file,
 }
 
 static void unreg_event_syscall_exit(struct trace_event_file *file,
-				     struct ftrace_event_call *call)
+				     struct trace_event_call *call)
 {
 	struct trace_array *tr = file->tr;
 	int num;
@@ -460,7 +460,7 @@ static void unreg_event_syscall_exit(struct trace_event_file *file,
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static int __init init_syscall_trace(struct ftrace_event_call *call)
+static int __init init_syscall_trace(struct trace_event_call *call)
 {
 	int id;
 	int num;
@@ -493,7 +493,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
 	.trace		= print_syscall_exit,
 };
 
-struct ftrace_event_class __refdata event_class_syscall_enter = {
+struct trace_event_class __refdata event_class_syscall_enter = {
 	.system		= "syscalls",
 	.reg		= syscall_enter_register,
 	.define_fields	= syscall_enter_define_fields,
@@ -501,7 +501,7 @@ struct ftrace_event_class __refdata event_class_syscall_enter = {
 	.raw_init	= init_syscall_trace,
 };
 
-struct ftrace_event_class __refdata event_class_syscall_exit = {
+struct trace_event_class __refdata event_class_syscall_exit = {
 	.system		= "syscalls",
 	.reg		= syscall_exit_register,
 	.define_fields	= syscall_exit_define_fields,
@@ -584,7 +584,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 
-static int perf_sysenter_enable(struct ftrace_event_call *call)
+static int perf_sysenter_enable(struct trace_event_call *call)
 {
 	int ret = 0;
 	int num;
@@ -605,7 +605,7 @@ static int perf_sysenter_enable(struct ftrace_event_call *call)
 	return ret;
 }
 
-static void perf_sysenter_disable(struct ftrace_event_call *call)
+static void perf_sysenter_disable(struct trace_event_call *call)
 {
 	int num;
 
@@ -656,7 +656,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 
-static int perf_sysexit_enable(struct ftrace_event_call *call)
+static int perf_sysexit_enable(struct trace_event_call *call)
 {
 	int ret = 0;
 	int num;
@@ -677,7 +677,7 @@ static int perf_sysexit_enable(struct ftrace_event_call *call)
 	return ret;
 }
 
-static void perf_sysexit_disable(struct ftrace_event_call *call)
+static void perf_sysexit_disable(struct trace_event_call *call)
 {
 	int num;
 
@@ -693,7 +693,7 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
 
 #endif /* CONFIG_PERF_EVENTS */
 
-static int syscall_enter_register(struct ftrace_event_call *event,
+static int syscall_enter_register(struct trace_event_call *event,
 				 enum trace_reg type, void *data)
 {
 	struct trace_event_file *file = data;
@@ -721,7 +721,7 @@ static int syscall_enter_register(struct ftrace_event_call *event,
 	return 0;
 }
 
-static int syscall_exit_register(struct ftrace_event_call *event,
+static int syscall_exit_register(struct trace_event_call *event,
 				 enum trace_reg type, void *data)
 {
 	struct trace_event_file *file = data;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3f61ec4a3164..68ba5da5d9e0 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -777,7 +777,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 	struct ring_buffer *buffer;
 	void *data;
 	int size, esize;
-	struct ftrace_event_call *call = &tu->tp.call;
+	struct trace_event_call *call = &tu->tp.call;
 
 	WARN_ON(call != trace_file->event_call);
 
@@ -967,7 +967,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
 	uprobe_buffer_disable();
 }
 
-static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
+static int uprobe_event_define_fields(struct trace_event_call *event_call)
 {
 	int ret, i, size;
 	struct uprobe_trace_entry_head field;
@@ -1093,7 +1093,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 			       unsigned long func, struct pt_regs *regs,
 			       struct uprobe_cpu_buffer *ucb, int dsize)
 {
-	struct ftrace_event_call *call = &tu->tp.call;
+	struct trace_event_call *call = &tu->tp.call;
 	struct uprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	void *data;
@@ -1159,7 +1159,7 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
 #endif	/* CONFIG_PERF_EVENTS */
 
 static int
-trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
+trace_uprobe_register(struct trace_event_call *event, enum trace_reg type,
 		      void *data)
 {
 	struct trace_uprobe *tu = event->data;
@@ -1272,10 +1272,10 @@ static struct trace_event_functions uprobe_funcs = {
 
 static int register_uprobe_event(struct trace_uprobe *tu)
 {
-	struct ftrace_event_call *call = &tu->tp.call;
+	struct trace_event_call *call = &tu->tp.call;
 	int ret;
 
-	/* Initialize ftrace_event_call */
+	/* Initialize trace_event_call */
 	INIT_LIST_HEAD(&call->class->fields);
 	call->event.funcs = &uprobe_funcs;
 	call->class->define_fields = uprobe_event_define_fields;
-- 
cgit v1.2.3


From 3f795dcfc7364cd811c3f6f03d115fcefbbdc1ca Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 5 May 2015 13:18:46 -0400
Subject: tracing: Rename ftrace_event_buffer to trace_event_buffer.

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The ftrace_event_buffer functions and data
structures are for trace_events and not for function hooks. Rename them
to trace_event_buffer*.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  6 +++---
 include/trace/perf.h         |  6 +++---
 kernel/trace/trace_events.c  | 14 +++++++-------
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index d10ab04a17b2..a1fa8ebaf684 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -219,7 +219,7 @@ struct trace_event_class {
 extern int trace_event_reg(struct trace_event_call *event,
 			    enum trace_reg type, void *data);
 
-struct ftrace_event_buffer {
+struct trace_event_buffer {
 	struct ring_buffer		*buffer;
 	struct ring_buffer_event	*event;
 	struct trace_event_file		*trace_file;
@@ -228,11 +228,11 @@ struct ftrace_event_buffer {
 	int				pc;
 };
 
-void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
+void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
 				  struct trace_event_file *trace_file,
 				  unsigned long len);
 
-void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer);
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer);
 
 enum {
 	TRACE_EVENT_FL_FILTERED_BIT,
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 1d10c2d2b2c4..b1d7399df449 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -155,7 +155,7 @@ ftrace_raw_event_##call(void *__data, proto)				\
 {									\
 	struct trace_event_file *trace_file = __data;			\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
-	struct ftrace_event_buffer fbuffer;				\
+	struct trace_event_buffer fbuffer;				\
 	struct ftrace_raw_##call *entry;				\
 	int __data_size;						\
 									\
@@ -164,7 +164,7 @@ ftrace_raw_event_##call(void *__data, proto)				\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
-	entry = ftrace_event_buffer_reserve(&fbuffer, trace_file,	\
+	entry = trace_event_buffer_reserve(&fbuffer, trace_file,	\
 				 sizeof(*entry) + __data_size);		\
 									\
 	if (!entry)							\
@@ -174,7 +174,7 @@ ftrace_raw_event_##call(void *__data, proto)				\
 									\
 	{ assign; }							\
 									\
-	ftrace_event_buffer_commit(&fbuffer);				\
+	trace_event_buffer_commit(&fbuffer);				\
 }
 /*
  * The ftrace_test_probe is compiled out, it is only here as a build time check
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8df615ce3dc4..e5638c43b04d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -190,9 +190,9 @@ int trace_event_raw_init(struct trace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(trace_event_raw_init);
 
-void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
-				  struct trace_event_file *trace_file,
-				  unsigned long len)
+void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
+				 struct trace_event_file *trace_file,
+				 unsigned long len)
 {
 	struct trace_event_call *event_call = trace_file->event_call;
 
@@ -210,11 +210,11 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer,
 	fbuffer->entry = ring_buffer_event_data(fbuffer->event);
 	return fbuffer->entry;
 }
-EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve);
+EXPORT_SYMBOL_GPL(trace_event_buffer_reserve);
 
 static DEFINE_SPINLOCK(tracepoint_iter_lock);
 
-static void output_printk(struct ftrace_event_buffer *fbuffer)
+static void output_printk(struct trace_event_buffer *fbuffer)
 {
 	struct trace_event_call *event_call;
 	struct trace_event *event;
@@ -241,7 +241,7 @@ static void output_printk(struct ftrace_event_buffer *fbuffer)
 	spin_unlock_irqrestore(&tracepoint_iter_lock, flags);
 }
 
-void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
+void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
 {
 	if (tracepoint_printk)
 		output_printk(fbuffer);
@@ -250,7 +250,7 @@ void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer)
 				    fbuffer->event, fbuffer->entry,
 				    fbuffer->flags, fbuffer->pc);
 }
-EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit);
+EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
 
 int trace_event_reg(struct trace_event_call *call,
 		    enum trace_reg type, void *data)
-- 
cgit v1.2.3


From 892c505aac2bdded3c8ec2ec27abc6d74fd210f5 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 5 May 2015 14:18:11 -0400
Subject: tracing: Rename ftrace_output functions to trace_output

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The ftrace_output_*() and ftrace_raw_output_*()
functions represent the trace_event code. Rename them to just trace_output
or trace_raw_output.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  6 +++---
 include/trace/perf.h         |  2 +-
 include/trace/trace_events.h | 16 ++++++++--------
 kernel/trace/trace_output.c  | 16 ++++++++--------
 4 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index a1fa8ebaf684..12ca46322a94 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -52,8 +52,8 @@ const char *trace_print_array_seq(struct trace_seq *p,
 struct trace_iterator;
 struct trace_event;
 
-int ftrace_raw_output_prep(struct trace_iterator *iter,
-			   struct trace_event *event);
+int trace_raw_output_prep(struct trace_iterator *iter,
+			  struct trace_event *event);
 
 /*
  * The trace entry - the most basic unit of tracing. This is what
@@ -183,7 +183,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
 
 void tracing_record_cmdline(struct task_struct *tsk);
 
-int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...);
+int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...);
 
 struct event_filter;
 
diff --git a/include/trace/perf.h b/include/trace/perf.h
index b1d7399df449..0dbdbfe0ec41 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -58,7 +58,7 @@
  * }
  *
  * static struct trace_event ftrace_event_type_<call> = {
- *	.trace			= ftrace_raw_output_<call>, <-- stage 2
+ *	.trace			= trace_raw_output_<call>, <-- stage 2
  * };
  *
  * static char print_fmt_<call>[] = <TP_printk>;
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index c6f826136b8c..ab927dd32149 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -203,7 +203,7 @@ TRACE_MAKE_SYSTEM_STR();
  * Override the macros in <trace/trace_events.h> to include the following:
  *
  * enum print_line_t
- * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
+ * trace_raw_output_<call>(struct trace_iterator *iter, int flags)
  * {
  *	struct trace_seq *s = &iter->seq;
  *	struct ftrace_raw_<call> *field; <-- defined in stage 1
@@ -304,8 +304,8 @@ TRACE_MAKE_SYSTEM_STR();
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace enum print_line_t					\
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
-			 struct trace_event *trace_event)		\
+trace_raw_output_##call(struct trace_iterator *iter, int flags,		\
+			struct trace_event *trace_event)		\
 {									\
 	struct trace_seq *s = &iter->seq;				\
 	struct trace_seq __maybe_unused *p = &iter->tmp_seq;		\
@@ -314,7 +314,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
 									\
 	field = (typeof(field))iter->ent;				\
 									\
-	ret = ftrace_raw_output_prep(iter, trace_event);		\
+	ret = trace_raw_output_prep(iter, trace_event);			\
 	if (ret != TRACE_TYPE_HANDLED)					\
 		return ret;						\
 									\
@@ -323,13 +323,13 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
 	return trace_handle_return(s);					\
 }									\
 static struct trace_event_functions ftrace_event_type_funcs_##call = {	\
-	.trace			= ftrace_raw_output_##call,		\
+	.trace			= trace_raw_output_##call,		\
 };
 
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, call, proto, args, print)		\
 static notrace enum print_line_t					\
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
+trace_raw_output_##call(struct trace_iterator *iter, int flags,		\
 			 struct trace_event *event)			\
 {									\
 	struct ftrace_raw_##template *field;				\
@@ -346,10 +346,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
 	field = (typeof(field))entry;					\
 									\
 	trace_seq_init(p);						\
-	return ftrace_output_call(iter, #call, print);			\
+	return trace_output_call(iter, #call, print);			\
 }									\
 static struct trace_event_functions ftrace_event_type_funcs_##call = {	\
-	.trace			= ftrace_raw_output_##call,		\
+	.trace			= trace_raw_output_##call,		\
 };
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 21c6525efcef..d1a2feb54d06 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -222,8 +222,8 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count,
 }
 EXPORT_SYMBOL(trace_print_array_seq);
 
-int ftrace_raw_output_prep(struct trace_iterator *iter,
-			   struct trace_event *trace_event)
+int trace_raw_output_prep(struct trace_iterator *iter,
+			  struct trace_event *trace_event)
 {
 	struct trace_event_call *event;
 	struct trace_seq *s = &iter->seq;
@@ -243,10 +243,10 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
 
 	return trace_handle_return(s);
 }
-EXPORT_SYMBOL(ftrace_raw_output_prep);
+EXPORT_SYMBOL(trace_raw_output_prep);
 
-static int ftrace_output_raw(struct trace_iterator *iter, char *name,
-			     char *fmt, va_list ap)
+static int trace_output_raw(struct trace_iterator *iter, char *name,
+			    char *fmt, va_list ap)
 {
 	struct trace_seq *s = &iter->seq;
 
@@ -256,18 +256,18 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
 	return trace_handle_return(s);
 }
 
-int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
+int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
 {
 	va_list ap;
 	int ret;
 
 	va_start(ap, fmt);
-	ret = ftrace_output_raw(iter, name, fmt, ap);
+	ret = trace_output_raw(iter, name, fmt, ap);
 	va_end(ap);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(ftrace_output_call);
+EXPORT_SYMBOL_GPL(trace_output_call);
 
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
-- 
cgit v1.2.3


From 609a74045238c303bbe9396775eacf5bac1f51cc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 May 2015 13:44:36 -0400
Subject: tracing: Rename FTRACE_MAX_EVENT to TRACE_EVENT_TYPE_MAX

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. Rename the max trace_event type size to
something more descriptive and appropriate.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h                           | 2 +-
 kernel/trace/trace_output.c                            | 4 ++--
 tools/perf/util/scripting-engines/trace-event-perl.c   | 4 ++--
 tools/perf/util/scripting-engines/trace-event-python.c | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 12ca46322a94..6f28464be418 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -68,7 +68,7 @@ struct trace_entry {
 	int			pid;
 };
 
-#define FTRACE_MAX_EVENT						\
+#define TRACE_EVENT_TYPE_MAX						\
 	((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)
 
 /*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d1a2feb54d06..74dfe2e01d68 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -675,7 +675,7 @@ static int trace_search_list(struct list_head **list)
 	}
 
 	/* Did we used up all 65 thousand events??? */
-	if ((last + 1) > FTRACE_MAX_EVENT)
+	if ((last + 1) > TRACE_EVENT_TYPE_MAX)
 		return 0;
 
 	*list = &e->list;
@@ -725,7 +725,7 @@ int register_trace_event(struct trace_event *event)
 	if (!event->type) {
 		struct list_head *list = NULL;
 
-		if (next_event_type > FTRACE_MAX_EVENT) {
+		if (next_event_type > TRACE_EVENT_TYPE_MAX) {
 
 			event->type = trace_search_list(&list);
 			if (!event->type)
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index 430b5d27828e..1bd593bbf7a5 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -55,10 +55,10 @@ void xs_init(pTHX)
 
 INTERP my_perl;
 
-#define FTRACE_MAX_EVENT				\
+#define TRACE_EVENT_TYPE_MAX				\
 	((1 << (sizeof(unsigned short) * 8)) - 1)
 
-static DECLARE_BITMAP(events_defined, FTRACE_MAX_EVENT);
+static DECLARE_BITMAP(events_defined, TRACE_EVENT_TYPE_MAX);
 
 extern struct scripting_context *scripting_context;
 
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 5544b8cdd1ee..ace2484985cb 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -44,10 +44,10 @@
 
 PyMODINIT_FUNC initperf_trace_context(void);
 
-#define FTRACE_MAX_EVENT				\
+#define TRACE_EVENT_TYPE_MAX				\
 	((1 << (sizeof(unsigned short) * 8)) - 1)
 
-static DECLARE_BITMAP(events_defined, FTRACE_MAX_EVENT);
+static DECLARE_BITMAP(events_defined, TRACE_EVENT_TYPE_MAX);
 
 #define MAX_FIELDS	64
 #define N_COMMON_FIELDS	7
-- 
cgit v1.2.3


From 687fcc4aee4567df14e31e82d6993418b826f408 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 May 2015 14:20:14 -0400
Subject: tracing: Rename ftrace_event_name() to trace_event_name()

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. ftrace_event_name() returns the name of
an event tracepoint, has nothing to do with function tracing. Rename it
to trace_event_name().

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h        |  2 +-
 kernel/trace/trace_events.c         | 26 +++++++++++++-------------
 kernel/trace/trace_events_trigger.c |  2 +-
 kernel/trace/trace_kprobe.c         | 16 ++++++++--------
 kernel/trace/trace_output.c         |  2 +-
 kernel/trace/trace_uprobe.c         | 14 +++++++-------
 6 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 6f28464be418..15617798849c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -304,7 +304,7 @@ struct trace_event_call {
 };
 
 static inline const char *
-ftrace_event_name(struct trace_event_call *call)
+trace_event_name(struct trace_event_call *call)
 {
 	if (call->flags & TRACE_EVENT_FL_TRACEPOINT)
 		return call->tp ? call->tp->name : NULL;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e5638c43b04d..df491ce4f3b0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -387,7 +387,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 			if (ret) {
 				tracing_stop_cmdline_record();
 				pr_info("event trace: Could not enable event "
-					"%s\n", ftrace_event_name(call));
+					"%s\n", trace_event_name(call));
 				break;
 			}
 			set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
@@ -523,7 +523,7 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
 	list_for_each_entry(file, &tr->events, list) {
 
 		call = file->event_call;
-		name = ftrace_event_name(call);
+		name = trace_event_name(call);
 
 		if (!name || !call->class || !call->class->reg)
 			continue;
@@ -747,7 +747,7 @@ static int t_show(struct seq_file *m, void *v)
 
 	if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
 		seq_printf(m, "%s:", call->class->system);
-	seq_printf(m, "%s\n", ftrace_event_name(call));
+	seq_printf(m, "%s\n", trace_event_name(call));
 
 	return 0;
 }
@@ -840,7 +840,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	mutex_lock(&event_mutex);
 	list_for_each_entry(file, &tr->events, list) {
 		call = file->event_call;
-		if (!ftrace_event_name(call) || !call->class || !call->class->reg)
+		if (!trace_event_name(call) || !call->class || !call->class->reg)
 			continue;
 
 		if (system && strcmp(call->class->system, system->name) != 0)
@@ -955,7 +955,7 @@ static int f_show(struct seq_file *m, void *v)
 
 	switch ((unsigned long)v) {
 	case FORMAT_HEADER:
-		seq_printf(m, "name: %s\n", ftrace_event_name(call));
+		seq_printf(m, "name: %s\n", trace_event_name(call));
 		seq_printf(m, "ID: %d\n", call->event.type);
 		seq_puts(m, "format:\n");
 		return 0;
@@ -1591,7 +1591,7 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	} else
 		d_events = parent;
 
-	name = ftrace_event_name(call);
+	name = trace_event_name(call);
 	file->dir = tracefs_create_dir(name, d_events);
 	if (!file->dir) {
 		pr_warn("Could not create tracefs '%s' directory\n", name);
@@ -1683,7 +1683,7 @@ static int event_init(struct trace_event_call *call)
 	int ret = 0;
 	const char *name;
 
-	name = ftrace_event_name(call);
+	name = trace_event_name(call);
 	if (WARN_ON(!name))
 		return -EINVAL;
 
@@ -2062,7 +2062,7 @@ __trace_add_event_dirs(struct trace_array *tr)
 		ret = __trace_add_new_event(call, tr);
 		if (ret < 0)
 			pr_warn("Could not create directory for event %s\n",
-				ftrace_event_name(call));
+				trace_event_name(call));
 	}
 }
 
@@ -2076,7 +2076,7 @@ find_event_file(struct trace_array *tr, const char *system,  const char *event)
 	list_for_each_entry(file, &tr->events, list) {
 
 		call = file->event_call;
-		name = ftrace_event_name(call);
+		name = trace_event_name(call);
 
 		if (!name || !call->class || !call->class->reg)
 			continue;
@@ -2152,7 +2152,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
 	seq_printf(m, "%s:%s:%s",
 		   data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
 		   data->file->event_call->class->system,
-		   ftrace_event_name(data->file->event_call));
+		   trace_event_name(data->file->event_call));
 
 	if (data->count == -1)
 		seq_puts(m, ":unlimited\n");
@@ -2375,7 +2375,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
 		ret = event_create_dir(tr->event_dir, file);
 		if (ret < 0)
 			pr_warn("Could not create directory for event %s\n",
-				ftrace_event_name(file->event_call));
+				trace_event_name(file->event_call));
 	}
 }
 
@@ -2399,7 +2399,7 @@ __trace_early_add_events(struct trace_array *tr)
 		ret = __trace_early_add_new_event(call, tr);
 		if (ret < 0)
 			pr_warn("Could not create early event %s\n",
-				ftrace_event_name(call));
+				trace_event_name(call));
 	}
 }
 
@@ -2787,7 +2787,7 @@ static __init void event_trace_self_tests(void)
 			continue;
 #endif
 
-		pr_info("Testing event %s: ", ftrace_event_name(call));
+		pr_info("Testing event %s: ", trace_event_name(call));
 
 		/*
 		 * If an event is already enabled, someone is using
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index bb6f2ff52ad2..ccd6a2adc7ad 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1095,7 +1095,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
 	seq_printf(m, "%s:%s:%s",
 		   enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
 		   enable_data->file->event_call->class->system,
-		   ftrace_event_name(enable_data->file->event_call));
+		   trace_event_name(enable_data->file->event_call));
 
 	if (data->count == -1)
 		seq_puts(m, ":unlimited");
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6ecb1a49af7b..4ce865dae39e 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -348,7 +348,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
 	struct trace_kprobe *tk;
 
 	list_for_each_entry(tk, &probe_list, list)
-		if (strcmp(ftrace_event_name(&tk->tp.call), event) == 0 &&
+		if (strcmp(trace_event_name(&tk->tp.call), event) == 0 &&
 		    strcmp(tk->tp.call.class->system, group) == 0)
 			return tk;
 	return NULL;
@@ -523,7 +523,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
 	mutex_lock(&probe_lock);
 
 	/* Delete old (same name) event if exist */
-	old_tk = find_trace_kprobe(ftrace_event_name(&tk->tp.call),
+	old_tk = find_trace_kprobe(trace_event_name(&tk->tp.call),
 			tk->tp.call.class->system);
 	if (old_tk) {
 		ret = unregister_trace_kprobe(old_tk);
@@ -572,7 +572,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
 			if (ret)
 				pr_warning("Failed to re-register probe %s on"
 					   "%s: %d\n",
-					   ftrace_event_name(&tk->tp.call),
+					   trace_event_name(&tk->tp.call),
 					   mod->name, ret);
 		}
 	}
@@ -829,7 +829,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
 
 	seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
 	seq_printf(m, ":%s/%s", tk->tp.call.class->system,
-			ftrace_event_name(&tk->tp.call));
+			trace_event_name(&tk->tp.call));
 
 	if (!tk->symbol)
 		seq_printf(m, " 0x%p", tk->rp.kp.addr);
@@ -888,7 +888,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 	struct trace_kprobe *tk = v;
 
 	seq_printf(m, "  %-44s %15lu %15lu\n",
-		   ftrace_event_name(&tk->tp.call), tk->nhit,
+		   trace_event_name(&tk->tp.call), tk->nhit,
 		   tk->rp.kp.nmissed);
 
 	return 0;
@@ -1025,7 +1025,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
 	field = (struct kprobe_trace_entry_head *)iter->ent;
 	tp = container_of(event, struct trace_probe, call.event);
 
-	trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+	trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
 
 	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto out;
@@ -1056,7 +1056,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
 	field = (struct kretprobe_trace_entry_head *)iter->ent;
 	tp = container_of(event, struct trace_probe, call.event);
 
-	trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
+	trace_seq_printf(s, "%s: (", trace_event_name(&tp->call));
 
 	if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
 		goto out;
@@ -1301,7 +1301,7 @@ static int register_kprobe_event(struct trace_kprobe *tk)
 	ret = trace_add_event_call(call);
 	if (ret) {
 		pr_info("Failed to register kprobe event: %s\n",
-			ftrace_event_name(call));
+			trace_event_name(call));
 		kfree(call->print_fmt);
 		unregister_trace_event(&call->event);
 	}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 74dfe2e01d68..dfab253727dc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -239,7 +239,7 @@ int trace_raw_output_prep(struct trace_iterator *iter,
 	}
 
 	trace_seq_init(p);
-	trace_seq_printf(s, "%s: ", ftrace_event_name(event));
+	trace_seq_printf(s, "%s: ", trace_event_name(event));
 
 	return trace_handle_return(s);
 }
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 68ba5da5d9e0..40764abc7d09 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -293,7 +293,7 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
 	struct trace_uprobe *tu;
 
 	list_for_each_entry(tu, &uprobe_list, list)
-		if (strcmp(ftrace_event_name(&tu->tp.call), event) == 0 &&
+		if (strcmp(trace_event_name(&tu->tp.call), event) == 0 &&
 		    strcmp(tu->tp.call.class->system, group) == 0)
 			return tu;
 
@@ -323,7 +323,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
 	mutex_lock(&uprobe_lock);
 
 	/* register as an event */
-	old_tu = find_probe_event(ftrace_event_name(&tu->tp.call),
+	old_tu = find_probe_event(trace_event_name(&tu->tp.call),
 			tu->tp.call.class->system);
 	if (old_tu) {
 		/* delete old event */
@@ -600,7 +600,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	int i;
 
 	seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
-			ftrace_event_name(&tu->tp.call));
+			trace_event_name(&tu->tp.call));
 	seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
 
 	for (i = 0; i < tu->tp.nr_args; i++)
@@ -651,7 +651,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
 	struct trace_uprobe *tu = v;
 
 	seq_printf(m, "  %s %-44s %15lu\n", tu->filename,
-			ftrace_event_name(&tu->tp.call), tu->nhit);
+			trace_event_name(&tu->tp.call), tu->nhit);
 	return 0;
 }
 
@@ -853,12 +853,12 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
 
 	if (is_ret_probe(tu)) {
 		trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
-				 ftrace_event_name(&tu->tp.call),
+				 trace_event_name(&tu->tp.call),
 				 entry->vaddr[1], entry->vaddr[0]);
 		data = DATAOF_TRACE_ENTRY(entry, true);
 	} else {
 		trace_seq_printf(s, "%s: (0x%lx)",
-				 ftrace_event_name(&tu->tp.call),
+				 trace_event_name(&tu->tp.call),
 				 entry->vaddr[0]);
 		data = DATAOF_TRACE_ENTRY(entry, false);
 	}
@@ -1295,7 +1295,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
 
 	if (ret) {
 		pr_info("Failed to register uprobe event: %s\n",
-			ftrace_event_name(call));
+			trace_event_name(call));
 		kfree(call->print_fmt);
 		unregister_trace_event(&call->event);
 	}
-- 
cgit v1.2.3


From 7967b3e0c40ff72fb2cf44d3b50e2cb388ef6c67 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 May 2015 14:59:40 -0400
Subject: tracing: Rename struct ftrace_subsystem_dir to trace_subsystem_dir

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The structure ftrace_subsystem_dir holds
the information about trace event subsystems. It should not be named
ftrace, rename it to trace_subsystem_dir.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h       |  4 ++--
 kernel/trace/trace.h               |  4 ++--
 kernel/trace/trace_events.c        | 26 +++++++++++++-------------
 kernel/trace/trace_events_filter.c | 10 +++++-----
 4 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 15617798849c..d4ad58ec684a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -313,7 +313,7 @@ trace_event_name(struct trace_event_call *call)
 }
 
 struct trace_array;
-struct ftrace_subsystem_dir;
+struct trace_subsystem_dir;
 
 enum {
 	FTRACE_EVENT_FL_ENABLED_BIT,
@@ -355,7 +355,7 @@ struct trace_event_file {
 	struct event_filter		*filter;
 	struct dentry			*dir;
 	struct trace_array		*tr;
-	struct ftrace_subsystem_dir	*system;
+	struct trace_subsystem_dir	*system;
 	struct list_head		triggers;
 
 	/*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 64de3837c383..4c41fcda83ed 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -992,7 +992,7 @@ struct event_subsystem {
 	int			ref_count;
 };
 
-struct ftrace_subsystem_dir {
+struct trace_subsystem_dir {
 	struct list_head		list;
 	struct event_subsystem		*subsystem;
 	struct trace_array		*tr;
@@ -1056,7 +1056,7 @@ extern void print_event_filter(struct trace_event_file *file,
 			       struct trace_seq *s);
 extern int apply_event_filter(struct trace_event_file *file,
 			      char *filter_string);
-extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
 					char *filter_string);
 extern void print_subsystem_event_filter(struct event_subsystem *system,
 					 struct trace_seq *s);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index df491ce4f3b0..58984c252aac 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -449,14 +449,14 @@ static void __get_system(struct event_subsystem *system)
 	system_refcount_inc(system);
 }
 
-static void __get_system_dir(struct ftrace_subsystem_dir *dir)
+static void __get_system_dir(struct trace_subsystem_dir *dir)
 {
 	WARN_ON_ONCE(dir->ref_count == 0);
 	dir->ref_count++;
 	__get_system(dir->subsystem);
 }
 
-static void __put_system_dir(struct ftrace_subsystem_dir *dir)
+static void __put_system_dir(struct trace_subsystem_dir *dir)
 {
 	WARN_ON_ONCE(dir->ref_count == 0);
 	/* If the subsystem is about to be freed, the dir must be too */
@@ -467,14 +467,14 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
 		kfree(dir);
 }
 
-static void put_system(struct ftrace_subsystem_dir *dir)
+static void put_system(struct trace_subsystem_dir *dir)
 {
 	mutex_lock(&event_mutex);
 	__put_system_dir(dir);
 	mutex_unlock(&event_mutex);
 }
 
-static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+static void remove_subsystem(struct trace_subsystem_dir *dir)
 {
 	if (!dir)
 		return;
@@ -828,7 +828,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
 	const char set_to_char[4] = { '?', '0', '1', 'X' };
-	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct trace_subsystem_dir *dir = filp->private_data;
 	struct event_subsystem *system = dir->subsystem;
 	struct trace_event_call *call;
 	struct trace_event_file *file;
@@ -873,7 +873,7 @@ static ssize_t
 system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		    loff_t *ppos)
 {
-	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct trace_subsystem_dir *dir = filp->private_data;
 	struct event_subsystem *system = dir->subsystem;
 	const char *name = NULL;
 	unsigned long val;
@@ -1132,7 +1132,7 @@ static LIST_HEAD(event_subsystems);
 static int subsystem_open(struct inode *inode, struct file *filp)
 {
 	struct event_subsystem *system = NULL;
-	struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
+	struct trace_subsystem_dir *dir = NULL; /* Initialize for gcc */
 	struct trace_array *tr;
 	int ret;
 
@@ -1181,7 +1181,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 
 static int system_tr_open(struct inode *inode, struct file *filp)
 {
-	struct ftrace_subsystem_dir *dir;
+	struct trace_subsystem_dir *dir;
 	struct trace_array *tr = inode->i_private;
 	int ret;
 
@@ -1214,7 +1214,7 @@ static int system_tr_open(struct inode *inode, struct file *filp)
 
 static int subsystem_release(struct inode *inode, struct file *file)
 {
-	struct ftrace_subsystem_dir *dir = file->private_data;
+	struct trace_subsystem_dir *dir = file->private_data;
 
 	trace_array_put(dir->tr);
 
@@ -1235,7 +1235,7 @@ static ssize_t
 subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 		      loff_t *ppos)
 {
-	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct trace_subsystem_dir *dir = filp->private_data;
 	struct event_subsystem *system = dir->subsystem;
 	struct trace_seq *s;
 	int r;
@@ -1262,7 +1262,7 @@ static ssize_t
 subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 {
-	struct ftrace_subsystem_dir *dir = filp->private_data;
+	struct trace_subsystem_dir *dir = filp->private_data;
 	char *buf;
 	int err;
 
@@ -1499,7 +1499,7 @@ static struct dentry *
 event_subsystem_dir(struct trace_array *tr, const char *name,
 		    struct trace_event_file *file, struct dentry *parent)
 {
-	struct ftrace_subsystem_dir *dir;
+	struct trace_subsystem_dir *dir;
 	struct event_subsystem *system;
 	struct dentry *entry;
 
@@ -2754,7 +2754,7 @@ static __init void event_test_stuff(void)
  */
 static __init void event_trace_self_tests(void)
 {
-	struct ftrace_subsystem_dir *dir;
+	struct trace_subsystem_dir *dir;
 	struct trace_event_file *file;
 	struct trace_event_call *call;
 	struct event_subsystem *system;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d535f3bf2aa2..203dd3750e91 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -848,7 +848,7 @@ static inline void __remove_filter(struct trace_event_file *file)
 		remove_filter_string(file->filter);
 }
 
-static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
+static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
 					struct trace_array *tr)
 {
 	struct trace_event_file *file;
@@ -873,7 +873,7 @@ static inline void __free_subsystem_filter(struct trace_event_file *file)
 	}
 }
 
-static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
+static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
 					  struct trace_array *tr)
 {
 	struct trace_event_file *file;
@@ -1735,7 +1735,7 @@ struct filter_list {
 	struct event_filter	*filter;
 };
 
-static int replace_system_preds(struct ftrace_subsystem_dir *dir,
+static int replace_system_preds(struct trace_subsystem_dir *dir,
 				struct trace_array *tr,
 				struct filter_parse_state *ps,
 				char *filter_string)
@@ -1935,7 +1935,7 @@ int create_event_filter(struct trace_event_call *call,
  * Identical to create_filter() except that it creates a subsystem filter
  * and always remembers @filter_str.
  */
-static int create_system_filter(struct ftrace_subsystem_dir *dir,
+static int create_system_filter(struct trace_subsystem_dir *dir,
 				struct trace_array *tr,
 				char *filter_str, struct event_filter **filterp)
 {
@@ -2012,7 +2012,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
 	return err;
 }
 
-int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
+int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
 				 char *filter_string)
 {
 	struct event_subsystem *system = dir->subsystem;
-- 
cgit v1.2.3


From 1bd758eb1cab2fa5b71a23f9e5d3c8076f4ed650 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:07 +0200
Subject: net: change name of flow_dissector header to match the .c file name

add couple of empty lines on the way.

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c             |  2 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.c |  2 +-
 include/linux/skbuff.h                      |  2 +-
 include/net/flow_dissector.h                | 68 +++++++++++++++++++++++++++++
 include/net/flow_keys.h                     | 61 --------------------------
 include/net/ip.h                            |  2 +-
 include/net/ipv6.h                          |  2 +-
 net/core/flow_dissector.c                   |  2 +-
 net/sched/cls_flow.c                        |  2 +-
 net/sched/sch_choke.c                       |  2 +-
 10 files changed, 76 insertions(+), 69 deletions(-)
 create mode 100644 include/net/flow_dissector.h
 delete mode 100644 include/net/flow_keys.h

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a2e25de98bde..fe7c38721775 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -76,7 +76,7 @@
 #include <net/netns/generic.h>
 #include <net/pkt_sched.h>
 #include <linux/rculist.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 #include <net/switchdev.h>
 #include <net/bonding.h>
 #include <net/bond_3ad.h>
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index 0be6850be8a2..380f04903a36 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -5,7 +5,7 @@
 #include <linux/in.h>
 #include <linux/types.h>
 #include <linux/skbuff.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 #include "enic_res.h"
 #include "enic_clsf.h"
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c0b574a414e7..e35c2b13d434 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -34,7 +34,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/netdev_features.h>
 #include <linux/sched.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 
 /* A. Checksumming of received packets by device.
  *
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
new file mode 100644
index 000000000000..5e99a7b3963c
--- /dev/null
+++ b/include/net/flow_dissector.h
@@ -0,0 +1,68 @@
+#ifndef _NET_FLOW_DISSECTOR_H
+#define _NET_FLOW_DISSECTOR_H
+
+/* struct flow_keys:
+ *	@src: source ip address in case of IPv4
+ *	      For IPv6 it contains 32bit hash of src address
+ *	@dst: destination ip address in case of IPv4
+ *	      For IPv6 it contains 32bit hash of dst address
+ *	@ports: port numbers of Transport header
+ *		port16[0]: src port number
+ *		port16[1]: dst port number
+ *	@thoff: Transport header offset
+ *	@n_proto: Network header protocol (eg. IPv4/IPv6)
+ *	@ip_proto: Transport header protocol (eg. TCP/UDP)
+ * All the members, except thoff, are in network byte order.
+ */
+struct flow_keys {
+	/* (src,dst) must be grouped, in the same way than in IP header */
+	__be32 src;
+	__be32 dst;
+	union {
+		__be32 ports;
+		__be16 port16[2];
+	};
+	u16	thoff;
+	__be16	n_proto;
+	u8	ip_proto;
+};
+
+bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+			void *data, __be16 proto, int nhoff, int hlen);
+
+static inline bool skb_flow_dissect(const struct sk_buff *skb,
+				    struct flow_keys *flow)
+{
+	return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0);
+}
+
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+			    void *data, int hlen_proto);
+
+static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
+					int thoff, u8 ip_proto)
+{
+	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
+}
+
+u32 flow_hash_from_keys(struct flow_keys *keys);
+
+unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len,
+			   __be16 protocol);
+
+/* struct flow_keys_digest:
+ *
+ * This structure is used to hold a digest of the full flow keys. This is a
+ * larger "hash" of a flow to allow definitively matching specific flows where
+ * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
+ * that it can by used in CB of skb (see sch_choke for an example).
+ */
+#define FLOW_KEYS_DIGEST_LEN	16
+struct flow_keys_digest {
+	u8	data[FLOW_KEYS_DIGEST_LEN];
+};
+
+void make_flow_keys_digest(struct flow_keys_digest *digest,
+			   const struct flow_keys *flow);
+
+#endif
diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
deleted file mode 100644
index 6d6ef626811a..000000000000
--- a/include/net/flow_keys.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef _NET_FLOW_KEYS_H
-#define _NET_FLOW_KEYS_H
-
-/* struct flow_keys:
- *	@src: source ip address in case of IPv4
- *	      For IPv6 it contains 32bit hash of src address
- *	@dst: destination ip address in case of IPv4
- *	      For IPv6 it contains 32bit hash of dst address
- *	@ports: port numbers of Transport header
- *		port16[0]: src port number
- *		port16[1]: dst port number
- *	@thoff: Transport header offset
- *	@n_proto: Network header protocol (eg. IPv4/IPv6)
- *	@ip_proto: Transport header protocol (eg. TCP/UDP)
- * All the members, except thoff, are in network byte order.
- */
-struct flow_keys {
-	/* (src,dst) must be grouped, in the same way than in IP header */
-	__be32 src;
-	__be32 dst;
-	union {
-		__be32 ports;
-		__be16 port16[2];
-	};
-	u16	thoff;
-	__be16	n_proto;
-	u8	ip_proto;
-};
-
-bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
-			void *data, __be16 proto, int nhoff, int hlen);
-static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
-{
-	return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0);
-}
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen_proto);
-static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto)
-{
-	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
-}
-u32 flow_hash_from_keys(struct flow_keys *keys);
-unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len,
-			   __be16 protocol);
-
-/* struct flow_keys_digest:
- *
- * This structure is used to hold a digest of the full flow keys. This is a
- * larger "hash" of a flow to allow definitively matching specific flows where
- * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
- * that it can by used in CB of skb (see sch_choke for an example).
- */
-#define FLOW_KEYS_DIGEST_LEN	16
-struct flow_keys_digest {
-	u8	data[FLOW_KEYS_DIGEST_LEN];
-};
-
-void make_flow_keys_digest(struct flow_keys_digest *digest,
-			   const struct flow_keys *flow);
-
-#endif
diff --git a/include/net/ip.h b/include/net/ip.h
index d14af7edd197..562eb653aebc 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -31,7 +31,7 @@
 #include <net/route.h>
 #include <net/snmp.h>
 #include <net/flow.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 
 struct sock;
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 53d25ef1699a..9932b86394a9 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -19,7 +19,7 @@
 #include <net/if_inet6.h>
 #include <net/ndisc.h>
 #include <net/flow.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 #include <net/snmp.h>
 
 #define SIN6_LEN_RFC2133	24
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d3acc4dff4ae..eaa86b539221 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -12,7 +12,7 @@
 #include <linux/if_tunnel.h>
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
 /* copy saddr & daddr, possibly using 64bit load/store
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index a620c4e288a5..4c5a92298906 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -26,7 +26,7 @@
 #include <net/pkt_cls.h>
 #include <net/ip.h>
 #include <net/route.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netfilter/nf_conntrack.h>
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index dfe3da75594c..2624e991a2d1 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -18,7 +18,7 @@
 #include <net/pkt_sched.h>
 #include <net/inet_ecn.h>
 #include <net/red.h>
-#include <net/flow_keys.h>
+#include <net/flow_dissector.h>
 
 /*
    CHOKe stateless AQM for fair bandwidth allocation
-- 
cgit v1.2.3


From 10b89ee43e849544eddfe34e535341fc077464ec Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:09 +0200
Subject: net: move *skb_get_poff declarations into correct header

Since these functions are defined in flow_dissector.c, move header
declarations from skbuff.h into flow_dissector.h

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 4 ----
 include/net/flow_dissector.h | 3 +++
 net/core/filter.c            | 1 +
 net/ethernet/eth.c           | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e35c2b13d434..17607ab9e7a2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3424,10 +3424,6 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
 				     unsigned int transport_len,
 				     __sum16(*skb_chkf)(struct sk_buff *skb));
 
-u32 skb_get_poff(const struct sk_buff *skb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen);
-
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
  * @skb: skb to check
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 118ae6926a72..4570ccafe59d 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -46,6 +46,9 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
 }
 
 u32 flow_hash_from_keys(struct flow_keys *keys);
+u32 skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+		   const struct flow_keys *keys, int hlen);
 
 /* struct flow_keys_digest:
  *
diff --git a/net/core/filter.c b/net/core/filter.c
index a831f193e2c7..6805717be614 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,6 +36,7 @@
 #include <net/netlink.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
+#include <net/flow_dissector.h>
 #include <linux/errno.h>
 #include <linux/timer.h>
 #include <asm/uaccess.h>
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 9045e2a1108f..9332a0ab0698 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -58,6 +58,7 @@
 #include <net/ipv6.h>
 #include <net/ip.h>
 #include <net/dsa.h>
+#include <net/flow_dissector.h>
 #include <linux/uaccess.h>
 
 __setup("ether=", netdev_boot_setup);
-- 
cgit v1.2.3


From 9c684b5083bc191c4b7b189c73d75587e167a474 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:11 +0200
Subject: net: move __skb_get_hash function declaration to flow_dissector.h

Since the definition of the function is in flow_dissector.c, it makes
sense to have the declaration in flow_dissector.h

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 1 -
 include/net/flow_dissector.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17607ab9e7a2..ae2d1b7769d8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -918,7 +918,6 @@ skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
 	skb->hash = hash;
 }
 
-void __skb_get_hash(struct sk_buff *skb);
 static inline __u32 skb_get_hash(struct sk_buff *skb)
 {
 	if (!skb->l4_hash && !skb->sw_hash)
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 4570ccafe59d..e4ee761ca8c0 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -46,6 +46,7 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
 }
 
 u32 flow_hash_from_keys(struct flow_keys *keys);
+void __skb_get_hash(struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen);
-- 
cgit v1.2.3


From 5605c76240aadc823e3d46ac9afde2f26fbcf019 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:12 +0200
Subject: net: move __skb_tx_hash to dev.c

__skb_tx_hash function has no relation to flow_dissect so just move it
to dev.c

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 include/linux/skbuff.h    |  3 ---
 net/core/dev.c            | 28 ++++++++++++++++++++++++++++
 net/core/flow_dissector.c | 28 ----------------------------
 4 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cd0951c1893d..d3ed01c18247 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2832,6 +2832,9 @@ static inline int netif_set_xps_queue(struct net_device *dev,
 }
 #endif
 
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
+		  unsigned int num_tx_queues);
+
 /*
  * Returns a Tx hash for the given packet when dev->real_num_tx_queues is used
  * as a distribution range limit for the returned value.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ae2d1b7769d8..b01c7fba7c17 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3299,9 +3299,6 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
 	return skb->queue_mapping != 0;
 }
 
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
-		  unsigned int num_tx_queues);
-
 static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
 {
 #ifdef CONFIG_XFRM
diff --git a/net/core/dev.c b/net/core/dev.c
index 90a568a150b4..d044d2f8532b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2350,6 +2350,34 @@ void netif_device_attach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netif_device_attach);
 
+/*
+ * Returns a Tx hash based on the given packet descriptor a Tx queues' number
+ * to be used as a distribution range.
+ */
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
+		  unsigned int num_tx_queues)
+{
+	u32 hash;
+	u16 qoffset = 0;
+	u16 qcount = num_tx_queues;
+
+	if (skb_rx_queue_recorded(skb)) {
+		hash = skb_get_rx_queue(skb);
+		while (unlikely(hash >= num_tx_queues))
+			hash -= num_tx_queues;
+		return hash;
+	}
+
+	if (dev->num_tc) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+		qoffset = dev->tc_to_txq[tc].offset;
+		qcount = dev->tc_to_txq[tc].count;
+	}
+
+	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
+}
+EXPORT_SYMBOL(__skb_tx_hash);
+
 static void skb_warn_bad_offload(const struct sk_buff *skb)
 {
 	static const netdev_features_t null_features = 0;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0d9bc3a586ba..07ca11d06339 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -371,34 +371,6 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
 }
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
-/*
- * Returns a Tx hash based on the given packet descriptor a Tx queues' number
- * to be used as a distribution range.
- */
-u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
-		  unsigned int num_tx_queues)
-{
-	u32 hash;
-	u16 qoffset = 0;
-	u16 qcount = num_tx_queues;
-
-	if (skb_rx_queue_recorded(skb)) {
-		hash = skb_get_rx_queue(skb);
-		while (unlikely(hash >= num_tx_queues))
-			hash -= num_tx_queues;
-		return hash;
-	}
-
-	if (dev->num_tc) {
-		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
-		qoffset = dev->tc_to_txq[tc].offset;
-		qcount = dev->tc_to_txq[tc].count;
-	}
-
-	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
-}
-EXPORT_SYMBOL(__skb_tx_hash);
-
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen)
 {
-- 
cgit v1.2.3


From 06635a35d13d42b95422bba6633f175245cc644e Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 12 May 2015 14:56:16 +0200
Subject: flow_dissect: use programable dissector in skb_flow_dissect and
 friends

Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c                |  18 +--
 drivers/net/ethernet/cisco/enic/enic_clsf.c    |  27 ++--
 drivers/net/ethernet/cisco/enic/enic_ethtool.c |  10 +-
 drivers/net/hyperv/netvsc_drv.c                |   8 +-
 include/linux/skbuff.h                         |   4 +-
 include/net/flow_dissector.h                   |  63 ++++----
 include/net/ip.h                               |   8 +-
 include/net/ipv6.h                             |   8 +-
 net/core/flow_dissector.c                      | 199 +++++++++++++++++--------
 net/ethernet/eth.c                             |   6 +-
 net/sched/cls_flow.c                           |  20 +--
 net/sched/sch_choke.c                          |   4 +-
 12 files changed, 229 insertions(+), 146 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index fe7c38721775..ef26e0147050 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3051,16 +3051,16 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	int noff, proto = -1;
 
 	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23)
-		return skb_flow_dissect(skb, fk);
+		return skb_flow_dissect_flow_keys(skb, fk);
 
-	fk->ports = 0;
+	fk->ports.ports = 0;
 	noff = skb_network_offset(skb);
 	if (skb->protocol == htons(ETH_P_IP)) {
 		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
 			return false;
 		iph = ip_hdr(skb);
-		fk->src = iph->saddr;
-		fk->dst = iph->daddr;
+		fk->addrs.src = iph->saddr;
+		fk->addrs.dst = iph->daddr;
 		noff += iph->ihl << 2;
 		if (!ip_is_fragment(iph))
 			proto = iph->protocol;
@@ -3068,15 +3068,15 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6))))
 			return false;
 		iph6 = ipv6_hdr(skb);
-		fk->src = (__force __be32)ipv6_addr_hash(&iph6->saddr);
-		fk->dst = (__force __be32)ipv6_addr_hash(&iph6->daddr);
+		fk->addrs.src = (__force __be32)ipv6_addr_hash(&iph6->saddr);
+		fk->addrs.dst = (__force __be32)ipv6_addr_hash(&iph6->daddr);
 		noff += sizeof(*iph6);
 		proto = iph6->nexthdr;
 	} else {
 		return false;
 	}
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
-		fk->ports = skb_flow_get_ports(skb, noff, proto);
+		fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
 
 	return true;
 }
@@ -3102,8 +3102,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23)
 		hash = bond_eth_hash(skb);
 	else
-		hash = (__force u32)flow.ports;
-	hash ^= (__force u32)flow.dst ^ (__force u32)flow.src;
+		hash = (__force u32)flow.ports.ports;
+	hash ^= (__force u32)flow.addrs.dst ^ (__force u32)flow.addrs.src;
 	hash ^= (hash >> 16);
 	hash ^= (hash >> 8);
 
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index 380f04903a36..d3d25c7e5b96 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -22,7 +22,7 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq)
 	int res;
 	struct filter data;
 
-	switch (keys->ip_proto) {
+	switch (keys->basic.ip_proto) {
 	case IPPROTO_TCP:
 		data.u.ipv4.protocol = PROTO_TCP;
 		break;
@@ -33,10 +33,10 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq)
 		return -EPROTONOSUPPORT;
 	};
 	data.type = FILTER_IPV4_5TUPLE;
-	data.u.ipv4.src_addr = ntohl(keys->src);
-	data.u.ipv4.dst_addr = ntohl(keys->dst);
-	data.u.ipv4.src_port = ntohs(keys->port16[0]);
-	data.u.ipv4.dst_port = ntohs(keys->port16[1]);
+	data.u.ipv4.src_addr = ntohl(keys->addrs.src);
+	data.u.ipv4.dst_addr = ntohl(keys->addrs.dst);
+	data.u.ipv4.src_port = ntohs(keys->ports.port16[0]);
+	data.u.ipv4.dst_port = ntohs(keys->ports.port16[1]);
 	data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE;
 
 	spin_lock_bh(&enic->devcmd_lock);
@@ -158,11 +158,11 @@ static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h,
 	struct enic_rfs_fltr_node *tpos;
 
 	hlist_for_each_entry(tpos, h, node)
-		if (tpos->keys.src == k->src &&
-		    tpos->keys.dst == k->dst &&
-		    tpos->keys.ports == k->ports &&
-		    tpos->keys.ip_proto == k->ip_proto &&
-		    tpos->keys.n_proto == k->n_proto)
+		if (tpos->keys.addrs.src == k->addrs.src &&
+		    tpos->keys.addrs.dst == k->addrs.dst &&
+		    tpos->keys.ports.ports == k->ports.ports &&
+		    tpos->keys.basic.ip_proto == k->basic.ip_proto &&
+		    tpos->keys.basic.n_proto == k->basic.n_proto)
 			return tpos;
 	return NULL;
 }
@@ -177,9 +177,10 @@ int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
 	int res, i;
 
 	enic = netdev_priv(dev);
-	res = skb_flow_dissect(skb, &keys);
-	if (!res || keys.n_proto != htons(ETH_P_IP) ||
-	    (keys.ip_proto != IPPROTO_TCP && keys.ip_proto != IPPROTO_UDP))
+	res = skb_flow_dissect_flow_keys(skb, &keys);
+	if (!res || keys.basic.n_proto != htons(ETH_P_IP) ||
+	    (keys.basic.ip_proto != IPPROTO_TCP &&
+	     keys.basic.ip_proto != IPPROTO_UDP))
 		return -EPROTONOSUPPORT;
 
 	tbl_idx = skb_get_hash_raw(skb) & ENIC_RFS_FLW_MASK;
diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
index 28d9ca675a27..7588f8dcc890 100644
--- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c
+++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c
@@ -334,7 +334,7 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd)
 	n = htbl_fltr_search(enic, (u16)fsp->location);
 	if (!n)
 		return -EINVAL;
-	switch (n->keys.ip_proto) {
+	switch (n->keys.basic.ip_proto) {
 	case IPPROTO_TCP:
 		fsp->flow_type = TCP_V4_FLOW;
 		break;
@@ -346,16 +346,16 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd)
 		break;
 	}
 
-	fsp->h_u.tcp_ip4_spec.ip4src = n->keys.src;
+	fsp->h_u.tcp_ip4_spec.ip4src = n->keys.addrs.src;
 	fsp->m_u.tcp_ip4_spec.ip4src = (__u32)~0;
 
-	fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.dst;
+	fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst;
 	fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0;
 
-	fsp->h_u.tcp_ip4_spec.psrc = n->keys.port16[0];
+	fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.port16[0];
 	fsp->m_u.tcp_ip4_spec.psrc = (__u16)~0;
 
-	fsp->h_u.tcp_ip4_spec.pdst = n->keys.port16[1];
+	fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.port16[1];
 	fsp->m_u.tcp_ip4_spec.pdst = (__u16)~0;
 
 	fsp->ring_cookie = n->rq_id;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 5993c7e2d723..8e5fe888a0ec 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -196,12 +196,12 @@ static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb)
 	struct flow_keys flow;
 	int data_len;
 
-	if (!skb_flow_dissect(skb, &flow) ||
-	    !(flow.n_proto == htons(ETH_P_IP) ||
-	      flow.n_proto == htons(ETH_P_IPV6)))
+	if (!skb_flow_dissect_flow_keys(skb, &flow) ||
+	    !(flow.basic.n_proto == htons(ETH_P_IP) ||
+	      flow.basic.n_proto == htons(ETH_P_IPV6)))
 		return false;
 
-	if (flow.ip_proto == IPPROTO_TCP)
+	if (flow.basic.ip_proto == IPPROTO_TCP)
 		data_len = 12;
 	else
 		data_len = 8;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b01c7fba7c17..f83aa6568cbf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1935,8 +1935,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb,
 
 	if (skb_transport_header_was_set(skb))
 		return;
-	else if (skb_flow_dissect(skb, &keys))
-		skb_set_transport_header(skb, keys.thoff);
+	else if (skb_flow_dissect_flow_keys(skb, &keys))
+		skb_set_transport_header(skb, keys.basic.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
 }
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 20239e807f77..0c8d406fb730 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -59,42 +59,47 @@ struct flow_dissector {
 	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
 };
 
-/* struct flow_keys:
- *	@src: source ip address in case of IPv4
- *	      For IPv6 it contains 32bit hash of src address
- *	@dst: destination ip address in case of IPv4
- *	      For IPv6 it contains 32bit hash of dst address
- *	@ports: port numbers of Transport header
- *		port16[0]: src port number
- *		port16[1]: dst port number
- *	@thoff: Transport header offset
- *	@n_proto: Network header protocol (eg. IPv4/IPv6)
- *	@ip_proto: Transport header protocol (eg. TCP/UDP)
- * All the members, except thoff, are in network byte order.
- */
-struct flow_keys {
-	/* (src,dst) must be grouped, in the same way than in IP header */
-	__be32 src;
-	__be32 dst;
-	union {
-		__be32 ports;
-		__be16 port16[2];
-	};
-	u16	thoff;
-	__be16	n_proto;
-	u8	ip_proto;
-};
-
 void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     const struct flow_dissector_key *key,
 			     unsigned int key_count);
-bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+
+bool __skb_flow_dissect(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen);
 
 static inline bool skb_flow_dissect(const struct sk_buff *skb,
-				    struct flow_keys *flow)
+				    struct flow_dissector *flow_dissector,
+				    void *target_container)
+{
+	return __skb_flow_dissect(skb, flow_dissector, target_container,
+				  NULL, 0, 0, 0);
+}
+
+struct flow_keys {
+	struct flow_dissector_key_addrs addrs;
+	struct flow_dissector_key_ports ports;
+	struct flow_dissector_key_basic basic;
+};
+
+extern struct flow_dissector flow_keys_dissector;
+extern struct flow_dissector flow_keys_buf_dissector;
+
+static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
+					      struct flow_keys *flow)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
+				  NULL, 0, 0, 0);
+}
+
+static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
+						  void *data, __be16 proto,
+						  int nhoff, int hlen)
 {
-	return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0);
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
+				  data, proto, nhoff, hlen);
 }
 
 __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
diff --git a/include/net/ip.h b/include/net/ip.h
index 562eb653aebc..b0443d4fe13f 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -360,10 +360,10 @@ static inline void inet_set_txhash(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	struct flow_keys keys;
 
-	keys.src = inet->inet_saddr;
-	keys.dst = inet->inet_daddr;
-	keys.port16[0] = inet->inet_sport;
-	keys.port16[1] = inet->inet_dport;
+	keys.addrs.src = inet->inet_saddr;
+	keys.addrs.dst = inet->inet_daddr;
+	keys.ports.port16[0] = inet->inet_sport;
+	keys.ports.port16[1] = inet->inet_dport;
 
 	sk->sk_txhash = flow_hash_from_keys(&keys);
 }
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 9932b86394a9..9eed9761dfce 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -698,10 +698,10 @@ static inline void ip6_set_txhash(struct sock *sk)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct flow_keys keys;
 
-	keys.src = (__force __be32)ipv6_addr_hash(&np->saddr);
-	keys.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
-	keys.port16[0] = inet->inet_sport;
-	keys.port16[1] = inet->inet_dport;
+	keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr);
+	keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
+	keys.ports.port16[0] = inet->inet_sport;
+	keys.ports.port16[1] = inet->inet_dport;
 
 	sk->sk_txhash = flow_hash_from_keys(&keys);
 }
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 6883e22b8fc4..6a49acaa6651 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -13,6 +13,7 @@
 #include <linux/if_tunnel.h>
 #include <linux/if_pppox.h>
 #include <linux/ppp_defs.h>
+#include <linux/stddef.h>
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
@@ -63,17 +64,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
-/* copy saddr & daddr, possibly using 64bit load/store
- * Equivalent to :	flow->src = iph->saddr;
- *			flow->dst = iph->daddr;
- */
-static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
-{
-	BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
-		     offsetof(typeof(*flow), src) + sizeof(flow->src));
-	memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
-}
-
 /**
  * __skb_flow_get_ports - extract the upper layer ports and return them
  * @skb: sk_buff to extract the ports from
@@ -111,17 +101,27 @@ EXPORT_SYMBOL(__skb_flow_get_ports);
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
+ * @flow_dissector: list of keys to dissect
+ * @target_container: target structure to put dissected values into
  * @data: raw buffer pointer to the packet, if NULL use skb->data
  * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
  * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
  * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
  *
- * The function will try to retrieve the struct flow_keys from either the skbuff
- * or a raw buffer specified by the rest parameters
+ * The function will try to retrieve individual keys into target specified
+ * by flow_dissector from either the skbuff or a raw buffer specified by the
+ * rest parameters.
+ *
+ * Caller must take care of zeroing target container memory.
  */
-bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
+bool __skb_flow_dissect(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen)
 {
+	struct flow_dissector_key_basic *key_basic;
+	struct flow_dissector_key_addrs *key_addrs;
+	struct flow_dissector_key_ports *key_ports;
 	u8 ip_proto;
 
 	if (!data) {
@@ -131,7 +131,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
 		hlen = skb_headlen(skb);
 	}
 
-	memset(flow, 0, sizeof(*flow));
+	/* It is ensured by skb_flow_dissector_init() that basic key will
+	 * be always present.
+	 */
+	key_basic = skb_flow_dissector_target(flow_dissector,
+					      FLOW_DISSECTOR_KEY_BASIC,
+					      target_container);
 
 again:
 	switch (proto) {
@@ -148,14 +153,13 @@ ip:
 		if (ip_is_fragment(iph))
 			ip_proto = 0;
 
-		/* skip the address processing if skb is NULL.  The assumption
-		 * here is that if there is no skb we are not looking for flow
-		 * info but lengths and protocols.
-		 */
-		if (!skb)
+		if (!skb_flow_dissector_uses_key(flow_dissector,
+						 FLOW_DISSECTOR_KEY_IPV4_ADDRS))
 			break;
-
-		iph_to_flow_copy_addrs(flow, iph);
+		key_addrs = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						      target_container);
+		memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs));
 		break;
 	}
 	case htons(ETH_P_IPV6): {
@@ -171,12 +175,15 @@ ipv6:
 		ip_proto = iph->nexthdr;
 		nhoff += sizeof(struct ipv6hdr);
 
-		/* see comment above in IPv4 section */
-		if (!skb)
+		if (!skb_flow_dissector_uses_key(flow_dissector,
+						 FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS))
 			break;
+		key_addrs = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
+						      target_container);
 
-		flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
-		flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
+		key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
+		key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
 
 		flow_label = ip6_flowlabel(iph);
 		if (flow_label) {
@@ -184,10 +191,18 @@ ipv6:
 			 * use that to represent the ports without any
 			 * further dissection.
 			 */
-			flow->n_proto = proto;
-			flow->ip_proto = ip_proto;
-			flow->ports = flow_label;
-			flow->thoff = (u16)nhoff;
+
+			key_basic->n_proto = proto;
+			key_basic->ip_proto = ip_proto;
+			key_basic->thoff = (u16)nhoff;
+
+			if (!skb_flow_dissector_uses_key(flow_dissector,
+							 FLOW_DISSECTOR_KEY_PORTS))
+				break;
+			key_ports = skb_flow_dissector_target(flow_dissector,
+							      FLOW_DISSECTOR_KEY_PORTS,
+							      target_container);
+			key_ports->ports = flow_label;
 
 			return true;
 		}
@@ -234,14 +249,22 @@ ipv6:
 		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
 		if (!hdr)
 			return false;
-		flow->src = hdr->srcnode;
-		flow->dst = 0;
-		flow->n_proto = proto;
-		flow->thoff = (u16)nhoff;
+		key_basic->n_proto = proto;
+		key_basic->thoff = (u16)nhoff;
+
+		if (skb_flow_dissector_uses_key(flow_dissector,
+						FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) {
+			return true;
+			key_addrs = skb_flow_dissector_target(flow_dissector,
+							      FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
+							      target_container);
+			key_addrs->src = hdr->srcnode;
+			key_addrs->dst = 0;
+		}
 		return true;
 	}
 	case htons(ETH_P_FCOE):
-		flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+		key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
 		/* fall through */
 	default:
 		return false;
@@ -296,14 +319,24 @@ ipv6:
 		break;
 	}
 
-	flow->n_proto = proto;
-	flow->ip_proto = ip_proto;
-	flow->thoff = (u16) nhoff;
-
-	/* unless skb is set we don't need to record port info */
-	if (skb)
-		flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
-						   data, hlen);
+	/* It is ensured by skb_flow_dissector_init() that basic key will
+	 * be always present.
+	 */
+	key_basic = skb_flow_dissector_target(flow_dissector,
+					      FLOW_DISSECTOR_KEY_BASIC,
+					      target_container);
+	key_basic->n_proto = proto;
+	key_basic->ip_proto = ip_proto;
+	key_basic->thoff = (u16) nhoff;
+
+	if (skb_flow_dissector_uses_key(flow_dissector,
+					FLOW_DISSECTOR_KEY_PORTS)) {
+		key_ports = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_PORTS,
+						      target_container);
+		key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
+							data, hlen);
+	}
 
 	return true;
 }
@@ -325,16 +358,16 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 	u32 hash;
 
 	/* get a consistent hash (same value on both flow directions) */
-	if (((__force u32)keys->dst < (__force u32)keys->src) ||
-	    (((__force u32)keys->dst == (__force u32)keys->src) &&
-	     ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
-		swap(keys->dst, keys->src);
-		swap(keys->port16[0], keys->port16[1]);
+	if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) ||
+	    (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) &&
+	     ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) {
+		swap(keys->addrs.dst, keys->addrs.src);
+		swap(keys->ports.port16[0], keys->ports.port16[1]);
 	}
 
-	hash = __flow_hash_3words((__force u32)keys->dst,
-				  (__force u32)keys->src,
-				  (__force u32)keys->ports,
+	hash = __flow_hash_3words((__force u32)keys->addrs.dst,
+				  (__force u32)keys->addrs.src,
+				  (__force u32)keys->ports.ports,
 				  keyval);
 	if (!hash)
 		hash = 1;
@@ -352,7 +385,7 @@ EXPORT_SYMBOL(flow_hash_from_keys);
 static inline u32 ___skb_get_hash(const struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
-	if (!skb_flow_dissect(skb, keys))
+	if (!skb_flow_dissect_flow_keys(skb, keys))
 		return 0;
 
 	return __flow_hash_from_keys(keys, keyval);
@@ -377,11 +410,11 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 
 	memset(digest, 0, sizeof(*digest));
 
-	data->n_proto = flow->n_proto;
-	data->ip_proto = flow->ip_proto;
-	data->ports = flow->ports;
-	data->src = flow->src;
-	data->dst = flow->dst;
+	data->n_proto = flow->basic.n_proto;
+	data->ip_proto = flow->basic.ip_proto;
+	data->ports = flow->ports.ports;
+	data->src = flow->addrs.src;
+	data->dst = flow->addrs.dst;
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
@@ -404,7 +437,7 @@ void __skb_get_hash(struct sk_buff *skb)
 	hash = ___skb_get_hash(skb, &keys, hashrnd);
 	if (!hash)
 		return;
-	if (keys.ports)
+	if (keys.ports.ports)
 		skb->l4_hash = 1;
 	skb->sw_hash = 1;
 	skb->hash = hash;
@@ -422,9 +455,9 @@ EXPORT_SYMBOL(skb_get_hash_perturb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen)
 {
-	u32 poff = keys->thoff;
+	u32 poff = keys->basic.thoff;
 
-	switch (keys->ip_proto) {
+	switch (keys->basic.ip_proto) {
 	case IPPROTO_TCP: {
 		/* access doff as u8 to avoid unaligned access */
 		const u8 *doff;
@@ -478,8 +511,52 @@ u32 skb_get_poff(const struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
-	if (!skb_flow_dissect(skb, &keys))
+	if (!skb_flow_dissect_flow_keys(skb, &keys))
 		return 0;
 
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
 }
+
+static const struct flow_dissector_key flow_keys_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_BASIC,
+		.offset = offsetof(struct flow_keys, basic),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_PORTS,
+		.offset = offsetof(struct flow_keys, ports),
+	},
+};
+
+static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_BASIC,
+		.offset = offsetof(struct flow_keys, basic),
+	},
+};
+
+struct flow_dissector flow_keys_dissector __read_mostly;
+EXPORT_SYMBOL(flow_keys_dissector);
+
+struct flow_dissector flow_keys_buf_dissector __read_mostly;
+
+static int __init init_default_flow_dissectors(void)
+{
+	skb_flow_dissector_init(&flow_keys_dissector,
+				flow_keys_dissector_keys,
+				ARRAY_SIZE(flow_keys_dissector_keys));
+	skb_flow_dissector_init(&flow_keys_buf_dissector,
+				flow_keys_buf_dissector_keys,
+				ARRAY_SIZE(flow_keys_buf_dissector_keys));
+	return 0;
+}
+
+late_initcall_sync(init_default_flow_dissectors);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 9332a0ab0698..c3325bd2f3fb 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -131,9 +131,9 @@ u32 eth_get_headlen(void *data, unsigned int len)
 		return len;
 
 	/* parse any remaining L2/L3 headers, check for L4 */
-	if (!__skb_flow_dissect(NULL, &keys, data,
-				eth->h_proto, sizeof(*eth), len))
-		return max_t(u32, keys.thoff, sizeof(*eth));
+	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
+					    sizeof(*eth), len))
+		return max_t(u32, keys.basic.thoff, sizeof(*eth));
 
 	/* parse for any L4 headers */
 	return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len);
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 4c5a92298906..4b3e3e30bf4d 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -68,35 +68,35 @@ static inline u32 addr_fold(void *addr)
 
 static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->src)
-		return ntohl(flow->src);
+	if (flow->addrs.src)
+		return ntohl(flow->addrs.src);
 	return addr_fold(skb->sk);
 }
 
 static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->dst)
-		return ntohl(flow->dst);
+	if (flow->addrs.dst)
+		return ntohl(flow->addrs.dst);
 	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
 
 static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	return flow->ip_proto;
+	return flow->basic.ip_proto;
 }
 
 static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->ports)
-		return ntohs(flow->port16[0]);
+	if (flow->ports.ports)
+		return ntohs(flow->ports.port16[0]);
 
 	return addr_fold(skb->sk);
 }
 
 static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
 {
-	if (flow->ports)
-		return ntohs(flow->port16[1]);
+	if (flow->ports.ports)
+		return ntohs(flow->ports.port16[1]);
 
 	return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb);
 }
@@ -295,7 +295,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 		keymask = f->keymask;
 		if (keymask & FLOW_KEYS_NEEDED)
-			skb_flow_dissect(skb, &flow_keys);
+			skb_flow_dissect_flow_keys(skb, &flow_keys);
 
 		for (n = 0; n < f->nkeys; n++) {
 			key = ffs(keymask) - 1;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 2624e991a2d1..93d5742dc7e0 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1,
 
 	if (!choke_skb_cb(skb1)->keys_valid) {
 		choke_skb_cb(skb1)->keys_valid = 1;
-		skb_flow_dissect(skb1, &temp);
+		skb_flow_dissect_flow_keys(skb1, &temp);
 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
 	}
 
 	if (!choke_skb_cb(skb2)->keys_valid) {
 		choke_skb_cb(skb2)->keys_valid = 1;
-		skb_flow_dissect(skb2, &temp);
+		skb_flow_dissect_flow_keys(skb2, &temp);
 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
 	}
 
-- 
cgit v1.2.3


From 5d6ad960a71f0b36d95d74ef93285733b9f62f59 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 May 2015 15:12:33 -0400
Subject: tracing: Rename FTRACE_EVENT_FL_* flags to EVENT_FILE_FL_*

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The FTRACE_EVENT_FL_* flags are flags to
do with the trace_event files in the tracefs directory. They are not related
to function tracing. Rename them to a more descriptive name.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h        | 50 +++++++++++++++---------------
 include/trace/perf.h                | 10 +++---
 kernel/trace/trace.c                |  4 +--
 kernel/trace/trace_events.c         | 62 ++++++++++++++++++-------------------
 kernel/trace/trace_events_filter.c  | 10 +++---
 kernel/trace/trace_events_trigger.c | 14 ++++-----
 6 files changed, 75 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index d4ad58ec684a..a46c138b2eea 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -250,11 +250,11 @@ enum {
  *  FILTERED	  - The event has a filter attached
  *  CAP_ANY	  - Any user can enable for perf
  *  NO_SET_FILTER - Set when filter has error and is to be ignored
- *  IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file
+ *  IGNORE_ENABLE - For trace internal events, do not enable with debugfs file
  *  WAS_ENABLED   - Set and stays set when an event was ever enabled
  *                    (used for module unloading, if a module event is enabled,
  *                     it is best to clear the buffers that used it).
- *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
+ *  USE_CALL_FILTER - For trace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
  *  KPROBE        - Event is a kprobe
  */
@@ -286,7 +286,7 @@ struct trace_event_call {
 	 *   bit 0:		filter_active
 	 *   bit 1:		allow trace by non root (cap any)
 	 *   bit 2:		failed to apply filter
-	 *   bit 3:		ftrace internal event (do not enable)
+	 *   bit 3:		trace internal event (do not enable)
 	 *   bit 4:		Event was enabled by module
 	 *   bit 5:		use call filter rather than file filter
 	 *   bit 6:		Event is a tracepoint
@@ -316,18 +316,18 @@ struct trace_array;
 struct trace_subsystem_dir;
 
 enum {
-	FTRACE_EVENT_FL_ENABLED_BIT,
-	FTRACE_EVENT_FL_RECORDED_CMD_BIT,
-	FTRACE_EVENT_FL_FILTERED_BIT,
-	FTRACE_EVENT_FL_NO_SET_FILTER_BIT,
-	FTRACE_EVENT_FL_SOFT_MODE_BIT,
-	FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
-	FTRACE_EVENT_FL_TRIGGER_MODE_BIT,
-	FTRACE_EVENT_FL_TRIGGER_COND_BIT,
+	EVENT_FILE_FL_ENABLED_BIT,
+	EVENT_FILE_FL_RECORDED_CMD_BIT,
+	EVENT_FILE_FL_FILTERED_BIT,
+	EVENT_FILE_FL_NO_SET_FILTER_BIT,
+	EVENT_FILE_FL_SOFT_MODE_BIT,
+	EVENT_FILE_FL_SOFT_DISABLED_BIT,
+	EVENT_FILE_FL_TRIGGER_MODE_BIT,
+	EVENT_FILE_FL_TRIGGER_COND_BIT,
 };
 
 /*
- * Ftrace event file flags:
+ * Event file flags:
  *  ENABLED	  - The event is enabled
  *  RECORDED_CMD  - The comms should be recorded at sched_switch
  *  FILTERED	  - The event has a filter attached
@@ -339,14 +339,14 @@ enum {
  *  TRIGGER_COND  - When set, one or more triggers has an associated filter
  */
 enum {
-	FTRACE_EVENT_FL_ENABLED		= (1 << FTRACE_EVENT_FL_ENABLED_BIT),
-	FTRACE_EVENT_FL_RECORDED_CMD	= (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT),
-	FTRACE_EVENT_FL_FILTERED	= (1 << FTRACE_EVENT_FL_FILTERED_BIT),
-	FTRACE_EVENT_FL_NO_SET_FILTER	= (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT),
-	FTRACE_EVENT_FL_SOFT_MODE	= (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT),
-	FTRACE_EVENT_FL_SOFT_DISABLED	= (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT),
-	FTRACE_EVENT_FL_TRIGGER_MODE	= (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT),
-	FTRACE_EVENT_FL_TRIGGER_COND	= (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT),
+	EVENT_FILE_FL_ENABLED		= (1 << EVENT_FILE_FL_ENABLED_BIT),
+	EVENT_FILE_FL_RECORDED_CMD	= (1 << EVENT_FILE_FL_RECORDED_CMD_BIT),
+	EVENT_FILE_FL_FILTERED		= (1 << EVENT_FILE_FL_FILTERED_BIT),
+	EVENT_FILE_FL_NO_SET_FILTER	= (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT),
+	EVENT_FILE_FL_SOFT_MODE		= (1 << EVENT_FILE_FL_SOFT_MODE_BIT),
+	EVENT_FILE_FL_SOFT_DISABLED	= (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT),
+	EVENT_FILE_FL_TRIGGER_MODE	= (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT),
+	EVENT_FILE_FL_TRIGGER_COND	= (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
 };
 
 struct trace_event_file {
@@ -439,10 +439,10 @@ ftrace_trigger_soft_disabled(struct trace_event_file *file)
 {
 	unsigned long eflags = file->flags;
 
-	if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
-		if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
+	if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
+		if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
 			event_triggers_call(file, NULL);
-		if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
+		if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
 			return true;
 	}
 	return false;
@@ -470,10 +470,10 @@ __event_trigger_test_discard(struct trace_event_file *file,
 {
 	unsigned long eflags = file->flags;
 
-	if (eflags & FTRACE_EVENT_FL_TRIGGER_COND)
+	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
 		*tt = event_triggers_call(file, entry);
 
-	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags))
+	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
 		ring_buffer_discard_commit(buffer, event);
 	else if (!filter_check_discard(file, entry, buffer, event))
 		return false;
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 0dbdbfe0ec41..792eca92c43a 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -21,10 +21,10 @@
  *	int __data_size;
  *	int pc;
  *
- *	if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
- *		if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
+ *	if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
+ *		if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
  *			event_triggers_call(trace_file, NULL);
- *		if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
+ *		if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
  *			return;
  *	}
  *
@@ -44,10 +44,10 @@
  *	{ <assign>; }  <-- Here we assign the entries by the __field and
  *			   __array macros.
  *
- *	if (eflags & FTRACE_EVENT_FL_TRIGGER_COND)
+ *	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
  *		__tt = event_triggers_call(trace_file, entry);
  *
- *	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
+ *	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT,
  *		     &trace_file->flags))
  *		ring_buffer_discard_commit(buffer, event);
  *	else if (!filter_check_discard(trace_file, entry, buffer, event))
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 07ff08661167..abcbf7ff8743 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -301,7 +301,7 @@ int filter_check_discard(struct trace_event_file *file, void *rec,
 			 struct ring_buffer *buffer,
 			 struct ring_buffer_event *event)
 {
-	if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) &&
+	if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
 	    !filter_match_preds(file->filter, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
@@ -1709,7 +1709,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 	 * to store the trace event for the tigger to use. It's recusive
 	 * safe and will not be recorded anywhere.
 	 */
-	if (!entry && trace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+	if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
 		*current_rb = temp_buffer;
 		entry = trace_buffer_lock_reserve(*current_rb,
 						  type, len, flags, pc);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 58984c252aac..404a372ad85a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -298,15 +298,15 @@ void trace_event_enable_cmd_record(bool enable)
 	mutex_lock(&event_mutex);
 	do_for_each_event_file(tr, file) {
 
-		if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
+		if (!(file->flags & EVENT_FILE_FL_ENABLED))
 			continue;
 
 		if (enable) {
 			tracing_start_cmdline_record();
-			set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+			set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 		} else {
 			tracing_stop_cmdline_record();
-			clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+			clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 		}
 	} while_for_each_event_file();
 	mutex_unlock(&event_mutex);
@@ -337,24 +337,24 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		if (soft_disable) {
 			if (atomic_dec_return(&file->sm_ref) > 0)
 				break;
-			disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
-			clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+			disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
+			clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
 		} else
-			disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
+			disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
 
-		if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
-			clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
-			if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
+		if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
+			clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
+			if (file->flags & EVENT_FILE_FL_RECORDED_CMD) {
 				tracing_stop_cmdline_record();
-				clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+				clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 			}
 			call->class->reg(call, TRACE_REG_UNREGISTER, file);
 		}
 		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
-		if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
-			set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+		if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+			set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 		else
-			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+			clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 		break;
 	case 1:
 		/*
@@ -366,22 +366,22 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		 * it still seems to be disabled.
 		 */
 		if (!soft_disable)
-			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+			clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 		else {
 			if (atomic_inc_return(&file->sm_ref) > 1)
 				break;
-			set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+			set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
 		}
 
-		if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
+		if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
 
 			/* Keep the event disabled, when going to SOFT_MODE. */
 			if (soft_disable)
-				set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+				set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 
 			if (trace_flags & TRACE_ITER_RECORD_CMD) {
 				tracing_start_cmdline_record();
-				set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
+				set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 			}
 			ret = call->class->reg(call, TRACE_REG_REGISTER, file);
 			if (ret) {
@@ -390,7 +390,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 					"%s\n", trace_event_name(call));
 				break;
 			}
-			set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+			set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
 
 			/* WAS_ENABLED gets set but never cleared. */
 			call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
@@ -716,7 +716,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
 	(*pos)++;
 
 	list_for_each_entry_continue(file, &tr->events, list) {
-		if (file->flags & FTRACE_EVENT_FL_ENABLED)
+		if (file->flags & EVENT_FILE_FL_ENABLED)
 			return file;
 	}
 
@@ -774,12 +774,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	if (!file)
 		return -ENODEV;
 
-	if (flags & FTRACE_EVENT_FL_ENABLED &&
-	    !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+	if (flags & EVENT_FILE_FL_ENABLED &&
+	    !(flags & EVENT_FILE_FL_SOFT_DISABLED))
 		strcpy(buf, "1");
 
-	if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
-	    flags & FTRACE_EVENT_FL_SOFT_MODE)
+	if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
+	    flags & EVENT_FILE_FL_SOFT_MODE)
 		strcat(buf, "*");
 
 	strcat(buf, "\n");
@@ -851,7 +851,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		 * or if all events or cleared, or if we have
 		 * a mixture.
 		 */
-		set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
+		set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED));
 
 		/*
 		 * If we have a mixture, no need to look further.
@@ -1932,10 +1932,10 @@ static int probe_remove_event_call(struct trace_event_call *call)
 			continue;
 		/*
 		 * We can't rely on ftrace_event_enable_disable(enable => 0)
-		 * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
+		 * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
 		 * TRACE_REG_UNREGISTER.
 		 */
-		if (file->flags & FTRACE_EVENT_FL_ENABLED)
+		if (file->flags & EVENT_FILE_FL_ENABLED)
 			return -EBUSY;
 		/*
 		 * The do_for_each_event_file_safe() is
@@ -2114,9 +2114,9 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
 		return;
 
 	if (data->enable)
-		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+		clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
 	else
-		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+		set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
 }
 
 static void
@@ -2132,7 +2132,7 @@ event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data
 		return;
 
 	/* Skip if the event is in a state we want to switch to */
-	if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+	if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
 		return;
 
 	if (data->count != -1)
@@ -2793,7 +2793,7 @@ static __init void event_trace_self_tests(void)
 		 * If an event is already enabled, someone is using
 		 * it and the self test should not be on.
 		 */
-		if (file->flags & FTRACE_EVENT_FL_ENABLED) {
+		if (file->flags & EVENT_FILE_FL_ENABLED) {
 			pr_warn("Enabled event during self test!\n");
 			WARN_ON_ONCE(1);
 			continue;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 203dd3750e91..319560a1af4b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -787,7 +787,7 @@ static void filter_disable(struct trace_event_file *file)
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags &= ~TRACE_EVENT_FL_FILTERED;
 	else
-		file->flags &= ~FTRACE_EVENT_FL_FILTERED;
+		file->flags &= ~EVENT_FILE_FL_FILTERED;
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -1669,7 +1669,7 @@ static inline void event_set_filtered_flag(struct trace_event_file *file)
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags |= TRACE_EVENT_FL_FILTERED;
 	else
-		file->flags |= FTRACE_EVENT_FL_FILTERED;
+		file->flags |= EVENT_FILE_FL_FILTERED;
 }
 
 static inline void event_set_filter(struct trace_event_file *file,
@@ -1701,7 +1701,7 @@ event_set_no_set_filter_flag(struct trace_event_file *file)
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
 	else
-		file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER;
+		file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline void
@@ -1712,7 +1712,7 @@ event_clear_no_set_filter_flag(struct trace_event_file *file)
 	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
 		call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
 	else
-		file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER;
+		file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline bool
@@ -1720,7 +1720,7 @@ event_no_set_filter_flag(struct trace_event_file *file)
 {
 	struct trace_event_call *call = file->event_call;
 
-	if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER)
+	if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
 		return true;
 
 	if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index ccd6a2adc7ad..bb2cff8abe71 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -438,12 +438,12 @@ static int trace_event_trigger_enable_disable(struct trace_event_file *file,
 	if (trigger_enable) {
 		if (atomic_inc_return(&file->tm_ref) > 1)
 			return ret;
-		set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+		set_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags);
 		ret = trace_event_enable_disable(file, 1, 1);
 	} else {
 		if (atomic_dec_return(&file->tm_ref) > 0)
 			return ret;
-		clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags);
+		clear_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags);
 		ret = trace_event_enable_disable(file, 0, 1);
 	}
 
@@ -501,9 +501,9 @@ static void update_cond_flag(struct trace_event_file *file)
 	}
 
 	if (set_cond)
-		set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+		set_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
 	else
-		clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags);
+		clear_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
 }
 
 /**
@@ -1063,9 +1063,9 @@ event_enable_trigger(struct event_trigger_data *data)
 	struct enable_trigger_data *enable_data = data->private_data;
 
 	if (enable_data->enable)
-		clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+		clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
 	else
-		set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
+		set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags);
 }
 
 static void
@@ -1077,7 +1077,7 @@ event_enable_count_trigger(struct event_trigger_data *data)
 		return;
 
 	/* Skip if the event is in a state we want to switch to */
-	if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+	if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED))
 		return;
 
 	if (data->count != -1)
-- 
cgit v1.2.3


From 09a5059aa1a2cbf8c8993e61b013cc83a0dd5833 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 13 May 2015 15:21:25 -0400
Subject: tracing: Rename ftrace_trigger_soft_disabled() to
 trace_trigger_soft_disabled()

The name "ftrace" really refers to the function hook infrastructure. It
is not about the trace_events. The ftrace_trigger_soft_disabled() tests if a
trace_event is soft disabled (called but not traced), and returns true if
it is. It has nothing to do with function tracing and should be renamed.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h  | 4 ++--
 include/trace/perf.h          | 2 +-
 kernel/trace/trace_kprobe.c   | 4 ++--
 kernel/trace/trace_syscalls.c | 4 ++--
 kernel/trace/trace_uprobe.c   | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index a46c138b2eea..1063c850dbab 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -426,7 +426,7 @@ extern void event_triggers_post_call(struct trace_event_file *file,
 				     enum event_trigger_type tt);
 
 /**
- * ftrace_trigger_soft_disabled - do triggers and test if soft disabled
+ * trace_trigger_soft_disabled - do triggers and test if soft disabled
  * @file: The file pointer of the event to test
  *
  * If any triggers without filters are attached to this event, they
@@ -435,7 +435,7 @@ extern void event_triggers_post_call(struct trace_event_file *file,
  * otherwise false.
  */
 static inline bool
-ftrace_trigger_soft_disabled(struct trace_event_file *file)
+trace_trigger_soft_disabled(struct trace_event_file *file)
 {
 	unsigned long eflags = file->flags;
 
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 792eca92c43a..c9b831f3da23 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -159,7 +159,7 @@ ftrace_raw_event_##call(void *__data, proto)				\
 	struct ftrace_raw_##call *entry;				\
 	int __data_size;						\
 									\
-	if (ftrace_trigger_soft_disabled(trace_file))			\
+	if (trace_trigger_soft_disabled(trace_file))			\
 		return;							\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 4ce865dae39e..b7d0cdd9906c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -928,7 +928,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
 
 	WARN_ON(call != trace_file->event_call);
 
-	if (ftrace_trigger_soft_disabled(trace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	local_save_flags(irq_flags);
@@ -976,7 +976,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 
 	WARN_ON(call != trace_file->event_call);
 
-	if (ftrace_trigger_soft_disabled(trace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	local_save_flags(irq_flags);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 504f582b15db..7d567a4b9fa7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -312,7 +312,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	if (!trace_file)
 		return;
 
-	if (ftrace_trigger_soft_disabled(trace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
@@ -359,7 +359,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (!trace_file)
 		return;
 
-	if (ftrace_trigger_soft_disabled(trace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	sys_data = syscall_nr_to_meta(syscall_nr);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 40764abc7d09..aa1ea7b36fa8 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -784,7 +784,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
 	if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
 		return;
 
-	if (ftrace_trigger_soft_disabled(trace_file))
+	if (trace_trigger_soft_disabled(trace_file))
 		return;
 
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-- 
cgit v1.2.3


From f0b5e8a42f37a880b8467e59dc814f4f21581d3d Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Tue, 12 May 2015 20:28:07 +0200
Subject: net: kill useless net_*_ingress_queue() definitions when NET_CLS_ACT
 is unset

This fixes 4577139b2dabf589 ("net: use jump label patching for ingress qdisc in
__netif_receive_skb_core").

The only client of this is sch_ingress and it depends on NET_CLS_ACT. So
there is no way these definition can be of any help.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 7b8e260c4a27..bd29ab4b0941 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -82,14 +82,6 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
 #ifdef CONFIG_NET_CLS_ACT
 void net_inc_ingress_queue(void);
 void net_dec_ingress_queue(void);
-#else
-static inline void net_inc_ingress_queue(void)
-{
-}
-
-static inline void net_dec_ingress_queue(void)
-{
-}
 #endif
 
 extern void rtnetlink_init(void);
-- 
cgit v1.2.3


From 25956b6612601cf36022392ffa83f6bf97939bcd Mon Sep 17 00:00:00 2001
From: Hanjun Guo <hanjun.guo@linaro.org>
Date: Mon, 11 May 2015 12:17:13 +0800
Subject: ACPI / processor: Introduce invalid_logical_cpuid()

In ACPI processor drivers, we use direct comparisons of cpu logical
id with -1 which are error prone in case logical cpuid is accidentally
assinged an error code and prevents us from returning an error-encoding
cpuid directly in some cases.

So introduce invalid_logical_cpuid() to identify cpu with invalid
logical cpu num, then it will be used to replace the direct comparisons
with -1.

Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_processor.c | 5 +++--
 drivers/acpi/processor_pdc.c  | 5 +----
 include/linux/acpi.h          | 5 +++++
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 58f335ca2e75..ac6bda030a74 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -275,7 +275,8 @@ static int acpi_processor_get_info(struct acpi_device *device)
 		 * Handle UP system running SMP kernel, with no CPU
 		 * entry in MADT
 		 */
-		if ((cpu_index == -1) && (num_online_cpus() == 1))
+		if (invalid_logical_cpuid(cpu_index)
+		    && (num_online_cpus() == 1))
 			cpu_index = 0;
 	}
 	pr->id = cpu_index;
@@ -285,7 +286,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
 	 *  less than the max # of CPUs. They should be ignored _iff
 	 *  they are physically not present.
 	 */
-	if (pr->id == -1) {
+	if (invalid_logical_cpuid(pr->id)) {
 		int ret = acpi_processor_hotadd_init(pr);
 		if (ret)
 			return ret;
diff --git a/drivers/acpi/processor_pdc.c b/drivers/acpi/processor_pdc.c
index e5dd80800930..7cfbda4d7c51 100644
--- a/drivers/acpi/processor_pdc.c
+++ b/drivers/acpi/processor_pdc.c
@@ -52,10 +52,7 @@ static bool __init processor_physically_present(acpi_handle handle)
 	type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
 	cpuid = acpi_get_cpuid(handle, type, acpi_id);
 
-	if (cpuid == -1)
-		return false;
-
-	return true;
+	return !invalid_logical_cpuid(cpuid);
 }
 
 static void acpi_set_pdc_bits(u32 *buf)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e35e29c..913b49f9a6e6 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -158,6 +158,11 @@ typedef u32 phys_cpuid_t;
 #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1)
 #endif
 
+static inline bool invalid_logical_cpuid(u32 cpuid)
+{
+	return (int)cpuid < 0;
+}
+
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 /* Arch dependent functions for cpu hotplug support */
 int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu);
-- 
cgit v1.2.3


From ddcc18f5bdd1aafd457032ec693fd9d0af764d61 Mon Sep 17 00:00:00 2001
From: Hanjun Guo <hanjun.guo@linaro.org>
Date: Wed, 13 May 2015 16:19:30 +0800
Subject: ACPI / processor: Introduce invalid_phys_cpuid()

Introduce invalid_phys_cpuid() to identify cpu with invalid
physical ID, then used it as replacement of the direct comparisons
with PHYS_CPUID_INVALID.

Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_processor.c | 4 ++--
 drivers/acpi/processor_core.c | 4 ++--
 include/linux/acpi.h          | 5 +++++
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c
index 62c846befb62..92a5f738e370 100644
--- a/drivers/acpi/acpi_processor.c
+++ b/drivers/acpi/acpi_processor.c
@@ -170,7 +170,7 @@ static int acpi_processor_hotadd_init(struct acpi_processor *pr)
 	acpi_status status;
 	int ret;
 
-	if (pr->phys_id == PHYS_CPUID_INVALID)
+	if (invalid_phys_cpuid(pr->phys_id))
 		return -ENODEV;
 
 	status = acpi_evaluate_integer(pr->handle, "_STA", NULL, &sta);
@@ -264,7 +264,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
 
 	pr->phys_id = acpi_get_phys_id(pr->handle, device_declaration,
 					pr->acpi_id);
-	if (pr->phys_id == PHYS_CPUID_INVALID)
+	if (invalid_phys_cpuid(pr->phys_id))
 		acpi_handle_debug(pr->handle, "failed to get CPU physical ID.\n");
 
 	pr->id = acpi_map_cpuid(pr->phys_id, pr->acpi_id);
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index fd4140dabbf1..33a38d604630 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -184,7 +184,7 @@ phys_cpuid_t acpi_get_phys_id(acpi_handle handle, int type, u32 acpi_id)
 	phys_cpuid_t phys_id;
 
 	phys_id = map_mat_entry(handle, type, acpi_id);
-	if (phys_id == PHYS_CPUID_INVALID)
+	if (invalid_phys_cpuid(phys_id))
 		phys_id = map_madt_entry(type, acpi_id);
 
 	return phys_id;
@@ -196,7 +196,7 @@ int acpi_map_cpuid(phys_cpuid_t phys_id, u32 acpi_id)
 	int i;
 #endif
 
-	if (phys_id == PHYS_CPUID_INVALID) {
+	if (invalid_phys_cpuid(phys_id)) {
 		/*
 		 * On UP processor, there is no _MAT or MADT table.
 		 * So above phys_id is always set to PHYS_CPUID_INVALID.
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 913b49f9a6e6..90e4ed1eb191 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -163,6 +163,11 @@ static inline bool invalid_logical_cpuid(u32 cpuid)
 	return (int)cpuid < 0;
 }
 
+static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id)
+{
+	return phys_id == PHYS_CPUID_INVALID;
+}
+
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 /* Arch dependent functions for cpu hotplug support */
 int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu);
-- 
cgit v1.2.3


From 1b852bceb0d111e510d1a15826ecc4a19358d512 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 May 2015 23:22:29 -0500
Subject: mnt: Refactor the logic for mounting sysfs and proc in a user
 namespace

Fresh mounts of proc and sysfs are a very special case that works very
much like a bind mount.  Unfortunately the current structure can not
preserve the MNT_LOCK... mount flags.  Therefore refactor the logic
into a form that can be modified to preserve those lock bits.

Add a new filesystem flag FS_USERNS_VISIBLE that requires some mount
of the filesystem be fully visible in the current mount namespace,
before the filesystem may be mounted.

Move the logic for calling fs_fully_visible from proc and sysfs into
fs/namespace.c where it has greater access to mount namespace state.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c     | 8 +++++++-
 fs/proc/root.c     | 5 +----
 fs/sysfs/mount.c   | 5 +----
 include/linux/fs.h | 2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1b9e11167bae..8e7edaf60fe1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2332,6 +2332,8 @@ unlock:
 	return err;
 }
 
+static bool fs_fully_visible(struct file_system_type *fs_type);
+
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
@@ -2363,6 +2365,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 			flags |= MS_NODEV;
 			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
 		}
+		if (type->fs_flags & FS_USERNS_VISIBLE) {
+			if (!fs_fully_visible(type))
+				return -EPERM;
+		}
 	}
 
 	mnt = vfs_kern_mount(type, flags, name, data);
@@ -3164,7 +3170,7 @@ bool current_chrooted(void)
 	return chrooted;
 }
 
-bool fs_fully_visible(struct file_system_type *type)
+static bool fs_fully_visible(struct file_system_type *type)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct mount *mnt;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b7fa4bfe896a..64e1ab64bde6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -112,9 +112,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		ns = task_active_pid_ns(current);
 		options = data;
 
-		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
-			return ERR_PTR(-EPERM);
-
 		/* Does the mounter have privilege over the pid namespace? */
 		if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
 			return ERR_PTR(-EPERM);
@@ -159,7 +156,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
 };
 
 void __init proc_root_init(void)
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 8a49486bf30c..1c6ac6fcee9f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -31,9 +31,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	bool new_sb;
 
 	if (!(flags & MS_KERNMOUNT)) {
-		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
-			return ERR_PTR(-EPERM);
-
 		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
 			return ERR_PTR(-EPERM);
 	}
@@ -58,7 +55,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 35ec87e490b1..2d24eeb8e59c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1897,6 +1897,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
+#define FS_USERNS_VISIBLE	32	/* FS must already be visible */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
@@ -1984,7 +1985,6 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
 extern bool our_mnt(struct vfsmount *mnt);
-extern bool fs_fully_visible(struct file_system_type *);
 
 extern int current_umask(void);
 
-- 
cgit v1.2.3


From 87d5c18ce1f41a2410d4fb26d5d68c55867450f5 Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Wed, 13 May 2015 18:19:34 +0200
Subject: netfilter: cleanup struct nf_hook_ops indentation

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 63560d0a8dfe..83be4a3cec98 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -79,16 +79,16 @@ typedef unsigned int nf_hookfn(const struct nf_hook_ops *ops,
 			       const struct nf_hook_state *state);
 
 struct nf_hook_ops {
-	struct list_head list;
+	struct list_head 	list;
 
 	/* User fills in from here down. */
-	nf_hookfn	*hook;
-	struct module	*owner;
-	void		*priv;
-	u_int8_t	pf;
-	unsigned int	hooknum;
+	nf_hookfn		*hook;
+	struct module		*owner;
+	void			*priv;
+	u_int8_t		pf;
+	unsigned int		hooknum;
 	/* Hooks are ordered in ascending priority. */
-	int		priority;
+	int			priority;
 };
 
 struct nf_sockopt_ops {
-- 
cgit v1.2.3


From f7191483461ce2ae579b6f7227fa7ce49e006656 Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Wed, 13 May 2015 18:19:35 +0200
Subject: netfilter: add hook list to nf_hook_state

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h | 7 +++++--
 net/netfilter/core.c      | 6 ++----
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 83be4a3cec98..388ed1952242 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -54,10 +54,12 @@ struct nf_hook_state {
 	struct net_device *in;
 	struct net_device *out;
 	struct sock *sk;
+	struct list_head *hook_list;
 	int (*okfn)(struct sock *, struct sk_buff *);
 };
 
 static inline void nf_hook_state_init(struct nf_hook_state *p,
+				      struct list_head *hook_list,
 				      unsigned int hook,
 				      int thresh, u_int8_t pf,
 				      struct net_device *indev,
@@ -71,6 +73,7 @@ static inline void nf_hook_state_init(struct nf_hook_state *p,
 	p->in = indev;
 	p->out = outdev;
 	p->sk = sk;
+	p->hook_list = hook_list;
 	p->okfn = okfn;
 }
 
@@ -166,8 +169,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 	if (nf_hooks_active(pf, hook)) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, hook, thresh, pf,
-				   indev, outdev, sk, okfn);
+		nf_hook_state_init(&state, &nf_hooks[pf][hook], hook, thresh,
+				   pf, indev, outdev, sk, okfn);
 		return nf_hook_slow(skb, &state);
 	}
 	return 1;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index e6163017c42d..e418cfd603c0 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -166,11 +166,9 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
 	/* We may already have this, but read-locks nest anyway */
 	rcu_read_lock();
 
-	elem = list_entry_rcu(&nf_hooks[state->pf][state->hook],
-			      struct nf_hook_ops, list);
+	elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list);
 next_hook:
-	verdict = nf_iterate(&nf_hooks[state->pf][state->hook], skb, state,
-			     &elem);
+	verdict = nf_iterate(state->hook_list, skb, state, &elem);
 	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
 		ret = 1;
 	} else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
-- 
cgit v1.2.3


From b8d0aad0c77f488d1d51a02d871a5cbc2d8032b9 Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Wed, 13 May 2015 18:19:36 +0200
Subject: netfilter: add nf_hook_list_active()

In preparation to have netfilter ingress per-device hook list.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 388ed1952242..49d00638d1fa 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -134,26 +134,33 @@ extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 #ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
-static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
+static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
+				       u_int8_t pf, unsigned int hook)
 {
 	if (__builtin_constant_p(pf) &&
 	    __builtin_constant_p(hook))
 		return static_key_false(&nf_hooks_needed[pf][hook]);
 
-	return !list_empty(&nf_hooks[pf][hook]);
+	return !list_empty(nf_hook_list);
 }
 #else
-static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
+static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
+				       u_int8_t pf, unsigned int hook)
 {
-	return !list_empty(&nf_hooks[pf][hook]);
+	return !list_empty(nf_hook_list);
 }
 #endif
 
+static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
+{
+	return nf_hook_list_active(&nf_hooks[pf][hook], pf, hook);
+}
+
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
 
 /**
  *	nf_hook_thresh - call a netfilter hook
- *	
+ *
  *	Returns 1 if the hook has allowed the packet to pass.  The function
  *	okfn must be invoked by the caller in this case.  Any other return
  *	value indicates the packet has been consumed by the hook.
-- 
cgit v1.2.3


From 1cf51900f8545b358b5deaacfda348d990f671db Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Wed, 13 May 2015 18:19:37 +0200
Subject: net: add CONFIG_NET_INGRESS to enable ingress filtering

This new config switch enables the ingress filtering infrastructure that is
controlled through the ingress_needed static key. This prepares the
introduction of the Netfilter ingress hook that resides under this unique
static key.

Note that CONFIG_SCH_INGRESS automatically selects this, that should be no
problem since this also depends on CONFIG_NET_CLS_ACT.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 2 +-
 net/Kconfig               | 3 +++
 net/core/dev.c            | 7 ++++---
 net/sched/Kconfig         | 1 +
 4 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index bd29ab4b0941..a2324fb45cf4 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -79,7 +79,7 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
 
 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
 
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
 void net_inc_ingress_queue(void);
 void net_dec_ingress_queue(void);
 #endif
diff --git a/net/Kconfig b/net/Kconfig
index 44dd5786ee91..57a7c5af3175 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -45,6 +45,9 @@ config COMPAT_NETLINK_MESSAGES
 	  Newly written code should NEVER need this option but do
 	  compat-independent messages instead!
 
+config NET_INGRESS
+	bool
+
 menu "Networking options"
 
 source "net/packet/Kconfig"
diff --git a/net/core/dev.c b/net/core/dev.c
index af549062ae8e..a5ef90016ce7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1630,7 +1630,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
 }
 EXPORT_SYMBOL(call_netdevice_notifiers);
 
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
 static struct static_key ingress_needed __read_mostly;
 
 void net_inc_ingress_queue(void)
@@ -3798,13 +3798,14 @@ another_round:
 	}
 
 skip_taps:
-#ifdef CONFIG_NET_CLS_ACT
+#ifdef CONFIG_NET_INGRESS
 	if (static_key_false(&ingress_needed)) {
 		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
 		if (!skb)
 			goto unlock;
 	}
-
+#endif
+#ifdef CONFIG_NET_CLS_ACT
 	skb->tc_verd = 0;
 ncls:
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 5fd1c2f487d2..daa33432b716 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -312,6 +312,7 @@ config NET_SCH_PIE
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
+	select NET_INGRESS
 	---help---
 	  Say Y here if you want to use classifiers for incoming packets.
 	  If unsure, say Y.
-- 
cgit v1.2.3


From e687ad60af09010936bbd0b2a3b5d90a8ee8353c Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Wed, 13 May 2015 18:19:38 +0200
Subject: netfilter: add netfilter ingress hook after handle_ing() under unique
 static key

This patch adds the Netfilter ingress hook just after the existing tc ingress
hook, that seems to be the consensus solution for this.

Note that the Netfilter hook resides under the global static key that enables
ingress filtering. Nonetheless, Netfilter still also has its own static key for
minimal impact on the existing handle_ing().

* Without this patch:

Result: OK: 6216490(c6216338+d152) usec, 100000000 (60byte,0frags)
  16086246pps 7721Mb/sec (7721398080bps) errors: 100000000

    42.46%  kpktgend_0   [kernel.kallsyms]   [k] __netif_receive_skb_core
    25.92%  kpktgend_0   [kernel.kallsyms]   [k] kfree_skb
     7.81%  kpktgend_0   [pktgen]            [k] pktgen_thread_worker
     5.62%  kpktgend_0   [kernel.kallsyms]   [k] ip_rcv
     2.70%  kpktgend_0   [kernel.kallsyms]   [k] netif_receive_skb_internal
     2.34%  kpktgend_0   [kernel.kallsyms]   [k] netif_receive_skb_sk
     1.44%  kpktgend_0   [kernel.kallsyms]   [k] __build_skb

* With this patch:

Result: OK: 6214833(c6214731+d101) usec, 100000000 (60byte,0frags)
  16090536pps 7723Mb/sec (7723457280bps) errors: 100000000

    41.23%  kpktgend_0      [kernel.kallsyms]  [k] __netif_receive_skb_core
    26.57%  kpktgend_0      [kernel.kallsyms]  [k] kfree_skb
     7.72%  kpktgend_0      [pktgen]           [k] pktgen_thread_worker
     5.55%  kpktgend_0      [kernel.kallsyms]  [k] ip_rcv
     2.78%  kpktgend_0      [kernel.kallsyms]  [k] netif_receive_skb_internal
     2.06%  kpktgend_0      [kernel.kallsyms]  [k] netif_receive_skb_sk
     1.43%  kpktgend_0      [kernel.kallsyms]  [k] __build_skb

* Without this patch + tc ingress:

        tc filter add dev eth4 parent ffff: protocol ip prio 1 \
                u32 match ip dst 4.3.2.1/32

Result: OK: 9269001(c9268821+d179) usec, 100000000 (60byte,0frags)
  10788648pps 5178Mb/sec (5178551040bps) errors: 100000000

    40.99%  kpktgend_0   [kernel.kallsyms]  [k] __netif_receive_skb_core
    17.50%  kpktgend_0   [kernel.kallsyms]  [k] kfree_skb
    11.77%  kpktgend_0   [cls_u32]          [k] u32_classify
     5.62%  kpktgend_0   [kernel.kallsyms]  [k] tc_classify_compat
     5.18%  kpktgend_0   [pktgen]           [k] pktgen_thread_worker
     3.23%  kpktgend_0   [kernel.kallsyms]  [k] tc_classify
     2.97%  kpktgend_0   [kernel.kallsyms]  [k] ip_rcv
     1.83%  kpktgend_0   [kernel.kallsyms]  [k] netif_receive_skb_internal
     1.50%  kpktgend_0   [kernel.kallsyms]  [k] netif_receive_skb_sk
     0.99%  kpktgend_0   [kernel.kallsyms]  [k] __build_skb

* With this patch + tc ingress:

        tc filter add dev eth4 parent ffff: protocol ip prio 1 \
                u32 match ip dst 4.3.2.1/32

Result: OK: 9308218(c9308091+d126) usec, 100000000 (60byte,0frags)
  10743194pps 5156Mb/sec (5156733120bps) errors: 100000000

    42.01%  kpktgend_0   [kernel.kallsyms]   [k] __netif_receive_skb_core
    17.78%  kpktgend_0   [kernel.kallsyms]   [k] kfree_skb
    11.70%  kpktgend_0   [cls_u32]           [k] u32_classify
     5.46%  kpktgend_0   [kernel.kallsyms]   [k] tc_classify_compat
     5.16%  kpktgend_0   [pktgen]            [k] pktgen_thread_worker
     2.98%  kpktgend_0   [kernel.kallsyms]   [k] ip_rcv
     2.84%  kpktgend_0   [kernel.kallsyms]   [k] tc_classify
     1.96%  kpktgend_0   [kernel.kallsyms]   [k] netif_receive_skb_internal
     1.57%  kpktgend_0   [kernel.kallsyms]   [k] netif_receive_skb_sk

Note that the results are very similar before and after.

I can see gcc gets the code under the ingress static key out of the hot path.
Then, on that cold branch, it generates the code to accomodate the netfilter
ingress static key. My explanation for this is that this reduces the pressure
on the instruction cache for non-users as the new code is out of the hot path,
and it comes with minimal impact for tc ingress users.

Using gcc version 4.8.4 on:

Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                8
[...]
L1d cache:             16K
L1i cache:             64K
L2 cache:              2048K
L3 cache:              8192K

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h         |  3 +++
 include/linux/netfilter.h         |  1 +
 include/linux/netfilter_ingress.h | 41 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/netfilter.h    |  6 ++++++
 net/core/dev.c                    | 36 ++++++++++++++++++++++++++++++++++
 net/netfilter/Kconfig             |  7 +++++++
 net/netfilter/core.c              | 31 ++++++++++++++++++++++++++++-
 7 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/netfilter_ingress.h

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d3ed01c18247..51f8d2f5dc3f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1656,6 +1656,9 @@ struct net_device {
 	struct tcf_proto __rcu  *ingress_cl_list;
 #endif
 	struct netdev_queue __rcu *ingress_queue;
+#ifdef CONFIG_NETFILTER_INGRESS
+	struct list_head	nf_hooks_ingress;
+#endif
 
 	unsigned char		broadcast[MAX_ADDR_LEN];
 #ifdef CONFIG_RFS_ACCEL
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 49d00638d1fa..f5ff5d156da8 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -86,6 +86,7 @@ struct nf_hook_ops {
 
 	/* User fills in from here down. */
 	nf_hookfn		*hook;
+	struct net_device	*dev;
 	struct module		*owner;
 	void			*priv;
 	u_int8_t		pf;
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
new file mode 100644
index 000000000000..cb0727fe2b3d
--- /dev/null
+++ b/include/linux/netfilter_ingress.h
@@ -0,0 +1,41 @@
+#ifndef _NETFILTER_INGRESS_H_
+#define _NETFILTER_INGRESS_H_
+
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_NETFILTER_INGRESS
+static inline int nf_hook_ingress_active(struct sk_buff *skb)
+{
+	return nf_hook_list_active(&skb->dev->nf_hooks_ingress,
+				   NFPROTO_NETDEV, NF_NETDEV_INGRESS);
+}
+
+static inline int nf_hook_ingress(struct sk_buff *skb)
+{
+	struct nf_hook_state state;
+
+	nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress,
+			   NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, NULL,
+			   skb->dev, NULL, NULL);
+	return nf_hook_slow(skb, &state);
+}
+
+static inline void nf_hook_ingress_init(struct net_device *dev)
+{
+	INIT_LIST_HEAD(&dev->nf_hooks_ingress);
+}
+#else /* CONFIG_NETFILTER_INGRESS */
+static inline int nf_hook_ingress_active(struct sk_buff *skb)
+{
+	return 0;
+}
+
+static inline int nf_hook_ingress(struct sk_buff *skb)
+{
+	return 0;
+}
+
+static inline void nf_hook_ingress_init(struct net_device *dev) {}
+#endif /* CONFIG_NETFILTER_INGRESS */
+#endif /* _NETFILTER_INGRESS_H_ */
diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index ef1b1f88ca18..177027cce6b3 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -51,11 +51,17 @@ enum nf_inet_hooks {
 	NF_INET_NUMHOOKS
 };
 
+enum nf_dev_hooks {
+	NF_NETDEV_INGRESS,
+	NF_NETDEV_NUMHOOKS
+};
+
 enum {
 	NFPROTO_UNSPEC =  0,
 	NFPROTO_INET   =  1,
 	NFPROTO_IPV4   =  2,
 	NFPROTO_ARP    =  3,
+	NFPROTO_NETDEV =  5,
 	NFPROTO_BRIDGE =  7,
 	NFPROTO_IPV6   = 10,
 	NFPROTO_DECNET = 12,
diff --git a/net/core/dev.c b/net/core/dev.c
index a5ef90016ce7..29f0d6e6542c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,6 +135,7 @@
 #include <linux/if_macvlan.h>
 #include <linux/errqueue.h>
 #include <linux/hrtimer.h>
+#include <linux/netfilter_ingress.h>
 
 #include "net-sysfs.h"
 
@@ -3666,6 +3667,13 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb,
 
 	return skb;
 }
+#else
+static inline struct sk_buff *handle_ing(struct sk_buff *skb,
+					 struct packet_type **pt_prev,
+					 int *ret, struct net_device *orig_dev)
+{
+	return skb;
+}
 #endif
 
 /**
@@ -3739,6 +3747,28 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
 	}
 }
 
+#ifdef CONFIG_NETFILTER_INGRESS
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+			     int *ret, struct net_device *orig_dev)
+{
+	if (nf_hook_ingress_active(skb)) {
+		if (*pt_prev) {
+			*ret = deliver_skb(skb, *pt_prev, orig_dev);
+			*pt_prev = NULL;
+		}
+
+		return nf_hook_ingress(skb);
+	}
+	return 0;
+}
+#else
+static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
+			     int *ret, struct net_device *orig_dev)
+{
+	return 0;
+}
+#endif
+
 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
 {
 	struct packet_type *ptype, *pt_prev;
@@ -3803,6 +3833,9 @@ skip_taps:
 		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
 		if (!skb)
 			goto unlock;
+
+		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
+			goto unlock;
 	}
 #endif
 #ifdef CONFIG_NET_CLS_ACT
@@ -6968,6 +7001,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->group = INIT_NETDEV_GROUP;
 	if (!dev->ethtool_ops)
 		dev->ethtool_ops = &default_ethtool_ops;
+
+	nf_hook_ingress_init(dev);
+
 	return dev;
 
 free_all:
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f70e34a68f70..db1c674397ad 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1,6 +1,13 @@
 menu "Core Netfilter Configuration"
 	depends on NET && INET && NETFILTER
 
+config NETFILTER_INGRESS
+	bool "Netfilter ingress support"
+	select NET_INGRESS
+	help
+	  This allows you to classify packets from ingress using the Netfilter
+	  infrastructure.
+
 config NETFILTER_NETLINK
 	tristate
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index e418cfd603c0..653e32eac08c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -64,10 +64,27 @@ static DEFINE_MUTEX(nf_hook_mutex);
 
 int nf_register_hook(struct nf_hook_ops *reg)
 {
+	struct list_head *nf_hook_list;
 	struct nf_hook_ops *elem;
 
 	mutex_lock(&nf_hook_mutex);
-	list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) {
+	switch (reg->pf) {
+	case NFPROTO_NETDEV:
+#ifdef CONFIG_NETFILTER_INGRESS
+		if (reg->hooknum == NF_NETDEV_INGRESS) {
+			BUG_ON(reg->dev == NULL);
+			nf_hook_list = &reg->dev->nf_hooks_ingress;
+			net_inc_ingress_queue();
+			break;
+		}
+#endif
+		/* Fall through. */
+	default:
+		nf_hook_list = &nf_hooks[reg->pf][reg->hooknum];
+		break;
+	}
+
+	list_for_each_entry(elem, nf_hook_list, list) {
 		if (reg->priority < elem->priority)
 			break;
 	}
@@ -85,6 +102,18 @@ void nf_unregister_hook(struct nf_hook_ops *reg)
 	mutex_lock(&nf_hook_mutex);
 	list_del_rcu(&reg->list);
 	mutex_unlock(&nf_hook_mutex);
+	switch (reg->pf) {
+	case NFPROTO_NETDEV:
+#ifdef CONFIG_NETFILTER_INGRESS
+		if (reg->hooknum == NF_NETDEV_INGRESS) {
+			net_dec_ingress_queue();
+			break;
+		}
+		break;
+#endif
+	default:
+		break;
+	}
 #ifdef HAVE_JUMP_LABEL
 	static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
-- 
cgit v1.2.3


From af6c235d1a5c112964c3029eb0ed4b52c7aa33bf Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 13 May 2015 13:03:21 +0200
Subject: gpio: discourage passing base to gpio_chip

Passing a fixed base in struct gpio_chip is done for legacy
systems that cannot handle dynamic allocation. Discourage this
behaviour in the kerneldoc.

Acked-by: Alexandre Courbot <acourbot@nvidia.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 96a678842cde..cc7ec129b329 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -42,8 +42,12 @@ struct seq_file;
  * @dbg_show: optional routine to show contents in debugfs; default code
  *	will be used when this is omitted, but custom code can show extra
  *	state (such as pullup/pulldown configuration).
- * @base: identifies the first GPIO number handled by this chip; or, if
- *	negative during registration, requests dynamic ID allocation.
+ * @base: identifies the first GPIO number handled by this chip;
+ *	or, if negative during registration, requests dynamic ID allocation.
+ *	DEPRECATION: providing anything non-negative and nailing the base
+ *	base offset of GPIO chips is deprecated. Please pass -1 as base to
+ *	let gpiolib select the chip base in all possible cases. We want to
+ *	get rid of the static GPIO number space in the long run.
  * @ngpio: the number of GPIOs handled by this controller; the last GPIO
  *	handled is (base + ngpio - 1).
  * @desc: array of ngpio descriptors. Private.
-- 
cgit v1.2.3


From 7fb48c5bc3100f7674a8e26f42c1518196500728 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 3 May 2015 22:05:28 +0200
Subject: netfilter: bridge: neigh_head and physoutdev can't be used at same
 time

The neigh_header is only needed when we detect DNAT after prerouting
and neigh cache didn't have a mac address for us.

The output port has not been chosen yet so we can re-use the storage
area, bringing struct size down to 32 bytes on x86_64.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h    | 8 +++++---
 net/bridge/br_netfilter.c | 2 ++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c0b574a414e7..3d932e64125a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -170,12 +170,14 @@ struct nf_bridge_info {
 		BRNF_PROTO_UNCHANGED,
 		BRNF_PROTO_8021Q,
 		BRNF_PROTO_PPPOE
-	} orig_proto;
+	} orig_proto:8;
 	bool			pkt_otherhost;
 	unsigned int		mask;
 	struct net_device	*physindev;
-	struct net_device	*physoutdev;
-	char			neigh_header[8];
+	union {
+		struct net_device *physoutdev;
+		char neigh_header[8];
+	};
 };
 #endif
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index ab55e2472beb..13973da29b2a 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -973,6 +973,8 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
 				       nf_bridge->neigh_header,
 				       ETH_HLEN - ETH_ALEN);
 	skb->dev = nf_bridge->physindev;
+
+	nf_bridge->physoutdev = NULL;
 	br_handle_frame_finish(NULL, skb);
 }
 
-- 
cgit v1.2.3


From a3b1c1eb50f9b3e0c73c37157d0c61b2e90ae580 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Wed, 6 May 2015 16:28:57 +0200
Subject: netfilter: ipset: deinline ip_set_put_extensions()

On x86 allyesconfig build:
The function compiles to 489 bytes of machine code.
It has 25 callsites.

    text    data       bss       dec     hex filename
82441375 22255384 20627456 125324215 7784bb7 vmlinux.before
82434909 22255384 20627456 125317749 7783275 vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: David S. Miller <davem@davemloft.net>
CC: Jan Engelhardt <jengelh@medozas.de>
CC: Jiri Pirko <jpirko@redhat.com>
CC: linux-kernel@vger.kernel.org
CC: netdev@vger.kernel.org
CC: netfilter-devel@vger.kernel.org
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h | 24 ++----------------------
 net/netfilter/ipset/ip_set_core.c      | 25 +++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index f88be7258e5f..ffdfdc24952a 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -533,29 +533,9 @@ bitmap_bytes(u32 a, u32 b)
 #include <linux/netfilter/ipset/ip_set_timeout.h>
 #include <linux/netfilter/ipset/ip_set_comment.h>
 
-static inline int
+int
 ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
-		      const void *e, bool active)
-{
-	if (SET_WITH_TIMEOUT(set)) {
-		unsigned long *timeout = ext_timeout(e, set);
-
-		if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
-			htonl(active ? ip_set_timeout_get(timeout)
-				: *timeout)))
-			return -EMSGSIZE;
-	}
-	if (SET_WITH_COUNTER(set) &&
-	    ip_set_put_counter(skb, ext_counter(e, set)))
-		return -EMSGSIZE;
-	if (SET_WITH_COMMENT(set) &&
-	    ip_set_put_comment(skb, ext_comment(e, set)))
-		return -EMSGSIZE;
-	if (SET_WITH_SKBINFO(set) &&
-	    ip_set_put_skbinfo(skb, ext_skbinfo(e, set)))
-		return -EMSGSIZE;
-	return 0;
-}
+		      const void *e, bool active);
 
 #define IP_SET_INIT_KEXT(skb, opt, set)			\
 	{ .bytes = (skb)->len, .packets = 1,		\
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 7f9c0565e715..475e4960a164 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -432,6 +432,31 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
 }
 EXPORT_SYMBOL_GPL(ip_set_get_extensions);
 
+int
+ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
+		      const void *e, bool active)
+{
+	if (SET_WITH_TIMEOUT(set)) {
+		unsigned long *timeout = ext_timeout(e, set);
+
+		if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+			htonl(active ? ip_set_timeout_get(timeout)
+				: *timeout)))
+			return -EMSGSIZE;
+	}
+	if (SET_WITH_COUNTER(set) &&
+	    ip_set_put_counter(skb, ext_counter(e, set)))
+		return -EMSGSIZE;
+	if (SET_WITH_COMMENT(set) &&
+	    ip_set_put_comment(skb, ext_comment(e, set)))
+		return -EMSGSIZE;
+	if (SET_WITH_SKBINFO(set) &&
+	    ip_set_put_skbinfo(skb, ext_skbinfo(e, set)))
+		return -EMSGSIZE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_put_extensions);
+
 /*
  * Creating/destroying/renaming/swapping affect the existence and
  * the properties of a set. All of these can be executed from userspace
-- 
cgit v1.2.3


From 922f2dd1b65a888e34c472979460dc23211750a2 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 12 May 2015 10:33:24 -0700
Subject: net: phy: Add phy_ignore_ta_mask to account for broken turn-around

Some PHY devices/switches will not release the turn-around line as they
should do at the end of a MDIO transaction. To help with such
situations, allow MDIO bus drivers to be made aware of such
restrictions.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 685809835b5c..701c7a3946e0 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -181,6 +181,9 @@ struct mii_bus {
 	/* PHY addresses to be ignored when probing */
 	u32 phy_mask;
 
+	/* PHY addresses to ignore the TA/read failure */
+	u32 phy_ignore_ta_mask;
+
 	/*
 	 * Pointer to an array of interrupts, each PHY's
 	 * interrupt at the index matching its address
-- 
cgit v1.2.3


From 8c8a457a60050d5922676f81913d87e4af6fd97b Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <n.borisov@siteground.com>
Date: Thu, 14 May 2015 14:31:01 +0300
Subject: sched: Remove redundant #ifdef

Two adjacent members in task_struct were guarded
by the same #define, so we can merge the two blocks.

Signed-off-by: Nikolay Borisov <n.borisov@siteground.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1431603061-29408-1-git-send-email-kernel@kyup.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0eceeec5a01a..5f8defa155cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1398,8 +1398,6 @@ struct task_struct {
 	int rcu_read_lock_nesting;
 	union rcu_special rcu_read_unlock_special;
 	struct list_head rcu_node_entry;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-#ifdef CONFIG_PREEMPT_RCU
 	struct rcu_node *rcu_blocked_node;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 #ifdef CONFIG_TASKS_RCU
-- 
cgit v1.2.3


From faad38492814112e3e7ce94d90123bbe301fff33 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 10 May 2015 01:18:03 +0200
Subject: sched / idle: Call idle_set_state() from cpuidle_enter_state()

Introduce a wrapper function around idle_set_state() called
sched_idle_set_state() that will pass this_rq() to it as the
first argument and make cpuidle_enter_state() call the new
function before and after entering the target state.

At the same time, remove direct invocations of idle_set_state()
from call_cpuidle().

This will allow the invocation of default_idle_call() to be
moved from call_cpuidle() to cpuidle_enter_state() safely
and call_cpuidle() to be simplified a bit as a result.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Tested-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Tested-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Kevin Hilman <khilman@linaro.org>
---
 drivers/cpuidle/cpuidle.c |  6 ++++++
 include/linux/cpuidle.h   |  3 +++
 kernel/sched/idle.c       | 15 +++++++++------
 3 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 597f88443bdc..9306dd5f460e 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -170,6 +170,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	if (broadcast && tick_broadcast_enter())
 		return -EBUSY;
 
+	/* Take note of the planned idle state. */
+	sched_idle_set_state(target_state);
+
 	trace_cpu_idle_rcuidle(index, dev->cpu);
 	time_start = ktime_get();
 
@@ -178,6 +181,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	time_end = ktime_get();
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
+	/* The cpu is no longer idle or about to enter idle. */
+	sched_idle_set_state(NULL);
+
 	if (broadcast) {
 		if (WARN_ON_ONCE(!irqs_disabled()))
 			local_irq_disable();
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 9c5e89254796..301eaaab40e3 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -200,6 +200,9 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 	struct cpuidle_device *dev) {return NULL; }
 #endif
 
+/* kernel/sched/idle.c */
+extern void sched_idle_set_state(struct cpuidle_state *idle_state);
+
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
 void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a);
 #else
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9c919b42f846..5d9f549fffa8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -15,6 +15,15 @@
 
 #include "sched.h"
 
+/**
+ * sched_idle_set_state - Record idle state for the current CPU.
+ * @idle_state: State to record.
+ */
+void sched_idle_set_state(struct cpuidle_state *idle_state)
+{
+	idle_set_state(this_rq(), idle_state);
+}
+
 static int __read_mostly cpu_idle_force_poll;
 
 void cpu_idle_poll_ctrl(bool enable)
@@ -100,9 +109,6 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		return -EBUSY;
 	}
 
-	/* Take note of the planned idle state. */
-	idle_set_state(this_rq(), &drv->states[next_state]);
-
 	/*
 	 * Enter the idle state previously returned by the governor decision.
 	 * This function will block until an interrupt occurs and will take
@@ -110,9 +116,6 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 */
 	entered_state = cpuidle_enter(drv, dev, next_state);
 
-	/* The cpu is no longer idle or about to enter idle. */
-	idle_set_state(this_rq(), NULL);
-
 	if (entered_state == -EBUSY)
 		default_idle_call();
 
-- 
cgit v1.2.3


From 827a5aefc542b8fb17c00de06118e5cd0e3800f2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sun, 10 May 2015 01:18:46 +0200
Subject: sched / idle: Call default_idle_call() from cpuidle_enter_state()

The check of the cpuidle_enter() return value against -EBUSY
made in call_cpuidle() will not be necessary any more if
cpuidle_enter_state() calls default_idle_call() directly when it
is about to return -EBUSY, so make that happen and eliminate the
check.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Tested-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Tested-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Kevin Hilman <khilman@linaro.org>
---
 drivers/cpuidle/cpuidle.c |  4 +++-
 include/linux/cpuidle.h   |  1 +
 kernel/sched/idle.c       | 20 +++++++-------------
 3 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 9306dd5f460e..a7b9e679a2ef 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -167,8 +167,10 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	 * local timer will be shut down.  If a local timer is used from another
 	 * CPU as a broadcast timer, this call may fail if it is not available.
 	 */
-	if (broadcast && tick_broadcast_enter())
+	if (broadcast && tick_broadcast_enter()) {
+		default_idle_call();
 		return -EBUSY;
+	}
 
 	/* Take note of the planned idle state. */
 	sched_idle_set_state(target_state);
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 301eaaab40e3..c7a63643658e 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -202,6 +202,7 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 
 /* kernel/sched/idle.c */
 extern void sched_idle_set_state(struct cpuidle_state *idle_state);
+extern void default_idle_call(void);
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
 void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 5d9f549fffa8..594275ed2620 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -76,12 +76,13 @@ void __weak arch_cpu_idle(void)
 	local_irq_enable();
 }
 
-static void default_idle_call(void)
+/**
+ * default_idle_call - Default CPU idle routine.
+ *
+ * To use when the cpuidle framework cannot be used.
+ */
+void default_idle_call(void)
 {
-	/*
-	 * We can't use the cpuidle framework, let's use the default idle
-	 * routine.
-	 */
 	if (current_clr_polling_and_test())
 		local_irq_enable();
 	else
@@ -91,8 +92,6 @@ static void default_idle_call(void)
 static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		      int next_state)
 {
-	int entered_state;
-
 	/* Fall back to the default arch idle method on errors. */
 	if (next_state < 0) {
 		default_idle_call();
@@ -114,12 +113,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	 * This function will block until an interrupt occurs and will take
 	 * care of re-enabling the local interrupts
 	 */
-	entered_state = cpuidle_enter(drv, dev, next_state);
-
-	if (entered_state == -EBUSY)
-		default_idle_call();
-
-	return entered_state;
+	return cpuidle_enter(drv, dev, next_state);
 }
 
 /**
-- 
cgit v1.2.3


From 4573237b01221881702fbe6655f3ae5135be1c18 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 12 May 2015 12:22:34 +0530
Subject: cpufreq: Manage governor usage history with 'policy->last_governor'

History of which governor was used last is common to all CPUs within a
policy and maintaining it per-cpu isn't the best approach for sure.

Apart from wasting memory, this also increases the complexity of
managing this data structure as it has to be updated for all CPUs.

To make that somewhat simpler, lets store this information in a new
field 'last_governor' in struct cpufreq_policy and update it on removal
of last cpu of a policy.

As a side-effect it also solves an old problem, consider a system with
two clusters 0 & 1. And there is one policy per cluster.

Cluster 0: CPU0 and 1.
Cluster 1: CPU2 and 3.

 - CPU2 is first brought online, and governor is set to performance
   (default as cpufreq_cpu_governor wasn't set).
 - Governor is changed to ondemand.
 - CPU2 is taken offline and cpufreq_cpu_governor is updated for CPU2.
 - CPU3 is brought online.
 - Because cpufreq_cpu_governor wasn't set for CPU3, the default governor
   performance is picked for CPU3.

This patch fixes the bug as we now have a single variable to update for
policy.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 30 +++++++++++++++---------------
 include/linux/cpufreq.h   |  1 +
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e6a63d6ba6f1..16275ba6428e 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -104,9 +104,6 @@ static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
 static DEFINE_RWLOCK(cpufreq_driver_lock);
 DEFINE_MUTEX(cpufreq_governor_lock);
 
-/* This one keeps track of the previously set governor of a removed CPU */
-static DEFINE_PER_CPU(char[CPUFREQ_NAME_LEN], cpufreq_cpu_governor);
-
 /* Flag to suspend/resume CPUFreq governors */
 static bool cpufreq_suspended;
 
@@ -1017,7 +1014,7 @@ static void cpufreq_init_policy(struct cpufreq_policy *policy)
 	memcpy(&new_policy, policy, sizeof(*policy));
 
 	/* Update governor of new_policy to the governor used before hotplug */
-	gov = find_governor(per_cpu(cpufreq_cpu_governor, policy->cpu));
+	gov = find_governor(policy->last_governor);
 	if (gov)
 		pr_debug("Restoring governor %s for cpu %d\n",
 				policy->governor->name, policy->cpu);
@@ -1411,14 +1408,15 @@ static int __cpufreq_remove_dev_prepare(struct device *dev,
 			pr_err("%s: Failed to stop governor\n", __func__);
 			return ret;
 		}
-
-		strncpy(per_cpu(cpufreq_cpu_governor, cpu),
-			policy->governor->name, CPUFREQ_NAME_LEN);
 	}
 
-	down_read(&policy->rwsem);
+	down_write(&policy->rwsem);
 	cpus = cpumask_weight(policy->cpus);
-	up_read(&policy->rwsem);
+
+	if (has_target() && cpus == 1)
+		strncpy(policy->last_governor, policy->governor->name,
+			CPUFREQ_NAME_LEN);
+	up_write(&policy->rwsem);
 
 	if (cpu != policy->cpu) {
 		sysfs_remove_link(&dev->kobj, "cpufreq");
@@ -2135,7 +2133,8 @@ EXPORT_SYMBOL_GPL(cpufreq_register_governor);
 
 void cpufreq_unregister_governor(struct cpufreq_governor *governor)
 {
-	int cpu;
+	struct cpufreq_policy *policy;
+	unsigned long flags;
 
 	if (!governor)
 		return;
@@ -2143,12 +2142,13 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor)
 	if (cpufreq_disabled())
 		return;
 
-	for_each_present_cpu(cpu) {
-		if (cpu_online(cpu))
-			continue;
-		if (!strcmp(per_cpu(cpufreq_cpu_governor, cpu), governor->name))
-			strcpy(per_cpu(cpufreq_cpu_governor, cpu), "\0");
+	/* clear last_governor for all inactive policies */
+	read_lock_irqsave(&cpufreq_driver_lock, flags);
+	for_each_inactive_policy(policy) {
+		if (!strcmp(policy->last_governor, governor->name))
+			strcpy(policy->last_governor, "\0");
 	}
+	read_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
 	mutex_lock(&cpufreq_governor_mutex);
 	list_del(&governor->governor_list);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 2ee4888c1f47..48e37c07eb84 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -80,6 +80,7 @@ struct cpufreq_policy {
 	struct cpufreq_governor	*governor; /* see below */
 	void			*governor_data;
 	bool			governor_enabled; /* governor start/stop flag */
+	char			last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
 
 	struct work_struct	update; /* if update_policy() needs to be
 					 * called, but you're in IRQ context */
-- 
cgit v1.2.3


From a4afd37b26f4b9f640310a89b7f8d176ae3460b1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 May 2015 13:12:43 +0200
Subject: test_bpf: add tests related to BPF_MAXINSNS

Couple of torture test cases related to the bug fixed in 0b59d8806a31
("ARM: net: delegate filter to kernel interpreter when imm_offset()
return value can't fit into 12bits.").

I've added a helper to allocate and fill the insn space. Output on
x86_64 from my laptop:

test_bpf: #233 BPF_MAXINSNS: Maximum possible literals jited:0 7 PASS
test_bpf: #234 BPF_MAXINSNS: Single literal jited:0 8 PASS
test_bpf: #235 BPF_MAXINSNS: Run/add until end jited:0 11553 PASS
test_bpf: #236 BPF_MAXINSNS: Too many instructions PASS
test_bpf: #237 BPF_MAXINSNS: Very long jump jited:0 9 PASS
test_bpf: #238 BPF_MAXINSNS: Ctx heavy transformations jited:0 20329 20398 PASS
test_bpf: #239 BPF_MAXINSNS: Call heavy transformations jited:0 32178 32475 PASS
test_bpf: #240 BPF_MAXINSNS: Jump heavy test jited:0 10518 PASS

test_bpf: #233 BPF_MAXINSNS: Maximum possible literals jited:1 4 PASS
test_bpf: #234 BPF_MAXINSNS: Single literal jited:1 4 PASS
test_bpf: #235 BPF_MAXINSNS: Run/add until end jited:1 1625 PASS
test_bpf: #236 BPF_MAXINSNS: Too many instructions PASS
test_bpf: #237 BPF_MAXINSNS: Very long jump jited:1 8 PASS
test_bpf: #238 BPF_MAXINSNS: Ctx heavy transformations jited:1 3301 3174 PASS
test_bpf: #239 BPF_MAXINSNS: Call heavy transformations jited:1 24107 23491 PASS
test_bpf: #240 BPF_MAXINSNS: Jump heavy test jited:1 8651 PASS

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Nicolas Schichan <nschichan@freebox.fr>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |   8 ++
 lib/test_bpf.c         | 317 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 316 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ce1d72d34382..200be4a74a33 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -277,6 +277,14 @@ struct bpf_prog_aux;
 		.off   = 0,					\
 		.imm   = 0 })
 
+/* Internal classic blocks for direct assignment */
+
+#define __BPF_STMT(CODE, K)					\
+	((struct sock_filter) BPF_STMT(CODE, K))
+
+#define __BPF_JUMP(CODE, K, JT, JF)				\
+	((struct sock_filter) BPF_JUMP(CODE, K, JT, JF))
+
 #define bytes_to_bpf_size(bytes)				\
 ({								\
 	int bpf_size = -EINVAL;					\
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 8bca780e3613..66358b21acce 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -21,6 +21,7 @@
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/if_vlan.h>
+#include <linux/random.h>
 
 /* General test specific settings */
 #define MAX_SUBTESTS	3
@@ -67,6 +68,10 @@ struct bpf_test {
 	union {
 		struct sock_filter insns[MAX_INSNS];
 		struct bpf_insn insns_int[MAX_INSNS];
+		struct {
+			void *insns;
+			unsigned int len;
+		} ptr;
 	} u;
 	__u8 aux;
 	__u8 data[MAX_DATA];
@@ -74,8 +79,190 @@ struct bpf_test {
 		int data_size;
 		__u32 result;
 	} test[MAX_SUBTESTS];
+	int (*fill_helper)(struct bpf_test *self);
 };
 
+/* Large test cases need separate allocation and fill handler. */
+
+static int bpf_fill_maxinsns1(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	__u32 k = ~0;
+	int i;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	for (i = 0; i < len; i++, k--)
+		insn[i] = __BPF_STMT(BPF_RET | BPF_K, k);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns2(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	for (i = 0; i < len; i++)
+		insn[i] = __BPF_STMT(BPF_RET | BPF_K, 0xfefefefe);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns3(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	struct rnd_state rnd;
+	int i;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	prandom_seed_state(&rnd, 3141592653589793238ULL);
+
+	for (i = 0; i < len - 1; i++) {
+		__u32 k = prandom_u32_state(&rnd);
+
+		insn[i] = __BPF_STMT(BPF_ALU | BPF_ADD | BPF_K, k);
+	}
+
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_A, 0);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns4(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS + 1;
+	struct sock_filter *insn;
+	int i;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	for (i = 0; i < len; i++)
+		insn[i] = __BPF_STMT(BPF_RET | BPF_K, 0xfefefefe);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns5(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	insn[0] = __BPF_JUMP(BPF_JMP | BPF_JA, len - 2, 0, 0);
+
+	for (i = 1; i < len - 1; i++)
+		insn[i] = __BPF_STMT(BPF_RET | BPF_K, 0xfefefefe);
+
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_K, 0xabababab);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns6(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	for (i = 0; i < len - 1; i++)
+		insn[i] = __BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF +
+				     SKF_AD_VLAN_TAG_PRESENT);
+
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_A, 0);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns7(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	for (i = 0; i < len - 4; i++)
+		insn[i] = __BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF +
+				     SKF_AD_CPU);
+
+	insn[len - 4] = __BPF_STMT(BPF_MISC | BPF_TAX, 0);
+	insn[len - 3] = __BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF +
+				   SKF_AD_CPU);
+	insn[len - 2] = __BPF_STMT(BPF_ALU | BPF_SUB | BPF_X, 0);
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_A, 0);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
+static int bpf_fill_maxinsns8(struct bpf_test *self)
+{
+	unsigned int len = BPF_MAXINSNS;
+	struct sock_filter *insn;
+	int i, jmp_off = len - 3;
+
+	insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
+	if (!insn)
+		return -ENOMEM;
+
+	insn[0] = __BPF_STMT(BPF_LD | BPF_IMM, 0xffffffff);
+
+	for (i = 1; i < len - 1; i++)
+		insn[i] = __BPF_JUMP(BPF_JMP | BPF_JGT, 0xffffffff, jmp_off--, 0);
+
+	insn[len - 1] = __BPF_STMT(BPF_RET | BPF_A, 0);
+
+	self->u.ptr.insns = insn;
+	self->u.ptr.len = len;
+
+	return 0;
+}
+
 static struct bpf_test tests[] = {
 	{
 		"TAX",
@@ -3998,6 +4185,73 @@ static struct bpf_test tests[] = {
 		{ },
 		{ { 0, 1 } },
 	},
+	{	/* Mainly checking JIT here. */
+		"BPF_MAXINSNS: Maximum possible literals",
+		{ },
+		CLASSIC | FLAG_NO_DATA,
+		{ },
+		{ { 0, 0xffffffff } },
+		.fill_helper = bpf_fill_maxinsns1,
+	},
+	{	/* Mainly checking JIT here. */
+		"BPF_MAXINSNS: Single literal",
+		{ },
+		CLASSIC | FLAG_NO_DATA,
+		{ },
+		{ { 0, 0xfefefefe } },
+		.fill_helper = bpf_fill_maxinsns2,
+	},
+	{	/* Mainly checking JIT here. */
+		"BPF_MAXINSNS: Run/add until end",
+		{ },
+		CLASSIC | FLAG_NO_DATA,
+		{ },
+		{ { 0, 0x947bf368 } },
+		.fill_helper = bpf_fill_maxinsns3,
+	},
+	{
+		"BPF_MAXINSNS: Too many instructions",
+		{ },
+		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
+		{ },
+		{ },
+		.fill_helper = bpf_fill_maxinsns4,
+	},
+	{	/* Mainly checking JIT here. */
+		"BPF_MAXINSNS: Very long jump",
+		{ },
+		CLASSIC | FLAG_NO_DATA,
+		{ },
+		{ { 0, 0xabababab } },
+		.fill_helper = bpf_fill_maxinsns5,
+	},
+	{	/* Mainly checking JIT here. */
+		"BPF_MAXINSNS: Ctx heavy transformations",
+		{ },
+		CLASSIC,
+		{ },
+		{
+			{  1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) },
+			{ 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }
+		},
+		.fill_helper = bpf_fill_maxinsns6,
+	},
+	{	/* Mainly checking JIT here. */
+		"BPF_MAXINSNS: Call heavy transformations",
+		{ },
+		CLASSIC | FLAG_NO_DATA,
+		{ },
+		{ { 1, 0 }, { 10, 0 } },
+		.fill_helper = bpf_fill_maxinsns7,
+	},
+	{	/* Mainly checking JIT here. */
+		"BPF_MAXINSNS: Jump heavy test",
+		{ },
+		CLASSIC | FLAG_NO_DATA,
+		{ },
+		{ { 0, 0xffffffff } },
+		.fill_helper = bpf_fill_maxinsns8,
+	},
 };
 
 static struct net_device dev;
@@ -4051,10 +4305,15 @@ static void release_test_data(const struct bpf_test *test, void *data)
 	kfree_skb(data);
 }
 
-static int probe_filter_length(struct sock_filter *fp)
+static int filter_length(int which)
 {
-	int len = 0;
+	struct sock_filter *fp;
+	int len;
 
+	if (tests[which].fill_helper)
+		return tests[which].u.ptr.len;
+
+	fp = tests[which].u.insns;
 	for (len = MAX_INSNS - 1; len > 0; --len)
 		if (fp[len].code != 0 || fp[len].k != 0)
 			break;
@@ -4062,16 +4321,25 @@ static int probe_filter_length(struct sock_filter *fp)
 	return len + 1;
 }
 
+static void *filter_pointer(int which)
+{
+	if (tests[which].fill_helper)
+		return tests[which].u.ptr.insns;
+	else
+		return tests[which].u.insns;
+}
+
 static struct bpf_prog *generate_filter(int which, int *err)
 {
-	struct bpf_prog *fp;
-	struct sock_fprog_kern fprog;
-	unsigned int flen = probe_filter_length(tests[which].u.insns);
 	__u8 test_type = tests[which].aux & TEST_TYPE_MASK;
+	unsigned int flen = filter_length(which);
+	void *fptr = filter_pointer(which);
+	struct sock_fprog_kern fprog;
+	struct bpf_prog *fp;
 
 	switch (test_type) {
 	case CLASSIC:
-		fprog.filter = tests[which].u.insns;
+		fprog.filter = fptr;
 		fprog.len = flen;
 
 		*err = bpf_prog_create(&fp, &fprog);
@@ -4107,8 +4375,7 @@ static struct bpf_prog *generate_filter(int which, int *err)
 		}
 
 		fp->len = flen;
-		memcpy(fp->insnsi, tests[which].u.insns_int,
-		       fp->len * sizeof(struct bpf_insn));
+		memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn));
 
 		bpf_prog_select_runtime(fp);
 		break;
@@ -4180,6 +4447,29 @@ static int run_one(const struct bpf_prog *fp, struct bpf_test *test)
 	return err_cnt;
 }
 
+static __init int prepare_bpf_tests(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		if (tests[i].fill_helper &&
+		    tests[i].fill_helper(&tests[i]) < 0)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static __init void destroy_bpf_tests(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		if (tests[i].fill_helper)
+			kfree(tests[i].u.ptr.insns);
+	}
+}
+
 static __init int test_bpf(void)
 {
 	int i, err_cnt = 0, pass_cnt = 0;
@@ -4227,7 +4517,16 @@ static __init int test_bpf(void)
 
 static int __init test_bpf_init(void)
 {
-	return test_bpf();
+	int ret;
+
+	ret = prepare_bpf_tests();
+	if (ret < 0)
+		return ret;
+
+	ret = test_bpf();
+
+	destroy_bpf_tests();
+	return ret;
 }
 
 static void __exit test_bpf_exit(void)
-- 
cgit v1.2.3


From ef7f3a5c7149ad2dbd1d8a71d0aa88a02d1dbcb8 Mon Sep 17 00:00:00 2001
From: Bert Vermeulen <bert@biot.com>
Date: Wed, 13 May 2015 13:35:39 +0200
Subject: mdio-gpio: Propagate mii_bus.phy_ignore_ta_mask

This also changes mii_bus.phy_mask to u32 for consistency.

Signed-off-by: Bert Vermeulen <bert@biot.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio-gpio.c | 1 +
 include/linux/mdio-gpio.h   | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
index 53d18150f4e2..7dc21e56a7aa 100644
--- a/drivers/net/phy/mdio-gpio.c
+++ b/drivers/net/phy/mdio-gpio.c
@@ -158,6 +158,7 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev,
 	new_bus->name = "GPIO Bitbanged MDIO",
 
 	new_bus->phy_mask = pdata->phy_mask;
+	new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask;
 	new_bus->irq = pdata->irqs;
 	new_bus->parent = dev;
 
diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h
index 66c30a763b10..11f00cdabe3d 100644
--- a/include/linux/mdio-gpio.h
+++ b/include/linux/mdio-gpio.h
@@ -23,7 +23,8 @@ struct mdio_gpio_platform_data {
 	bool mdio_active_low;
 	bool mdo_active_low;
 
-	unsigned int phy_mask;
+	u32 phy_mask;
+	u32 phy_ignore_ta_mask;
 	int irqs[PHY_MAX_ADDR];
 	/* reset callback */
 	int (*reset)(struct mii_bus *bus);
-- 
cgit v1.2.3


From 8fa9dd24667f2d6997ec21341019657342859d31 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 23 Mar 2015 13:37:40 +1100
Subject: VFS/namei: make the use of touch_atime() in get_link() RCU-safe.

touch_atime is not RCU-safe, and so cannot be called on an RCU walk.
However, in situations where RCU-walk makes a difference, the symlink
will likely to accessed much more often than it is useful to update
the atime.

So split out the test of "Does the atime actually need to be updated"
into  atime_needs_update(), and have get_link() unlazy if it finds that
it will need to do that update.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 30 +++++++++++++++++++++---------
 fs/namei.c         | 12 +++++++++---
 include/linux/fs.h |  1 +
 3 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 952fb4852e38..e8d62688ed91 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1585,36 +1585,47 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
  *	This function automatically handles read only file systems and media,
  *	as well as the "noatime" flag and inode specific "noatime" markers.
  */
-void touch_atime(const struct path *path)
+bool atime_needs_update(const struct path *path, struct inode *inode)
 {
 	struct vfsmount *mnt = path->mnt;
-	struct inode *inode = d_inode(path->dentry);
 	struct timespec now;
 
 	if (inode->i_flags & S_NOATIME)
-		return;
+		return false;
 	if (IS_NOATIME(inode))
-		return;
+		return false;
 	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
-		return;
+		return false;
 
 	if (mnt->mnt_flags & MNT_NOATIME)
-		return;
+		return false;
 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
-		return;
+		return false;
 
 	now = current_fs_time(inode->i_sb);
 
 	if (!relatime_need_update(mnt, inode, now))
-		return;
+		return false;
 
 	if (timespec_equal(&inode->i_atime, &now))
+		return false;
+
+	return true;
+}
+
+void touch_atime(const struct path *path)
+{
+	struct vfsmount *mnt = path->mnt;
+	struct inode *inode = d_inode(path->dentry);
+	struct timespec now;
+
+	if (!atime_needs_update(path, inode))
 		return;
 
 	if (!sb_start_write_trylock(inode->i_sb))
 		return;
 
-	if (__mnt_want_write(mnt))
+	if (__mnt_want_write(mnt) != 0)
 		goto skip_update;
 	/*
 	 * File systems can error out when updating inodes if they need to
@@ -1625,6 +1636,7 @@ void touch_atime(const struct path *path)
 	 * We may also fail on filesystems that have the ability to make parts
 	 * of the fs read only, e.g. subvolumes in Btrfs.
 	 */
+	now = current_fs_time(inode->i_sb);
 	update_time(inode, &now, S_ATIME);
 	__mnt_drop_write(mnt);
 skip_update:
diff --git a/fs/namei.c b/fs/namei.c
index 47b20086e9f3..d9f77ff60b55 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -966,13 +966,19 @@ const char *get_link(struct nameidata *nd)
 	int error;
 	const char *res;
 
-	if (nd->flags & LOOKUP_RCU) {
+	if (!(nd->flags & LOOKUP_RCU)) {
+		touch_atime(&last->link);
+		cond_resched();
+	} else if (atime_needs_update(&last->link, inode)) {
 		if (unlikely(unlazy_walk(nd, NULL, 0)))
 			return ERR_PTR(-ECHILD);
+		touch_atime(&last->link);
 	}
-	cond_resched();
 
-	touch_atime(&last->link);
+	if (nd->flags & LOOKUP_RCU) {
+		if (unlikely(unlazy_walk(nd, NULL, 0)))
+			return ERR_PTR(-ECHILD);
+	}
 
 	error = security_inode_follow_link(dentry, inode,
 					   nd->flags & LOOKUP_RCU);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8f738512c874..1426c435d455 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1880,6 +1880,7 @@ enum file_time_flags {
 	S_VERSION = 8,
 };
 
+extern bool atime_needs_update(const struct path *, struct inode *);
 extern void touch_atime(const struct path *);
 static inline void file_accessed(struct file *file)
 {
-- 
cgit v1.2.3


From 89076bc31950eee576ecc06460c23466e2d50939 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 12 May 2015 08:29:38 -0400
Subject: get rid of assorted nameidata-related debris

pointless forward declarations, stale comments

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_inode.c      | 3 +--
 fs/9p/vfs_inode_dotl.c | 3 +--
 fs/ecryptfs/inode.c    | 3 +--
 fs/ntfs/namei.c        | 2 +-
 include/linux/fs.h     | 1 -
 include/linux/namei.h  | 1 -
 include/linux/sched.h  | 1 +
 7 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 271f51af2f75..510040b04c96 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1226,8 +1226,7 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
 /**
  * v9fs_vfs_follow_link - follow a symlink path
  * @dentry: dentry for symlink
- * @nd: nameidata
- *
+ * @cookie: place to pass the data to put_link()
  */
 
 static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 16658ed677c9..09e4433717b8 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -905,8 +905,7 @@ error:
 /**
  * v9fs_vfs_follow_link_dotl - follow a symlink path
  * @dentry: dentry for symlink
- * @nd: nameidata
- *
+ * @cookie: place to pass the data to put_link()
  */
 
 static const char *
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 73d20ae92478..3c4db1172d22 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -170,7 +170,6 @@ out_unlock:
  * @directory_inode: inode of the new file's dentry's parent in ecryptfs
  * @ecryptfs_dentry: New file's dentry in ecryptfs
  * @mode: The mode of the new file
- * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
  *
  * Creates the underlying file and the eCryptfs inode which will link to
  * it. It will also update the eCryptfs directory inode to mimic the
@@ -384,7 +383,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
  * ecryptfs_lookup
  * @ecryptfs_dir_inode: The eCryptfs directory inode
  * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
- * @ecryptfs_nd: nameidata; may be NULL
+ * @flags: lookup flags
  *
  * Find a file on disk. If the file does not exist, then we'll add it to the
  * dentry cache and continue on to read it from the disk.
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 0f35b80d17fe..443abecf01b7 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -35,7 +35,7 @@
  * ntfs_lookup - find the inode represented by a dentry in a directory inode
  * @dir_ino:	directory inode in which to look for the inode
  * @dent:	dentry representing the inode to look for
- * @nd:		lookup nameidata
+ * @flags:	lookup flags
  *
  * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
  * in the directory inode @dir_ino and if found attaches the inode to the
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1426c435d455..b577e801b4af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -38,7 +38,6 @@ struct backing_dev_info;
 struct export_operations;
 struct hd_geometry;
 struct iovec;
-struct nameidata;
 struct kiocb;
 struct kobject;
 struct pipe_inode_info;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d756304aa09b..1208e489f83e 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -7,7 +7,6 @@
 #include <linux/path.h>
 
 struct vfsmount;
-struct nameidata;
 
 enum { MAX_NESTED_LINKS = 8 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f6c9b69d66f2..a1158c954f0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -132,6 +132,7 @@ struct fs_struct;
 struct perf_event_context;
 struct blk_plug;
 struct filename;
+struct nameidata;
 
 #define VMACACHE_BITS 2
 #define VMACACHE_SIZE (1U << VMACACHE_BITS)
-- 
cgit v1.2.3


From b853a16176cf3e02c57e215743015614152c2428 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 13 May 2015 09:12:02 -0400
Subject: turn user_{path_at,path,lpath,path_dir}() into static inlines

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            |  8 +-------
 include/linux/namei.h | 34 ++++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 1a117c0d13c5..2dad0eaf91d3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2299,13 +2299,7 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 	return filename_lookup(dfd, getname_flags(name, flags, empty),
 			       flags, path, NULL);
 }
-
-int user_path_at(int dfd, const char __user *name, unsigned flags,
-		 struct path *path)
-{
-	return user_path_at_empty(dfd, name, flags, path, NULL);
-}
-EXPORT_SYMBOL(user_path_at);
+EXPORT_SYMBOL(user_path_at_empty);
 
 /*
  * NB: most callers don't do anything directly with the reference to the
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 1208e489f83e..d8c6334cd150 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -1,12 +1,10 @@
 #ifndef _LINUX_NAMEI_H
 #define _LINUX_NAMEI_H
 
-#include <linux/dcache.h>
-#include <linux/errno.h>
-#include <linux/linkage.h>
+#include <linux/kernel.h>
 #include <linux/path.h>
-
-struct vfsmount;
+#include <linux/fcntl.h>
+#include <linux/errno.h>
 
 enum { MAX_NESTED_LINKS = 8 };
 
@@ -46,13 +44,29 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_ROOT		0x2000
 #define LOOKUP_EMPTY		0x4000
 
-extern int user_path_at(int, const char __user *, unsigned, struct path *);
 extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
 
-#define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path)
-#define user_lpath(name, path) user_path_at(AT_FDCWD, name, 0, path)
-#define user_path_dir(name, path) \
-	user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, path)
+static inline int user_path_at(int dfd, const char __user *name, unsigned flags,
+		 struct path *path)
+{
+	return user_path_at_empty(dfd, name, flags, path, NULL);
+}
+
+static inline int user_path(const char __user *name, struct path *path)
+{
+	return user_path_at_empty(AT_FDCWD, name, LOOKUP_FOLLOW, path, NULL);
+}
+
+static inline int user_lpath(const char __user *name, struct path *path)
+{
+	return user_path_at_empty(AT_FDCWD, name, 0, path, NULL);
+}
+
+static inline int user_path_dir(const char __user *name, struct path *path)
+{
+	return user_path_at_empty(AT_FDCWD, name,
+				  LOOKUP_FOLLOW | LOOKUP_DIRECTORY, path, NULL);
+}
 
 extern int kern_path(const char *, unsigned, struct path *);
 
-- 
cgit v1.2.3


From 55917a21d0cc012bb6073bb05bb768fd51d8e237 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 14 May 2015 14:57:23 +0200
Subject: netfilter: x_tables: add context to know if extension runs from
 nft_compat

Currently, we have four xtables extensions that cannot be used from the
xt over nft compat layer. The problem is that they need real access to
the full blown xt_entry to validate that the rule comes with the right
dependencies. This check was introduced to overcome the lack of
sufficient userspace dependency validation in iptables.

To resolve this problem, this patch introduces a new field to the
xt_tgchk_param structure that tell us if the extension is run from
nft_compat context.

The three affected extensions are:

1) CLUSTERIP, this target has been superseded by xt_cluster. So just
   bail out by returning -EINVAL.

2) TCPMSS. Relax the checking when used from nft_compat. If used with
   the wrong configuration, it will corrupt !syn packets by adding TCP
   MSS option.

3) ebt_stp. Relax the check to make sure it uses the reserved
   destination MAC address for STP.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Tested-by: Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
---
 include/linux/netfilter/x_tables.h | 2 ++
 net/bridge/netfilter/ebt_stp.c     | 6 ++++--
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 5 +++++
 net/netfilter/nft_compat.c         | 2 ++
 net/netfilter/xt_TCPMSS.c          | 6 ++++++
 5 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index a3e215bb0241..09f38206c18f 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -62,6 +62,7 @@ struct xt_mtchk_param {
 	void *matchinfo;
 	unsigned int hook_mask;
 	u_int8_t family;
+	bool nft_compat;
 };
 
 /**
@@ -92,6 +93,7 @@ struct xt_tgchk_param {
 	void *targinfo;
 	unsigned int hook_mask;
 	u_int8_t family;
+	bool nft_compat;
 };
 
 /* Target destructor parameters */
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 071d87214dde..0c40570069ba 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -164,8 +164,10 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
 	    !(info->bitmask & EBT_STP_MASK))
 		return -EINVAL;
 	/* Make sure the match only receives stp frames */
-	if (!ether_addr_equal(e->destmac, bridge_ula) ||
-	    !ether_addr_equal(e->destmsk, msk) || !(e->bitmask & EBT_DESTMAC))
+	if (!par->nft_compat &&
+	    (!ether_addr_equal(e->destmac, bridge_ula) ||
+	     !ether_addr_equal(e->destmsk, msk) ||
+	     !(e->bitmask & EBT_DESTMAC)))
 		return -EINVAL;
 
 	return 0;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 771ab3d01ad3..45cb16a6a4a3 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -367,6 +367,11 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 	struct clusterip_config *config;
 	int ret;
 
+	if (par->nft_compat) {
+		pr_err("cannot use CLUSTERIP target from nftables compat\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
 	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
 	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 7f29cfc76349..66def315eb56 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -161,6 +161,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
 		par->hook_mask = 0;
 	}
 	par->family	= ctx->afi->family;
+	par->nft_compat = true;
 }
 
 static void target_compat_from_user(struct xt_target *t, void *in, void *out)
@@ -377,6 +378,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
 		par->hook_mask = 0;
 	}
 	par->family	= ctx->afi->family;
+	par->nft_compat = true;
 }
 
 static void match_compat_from_user(struct xt_match *m, void *in, void *out)
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index e762de5ee89b..8c3190e2fc6a 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -277,6 +277,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
 			"FORWARD, OUTPUT and POSTROUTING hooks\n");
 		return -EINVAL;
 	}
+	if (par->nft_compat)
+		return 0;
+
 	xt_ematch_foreach(ematch, e)
 		if (find_syn_match(ematch))
 			return 0;
@@ -299,6 +302,9 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
 			"FORWARD, OUTPUT and POSTROUTING hooks\n");
 		return -EINVAL;
 	}
+	if (par->nft_compat)
+		return 0;
+
 	xt_ematch_foreach(ematch, e)
 		if (find_syn_match(ematch))
 			return 0;
-- 
cgit v1.2.3


From 3f7f642b9bc46453e1435e8b67f1c4f7949be7ff Mon Sep 17 00:00:00 2001
From: Martin Fuzzey <mfuzzey@parkeon.com>
Date: Wed, 13 May 2015 12:26:42 +0200
Subject: iio: core: add high pass filter attributes

Add a high pass filter attribute for measurements
(like the existing low pass)

Also add both high and low pass attributes for events.

Signed-off-by: Martin Fuzzey <mfuzzey@parkeon.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 30 ++++++++++++++++++++++++++++++
 drivers/iio/industrialio-core.c         |  2 ++
 drivers/iio/industrialio-event.c        |  2 ++
 include/linux/iio/iio.h                 |  1 +
 include/linux/iio/types.h               |  2 ++
 5 files changed, 37 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index e46c71fbd047..f66262c64e2f 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -420,6 +420,16 @@ Description:
 		to the underlying data channel, then this parameter
 		gives the 3dB frequency of the filter in Hz.
 
+What:		/sys/.../in_accel_filter_high_pass_3db_frequency
+What:		/sys/.../in_anglvel_filter_high_pass_3db_frequency
+What:		/sys/.../in_magn_filter_high_pass_3db_frequency
+KernelVersion:	4.2
+Contact:	linux-iio@vger.kernel.org
+Description:
+		If a known or controllable high pass filter is applied
+		to the underlying data channel, then this parameter
+		gives the 3dB frequency of the filter in Hz.
+
 What:		/sys/bus/iio/devices/iio:deviceX/out_voltageY_raw
 What:		/sys/bus/iio/devices/iio:deviceX/out_altvoltageY_raw
 KernelVersion:	2.6.37
@@ -880,6 +890,26 @@ Description:
 		met before an event is generated. If direction is not
 		specified then this period applies to both directions.
 
+What:		/sys/.../events/in_accel_thresh_rising_low_pass_filter_3db
+What:		/sys/.../events/in_anglvel_thresh_rising_low_pass_filter_3db
+What:		/sys/.../events/in_magn_thresh_rising_low_pass_filter_3db
+KernelVersion:	4.2
+Contact:	linux-iio@vger.kernel.org
+Description:
+		If a low pass filter can be applied to the event generation
+		this property gives its 3db frequency in Hz.
+		A value of zero disables the filter.
+
+What:		/sys/.../events/in_accel_thresh_rising_high_pass_filter_3db
+What:		/sys/.../events/in_anglvel_thresh_rising_high_pass_filter_3db
+What:		/sys/.../events/in_magn_thresh_rising_high_pass_filter_3db
+KernelVersion:	4.2
+Contact:	linux-iio@vger.kernel.org
+Description:
+		If a high pass filter can be applied to the event generation
+		this property gives its 3db frequency in Hz.
+		A value of zero disables the filter.
+
 What:		/sys/.../events/in_activity_still_thresh_rising_en
 What:		/sys/.../events/in_activity_still_thresh_falling_en
 What:		/sys/.../events/in_activity_walking_thresh_rising_en
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index dfa81db3b910..9688a88b6198 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -117,6 +117,8 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_AVERAGE_RAW] = "mean_raw",
 	[IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY]
 	= "filter_low_pass_3db_frequency",
+	[IIO_CHAN_INFO_HIGH_PASS_FILTER_3DB_FREQUENCY]
+	= "filter_high_pass_3db_frequency",
 	[IIO_CHAN_INFO_SAMP_FREQ] = "sampling_frequency",
 	[IIO_CHAN_INFO_FREQUENCY] = "frequency",
 	[IIO_CHAN_INFO_PHASE] = "phase",
diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index a99692ba91bc..894d8137c4cf 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -211,6 +211,8 @@ static const char * const iio_ev_info_text[] = {
 	[IIO_EV_INFO_VALUE] = "value",
 	[IIO_EV_INFO_HYSTERESIS] = "hysteresis",
 	[IIO_EV_INFO_PERIOD] = "period",
+	[IIO_EV_INFO_HIGH_PASS_FILTER_3DB] = "high_pass_filter_3db",
+	[IIO_EV_INFO_LOW_PASS_FILTER_3DB] = "low_pass_filter_3db",
 };
 
 static enum iio_event_direction iio_ev_attr_dir(struct iio_dev_attr *attr)
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 058441da4984..f79148261d16 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -32,6 +32,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_QUADRATURE_CORRECTION_RAW,
 	IIO_CHAN_INFO_AVERAGE_RAW,
 	IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY,
+	IIO_CHAN_INFO_HIGH_PASS_FILTER_3DB_FREQUENCY,
 	IIO_CHAN_INFO_SAMP_FREQ,
 	IIO_CHAN_INFO_FREQUENCY,
 	IIO_CHAN_INFO_PHASE,
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index 942b6de68e2f..32b579525004 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -17,6 +17,8 @@ enum iio_event_info {
 	IIO_EV_INFO_VALUE,
 	IIO_EV_INFO_HYSTERESIS,
 	IIO_EV_INFO_PERIOD,
+	IIO_EV_INFO_HIGH_PASS_FILTER_3DB,
+	IIO_EV_INFO_LOW_PASS_FILTER_3DB,
 };
 
 #define IIO_VAL_INT 1
-- 
cgit v1.2.3


From c91d46065209bfe6124396fc409765ced0601b59 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 15 May 2015 05:48:07 -0700
Subject: net: fix two sparse errors

First one in __skb_checksum_validate_complete() fixes the following
(and other callers)

make C=2 CF=-D__CHECK_ENDIAN__ net/ipv4/tcp_ipv4.o
  CHECK   net/ipv4/tcp_ipv4.c
include/linux/skbuff.h:3052:24: warning: incorrect type in return expression (different base types)
include/linux/skbuff.h:3052:24:    expected restricted __sum16
include/linux/skbuff.h:3052:24:    got int

Second is fixing gso_make_checksum() :

  CHECK   net/ipv4/gre_offload.c
include/linux/skbuff.h:3360:14: warning: incorrect type in assignment (different base types)
include/linux/skbuff.h:3360:14:    expected unsigned short [unsigned] [usertype] csum
include/linux/skbuff.h:3360:14:    got restricted __sum16
include/linux/skbuff.h:3365:16: warning: incorrect type in return expression (different base types)
include/linux/skbuff.h:3365:16:    expected restricted __sum16
include/linux/skbuff.h:3365:16:    got unsigned short [unsigned] [usertype] csum

Fixes: 5a21232983aa7 ("net: Support for csum_bad in skbuff")
Fixes: 7e2b10c1e52ca ("net: Support for multiple checksums with gso")
Signed-off-by: Eric Dumazet <edumazet@google.com>
CC: Tom Herbert <tom@herbertland.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f83aa6568cbf..b57eebfb67e0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3051,7 +3051,7 @@ static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
 		}
 	} else if (skb->csum_bad) {
 		/* ip_summed == CHECKSUM_NONE in this case */
-		return 1;
+		return (__force __sum16)1;
 	}
 
 	skb->csum = psum;
@@ -3353,15 +3353,14 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
 static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
 {
 	int plen = SKB_GSO_CB(skb)->csum_start - skb_headroom(skb) -
-	    skb_transport_offset(skb);
-	__u16 csum;
+		   skb_transport_offset(skb);
+	__wsum partial;
 
-	csum = csum_fold(csum_partial(skb_transport_header(skb),
-				      plen, skb->csum));
+	partial = csum_partial(skb_transport_header(skb), plen, skb->csum);
 	skb->csum = res;
 	SKB_GSO_CB(skb)->csum_start -= plen;
 
-	return csum;
+	return csum_fold(partial);
 }
 
 static inline bool skb_is_gso(const struct sk_buff *skb)
-- 
cgit v1.2.3


From e1031dc1f7ba5c8724ba211062134076df292791 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Thu, 7 May 2015 17:38:07 +0200
Subject: dmaengine: Support different source and destination stride

In interleaved mode, we can expect to have different source and destination
strides.

Add support for such case to dmaengine.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 include/linux/dmaengine.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index ad419757241f..5d63acb09813 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -122,10 +122,18 @@ enum dma_transfer_direction {
  *	 chunk and before first src/dst address for next chunk.
  *	 Ignored for dst(assumed 0), if dst_inc is true and dst_sgl is false.
  *	 Ignored for src(assumed 0), if src_inc is true and src_sgl is false.
+ * @dst_icg: Number of bytes to jump after last dst address of this
+ *	 chunk and before the first dst address for next chunk.
+ *	 Ignored if dst_inc is true and dst_sgl is false.
+ * @src_icg: Number of bytes to jump after last src address of this
+ *	 chunk and before the first src address for next chunk.
+ *	 Ignored if src_inc is true and src_sgl is false.
  */
 struct data_chunk {
 	size_t size;
 	size_t icg;
+	size_t dst_icg;
+	size_t src_icg;
 };
 
 /**
-- 
cgit v1.2.3


From 4cfafd3082afc707653aeb82e9f8e7b596fbbfd6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 14 May 2015 12:23:11 +0200
Subject: sched,perf: Fix periodic timers

In the below two commits (see Fixes) we have periodic timers that can
stop themselves when they're no longer required, but need to be
(re)-started when their idle condition changes.

Further complications is that we want the timer handler to always do
the forward such that it will always correctly deal with the overruns,
and we do not want to race such that the handler has already decided
to stop, but the (external) restart sees the timer still active and we
end up with a 'lost' timer.

The problem with the current code is that the re-start can come before
the callback does the forward, at which point the forward from the
callback will WARN about forwarding an enqueued timer.

Now, conceptually its easy to detect if you're before or after the fwd
by comparing the expiration time against the current time. Of course,
that's expensive (and racy) because we don't have the current time.

Alternatively one could cache this state inside the timer, but then
everybody pays the overhead of maintaining this extra state, and that
is undesired.

The only other option that I could see is the external timer_active
variable, which I tried to kill before. I would love a nicer interface
for this seemingly simple 'problem' but alas.

Fixes: 272325c4821f ("perf: Fix mux_interval hrtimer wreckage")
Fixes: 77a4d1a1b9a1 ("sched: Cleanup bandwidth timers")
Cc: pjt@google.com
Cc: tglx@linutronix.de
Cc: klamm@yandex-team.ru
Cc: mingo@kernel.org
Cc: bsegall@google.com
Cc: hpa@zytor.com
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150514102311.GX21418@twins.programming.kicks-ass.net
---
 include/linux/perf_event.h |  4 ++++
 kernel/events/core.c       | 29 ++++++++++++++++-------------
 kernel/sched/core.c        | 12 ------------
 kernel/sched/fair.c        | 17 +++++++++++++----
 kernel/sched/rt.c          |  8 +++++++-
 kernel/sched/sched.h       |  5 ++---
 6 files changed, 42 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61992cf2e977..cf3342a8ad80 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -566,8 +566,12 @@ struct perf_cpu_context {
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
 	int				exclusive;
+
+	raw_spinlock_t			hrtimer_lock;
 	struct hrtimer			hrtimer;
 	ktime_t				hrtimer_interval;
+	unsigned int			hrtimer_active;
+
 	struct pmu			*unique_pmu;
 	struct perf_cgroup		*cgrp;
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f5288293d667..d9c93f36e379 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -752,24 +752,21 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
 	struct perf_cpu_context *cpuctx;
-	enum hrtimer_restart ret = HRTIMER_NORESTART;
 	int rotations = 0;
 
 	WARN_ON(!irqs_disabled());
 
 	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
 	rotations = perf_rotate_context(cpuctx);
 
-	/*
-	 * arm timer if needed
-	 */
-	if (rotations) {
+	raw_spin_lock(&cpuctx->hrtimer_lock);
+	if (rotations)
 		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
-		ret = HRTIMER_RESTART;
-	}
+	else
+		cpuctx->hrtimer_active = 0;
+	raw_spin_unlock(&cpuctx->hrtimer_lock);
 
-	return ret;
+	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 }
 
 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
@@ -792,7 +789,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 
 	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
-	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	raw_spin_lock_init(&cpuctx->hrtimer_lock);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	timer->function = perf_mux_hrtimer_handler;
 }
 
@@ -800,15 +798,20 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
 {
 	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
+	unsigned long flags;
 
 	/* not for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
 		return 0;
 
-	if (hrtimer_is_queued(timer))
-		return 0;
+	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+	if (!cpuctx->hrtimer_active) {
+		cpuctx->hrtimer_active = 1;
+		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+	}
+	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
 
-	hrtimer_start(timer, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
 	return 0;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8a6196465d5..e84aeb280777 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,18 +90,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
-	/*
-	 * Do not forward the expiration time of active timers;
-	 * we do not want to loose an overrun.
-	 */
-	if (!hrtimer_active(period_timer))
-		hrtimer_forward_now(period_timer, period);
-
-	hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
-}
-
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e3b32ebfe421..69be2825262d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3870,8 +3870,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
 	if (runtime_refresh_within(cfs_b, min_left))
 		return;
 
-	start_bandwidth_timer(&cfs_b->slack_timer,
-				ns_to_ktime(cfs_bandwidth_slack_period));
+	hrtimer_start(&cfs_b->slack_timer,
+			ns_to_ktime(cfs_bandwidth_slack_period),
+			HRTIMER_MODE_REL);
 }
 
 /* we know any runtime found here is valid as update_curr() precedes return */
@@ -4012,6 +4013,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 
 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
 	}
+	if (idle)
+		cfs_b->period_active = 0;
 	raw_spin_unlock(&cfs_b->lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -4025,7 +4028,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period = ns_to_ktime(default_cfs_period());
 
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@ -4039,7 +4042,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+	lockdep_assert_held(&cfs_b->lock);
+
+	if (!cfs_b->period_active) {
+		cfs_b->period_active = 1;
+		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+	}
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b0febf25d8f1..e43da5391dcd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -31,6 +31,8 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 		idle = do_sched_rt_period_timer(rt_b, overrun);
 		raw_spin_lock(&rt_b->rt_runtime_lock);
 	}
+	if (idle)
+		rt_b->rt_period_active = 0;
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -54,7 +56,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 		return;
 
 	raw_spin_lock(&rt_b->rt_runtime_lock);
-	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+	if (!rt_b->rt_period_active) {
+		rt_b->rt_period_active = 1;
+		hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+	}
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 08606a1f8c4d..f9a58ef373b4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -131,6 +131,7 @@ struct rt_bandwidth {
 	ktime_t			rt_period;
 	u64			rt_runtime;
 	struct hrtimer		rt_period_timer;
+	unsigned int		rt_period_active;
 };
 
 void __dl_clear_params(struct task_struct *p);
@@ -215,7 +216,7 @@ struct cfs_bandwidth {
 	s64 hierarchical_quota;
 	u64 runtime_expires;
 
-	int idle;
+	int idle, period_active;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
@@ -1406,8 +1407,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
 
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
-- 
cgit v1.2.3


From 5f22f5c668204f3af7557018b2ad6cf2074defac Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Sat, 16 May 2015 11:44:13 +0200
Subject: irqdomain: Add non-hierarchy helper irq_domain_set_info

This adds the helper irq_domain_set_info() in a non-domain hierarchy
variant. This allows to use the helper for generic chip since not
all chips using generic chip support domain hierarchy.

Signed-off-by: Stefan Agner <stefan@agner.ch>
Cc: marc.zyngier@arm.com
Cc: linux@arm.linux.org.uk
Cc: u.kleine-koenig@pengutronix.de
Cc: olof@lixom.net
Cc: arnd@arndb.de
Cc: daniel.lezcano@linaro.org
Cc: mark.rutland@arm.com
Cc: pawel.moll@arm.com
Cc: robh+dt@kernel.org
Cc: ijc+devicetree@hellion.org.uk
Cc: galak@codeaurora.org
Cc: mcoquelin.stm32@gmail.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: shawn.guo@linaro.org
Cc: kernel@pengutronix.de
Cc: jason@lakedaemon.net
Link: http://lkml.kernel.org/r/1431769465-26867-2-git-send-email-stefan@agner.ch
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h |  8 ++++----
 kernel/irq/irqdomain.c    | 21 +++++++++++++++++++++
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 676d7306a360..744ac0ec98eb 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -258,6 +258,10 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr,
 /* V2 interfaces to support hierarchy IRQ domains. */
 extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
 						unsigned int virq);
+extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+				irq_hw_number_t hwirq, struct irq_chip *chip,
+				void *chip_data, irq_flow_handler_t handler,
+				void *handler_data, const char *handler_name);
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 extern struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
 			unsigned int flags, unsigned int size,
@@ -281,10 +285,6 @@ extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain,
 					 irq_hw_number_t hwirq,
 					 struct irq_chip *chip,
 					 void *chip_data);
-extern void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
-				irq_hw_number_t hwirq, struct irq_chip *chip,
-				void *chip_data, irq_flow_handler_t handler,
-				void *handler_data, const char *handler_name);
 extern void irq_domain_reset_irq_data(struct irq_data *irq_data);
 extern void irq_domain_free_irqs_common(struct irq_domain *domain,
 					unsigned int virq,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 7fac311057b8..41bf6dc49f59 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1232,6 +1232,27 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
 	return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
 }
 
+/**
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain:		Interrupt domain to match
+ * @virq:		IRQ number
+ * @hwirq:		The hardware interrupt number
+ * @chip:		The associated interrupt chip
+ * @chip_data:		The associated interrupt chip data
+ * @handler:		The interrupt flow handler
+ * @handler_data:	The interrupt flow handler data
+ * @handler_name:	The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+			 irq_hw_number_t hwirq, struct irq_chip *chip,
+			 void *chip_data, irq_flow_handler_t handler,
+			 void *handler_data, const char *handler_name)
+{
+	irq_set_chip_and_handler_name(virq, chip, handler, handler_name);
+	irq_set_chip_data(virq, chip_data);
+	irq_set_handler_data(virq, handler_data);
+}
+
 static void irq_domain_check_hierarchy(struct irq_domain *domain)
 {
 }
-- 
cgit v1.2.3


From 3cfeffc265791bc953527458e0a44ea77c459340 Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Sat, 16 May 2015 11:44:14 +0200
Subject: genirq: Add irq_chip_(enable/disable)_parent

Add helper irq_chip_enable_parent and irq_chip_disable_parent. The
helper implement the default behavior in case irq_enable or irq_disable
is not implemented for the parent interrupt chip, which is calling the
irq_mask or irq_unmask respectively.

Signed-off-by: Stefan Agner <stefan@agner.ch>
Cc: marc.zyngier@arm.com
Cc: linux@arm.linux.org.uk
Cc: u.kleine-koenig@pengutronix.de
Cc: olof@lixom.net
Cc: arnd@arndb.de
Cc: daniel.lezcano@linaro.org
Cc: mark.rutland@arm.com
Cc: pawel.moll@arm.com
Cc: robh+dt@kernel.org
Cc: ijc+devicetree@hellion.org.uk
Cc: galak@codeaurora.org
Cc: mcoquelin.stm32@gmail.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: shawn.guo@linaro.org
Cc: kernel@pengutronix.de
Cc: jason@lakedaemon.net
Link: http://lkml.kernel.org/r/1431769465-26867-3-git-send-email-stefan@agner.ch
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h |  2 ++
 kernel/irq/chip.c   | 28 ++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 62c6901cab55..2633061364b1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -458,6 +458,8 @@ extern void handle_nested_irq(unsigned int irq);
 
 extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg);
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
+extern void irq_chip_enable_parent(struct irq_data *data);
+extern void irq_chip_disable_parent(struct irq_data *data);
 extern void irq_chip_ack_parent(struct irq_data *data);
 extern int irq_chip_retrigger_hierarchy(struct irq_data *data);
 extern void irq_chip_mask_parent(struct irq_data *data);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eb9a4ea394ab..2456fe89719c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -875,6 +875,34 @@ void irq_cpu_offline(void)
 }
 
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if
+ * NULL)
+ * @data:	Pointer to interrupt specific data
+ */
+void irq_chip_enable_parent(struct irq_data *data)
+{
+	data = data->parent_data;
+	if (data->chip->irq_enable)
+		data->chip->irq_enable(data);
+	else
+		data->chip->irq_unmask(data);
+}
+
+/**
+ * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if
+ * NULL)
+ * @data:	Pointer to interrupt specific data
+ */
+void irq_chip_disable_parent(struct irq_data *data)
+{
+	data = data->parent_data;
+	if (data->chip->irq_disable)
+		data->chip->irq_disable(data);
+	else
+		data->chip->irq_mask(data);
+}
+
 /**
  * irq_chip_ack_parent - Acknowledge the parent interrupt
  * @data:	Pointer to interrupt specific data
-- 
cgit v1.2.3


From b4a04ab7a37b490cad48e69abfe14288cacb669c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 13 May 2015 15:38:40 -0400
Subject: cgroup: separate out include/linux/cgroup-defs.h

From 2d728f74bfc071df06773e2fd7577dd5dab6425d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 13 May 2015 15:37:01 -0400

This patch separates out cgroup-defs.h from cgroup.h which has grown a
lot of dependencies.  cgroup-defs.h currently only contains constant
and type definitions and can be used to break circular include
dependency.  While moving, definitions are reordered so that
cgroup-defs.h has consistent logical structure.

This patch is pure reorganization.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 464 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cgroup.h      | 455 +------------------------------------------
 2 files changed, 466 insertions(+), 453 deletions(-)
 create mode 100644 include/linux/cgroup-defs.h

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
new file mode 100644
index 000000000000..55f3120fb952
--- /dev/null
+++ b/include/linux/cgroup-defs.h
@@ -0,0 +1,464 @@
+/*
+ * linux/cgroup-defs.h - basic definitions for cgroup
+ *
+ * This file provides basic type and interface.  Include this file directly
+ * only if necessary to avoid cyclic dependencies.
+ */
+#ifndef _LINUX_CGROUP_DEFS_H
+#define _LINUX_CGROUP_DEFS_H
+
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/idr.h>
+#include <linux/wait.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
+#include <linux/workqueue.h>
+
+#ifdef CONFIG_CGROUPS
+
+struct cgroup;
+struct cgroup_root;
+struct cgroup_subsys;
+struct cgroup_taskset;
+struct kernfs_node;
+struct kernfs_ops;
+struct kernfs_open_file;
+
+#define MAX_CGROUP_TYPE_NAMELEN 32
+#define MAX_CGROUP_ROOT_NAMELEN 64
+#define MAX_CFTYPE_NAME		64
+
+/* define the enumeration of all cgroup subsystems */
+#define SUBSYS(_x) _x ## _cgrp_id,
+enum cgroup_subsys_id {
+#include <linux/cgroup_subsys.h>
+	CGROUP_SUBSYS_COUNT,
+};
+#undef SUBSYS
+
+/* bits in struct cgroup_subsys_state flags field */
+enum {
+	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
+	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
+	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
+};
+
+/* bits in struct cgroup flags field */
+enum {
+	/* Control Group requires release notifications to userspace */
+	CGRP_NOTIFY_ON_RELEASE,
+	/*
+	 * Clone the parent's configuration when creating a new child
+	 * cpuset cgroup.  For historical reasons, this option can be
+	 * specified at mount time and thus is implemented here.
+	 */
+	CGRP_CPUSET_CLONE_CHILDREN,
+};
+
+/* cgroup_root->flags */
+enum {
+	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0), /* __DEVEL__sane_behavior specified */
+	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
+	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+};
+
+/* cftype->flags */
+enum {
+	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
+	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
+	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
+
+	/* internal flags, do not use outside cgroup core proper */
+	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
+	__CFTYPE_NOT_ON_DFL	= (1 << 17),	/* not on default hierarchy */
+};
+
+/*
+ * Per-subsystem/per-cgroup state maintained by the system.  This is the
+ * fundamental structural building block that controllers deal with.
+ *
+ * Fields marked with "PI:" are public and immutable and may be accessed
+ * directly without synchronization.
+ */
+struct cgroup_subsys_state {
+	/* PI: the cgroup that this css is attached to */
+	struct cgroup *cgroup;
+
+	/* PI: the cgroup subsystem that this css is attached to */
+	struct cgroup_subsys *ss;
+
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
+
+	/* PI: the parent css */
+	struct cgroup_subsys_state *parent;
+
+	/* siblings list anchored at the parent's ->children */
+	struct list_head sibling;
+	struct list_head children;
+
+	/*
+	 * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
+	 * matching css can be looked up using css_from_id().
+	 */
+	int id;
+
+	unsigned int flags;
+
+	/*
+	 * Monotonically increasing unique serial number which defines a
+	 * uniform order among all csses.  It's guaranteed that all
+	 * ->children lists are in the ascending order of ->serial_nr and
+	 * used to allow interrupting and resuming iterations.
+	 */
+	u64 serial_nr;
+
+	/* percpu_ref killing and RCU release */
+	struct rcu_head rcu_head;
+	struct work_struct destroy_work;
+};
+
+/*
+ * A css_set is a structure holding pointers to a set of
+ * cgroup_subsys_state objects. This saves space in the task struct
+ * object and speeds up fork()/exit(), since a single inc/dec and a
+ * list_add()/del() can bump the reference count on the entire cgroup
+ * set for a task.
+ */
+struct css_set {
+	/* Reference count */
+	atomic_t refcount;
+
+	/*
+	 * List running through all cgroup groups in the same hash
+	 * slot. Protected by css_set_lock
+	 */
+	struct hlist_node hlist;
+
+	/*
+	 * Lists running through all tasks using this cgroup group.
+	 * mg_tasks lists tasks which belong to this cset but are in the
+	 * process of being migrated out or in.  Protected by
+	 * css_set_rwsem, but, during migration, once tasks are moved to
+	 * mg_tasks, it can be read safely while holding cgroup_mutex.
+	 */
+	struct list_head tasks;
+	struct list_head mg_tasks;
+
+	/*
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
+	 */
+	struct list_head cgrp_links;
+
+	/* the default cgroup associated with this css_set */
+	struct cgroup *dfl_cgrp;
+
+	/*
+	 * Set of subsystem states, one for each subsystem. This array is
+	 * immutable after creation apart from the init_css_set during
+	 * subsystem registration (at boot time).
+	 */
+	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+	/*
+	 * List of csets participating in the on-going migration either as
+	 * source or destination.  Protected by cgroup_mutex.
+	 */
+	struct list_head mg_preload_node;
+	struct list_head mg_node;
+
+	/*
+	 * If this cset is acting as the source of migration the following
+	 * two fields are set.  mg_src_cgrp is the source cgroup of the
+	 * on-going migration and mg_dst_cset is the destination cset the
+	 * target tasks on this cset should be migrated to.  Protected by
+	 * cgroup_mutex.
+	 */
+	struct cgroup *mg_src_cgrp;
+	struct css_set *mg_dst_cset;
+
+	/*
+	 * On the default hierarhcy, ->subsys[ssid] may point to a css
+	 * attached to an ancestor instead of the cgroup this css_set is
+	 * associated with.  The following node is anchored at
+	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+	 * iterate through all css's attached to a given cgroup.
+	 */
+	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
+
+	/* For RCU-protected deletion */
+	struct rcu_head rcu_head;
+};
+
+struct cgroup {
+	/* self css with NULL ->ss, points back to this cgroup */
+	struct cgroup_subsys_state self;
+
+	unsigned long flags;		/* "unsigned long" so bitops work */
+
+	/*
+	 * idr allocated in-hierarchy ID.
+	 *
+	 * ID 0 is not used, the ID of the root cgroup is always 1, and a
+	 * new cgroup will be assigned with a smallest available ID.
+	 *
+	 * Allocating/Removing ID must be protected by cgroup_mutex.
+	 */
+	int id;
+
+	/*
+	 * If this cgroup contains any tasks, it contributes one to
+	 * populated_cnt.  All children with non-zero popuplated_cnt of
+	 * their own contribute one.  The count is zero iff there's no task
+	 * in this cgroup or its subtree.
+	 */
+	int populated_cnt;
+
+	struct kernfs_node *kn;		/* cgroup kernfs entry */
+	struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
+
+	/*
+	 * The bitmask of subsystems enabled on the child cgroups.
+	 * ->subtree_control is the one configured through
+	 * "cgroup.subtree_control" while ->child_subsys_mask is the
+	 * effective one which may have more subsystems enabled.
+	 * Controller knobs are made available iff it's enabled in
+	 * ->subtree_control.
+	 */
+	unsigned int subtree_control;
+	unsigned int child_subsys_mask;
+
+	/* Private pointers for each registered subsystem */
+	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
+
+	struct cgroup_root *root;
+
+	/*
+	 * List of cgrp_cset_links pointing at css_sets with tasks in this
+	 * cgroup.  Protected by css_set_lock.
+	 */
+	struct list_head cset_links;
+
+	/*
+	 * On the default hierarchy, a css_set for a cgroup with some
+	 * susbsys disabled will point to css's which are associated with
+	 * the closest ancestor which has the subsys enabled.  The
+	 * following lists all css_sets which point to this cgroup's css
+	 * for the given subsystem.
+	 */
+	struct list_head e_csets[CGROUP_SUBSYS_COUNT];
+
+	/*
+	 * list of pidlists, up to two for each namespace (one for procs, one
+	 * for tasks); created on demand.
+	 */
+	struct list_head pidlists;
+	struct mutex pidlist_mutex;
+
+	/* used to wait for offlining of csses */
+	wait_queue_head_t offline_waitq;
+
+	/* used to schedule release agent */
+	struct work_struct release_agent_work;
+};
+
+/*
+ * A cgroup_root represents the root of a cgroup hierarchy, and may be
+ * associated with a kernfs_root to form an active hierarchy.  This is
+ * internal to cgroup core.  Don't access directly from controllers.
+ */
+struct cgroup_root {
+	struct kernfs_root *kf_root;
+
+	/* The bitmask of subsystems attached to this hierarchy */
+	unsigned int subsys_mask;
+
+	/* Unique id for this hierarchy. */
+	int hierarchy_id;
+
+	/* The root cgroup.  Root is destroyed on its release. */
+	struct cgroup cgrp;
+
+	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
+	atomic_t nr_cgrps;
+
+	/* A list running through the active hierarchies */
+	struct list_head root_list;
+
+	/* Hierarchy-specific flags */
+	unsigned int flags;
+
+	/* IDs for cgroups in this hierarchy */
+	struct idr cgroup_idr;
+
+	/* The path to use for release notifications. */
+	char release_agent_path[PATH_MAX];
+
+	/* The name for this hierarchy - may be empty */
+	char name[MAX_CGROUP_ROOT_NAMELEN];
+};
+
+/*
+ * struct cftype: handler definitions for cgroup control files
+ *
+ * When reading/writing to a file:
+ *	- the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
+ *	- the 'cftype' of the file is file->f_path.dentry->d_fsdata
+ */
+struct cftype {
+	/*
+	 * By convention, the name should begin with the name of the
+	 * subsystem, followed by a period.  Zero length string indicates
+	 * end of cftype array.
+	 */
+	char name[MAX_CFTYPE_NAME];
+	int private;
+	/*
+	 * If not 0, file mode is set to this value, otherwise it will
+	 * be figured out automatically
+	 */
+	umode_t mode;
+
+	/*
+	 * The maximum length of string, excluding trailing nul, that can
+	 * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
+	 */
+	size_t max_write_len;
+
+	/* CFTYPE_* flags */
+	unsigned int flags;
+
+	/*
+	 * Fields used for internal bookkeeping.  Initialized automatically
+	 * during registration.
+	 */
+	struct cgroup_subsys *ss;	/* NULL for cgroup core files */
+	struct list_head node;		/* anchored at ss->cfts */
+	struct kernfs_ops *kf_ops;
+
+	/*
+	 * read_u64() is a shortcut for the common case of returning a
+	 * single integer. Use it in place of read()
+	 */
+	u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
+	/*
+	 * read_s64() is a signed version of read_u64()
+	 */
+	s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
+
+	/* generic seq_file read interface */
+	int (*seq_show)(struct seq_file *sf, void *v);
+
+	/* optional ops, implement all or none */
+	void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
+	void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
+	void (*seq_stop)(struct seq_file *sf, void *v);
+
+	/*
+	 * write_u64() is a shortcut for the common case of accepting
+	 * a single integer (as parsed by simple_strtoull) from
+	 * userspace. Use in place of write(); return 0 or error.
+	 */
+	int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
+			 u64 val);
+	/*
+	 * write_s64() is a signed version of write_u64()
+	 */
+	int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
+			 s64 val);
+
+	/*
+	 * write() is the generic write callback which maps directly to
+	 * kernfs write operation and overrides all other operations.
+	 * Maximum write size is determined by ->max_write_len.  Use
+	 * of_css/cft() to access the associated css and cft.
+	 */
+	ssize_t (*write)(struct kernfs_open_file *of,
+			 char *buf, size_t nbytes, loff_t off);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lock_class_key	lockdep_key;
+#endif
+};
+
+/*
+ * Control Group subsystem type.
+ * See Documentation/cgroups/cgroups.txt for details
+ */
+struct cgroup_subsys {
+	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
+	int (*css_online)(struct cgroup_subsys_state *css);
+	void (*css_offline)(struct cgroup_subsys_state *css);
+	void (*css_released)(struct cgroup_subsys_state *css);
+	void (*css_free)(struct cgroup_subsys_state *css);
+	void (*css_reset)(struct cgroup_subsys_state *css);
+	void (*css_e_css_changed)(struct cgroup_subsys_state *css);
+
+	int (*can_attach)(struct cgroup_subsys_state *css,
+			  struct cgroup_taskset *tset);
+	void (*cancel_attach)(struct cgroup_subsys_state *css,
+			      struct cgroup_taskset *tset);
+	void (*attach)(struct cgroup_subsys_state *css,
+		       struct cgroup_taskset *tset);
+	void (*fork)(struct task_struct *task);
+	void (*exit)(struct cgroup_subsys_state *css,
+		     struct cgroup_subsys_state *old_css,
+		     struct task_struct *task);
+	void (*bind)(struct cgroup_subsys_state *root_css);
+
+	int disabled;
+	int early_init;
+
+	/*
+	 * If %false, this subsystem is properly hierarchical -
+	 * configuration, resource accounting and restriction on a parent
+	 * cgroup cover those of its children.  If %true, hierarchy support
+	 * is broken in some ways - some subsystems ignore hierarchy
+	 * completely while others are only implemented half-way.
+	 *
+	 * It's now disallowed to create nested cgroups if the subsystem is
+	 * broken and cgroup core will emit a warning message on such
+	 * cases.  Eventually, all subsystems will be made properly
+	 * hierarchical and this will go away.
+	 */
+	bool broken_hierarchy;
+	bool warned_broken_hierarchy;
+
+	/* the following two fields are initialized automtically during boot */
+	int id;
+	const char *name;
+
+	/* link to parent, protected by cgroup_lock() */
+	struct cgroup_root *root;
+
+	/* idr for css->id */
+	struct idr css_idr;
+
+	/*
+	 * List of cftypes.  Each entry is the first entry of an array
+	 * terminated by zero length name.
+	 */
+	struct list_head cfts;
+
+	/*
+	 * Base cftypes which are automatically registered.  The two can
+	 * point to the same array.
+	 */
+	struct cftype *dfl_cftypes;	/* for the default hierarchy */
+	struct cftype *legacy_cftypes;	/* for the legacy hierarchies */
+
+	/*
+	 * A subsystem may depend on other subsystems.  When such subsystem
+	 * is enabled on a cgroup, the depended-upon subsystems are enabled
+	 * together if available.  Subsystems enabled due to dependency are
+	 * not visible to userland until explicitly enabled.  The following
+	 * specifies the mask of subsystems that this one depends on.
+	 */
+	unsigned int depends_on;
+};
+
+#endif	/* CONFIG_CGROUPS */
+#endif	/* _LINUX_CGROUP_DEFS_H */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b9cb94c3102a..96a2ecd5aa69 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -11,23 +11,16 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
-#include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/cgroupstats.h>
 #include <linux/rwsem.h>
-#include <linux/idr.h>
-#include <linux/workqueue.h>
 #include <linux/fs.h>
-#include <linux/percpu-refcount.h>
 #include <linux/seq_file.h>
 #include <linux/kernfs.h>
-#include <linux/wait.h>
 
-#ifdef CONFIG_CGROUPS
+#include <linux/cgroup-defs.h>
 
-struct cgroup_root;
-struct cgroup_subsys;
-struct cgroup;
+#ifdef CONFIG_CGROUPS
 
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
@@ -40,66 +33,6 @@ extern int cgroupstats_build(struct cgroupstats *stats,
 extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			    struct pid *pid, struct task_struct *tsk);
 
-/* define the enumeration of all cgroup subsystems */
-#define SUBSYS(_x) _x ## _cgrp_id,
-enum cgroup_subsys_id {
-#include <linux/cgroup_subsys.h>
-	CGROUP_SUBSYS_COUNT,
-};
-#undef SUBSYS
-
-/*
- * Per-subsystem/per-cgroup state maintained by the system.  This is the
- * fundamental structural building block that controllers deal with.
- *
- * Fields marked with "PI:" are public and immutable and may be accessed
- * directly without synchronization.
- */
-struct cgroup_subsys_state {
-	/* PI: the cgroup that this css is attached to */
-	struct cgroup *cgroup;
-
-	/* PI: the cgroup subsystem that this css is attached to */
-	struct cgroup_subsys *ss;
-
-	/* reference count - access via css_[try]get() and css_put() */
-	struct percpu_ref refcnt;
-
-	/* PI: the parent css */
-	struct cgroup_subsys_state *parent;
-
-	/* siblings list anchored at the parent's ->children */
-	struct list_head sibling;
-	struct list_head children;
-
-	/*
-	 * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
-	 * matching css can be looked up using css_from_id().
-	 */
-	int id;
-
-	unsigned int flags;
-
-	/*
-	 * Monotonically increasing unique serial number which defines a
-	 * uniform order among all csses.  It's guaranteed that all
-	 * ->children lists are in the ascending order of ->serial_nr and
-	 * used to allow interrupting and resuming iterations.
-	 */
-	u64 serial_nr;
-
-	/* percpu_ref killing and RCU release */
-	struct rcu_head rcu_head;
-	struct work_struct destroy_work;
-};
-
-/* bits in struct cgroup_subsys_state flags field */
-enum {
-	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
-	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
-	CSS_RELEASED	= (1 << 2), /* refcnt reached zero, released */
-};
-
 /**
  * css_get - obtain a reference on the specified css
  * @css: target css
@@ -185,307 +118,6 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
 		percpu_ref_put_many(&css->refcnt, n);
 }
 
-/* bits in struct cgroup flags field */
-enum {
-	/* Control Group requires release notifications to userspace */
-	CGRP_NOTIFY_ON_RELEASE,
-	/*
-	 * Clone the parent's configuration when creating a new child
-	 * cpuset cgroup.  For historical reasons, this option can be
-	 * specified at mount time and thus is implemented here.
-	 */
-	CGRP_CPUSET_CLONE_CHILDREN,
-};
-
-struct cgroup {
-	/* self css with NULL ->ss, points back to this cgroup */
-	struct cgroup_subsys_state self;
-
-	unsigned long flags;		/* "unsigned long" so bitops work */
-
-	/*
-	 * idr allocated in-hierarchy ID.
-	 *
-	 * ID 0 is not used, the ID of the root cgroup is always 1, and a
-	 * new cgroup will be assigned with a smallest available ID.
-	 *
-	 * Allocating/Removing ID must be protected by cgroup_mutex.
-	 */
-	int id;
-
-	/*
-	 * If this cgroup contains any tasks, it contributes one to
-	 * populated_cnt.  All children with non-zero popuplated_cnt of
-	 * their own contribute one.  The count is zero iff there's no task
-	 * in this cgroup or its subtree.
-	 */
-	int populated_cnt;
-
-	struct kernfs_node *kn;		/* cgroup kernfs entry */
-	struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
-
-	/*
-	 * The bitmask of subsystems enabled on the child cgroups.
-	 * ->subtree_control is the one configured through
-	 * "cgroup.subtree_control" while ->child_subsys_mask is the
-	 * effective one which may have more subsystems enabled.
-	 * Controller knobs are made available iff it's enabled in
-	 * ->subtree_control.
-	 */
-	unsigned int subtree_control;
-	unsigned int child_subsys_mask;
-
-	/* Private pointers for each registered subsystem */
-	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
-
-	struct cgroup_root *root;
-
-	/*
-	 * List of cgrp_cset_links pointing at css_sets with tasks in this
-	 * cgroup.  Protected by css_set_lock.
-	 */
-	struct list_head cset_links;
-
-	/*
-	 * On the default hierarchy, a css_set for a cgroup with some
-	 * susbsys disabled will point to css's which are associated with
-	 * the closest ancestor which has the subsys enabled.  The
-	 * following lists all css_sets which point to this cgroup's css
-	 * for the given subsystem.
-	 */
-	struct list_head e_csets[CGROUP_SUBSYS_COUNT];
-
-	/*
-	 * list of pidlists, up to two for each namespace (one for procs, one
-	 * for tasks); created on demand.
-	 */
-	struct list_head pidlists;
-	struct mutex pidlist_mutex;
-
-	/* used to wait for offlining of csses */
-	wait_queue_head_t offline_waitq;
-
-	/* used to schedule release agent */
-	struct work_struct release_agent_work;
-};
-
-#define MAX_CGROUP_ROOT_NAMELEN 64
-
-/* cgroup_root->flags */
-enum {
-	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0), /* __DEVEL__sane_behavior specified */
-	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
-	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
-};
-
-/*
- * A cgroup_root represents the root of a cgroup hierarchy, and may be
- * associated with a kernfs_root to form an active hierarchy.  This is
- * internal to cgroup core.  Don't access directly from controllers.
- */
-struct cgroup_root {
-	struct kernfs_root *kf_root;
-
-	/* The bitmask of subsystems attached to this hierarchy */
-	unsigned int subsys_mask;
-
-	/* Unique id for this hierarchy. */
-	int hierarchy_id;
-
-	/* The root cgroup.  Root is destroyed on its release. */
-	struct cgroup cgrp;
-
-	/* Number of cgroups in the hierarchy, used only for /proc/cgroups */
-	atomic_t nr_cgrps;
-
-	/* A list running through the active hierarchies */
-	struct list_head root_list;
-
-	/* Hierarchy-specific flags */
-	unsigned int flags;
-
-	/* IDs for cgroups in this hierarchy */
-	struct idr cgroup_idr;
-
-	/* The path to use for release notifications. */
-	char release_agent_path[PATH_MAX];
-
-	/* The name for this hierarchy - may be empty */
-	char name[MAX_CGROUP_ROOT_NAMELEN];
-};
-
-/*
- * A css_set is a structure holding pointers to a set of
- * cgroup_subsys_state objects. This saves space in the task struct
- * object and speeds up fork()/exit(), since a single inc/dec and a
- * list_add()/del() can bump the reference count on the entire cgroup
- * set for a task.
- */
-
-struct css_set {
-
-	/* Reference count */
-	atomic_t refcount;
-
-	/*
-	 * List running through all cgroup groups in the same hash
-	 * slot. Protected by css_set_lock
-	 */
-	struct hlist_node hlist;
-
-	/*
-	 * Lists running through all tasks using this cgroup group.
-	 * mg_tasks lists tasks which belong to this cset but are in the
-	 * process of being migrated out or in.  Protected by
-	 * css_set_rwsem, but, during migration, once tasks are moved to
-	 * mg_tasks, it can be read safely while holding cgroup_mutex.
-	 */
-	struct list_head tasks;
-	struct list_head mg_tasks;
-
-	/*
-	 * List of cgrp_cset_links pointing at cgroups referenced from this
-	 * css_set.  Protected by css_set_lock.
-	 */
-	struct list_head cgrp_links;
-
-	/* the default cgroup associated with this css_set */
-	struct cgroup *dfl_cgrp;
-
-	/*
-	 * Set of subsystem states, one for each subsystem. This array is
-	 * immutable after creation apart from the init_css_set during
-	 * subsystem registration (at boot time).
-	 */
-	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
-
-	/*
-	 * List of csets participating in the on-going migration either as
-	 * source or destination.  Protected by cgroup_mutex.
-	 */
-	struct list_head mg_preload_node;
-	struct list_head mg_node;
-
-	/*
-	 * If this cset is acting as the source of migration the following
-	 * two fields are set.  mg_src_cgrp is the source cgroup of the
-	 * on-going migration and mg_dst_cset is the destination cset the
-	 * target tasks on this cset should be migrated to.  Protected by
-	 * cgroup_mutex.
-	 */
-	struct cgroup *mg_src_cgrp;
-	struct css_set *mg_dst_cset;
-
-	/*
-	 * On the default hierarhcy, ->subsys[ssid] may point to a css
-	 * attached to an ancestor instead of the cgroup this css_set is
-	 * associated with.  The following node is anchored at
-	 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-	 * iterate through all css's attached to a given cgroup.
-	 */
-	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-	/* For RCU-protected deletion */
-	struct rcu_head rcu_head;
-};
-
-/*
- * struct cftype: handler definitions for cgroup control files
- *
- * When reading/writing to a file:
- *	- the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
- *	- the 'cftype' of the file is file->f_path.dentry->d_fsdata
- */
-
-/* cftype->flags */
-enum {
-	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
-	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
-	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
-
-	/* internal flags, do not use outside cgroup core proper */
-	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
-	__CFTYPE_NOT_ON_DFL	= (1 << 17),	/* not on default hierarchy */
-};
-
-#define MAX_CFTYPE_NAME		64
-
-struct cftype {
-	/*
-	 * By convention, the name should begin with the name of the
-	 * subsystem, followed by a period.  Zero length string indicates
-	 * end of cftype array.
-	 */
-	char name[MAX_CFTYPE_NAME];
-	int private;
-	/*
-	 * If not 0, file mode is set to this value, otherwise it will
-	 * be figured out automatically
-	 */
-	umode_t mode;
-
-	/*
-	 * The maximum length of string, excluding trailing nul, that can
-	 * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
-	 */
-	size_t max_write_len;
-
-	/* CFTYPE_* flags */
-	unsigned int flags;
-
-	/*
-	 * Fields used for internal bookkeeping.  Initialized automatically
-	 * during registration.
-	 */
-	struct cgroup_subsys *ss;	/* NULL for cgroup core files */
-	struct list_head node;		/* anchored at ss->cfts */
-	struct kernfs_ops *kf_ops;
-
-	/*
-	 * read_u64() is a shortcut for the common case of returning a
-	 * single integer. Use it in place of read()
-	 */
-	u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
-	/*
-	 * read_s64() is a signed version of read_u64()
-	 */
-	s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
-
-	/* generic seq_file read interface */
-	int (*seq_show)(struct seq_file *sf, void *v);
-
-	/* optional ops, implement all or none */
-	void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
-	void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
-	void (*seq_stop)(struct seq_file *sf, void *v);
-
-	/*
-	 * write_u64() is a shortcut for the common case of accepting
-	 * a single integer (as parsed by simple_strtoull) from
-	 * userspace. Use in place of write(); return 0 or error.
-	 */
-	int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
-			 u64 val);
-	/*
-	 * write_s64() is a signed version of write_u64()
-	 */
-	int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
-			 s64 val);
-
-	/*
-	 * write() is the generic write callback which maps directly to
-	 * kernfs write operation and overrides all other operations.
-	 * Maximum write size is determined by ->max_write_len.  Use
-	 * of_css/cft() to access the associated css and cft.
-	 */
-	ssize_t (*write)(struct kernfs_open_file *of,
-			 char *buf, size_t nbytes, loff_t off);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lock_class_key	lockdep_key;
-#endif
-};
-
 extern struct cgroup_root cgrp_dfl_root;
 extern struct css_set init_css_set;
 
@@ -612,11 +244,6 @@ int cgroup_rm_cftypes(struct cftype *cfts);
 
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
-/*
- * Control Group taskset, used to pass around set of tasks to cgroup_subsys
- * methods.
- */
-struct cgroup_taskset;
 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
 
@@ -629,84 +256,6 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
 	for ((task) = cgroup_taskset_first((tset)); (task);		\
 	     (task) = cgroup_taskset_next((tset)))
 
-/*
- * Control Group subsystem type.
- * See Documentation/cgroups/cgroups.txt for details
- */
-
-struct cgroup_subsys {
-	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
-	int (*css_online)(struct cgroup_subsys_state *css);
-	void (*css_offline)(struct cgroup_subsys_state *css);
-	void (*css_released)(struct cgroup_subsys_state *css);
-	void (*css_free)(struct cgroup_subsys_state *css);
-	void (*css_reset)(struct cgroup_subsys_state *css);
-	void (*css_e_css_changed)(struct cgroup_subsys_state *css);
-
-	int (*can_attach)(struct cgroup_subsys_state *css,
-			  struct cgroup_taskset *tset);
-	void (*cancel_attach)(struct cgroup_subsys_state *css,
-			      struct cgroup_taskset *tset);
-	void (*attach)(struct cgroup_subsys_state *css,
-		       struct cgroup_taskset *tset);
-	void (*fork)(struct task_struct *task);
-	void (*exit)(struct cgroup_subsys_state *css,
-		     struct cgroup_subsys_state *old_css,
-		     struct task_struct *task);
-	void (*bind)(struct cgroup_subsys_state *root_css);
-
-	int disabled;
-	int early_init;
-
-	/*
-	 * If %false, this subsystem is properly hierarchical -
-	 * configuration, resource accounting and restriction on a parent
-	 * cgroup cover those of its children.  If %true, hierarchy support
-	 * is broken in some ways - some subsystems ignore hierarchy
-	 * completely while others are only implemented half-way.
-	 *
-	 * It's now disallowed to create nested cgroups if the subsystem is
-	 * broken and cgroup core will emit a warning message on such
-	 * cases.  Eventually, all subsystems will be made properly
-	 * hierarchical and this will go away.
-	 */
-	bool broken_hierarchy;
-	bool warned_broken_hierarchy;
-
-	/* the following two fields are initialized automtically during boot */
-	int id;
-#define MAX_CGROUP_TYPE_NAMELEN 32
-	const char *name;
-
-	/* link to parent, protected by cgroup_lock() */
-	struct cgroup_root *root;
-
-	/* idr for css->id */
-	struct idr css_idr;
-
-	/*
-	 * List of cftypes.  Each entry is the first entry of an array
-	 * terminated by zero length name.
-	 */
-	struct list_head cfts;
-
-	/*
-	 * Base cftypes which are automatically registered.  The two can
-	 * point to the same array.
-	 */
-	struct cftype *dfl_cftypes;	/* for the default hierarchy */
-	struct cftype *legacy_cftypes;	/* for the legacy hierarchies */
-
-	/*
-	 * A subsystem may depend on other subsystems.  When such subsystem
-	 * is enabled on a cgroup, the depended-upon subsystems are enabled
-	 * together if available.  Subsystems enabled due to dependency are
-	 * not visible to userland until explicitly enabled.  The following
-	 * specifies the mask of subsystems that this one depends on.
-	 */
-	unsigned int depends_on;
-};
-
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
 #include <linux/cgroup_subsys.h>
 #undef SUBSYS
-- 
cgit v1.2.3


From c326aa2bb2209e10df4a381801bb34ca0f923038 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 13 May 2015 16:24:16 -0400
Subject: cgroup: reorganize include/linux/cgroup.h

From c4d440938b5e2015c70594fe6666a099c844f929 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 13 May 2015 16:21:40 -0400

Over time, cgroup.h grew organically and doesn't have much logical
structure at this point.  Separation of cgroup-defs.h in the previous
patch gives us a good chance for reorganizing cgroup.h as changes to
the header are likely to cause conflicts anyway.

This patch reorganizes cgroup.h so that it has consistent logical
grouping.

This is pure reorganization.

v2: Relocating #ifdef CONFIG_CGROUPS caused build failure when cgroup
    is disabled.  Dropped.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 552 ++++++++++++++++++++++++-------------------------
 1 file changed, 271 insertions(+), 281 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 96a2ecd5aa69..82319fb31cfe 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,16 +22,189 @@
 
 #ifdef CONFIG_CGROUPS
 
-extern int cgroup_init_early(void);
-extern int cgroup_init(void);
-extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
-extern void cgroup_exit(struct task_struct *p);
-extern int cgroupstats_build(struct cgroupstats *stats,
-				struct dentry *dentry);
+/* a css_task_iter should be treated as an opaque object */
+struct css_task_iter {
+	struct cgroup_subsys		*ss;
+
+	struct list_head		*cset_pos;
+	struct list_head		*cset_head;
+
+	struct list_head		*task_pos;
+	struct list_head		*tasks_head;
+	struct list_head		*mg_tasks_head;
+};
+
+extern struct cgroup_root cgrp_dfl_root;
+extern struct css_set init_css_set;
+
+#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+bool css_has_online_children(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
+					     struct cgroup_subsys *ss);
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+						       struct cgroup_subsys *ss);
+
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
+
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_rm_cftypes(struct cftype *cfts);
+
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+		     struct pid *pid, struct task_struct *tsk);
+
+void cgroup_fork(struct task_struct *p);
+void cgroup_post_fork(struct task_struct *p);
+void cgroup_exit(struct task_struct *p);
+
+int cgroup_init_early(void);
+int cgroup_init(void);
+
+/*
+ * Iteration helpers and macros.
+ */
+
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+					   struct cgroup_subsys_state *parent);
+struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
+						    struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
+struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
+						     struct cgroup_subsys_state *css);
+
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
+
+void css_task_iter_start(struct cgroup_subsys_state *css,
+			 struct css_task_iter *it);
+struct task_struct *css_task_iter_next(struct css_task_iter *it);
+void css_task_iter_end(struct css_task_iter *it);
+
+/**
+ * css_for_each_child - iterate through children of a css
+ * @pos: the css * to use as the loop cursor
+ * @parent: css whose children to walk
+ *
+ * Walk @parent's children.  Must be called under rcu_read_lock().
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
+ */
+#define css_for_each_child(pos, parent)					\
+	for ((pos) = css_next_child(NULL, (parent)); (pos);		\
+	     (pos) = css_next_child((pos), (parent)))
+
+/**
+ * css_for_each_descendant_pre - pre-order walk of a css's descendants
+ * @pos: the css * to use as the loop cursor
+ * @root: css whose descendants to walk
+ *
+ * Walk @root's descendants.  @root is included in the iteration and the
+ * first node to be visited.  Must be called under rcu_read_lock().
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ *
+ * For example, the following guarantees that a descendant can't escape
+ * state updates of its ancestors.
+ *
+ * my_online(@css)
+ * {
+ *	Lock @css's parent and @css;
+ *	Inherit state from the parent;
+ *	Unlock both.
+ * }
+ *
+ * my_update_state(@css)
+ * {
+ *	css_for_each_descendant_pre(@pos, @css) {
+ *		Lock @pos;
+ *		if (@pos == @css)
+ *			Update @css's state;
+ *		else
+ *			Verify @pos is alive and inherit state from its parent;
+ *		Unlock @pos;
+ *	}
+ * }
+ *
+ * As long as the inheriting step, including checking the parent state, is
+ * enclosed inside @pos locking, double-locking the parent isn't necessary
+ * while inheriting.  The state update to the parent is guaranteed to be
+ * visible by walking order and, as long as inheriting operations to the
+ * same @pos are atomic to each other, multiple updates racing each other
+ * still result in the correct state.  It's guaranateed that at least one
+ * inheritance happens for any css after the latest update to its parent.
+ *
+ * If checking parent's state requires locking the parent, each inheriting
+ * iteration should lock and unlock both @pos->parent and @pos.
+ *
+ * Alternatively, a subsystem may choose to use a single global lock to
+ * synchronize ->css_online() and ->css_offline() against tree-walking
+ * operations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
+ */
+#define css_for_each_descendant_pre(pos, css)				\
+	for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);	\
+	     (pos) = css_next_descendant_pre((pos), (css)))
+
+/**
+ * css_for_each_descendant_post - post-order walk of a css's descendants
+ * @pos: the css * to use as the loop cursor
+ * @css: css whose descendants to walk
+ *
+ * Similar to css_for_each_descendant_pre() but performs post-order
+ * traversal instead.  @root is included in the iteration and the last
+ * node to be visited.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ *
+ * Note that the walk visibility guarantee example described in pre-order
+ * walk doesn't apply the same to post-order walks.
+ */
+#define css_for_each_descendant_post(pos, css)				\
+	for ((pos) = css_next_descendant_post(NULL, (css)); (pos);	\
+	     (pos) = css_next_descendant_post((pos), (css)))
+
+/**
+ * cgroup_taskset_for_each - iterate cgroup_taskset
+ * @task: the loop cursor
+ * @tset: taskset to iterate
+ */
+#define cgroup_taskset_for_each(task, tset)				\
+	for ((task) = cgroup_taskset_first((tset)); (task);		\
+	     (task) = cgroup_taskset_next((tset)))
 
-extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
-			    struct pid *pid, struct task_struct *tsk);
+/*
+ * Inline functions.
+ */
 
 /**
  * css_get - obtain a reference on the specified css
@@ -118,8 +291,87 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
 		percpu_ref_put_many(&css->refcnt, n);
 }
 
-extern struct cgroup_root cgrp_dfl_root;
-extern struct css_set init_css_set;
+/**
+ * task_css_set_check - obtain a task's css_set with extra access conditions
+ * @task: the task to obtain css_set for
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * A task's css_set is RCU protected, initialized and exited while holding
+ * task_lock(), and can only be modified while holding both cgroup_mutex
+ * and task_lock() while the task is alive.  This macro verifies that the
+ * caller is inside proper critical section and returns @task's css_set.
+ *
+ * The caller can also specify additional allowed conditions via @__c, such
+ * as locks used during the cgroup_subsys::attach() methods.
+ */
+#ifdef CONFIG_PROVE_RCU
+extern struct mutex cgroup_mutex;
+extern struct rw_semaphore css_set_rwsem;
+#define task_css_set_check(task, __c)					\
+	rcu_dereference_check((task)->cgroups,				\
+		lockdep_is_held(&cgroup_mutex) ||			\
+		lockdep_is_held(&css_set_rwsem) ||			\
+		((task)->flags & PF_EXITING) || (__c))
+#else
+#define task_css_set_check(task, __c)					\
+	rcu_dereference((task)->cgroups)
+#endif
+
+/**
+ * task_css_check - obtain css for (task, subsys) w/ extra access conds
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
+ * synchronization rules are the same as task_css_set_check().
+ */
+#define task_css_check(task, subsys_id, __c)				\
+	task_css_set_check((task), (__c))->subsys[(subsys_id)]
+
+/**
+ * task_css_set - obtain a task's css_set
+ * @task: the task to obtain css_set for
+ *
+ * See task_css_set_check().
+ */
+static inline struct css_set *task_css_set(struct task_struct *task)
+{
+	return task_css_set_check(task, false);
+}
+
+/**
+ * task_css - obtain css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * See task_css_check().
+ */
+static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
+						   int subsys_id)
+{
+	return task_css_check(task, subsys_id, false);
+}
+
+/**
+ * task_css_is_root - test whether a task belongs to the root css
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Test whether @task belongs to the root css on the specified subsystem.
+ * May be invoked in any context.
+ */
+static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
+{
+	return task_css_check(task, subsys_id, true) ==
+		init_css_set.subsys[subsys_id];
+}
+
+static inline struct cgroup *task_cgroup(struct task_struct *task,
+					 int subsys_id)
+{
+	return task_css(task, subsys_id)->cgroup;
+}
 
 /**
  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
@@ -236,284 +488,22 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 	pr_cont_kernfs_path(cgrp->kn);
 }
 
-char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
-
-int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
-int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
-int cgroup_rm_cftypes(struct cftype *cfts);
-
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
-
-struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
-struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
-
-/**
- * cgroup_taskset_for_each - iterate cgroup_taskset
- * @task: the loop cursor
- * @tset: taskset to iterate
- */
-#define cgroup_taskset_for_each(task, tset)				\
-	for ((task) = cgroup_taskset_first((tset)); (task);		\
-	     (task) = cgroup_taskset_next((tset)))
-
-#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
-#include <linux/cgroup_subsys.h>
-#undef SUBSYS
-
-/**
- * task_css_set_check - obtain a task's css_set with extra access conditions
- * @task: the task to obtain css_set for
- * @__c: extra condition expression to be passed to rcu_dereference_check()
- *
- * A task's css_set is RCU protected, initialized and exited while holding
- * task_lock(), and can only be modified while holding both cgroup_mutex
- * and task_lock() while the task is alive.  This macro verifies that the
- * caller is inside proper critical section and returns @task's css_set.
- *
- * The caller can also specify additional allowed conditions via @__c, such
- * as locks used during the cgroup_subsys::attach() methods.
- */
-#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
-extern struct rw_semaphore css_set_rwsem;
-#define task_css_set_check(task, __c)					\
-	rcu_dereference_check((task)->cgroups,				\
-		lockdep_is_held(&cgroup_mutex) ||			\
-		lockdep_is_held(&css_set_rwsem) ||			\
-		((task)->flags & PF_EXITING) || (__c))
-#else
-#define task_css_set_check(task, __c)					\
-	rcu_dereference((task)->cgroups)
-#endif
-
-/**
- * task_css_check - obtain css for (task, subsys) w/ extra access conds
- * @task: the target task
- * @subsys_id: the target subsystem ID
- * @__c: extra condition expression to be passed to rcu_dereference_check()
- *
- * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
- * synchronization rules are the same as task_css_set_check().
- */
-#define task_css_check(task, subsys_id, __c)				\
-	task_css_set_check((task), (__c))->subsys[(subsys_id)]
-
-/**
- * task_css_set - obtain a task's css_set
- * @task: the task to obtain css_set for
- *
- * See task_css_set_check().
- */
-static inline struct css_set *task_css_set(struct task_struct *task)
-{
-	return task_css_set_check(task, false);
-}
-
-/**
- * task_css - obtain css for (task, subsys)
- * @task: the target task
- * @subsys_id: the target subsystem ID
- *
- * See task_css_check().
- */
-static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
-						   int subsys_id)
-{
-	return task_css_check(task, subsys_id, false);
-}
-
-/**
- * task_css_is_root - test whether a task belongs to the root css
- * @task: the target task
- * @subsys_id: the target subsystem ID
- *
- * Test whether @task belongs to the root css on the specified subsystem.
- * May be invoked in any context.
- */
-static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
-{
-	return task_css_check(task, subsys_id, true) ==
-		init_css_set.subsys[subsys_id];
-}
-
-static inline struct cgroup *task_cgroup(struct task_struct *task,
-					 int subsys_id)
-{
-	return task_css(task, subsys_id)->cgroup;
-}
-
-struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
-					   struct cgroup_subsys_state *parent);
-
-struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
-
-/**
- * css_for_each_child - iterate through children of a css
- * @pos: the css * to use as the loop cursor
- * @parent: css whose children to walk
- *
- * Walk @parent's children.  Must be called under rcu_read_lock().
- *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal.  It's each subsystem's
- * responsibility to synchronize against on/offlining.
- *
- * It is allowed to temporarily drop RCU read lock during iteration.  The
- * caller is responsible for ensuring that @pos remains accessible until
- * the start of the next iteration by, for example, bumping the css refcnt.
- */
-#define css_for_each_child(pos, parent)					\
-	for ((pos) = css_next_child(NULL, (parent)); (pos);		\
-	     (pos) = css_next_child((pos), (parent)))
-
-struct cgroup_subsys_state *
-css_next_descendant_pre(struct cgroup_subsys_state *pos,
-			struct cgroup_subsys_state *css);
-
-struct cgroup_subsys_state *
-css_rightmost_descendant(struct cgroup_subsys_state *pos);
-
-/**
- * css_for_each_descendant_pre - pre-order walk of a css's descendants
- * @pos: the css * to use as the loop cursor
- * @root: css whose descendants to walk
- *
- * Walk @root's descendants.  @root is included in the iteration and the
- * first node to be visited.  Must be called under rcu_read_lock().
- *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal.  It's each subsystem's
- * responsibility to synchronize against on/offlining.
- *
- * For example, the following guarantees that a descendant can't escape
- * state updates of its ancestors.
- *
- * my_online(@css)
- * {
- *	Lock @css's parent and @css;
- *	Inherit state from the parent;
- *	Unlock both.
- * }
- *
- * my_update_state(@css)
- * {
- *	css_for_each_descendant_pre(@pos, @css) {
- *		Lock @pos;
- *		if (@pos == @css)
- *			Update @css's state;
- *		else
- *			Verify @pos is alive and inherit state from its parent;
- *		Unlock @pos;
- *	}
- * }
- *
- * As long as the inheriting step, including checking the parent state, is
- * enclosed inside @pos locking, double-locking the parent isn't necessary
- * while inheriting.  The state update to the parent is guaranteed to be
- * visible by walking order and, as long as inheriting operations to the
- * same @pos are atomic to each other, multiple updates racing each other
- * still result in the correct state.  It's guaranateed that at least one
- * inheritance happens for any css after the latest update to its parent.
- *
- * If checking parent's state requires locking the parent, each inheriting
- * iteration should lock and unlock both @pos->parent and @pos.
- *
- * Alternatively, a subsystem may choose to use a single global lock to
- * synchronize ->css_online() and ->css_offline() against tree-walking
- * operations.
- *
- * It is allowed to temporarily drop RCU read lock during iteration.  The
- * caller is responsible for ensuring that @pos remains accessible until
- * the start of the next iteration by, for example, bumping the css refcnt.
- */
-#define css_for_each_descendant_pre(pos, css)				\
-	for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);	\
-	     (pos) = css_next_descendant_pre((pos), (css)))
-
-struct cgroup_subsys_state *
-css_next_descendant_post(struct cgroup_subsys_state *pos,
-			 struct cgroup_subsys_state *css);
-
-/**
- * css_for_each_descendant_post - post-order walk of a css's descendants
- * @pos: the css * to use as the loop cursor
- * @css: css whose descendants to walk
- *
- * Similar to css_for_each_descendant_pre() but performs post-order
- * traversal instead.  @root is included in the iteration and the last
- * node to be visited.
- *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal.  It's each subsystem's
- * responsibility to synchronize against on/offlining.
- *
- * Note that the walk visibility guarantee example described in pre-order
- * walk doesn't apply the same to post-order walks.
- */
-#define css_for_each_descendant_post(pos, css)				\
-	for ((pos) = css_next_descendant_post(NULL, (css)); (pos);	\
-	     (pos) = css_next_descendant_post((pos), (css)))
-
-bool css_has_online_children(struct cgroup_subsys_state *css);
-
-/* A css_task_iter should be treated as an opaque object */
-struct css_task_iter {
-	struct cgroup_subsys		*ss;
-
-	struct list_head		*cset_pos;
-	struct list_head		*cset_head;
-
-	struct list_head		*task_pos;
-	struct list_head		*tasks_head;
-	struct list_head		*mg_tasks_head;
-};
-
-void css_task_iter_start(struct cgroup_subsys_state *css,
-			 struct css_task_iter *it);
-struct task_struct *css_task_iter_next(struct css_task_iter *it);
-void css_task_iter_end(struct css_task_iter *it);
-
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
-
-struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
-					     struct cgroup_subsys *ss);
-struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
-						       struct cgroup_subsys *ss);
-
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
 
-static inline int cgroup_init_early(void) { return 0; }
-static inline int cgroup_init(void) { return 0; }
+static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline int cgroup_attach_task_all(struct task_struct *from,
+					 struct task_struct *t) { return 0; }
+static inline int cgroupstats_build(struct cgroupstats *stats,
+				    struct dentry *dentry) { return -EINVAL; }
+
 static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p) {}
 
-static inline int cgroupstats_build(struct cgroupstats *stats,
-					struct dentry *dentry)
-{
-	return -EINVAL;
-}
-
-static inline void css_put(struct cgroup_subsys_state *css) {}
-
-/* No cgroups - nothing to do */
-static inline int cgroup_attach_task_all(struct task_struct *from,
-					 struct task_struct *t)
-{
-	return 0;
-}
+static inline int cgroup_init_early(void) { return 0; }
+static inline int cgroup_init(void) { return 0; }
 
 #endif /* !CONFIG_CGROUPS */
 
-- 
cgit v1.2.3


From 87e9b9f1d86c2ee9a10c2a4186a72d0af4cc963e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 16 May 2015 01:38:15 +0200
Subject: PM / sleep: Make suspend-to-idle-specific code depend on
 CONFIG_SUSPEND

Since idle_should_freeze() is defined to always return 'false'
for CONFIG_SUSPEND unset, all of the code depending on it in
cpuidle_idle_call() is not necessary in that case.

Make that code depend on CONFIG_SUSPEND too to avoid building it
when it is not going to be used.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/cpuidle/cpuidle.c |  2 ++
 include/linux/cpuidle.h   | 16 ++++++++++------
 include/linux/tick.h      | 12 ++++++++----
 kernel/time/tick-common.c |  2 ++
 4 files changed, 22 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 61c417b9e53f..71459f546145 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -92,6 +92,7 @@ static int find_deepest_state(struct cpuidle_driver *drv,
 	return ret;
 }
 
+#ifdef CONFIG_SUSPEND
 /**
  * cpuidle_find_deepest_state - Find the deepest available idle state.
  * @drv: cpuidle driver for the given CPU.
@@ -145,6 +146,7 @@ int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 
 	return index;
 }
+#endif /* CONFIG_SUSPEND */
 
 /**
  * cpuidle_enter_state - enter the state and update stats
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 9c5e89254796..13ee266ca98c 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -151,10 +151,6 @@ extern void cpuidle_resume(void);
 extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
-extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
-				      struct cpuidle_device *dev);
-extern int cpuidle_enter_freeze(struct cpuidle_driver *drv,
-				struct cpuidle_device *dev);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
 #else
@@ -190,14 +186,22 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev)
 {return -ENODEV; }
 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
 static inline int cpuidle_play_dead(void) {return -ENODEV; }
+static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
+	struct cpuidle_device *dev) {return NULL; }
+#endif
+
+#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND)
+extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
+				      struct cpuidle_device *dev);
+extern int cpuidle_enter_freeze(struct cpuidle_driver *drv,
+				struct cpuidle_device *dev);
+#else
 static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 					     struct cpuidle_device *dev)
 {return -ENODEV; }
 static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
 				       struct cpuidle_device *dev)
 {return -ENODEV; }
-static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
-	struct cpuidle_device *dev) {return NULL; }
 #endif
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
diff --git a/include/linux/tick.h b/include/linux/tick.h
index f8492da57ad3..ec6e8bc992bf 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -13,8 +13,6 @@
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 extern void __init tick_init(void);
-extern void tick_freeze(void);
-extern void tick_unfreeze(void);
 /* Should be core only, but ARM BL switcher requires it */
 extern void tick_suspend_local(void);
 /* Should be core only, but XEN resume magic and ARM BL switcher require it */
@@ -23,14 +21,20 @@ extern void tick_handover_do_timer(void);
 extern void tick_cleanup_dead_cpu(int cpu);
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
 static inline void tick_init(void) { }
-static inline void tick_freeze(void) { }
-static inline void tick_unfreeze(void) { }
 static inline void tick_suspend_local(void) { }
 static inline void tick_resume_local(void) { }
 static inline void tick_handover_do_timer(void) { }
 static inline void tick_cleanup_dead_cpu(int cpu) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
+#if defined(CONFIG_GENERIC_CLOCKEVENTS) && defined(CONFIG_SUSPEND)
+extern void tick_freeze(void);
+extern void tick_unfreeze(void);
+#else
+static inline void tick_freeze(void) { }
+static inline void tick_unfreeze(void) { }
+#endif
+
 #ifdef CONFIG_TICK_ONESHOT
 extern void tick_irq_enter(void);
 #  ifndef arch_needs_cpu
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 80c043052487..51508465153c 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -441,6 +441,7 @@ void tick_resume(void)
 	tick_resume_local();
 }
 
+#ifdef CONFIG_SUSPEND
 static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
 static unsigned int tick_freeze_depth;
 
@@ -494,6 +495,7 @@ void tick_unfreeze(void)
 
 	raw_spin_unlock(&tick_freeze_lock);
 }
+#endif /* CONFIG_SUSPEND */
 
 /**
  * tick_init - initialize the tick control
-- 
cgit v1.2.3


From ab3f02fc237211f0583c1e7ba3bf504747be9b8d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 May 2015 10:52:27 +0200
Subject: locking/arch: Add WRITE_ONCE() to set_mb()

Since we assume set_mb() to result in a single store followed by a
full memory barrier, employ WRITE_ONCE().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/include/asm/barrier.h      | 2 +-
 arch/arm64/include/asm/barrier.h    | 2 +-
 arch/ia64/include/asm/barrier.h     | 2 +-
 arch/metag/include/asm/barrier.h    | 2 +-
 arch/mips/include/asm/barrier.h     | 2 +-
 arch/powerpc/include/asm/barrier.h  | 2 +-
 arch/s390/include/asm/barrier.h     | 2 +-
 arch/sparc/include/asm/barrier_64.h | 2 +-
 arch/x86/include/asm/barrier.h      | 2 +-
 arch/x86/um/asm/barrier.h           | 2 +-
 include/asm-generic/barrier.h       | 2 +-
 include/linux/compiler.h            | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index d2f81e6b8c1c..993150aea681 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -81,7 +81,7 @@ do {									\
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
-#define set_mb(var, value)	do { var = value; smp_mb(); } while (0)
+#define set_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_mb__before_atomic()	smp_mb()
 #define smp_mb__after_atomic()	smp_mb()
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 71f19c4dc0de..ff7de78d01b8 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -114,7 +114,7 @@ do {									\
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
-#define set_mb(var, value)	do { var = value; smp_mb(); } while (0)
+#define set_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define nop()		asm volatile("nop");
 
 #define smp_mb__before_atomic()	smp_mb()
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
index f6769eb2bbf9..03117e7b2ab8 100644
--- a/arch/ia64/include/asm/barrier.h
+++ b/arch/ia64/include/asm/barrier.h
@@ -82,7 +82,7 @@ do {									\
  * acquire vs release semantics but we can't discuss this stuff with
  * Linus just yet.  Grrr...
  */
-#define set_mb(var, value)	do { (var) = (value); mb(); } while (0)
+#define set_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
 
 /*
  * The group barrier in front of the rsm & ssm are necessary to ensure
diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h
index d703d8e26a65..97eb018a2933 100644
--- a/arch/metag/include/asm/barrier.h
+++ b/arch/metag/include/asm/barrier.h
@@ -84,7 +84,7 @@ static inline void fence(void)
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-#define set_mb(var, value) do { var = value; smp_mb(); } while (0)
+#define set_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index 2b8bbbcb9be0..cff1bbdaa74a 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -113,7 +113,7 @@
 #endif
 
 #define set_mb(var, value) \
-	do { var = value; smp_mb(); } while (0)
+	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_llsc_mb()	__asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
 
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index a3bf5be111ff..2a072e48780d 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -34,7 +34,7 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
-#define set_mb(var, value)	do { var = value; mb(); } while (0)
+#define set_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #ifdef __SUBARCH_HAS_LWSYNC
 #    define SMPWMB      LWSYNC
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index 8d724718ec21..b66cd53d35fc 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -36,7 +36,7 @@
 #define smp_mb__before_atomic()		smp_mb()
 #define smp_mb__after_atomic()		smp_mb()
 
-#define set_mb(var, value)		do { var = value; mb(); } while (0)
+#define set_mb(var, value)		do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h
index 76648941fea7..125fec7512f4 100644
--- a/arch/sparc/include/asm/barrier_64.h
+++ b/arch/sparc/include/asm/barrier_64.h
@@ -41,7 +41,7 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define dma_wmb()	wmb()
 
 #define set_mb(__var, __value) \
-	do { __var = __value; membar_safe("#StoreLoad"); } while(0)
+	do { WRITE_ONCE(__var, __value); membar_safe("#StoreLoad"); } while(0)
 
 #ifdef CONFIG_SMP
 #define smp_mb()	mb()
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 959e45b81fe2..9de5cde133a1 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -40,7 +40,7 @@
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#define set_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 #endif /* SMP */
 
 #define read_barrier_depends()		do { } while (0)
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 7e8a1a650435..cc0cb01f346d 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -39,7 +39,7 @@
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#define set_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index f5c40b0fadc2..3938716b44d7 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -67,7 +67,7 @@
 #endif
 
 #ifndef set_mb
-#define set_mb(var, value)  do { (var) = (value); mb(); } while (0)
+#define set_mb(var, value)  do { WRITE_ONCE(var, value); mb(); } while (0)
 #endif
 
 #ifndef smp_mb__before_atomic
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index a7c0941d10da..03e227ba481c 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -250,7 +250,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 
 #define WRITE_ONCE(x, val) \
-	({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
+	({ union { typeof(x) __val; char __c[1]; } __u = { .__val = (val) }; __write_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
 
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From b92b8b35a2e38bde319fd1d68ec84628c1f1b0fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 May 2015 10:51:55 +0200
Subject: locking/arch: Rename set_mb() to smp_store_mb()

Since set_mb() is really about an smp_mb() -- not a IO/DMA barrier
like mb() rename it to match the recent smp_load_acquire() and
smp_store_release().

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/memory-barriers.txt   | 6 +++---
 arch/arm/include/asm/barrier.h      | 2 +-
 arch/arm64/include/asm/barrier.h    | 2 +-
 arch/ia64/include/asm/barrier.h     | 7 +------
 arch/metag/include/asm/barrier.h    | 2 +-
 arch/mips/include/asm/barrier.h     | 2 +-
 arch/powerpc/include/asm/barrier.h  | 2 +-
 arch/s390/include/asm/barrier.h     | 2 +-
 arch/sh/include/asm/barrier.h       | 2 +-
 arch/sparc/include/asm/barrier_64.h | 2 +-
 arch/x86/include/asm/barrier.h      | 4 ++--
 arch/x86/um/asm/barrier.h           | 3 ++-
 fs/select.c                         | 6 +++---
 include/asm-generic/barrier.h       | 4 ++--
 include/linux/sched.h               | 8 ++++----
 kernel/futex.c                      | 2 +-
 kernel/locking/qspinlock_paravirt.h | 2 +-
 kernel/sched/wait.c                 | 4 ++--
 18 files changed, 29 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f95746189b5d..fe4020e4b468 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1662,7 +1662,7 @@ CPU from reordering them.
 
 There are some more advanced barrier functions:
 
- (*) set_mb(var, value)
+ (*) smp_store_mb(var, value)
 
      This assigns the value to the variable and then inserts a full memory
      barrier after it, depending on the function.  It isn't guaranteed to
@@ -1975,7 +1975,7 @@ after it has altered the task state:
 	CPU 1
 	===============================
 	set_current_state();
-	  set_mb();
+	  smp_store_mb();
 	    STORE current->state
 	    <general barrier>
 	LOAD event_indicated
@@ -2016,7 +2016,7 @@ between the STORE to indicate the event and the STORE to set TASK_RUNNING:
 	CPU 1				CPU 2
 	===============================	===============================
 	set_current_state();		STORE event_indicated
-	  set_mb();			wake_up();
+	  smp_store_mb();		wake_up();
 	    STORE current->state	  <write barrier>
 	    <general barrier>		  STORE current->state
 	LOAD event_indicated
diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index 993150aea681..6c2327e1c732 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -81,7 +81,7 @@ do {									\
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
-#define set_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_mb__before_atomic()	smp_mb()
 #define smp_mb__after_atomic()	smp_mb()
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index ff7de78d01b8..0fa47c4275cb 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -114,7 +114,7 @@ do {									\
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
-#define set_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define nop()		asm volatile("nop");
 
 #define smp_mb__before_atomic()	smp_mb()
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
index 03117e7b2ab8..843ba435e43b 100644
--- a/arch/ia64/include/asm/barrier.h
+++ b/arch/ia64/include/asm/barrier.h
@@ -77,12 +77,7 @@ do {									\
 	___p1;								\
 })
 
-/*
- * XXX check on this ---I suspect what Linus really wants here is
- * acquire vs release semantics but we can't discuss this stuff with
- * Linus just yet.  Grrr...
- */
-#define set_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
 
 /*
  * The group barrier in front of the rsm & ssm are necessary to ensure
diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h
index 97eb018a2933..5a696e507930 100644
--- a/arch/metag/include/asm/barrier.h
+++ b/arch/metag/include/asm/barrier.h
@@ -84,7 +84,7 @@ static inline void fence(void)
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-#define set_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index cff1bbdaa74a..7ecba84656d4 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -112,7 +112,7 @@
 #define __WEAK_LLSC_MB		"		\n"
 #endif
 
-#define set_mb(var, value) \
+#define smp_store_mb(var, value) \
 	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_llsc_mb()	__asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 2a072e48780d..39505d660a70 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -34,7 +34,7 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
-#define set_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #ifdef __SUBARCH_HAS_LWSYNC
 #    define SMPWMB      LWSYNC
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index b66cd53d35fc..e6f8615a11eb 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -36,7 +36,7 @@
 #define smp_mb__before_atomic()		smp_mb()
 #define smp_mb__after_atomic()		smp_mb()
 
-#define set_mb(var, value)		do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value)		do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h
index 43715308b068..bf91037db4e0 100644
--- a/arch/sh/include/asm/barrier.h
+++ b/arch/sh/include/asm/barrier.h
@@ -32,7 +32,7 @@
 #define ctrl_barrier()	__asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop")
 #endif
 
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 
 #include <asm-generic/barrier.h>
 
diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h
index 125fec7512f4..809941e33e12 100644
--- a/arch/sparc/include/asm/barrier_64.h
+++ b/arch/sparc/include/asm/barrier_64.h
@@ -40,7 +40,7 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define dma_rmb()	rmb()
 #define dma_wmb()	wmb()
 
-#define set_mb(__var, __value) \
+#define smp_store_mb(__var, __value) \
 	do { WRITE_ONCE(__var, __value); membar_safe("#StoreLoad"); } while(0)
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 9de5cde133a1..e51a8f803f55 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -35,12 +35,12 @@
 #define smp_mb()	mb()
 #define smp_rmb()	dma_rmb()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 #else /* !SMP */
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 #endif /* SMP */
 
 #define read_barrier_depends()		do { } while (0)
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index cc0cb01f346d..b9531d343134 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -39,7 +39,8 @@
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
+
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
diff --git a/fs/select.c b/fs/select.c
index f684c750e08a..015547330e88 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -189,7 +189,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	 * doesn't imply write barrier and the users expect write
 	 * barrier semantics on wakeup functions.  The following
 	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-	 * and is paired with set_mb() in poll_schedule_timeout.
+	 * and is paired with smp_store_mb() in poll_schedule_timeout.
 	 */
 	smp_wmb();
 	pwq->triggered = 1;
@@ -244,7 +244,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 	/*
 	 * Prepare for the next iteration.
 	 *
-	 * The following set_mb() serves two purposes.  First, it's
+	 * The following smp_store_mb() serves two purposes.  First, it's
 	 * the counterpart rmb of the wmb in pollwake() such that data
 	 * written before wake up is always visible after wake up.
 	 * Second, the full barrier guarantees that triggered clearing
@@ -252,7 +252,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 	 * this problem doesn't exist for the first iteration as
 	 * add_wait_queue() has full barrier semantics.
 	 */
-	set_mb(pwq->triggered, 0);
+	smp_store_mb(pwq->triggered, 0);
 
 	return rc;
 }
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 3938716b44d7..e6a83d712ef6 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -66,8 +66,8 @@
 #define smp_read_barrier_depends()	do { } while (0)
 #endif
 
-#ifndef set_mb
-#define set_mb(var, value)  do { WRITE_ONCE(var, value); mb(); } while (0)
+#ifndef smp_store_mb
+#define smp_store_mb(var, value)  do { WRITE_ONCE(var, value); mb(); } while (0)
 #endif
 
 #ifndef smp_mb__before_atomic
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734..18f197223ebd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -252,7 +252,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_task_state(tsk, state_value)			\
 	do {							\
 		(tsk)->task_state_change = _THIS_IP_;		\
-		set_mb((tsk)->state, (state_value));		\
+		smp_store_mb((tsk)->state, (state_value));		\
 	} while (0)
 
 /*
@@ -274,7 +274,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_current_state(state_value)				\
 	do {							\
 		current->task_state_change = _THIS_IP_;		\
-		set_mb(current->state, (state_value));		\
+		smp_store_mb(current->state, (state_value));		\
 	} while (0)
 
 #else
@@ -282,7 +282,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
 #define set_task_state(tsk, state_value)		\
-	set_mb((tsk)->state, (state_value))
+	smp_store_mb((tsk)->state, (state_value))
 
 /*
  * set_current_state() includes a barrier so that the write of current->state
@@ -298,7 +298,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define __set_current_state(state_value)		\
 	do { current->state = (state_value); } while (0)
 #define set_current_state(state_value)			\
-	set_mb(current->state, (state_value))
+	smp_store_mb(current->state, (state_value))
 
 #endif
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..55ca63ad9622 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2055,7 +2055,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 {
 	/*
 	 * The task state is guaranteed to be set before another task can
-	 * wake it. set_current_state() is implemented using set_mb() and
+	 * wake it. set_current_state() is implemented using smp_store_mb() and
 	 * queue_me() calls spin_unlock() upon completion, both serializing
 	 * access to the hash list and forcing another memory barrier.
 	 */
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 27ab96dca68c..04ab18151cc8 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -175,7 +175,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 *
 		 * Matches the xchg() from pv_kick_node().
 		 */
-		set_mb(pn->state, vcpu_halted);
+		smp_store_mb(pn->state, vcpu_halted);
 
 		if (!READ_ONCE(node->locked))
 			pv_wait(&pn->state, vcpu_halted);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 852143a79f36..9bc82329eaad 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -341,7 +341,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
 	 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
 	 * an event.
 	 */
-	set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+	smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
 
 	return timeout;
 }
@@ -354,7 +354,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	 * doesn't imply write barrier and the users expects write
 	 * barrier semantics on wakeup functions.  The following
 	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
-	 * and is paired with set_mb() in wait_woken().
+	 * and is paired with smp_store_mb() in wait_woken().
 	 */
 	smp_wmb(); /* C */
 	wait->flags |= WQ_FLAG_WOKEN;
-- 
cgit v1.2.3


From 92cf211874e954027b8e91cc9a15485a50b58d6b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 12 May 2015 16:41:46 +0200
Subject: sched/preempt: Merge preempt_mask.h into preempt.h

preempt_mask.h defines all the preempt_count semantics and related
symbols: preempt, softirq, hardirq, nmi, preempt active, need resched,
etc...

preempt.h defines the accessors and mutators of preempt_count.

But there is a messy dependency game around those two header files:

	* preempt_mask.h includes preempt.h in order to access preempt_count()

	* preempt_mask.h defines all preempt_count semantic and symbols
	  except PREEMPT_NEED_RESCHED that is needed by asm/preempt.h
	  Thus we need to define it from preempt.h, right before including
	  asm/preempt.h, instead of defining it to preempt_mask.h with the
	  other preempt_count symbols. Therefore the preempt_count semantics
	  happen to be spread out.

	* We plan to introduce preempt_active_[enter,exit]() to consolidate
	  preempt_schedule*() code. But we'll need to access both preempt_count
	  mutators (preempt_count_add()) and preempt_count symbols
	  (PREEMPT_ACTIVE, PREEMPT_OFFSET). The usual place to define preempt
	  operations is in preempt.h but then we'll need symbols in
	  preempt_mask.h which already includes preempt.h. So we end up with
	  a ressource circle dependency.

Lets merge preempt_mask.h into preempt.h to solve these dependency issues.
This way we gather semantic symbols and operation definition of
preempt_count in a single file.

This is a dumb copy-paste merge. Further merge re-arrangments are
performed in a subsequent patch to ease review.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1431441711-29753-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/m68k/include/asm/irqflags.h |   3 -
 include/linux/bottom_half.h      |   1 -
 include/linux/hardirq.h          |   2 +-
 include/linux/preempt.h          | 111 +++++++++++++++++++++++++++++++++++++
 include/linux/preempt_mask.h     | 117 ---------------------------------------
 include/linux/sched.h            |   2 +-
 lib/radix-tree.c                 |   2 +-
 7 files changed, 114 insertions(+), 124 deletions(-)
 delete mode 100644 include/linux/preempt_mask.h

(limited to 'include/linux')

diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h
index a823cd73dc09..b5941818346f 100644
--- a/arch/m68k/include/asm/irqflags.h
+++ b/arch/m68k/include/asm/irqflags.h
@@ -2,9 +2,6 @@
 #define _M68K_IRQFLAGS_H
 
 #include <linux/types.h>
-#ifdef CONFIG_MMU
-#include <linux/preempt_mask.h>
-#endif
 #include <linux/preempt.h>
 #include <asm/thread_info.h>
 #include <asm/entry.h>
diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
index 86c12c93e3cf..8fdcb783197d 100644
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -2,7 +2,6 @@
 #define _LINUX_BH_H
 
 #include <linux/preempt.h>
-#include <linux/preempt_mask.h>
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f4af03404b97..dfd59d6bc6f0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -1,7 +1,7 @@
 #ifndef LINUX_HARDIRQ_H
 #define LINUX_HARDIRQ_H
 
-#include <linux/preempt_mask.h>
+#include <linux/preempt.h>
 #include <linux/lockdep.h>
 #include <linux/ftrace_irq.h>
 #include <linux/vtime.h>
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index de83b4eb1642..8cc0338a5e9a 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -17,6 +17,117 @@
 
 #include <asm/preempt.h>
 
+/*
+ * We put the hardirq and softirq counter into the preemption
+ * counter. The bitmask has the following meaning:
+ *
+ * - bits 0-7 are the preemption count (max preemption depth: 256)
+ * - bits 8-15 are the softirq count (max # of softirqs: 256)
+ *
+ * The hardirq count could in theory be the same as the number of
+ * interrupts in the system, but we run all interrupt handlers with
+ * interrupts disabled, so we cannot have nesting interrupts. Though
+ * there are a few palaeontologic drivers which reenable interrupts in
+ * the handler, so we need more than one bit here.
+ *
+ * PREEMPT_MASK:	0x000000ff
+ * SOFTIRQ_MASK:	0x0000ff00
+ * HARDIRQ_MASK:	0x000f0000
+ *     NMI_MASK:	0x00100000
+ * PREEMPT_ACTIVE:	0x00200000
+ */
+#define PREEMPT_BITS	8
+#define SOFTIRQ_BITS	8
+#define HARDIRQ_BITS	4
+#define NMI_BITS	1
+
+#define PREEMPT_SHIFT	0
+#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
+#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
+
+#define __IRQ_MASK(x)	((1UL << (x))-1)
+
+#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
+#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+#define NMI_MASK	(__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
+
+#define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
+#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
+#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
+#define NMI_OFFSET	(1UL << NMI_SHIFT)
+
+#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)
+
+#define PREEMPT_ACTIVE_BITS	1
+#define PREEMPT_ACTIVE_SHIFT	(NMI_SHIFT + NMI_BITS)
+#define PREEMPT_ACTIVE	(__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
+
+#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
+#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
+#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
+				 | NMI_MASK))
+
+/*
+ * Are we doing bottom half or hardware interrupt processing?
+ * Are we in a softirq context? Interrupt context?
+ * in_softirq - Are we currently processing softirq or have bh disabled?
+ * in_serving_softirq - Are we currently processing softirq?
+ */
+#define in_irq()		(hardirq_count())
+#define in_softirq()		(softirq_count())
+#define in_interrupt()		(irq_count())
+#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
+
+/*
+ * Are we in NMI context?
+ */
+#define in_nmi()	(preempt_count() & NMI_MASK)
+
+#if defined(CONFIG_PREEMPT_COUNT)
+# define PREEMPT_CHECK_OFFSET 1
+#else
+# define PREEMPT_CHECK_OFFSET 0
+#endif
+
+/*
+ * The preempt_count offset needed for things like:
+ *
+ *  spin_lock_bh()
+ *
+ * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
+ * softirqs, such that unlock sequences of:
+ *
+ *  spin_unlock();
+ *  local_bh_enable();
+ *
+ * Work as expected.
+ */
+#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
+
+/*
+ * Are we running in atomic context?  WARNING: this macro cannot
+ * always detect atomic context; in particular, it cannot know about
+ * held spinlocks in non-preemptible kernels.  Thus it should not be
+ * used in the general case to determine whether sleeping is possible.
+ * Do not use in_atomic() in driver code.
+ */
+#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
+
+/*
+ * Check whether we were atomic before we did preempt_disable():
+ * (used by the scheduler, *after* releasing the kernel lock)
+ */
+#define in_atomic_preempt_off() \
+		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
+
+#ifdef CONFIG_PREEMPT_COUNT
+# define preemptible()	(preempt_count() == 0 && !irqs_disabled())
+#else
+# define preemptible()	0
+#endif
+
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
 extern void preempt_count_sub(int val);
diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h
deleted file mode 100644
index dbeec4d4a3be..000000000000
--- a/include/linux/preempt_mask.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef LINUX_PREEMPT_MASK_H
-#define LINUX_PREEMPT_MASK_H
-
-#include <linux/preempt.h>
-
-/*
- * We put the hardirq and softirq counter into the preemption
- * counter. The bitmask has the following meaning:
- *
- * - bits 0-7 are the preemption count (max preemption depth: 256)
- * - bits 8-15 are the softirq count (max # of softirqs: 256)
- *
- * The hardirq count could in theory be the same as the number of
- * interrupts in the system, but we run all interrupt handlers with
- * interrupts disabled, so we cannot have nesting interrupts. Though
- * there are a few palaeontologic drivers which reenable interrupts in
- * the handler, so we need more than one bit here.
- *
- * PREEMPT_MASK:	0x000000ff
- * SOFTIRQ_MASK:	0x0000ff00
- * HARDIRQ_MASK:	0x000f0000
- *     NMI_MASK:	0x00100000
- * PREEMPT_ACTIVE:	0x00200000
- */
-#define PREEMPT_BITS	8
-#define SOFTIRQ_BITS	8
-#define HARDIRQ_BITS	4
-#define NMI_BITS	1
-
-#define PREEMPT_SHIFT	0
-#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
-#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
-#define NMI_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
-
-#define __IRQ_MASK(x)	((1UL << (x))-1)
-
-#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
-#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
-#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
-#define NMI_MASK	(__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
-
-#define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
-#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
-#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
-#define NMI_OFFSET	(1UL << NMI_SHIFT)
-
-#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)
-
-#define PREEMPT_ACTIVE_BITS	1
-#define PREEMPT_ACTIVE_SHIFT	(NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE	(__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-
-#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
-#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
-#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
-				 | NMI_MASK))
-
-/*
- * Are we doing bottom half or hardware interrupt processing?
- * Are we in a softirq context? Interrupt context?
- * in_softirq - Are we currently processing softirq or have bh disabled?
- * in_serving_softirq - Are we currently processing softirq?
- */
-#define in_irq()		(hardirq_count())
-#define in_softirq()		(softirq_count())
-#define in_interrupt()		(irq_count())
-#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
-
-/*
- * Are we in NMI context?
- */
-#define in_nmi()	(preempt_count() & NMI_MASK)
-
-#if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_CHECK_OFFSET 0
-#endif
-
-/*
- * The preempt_count offset needed for things like:
- *
- *  spin_lock_bh()
- *
- * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
- * softirqs, such that unlock sequences of:
- *
- *  spin_unlock();
- *  local_bh_enable();
- *
- * Work as expected.
- */
-#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
-
-/*
- * Are we running in atomic context?  WARNING: this macro cannot
- * always detect atomic context; in particular, it cannot know about
- * held spinlocks in non-preemptible kernels.  Thus it should not be
- * used in the general case to determine whether sleeping is possible.
- * Do not use in_atomic() in driver code.
- */
-#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
-
-/*
- * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler, *after* releasing the kernel lock)
- */
-#define in_atomic_preempt_off() \
-		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
-
-#ifdef CONFIG_PREEMPT_COUNT
-# define preemptible()	(preempt_count() == 0 && !irqs_disabled())
-#else
-# define preemptible()	0
-#endif
-
-#endif /* LINUX_PREEMPT_MASK_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f8defa155cf..c53a1784d7a9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,7 +25,7 @@ struct sched_param {
 #include <linux/errno.h>
 #include <linux/nodemask.h>
 #include <linux/mm_types.h>
-#include <linux/preempt_mask.h>
+#include <linux/preempt.h>
 
 #include <asm/page.h>
 #include <asm/ptrace.h>
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 3d2aa27b845b..061550de77bc 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -33,7 +33,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
-#include <linux/preempt_mask.h>		/* in_interrupt() */
+#include <linux/preempt.h>		/* in_interrupt() */
 
 
 /*
-- 
cgit v1.2.3


From 2e10e71ce88e3eaccfd09a045ae6ecebe657ba09 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 12 May 2015 16:41:47 +0200
Subject: sched/preempt: Rearrange a few symbols after headers merge

Adjust a few comments, and further integrate a few definitions after
the dumb headers copy.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1431441711-29753-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 8cc0338a5e9a..37974cd4f092 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -9,14 +9,6 @@
 #include <linux/linkage.h>
 #include <linux/list.h>
 
-/*
- * We use the MSB mostly because its available; see <linux/preempt_mask.h> for
- * the other bits -- can't include that header due to inclusion hell.
- */
-#define PREEMPT_NEED_RESCHED	0x80000000
-
-#include <asm/preempt.h>
-
 /*
  * We put the hardirq and softirq counter into the preemption
  * counter. The bitmask has the following meaning:
@@ -30,11 +22,12 @@
  * there are a few palaeontologic drivers which reenable interrupts in
  * the handler, so we need more than one bit here.
  *
- * PREEMPT_MASK:	0x000000ff
- * SOFTIRQ_MASK:	0x0000ff00
- * HARDIRQ_MASK:	0x000f0000
- *     NMI_MASK:	0x00100000
- * PREEMPT_ACTIVE:	0x00200000
+ *         PREEMPT_MASK:	0x000000ff
+ *         SOFTIRQ_MASK:	0x0000ff00
+ *         HARDIRQ_MASK:	0x000f0000
+ *             NMI_MASK:	0x00100000
+ *       PREEMPT_ACTIVE:	0x00200000
+ * PREEMPT_NEED_RESCHED:	0x80000000
  */
 #define PREEMPT_BITS	8
 #define SOFTIRQ_BITS	8
@@ -64,6 +57,12 @@
 #define PREEMPT_ACTIVE_SHIFT	(NMI_SHIFT + NMI_BITS)
 #define PREEMPT_ACTIVE	(__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
 
+/* We use the MSB mostly because its available */
+#define PREEMPT_NEED_RESCHED	0x80000000
+
+/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
+#include <asm/preempt.h>
+
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
 #define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
@@ -122,12 +121,6 @@
 #define in_atomic_preempt_off() \
 		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
 
-#ifdef CONFIG_PREEMPT_COUNT
-# define preemptible()	(preempt_count() == 0 && !irqs_disabled())
-#else
-# define preemptible()	0
-#endif
-
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
 extern void preempt_count_sub(int val);
@@ -160,6 +153,8 @@ do { \
 
 #define preempt_enable_no_resched() sched_preempt_enable_no_resched()
 
+#define preemptible()	(preempt_count() == 0 && !irqs_disabled())
+
 #ifdef CONFIG_PREEMPT
 #define preempt_enable() \
 do { \
@@ -232,6 +227,7 @@ do { \
 #define preempt_disable_notrace()		barrier()
 #define preempt_enable_no_resched_notrace()	barrier()
 #define preempt_enable_notrace()		barrier()
+#define preemptible()				0
 
 #endif /* CONFIG_PREEMPT_COUNT */
 
-- 
cgit v1.2.3


From 90b62b5129d5cb50f62f40e684de7a1961e57197 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 12 May 2015 16:41:48 +0200
Subject: sched/preempt: Rename PREEMPT_CHECK_OFFSET to PREEMPT_DISABLE_OFFSET

"CHECK" suggests it's only used as a comparison mask. But now it's used
further as a config-conditional preempt disabler offset. Lets
disambiguate this name.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1431441711-29753-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 37974cd4f092..4689ef210a13 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -85,9 +85,9 @@
 #define in_nmi()	(preempt_count() & NMI_MASK)
 
 #if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_CHECK_OFFSET 1
+# define PREEMPT_DISABLE_OFFSET 1
 #else
-# define PREEMPT_CHECK_OFFSET 0
+# define PREEMPT_DISABLE_OFFSET 0
 #endif
 
 /*
@@ -103,7 +103,7 @@
  *
  * Work as expected.
  */
-#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_CHECK_OFFSET)
+#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET)
 
 /*
  * Are we running in atomic context?  WARNING: this macro cannot
@@ -119,7 +119,7 @@
  * (used by the scheduler, *after* releasing the kernel lock)
  */
 #define in_atomic_preempt_off() \
-		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
+		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
 
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
-- 
cgit v1.2.3


From b30f0e3ffedfa52b1d67a302ae5860c49998e5e2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 12 May 2015 16:41:49 +0200
Subject: sched/preempt: Optimize preemption operations on __schedule() callers

__schedule() disables preemption and some of its callers
(the preempt_schedule*() family) also set PREEMPT_ACTIVE.

So we have two preempt_count() modifications that could be performed
at once.

Lets remove the preemption disablement from __schedule() and pull
this responsibility to its callers in order to optimize preempt_count()
operations in a single place.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1431441711-29753-5-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 12 ++++++++++++
 kernel/sched/core.c     | 29 +++++++++--------------------
 2 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4689ef210a13..45da394f2779 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -137,6 +137,18 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
+#define preempt_active_enter() \
+do { \
+	preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
+	barrier(); \
+} while (0)
+
+#define preempt_active_exit() \
+do { \
+	barrier(); \
+	preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
+} while (0)
+
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 355f9538ca33..5140db62c621 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2773,9 +2773,7 @@ again:
  *          - return from syscall or exception to user-space
  *          - return from interrupt-handler to user-space
  *
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
  */
 static void __sched __schedule(void)
 {
@@ -2784,7 +2782,6 @@ static void __sched __schedule(void)
 	struct rq *rq;
 	int cpu;
 
-	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch();
@@ -2848,8 +2845,6 @@ static void __sched __schedule(void)
 		raw_spin_unlock_irq(&rq->lock);
 
 	post_schedule(rq);
-
-	sched_preempt_enable_no_resched();
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2870,7 +2865,9 @@ asmlinkage __visible void __sched schedule(void)
 
 	sched_submit_work(tsk);
 	do {
+		preempt_disable();
 		__schedule();
+		sched_preempt_enable_no_resched();
 	} while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
@@ -2909,15 +2906,14 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
+		preempt_active_exit();
 
 		/*
 		 * Check again in case we missed a preemption opportunity
 		 * between schedule and now.
 		 */
-		barrier();
 	} while (need_resched());
 }
 
@@ -2964,7 +2960,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 		return;
 
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
@@ -2974,8 +2970,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 		__schedule();
 		exception_exit(prev_ctx);
 
-		__preempt_count_sub(PREEMPT_ACTIVE);
-		barrier();
+		preempt_active_exit();
 	} while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_context);
@@ -2999,17 +2994,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	prev_state = exception_enter();
 
 	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
+		preempt_active_enter();
 		local_irq_enable();
 		__schedule();
 		local_irq_disable();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
+		preempt_active_exit();
 	} while (need_resched());
 
 	exception_exit(prev_state);
-- 
cgit v1.2.3


From e017cf21ae82e0b36f026b22083a8ae67926f465 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 12 May 2015 16:41:50 +0200
Subject: sched/preempt: Fix out of date comment

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1431441711-29753-6-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 45da394f2779..4057696c641c 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -116,7 +116,7 @@
 
 /*
  * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler, *after* releasing the kernel lock)
+ * (used by the scheduler)
  */
 #define in_atomic_preempt_off() \
 		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
-- 
cgit v1.2.3


From 3e51f3c4004c9b01f66da03214a3e206f5ed627b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 12 May 2015 16:41:51 +0200
Subject: sched/preempt: Remove PREEMPT_ACTIVE unmasking off in_atomic()

Now that PREEMPT_ACTIVE implies PREEMPT_DISABLE_OFFSET, ignoring
PREEMPT_ACTIVE from in_atomic() check isn't useful anymore.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1431441711-29753-7-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4057696c641c..a1a00e14c14f 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -112,7 +112,7 @@
  * used in the general case to determine whether sleeping is possible.
  * Do not use in_atomic() in driver code.
  */
-#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
+#define in_atomic()	(preempt_count() != 0)
 
 /*
  * Check whether we were atomic before we did preempt_disable():
-- 
cgit v1.2.3


From 8bcbde5480f9777f8b74d71493722c663e22c21b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 11 May 2015 17:52:06 +0200
Subject: sched/preempt, mm/fault: Count pagefault_disable() levels in
 pagefault_disabled

Until now, pagefault_disable()/pagefault_enabled() used the preempt
count to track whether in an environment with pagefaults disabled (can
be queried via in_atomic()).

This patch introduces a separate counter in task_struct to count the
level of pagefault_disable() calls. We'll keep manipulating the preempt
count to retain compatibility to existing pagefault handlers.

It is now possible to verify whether in a pagefault_disable() envionment
by calling pagefault_disabled(). In contrast to in_atomic() it will not
be influenced by preempt_enable()/preempt_disable().

This patch is based on a patch from Ingo Molnar.

Reviewed-and-tested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David.Laight@ACULAB.COM
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: airlied@linux.ie
Cc: akpm@linux-foundation.org
Cc: benh@kernel.crashing.org
Cc: bigeasy@linutronix.de
Cc: borntraeger@de.ibm.com
Cc: daniel.vetter@intel.com
Cc: heiko.carstens@de.ibm.com
Cc: herbert@gondor.apana.org.au
Cc: hocko@suse.cz
Cc: hughd@google.com
Cc: mst@redhat.com
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: schwidefsky@de.ibm.com
Cc: yang.shi@windriver.com
Link: http://lkml.kernel.org/r/1431359540-32227-2-git-send-email-dahi@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h   |  1 +
 include/linux/uaccess.h | 36 +++++++++++++++++++++++++++++-------
 kernel/fork.c           |  3 +++
 3 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c53a1784d7a9..dd07ac03f82a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1788,6 +1788,7 @@ struct task_struct {
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 	unsigned long	task_state_change;
 #endif
+	int pagefault_disabled;
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ecd3319dac33..23290cc93a24 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -2,20 +2,36 @@
 #define __LINUX_UACCESS_H__
 
 #include <linux/preempt.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 
+static __always_inline void pagefault_disabled_inc(void)
+{
+	current->pagefault_disabled++;
+}
+
+static __always_inline void pagefault_disabled_dec(void)
+{
+	current->pagefault_disabled--;
+	WARN_ON(current->pagefault_disabled < 0);
+}
+
 /*
- * These routines enable/disable the pagefault handler in that
- * it will not take any locks and go straight to the fixup table.
+ * These routines enable/disable the pagefault handler. If disabled, it will
+ * not take any locks and go straight to the fixup table.
+ *
+ * We increase the preempt and the pagefault count, to be able to distinguish
+ * whether we run in simple atomic context or in a real pagefault_disable()
+ * context.
+ *
+ * For now, after pagefault_disabled() has been called, we run in atomic
+ * context. User access methods will not sleep.
  *
- * They have great resemblance to the preempt_disable/enable calls
- * and in fact they are identical; this is because currently there is
- * no other way to make the pagefault handlers do this. So we do
- * disable preemption but we don't necessarily care about that.
  */
 static inline void pagefault_disable(void)
 {
 	preempt_count_inc();
+	pagefault_disabled_inc();
 	/*
 	 * make sure to have issued the store before a pagefault
 	 * can hit.
@@ -25,18 +41,24 @@ static inline void pagefault_disable(void)
 
 static inline void pagefault_enable(void)
 {
-#ifndef CONFIG_PREEMPT
 	/*
 	 * make sure to issue those last loads/stores before enabling
 	 * the pagefault handler again.
 	 */
 	barrier();
+	pagefault_disabled_dec();
+#ifndef CONFIG_PREEMPT
 	preempt_count_dec();
 #else
 	preempt_enable();
 #endif
 }
 
+/*
+ * Is the pagefault handler disabled? If so, user access methods will not sleep.
+ */
+#define pagefault_disabled() (current->pagefault_disabled != 0)
+
 #ifndef ARCH_HAS_NOCACHE_UACCESS
 
 static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
diff --git a/kernel/fork.c b/kernel/fork.c
index 2e670864174f..0bb88b555550 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1393,6 +1393,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->hardirq_context = 0;
 	p->softirq_context = 0;
 #endif
+
+	p->pagefault_disabled = 0;
+
 #ifdef CONFIG_LOCKDEP
 	p->lockdep_depth = 0; /* no locks held yet */
 	p->curr_chain_key = 0;
-- 
cgit v1.2.3


From 9ec23531fd48031d1b6ca5366f5f967d17a8bc28 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 11 May 2015 17:52:07 +0200
Subject: sched/preempt, mm/fault: Trigger might_sleep() in might_fault() with
 disabled pagefaults

Commit 662bbcb2747c ("mm, sched: Allow uaccess in atomic with
pagefault_disable()") removed might_sleep() checks for all user access
code (that uses might_fault()).

The reason was to disable wrong "sleep in atomic" warnings in the
following scenario:

    pagefault_disable()
    rc = copy_to_user(...)
    pagefault_enable()

Which is valid, as pagefault_disable() increments the preempt counter
and therefore disables the pagefault handler. copy_to_user() will not
sleep and return an error code if a page is not available.

However, as all might_sleep() checks are removed,
CONFIG_DEBUG_ATOMIC_SLEEP would no longer detect the following scenario:

    spin_lock(&lock);
    rc = copy_to_user(...)
    spin_unlock(&lock)

If the kernel is compiled with preemption turned on, preempt_disable()
will make in_atomic() detect disabled preemption. The fault handler would
correctly never sleep on user access.
However, with preemption turned off, preempt_disable() is usually a NOP
(with !CONFIG_PREEMPT_COUNT), therefore in_atomic() will not be able to
detect disabled preemption nor disabled pagefaults. The fault handler
could sleep.
We really want to enable CONFIG_DEBUG_ATOMIC_SLEEP checks for user access
functions again, otherwise we can end up with horrible deadlocks.

Root of all evil is that pagefault_disable() acts almost as
preempt_disable(), depending on preemption being turned on/off.

As we now have pagefault_disabled(), we can use it to distinguish
whether user acces functions might sleep.

Convert might_fault() into a makro that calls __might_fault(), to
allow proper file + line messages in case of a might_sleep() warning.

Reviewed-and-tested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David.Laight@ACULAB.COM
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: airlied@linux.ie
Cc: akpm@linux-foundation.org
Cc: benh@kernel.crashing.org
Cc: bigeasy@linutronix.de
Cc: borntraeger@de.ibm.com
Cc: daniel.vetter@intel.com
Cc: heiko.carstens@de.ibm.com
Cc: herbert@gondor.apana.org.au
Cc: hocko@suse.cz
Cc: hughd@google.com
Cc: mst@redhat.com
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: schwidefsky@de.ibm.com
Cc: yang.shi@windriver.com
Link: http://lkml.kernel.org/r/1431359540-32227-3-git-send-email-dahi@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kernel.h |  3 ++-
 mm/memory.c            | 18 ++++++------------
 2 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3a5b48e52a9e..060dd7b61c6d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -244,7 +244,8 @@ static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
 
 #if defined(CONFIG_MMU) && \
 	(defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP))
-void might_fault(void);
+#define might_fault() __might_fault(__FILE__, __LINE__)
+void __might_fault(const char *file, int line);
 #else
 static inline void might_fault(void) { }
 #endif
diff --git a/mm/memory.c b/mm/memory.c
index 22e037e3364e..17734c3c1183 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3737,7 +3737,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 }
 
 #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
-void might_fault(void)
+void __might_fault(const char *file, int line)
 {
 	/*
 	 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
@@ -3747,21 +3747,15 @@ void might_fault(void)
 	 */
 	if (segment_eq(get_fs(), KERNEL_DS))
 		return;
-
-	/*
-	 * it would be nicer only to annotate paths which are not under
-	 * pagefault_disable, however that requires a larger audit and
-	 * providing helpers like get_user_atomic.
-	 */
-	if (in_atomic())
+	if (pagefault_disabled())
 		return;
-
-	__might_sleep(__FILE__, __LINE__, 0);
-
+	__might_sleep(file, line, 0);
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
 	if (current->mm)
 		might_lock_read(&current->mm->mmap_sem);
+#endif
 }
-EXPORT_SYMBOL(might_fault);
+EXPORT_SYMBOL(__might_fault);
 #endif
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
-- 
cgit v1.2.3


From 2cb7c9cb426660b5ed58b643d9e7dd5d50ba901f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 11 May 2015 17:52:09 +0200
Subject: sched/preempt, mm/kmap: Explicitly disable/enable preemption in
 kmap_atomic_*

The existing code relies on pagefault_disable() implicitly disabling
preemption, so that no schedule will happen between kmap_atomic() and
kunmap_atomic().

Let's make this explicit, to prepare for pagefault_disable() not
touching preemption anymore.

Reviewed-and-tested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David.Laight@ACULAB.COM
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: airlied@linux.ie
Cc: akpm@linux-foundation.org
Cc: benh@kernel.crashing.org
Cc: bigeasy@linutronix.de
Cc: borntraeger@de.ibm.com
Cc: daniel.vetter@intel.com
Cc: heiko.carstens@de.ibm.com
Cc: herbert@gondor.apana.org.au
Cc: hocko@suse.cz
Cc: hughd@google.com
Cc: mst@redhat.com
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: schwidefsky@de.ibm.com
Cc: yang.shi@windriver.com
Link: http://lkml.kernel.org/r/1431359540-32227-5-git-send-email-dahi@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/mm/highmem.c                | 3 +++
 arch/frv/mm/highmem.c                | 2 ++
 arch/metag/mm/highmem.c              | 4 +++-
 arch/microblaze/mm/highmem.c         | 4 +++-
 arch/mips/mm/highmem.c               | 5 ++++-
 arch/mn10300/include/asm/highmem.h   | 3 +++
 arch/parisc/include/asm/cacheflush.h | 2 ++
 arch/powerpc/mm/highmem.c            | 4 +++-
 arch/sparc/mm/highmem.c              | 4 +++-
 arch/tile/mm/highmem.c               | 3 ++-
 arch/x86/mm/highmem_32.c             | 3 ++-
 arch/x86/mm/iomap_32.c               | 2 ++
 arch/xtensa/mm/highmem.c             | 2 ++
 include/linux/highmem.h              | 2 ++
 include/linux/io-mapping.h           | 2 ++
 15 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index b98895d9fe57..ee8dfa793989 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -59,6 +59,7 @@ void *kmap_atomic(struct page *page)
 	void *kmap;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -121,6 +122,7 @@ void __kunmap_atomic(void *kvaddr)
 		kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
 	}
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -130,6 +132,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	int idx, type;
 	struct page *page = pfn_to_page(pfn);
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c
index bed9a9bd3c10..785344bbdc07 100644
--- a/arch/frv/mm/highmem.c
+++ b/arch/frv/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long paddr;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 	type = kmap_atomic_idx_push();
 	paddr = page_to_phys(page);
@@ -85,5 +86,6 @@ void __kunmap_atomic(void *kvaddr)
 	}
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/metag/mm/highmem.c b/arch/metag/mm/highmem.c
index d71f621a2c0b..807f1b1c4e65 100644
--- a/arch/metag/mm/highmem.c
+++ b/arch/metag/mm/highmem.c
@@ -43,7 +43,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -82,6 +82,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -95,6 +96,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	unsigned long vaddr;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c
index 5a92576fad92..2fcc5a52d84d 100644
--- a/arch/microblaze/mm/highmem.c
+++ b/arch/microblaze/mm/highmem.c
@@ -37,7 +37,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -84,5 +85,6 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index da815d295239..11661cbc11a8 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -47,7 +47,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -72,6 +72,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < FIXADDR_START) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -92,6 +93,7 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -104,6 +106,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h
index 2fbbe4d920aa..1ddea5afba09 100644
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -75,6 +75,7 @@ static inline void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 	if (page < highmem_start_page)
 		return page_address(page);
@@ -98,6 +99,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
 
 	if (vaddr < FIXADDR_START) { /* FIXME */
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -122,6 +124,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 #endif /* __KERNEL__ */
 
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index de65f66ea64e..ec2df4bab302 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -142,6 +142,7 @@ static inline void kunmap(struct page *page)
 
 static inline void *kmap_atomic(struct page *page)
 {
+	preempt_disable();
 	pagefault_disable();
 	return page_address(page);
 }
@@ -150,6 +151,7 @@ static inline void __kunmap_atomic(void *addr)
 {
 	flush_kernel_dcache_page_addr(addr);
 	pagefault_enable();
+	preempt_enable();
 }
 
 #define kmap_atomic_prot(page, prot)	kmap_atomic(page)
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index e7450bdbe83a..e292c8a60952 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -34,7 +34,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -59,6 +59,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -82,5 +83,6 @@ void __kunmap_atomic(void *kvaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 449f864f0cef..a454ec5ff07a 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	long idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -91,6 +91,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < FIXADDR_START) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -126,5 +127,6 @@ void __kunmap_atomic(void *kvaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 6aa2f2625447..fcd545014e79 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -201,7 +201,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	int idx, type;
 	pte_t *pte;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 
 	/* Avoid icache flushes by disallowing atomic executable mappings. */
@@ -259,6 +259,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 4500142bc4aa..eecb207a2037 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 
 	if (!PageHighMem(page))
@@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 9ca35fc60cfe..2b7ece0e103a 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -59,6 +59,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
@@ -117,5 +118,6 @@ iounmap_atomic(void __iomem *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c
index 8cfb71ec0937..184ceadccc1a 100644
--- a/arch/xtensa/mm/highmem.c
+++ b/arch/xtensa/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -79,6 +80,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 9286a46b7d69..6aefcd0031a6 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -65,6 +65,7 @@ static inline void kunmap(struct page *page)
 
 static inline void *kmap_atomic(struct page *page)
 {
+	preempt_disable();
 	pagefault_disable();
 	return page_address(page);
 }
@@ -73,6 +74,7 @@ static inline void *kmap_atomic(struct page *page)
 static inline void __kunmap_atomic(void *addr)
 {
 	pagefault_enable();
+	preempt_enable();
 }
 
 #define kmap_atomic_pfn(pfn)	kmap_atomic(pfn_to_page(pfn))
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 657fab4efab3..c27dde7215b5 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -141,6 +141,7 @@ static inline void __iomem *
 io_mapping_map_atomic_wc(struct io_mapping *mapping,
 			 unsigned long offset)
 {
+	preempt_disable();
 	pagefault_disable();
 	return ((char __force __iomem *) mapping) + offset;
 }
@@ -149,6 +150,7 @@ static inline void
 io_mapping_unmap_atomic(void __iomem *vaddr)
 {
 	pagefault_enable();
+	preempt_enable();
 }
 
 /* Non-atomic map/unmap */
-- 
cgit v1.2.3


From 70ffdb9393a7264a069265edded729078dcf0425 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 11 May 2015 17:52:11 +0200
Subject: mm/fault, arch: Use pagefault_disable() to check for disabled
 pagefaults in the handler

Introduce faulthandler_disabled() and use it to check for irq context and
disabled pagefaults (via pagefault_disable()) in the pagefault handlers.

Please note that we keep the in_atomic() checks in place - to detect
whether in irq context (in which case preemption is always properly
disabled).

In contrast, preempt_disable() should never be used to disable pagefaults.
With !CONFIG_PREEMPT_COUNT, preempt_disable() doesn't modify the preempt
counter, and therefore the result of in_atomic() differs.
We validate that condition by using might_fault() checks when calling
might_sleep().

Therefore, add a comment to faulthandler_disabled(), describing why this
is needed.

faulthandler_disabled() and pagefault_disable() are defined in
linux/uaccess.h, so let's properly add that include to all relevant files.

This patch is based on a patch from Thomas Gleixner.

Reviewed-and-tested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David.Laight@ACULAB.COM
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: airlied@linux.ie
Cc: akpm@linux-foundation.org
Cc: benh@kernel.crashing.org
Cc: bigeasy@linutronix.de
Cc: borntraeger@de.ibm.com
Cc: daniel.vetter@intel.com
Cc: heiko.carstens@de.ibm.com
Cc: herbert@gondor.apana.org.au
Cc: hocko@suse.cz
Cc: hughd@google.com
Cc: mst@redhat.com
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: schwidefsky@de.ibm.com
Cc: yang.shi@windriver.com
Link: http://lkml.kernel.org/r/1431359540-32227-7-git-send-email-dahi@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/mm/fault.c      |  5 ++---
 arch/arc/mm/fault.c        |  2 +-
 arch/arm/mm/fault.c        |  2 +-
 arch/arm64/mm/fault.c      |  2 +-
 arch/avr32/mm/fault.c      |  4 ++--
 arch/cris/mm/fault.c       |  6 +++---
 arch/frv/mm/fault.c        |  4 ++--
 arch/ia64/mm/fault.c       |  4 ++--
 arch/m32r/mm/fault.c       |  8 ++++----
 arch/m68k/mm/fault.c       |  4 ++--
 arch/metag/mm/fault.c      |  2 +-
 arch/microblaze/mm/fault.c |  8 ++++----
 arch/mips/mm/fault.c       |  4 ++--
 arch/mn10300/mm/fault.c    |  4 ++--
 arch/nios2/mm/fault.c      |  2 +-
 arch/parisc/kernel/traps.c |  4 ++--
 arch/parisc/mm/fault.c     |  4 ++--
 arch/powerpc/mm/fault.c    |  9 +++++----
 arch/s390/mm/fault.c       |  2 +-
 arch/score/mm/fault.c      |  3 ++-
 arch/sh/mm/fault.c         |  5 +++--
 arch/sparc/mm/fault_32.c   |  4 ++--
 arch/sparc/mm/fault_64.c   |  4 ++--
 arch/sparc/mm/init_64.c    |  2 +-
 arch/tile/mm/fault.c       |  4 ++--
 arch/um/kernel/trap.c      |  4 ++--
 arch/unicore32/mm/fault.c  |  2 +-
 arch/x86/mm/fault.c        |  5 +++--
 arch/xtensa/mm/fault.c     |  4 ++--
 include/linux/uaccess.h    | 12 ++++++++++++
 30 files changed, 72 insertions(+), 57 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 9d0ac091a52a..4a905bd667e2 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -23,8 +23,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
 
@@ -107,7 +106,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 
 	/* If we're in an interrupt context, or have no user context,
 	   we must not take the fault.  */
-	if (!mm || in_atomic())
+	if (!mm || faulthandler_disabled())
 		goto no_context;
 
 #ifdef CONFIG_ALPHA_LARGE_VMALLOC
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 6a2e006cbcce..d948e4e9d89c 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -86,7 +86,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 6333d9c17875..0d629b8f973f 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -276,7 +276,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 96da13167d4a..0948d327d013 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -211,7 +211,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	 * If we're in an interrupt or have no user context, we must not take
 	 * the fault.
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index d223a8b57c1e..c03533937a9f 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -14,11 +14,11 @@
 #include <linux/pagemap.h>
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
+#include <linux/uaccess.h>
 
 #include <asm/mmu_context.h>
 #include <asm/sysreg.h>
 #include <asm/tlb.h>
-#include <asm/uaccess.h>
 
 #ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs, int trap)
@@ -81,7 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user context, we must
 	 * not take the fault...
 	 */
-	if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM))
+	if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM))
 		goto no_context;
 
 	local_irq_enable();
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 83f12f2ed9e3..3066d40a6db1 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -8,7 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/wait.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <arch/system.h>
 
 extern int find_fixup_code(struct pt_regs *);
@@ -109,11 +109,11 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	info.si_code = SEGV_MAPERR;
 
 	/*
-	 * If we're in an interrupt or "atomic" operation or have no
+	 * If we're in an interrupt, have pagefaults disabled or have no
 	 * user context, we must not take the fault.
 	 */
 
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index ec4917ddf678..61d99767fe16 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -19,9 +19,9 @@
 #include <linux/kernel.h>
 #include <linux/ptrace.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 
 #include <asm/pgtable.h>
-#include <asm/uaccess.h>
 #include <asm/gdb-stub.h>
 
 /*****************************************************************************/
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(__frame))
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index ba5ba7accd0d..70b40d1205a6 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -11,10 +11,10 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 #include <linux/prefetch.h>
+#include <linux/uaccess.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/uaccess.h>
 
 extern int die(char *, struct pt_regs *, long);
 
@@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	/*
 	 * If we're in an interrupt or have no user context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index e3d4d4890104..8f9875b7933d 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -24,9 +24,9 @@
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/highmem.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include <asm/m32r.h>
-#include <asm/uaccess.h>
 #include <asm/hardirq.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -111,10 +111,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	mm = tsk->mm;
 
 	/*
-	 * If we're in an interrupt or have no user context or are running in an
-	 * atomic region then we must not take the fault..
+	 * If we're in an interrupt or have no user context or have pagefaults
+	 * disabled then we must not take the fault.
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (error_code & ACE_USERMODE)
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index b2f04aee46ec..6a94cdd0c830 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -10,10 +10,10 @@
 #include <linux/ptrace.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include <asm/setup.h>
 #include <asm/traps.h>
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
 extern void die_if_kernel(char *, struct pt_regs *, long);
@@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 2de5dc695a87..f57edca63609 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -105,7 +105,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 
 	mm = tsk->mm;
 
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index d46a5ebb7570..177dfc003643 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -107,14 +107,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 	if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11)
 		is_write = 0;
 
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		if (kernel_mode(regs))
 			goto bad_area_nosemaphore;
 
-		/* in_atomic() in user mode is really bad,
+		/* faulthandler_disabled() in user mode is really bad,
 		   as is current->mm == NULL. */
-		pr_emerg("Page fault in user mode with in_atomic(), mm = %p\n",
-									mm);
+		pr_emerg("Page fault in user mode with faulthandler_disabled(), mm = %p\n",
+			 mm);
 		pr_emerg("r15 = %lx  MSR = %lx\n",
 		       regs->r15, regs->msr);
 		die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 7ff8637e530d..36c0f26fac6b 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -21,10 +21,10 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/perf_event.h>
+#include <linux/uaccess.h>
 
 #include <asm/branch.h>
 #include <asm/mmu_context.h>
-#include <asm/uaccess.h>
 #include <asm/ptrace.h>
 #include <asm/highmem.h>		/* For VMALLOC_END */
 #include <linux/kdebug.h>
@@ -94,7 +94,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index 0c2cc5d39c8e..4a1d181ed32f 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -23,8 +23,8 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/vt_kern.h>		/* For unblank_screen() */
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/hardirq.h>
 #include <asm/cpu-regs.h>
@@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index 0c9b6afe69e9..b51878b0c6b8 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -77,7 +77,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 47ee620d15d2..6548fd1d2e62 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -26,9 +26,9 @@
 #include <linux/console.h>
 #include <linux/bug.h>
 #include <linux/ratelimit.h>
+#include <linux/uaccess.h>
 
 #include <asm/assembly.h>
-#include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/traps.h>
@@ -800,7 +800,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 	     * unless pagefault_disable() was called before.
 	     */
 
-	    if (fault_space == 0 && !in_atomic())
+	    if (fault_space == 0 && !faulthandler_disabled())
 	    {
 		pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
 		parisc_terminate("Kernel Fault", regs, code, fault_address);
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index e5120e653240..15503adddf4f 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -15,8 +15,8 @@
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/traps.h>
 
 /* Various important other fields */
@@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 	int fault;
 	unsigned int flags;
 
-	if (in_atomic())
+	if (pagefault_disabled())
 		goto no_context;
 
 	tsk = current;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b396868d2aa7..6d535973b200 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,13 +33,13 @@
 #include <linux/ratelimit.h>
 #include <linux/context_tracking.h>
 #include <linux/hugetlb.h>
+#include <linux/uaccess.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
-#include <asm/uaccess.h>
 #include <asm/tlbflush.h>
 #include <asm/siginfo.h>
 #include <asm/debug.h>
@@ -272,15 +272,16 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (!arch_irq_disabled_regs(regs))
 		local_irq_enable();
 
-	if (in_atomic() || mm == NULL) {
+	if (faulthandler_disabled() || mm == NULL) {
 		if (!user_mode(regs)) {
 			rc = SIGSEGV;
 			goto bail;
 		}
-		/* in_atomic() in user mode is really bad,
+		/* faulthandler_disabled() in user mode is really bad,
 		   as is current->mm == NULL. */
 		printk(KERN_EMERG "Page fault in user mode with "
-		       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
+		       "faulthandler_disabled() = %d mm = %p\n",
+		       faulthandler_disabled(), mm);
 		printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
 		       regs->nip, regs->msr);
 		die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 76515bcea2f1..4c8f5d7f9c23 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -399,7 +399,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
 	 * user context.
 	 */
 	fault = VM_FAULT_BADCONTEXT;
-	if (unlikely(!user_space_fault(regs) || in_atomic() || !mm))
+	if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
 		goto out;
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 6860beb2a280..37a6c2e0e969 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -34,6 +34,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/uaccess.h>
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -73,7 +74,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 	* If we're in an interrupt or have no user
 	* context, we must not take the fault..
 	*/
-	if (in_atomic() || !mm)
+	if (pagefault_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index a58fec9b55e0..79d8276377d1 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -17,6 +17,7 @@
 #include <linux/kprobes.h>
 #include <linux/perf_event.h>
 #include <linux/kdebug.h>
+#include <linux/uaccess.h>
 #include <asm/io_trapped.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -438,9 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
-	 * in an atomic region then we must not take the fault:
+	 * with pagefaults disabled then we must not take the fault:
 	 */
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 70d817154fe8..c399e7b3b035 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -21,6 +21,7 @@
 #include <linux/perf_event.h>
 #include <linux/interrupt.h>
 #include <linux/kdebug.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -29,7 +30,6 @@
 #include <asm/setup.h>
 #include <asm/smp.h>
 #include <asm/traps.h>
-#include <asm/uaccess.h>
 
 #include "mm_32.h"
 
@@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (pagefault_disabled() || !mm)
 		goto no_context;
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 479823249429..e9268ea1a68d 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -22,12 +22,12 @@
 #include <linux/kdebug.h>
 #include <linux/percpu.h>
 #include <linux/context_tracking.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/openprom.h>
 #include <asm/oplib.h>
-#include <asm/uaccess.h>
 #include <asm/asi.h>
 #include <asm/lsu.h>
 #include <asm/sections.h>
@@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto intr_or_no_mm;
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 4ca0d6ba5ec8..cee9b77ddd05 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2706,7 +2706,7 @@ void hugetlb_setup(struct pt_regs *regs)
 	struct mm_struct *mm = current->mm;
 	struct tsb_config *tp;
 
-	if (in_atomic() || !mm) {
+	if (faulthandler_disabled() || !mm) {
 		const struct exception_table_entry *entry;
 
 		entry = search_exception_tables(regs->tpc);
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index e83cc999da02..3f4f58d34a92 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -354,9 +354,9 @@ static int handle_page_fault(struct pt_regs *regs,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running in an
-	 * atomic region then we must not take the fault.
+	 * region with pagefaults disabled then we must not take the fault.
 	 */
-	if (in_atomic() || !mm) {
+	if (pagefault_disabled() || !mm) {
 		vma = NULL;  /* happy compiler */
 		goto bad_area_nosemaphore;
 	}
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 8e4daf44e980..f9c9e5a6beba 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -35,10 +35,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	*code_out = SEGV_MAPERR;
 
 	/*
-	 * If the fault was during atomic operation, don't take the fault, just
+	 * If the fault was with pagefaults disabled, don't take the fault, just
 	 * fail.
 	 */
-	if (in_atomic())
+	if (faulthandler_disabled())
 		goto out_nosemaphore;
 
 	if (is_user)
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 0dc922dba915..afccef5529cc 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -218,7 +218,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 181c53bac3a7..9dc909841739 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
 #include <linux/prefetch.h>		/* prefetchw			*/
 #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
+#include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -1126,9 +1127,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
-	 * in an atomic region then we must not take the fault:
+	 * in a region with pagefaults disabled then we must not take the fault
 	 */
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 9e3571a6535c..83a44a33cfa1 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -15,10 +15,10 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 #include <asm/hardirq.h>
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
 DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST;
@@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs)
 	/* If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm) {
+	if (faulthandler_disabled() || !mm) {
 		bad_page_fault(regs, address, SIGSEGV);
 		return;
 	}
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 23290cc93a24..90786d2d74e5 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -59,6 +59,18 @@ static inline void pagefault_enable(void)
  */
 #define pagefault_disabled() (current->pagefault_disabled != 0)
 
+/*
+ * The pagefault handler is in general disabled by pagefault_disable() or
+ * when in irq context (via in_atomic()).
+ *
+ * This function should only be used by the fault handlers. Other users should
+ * stick to pagefault_disabled().
+ * Please NEVER use preempt_disable() to disable the fault handler. With
+ * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled.
+ * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT.
+ */
+#define faulthandler_disabled() (pagefault_disabled() || in_atomic())
+
 #ifndef ARCH_HAS_NOCACHE_UACCESS
 
 static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
-- 
cgit v1.2.3


From 8222dbe21e79338de92d5e1956cd1e3994cc9f93 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Mon, 11 May 2015 17:52:20 +0200
Subject: sched/preempt, mm/fault: Decouple preemption from the page fault
 logic

As the fault handlers now all rely on the pagefault_disabled() checks
and implicit preempt_disable() calls by pagefault_disable() have been
made explicit, we can completely rely on the pagefault_disableD counter.

So let's no longer touch the preempt count when disabling/enabling
pagefaults. After a call to pagefault_disable(), pagefault_disabled()
will return true, but in_atomic() won't.

Reviewed-and-tested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: David.Laight@ACULAB.COM
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: airlied@linux.ie
Cc: akpm@linux-foundation.org
Cc: benh@kernel.crashing.org
Cc: bigeasy@linutronix.de
Cc: borntraeger@de.ibm.com
Cc: daniel.vetter@intel.com
Cc: heiko.carstens@de.ibm.com
Cc: herbert@gondor.apana.org.au
Cc: hocko@suse.cz
Cc: hughd@google.com
Cc: mst@redhat.com
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: schwidefsky@de.ibm.com
Cc: yang.shi@windriver.com
Link: http://lkml.kernel.org/r/1431359540-32227-16-git-send-email-dahi@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/uaccess.h | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 90786d2d74e5..ae572c138607 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -1,7 +1,6 @@
 #ifndef __LINUX_UACCESS_H__
 #define __LINUX_UACCESS_H__
 
-#include <linux/preempt.h>
 #include <linux/sched.h>
 #include <asm/uaccess.h>
 
@@ -20,17 +19,11 @@ static __always_inline void pagefault_disabled_dec(void)
  * These routines enable/disable the pagefault handler. If disabled, it will
  * not take any locks and go straight to the fixup table.
  *
- * We increase the preempt and the pagefault count, to be able to distinguish
- * whether we run in simple atomic context or in a real pagefault_disable()
- * context.
- *
- * For now, after pagefault_disabled() has been called, we run in atomic
- * context. User access methods will not sleep.
- *
+ * User access methods will not sleep when called from a pagefault_disabled()
+ * environment.
  */
 static inline void pagefault_disable(void)
 {
-	preempt_count_inc();
 	pagefault_disabled_inc();
 	/*
 	 * make sure to have issued the store before a pagefault
@@ -47,11 +40,6 @@ static inline void pagefault_enable(void)
 	 */
 	barrier();
 	pagefault_disabled_dec();
-#ifndef CONFIG_PREEMPT
-	preempt_count_dec();
-#else
-	preempt_enable();
-#endif
 }
 
 /*
-- 
cgit v1.2.3


From 80ed87c8a9ca0cad7ca66cf3bbdfb17559a66dcf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 8 May 2015 14:23:45 +0200
Subject: sched/wait: Introduce TASK_NOLOAD and TASK_IDLE

Currently people use TASK_INTERRUPTIBLE to idle kthreads and wait for
'work' because TASK_UNINTERRUPTIBLE contributes to the loadavg. Having
all idle kthreads contribute to the loadavg is somewhat silly.

Now mostly this works OK, because kthreads have all their signals
masked. However there's a few sites where this is causing problems and
TASK_UNINTERRUPTIBLE should be used, except for that loadavg issue.

This patch adds TASK_NOLOAD which, when combined with
TASK_UNINTERRUPTIBLE avoids the loadavg accounting.

As most of imagined usage sites are loops where a thread wants to
idle, waiting for work, a helper TASK_IDLE is introduced.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Julian Anastasov <ja@ssi.bg>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: NeilBrown <neilb@suse.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h        | 10 +++++++---
 include/trace/events/sched.h |  3 ++-
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index dd07ac03f82a..7de815c6fa78 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -218,9 +218,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 #define TASK_WAKEKILL		128
 #define TASK_WAKING		256
 #define TASK_PARKED		512
-#define TASK_STATE_MAX		1024
+#define TASK_NOLOAD		1024
+#define TASK_STATE_MAX		2048
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWP"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@ -230,6 +231,8 @@ extern char ___assert_task_state[1 - 2*!!(
 #define TASK_STOPPED		(TASK_WAKEKILL | __TASK_STOPPED)
 #define TASK_TRACED		(TASK_WAKEKILL | __TASK_TRACED)
 
+#define TASK_IDLE		(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
+
 /* Convenience macros for the sake of wake_up */
 #define TASK_NORMAL		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
 #define TASK_ALL		(TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
@@ -245,7 +248,8 @@ extern char ___assert_task_state[1 - 2*!!(
 			((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 #define task_contributes_to_load(task)	\
 				((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
-				 (task->flags & PF_FROZEN) == 0)
+				 (task->flags & PF_FROZEN) == 0 && \
+				 (task->state & TASK_NOLOAD) == 0)
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf3e56a..d57a575fe31f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -147,7 +147,8 @@ TRACE_EVENT(sched_switch,
 		  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
 				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
 				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
-				{ 128, "K" }, { 256, "W" }, { 512, "P" }) : "R",
+				{ 128, "K" }, { 256, "W" }, { 512, "P" },
+				{ 1024, "N" }) : "R",
 		__entry->prev_state & TASK_STATE_MAX ? "+" : "",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
-- 
cgit v1.2.3


From f03123783d4e43cd59df58e23e963136e04f8280 Mon Sep 17 00:00:00 2001
From: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
Date: Thu, 30 Apr 2015 20:44:45 +0530
Subject: extcon: axp288: Add axp288 extcon driver support

This patch adds the extcon support for AXP288 PMIC which
has the BC1.2 charger detection capability. Additionally
it also adds the USB mux switching support b/w SOC and PMIC
based on GPIO control.

Signed-off-by: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
[cw00.choi: Modify the log message to keep the consistent log message pattern]
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/Kconfig         |   7 +
 drivers/extcon/Makefile        |   1 +
 drivers/extcon/extcon-axp288.c | 385 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h     |   5 +
 4 files changed, 398 insertions(+)
 create mode 100644 drivers/extcon/extcon-axp288.c

(limited to 'include/linux')

diff --git a/drivers/extcon/Kconfig b/drivers/extcon/Kconfig
index bd10b6c6962d..b74e1f27688b 100644
--- a/drivers/extcon/Kconfig
+++ b/drivers/extcon/Kconfig
@@ -28,6 +28,13 @@ config EXTCON_ARIZONA
 	  with Wolfson Arizona devices. These are audio CODECs with
 	  advanced audio accessory detection support.
 
+config EXTCON_AXP288
+	tristate "X-Power AXP288 EXTCON support"
+	depends on MFD_AXP20X && USB_PHY
+	help
+	  Say Y here to enable support for USB peripheral detection
+	  and USB MUX switching by X-Power AXP288 PMIC.
+
 config EXTCON_GPIO
 	tristate "GPIO extcon support"
 	depends on GPIOLIB
diff --git a/drivers/extcon/Makefile b/drivers/extcon/Makefile
index 9204114791a3..ba787d04295b 100644
--- a/drivers/extcon/Makefile
+++ b/drivers/extcon/Makefile
@@ -5,6 +5,7 @@
 obj-$(CONFIG_EXTCON)		+= extcon.o
 obj-$(CONFIG_EXTCON_ADC_JACK)	+= extcon-adc-jack.o
 obj-$(CONFIG_EXTCON_ARIZONA)	+= extcon-arizona.o
+obj-$(CONFIG_EXTCON_AXP288)	+= extcon-axp288.o
 obj-$(CONFIG_EXTCON_GPIO)	+= extcon-gpio.o
 obj-$(CONFIG_EXTCON_MAX14577)	+= extcon-max14577.o
 obj-$(CONFIG_EXTCON_MAX77693)	+= extcon-max77693.o
diff --git a/drivers/extcon/extcon-axp288.c b/drivers/extcon/extcon-axp288.c
new file mode 100644
index 000000000000..8299adb9b676
--- /dev/null
+++ b/drivers/extcon/extcon-axp288.c
@@ -0,0 +1,385 @@
+/*
+ * extcon-axp288.c - X-Power AXP288 PMIC extcon cable detection driver
+ *
+ * Copyright (C) 2015 Intel Corporation
+ * Author: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+#include <linux/usb/phy.h>
+#include <linux/notifier.h>
+#include <linux/extcon.h>
+#include <linux/regmap.h>
+#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
+#include <linux/mfd/axp20x.h>
+
+/* Power source status register */
+#define PS_STAT_VBUS_TRIGGER		BIT(0)
+#define PS_STAT_BAT_CHRG_DIR		BIT(2)
+#define PS_STAT_VBUS_ABOVE_VHOLD	BIT(3)
+#define PS_STAT_VBUS_VALID		BIT(4)
+#define PS_STAT_VBUS_PRESENT		BIT(5)
+
+/* BC module global register */
+#define BC_GLOBAL_RUN			BIT(0)
+#define BC_GLOBAL_DET_STAT		BIT(2)
+#define BC_GLOBAL_DBP_TOUT		BIT(3)
+#define BC_GLOBAL_VLGC_COM_SEL		BIT(4)
+#define BC_GLOBAL_DCD_TOUT_MASK		(BIT(6)|BIT(5))
+#define BC_GLOBAL_DCD_TOUT_300MS	0
+#define BC_GLOBAL_DCD_TOUT_100MS	1
+#define BC_GLOBAL_DCD_TOUT_500MS	2
+#define BC_GLOBAL_DCD_TOUT_900MS	3
+#define BC_GLOBAL_DCD_DET_SEL		BIT(7)
+
+/* BC module vbus control and status register */
+#define VBUS_CNTL_DPDM_PD_EN		BIT(4)
+#define VBUS_CNTL_DPDM_FD_EN		BIT(5)
+#define VBUS_CNTL_FIRST_PO_STAT		BIT(6)
+
+/* BC USB status register */
+#define USB_STAT_BUS_STAT_MASK		(BIT(3)|BIT(2)|BIT(1)|BIT(0))
+#define USB_STAT_BUS_STAT_SHIFT		0
+#define USB_STAT_BUS_STAT_ATHD		0
+#define USB_STAT_BUS_STAT_CONN		1
+#define USB_STAT_BUS_STAT_SUSP		2
+#define USB_STAT_BUS_STAT_CONF		3
+#define USB_STAT_USB_SS_MODE		BIT(4)
+#define USB_STAT_DEAD_BAT_DET		BIT(6)
+#define USB_STAT_DBP_UNCFG		BIT(7)
+
+/* BC detect status register */
+#define DET_STAT_MASK			(BIT(7)|BIT(6)|BIT(5))
+#define DET_STAT_SHIFT			5
+#define DET_STAT_SDP			1
+#define DET_STAT_CDP			2
+#define DET_STAT_DCP			3
+
+/* IRQ enable-1 register */
+#define PWRSRC_IRQ_CFG_MASK		(BIT(4)|BIT(3)|BIT(2))
+
+/* IRQ enable-6 register */
+#define BC12_IRQ_CFG_MASK		BIT(1)
+
+#define AXP288_EXTCON_SLOW_CHARGER		"SLOW-CHARGER"
+#define AXP288_EXTCON_DOWNSTREAM_CHARGER	"CHARGE-DOWNSTREAM"
+#define AXP288_EXTCON_FAST_CHARGER		"FAST-CHARGER"
+
+enum axp288_extcon_reg {
+	AXP288_PS_STAT_REG		= 0x00,
+	AXP288_PS_BOOT_REASON_REG	= 0x02,
+	AXP288_BC_GLOBAL_REG		= 0x2c,
+	AXP288_BC_VBUS_CNTL_REG		= 0x2d,
+	AXP288_BC_USB_STAT_REG		= 0x2e,
+	AXP288_BC_DET_STAT_REG		= 0x2f,
+	AXP288_PWRSRC_IRQ_CFG_REG	= 0x40,
+	AXP288_BC12_IRQ_CFG_REG		= 0x45,
+};
+
+enum axp288_mux_select {
+	EXTCON_GPIO_MUX_SEL_PMIC = 0,
+	EXTCON_GPIO_MUX_SEL_SOC,
+};
+
+enum axp288_extcon_irq {
+	VBUS_FALLING_IRQ = 0,
+	VBUS_RISING_IRQ,
+	MV_CHNG_IRQ,
+	BC_USB_CHNG_IRQ,
+	EXTCON_IRQ_END,
+};
+
+static const char *axp288_extcon_cables[] = {
+	AXP288_EXTCON_SLOW_CHARGER,
+	AXP288_EXTCON_DOWNSTREAM_CHARGER,
+	AXP288_EXTCON_FAST_CHARGER,
+	NULL,
+};
+
+struct axp288_extcon_info {
+	struct device *dev;
+	struct regmap *regmap;
+	struct regmap_irq_chip_data *regmap_irqc;
+	struct axp288_extcon_pdata *pdata;
+	int irq[EXTCON_IRQ_END];
+	struct extcon_dev *edev;
+	struct notifier_block extcon_nb;
+	struct usb_phy *otg;
+};
+
+/* Power up/down reason string array */
+static char *axp288_pwr_up_down_info[] = {
+	"Last wake caused by user pressing the power button",
+	"Last wake caused by a charger insertion",
+	"Last wake caused by a battery insertion",
+	"Last wake caused by SOC initiated global reset",
+	"Last wake caused by cold reset",
+	"Last shutdown caused by PMIC UVLO threshold",
+	"Last shutdown caused by SOC initiated cold off",
+	"Last shutdown caused by user pressing the power button",
+	NULL,
+};
+
+/*
+ * Decode and log the given "reset source indicator" (rsi)
+ * register and then clear it.
+ */
+static void axp288_extcon_log_rsi(struct axp288_extcon_info *info)
+{
+	char **rsi;
+	unsigned int val, i, clear_mask = 0;
+	int ret;
+
+	ret = regmap_read(info->regmap, AXP288_PS_BOOT_REASON_REG, &val);
+	for (i = 0, rsi = axp288_pwr_up_down_info; *rsi; rsi++, i++) {
+		if (val & BIT(i)) {
+			dev_dbg(info->dev, "%s\n", *rsi);
+			clear_mask |= BIT(i);
+		}
+	}
+
+	/* Clear the register value for next reboot (write 1 to clear bit) */
+	regmap_write(info->regmap, AXP288_PS_BOOT_REASON_REG, clear_mask);
+}
+
+static int axp288_handle_chrg_det_event(struct axp288_extcon_info *info)
+{
+	static bool notify_otg, notify_charger;
+	static char *cable;
+	int ret, stat, cfg, pwr_stat;
+	u8 chrg_type;
+	bool vbus_attach = false;
+
+	ret = regmap_read(info->regmap, AXP288_PS_STAT_REG, &pwr_stat);
+	if (ret < 0) {
+		dev_err(info->dev, "failed to read vbus status\n");
+		return ret;
+	}
+
+	vbus_attach = (pwr_stat & PS_STAT_VBUS_PRESENT);
+	if (!vbus_attach)
+		goto notify_otg;
+
+	/* Check charger detection completion status */
+	ret = regmap_read(info->regmap, AXP288_BC_GLOBAL_REG, &cfg);
+	if (ret < 0)
+		goto dev_det_ret;
+	if (cfg & BC_GLOBAL_DET_STAT) {
+		dev_dbg(info->dev, "can't complete the charger detection\n");
+		goto dev_det_ret;
+	}
+
+	ret = regmap_read(info->regmap, AXP288_BC_DET_STAT_REG, &stat);
+	if (ret < 0)
+		goto dev_det_ret;
+
+	chrg_type = (stat & DET_STAT_MASK) >> DET_STAT_SHIFT;
+
+	switch (chrg_type) {
+	case DET_STAT_SDP:
+		dev_dbg(info->dev, "sdp cable is connecetd\n");
+		notify_otg = true;
+		notify_charger = true;
+		cable = AXP288_EXTCON_SLOW_CHARGER;
+		break;
+	case DET_STAT_CDP:
+		dev_dbg(info->dev, "cdp cable is connecetd\n");
+		notify_otg = true;
+		notify_charger = true;
+		cable = AXP288_EXTCON_DOWNSTREAM_CHARGER;
+		break;
+	case DET_STAT_DCP:
+		dev_dbg(info->dev, "dcp cable is connecetd\n");
+		notify_charger = true;
+		cable = AXP288_EXTCON_FAST_CHARGER;
+		break;
+	default:
+		dev_warn(info->dev,
+			"disconnect or unknown or ID event\n");
+	}
+
+notify_otg:
+	if (notify_otg) {
+		/*
+		 * If VBUS is absent Connect D+/D- lines to PMIC for BC
+		 * detection. Else connect them to SOC for USB communication.
+		 */
+		if (info->pdata->gpio_mux_cntl)
+			gpiod_set_value(info->pdata->gpio_mux_cntl,
+				vbus_attach ? EXTCON_GPIO_MUX_SEL_SOC
+						: EXTCON_GPIO_MUX_SEL_PMIC);
+
+		atomic_notifier_call_chain(&info->otg->notifier,
+			vbus_attach ? USB_EVENT_VBUS : USB_EVENT_NONE, NULL);
+	}
+
+	if (notify_charger)
+		extcon_set_cable_state(info->edev, cable, vbus_attach);
+
+	/* Clear the flags on disconnect event */
+	if (!vbus_attach)
+		notify_otg = notify_charger = false;
+
+	return 0;
+
+dev_det_ret:
+	if (ret < 0)
+		dev_err(info->dev, "failed to detect BC Mod\n");
+
+	return ret;
+}
+
+static irqreturn_t axp288_extcon_isr(int irq, void *data)
+{
+	struct axp288_extcon_info *info = data;
+	int ret;
+
+	ret = axp288_handle_chrg_det_event(info);
+	if (ret < 0)
+		dev_err(info->dev, "failed to handle the interrupt\n");
+
+	return IRQ_HANDLED;
+}
+
+static void axp288_extcon_enable_irq(struct axp288_extcon_info *info)
+{
+	/* Unmask VBUS interrupt */
+	regmap_write(info->regmap, AXP288_PWRSRC_IRQ_CFG_REG,
+						PWRSRC_IRQ_CFG_MASK);
+	regmap_update_bits(info->regmap, AXP288_BC_GLOBAL_REG,
+						BC_GLOBAL_RUN, 0);
+	/* Unmask the BC1.2 complete interrupts */
+	regmap_write(info->regmap, AXP288_BC12_IRQ_CFG_REG, BC12_IRQ_CFG_MASK);
+	/* Enable the charger detection logic */
+	regmap_update_bits(info->regmap, AXP288_BC_GLOBAL_REG,
+					BC_GLOBAL_RUN, BC_GLOBAL_RUN);
+}
+
+static int axp288_extcon_probe(struct platform_device *pdev)
+{
+	struct axp288_extcon_info *info;
+	struct axp20x_dev *axp20x = dev_get_drvdata(pdev->dev.parent);
+	int ret, i, pirq, gpio;
+
+	info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->dev = &pdev->dev;
+	info->regmap = axp20x->regmap;
+	info->regmap_irqc = axp20x->regmap_irqc;
+	info->pdata = pdev->dev.platform_data;
+
+	if (!info->pdata) {
+		/* Try ACPI provided pdata via device properties */
+		if (!device_property_present(&pdev->dev,
+					"axp288_extcon_data\n"))
+			dev_err(&pdev->dev, "failed to get platform data\n");
+		return -ENODEV;
+	}
+	platform_set_drvdata(pdev, info);
+
+	axp288_extcon_log_rsi(info);
+
+	/* Initialize extcon device */
+	info->edev = devm_extcon_dev_allocate(&pdev->dev,
+					      axp288_extcon_cables);
+	if (IS_ERR(info->edev)) {
+		dev_err(&pdev->dev, "failed to allocate memory for extcon\n");
+		return PTR_ERR(info->edev);
+	}
+
+	/* Register extcon device */
+	ret = devm_extcon_dev_register(&pdev->dev, info->edev);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register extcon device\n");
+		return ret;
+	}
+
+	/* Get otg transceiver phy */
+	info->otg = usb_get_phy(USB_PHY_TYPE_USB2);
+	if (IS_ERR(info->otg)) {
+		dev_err(&pdev->dev, "failed to get otg transceiver\n");
+		return PTR_ERR(info->otg);
+	}
+
+	/* Set up gpio control for USB Mux */
+	if (info->pdata->gpio_mux_cntl) {
+		gpio = desc_to_gpio(info->pdata->gpio_mux_cntl);
+		ret = gpio_request(gpio, "USB_MUX");
+		if (ret < 0) {
+			dev_err(&pdev->dev,
+				"failed to request the gpio=%d\n", gpio);
+			goto gpio_req_failed;
+		}
+		gpiod_direction_output(info->pdata->gpio_mux_cntl,
+						EXTCON_GPIO_MUX_SEL_PMIC);
+	}
+
+	for (i = 0; i < EXTCON_IRQ_END; i++) {
+		pirq = platform_get_irq(pdev, i);
+		info->irq[i] = regmap_irq_get_virq(info->regmap_irqc, pirq);
+		if (info->irq[i] < 0) {
+			dev_err(&pdev->dev,
+				"failed to get virtual interrupt=%d\n", pirq);
+			ret = info->irq[i];
+			goto gpio_req_failed;
+		}
+
+		ret = devm_request_threaded_irq(&pdev->dev, info->irq[i],
+				NULL, axp288_extcon_isr,
+				IRQF_ONESHOT | IRQF_NO_SUSPEND,
+				pdev->name, info);
+		if (ret) {
+			dev_err(&pdev->dev, "failed to request interrupt=%d\n",
+							info->irq[i]);
+			goto gpio_req_failed;
+		}
+	}
+
+	/* Enable interrupts */
+	axp288_extcon_enable_irq(info);
+
+	return 0;
+
+gpio_req_failed:
+	usb_put_phy(info->otg);
+	return ret;
+}
+
+static int axp288_extcon_remove(struct platform_device *pdev)
+{
+	struct axp288_extcon_info *info = platform_get_drvdata(pdev);
+
+	usb_put_phy(info->otg);
+	return 0;
+}
+
+static struct platform_driver axp288_extcon_driver = {
+	.probe = axp288_extcon_probe,
+	.remove = axp288_extcon_remove,
+	.driver = {
+		.name = "axp288_extcon",
+	},
+};
+module_platform_driver(axp288_extcon_driver);
+
+MODULE_AUTHOR("Ramakrishna Pallala <ramakrishna.pallala@intel.com>");
+MODULE_DESCRIPTION("X-Powers AXP288 extcon driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index dfabd6db7ddf..4ed8071d062e 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -275,4 +275,9 @@ struct axp20x_fg_pdata {
 	int thermistor_curve[MAX_THERM_CURVE_SIZE][2];
 };
 
+struct axp288_extcon_pdata {
+	/* GPIO pin control to switch D+/D- lines b/w PMIC and SOC */
+	struct gpio_desc *gpio_mux_cntl;
+};
+
 #endif /* __LINUX_MFD_AXP20X_H */
-- 
cgit v1.2.3


From 707d7550875a9bd245ce5f1077d2d2cb44eab218 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Wed, 15 Apr 2015 13:57:51 +0900
Subject: extcon: Add extcon_get_edev_name() API to get the extcon device name

This patch adds the extcon_get_edev_name() API to get the name of extcon device
because all information inclued in the structure extcon_dev should be accessed
by extcon core API instead of directly accessing the data.

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/extcon.c | 9 +++++++++
 include/linux/extcon.h  | 4 ++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index 1a932292d6c8..c5e9ebf0a914 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -1046,6 +1046,15 @@ struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index)
 #endif /* CONFIG_OF */
 EXPORT_SYMBOL_GPL(extcon_get_edev_by_phandle);
 
+/**
+ * extcon_get_edev_name() - Get the name of the extcon device.
+ * @edev:	the extcon device
+ */
+const char *extcon_get_edev_name(struct extcon_dev *edev)
+{
+	return !edev ? NULL : edev->name;
+}
+
 static int __init extcon_class_init(void)
 {
 	return create_extcon_class();
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 36f49c405dfb..e2cf6254c86a 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -259,6 +259,10 @@ extern int extcon_unregister_notifier(struct extcon_dev *edev,
  * This function use phandle of devicetree to get extcon device directly.
  */
 extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index);
+
+/* Following API to get information of extcon device */
+extern const char *extcon_get_edev_name(struct extcon_dev *edev);
+
 #else /* CONFIG_EXTCON */
 static inline int extcon_dev_register(struct extcon_dev *edev)
 {
-- 
cgit v1.2.3


From b9ec23c08a0274d31ee626f14b359563ea0cae46 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Fri, 24 Apr 2015 14:48:52 +0900
Subject: extcon: Fix the checkpatch warning and minor coding style issue

This patch clean up the extcon core driver by fixing the checkpatch warning
and minor coding style issue.

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/extcon.c | 10 +++++-----
 include/linux/extcon.h  |  7 ++++---
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index c5e9ebf0a914..2fb5f757e811 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -1,5 +1,5 @@
 /*
- *  drivers/extcon/extcon_class.c
+ *  drivers/extcon/extcon.c - External Connector (extcon) framework.
  *
  *  External connector (extcon) class driver
  *
@@ -19,8 +19,7 @@
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
-*/
+ */
 
 #include <linux/module.h>
 #include <linux/types.h>
@@ -469,7 +468,6 @@ int extcon_register_interest(struct extcon_specific_cable_nb *obj,
 		ret = raw_notifier_chain_register(&obj->edev->nh,
 						  &obj->internal_nb);
 		spin_unlock_irqrestore(&obj->edev->lock, flags);
-		return ret;
 	} else {
 		struct class_dev_iter iter;
 		struct extcon_dev *extd;
@@ -489,8 +487,10 @@ int extcon_register_interest(struct extcon_specific_cable_nb *obj,
 						cable_name, nb);
 		}
 
-		return -ENODEV;
+		ret = -ENODEV;
 	}
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(extcon_register_interest);
 
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index e2cf6254c86a..799474d9dc48 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -97,8 +97,8 @@ struct extcon_cable;
  * @state:		Attach/detach state of this extcon. Do not provide at
  *			register-time.
  * @nh:			Notifier for the state change events from this extcon
- * @entry:		To support list of extcon devices so that users can search
- *			for extcon devices based on the extcon name.
+ * @entry:		To support list of extcon devices so that users can
+ *			search for extcon devices based on the extcon name.
  * @lock:
  * @max_supported:	Internal value to store the number of cables.
  * @extcon_dev_type:	Device_type struct to provide attribute_groups
@@ -258,7 +258,8 @@ extern int extcon_unregister_notifier(struct extcon_dev *edev,
  * Following API get the extcon device from devicetree.
  * This function use phandle of devicetree to get extcon device directly.
  */
-extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index);
+extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev,
+						     int index);
 
 /* Following API to get information of extcon device */
 extern const char *extcon_get_edev_name(struct extcon_dev *edev);
-- 
cgit v1.2.3


From 9e86b2ad4c11fd52ee8133abce7a29e0b32d29a7 Mon Sep 17 00:00:00 2001
From: Inha Song <ideal.song@samsung.com>
Date: Mon, 4 May 2015 13:42:13 +0900
Subject: extcon: arizona: Add support for select accessory detect mode when
 headphone detection

This patch add support for select accessory detect mode to HPDETL or HPDETR.
Arizona provides a headphone detection circuit on the HPDETL and HPDETR pins
to measure the impedance of an external load connected to the headphone.

Depending on board design, headphone detect pins can change to HPDETR or HPDETL.

Signed-off-by: Inha Song <ideal.song@samsung.com>
Acked-by: Lee Jones <lee@kernel.org>
Acked-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/extcon-arizona.c   | 38 ++++++++++++++++++++++++++++++--------
 include/dt-bindings/mfd/arizona.h |  4 ++++
 include/linux/mfd/arizona/pdata.h |  3 +++
 3 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c
index 5e08c358649b..1ec06b4b4875 100644
--- a/drivers/extcon/extcon-arizona.c
+++ b/drivers/extcon/extcon-arizona.c
@@ -32,13 +32,10 @@
 #include <linux/mfd/arizona/core.h>
 #include <linux/mfd/arizona/pdata.h>
 #include <linux/mfd/arizona/registers.h>
+#include <dt-bindings/mfd/arizona.h>
 
 #define ARIZONA_MAX_MICD_RANGE 8
 
-#define ARIZONA_ACCDET_MODE_MIC 0
-#define ARIZONA_ACCDET_MODE_HPL 1
-#define ARIZONA_ACCDET_MODE_HPR 2
-
 #define ARIZONA_MICD_CLAMP_MODE_JDL      0x4
 #define ARIZONA_MICD_CLAMP_MODE_JDH      0x5
 #define ARIZONA_MICD_CLAMP_MODE_JDL_GP5H 0x9
@@ -671,9 +668,9 @@ static void arizona_identify_headphone(struct arizona_extcon_info *info)
 	ret = regmap_update_bits(arizona->regmap,
 				 ARIZONA_ACCESSORY_DETECT_MODE_1,
 				 ARIZONA_ACCDET_MODE_MASK,
-				 ARIZONA_ACCDET_MODE_HPL);
+				 arizona->pdata.hpdet_channel);
 	if (ret != 0) {
-		dev_err(arizona->dev, "Failed to set HPDETL mode: %d\n", ret);
+		dev_err(arizona->dev, "Failed to set HPDET mode: %d\n", ret);
 		goto err;
 	}
 
@@ -723,9 +720,9 @@ static void arizona_start_hpdet_acc_id(struct arizona_extcon_info *info)
 				 ARIZONA_ACCESSORY_DETECT_MODE_1,
 				 ARIZONA_ACCDET_SRC | ARIZONA_ACCDET_MODE_MASK,
 				 info->micd_modes[0].src |
-				 ARIZONA_ACCDET_MODE_HPL);
+				 arizona->pdata.hpdet_channel);
 	if (ret != 0) {
-		dev_err(arizona->dev, "Failed to set HPDETL mode: %d\n", ret);
+		dev_err(arizona->dev, "Failed to set HPDET mode: %d\n", ret);
 		goto err;
 	}
 
@@ -1121,6 +1118,26 @@ static void arizona_micd_set_level(struct arizona *arizona, int index,
 	regmap_update_bits(arizona->regmap, reg, mask, level);
 }
 
+static int arizona_extcon_of_get_pdata(struct arizona *arizona)
+{
+	struct arizona_pdata *pdata = &arizona->pdata;
+	unsigned int val = ARIZONA_ACCDET_MODE_HPL;
+
+	of_property_read_u32(arizona->dev->of_node, "wlf,hpdet-channel", &val);
+	switch (val) {
+	case ARIZONA_ACCDET_MODE_HPL:
+	case ARIZONA_ACCDET_MODE_HPR:
+		pdata->hpdet_channel = val;
+		break;
+	default:
+		dev_err(arizona->dev,
+			"Wrong wlf,hpdet-channel DT value %d\n", val);
+		pdata->hpdet_channel = ARIZONA_ACCDET_MODE_HPL;
+	}
+
+	return 0;
+}
+
 static int arizona_extcon_probe(struct platform_device *pdev)
 {
 	struct arizona *arizona = dev_get_drvdata(pdev->dev.parent);
@@ -1138,6 +1155,11 @@ static int arizona_extcon_probe(struct platform_device *pdev)
 	if (!info)
 		return -ENOMEM;
 
+	if (IS_ENABLED(CONFIG_OF)) {
+		if (!dev_get_platdata(arizona->dev))
+			arizona_extcon_of_get_pdata(arizona);
+	}
+
 	info->micvdd = devm_regulator_get(&pdev->dev, "MICVDD");
 	if (IS_ERR(info->micvdd)) {
 		ret = PTR_ERR(info->micvdd);
diff --git a/include/dt-bindings/mfd/arizona.h b/include/dt-bindings/mfd/arizona.h
index c7af7c7ef793..fee48e624036 100644
--- a/include/dt-bindings/mfd/arizona.h
+++ b/include/dt-bindings/mfd/arizona.h
@@ -90,4 +90,8 @@
 #define ARIZONA_INMODE_SE   1
 #define ARIZONA_INMODE_DMIC 2
 
+#define ARIZONA_ACCDET_MODE_MIC 0
+#define ARIZONA_ACCDET_MODE_HPL 1
+#define ARIZONA_ACCDET_MODE_HPR 2
+
 #endif
diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h
index 1789cb0f4f17..aa5c48b06755 100644
--- a/include/linux/mfd/arizona/pdata.h
+++ b/include/linux/mfd/arizona/pdata.h
@@ -121,6 +121,9 @@ struct arizona_pdata {
 	/** GPIO used for mic isolation with HPDET */
 	int hpdet_id_gpio;
 
+	/** Channel to use for headphone detection */
+	unsigned int hpdet_channel;
+
 	/** Extra debounce timeout used during initial mic detection (ms) */
 	int micd_detect_debounce;
 
-- 
cgit v1.2.3


From ca42aaf0c8616cde6161ea4391dff364efeee46a Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Mon, 18 May 2015 14:19:13 +0200
Subject: time: Refactor msecs_to_jiffies

Refactor the msecs_to_jiffies conditional code part in time.c and
jiffies.h putting it into conditional functions rather than #ifdefs
to improve readability.

[ tglx: Verified that there is no binary code change ]

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Cc: Masahiro Yamada <yamada.m@jp.panasonic.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Paul Turner <pjt@google.com>
Cc: Michal Marek <mmarek@suse.cz>
Link: http://lkml.kernel.org/r/1431951554-5563-2-git-send-email-hofrat@osadl.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/jiffies.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/time/time.c      | 59 +++++++++++++++------------------------------
 2 files changed, 82 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index c367cbdf73ab..9527ddbb0f1b 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -7,6 +7,7 @@
 #include <linux/time.h>
 #include <linux/timex.h>
 #include <asm/param.h>			/* for HZ */
+#include <generated/timeconst.h>
 
 /*
  * The following defines establish the engineering parameters of the PLL
@@ -288,7 +289,68 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
 	return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
 }
 
-extern unsigned long msecs_to_jiffies(const unsigned int m);
+extern unsigned long __msecs_to_jiffies(const unsigned int m);
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+/*
+ * HZ is equal to or smaller than 1000, and 1000 is a nice round
+ * multiple of HZ, divide with the factor between them, but round
+ * upwards:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+		return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+}
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+/*
+ * HZ is larger than 1000, and HZ is a nice round multiple of 1000 -
+ * simply multiply with the factor between them.
+ *
+ * But first make sure the multiplication result cannot overflow:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+		if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+			return MAX_JIFFY_OFFSET;
+		return m * (HZ / MSEC_PER_SEC);
+}
+#else
+/*
+ * Generic case - multiply, round and divide. But first check that if
+ * we are doing a net multiplication, that we wouldn't overflow:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+		if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+			return MAX_JIFFY_OFFSET;
+
+		return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
+			>> MSEC_TO_HZ_SHR32;
+}
+#endif
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m:	time in milliseconds
+ *
+ * conversion is done as follows:
+ *
+ * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ *
+ * - 'too large' values [that would result in larger than
+ *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ *   the input value by a factor or dividing it with a factor and
+ *   handling any 32-bit overflows.
+ *   for the details see __msecs_to_jiffies()
+ *
+ * the HZ range specific helpers _msecs_to_jiffies() are called from
+ * __msecs_to_jiffies().
+ */
+static inline unsigned long msecs_to_jiffies(const unsigned int m)
+{
+	return __msecs_to_jiffies(m);
+}
+
 extern unsigned long usecs_to_jiffies(const unsigned int u);
 extern unsigned long timespec_to_jiffies(const struct timespec *value);
 extern void jiffies_to_timespec(const unsigned long jiffies,
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 4fa1d26a9843..c42c2c3214fe 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -483,9 +483,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
 }
 EXPORT_SYMBOL(ns_to_timespec64);
 #endif
-/*
- * When we convert to jiffies then we interpret incoming values
- * the following way:
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m:	time in milliseconds
+ *
+ * conversion is done as follows:
  *
  * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
  *
@@ -493,51 +495,28 @@ EXPORT_SYMBOL(ns_to_timespec64);
  *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
  *
  * - all other values are converted to jiffies by either multiplying
- *   the input value by a factor or dividing it with a factor
- *
- * We must also be careful about 32-bit overflows.
+ *   the input value by a factor or dividing it with a factor and
+ *   handling any 32-bit overflows.
+ *   for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * routines found in include/linux/jiffies.h
  */
-unsigned long msecs_to_jiffies(const unsigned int m)
+unsigned long __msecs_to_jiffies(const unsigned int m)
 {
 	/*
 	 * Negative value, means infinite timeout:
 	 */
 	if ((int)m < 0)
 		return MAX_JIFFY_OFFSET;
-
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
-	/*
-	 * HZ is equal to or smaller than 1000, and 1000 is a nice
-	 * round multiple of HZ, divide with the factor between them,
-	 * but round upwards:
-	 */
-	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
-	/*
-	 * HZ is larger than 1000, and HZ is a nice round multiple of
-	 * 1000 - simply multiply with the factor between them.
-	 *
-	 * But first make sure the multiplication result cannot
-	 * overflow:
-	 */
-	if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-		return MAX_JIFFY_OFFSET;
-
-	return m * (HZ / MSEC_PER_SEC);
-#else
-	/*
-	 * Generic case - multiply, round and divide. But first
-	 * check that if we are doing a net multiplication, that
-	 * we wouldn't overflow:
-	 */
-	if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-		return MAX_JIFFY_OFFSET;
-
-	return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
-		>> MSEC_TO_HZ_SHR32;
-#endif
+	return _msecs_to_jiffies(m);
 }
-EXPORT_SYMBOL(msecs_to_jiffies);
+EXPORT_SYMBOL(__msecs_to_jiffies);
 
 unsigned long usecs_to_jiffies(const unsigned int u)
 {
-- 
cgit v1.2.3


From daa67b4b70568a07fef3cffacb2055891bf42ddb Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Mon, 18 May 2015 14:19:14 +0200
Subject: time: Allow gcc to fold constants when possible

To allow constant folding in msecs_to_jiffies() conditionally calls
the HZ dependent _msecs_to_jiffies() helpers or, when gcc can not
figure out constant folding, __msecs_to_jiffies which is the renamed
original msecs_to_jiffies() function.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Cc: Masahiro Yamada <yamada.m@jp.panasonic.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Paul Turner <pjt@google.com>
Cc: Michal Marek <mmarek@suse.cz>
Link: http://lkml.kernel.org/r/1431951554-5563-3-git-send-email-hofrat@osadl.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/jiffies.h | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 9527ddbb0f1b..5e75af6cf1bc 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -343,12 +343,24 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m)
  *   handling any 32-bit overflows.
  *   for the details see __msecs_to_jiffies()
  *
- * the HZ range specific helpers _msecs_to_jiffies() are called from
- * __msecs_to_jiffies().
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the HZ range specific helpers _msecs_to_jiffies() are called both
+ * directly here and from __msecs_to_jiffies() in the case where
+ * constant folding is not possible.
  */
 static inline unsigned long msecs_to_jiffies(const unsigned int m)
 {
-	return __msecs_to_jiffies(m);
+	if (__builtin_constant_p(m)) {
+		if ((int)m < 0)
+			return MAX_JIFFY_OFFSET;
+		return _msecs_to_jiffies(m);
+	} else {
+		return __msecs_to_jiffies(m);
+	}
 }
 
 extern unsigned long usecs_to_jiffies(const unsigned int u);
-- 
cgit v1.2.3


From 0a4377de305684c883bf90ad21e3cbdeead70f5c Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Tue, 19 May 2015 17:07:14 +0800
Subject: genirq: Introduce irq_set_vcpu_affinity() to target an interrupt to a
 VCPU

With Posted-Interrupts support in Intel CPU and IOMMU, an external
interrupt from assigned-devices could be directly delivered to a
virtual CPU in a virtual machine. Instead of hacking KVM and Intel
IOMMU drivers, we propose a platform independent interface to target
an interrupt to a specific virtual CPU in a virtual machine, or set
virtual CPU affinity for an interrupt.

By adopting this new interface and the hierarchy irqdomain, we could
easily support posted-interrupts on Intel platforms, and also provide
flexible enough interfaces for other platforms to support similar
features.

Here is the usage scenario for this interface:
Guest update MSI/MSI-X interrupt configuration
        -->QEMU and KVM handle this
        -->KVM call this interface (passing posted interrupts descriptor
           and guest vector)
        -->irq core will transfer the control to IOMMU
        -->IOMMU will do the real work of updating IRTE (IRTE has new
           format for VT-d Posted-Interrupts)

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Feng Wu <feng.wu@intel.com>
Link: http://lkml.kernel.org/r/1432026437-16560-2-git-send-email-feng.wu@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h |  6 ++++++
 kernel/irq/chip.c   | 14 ++++++++++++++
 kernel/irq/manage.c | 31 +++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 62c6901cab55..48cb7d1aa58f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -327,6 +327,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @irq_write_msi_msg:	optional to write message content for MSI
  * @irq_get_irqchip_state:	return the internal state of an interrupt
  * @irq_set_irqchip_state:	set the internal state of a interrupt
+ * @irq_set_vcpu_affinity:	optional to target a vCPU in a virtual machine
  * @flags:		chip specific flags
  */
 struct irq_chip {
@@ -369,6 +370,8 @@ struct irq_chip {
 	int		(*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state);
 	int		(*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state);
 
+	int		(*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info);
+
 	unsigned long	flags;
 };
 
@@ -422,6 +425,7 @@ extern void irq_cpu_online(void);
 extern void irq_cpu_offline(void);
 extern int irq_set_affinity_locked(struct irq_data *data,
 				   const struct cpumask *cpumask, bool force);
+extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
 void irq_move_irq(struct irq_data *data);
@@ -467,6 +471,8 @@ extern int irq_chip_set_affinity_parent(struct irq_data *data,
 					const struct cpumask *dest,
 					bool force);
 extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on);
+extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data,
+					     void *vcpu_info);
 #endif
 
 /* Handling of unhandled and spurious interrupts: */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eb9a4ea394ab..55016b2151f3 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -949,6 +949,20 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
 	return -ENOSYS;
 }
 
+/**
+ * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
+ * @data:	Pointer to interrupt specific data
+ * @dest:	The vcpu affinity information
+ */
+int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
+{
+	data = data->parent_data;
+	if (data->chip->irq_set_vcpu_affinity)
+		return data->chip->irq_set_vcpu_affinity(data, vcpu_info);
+
+	return -ENOSYS;
+}
+
 /**
  * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
  * @data:	Pointer to interrupt specific data
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e68932bb308e..b1c7e8f46bfb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -256,6 +256,37 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 }
 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
 
+/**
+ *	irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ *	@irq: interrupt number to set affinity
+ *	@vcpu_info: vCPU specific data
+ *
+ *	This function uses the vCPU specific data to set the vCPU
+ *	affinity for an irq. The vCPU specific data is passed from
+ *	outside, such as KVM. One example code path is as below:
+ *	KVM -> IOMMU -> irq_set_vcpu_affinity().
+ */
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+	struct irq_data *data;
+	struct irq_chip *chip;
+	int ret = -ENOSYS;
+
+	if (!desc)
+		return -EINVAL;
+
+	data = irq_desc_get_irq_data(desc);
+	chip = irq_data_get_irq_chip(data);
+	if (chip && chip->irq_set_vcpu_affinity)
+		ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+	irq_put_desc_unlock(desc, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
 static void irq_affinity_notify(struct work_struct *work)
 {
 	struct irq_affinity_notify *notify =
-- 
cgit v1.2.3


From 8fff52fd50934580c5108afed12043a774edf728 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 3 Apr 2015 09:04:04 +0530
Subject: clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state

When no timers/hrtimers are pending, the expiry time is set to a
special value: 'KTIME_MAX'. This normally happens with
NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes.

When 'expiry == KTIME_MAX', we either cancel the 'tick-sched' hrtimer
(NOHZ_MODE_HIGHRES) or skip reprogramming clockevent device
(NOHZ_MODE_LOWRES).  But, the clockevent device is already
reprogrammed from the tick-handler for next tick.

As the clock event device is programmed in ONESHOT mode it will at
least fire one more time (unnecessarily). Timers on few
implementations (like arm_arch_timer, etc.) only support PERIODIC mode
and their drivers emulate ONESHOT over that. Which means that on these
platforms we will get spurious interrupts periodically (at last
programmed interval rate, normally tick rate).

In order to avoid spurious interrupts, the clockevent device should be
stopped or its interrupts should be masked.

A simple (yet hacky) solution to get this fixed could be: update
hrtimer_force_reprogram() to always reprogram clockevent device and
update clockevent drivers to STOP generating events (or delay it to
max time) when 'expires' is set to KTIME_MAX. But the drawback here is
that every clockevent driver has to be hacked for this particular case
and its very easy for new ones to miss this.

However, Thomas suggested to add an optional state ONESHOT_STOPPED to
solve this problem: lkml.org/lkml/2014/5/9/508.

This patch adds support for ONESHOT_STOPPED state in clockevents
core. It will only be available to drivers that implement the
state-specific callbacks instead of the legacy ->set_mode() callback.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Preeti U. Murthy <preeti@linux.vnet.ibm.com>
Cc: linaro-kernel@lists.linaro.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/b8b383a03ac07b13312c16850b5106b82e4245b5.1428031396.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clockchips.h |  7 ++++++-
 kernel/time/clockevents.c  | 14 +++++++++++++-
 kernel/time/timer_list.c   |  6 ++++++
 3 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 96c280b2c263..271fa4c8eb29 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -37,12 +37,15 @@ enum clock_event_mode {
  *		reached from DETACHED or SHUTDOWN.
  * ONESHOT:	Device is programmed to generate event only once. Can be reached
  *		from DETACHED or SHUTDOWN.
+ * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily
+ *		    stopped.
  */
 enum clock_event_state {
 	CLOCK_EVT_STATE_DETACHED,
 	CLOCK_EVT_STATE_SHUTDOWN,
 	CLOCK_EVT_STATE_PERIODIC,
 	CLOCK_EVT_STATE_ONESHOT,
+	CLOCK_EVT_STATE_ONESHOT_STOPPED,
 };
 
 /*
@@ -90,6 +93,7 @@ enum clock_event_state {
  * @set_mode:		legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
  * @set_state_periodic:	switch state to periodic, if !set_mode
  * @set_state_oneshot:	switch state to oneshot, if !set_mode
+ * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode
  * @set_state_shutdown:	switch state to shutdown, if !set_mode
  * @tick_resume:	resume clkevt device, if !set_mode
  * @broadcast:		function to broadcast events
@@ -121,11 +125,12 @@ struct clock_event_device {
 	 * State transition callback(s): Only one of the two groups should be
 	 * defined:
 	 * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME.
-	 * - set_state_{shutdown|periodic|oneshot}(), tick_resume().
+	 * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume().
 	 */
 	void			(*set_mode)(enum clock_event_mode mode, struct clock_event_device *);
 	int			(*set_state_periodic)(struct clock_event_device *);
 	int			(*set_state_oneshot)(struct clock_event_device *);
+	int			(*set_state_oneshot_stopped)(struct clock_event_device *);
 	int			(*set_state_shutdown)(struct clock_event_device *);
 	int			(*tick_resume)(struct clock_event_device *);
 
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 637a09461c1d..dc6afb485027 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -134,6 +134,17 @@ static int __clockevents_set_state(struct clock_event_device *dev,
 			return -ENOSYS;
 		return dev->set_state_oneshot(dev);
 
+	case CLOCK_EVT_STATE_ONESHOT_STOPPED:
+		/* Core internal bug */
+		if (WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT,
+			      "Current state: %d\n", dev->state))
+			return -EINVAL;
+
+		if (dev->set_state_oneshot_stopped)
+			return dev->set_state_oneshot_stopped(dev);
+		else
+			return -ENOSYS;
+
 	default:
 		return -ENOSYS;
 	}
@@ -445,7 +456,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev)
 	if (dev->set_mode) {
 		/* We shouldn't be supporting new modes now */
 		WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
-			dev->set_state_shutdown || dev->tick_resume);
+			dev->set_state_shutdown || dev->tick_resume ||
+			dev->set_state_oneshot_stopped);
 
 		BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
 		return 0;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 18b074b215b0..1327004429be 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -258,6 +258,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 			SEQ_printf(m, "\n");
 		}
 
+		if (dev->set_state_oneshot_stopped) {
+			SEQ_printf(m, " oneshot stopped: ");
+			print_name_offset(m, dev->set_state_oneshot_stopped);
+			SEQ_printf(m, "\n");
+		}
+
 		if (dev->tick_resume) {
 			SEQ_printf(m, " resume:   ");
 			print_name_offset(m, dev->tick_resume);
-- 
cgit v1.2.3


From 4ecd4fef3a074c8bb43c391a57742c422469ebbd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 May 2015 09:38:13 +0200
Subject: block: use an atomic_t for mq_freeze_depth

lockdep gets unhappy about the not disabling irqs when using the queue_lock
around it.  Instead of trying to fix that up just switch to an atomic_t
and get rid of the lock.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 24 ++++++++++--------------
 include/linux/blkdev.h |  2 +-
 2 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 31df47443699..c382a34fe5ac 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 			return -EBUSY;
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!q->mq_freeze_depth || blk_queue_dying(q));
+				!atomic_read(&q->mq_freeze_depth) ||
+				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
 		if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
-	bool freeze;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	freeze = !q->mq_freeze_depth++;
-	spin_unlock_irq(q->queue_lock);
-
-	if (freeze) {
+	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->mq_usage_counter);
 		blk_mq_run_hw_queues(q, false);
 	}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	wake = !--q->mq_freeze_depth;
-	WARN_ON_ONCE(q->mq_freeze_depth < 0);
-	spin_unlock_irq(q->queue_lock);
-	if (wake) {
+	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+	WARN_ON_ONCE(freeze_depth < 0);
+	if (!freeze_depth) {
 		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
@@ -2081,7 +2077,7 @@ void blk_mq_free_queue(struct request_queue *q)
 /* Basically redo blk_mq_init_queue with queue frozen */
 static void blk_mq_queue_reinit(struct request_queue *q)
 {
-	WARN_ON_ONCE(!q->mq_freeze_depth);
+	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
 
 	blk_mq_sysfs_unregister(q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2da818a48097..bc917956a6d0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -444,7 +444,7 @@ struct request_queue {
 	struct mutex		sysfs_lock;
 
 	int			bypass_depth;
-	int			mq_freeze_depth;
+	atomic_t		mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
-- 
cgit v1.2.3


From b25de9d6da49b1a8760a89672283128aa8c78345 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Apr 2015 21:41:01 +0200
Subject: block: remove BIO_EOPNOTSUPP

Since the big barrier rewrite/removal in 2007 we never fail FLUSH or
FUA requests, which means we can remove the magic BIO_EOPNOTSUPP flag
to help propagating those to the buffer_head layer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bounce.c            |  3 ---
 fs/btrfs/disk-io.c        | 11 ++---------
 fs/btrfs/extent_io.c      |  2 --
 fs/buffer.c               | 10 ----------
 fs/ext4/page-io.c         |  1 -
 fs/nilfs2/segbuf.c        | 12 ------------
 include/linux/blk_types.h |  1 -
 7 files changed, 2 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/block/bounce.c b/block/bounce.c
index ab21ba203d5c..4bac72579c1f 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -128,9 +128,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
 	struct bio_vec *bvec, *org_vec;
 	int i;
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
 	/*
 	 * free up bounce indirect pages used
 	 */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2ef9a4b72d06..e08a926fe12c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3269,11 +3269,8 @@ static int write_dev_supers(struct btrfs_device *device,
  */
 static void btrfs_end_empty_barrier(struct bio *bio, int err)
 {
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+	if (err)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	}
 	if (bio->bi_private)
 		complete(bio->bi_private);
 	bio_put(bio);
@@ -3301,11 +3298,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 		wait_for_completion(&device->flush_wait);
 
-		if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-			printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
-				      rcu_str_deref(device->name));
-			device->nobarriers = 1;
-		} else if (!bio_flagged(bio, BIO_UPTODATE)) {
+		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			ret = -EIO;
 			btrfs_dev_stat_inc_and_print(device,
 				BTRFS_DEV_STAT_FLUSH_ERRS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 43af5a61ad25..1e155299abc0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2767,8 +2767,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
 	else
 		btrfsic_submit_bio(rw, bio);
 
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
 	bio_put(bio);
 	return ret;
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index c7a5602d01ee..efd85e0e8660 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2938,10 +2938,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 {
 	struct buffer_head *bh = bio->bi_private;
 
-	if (err == -EOPNOTSUPP) {
-		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-	}
-
 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
 		set_bit(BH_Quiet, &bh->b_state);
 
@@ -3041,13 +3037,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	if (buffer_prio(bh))
 		rw |= REQ_PRIO;
 
-	bio_get(bio);
 	submit_bio(rw, bio);
-
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-
-	bio_put(bio);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(_submit_bh);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 5765f88b3904..c5d81e8d84c3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -359,7 +359,6 @@ void ext4_io_submit(struct ext4_io_submit *io)
 	if (bio) {
 		bio_get(io->io_bio);
 		submit_bio(io->io_op, io->io_bio);
-		BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
 		bio_put(io->io_bio);
 	}
 	io->io_bio = NULL;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc3a9efdaab8..42468e5ab3e7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct nilfs_segment_buffer *segbuf = bio->bi_private;
 
-	if (err == -EOPNOTSUPP) {
-		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
-		/* to be detected by nilfs_segbuf_submit_bio() */
-	}
-
 	if (!uptodate)
 		atomic_inc(&segbuf->sb_err);
 
@@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
 
 	bio->bi_end_io = nilfs_end_bio_write;
 	bio->bi_private = segbuf;
-	bio_get(bio);
 	submit_bio(mode, bio);
 	segbuf->sb_nbio++;
-	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-		bio_put(bio);
-		err = -EOPNOTSUPP;
-		goto failed;
-	}
-	bio_put(bio);
 
 	wi->bio = NULL;
 	wi->rest_blocks -= wi->end - wi->start;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 93d2e7153816..daf95915d104 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -118,7 +118,6 @@ struct bio {
 #define BIO_CLONED	4	/* doesn't own data */
 #define BIO_BOUNCED	5	/* bio is a bounce bio */
 #define BIO_USER_MAPPED 6	/* contains user pages */
-#define BIO_EOPNOTSUPP	7	/* not supported */
 #define BIO_NULL_MAPPED 8	/* contains invalid user pages */
 #define BIO_QUIET	9	/* Make BIO Quiet */
 #define BIO_SNAP_STABLE	10	/* bio data must be snapshotted during write */
-- 
cgit v1.2.3


From 97ca223c3b37ed12a5b67a5dc6247e5a4799d337 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Apr 2015 21:41:02 +0200
Subject: block: remove unused BIO_RW_BLOCK and BIO_EOF flags

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c          | 2 --
 include/linux/blk_types.h | 2 --
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index f0be754c7781..de474b5dee2b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1725,8 +1725,6 @@ static void handle_bad_sector(struct bio *bio)
 			bio->bi_rw,
 			(unsigned long long)bio_end_sector(bio),
 			(long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
-
-	set_bit(BIO_EOF, &bio->bi_flags);
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index daf95915d104..09c7a2cd48ef 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -112,8 +112,6 @@ struct bio {
  * bio flags
  */
 #define BIO_UPTODATE	0	/* ok after I/O completion */
-#define BIO_RW_BLOCK	1	/* RW_AHEAD set, and read/write would block */
-#define BIO_EOF		2	/* out-out-bounds error */
 #define BIO_SEG_VALID	3	/* bi_phys_segments valid */
 #define BIO_CLONED	4	/* doesn't own data */
 #define BIO_BOUNCED	5	/* bio is a bounce bio */
-- 
cgit v1.2.3


From 4e3d9cb0134fea035e6eb1707e5e7d8aaffa186d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 19 May 2015 17:14:51 +0200
Subject: jiffies: Remove the extra indentation level

Somehow I missed to clean that up when applying the patches. Fix it up
now.

Reported-by: Joe Perches <joe@perches.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Nicholas Mc Guire <der.herr@hofr.at>
---
 include/linux/jiffies.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 5e75af6cf1bc..3bde5eb8568b 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -298,7 +298,7 @@ extern unsigned long __msecs_to_jiffies(const unsigned int m);
  */
 static inline unsigned long _msecs_to_jiffies(const unsigned int m)
 {
-		return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
 }
 #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
 /*
@@ -309,9 +309,9 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m)
  */
 static inline unsigned long _msecs_to_jiffies(const unsigned int m)
 {
-		if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-			return MAX_JIFFY_OFFSET;
-		return m * (HZ / MSEC_PER_SEC);
+	if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+		return MAX_JIFFY_OFFSET;
+	return m * (HZ / MSEC_PER_SEC);
 }
 #else
 /*
@@ -320,11 +320,10 @@ static inline unsigned long _msecs_to_jiffies(const unsigned int m)
  */
 static inline unsigned long _msecs_to_jiffies(const unsigned int m)
 {
-		if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-			return MAX_JIFFY_OFFSET;
+	if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+		return MAX_JIFFY_OFFSET;
 
-		return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
-			>> MSEC_TO_HZ_SHR32;
+	return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32;
 }
 #endif
 /**
-- 
cgit v1.2.3


From b2dbe0a60f1bcf4db5c701f1577b3135c3159eb5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 19 May 2015 09:18:28 -0600
Subject: block: collapse bio bit space

Various previous patches removed bits and left holes, collapse them
all. Leave the reset start bit where it is, we don't need to change
that.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 09c7a2cd48ef..3f4ded0b1a34 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -112,15 +112,15 @@ struct bio {
  * bio flags
  */
 #define BIO_UPTODATE	0	/* ok after I/O completion */
-#define BIO_SEG_VALID	3	/* bi_phys_segments valid */
-#define BIO_CLONED	4	/* doesn't own data */
-#define BIO_BOUNCED	5	/* bio is a bounce bio */
-#define BIO_USER_MAPPED 6	/* contains user pages */
-#define BIO_NULL_MAPPED 8	/* contains invalid user pages */
-#define BIO_QUIET	9	/* Make BIO Quiet */
-#define BIO_SNAP_STABLE	10	/* bio data must be snapshotted during write */
-#define BIO_CHAIN	11	/* chained bio, ->bi_remaining in effect */
-#define BIO_REFFED	12	/* bio has elevated ->bi_cnt */
+#define BIO_SEG_VALID	1	/* bi_phys_segments valid */
+#define BIO_CLONED	2	/* doesn't own data */
+#define BIO_BOUNCED	3	/* bio is a bounce bio */
+#define BIO_USER_MAPPED 4	/* contains user pages */
+#define BIO_NULL_MAPPED 5	/* contains invalid user pages */
+#define BIO_QUIET	6	/* Make BIO Quiet */
+#define BIO_SNAP_STABLE	7	/* bio data must be snapshotted during write */
+#define BIO_CHAIN	8	/* chained bio, ->bi_remaining in effect */
+#define BIO_REFFED	9	/* bio has elevated ->bi_cnt */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
-- 
cgit v1.2.3


From 343df3c79c62b644ce6ff5dff96c9e0be1ecb242 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 19 May 2015 09:23:23 +0200
Subject: suspend: simplify block I/O handling

Stop abusing struct page functionality and the swap end_io handler, and
instead add a modified version of the blk-lib.c bio_batch helpers.

Also move the block I/O code into swap.c as they are directly tied into
each other.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Pavel Machek <pavel@ucw.cz>
Tested-by: Ming Lin <mlin@kernel.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Rafael J. Wysocki <rjw@rjwysocki.net>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/swap.h    |   1 -
 kernel/power/Makefile   |   3 +-
 kernel/power/block_io.c | 103 -------------------------------
 kernel/power/power.h    |   9 ---
 kernel/power/swap.c     | 159 ++++++++++++++++++++++++++++++++++++------------
 mm/page_io.c            |   2 +-
 6 files changed, 122 insertions(+), 155 deletions(-)
 delete mode 100644 kernel/power/block_io.c

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index cee108cbe2d5..38874729dc5f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -377,7 +377,6 @@ extern void end_swap_bio_write(struct bio *bio, int err);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	void (*end_write_func)(struct bio *, int));
 extern int swap_set_page_dirty(struct page *page);
-extern void end_swap_bio_read(struct bio *bio, int err);
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		unsigned long nr_pages, sector_t start_block);
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 29472bff11ef..cb880a14cc39 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,8 +7,7 @@ obj-$(CONFIG_VT_CONSOLE_SLEEP)	+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
-obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o \
-				   block_io.o
+obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o
 obj-$(CONFIG_PM_AUTOSLEEP)	+= autosleep.o
 obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
 
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
deleted file mode 100644
index 9a58bc258810..000000000000
--- a/kernel/power/block_io.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * This file provides functions for block I/O operations on swap/file.
- *
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
- * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/bio.h>
-#include <linux/kernel.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-
-#include "power.h"
-
-/**
- *	submit - submit BIO request.
- *	@rw:	READ or WRITE.
- *	@off	physical offset of page.
- *	@page:	page we're reading or writing.
- *	@bio_chain: list of pending biod (for async reading)
- *
- *	Straight from the textbook - allocate and initialize the bio.
- *	If we're reading, make sure the page is marked as dirty.
- *	Then submit it and, if @bio_chain == NULL, wait.
- */
-static int submit(int rw, struct block_device *bdev, sector_t sector,
-		struct page *page, struct bio **bio_chain)
-{
-	const int bio_rw = rw | REQ_SYNC;
-	struct bio *bio;
-
-	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
-	bio->bi_iter.bi_sector = sector;
-	bio->bi_bdev = bdev;
-	bio->bi_end_io = end_swap_bio_read;
-
-	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
-			(unsigned long long)sector);
-		bio_put(bio);
-		return -EFAULT;
-	}
-
-	lock_page(page);
-	bio_get(bio);
-
-	if (bio_chain == NULL) {
-		submit_bio(bio_rw, bio);
-		wait_on_page_locked(page);
-		if (rw == READ)
-			bio_set_pages_dirty(bio);
-		bio_put(bio);
-	} else {
-		if (rw == READ)
-			get_page(page);	/* These pages are freed later */
-		bio->bi_private = *bio_chain;
-		*bio_chain = bio;
-		submit_bio(bio_rw, bio);
-	}
-	return 0;
-}
-
-int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
-	return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
-			virt_to_page(addr), bio_chain);
-}
-
-int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
-{
-	return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
-			virt_to_page(addr), bio_chain);
-}
-
-int hib_wait_on_bio_chain(struct bio **bio_chain)
-{
-	struct bio *bio;
-	struct bio *next_bio;
-	int ret = 0;
-
-	if (bio_chain == NULL)
-		return 0;
-
-	bio = *bio_chain;
-	if (bio == NULL)
-		return 0;
-	while (bio) {
-		struct page *page;
-
-		next_bio = bio->bi_private;
-		page = bio->bi_io_vec[0].bv_page;
-		wait_on_page_locked(page);
-		if (!PageUptodate(page) || PageError(page))
-			ret = -EIO;
-		put_page(page);
-		bio_put(bio);
-		bio = next_bio;
-	}
-	*bio_chain = NULL;
-	return ret;
-}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ce9b8328a689..caadb566e82b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -163,15 +163,6 @@ extern void swsusp_close(fmode_t);
 extern int swsusp_unmark(void);
 #endif
 
-/* kernel/power/block_io.c */
-extern struct block_device *hib_resume_bdev;
-
-extern int hib_bio_read_page(pgoff_t page_off, void *addr,
-		struct bio **bio_chain);
-extern int hib_bio_write_page(pgoff_t page_off, void *addr,
-		struct bio **bio_chain);
-extern int hib_wait_on_bio_chain(struct bio **bio_chain);
-
 struct timeval;
 /* kernel/power/swsusp.c */
 extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 570aff817543..2f30ca91e4fa 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -212,7 +212,84 @@ int swsusp_swap_in_use(void)
  */
 
 static unsigned short root_swap = 0xffff;
-struct block_device *hib_resume_bdev;
+static struct block_device *hib_resume_bdev;
+
+struct hib_bio_batch {
+	atomic_t		count;
+	wait_queue_head_t	wait;
+	int			error;
+};
+
+static void hib_init_batch(struct hib_bio_batch *hb)
+{
+	atomic_set(&hb->count, 0);
+	init_waitqueue_head(&hb->wait);
+	hb->error = 0;
+}
+
+static void hib_end_io(struct bio *bio, int error)
+{
+	struct hib_bio_batch *hb = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	if (!uptodate || error) {
+		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
+				imajor(bio->bi_bdev->bd_inode),
+				iminor(bio->bi_bdev->bd_inode),
+				(unsigned long long)bio->bi_iter.bi_sector);
+
+		if (!error)
+			error = -EIO;
+	}
+
+	if (bio_data_dir(bio) == WRITE)
+		put_page(page);
+
+	if (error && !hb->error)
+		hb->error = error;
+	if (atomic_dec_and_test(&hb->count))
+		wake_up(&hb->wait);
+
+	bio_put(bio);
+}
+
+static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
+		struct hib_bio_batch *hb)
+{
+	struct page *page = virt_to_page(addr);
+	struct bio *bio;
+	int error = 0;
+
+	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
+	bio->bi_bdev = hib_resume_bdev;
+
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
+			(unsigned long long)bio->bi_iter.bi_sector);
+		bio_put(bio);
+		return -EFAULT;
+	}
+
+	if (hb) {
+		bio->bi_end_io = hib_end_io;
+		bio->bi_private = hb;
+		atomic_inc(&hb->count);
+		submit_bio(rw, bio);
+	} else {
+		error = submit_bio_wait(rw, bio);
+		bio_put(bio);
+	}
+
+	return error;
+}
+
+static int hib_wait_io(struct hib_bio_batch *hb)
+{
+	wait_event(hb->wait, atomic_read(&hb->count) == 0);
+	return hb->error;
+}
 
 /*
  * Saving part
@@ -222,7 +299,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 {
 	int error;
 
-	hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
 	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
 	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
@@ -231,7 +308,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		swsusp_header->flags = flags;
 		if (flags & SF_CRC32_MODE)
 			swsusp_header->crc32 = handle->crc32;
-		error = hib_bio_write_page(swsusp_resume_block,
+		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Swap header not found!\n");
@@ -271,10 +348,10 @@ static int swsusp_swap_check(void)
  *	write_page - Write one page to given swap location.
  *	@buf:		Address we're writing.
  *	@offset:	Offset of the swap page we're writing to.
- *	@bio_chain:	Link the next write BIO here
+ *	@hb:		bio completion batch
  */
 
-static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 {
 	void *src;
 	int ret;
@@ -282,13 +359,13 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 	if (!offset)
 		return -ENOSPC;
 
-	if (bio_chain) {
+	if (hb) {
 		src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
 		                              __GFP_NORETRY);
 		if (src) {
 			copy_page(src, buf);
 		} else {
-			ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
+			ret = hib_wait_io(hb); /* Free pages */
 			if (ret)
 				return ret;
 			src = (void *)__get_free_page(__GFP_WAIT |
@@ -298,14 +375,14 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
 				copy_page(src, buf);
 			} else {
 				WARN_ON_ONCE(1);
-				bio_chain = NULL;	/* Go synchronous */
+				hb = NULL;	/* Go synchronous */
 				src = buf;
 			}
 		}
 	} else {
 		src = buf;
 	}
-	return hib_bio_write_page(offset, src, bio_chain);
+	return hib_submit_io(WRITE_SYNC, offset, src, hb);
 }
 
 static void release_swap_writer(struct swap_map_handle *handle)
@@ -348,7 +425,7 @@ err_close:
 }
 
 static int swap_write_page(struct swap_map_handle *handle, void *buf,
-				struct bio **bio_chain)
+		struct hib_bio_batch *hb)
 {
 	int error = 0;
 	sector_t offset;
@@ -356,7 +433,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 	if (!handle->cur)
 		return -EINVAL;
 	offset = alloc_swapdev_block(root_swap);
-	error = write_page(buf, offset, bio_chain);
+	error = write_page(buf, offset, hb);
 	if (error)
 		return error;
 	handle->cur->entries[handle->k++] = offset;
@@ -365,15 +442,15 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
 		if (!offset)
 			return -ENOSPC;
 		handle->cur->next_swap = offset;
-		error = write_page(handle->cur, handle->cur_swap, bio_chain);
+		error = write_page(handle->cur, handle->cur_swap, hb);
 		if (error)
 			goto out;
 		clear_page(handle->cur);
 		handle->cur_swap = offset;
 		handle->k = 0;
 
-		if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
-			error = hib_wait_on_bio_chain(bio_chain);
+		if (hb && low_free_pages() <= handle->reqd_free_pages) {
+			error = hib_wait_io(hb);
 			if (error)
 				goto out;
 			/*
@@ -445,23 +522,24 @@ static int save_image(struct swap_map_handle *handle,
 	int ret;
 	int nr_pages;
 	int err2;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 
+	hib_init_batch(&hb);
+
 	printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
 		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	while (1) {
 		ret = snapshot_read_next(snapshot);
 		if (ret <= 0)
 			break;
-		ret = swap_write_page(handle, data_of(*snapshot), &bio);
+		ret = swap_write_page(handle, data_of(*snapshot), &hb);
 		if (ret)
 			break;
 		if (!(nr_pages % m))
@@ -469,7 +547,7 @@ static int save_image(struct swap_map_handle *handle,
 			       nr_pages / m * 10);
 		nr_pages++;
 	}
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -580,7 +658,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	int ret = 0;
 	int nr_pages;
 	int err2;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 	size_t off;
@@ -589,6 +667,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	struct cmp_data *data = NULL;
 	struct crc_data *crc = NULL;
 
+	hib_init_batch(&hb);
+
 	/*
 	 * We'll limit the number of threads for compression to limit memory
 	 * footprint.
@@ -674,7 +754,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	for (;;) {
 		for (thr = 0; thr < nr_threads; thr++) {
@@ -748,7 +827,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 			     off += PAGE_SIZE) {
 				memcpy(page, data[thr].cmp + off, PAGE_SIZE);
 
-				ret = swap_write_page(handle, page, &bio);
+				ret = swap_write_page(handle, page, &hb);
 				if (ret)
 					goto out_finish;
 			}
@@ -759,7 +838,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	}
 
 out_finish:
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -906,7 +985,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 			return -ENOMEM;
 		}
 
-		error = hib_bio_read_page(offset, tmp->map, NULL);
+		error = hib_submit_io(READ_SYNC, offset, tmp->map, NULL);
 		if (error) {
 			release_swap_reader(handle);
 			return error;
@@ -919,7 +998,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 }
 
 static int swap_read_page(struct swap_map_handle *handle, void *buf,
-				struct bio **bio_chain)
+		struct hib_bio_batch *hb)
 {
 	sector_t offset;
 	int error;
@@ -930,7 +1009,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
 	offset = handle->cur->entries[handle->k];
 	if (!offset)
 		return -EFAULT;
-	error = hib_bio_read_page(offset, buf, bio_chain);
+	error = hib_submit_io(READ_SYNC, offset, buf, hb);
 	if (error)
 		return error;
 	if (++handle->k >= MAP_PAGE_ENTRIES) {
@@ -968,27 +1047,28 @@ static int load_image(struct swap_map_handle *handle,
 	int ret = 0;
 	ktime_t start;
 	ktime_t stop;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	int err2;
 	unsigned nr_pages;
 
+	hib_init_batch(&hb);
+
 	printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
 		nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 	for ( ; ; ) {
 		ret = snapshot_write_next(snapshot);
 		if (ret <= 0)
 			break;
-		ret = swap_read_page(handle, data_of(*snapshot), &bio);
+		ret = swap_read_page(handle, data_of(*snapshot), &hb);
 		if (ret)
 			break;
 		if (snapshot->sync_read)
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 		if (ret)
 			break;
 		if (!(nr_pages % m))
@@ -996,7 +1076,7 @@ static int load_image(struct swap_map_handle *handle,
 			       nr_pages / m * 10);
 		nr_pages++;
 	}
-	err2 = hib_wait_on_bio_chain(&bio);
+	err2 = hib_wait_io(&hb);
 	stop = ktime_get();
 	if (!ret)
 		ret = err2;
@@ -1067,7 +1147,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	unsigned int m;
 	int ret = 0;
 	int eof = 0;
-	struct bio *bio;
+	struct hib_bio_batch hb;
 	ktime_t start;
 	ktime_t stop;
 	unsigned nr_pages;
@@ -1080,6 +1160,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	struct dec_data *data = NULL;
 	struct crc_data *crc = NULL;
 
+	hib_init_batch(&hb);
+
 	/*
 	 * We'll limit the number of threads for decompression to limit memory
 	 * footprint.
@@ -1190,7 +1272,6 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	bio = NULL;
 	start = ktime_get();
 
 	ret = snapshot_write_next(snapshot);
@@ -1199,7 +1280,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	for(;;) {
 		for (i = 0; !eof && i < want; i++) {
-			ret = swap_read_page(handle, page[ring], &bio);
+			ret = swap_read_page(handle, page[ring], &hb);
 			if (ret) {
 				/*
 				 * On real read error, finish. On end of data,
@@ -1226,7 +1307,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			if (!asked)
 				break;
 
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 			if (ret)
 				goto out_finish;
 			have += asked;
@@ -1281,7 +1362,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		 * Wait for more data while we are decompressing.
 		 */
 		if (have < LZO_CMP_PAGES && asked) {
-			ret = hib_wait_on_bio_chain(&bio);
+			ret = hib_wait_io(&hb);
 			if (ret)
 				goto out_finish;
 			have += asked;
@@ -1430,7 +1511,7 @@ int swsusp_check(void)
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
-		error = hib_bio_read_page(swsusp_resume_block,
+		error = hib_submit_io(READ_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
 			goto put;
@@ -1438,7 +1519,7 @@ int swsusp_check(void)
 		if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
 			/* Reset swap signature now */
-			error = hib_bio_write_page(swsusp_resume_block,
+			error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 						swsusp_header, NULL);
 		} else {
 			error = -EINVAL;
@@ -1482,10 +1563,10 @@ int swsusp_unmark(void)
 {
 	int error;
 
-	hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+	hib_submit_io(READ_SYNC, swsusp_resume_block, swsusp_header, NULL);
 	if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
 		memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
-		error = hib_bio_write_page(swsusp_resume_block,
+		error = hib_submit_io(WRITE_SYNC, swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
 		printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
diff --git a/mm/page_io.c b/mm/page_io.c
index 6424869e275e..520baa4b04d7 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -69,7 +69,7 @@ void end_swap_bio_write(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-void end_swap_bio_read(struct bio *bio, int err)
+static void end_swap_bio_read(struct bio *bio, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct page *page = bio->bi_io_vec[0].bv_page;
-- 
cgit v1.2.3


From 3520469d65f26a1cd2f610f5d5de976f78db74fe Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 2 Apr 2015 11:20:48 +0200
Subject: KVM: export __gfn_to_pfn_memslot, drop gfn_to_pfn_async

gfn_to_pfn_async is used in just one place, and because of x86-specific
treatment that place will need to look at the memory slot.  Hence inline
it into try_async_pf and export __gfn_to_pfn_memslot.

The patch also switches the subsequent call to gfn_to_pfn_prot to use
__gfn_to_pfn_memslot.  This is a small optimization.  Finally, remove
the now-unused async argument of __gfn_to_pfn.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c       |  9 +++++----
 include/linux/kvm_host.h |  4 ++--
 virt/kvm/kvm_main.c      | 26 ++++++++------------------
 3 files changed, 15 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5bebec1191f1..49c34e632b91 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3511,10 +3511,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
 {
+	struct kvm_memory_slot *slot;
 	bool async;
 
-	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
+	slot = gfn_to_memslot(vcpu->kvm, gfn);
+	async = false;
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
 	if (!async)
 		return false; /* *pfn has correct page already */
 
@@ -3528,8 +3530,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			return true;
 	}
 
-	*pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
 	return false;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7a08cd6f4a8..87fd74a04005 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -539,13 +539,13 @@ void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
-		       bool write_fault, bool *writable);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable);
 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+			   bool *async, bool write_fault, bool *writable);
 
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f202c4035134..bd3c08a7c6c2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1355,9 +1355,8 @@ exit:
 	return pfn;
 }
 
-static pfn_t
-__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
-		     bool *async, bool write_fault, bool *writable)
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+			   bool *async, bool write_fault, bool *writable)
 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
@@ -1376,44 +1375,35 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
 	return hva_to_pfn(addr, atomic, async, write_fault,
 			  writable);
 }
+EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic,
 			  bool write_fault, bool *writable)
 {
 	struct kvm_memory_slot *slot;
 
-	if (async)
-		*async = false;
-
 	slot = gfn_to_memslot(kvm, gfn);
 
-	return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+	return __gfn_to_pfn_memslot(slot, gfn, atomic, NULL, write_fault,
 				    writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
+	return __gfn_to_pfn(kvm, gfn, true, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
-		       bool write_fault, bool *writable)
-{
-	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
-
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
+	return __gfn_to_pfn(kvm, gfn, false, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
-	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+	return __gfn_to_pfn(kvm, gfn, false, write_fault, writable);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
-- 
cgit v1.2.3


From cad706df7e4a00a595f2662f32c0fc174aa4e61f Mon Sep 17 00:00:00 2001
From: Miroslav Benes <mbenes@suse.cz>
Date: Tue, 19 May 2015 12:01:18 +0200
Subject: livepatch: make kobject in klp_object statically allocated

Make kobj variable (of type struct kobject) statically allocated in
klp_object structure. It will allow us to move in the func-object-patch
hierarchy through kobject links.

The only reason to have it dynamic was to not have empty release
callback in the code. However we have empty callbacks for function and
patch in the code now, so it is no longer valid and the advantage of
static allocation is clear.

Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/livepatch.h |  2 +-
 kernel/livepatch/core.c   | 22 ++++++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index ee6dbb39a809..fe45f2f02c8d 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -99,7 +99,7 @@ struct klp_object {
 	struct klp_func *funcs;
 
 	/* internal */
-	struct kobject *kobj;
+	struct kobject kobj;
 	struct module *mod;
 	enum klp_state state;
 };
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c03c04637ec6..e997782362c3 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -651,6 +651,15 @@ static struct kobj_type klp_ktype_patch = {
 	.default_attrs = klp_patch_attrs,
 };
 
+static void klp_kobj_release_object(struct kobject *kobj)
+{
+}
+
+static struct kobj_type klp_ktype_object = {
+	.release = klp_kobj_release_object,
+	.sysfs_ops = &kobj_sysfs_ops,
+};
+
 static void klp_kobj_release_func(struct kobject *kobj)
 {
 }
@@ -695,7 +704,7 @@ static void klp_free_objects_limited(struct klp_patch *patch,
 
 	for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
 		klp_free_funcs_limited(obj, NULL);
-		kobject_put(obj->kobj);
+		kobject_put(&obj->kobj);
 	}
 }
 
@@ -713,7 +722,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
 	func->state = KLP_DISABLED;
 
 	return kobject_init_and_add(&func->kobj, &klp_ktype_func,
-				    obj->kobj, "%s", func->old_name);
+				    &obj->kobj, "%s", func->old_name);
 }
 
 /* parts of the initialization that is done only when the object is loaded */
@@ -753,9 +762,10 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
 	klp_find_object_module(obj);
 
 	name = klp_is_module(obj) ? obj->name : "vmlinux";
-	obj->kobj = kobject_create_and_add(name, &patch->kobj);
-	if (!obj->kobj)
-		return -ENOMEM;
+	ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object,
+				   &patch->kobj, "%s", name);
+	if (ret)
+		return ret;
 
 	for (func = obj->funcs; func->old_name; func++) {
 		ret = klp_init_func(obj, func);
@@ -773,7 +783,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
 
 free:
 	klp_free_funcs_limited(obj, func);
-	kobject_put(obj->kobj);
+	kobject_put(&obj->kobj);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 8cdd043ab32c2ff28d2a77c514a768a9edce244c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 19 May 2015 12:01:19 +0200
Subject: livepatch: introduce patch/func-walking helpers

klp_for_each_object and klp_for_each_func are now used all over the
code. One need not think what is the proper condition to check in the
for loop now.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/livepatch.h |  6 ++++++
 kernel/livepatch/core.c   | 18 +++++++++---------
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index fe45f2f02c8d..31db7a05dd36 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -123,6 +123,12 @@ struct klp_patch {
 	enum klp_state state;
 };
 
+#define klp_for_each_object(patch, obj) \
+	for (obj = patch->objs; obj->funcs; obj++)
+
+#define klp_for_each_func(obj, func) \
+	for (func = obj->funcs; func->old_name; func++)
+
 int klp_register_patch(struct klp_patch *);
 int klp_unregister_patch(struct klp_patch *);
 int klp_enable_patch(struct klp_patch *);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index e997782362c3..c38398e20f64 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -422,7 +422,7 @@ static void klp_disable_object(struct klp_object *obj)
 {
 	struct klp_func *func;
 
-	for (func = obj->funcs; func->old_name; func++)
+	klp_for_each_func(obj, func)
 		if (func->state == KLP_ENABLED)
 			klp_disable_func(func);
 
@@ -440,7 +440,7 @@ static int klp_enable_object(struct klp_object *obj)
 	if (WARN_ON(!klp_is_object_loaded(obj)))
 		return -EINVAL;
 
-	for (func = obj->funcs; func->old_name; func++) {
+	klp_for_each_func(obj, func) {
 		ret = klp_enable_func(func);
 		if (ret) {
 			klp_disable_object(obj);
@@ -463,7 +463,7 @@ static int __klp_disable_patch(struct klp_patch *patch)
 
 	pr_notice("disabling patch '%s'\n", patch->mod->name);
 
-	for (obj = patch->objs; obj->funcs; obj++) {
+	klp_for_each_object(patch, obj) {
 		if (obj->state == KLP_ENABLED)
 			klp_disable_object(obj);
 	}
@@ -523,7 +523,7 @@ static int __klp_enable_patch(struct klp_patch *patch)
 
 	pr_notice("enabling patch '%s'\n", patch->mod->name);
 
-	for (obj = patch->objs; obj->funcs; obj++) {
+	klp_for_each_object(patch, obj) {
 		if (!klp_is_object_loaded(obj))
 			continue;
 
@@ -689,7 +689,7 @@ static void klp_free_object_loaded(struct klp_object *obj)
 
 	obj->mod = NULL;
 
-	for (func = obj->funcs; func->old_name; func++)
+	klp_for_each_func(obj, func)
 		func->old_addr = 0;
 }
 
@@ -738,7 +738,7 @@ static int klp_init_object_loaded(struct klp_patch *patch,
 			return ret;
 	}
 
-	for (func = obj->funcs; func->old_name; func++) {
+	klp_for_each_func(obj, func) {
 		ret = klp_find_verify_func_addr(obj, func);
 		if (ret)
 			return ret;
@@ -767,7 +767,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
 	if (ret)
 		return ret;
 
-	for (func = obj->funcs; func->old_name; func++) {
+	klp_for_each_func(obj, func) {
 		ret = klp_init_func(obj, func);
 		if (ret)
 			goto free;
@@ -804,7 +804,7 @@ static int klp_init_patch(struct klp_patch *patch)
 	if (ret)
 		goto unlock;
 
-	for (obj = patch->objs; obj->funcs; obj++) {
+	klp_for_each_object(patch, obj) {
 		ret = klp_init_object(patch, obj);
 		if (ret)
 			goto free;
@@ -961,7 +961,7 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action,
 		mod->klp_alive = false;
 
 	list_for_each_entry(patch, &klp_patches, list) {
-		for (obj = patch->objs; obj->funcs; obj++) {
+		klp_for_each_object(patch, obj) {
 			if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
 				continue;
 
-- 
cgit v1.2.3


From 4990d4fe327b9d9a7a3be7103a82699406fdde69 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 18 May 2015 15:40:29 -0700
Subject: PM / Wakeirq: Add automated device wake IRQ handling

Turns out we can automate the handling for the device_may_wakeup()
quite a bit by using the kernel wakeup source list as suggested
by Rafael J. Wysocki <rjw@rjwysocki.net>.

And as some hardware has separate dedicated wake-up interrupt
in addition to the IO interrupt, we can automate the handling by
adding a generic threaded interrupt handler that just calls the
device PM runtime to wake up the device.

This allows dropping code from device drivers as we currently
are doing it in multiple ways, and often wrong.

For most drivers, we should be able to drop the following
boilerplate code from runtime_suspend and runtime_resume
functions:

	...
	device_init_wakeup(dev, true);
	...
	if (device_may_wakeup(dev))
		enable_irq_wake(irq);
	...
	if (device_may_wakeup(dev))
		disable_irq_wake(irq);
	...
	device_init_wakeup(dev, false);
	...

We can replace it with just the following init and exit
time code:

	...
	device_init_wakeup(dev, true);
	dev_pm_set_wake_irq(dev, irq);
	...
	dev_pm_clear_wake_irq(dev);
	device_init_wakeup(dev, false);
	...

And for hardware with dedicated wake-up interrupts:

	...
	device_init_wakeup(dev, true);
	dev_pm_set_dedicated_wake_irq(dev, irq);
	...
	dev_pm_clear_wake_irq(dev);
	device_init_wakeup(dev, false);
	...

Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/Makefile  |   2 +-
 drivers/base/power/main.c    |   3 +
 drivers/base/power/power.h   |  48 ++++++++
 drivers/base/power/runtime.c |   5 +
 drivers/base/power/wakeirq.c | 273 +++++++++++++++++++++++++++++++++++++++++++
 drivers/base/power/wakeup.c  |  92 +++++++++++++++
 include/linux/pm.h           |   2 +
 include/linux/pm_wakeirq.h   |  52 +++++++++
 include/linux/pm_wakeup.h    |   9 ++
 9 files changed, 485 insertions(+), 1 deletion(-)
 create mode 100644 drivers/base/power/wakeirq.c
 create mode 100644 include/linux/pm_wakeirq.h

(limited to 'include/linux')

diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 1cb8544598d5..f94a6ccfe787 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_PM)	+= sysfs.o generic_ops.o common.o qos.o runtime.o
+obj-$(CONFIG_PM)	+= sysfs.o generic_ops.o common.o qos.o runtime.o wakeirq.o
 obj-$(CONFIG_PM_SLEEP)	+= main.o wakeup.o
 obj-$(CONFIG_PM_TRACE_RTC)	+= trace.o
 obj-$(CONFIG_PM_OPP)	+= opp.o
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 3d874eca7104..6f2515c571f1 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -24,6 +24,7 @@
 #include <linux/pm.h>
 #include <linux/pm_runtime.h>
 #include <linux/pm-trace.h>
+#include <linux/pm_wakeirq.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/async.h>
@@ -587,6 +588,7 @@ void dpm_resume_noirq(pm_message_t state)
 	async_synchronize_full();
 	dpm_show_time(starttime, state, "noirq");
 	resume_device_irqs();
+	device_wakeup_disarm_wake_irqs();
 	cpuidle_resume();
 	trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false);
 }
@@ -1104,6 +1106,7 @@ int dpm_suspend_noirq(pm_message_t state)
 
 	trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true);
 	cpuidle_pause();
+	device_wakeup_arm_wake_irqs();
 	suspend_device_irqs();
 	mutex_lock(&dpm_list_mtx);
 	pm_transition = state;
diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index b6b8a273c5da..f1a5d95e7b20 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -20,6 +20,46 @@ static inline void pm_runtime_early_init(struct device *dev)
 extern void pm_runtime_init(struct device *dev);
 extern void pm_runtime_remove(struct device *dev);
 
+struct wake_irq {
+	struct device *dev;
+	int irq;
+	bool dedicated_irq:1;
+};
+
+extern void dev_pm_arm_wake_irq(struct wake_irq *wirq);
+extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq);
+
+#ifdef CONFIG_PM_SLEEP
+
+extern int device_wakeup_attach_irq(struct device *dev,
+				    struct wake_irq *wakeirq);
+extern void device_wakeup_detach_irq(struct device *dev);
+extern void device_wakeup_arm_wake_irqs(void);
+extern void device_wakeup_disarm_wake_irqs(void);
+
+#else
+
+static inline int
+device_wakeup_attach_irq(struct device *dev,
+			 struct wake_irq *wakeirq)
+{
+	return 0;
+}
+
+static inline void device_wakeup_detach_irq(struct device *dev)
+{
+}
+
+static inline void device_wakeup_arm_wake_irqs(void)
+{
+}
+
+static inline void device_wakeup_disarm_wake_irqs(void)
+{
+}
+
+#endif /* CONFIG_PM_SLEEP */
+
 /*
  * sysfs.c
  */
@@ -52,6 +92,14 @@ static inline void wakeup_sysfs_remove(struct device *dev) {}
 static inline int pm_qos_sysfs_add(struct device *dev) { return 0; }
 static inline void pm_qos_sysfs_remove(struct device *dev) {}
 
+static inline void dev_pm_arm_wake_irq(struct wake_irq *wirq)
+{
+}
+
+static inline void dev_pm_disarm_wake_irq(struct wake_irq *wirq)
+{
+}
+
 #endif
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 4ffe4a2add76..e1a10a03df8e 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -10,6 +10,7 @@
 #include <linux/sched.h>
 #include <linux/export.h>
 #include <linux/pm_runtime.h>
+#include <linux/pm_wakeirq.h>
 #include <trace/events/rpm.h>
 #include "power.h"
 
@@ -514,6 +515,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
 
 	callback = RPM_GET_CALLBACK(dev, runtime_suspend);
 
+	dev_pm_enable_wake_irq(dev);
 	retval = rpm_callback(callback, dev);
 	if (retval)
 		goto fail;
@@ -552,6 +554,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
 	return retval;
 
  fail:
+	dev_pm_disable_wake_irq(dev);
 	__update_runtime_status(dev, RPM_ACTIVE);
 	dev->power.deferred_resume = false;
 	wake_up_all(&dev->power.wait_queue);
@@ -734,10 +737,12 @@ static int rpm_resume(struct device *dev, int rpmflags)
 
 	callback = RPM_GET_CALLBACK(dev, runtime_resume);
 
+	dev_pm_disable_wake_irq(dev);
 	retval = rpm_callback(callback, dev);
 	if (retval) {
 		__update_runtime_status(dev, RPM_SUSPENDED);
 		pm_runtime_cancel_pending(dev);
+		dev_pm_enable_wake_irq(dev);
 	} else {
  no_callback:
 		__update_runtime_status(dev, RPM_ACTIVE);
diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c
new file mode 100644
index 000000000000..7470004ca810
--- /dev/null
+++ b/drivers/base/power/wakeirq.c
@@ -0,0 +1,273 @@
+/*
+ * wakeirq.c - Device wakeirq helper functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/pm_runtime.h>
+#include <linux/pm_wakeirq.h>
+
+#include "power.h"
+
+/**
+ * dev_pm_attach_wake_irq - Attach device interrupt as a wake IRQ
+ * @dev: Device entry
+ * @irq: Device wake-up capable interrupt
+ * @wirq: Wake irq specific data
+ *
+ * Internal function to attach either a device IO interrupt or a
+ * dedicated wake-up interrupt as a wake IRQ.
+ */
+static int dev_pm_attach_wake_irq(struct device *dev, int irq,
+				  struct wake_irq *wirq)
+{
+	unsigned long flags;
+	int err;
+
+	if (!dev || !wirq)
+		return -EINVAL;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+	if (dev_WARN_ONCE(dev, dev->power.wakeirq,
+			  "wake irq already initialized\n")) {
+		spin_unlock_irqrestore(&dev->power.lock, flags);
+		return -EEXIST;
+	}
+
+	dev->power.wakeirq = wirq;
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+
+	err = device_wakeup_attach_irq(dev, wirq);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+/**
+ * dev_pm_set_wake_irq - Attach device IO interrupt as wake IRQ
+ * @dev: Device entry
+ * @irq: Device IO interrupt
+ *
+ * Attach a device IO interrupt as a wake IRQ. The wake IRQ gets
+ * automatically configured for wake-up from suspend  based
+ * on the device specific sysfs wakeup entry. Typically called
+ * during driver probe after calling device_init_wakeup().
+ */
+int dev_pm_set_wake_irq(struct device *dev, int irq)
+{
+	struct wake_irq *wirq;
+	int err;
+
+	wirq = kzalloc(sizeof(*wirq), GFP_KERNEL);
+	if (!wirq)
+		return -ENOMEM;
+
+	wirq->dev = dev;
+	wirq->irq = irq;
+
+	err = dev_pm_attach_wake_irq(dev, irq, wirq);
+	if (err)
+		kfree(wirq);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(dev_pm_set_wake_irq);
+
+/**
+ * dev_pm_clear_wake_irq - Detach a device IO interrupt wake IRQ
+ * @dev: Device entry
+ *
+ * Detach a device wake IRQ and free resources.
+ *
+ * Note that it's OK for drivers to call this without calling
+ * dev_pm_set_wake_irq() as all the driver instances may not have
+ * a wake IRQ configured. This avoid adding wake IRQ specific
+ * checks into the drivers.
+ */
+void dev_pm_clear_wake_irq(struct device *dev)
+{
+	struct wake_irq *wirq = dev->power.wakeirq;
+	unsigned long flags;
+
+	if (!wirq)
+		return;
+
+	spin_lock_irqsave(&dev->power.lock, flags);
+	dev->power.wakeirq = NULL;
+	spin_unlock_irqrestore(&dev->power.lock, flags);
+
+	device_wakeup_detach_irq(dev);
+	if (wirq->dedicated_irq)
+		free_irq(wirq->irq, wirq);
+	kfree(wirq);
+}
+EXPORT_SYMBOL_GPL(dev_pm_clear_wake_irq);
+
+/**
+ * handle_threaded_wake_irq - Handler for dedicated wake-up interrupts
+ * @irq: Device specific dedicated wake-up interrupt
+ * @_wirq: Wake IRQ data
+ *
+ * Some devices have a separate wake-up interrupt in addition to the
+ * device IO interrupt. The wake-up interrupt signals that a device
+ * should be woken up from it's idle state. This handler uses device
+ * specific pm_runtime functions to wake the device, and then it's
+ * up to the device to do whatever it needs to. Note that as the
+ * device may need to restore context and start up regulators, we
+ * use a threaded IRQ.
+ *
+ * Also note that we are not resending the lost device interrupts.
+ * We assume that the wake-up interrupt just needs to wake-up the
+ * device, and then device's pm_runtime_resume() can deal with the
+ * situation.
+ */
+static irqreturn_t handle_threaded_wake_irq(int irq, void *_wirq)
+{
+	struct wake_irq *wirq = _wirq;
+	int res;
+
+	/* We don't want RPM_ASYNC or RPM_NOWAIT here */
+	res = pm_runtime_resume(wirq->dev);
+	if (res < 0)
+		dev_warn(wirq->dev,
+			 "wake IRQ with no resume: %i\n", res);
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * dev_pm_set_dedicated_wake_irq - Request a dedicated wake-up interrupt
+ * @dev: Device entry
+ * @irq: Device wake-up interrupt
+ *
+ * Unless your hardware has separate wake-up interrupts in addition
+ * to the device IO interrupts, you don't need this.
+ *
+ * Sets up a threaded interrupt handler for a device that has
+ * a dedicated wake-up interrupt in addition to the device IO
+ * interrupt.
+ *
+ * The interrupt starts disabled, and needs to be managed for
+ * the device by the bus code or the device driver using
+ * dev_pm_enable_wake_irq() and dev_pm_disable_wake_irq()
+ * functions.
+ */
+int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
+{
+	struct wake_irq *wirq;
+	int err;
+
+	wirq = kzalloc(sizeof(*wirq), GFP_KERNEL);
+	if (!wirq)
+		return -ENOMEM;
+
+	wirq->dev = dev;
+	wirq->irq = irq;
+	wirq->dedicated_irq = true;
+	irq_set_status_flags(irq, IRQ_NOAUTOEN);
+
+	/*
+	 * Consumer device may need to power up and restore state
+	 * so we use a threaded irq.
+	 */
+	err = request_threaded_irq(irq, NULL, handle_threaded_wake_irq,
+				   IRQF_ONESHOT, dev_name(dev), wirq);
+	if (err)
+		goto err_free;
+
+	err = dev_pm_attach_wake_irq(dev, irq, wirq);
+	if (err)
+		goto err_free_irq;
+
+	return err;
+
+err_free_irq:
+	free_irq(irq, wirq);
+err_free:
+	kfree(wirq);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq);
+
+/**
+ * dev_pm_enable_wake_irq - Enable device wake-up interrupt
+ * @dev: Device
+ *
+ * Called from the bus code or the device driver for
+ * runtime_suspend() to enable the wake-up interrupt while
+ * the device is running.
+ *
+ * Note that for runtime_suspend()) the wake-up interrupts
+ * should be unconditionally enabled unlike for suspend()
+ * that is conditional.
+ */
+void dev_pm_enable_wake_irq(struct device *dev)
+{
+	struct wake_irq *wirq = dev->power.wakeirq;
+
+	if (wirq && wirq->dedicated_irq)
+		enable_irq(wirq->irq);
+}
+EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq);
+
+/**
+ * dev_pm_disable_wake_irq - Disable device wake-up interrupt
+ * @dev: Device
+ *
+ * Called from the bus code or the device driver for
+ * runtime_resume() to disable the wake-up interrupt while
+ * the device is running.
+ */
+void dev_pm_disable_wake_irq(struct device *dev)
+{
+	struct wake_irq *wirq = dev->power.wakeirq;
+
+	if (wirq && wirq->dedicated_irq)
+		disable_irq_nosync(wirq->irq);
+}
+EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq);
+
+/**
+ * dev_pm_arm_wake_irq - Arm device wake-up
+ * @wirq: Device wake-up interrupt
+ *
+ * Sets up the wake-up event conditionally based on the
+ * device_may_wake().
+ */
+void dev_pm_arm_wake_irq(struct wake_irq *wirq)
+{
+	if (!wirq)
+		return;
+
+	if (device_may_wakeup(wirq->dev))
+		enable_irq_wake(wirq->irq);
+}
+
+/**
+ * dev_pm_disarm_wake_irq - Disarm device wake-up
+ * @wirq: Device wake-up interrupt
+ *
+ * Clears up the wake-up event conditionally based on the
+ * device_may_wake().
+ */
+void dev_pm_disarm_wake_irq(struct wake_irq *wirq)
+{
+	if (!wirq)
+		return;
+
+	if (device_may_wakeup(wirq->dev))
+		disable_irq_wake(wirq->irq);
+}
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 77262009f89d..7332ebc9cab0 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -14,6 +14,7 @@
 #include <linux/suspend.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include <linux/pm_wakeirq.h>
 #include <trace/events/power.h>
 
 #include "power.h"
@@ -238,6 +239,97 @@ int device_wakeup_enable(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(device_wakeup_enable);
 
+/**
+ * device_wakeup_attach_irq - Attach a wakeirq to a wakeup source
+ * @dev: Device to handle
+ * @wakeirq: Device specific wakeirq entry
+ *
+ * Attach a device wakeirq to the wakeup source so the device
+ * wake IRQ can be configured automatically for suspend and
+ * resume.
+ */
+int device_wakeup_attach_irq(struct device *dev,
+			     struct wake_irq *wakeirq)
+{
+	struct wakeup_source *ws;
+	int ret = 0;
+
+	spin_lock_irq(&dev->power.lock);
+	ws = dev->power.wakeup;
+	if (!ws) {
+		dev_err(dev, "forgot to call call device_init_wakeup?\n");
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	if (ws->wakeirq) {
+		ret = -EEXIST;
+		goto unlock;
+	}
+
+	ws->wakeirq = wakeirq;
+
+unlock:
+	spin_unlock_irq(&dev->power.lock);
+
+	return ret;
+}
+
+/**
+ * device_wakeup_detach_irq - Detach a wakeirq from a wakeup source
+ * @dev: Device to handle
+ *
+ * Removes a device wakeirq from the wakeup source.
+ */
+void device_wakeup_detach_irq(struct device *dev)
+{
+	struct wakeup_source *ws;
+
+	spin_lock_irq(&dev->power.lock);
+	ws = dev->power.wakeup;
+	if (!ws)
+		goto unlock;
+
+	ws->wakeirq = NULL;
+
+unlock:
+	spin_unlock_irq(&dev->power.lock);
+}
+
+/**
+ * device_wakeup_arm_wake_irqs(void)
+ *
+ * Itereates over the list of device wakeirqs to arm them.
+ */
+void device_wakeup_arm_wake_irqs(void)
+{
+	struct wakeup_source *ws;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+		if (ws->wakeirq)
+			dev_pm_arm_wake_irq(ws->wakeirq);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * device_wakeup_disarm_wake_irqs(void)
+ *
+ * Itereates over the list of device wakeirqs to disarm them.
+ */
+void device_wakeup_disarm_wake_irqs(void)
+{
+	struct wakeup_source *ws;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+		if (ws->wakeirq)
+			dev_pm_disarm_wake_irq(ws->wakeirq);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * device_wakeup_detach - Detach a device's wakeup source object from it.
  * @dev: Device to detach the wakeup source object from.
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 2d29c64f8fb1..1c4ed0cb7907 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -529,6 +529,7 @@ enum rpm_request {
 };
 
 struct wakeup_source;
+struct wake_irq;
 struct pm_domain_data;
 
 struct pm_subsys_data {
@@ -568,6 +569,7 @@ struct dev_pm_info {
 	unsigned long		timer_expires;
 	struct work_struct	work;
 	wait_queue_head_t	wait_queue;
+	struct wake_irq		*wakeirq;
 	atomic_t		usage_count;
 	atomic_t		child_count;
 	unsigned int		disable_depth:3;
diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h
new file mode 100644
index 000000000000..4046fa1b7d25
--- /dev/null
+++ b/include/linux/pm_wakeirq.h
@@ -0,0 +1,52 @@
+/*
+ * pm_wakeirq.h - Device wakeirq helper functions
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_PM_WAKEIRQ_H
+#define _LINUX_PM_WAKEIRQ_H
+
+#ifdef CONFIG_PM
+
+extern int dev_pm_set_wake_irq(struct device *dev, int irq);
+extern int dev_pm_set_dedicated_wake_irq(struct device *dev,
+					 int irq);
+extern void dev_pm_clear_wake_irq(struct device *dev);
+extern void dev_pm_enable_wake_irq(struct device *dev);
+extern void dev_pm_disable_wake_irq(struct device *dev);
+
+#else	/* !CONFIG_PM */
+
+static inline int dev_pm_set_wake_irq(struct device *dev, int irq)
+{
+	return 0;
+}
+
+static inline int dev_pm_set_dedicated__wake_irq(struct device *dev,
+						 int irq)
+{
+	return 0;
+}
+
+static inline void dev_pm_clear_wake_irq(struct device *dev)
+{
+}
+
+static inline void dev_pm_enable_wake_irq(struct device *dev)
+{
+}
+
+static inline void dev_pm_disable_wake_irq(struct device *dev)
+{
+}
+
+#endif	/* CONFIG_PM */
+#endif	/* _LINUX_PM_WAKEIRQ_H */
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index a0f70808d7f4..a3447932df1f 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -28,9 +28,17 @@
 
 #include <linux/types.h>
 
+struct wake_irq;
+
 /**
  * struct wakeup_source - Representation of wakeup sources
  *
+ * @name: Name of the wakeup source
+ * @entry: Wakeup source list entry
+ * @lock: Wakeup source lock
+ * @wakeirq: Optional device specific wakeirq
+ * @timer: Wakeup timer list
+ * @timer_expires: Wakeup timer expiration
  * @total_time: Total time this wakeup source has been active.
  * @max_time: Maximum time this wakeup source has been continuously active.
  * @last_time: Monotonic clock when the wakeup source's was touched last time.
@@ -47,6 +55,7 @@ struct wakeup_source {
 	const char 		*name;
 	struct list_head	entry;
 	spinlock_t		lock;
+	struct wake_irq		*wakeirq;
 	struct timer_list	timer;
 	unsigned long		timer_expires;
 	ktime_t total_time;
-- 
cgit v1.2.3


From ecc8617053e0a97272ef2eee138809f30080e84b Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Mon, 30 Mar 2015 16:20:03 -0700
Subject: module: add extra argument for parse_params() callback

This adds an extra argument onto parse_params() to be used
as a way to make the unused callback a bit more useful and
generic by allowing the caller to pass on a data structure
of its choice. An example use case is to allow us to easily
make module parameters for every module which we will do
next.

@ parse @
identifier name, args, params, num, level_min, level_max;
identifier unknown, param, val, doing;
type s16;
@@
 extern char *parse_args(const char *name,
 			 char *args,
 			 const struct kernel_param *params,
 			 unsigned num,
 			 s16 level_min,
 			 s16 level_max,
+			 void *arg,
 			 int (*unknown)(char *param, char *val,
					const char *doing
+					, void *arg
					));

@ parse_mod @
identifier name, args, params, num, level_min, level_max;
identifier unknown, param, val, doing;
type s16;
@@
 char *parse_args(const char *name,
 			 char *args,
 			 const struct kernel_param *params,
 			 unsigned num,
 			 s16 level_min,
 			 s16 level_max,
+			 void *arg,
 			 int (*unknown)(char *param, char *val,
					const char *doing
+					, void *arg
					))
{
	...
}

@ parse_args_found @
expression R, E1, E2, E3, E4, E5, E6;
identifier func;
@@

(
	R =
	parse_args(E1, E2, E3, E4, E5, E6,
+		   NULL,
		   func);
|
	R =
	parse_args(E1, E2, E3, E4, E5, E6,
+		   NULL,
		   &func);
|
	R =
	parse_args(E1, E2, E3, E4, E5, E6,
+		   NULL,
		   NULL);
|
	parse_args(E1, E2, E3, E4, E5, E6,
+		   NULL,
		   func);
|
	parse_args(E1, E2, E3, E4, E5, E6,
+		   NULL,
		   &func);
|
	parse_args(E1, E2, E3, E4, E5, E6,
+		   NULL,
		   NULL);
)

@ parse_args_unused depends on parse_args_found @
identifier parse_args_found.func;
@@

int func(char *param, char *val, const char *unused
+		 , void *arg
		 )
{
	...
}

@ mod_unused depends on parse_args_found @
identifier parse_args_found.func;
expression A1, A2, A3;
@@

-	func(A1, A2, A3);
+	func(A1, A2, A3, NULL);

Generated-by: Coccinelle SmPL
Cc: cocci@systeme.lip6.fr
Cc: Tejun Heo <tj@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Felipe Contreras <felipe.contreras@gmail.com>
Cc: Ewan Milne <emilne@redhat.com>
Cc: Jean Delvare <jdelvare@suse.de>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Tejun Heo <tj@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/powerpc/mm/hugetlbpage.c |  4 ++--
 include/linux/moduleparam.h   |  3 ++-
 init/main.c                   | 25 +++++++++++++++----------
 kernel/module.c               |  6 ++++--
 kernel/params.c               | 11 +++++++----
 lib/dynamic_debug.c           |  4 ++--
 6 files changed, 32 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0ce968b00b7c..d230158c9770 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -336,7 +336,7 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 unsigned long gpage_npages[MMU_PAGE_COUNT];
 
 static int __init do_gpage_early_setup(char *param, char *val,
-				       const char *unused)
+				       const char *unused, void *arg)
 {
 	static phys_addr_t size;
 	unsigned long npages;
@@ -385,7 +385,7 @@ void __init reserve_hugetlb_gpages(void)
 
 	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
 	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
-			&do_gpage_early_setup);
+			NULL, &do_gpage_early_setup);
 
 	/*
 	 * Walk gpage list in reverse, allocating larger page sizes first.
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 1c9effa25e26..13923709d30d 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -357,8 +357,9 @@ extern char *parse_args(const char *name,
 		      unsigned num,
 		      s16 level_min,
 		      s16 level_max,
+		      void *arg,
 		      int (*unknown)(char *param, char *val,
-			      const char *doing));
+				     const char *doing, void *arg));
 
 /* Called by module remove. */
 #ifdef CONFIG_SYSFS
diff --git a/init/main.c b/init/main.c
index 2115055faeac..edcb13440646 100644
--- a/init/main.c
+++ b/init/main.c
@@ -235,7 +235,8 @@ static int __init loglevel(char *str)
 early_param("loglevel", loglevel);
 
 /* Change NUL term back to "=", to make "param" the whole string. */
-static int __init repair_env_string(char *param, char *val, const char *unused)
+static int __init repair_env_string(char *param, char *val,
+				    const char *unused, void *arg)
 {
 	if (val) {
 		/* param=val or param="val"? */
@@ -252,14 +253,15 @@ static int __init repair_env_string(char *param, char *val, const char *unused)
 }
 
 /* Anything after -- gets handed straight to init. */
-static int __init set_init_arg(char *param, char *val, const char *unused)
+static int __init set_init_arg(char *param, char *val,
+			       const char *unused, void *arg)
 {
 	unsigned int i;
 
 	if (panic_later)
 		return 0;
 
-	repair_env_string(param, val, unused);
+	repair_env_string(param, val, unused, NULL);
 
 	for (i = 0; argv_init[i]; i++) {
 		if (i == MAX_INIT_ARGS) {
@@ -276,9 +278,10 @@ static int __init set_init_arg(char *param, char *val, const char *unused)
  * Unknown boot options get handed to init, unless they look like
  * unused parameters (modprobe will find them in /proc/cmdline).
  */
-static int __init unknown_bootoption(char *param, char *val, const char *unused)
+static int __init unknown_bootoption(char *param, char *val,
+				     const char *unused, void *arg)
 {
-	repair_env_string(param, val, unused);
+	repair_env_string(param, val, unused, NULL);
 
 	/* Handle obsolete-style parameters */
 	if (obsolete_checksetup(param))
@@ -410,7 +413,8 @@ static noinline void __init_refok rest_init(void)
 }
 
 /* Check for early params. */
-static int __init do_early_param(char *param, char *val, const char *unused)
+static int __init do_early_param(char *param, char *val,
+				 const char *unused, void *arg)
 {
 	const struct obs_kernel_param *p;
 
@@ -429,7 +433,8 @@ static int __init do_early_param(char *param, char *val, const char *unused)
 
 void __init parse_early_options(char *cmdline)
 {
-	parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param);
+	parse_args("early options", cmdline, NULL, 0, 0, 0, NULL,
+		   do_early_param);
 }
 
 /* Arch code calls this early on, or if not, just before other parsing. */
@@ -535,10 +540,10 @@ asmlinkage __visible void __init start_kernel(void)
 	after_dashes = parse_args("Booting kernel",
 				  static_command_line, __start___param,
 				  __stop___param - __start___param,
-				  -1, -1, &unknown_bootoption);
+				  -1, -1, NULL, &unknown_bootoption);
 	if (!IS_ERR_OR_NULL(after_dashes))
 		parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
-			   set_init_arg);
+			   NULL, set_init_arg);
 
 	jump_label_init();
 
@@ -847,7 +852,7 @@ static void __init do_initcall_level(int level)
 		   initcall_command_line, __start___param,
 		   __stop___param - __start___param,
 		   level, level,
-		   &repair_env_string);
+		   NULL, &repair_env_string);
 
 	for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
 		do_one_initcall(*fn);
diff --git a/kernel/module.c b/kernel/module.c
index 42a1d2afb217..24d1f31d02f2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3237,7 +3237,8 @@ out:
 	return err;
 }
 
-static int unknown_module_param_cb(char *param, char *val, const char *modname)
+static int unknown_module_param_cb(char *param, char *val, const char *modname,
+				   void *arg)
 {
 	/* Check for magic 'dyndbg' arg */
 	int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
@@ -3342,7 +3343,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	/* Module is ready to execute: parsing args may do that. */
 	after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-				  -32768, 32767, unknown_module_param_cb);
+				  -32768, 32767, NULL,
+				  unknown_module_param_cb);
 	if (IS_ERR(after_dashes)) {
 		err = PTR_ERR(after_dashes);
 		goto bug_cleanup;
diff --git a/kernel/params.c b/kernel/params.c
index a22d6a759b1a..30288c1e15dd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -100,8 +100,9 @@ static int parse_one(char *param,
 		     unsigned num_params,
 		     s16 min_level,
 		     s16 max_level,
+		     void *arg,
 		     int (*handle_unknown)(char *param, char *val,
-				     const char *doing))
+				     const char *doing, void *arg))
 {
 	unsigned int i;
 	int err;
@@ -128,7 +129,7 @@ static int parse_one(char *param,
 
 	if (handle_unknown) {
 		pr_debug("doing %s: %s='%s'\n", doing, param, val);
-		return handle_unknown(param, val, doing);
+		return handle_unknown(param, val, doing, arg);
 	}
 
 	pr_debug("Unknown argument '%s'\n", param);
@@ -194,7 +195,9 @@ char *parse_args(const char *doing,
 		 unsigned num,
 		 s16 min_level,
 		 s16 max_level,
-		 int (*unknown)(char *param, char *val, const char *doing))
+		 void *arg,
+		 int (*unknown)(char *param, char *val,
+				const char *doing, void *arg))
 {
 	char *param, *val;
 
@@ -214,7 +217,7 @@ char *parse_args(const char *doing,
 			return args;
 		irq_was_disabled = irqs_disabled();
 		ret = parse_one(param, val, doing, params, num,
-				min_level, max_level, unknown);
+				min_level, max_level, arg, unknown);
 		if (irq_was_disabled && !irqs_disabled())
 			pr_warn("%s: option '%s' enabled irq's!\n",
 				doing, param);
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index d8f3d3150603..e491e02eff54 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c
@@ -887,7 +887,7 @@ static int ddebug_dyndbg_param_cb(char *param, char *val,
 
 /* handle both dyndbg and $module.dyndbg params at boot */
 static int ddebug_dyndbg_boot_param_cb(char *param, char *val,
-				const char *unused)
+				const char *unused, void *arg)
 {
 	vpr_info("%s=\"%s\"\n", param, val);
 	return ddebug_dyndbg_param_cb(param, val, NULL, 0);
@@ -1028,7 +1028,7 @@ static int __init dynamic_debug_init(void)
 	 */
 	cmdline = kstrdup(saved_command_line, GFP_KERNEL);
 	parse_args("dyndbg params", cmdline, NULL,
-		   0, 0, 0, &ddebug_dyndbg_boot_param_cb);
+		   0, 0, 0, NULL, &ddebug_dyndbg_boot_param_cb);
 	kfree(cmdline);
 	return 0;
 
-- 
cgit v1.2.3


From 765230b5f084863183aa8adb3405ab3f32c0b16e Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Mon, 30 Mar 2015 16:20:04 -0700
Subject: driver-core: add asynchronous probing support for drivers

Some devices take a long time when initializing, and not all drivers are
suited to initialize their devices when they are open. For example,
input drivers need to interrogate their devices in order to publish
device's capabilities before userspace will open them. When such drivers
are compiled into kernel they may stall entire kernel initialization.

This change allows drivers request for their probe functions to be
called asynchronously during driver and device registration (manual
binding is still synchronous). Because async_schedule is used to perform
asynchronous calls module loading will still wait for the probing to
complete.

Note that the end goal is to make the probing asynchronous by default,
so annotating drivers with PROBE_PREFER_ASYNCHRONOUS is a temporary
measure that allows us to speed up boot process while we validating and
fixing the rest of the drivers and preparing userspace.

This change is based on earlier patch by "Luis R. Rodriguez"
<mcgrof@suse.com>

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/base.h    |   1 +
 drivers/base/bus.c     |  31 +++++++---
 drivers/base/dd.c      | 149 ++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/device.h |  28 ++++++++++
 4 files changed, 182 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 251c5d30f963..fd3347d9f153 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -116,6 +116,7 @@ static inline int driver_match_device(struct device_driver *drv,
 {
 	return drv->bus->match ? drv->bus->match(dev, drv) : 1;
 }
+extern bool driver_allows_async_probing(struct device_driver *drv);
 
 extern int driver_add_groups(struct device_driver *drv,
 			     const struct attribute_group **groups);
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 79bc203f51ef..500592486e88 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -10,6 +10,7 @@
  *
  */
 
+#include <linux/async.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/errno.h>
@@ -549,15 +550,12 @@ void bus_probe_device(struct device *dev)
 {
 	struct bus_type *bus = dev->bus;
 	struct subsys_interface *sif;
-	int ret;
 
 	if (!bus)
 		return;
 
-	if (bus->p->drivers_autoprobe) {
-		ret = device_attach(dev);
-		WARN_ON(ret < 0);
-	}
+	if (bus->p->drivers_autoprobe)
+		device_initial_probe(dev);
 
 	mutex_lock(&bus->p->mutex);
 	list_for_each_entry(sif, &bus->p->interfaces, node)
@@ -659,6 +657,17 @@ static ssize_t uevent_store(struct device_driver *drv, const char *buf,
 }
 static DRIVER_ATTR_WO(uevent);
 
+static void driver_attach_async(void *_drv, async_cookie_t cookie)
+{
+	struct device_driver *drv = _drv;
+	int ret;
+
+	ret = driver_attach(drv);
+
+	pr_debug("bus: '%s': driver %s async attach completed: %d\n",
+		 drv->bus->name, drv->name, ret);
+}
+
 /**
  * bus_add_driver - Add a driver to the bus.
  * @drv: driver.
@@ -691,9 +700,15 @@ int bus_add_driver(struct device_driver *drv)
 
 	klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers);
 	if (drv->bus->p->drivers_autoprobe) {
-		error = driver_attach(drv);
-		if (error)
-			goto out_unregister;
+		if (driver_allows_async_probing(drv)) {
+			pr_debug("bus: '%s': probing driver %s asynchronously\n",
+				drv->bus->name, drv->name);
+			async_schedule(driver_attach_async, drv);
+		} else {
+			error = driver_attach(drv);
+			if (error)
+				goto out_unregister;
+		}
 	}
 	module_add_driver(drv->owner, drv);
 
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index e843fdbe4925..2ad33b21888c 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -417,31 +417,95 @@ int driver_probe_device(struct device_driver *drv, struct device *dev)
 	return ret;
 }
 
-static int __device_attach(struct device_driver *drv, void *data)
+bool driver_allows_async_probing(struct device_driver *drv)
 {
-	struct device *dev = data;
+	return drv->probe_type == PROBE_PREFER_ASYNCHRONOUS;
+}
+
+struct device_attach_data {
+	struct device *dev;
+
+	/*
+	 * Indicates whether we are are considering asynchronous probing or
+	 * not. Only initial binding after device or driver registration
+	 * (including deferral processing) may be done asynchronously, the
+	 * rest is always synchronous, as we expect it is being done by
+	 * request from userspace.
+	 */
+	bool check_async;
+
+	/*
+	 * Indicates if we are binding synchronous or asynchronous drivers.
+	 * When asynchronous probing is enabled we'll execute 2 passes
+	 * over drivers: first pass doing synchronous probing and second
+	 * doing asynchronous probing (if synchronous did not succeed -
+	 * most likely because there was no driver requiring synchronous
+	 * probing - and we found asynchronous driver during first pass).
+	 * The 2 passes are done because we can't shoot asynchronous
+	 * probe for given device and driver from bus_for_each_drv() since
+	 * driver pointer is not guaranteed to stay valid once
+	 * bus_for_each_drv() iterates to the next driver on the bus.
+	 */
+	bool want_async;
+
+	/*
+	 * We'll set have_async to 'true' if, while scanning for matching
+	 * driver, we'll encounter one that requests asynchronous probing.
+	 */
+	bool have_async;
+};
+
+static int __device_attach_driver(struct device_driver *drv, void *_data)
+{
+	struct device_attach_data *data = _data;
+	struct device *dev = data->dev;
+	bool async_allowed;
+
+	/*
+	 * Check if device has already been claimed. This may
+	 * happen with driver loading, device discovery/registration,
+	 * and deferred probe processing happens all at once with
+	 * multiple threads.
+	 */
+	if (dev->driver)
+		return -EBUSY;
 
 	if (!driver_match_device(drv, dev))
 		return 0;
 
+	async_allowed = driver_allows_async_probing(drv);
+
+	if (async_allowed)
+		data->have_async = true;
+
+	if (data->check_async && async_allowed != data->want_async)
+		return 0;
+
 	return driver_probe_device(drv, dev);
 }
 
-/**
- * device_attach - try to attach device to a driver.
- * @dev: device.
- *
- * Walk the list of drivers that the bus has and call
- * driver_probe_device() for each pair. If a compatible
- * pair is found, break out and return.
- *
- * Returns 1 if the device was bound to a driver;
- * 0 if no matching driver was found;
- * -ENODEV if the device is not registered.
- *
- * When called for a USB interface, @dev->parent lock must be held.
- */
-int device_attach(struct device *dev)
+static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
+{
+	struct device *dev = _dev;
+	struct device_attach_data data = {
+		.dev		= dev,
+		.check_async	= true,
+		.want_async	= true,
+	};
+
+	device_lock(dev);
+
+	bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver);
+	dev_dbg(dev, "async probe completed\n");
+
+	pm_request_idle(dev);
+
+	device_unlock(dev);
+
+	put_device(dev);
+}
+
+int __device_attach(struct device *dev, bool allow_async)
 {
 	int ret = 0;
 
@@ -459,15 +523,59 @@ int device_attach(struct device *dev)
 			ret = 0;
 		}
 	} else {
-		ret = bus_for_each_drv(dev->bus, NULL, dev, __device_attach);
-		pm_request_idle(dev);
+		struct device_attach_data data = {
+			.dev = dev,
+			.check_async = allow_async,
+			.want_async = false,
+		};
+
+		ret = bus_for_each_drv(dev->bus, NULL, &data,
+					__device_attach_driver);
+		if (!ret && allow_async && data.have_async) {
+			/*
+			 * If we could not find appropriate driver
+			 * synchronously and we are allowed to do
+			 * async probes and there are drivers that
+			 * want to probe asynchronously, we'll
+			 * try them.
+			 */
+			dev_dbg(dev, "scheduling asynchronous probe\n");
+			get_device(dev);
+			async_schedule(__device_attach_async_helper, dev);
+		} else {
+			pm_request_idle(dev);
+		}
 	}
 out_unlock:
 	device_unlock(dev);
 	return ret;
 }
+
+/**
+ * device_attach - try to attach device to a driver.
+ * @dev: device.
+ *
+ * Walk the list of drivers that the bus has and call
+ * driver_probe_device() for each pair. If a compatible
+ * pair is found, break out and return.
+ *
+ * Returns 1 if the device was bound to a driver;
+ * 0 if no matching driver was found;
+ * -ENODEV if the device is not registered.
+ *
+ * When called for a USB interface, @dev->parent lock must be held.
+ */
+int device_attach(struct device *dev)
+{
+	return __device_attach(dev, false);
+}
 EXPORT_SYMBOL_GPL(device_attach);
 
+void device_initial_probe(struct device *dev)
+{
+	__device_attach(dev, true);
+}
+
 static int __driver_attach(struct device *dev, void *data)
 {
 	struct device_driver *drv = data;
@@ -522,6 +630,9 @@ static void __device_release_driver(struct device *dev)
 
 	drv = dev->driver;
 	if (drv) {
+		if (driver_allows_async_probing(drv))
+			async_synchronize_full();
+
 		pm_runtime_get_sync(dev);
 
 		driver_sysfs_remove(dev);
diff --git a/include/linux/device.h b/include/linux/device.h
index 6558af90c8fe..7857b46c548b 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -195,6 +195,31 @@ extern int bus_unregister_notifier(struct bus_type *bus,
 extern struct kset *bus_get_kset(struct bus_type *bus);
 extern struct klist *bus_get_device_klist(struct bus_type *bus);
 
+/**
+ * enum probe_type - device driver probe type to try
+ *	Device drivers may opt in for special handling of their
+ *	respective probe routines. This tells the core what to
+ *	expect and prefer.
+ *
+ * @PROBE_SYNCHRONOUS: Default. Drivers expect their probe routines
+ *	to run synchronously with driver and device registration
+ *	(with the exception of -EPROBE_DEFER handling - re-probing
+ *	always ends up being done asynchronously).
+ * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which
+ *	probing order is not essential for booting the system may
+ *	opt into executing their probes asynchronously.
+ *
+ * Note that the end goal is to switch the kernel to use asynchronous
+ * probing by default, so annotating drivers with
+ * %PROBE_PREFER_ASYNCHRONOUS is a temporary measure that allows us
+ * to speed up boot process while we are validating the rest of the
+ * drivers.
+ */
+enum probe_type {
+	PROBE_SYNCHRONOUS,
+	PROBE_PREFER_ASYNCHRONOUS,
+};
+
 /**
  * struct device_driver - The basic device driver structure
  * @name:	Name of the device driver.
@@ -202,6 +227,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus);
  * @owner:	The module owner.
  * @mod_name:	Used for built-in modules.
  * @suppress_bind_attrs: Disables bind/unbind via sysfs.
+ * @probe_type:	Type of the probe (synchronous or asynchronous) to use.
  * @of_match_table: The open firmware table.
  * @acpi_match_table: The ACPI match table.
  * @probe:	Called to query the existence of a specific device,
@@ -235,6 +261,7 @@ struct device_driver {
 	const char		*mod_name;	/* used for built-in modules */
 
 	bool suppress_bind_attrs;	/* disables bind/unbind via sysfs */
+	enum probe_type probe_type;
 
 	const struct of_device_id	*of_match_table;
 	const struct acpi_device_id	*acpi_match_table;
@@ -975,6 +1002,7 @@ extern int __must_check device_bind_driver(struct device *dev);
 extern void device_release_driver(struct device *dev);
 extern int  __must_check device_attach(struct device *dev);
 extern int __must_check driver_attach(struct device_driver *drv);
+extern void device_initial_probe(struct device *dev);
 extern int __must_check device_reprobe(struct device *dev);
 
 /*
-- 
cgit v1.2.3


From f2411da746985e60d4d087f3a43e271c61785927 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Mon, 30 Mar 2015 16:20:05 -0700
Subject: driver-core: add driver module asynchronous probe support

Some init systems may wish to express the desire to have device drivers
run their probe() code asynchronously. This implements support for this
and allows userspace to request async probe as a preference through a
generic shared device driver module parameter, async_probe.

Implementation for async probe is supported through a module parameter
given that since synchronous probe has been prevalent for years some
userspace might exist which relies on the fact that the device driver
will probe synchronously and the assumption that devices it provides
will be immediately available after this.

Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/kernel-parameters.txt |  3 +++
 drivers/base/dd.c                   |  8 +++++++-
 include/linux/device.h              |  8 +++++---
 include/linux/module.h              |  2 ++
 kernel/module.c                     | 12 ++++++++++--
 5 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 61ab1628a057..e89fdd5aa605 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -943,6 +943,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			auto	selects the default scheme, which automatically
 				enables eagerfpu restore for xsaveopt.
 
+	module.async_probe [KNL]
+			Enable asynchronous probe on this module.
+
 	early_ioremap_debug [KNL]
 			Enable debug messages in early_ioremap support. This
 			is useful for tracking down temporary early mappings
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 2ad33b21888c..7a2fa5dcead7 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -419,7 +419,13 @@ int driver_probe_device(struct device_driver *drv, struct device *dev)
 
 bool driver_allows_async_probing(struct device_driver *drv)
 {
-	return drv->probe_type == PROBE_PREFER_ASYNCHRONOUS;
+	if (drv->probe_type == PROBE_PREFER_ASYNCHRONOUS)
+		return true;
+
+	if (drv->owner && drv->owner->async_probe_requested)
+		return true;
+
+	return false;
 }
 
 struct device_attach_data {
diff --git a/include/linux/device.h b/include/linux/device.h
index 7857b46c548b..77b7cd9e5467 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -201,10 +201,12 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus);
  *	respective probe routines. This tells the core what to
  *	expect and prefer.
  *
- * @PROBE_SYNCHRONOUS: Default. Drivers expect their probe routines
+ * @PROBE_DEFAULT_STRATEGY: Drivers expect their probe routines
  *	to run synchronously with driver and device registration
  *	(with the exception of -EPROBE_DEFER handling - re-probing
- *	always ends up being done asynchronously).
+ *	always ends up being done asynchronously) unless user
+ *	explicitly requested asynchronous probing via module
+ *	parameter.
  * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which
  *	probing order is not essential for booting the system may
  *	opt into executing their probes asynchronously.
@@ -216,7 +218,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus);
  * drivers.
  */
 enum probe_type {
-	PROBE_SYNCHRONOUS,
+	PROBE_DEFAULT_STRATEGY,
 	PROBE_PREFER_ASYNCHRONOUS,
 };
 
diff --git a/include/linux/module.h b/include/linux/module.h
index c883b86ea964..f46a47d3c0dc 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -257,6 +257,8 @@ struct module {
 	bool sig_ok;
 #endif
 
+	bool async_probe_requested;
+
 	/* symbols that will be GPL-only in the near future. */
 	const struct kernel_symbol *gpl_future_syms;
 	const unsigned long *gpl_future_crcs;
diff --git a/kernel/module.c b/kernel/module.c
index 24d1f31d02f2..ea941bc327d5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3107,7 +3107,7 @@ static noinline int do_init_module(struct module *mod)
 	 *
 	 * http://thread.gmane.org/gmane.linux.kernel/1420814
 	 */
-	if (current->flags & PF_USED_ASYNC)
+	if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
 		async_synchronize_full();
 
 	mutex_lock(&module_mutex);
@@ -3240,8 +3240,16 @@ out:
 static int unknown_module_param_cb(char *param, char *val, const char *modname,
 				   void *arg)
 {
+	struct module *mod = arg;
+	int ret;
+
+	if (strcmp(param, "async_probe") == 0) {
+		mod->async_probe_requested = true;
+		return 0;
+	}
+
 	/* Check for magic 'dyndbg' arg */
-	int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
+	ret = ddebug_dyndbg_module_param_cb(param, val, modname);
 	if (ret != 0)
 		pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);
 	return 0;
-- 
cgit v1.2.3


From d173a137c5bd95ee29d02705e5fa8890ef149718 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Mon, 30 Mar 2015 16:20:06 -0700
Subject: driver-core: enable drivers to opt-out of async probe

There are drivers that can not be probed asynchronously. One such group
is platform drivers registered with platform_driver_probe(), which
expects driver's probe routine be discarded after the driver has been
registered and initial binding attempt executed. Also
platform_driver_probe() an error when no devices were bound to the
driver, allowing failing to load such driver module altogether.

Other drivers do not work well with asynchronous probing because of
driver bug or not optimal driver organization.

To allow using such drivers even when user requests asynchronous probing
as default boot strategy, let's allow them to opt out.

Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/dd.c      | 14 ++++++++++----
 include/linux/device.h | 13 +++++++------
 2 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 7a2fa5dcead7..39292535c74e 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -419,13 +419,19 @@ int driver_probe_device(struct device_driver *drv, struct device *dev)
 
 bool driver_allows_async_probing(struct device_driver *drv)
 {
-	if (drv->probe_type == PROBE_PREFER_ASYNCHRONOUS)
+	switch (drv->probe_type) {
+	case PROBE_PREFER_ASYNCHRONOUS:
 		return true;
 
-	if (drv->owner && drv->owner->async_probe_requested)
-		return true;
+	case PROBE_FORCE_SYNCHRONOUS:
+		return false;
+
+	default:
+		if (drv->owner && drv->owner->async_probe_requested)
+			return true;
 
-	return false;
+		return false;
+	}
 }
 
 struct device_attach_data {
diff --git a/include/linux/device.h b/include/linux/device.h
index 77b7cd9e5467..00ac57c26615 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -201,15 +201,15 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus);
  *	respective probe routines. This tells the core what to
  *	expect and prefer.
  *
- * @PROBE_DEFAULT_STRATEGY: Drivers expect their probe routines
- *	to run synchronously with driver and device registration
- *	(with the exception of -EPROBE_DEFER handling - re-probing
- *	always ends up being done asynchronously) unless user
- *	explicitly requested asynchronous probing via module
- *	parameter.
+ * @PROBE_DEFAULT_STRATEGY: Used by drivers that work equally well
+ *	whether probed synchronously or asynchronously.
  * @PROBE_PREFER_ASYNCHRONOUS: Drivers for "slow" devices which
  *	probing order is not essential for booting the system may
  *	opt into executing their probes asynchronously.
+ * @PROBE_FORCE_SYNCHRONOUS: Use this to annotate drivers that need
+ *	their probe routines to run synchronously with driver and
+ *	device registration (with the exception of -EPROBE_DEFER
+ *	handling - re-probing always ends up being done asynchronously).
  *
  * Note that the end goal is to switch the kernel to use asynchronous
  * probing by default, so annotating drivers with
@@ -220,6 +220,7 @@ extern struct klist *bus_get_device_klist(struct bus_type *bus);
 enum probe_type {
 	PROBE_DEFAULT_STRATEGY,
 	PROBE_PREFER_ASYNCHRONOUS,
+	PROBE_FORCE_SYNCHRONOUS,
 };
 
 /**
-- 
cgit v1.2.3


From ec0ccc16a09fc32f7142ef3ddf1c2276fbbb35d0 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Mon, 30 Mar 2015 16:20:09 -0700
Subject: module: add core_param_unsafe

Similarly to module_param_unsafe(), add the helper to be used by core
code wishing to expose unsafe debugging or testing parameters that taint
the kernel when set.

Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/moduleparam.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 13923709d30d..6480dcaca275 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -310,6 +310,15 @@ static inline void __kernel_param_unlock(void)
 #define core_param(name, var, type, perm)				\
 	param_check_##type(name, &(var));				\
 	__module_param_call("", name, &param_ops_##type, &var, perm, -1, 0)
+
+/**
+ * core_param_unsafe - same as core_param but taints kernel
+ */
+#define core_param_unsafe(name, var, type, perm)		\
+	param_check_##type(name, &(var));				\
+	__module_param_call("", name, &param_ops_##type, &var, perm,	\
+			    -1, KERNEL_PARAM_FL_UNSAFE)
+
 #endif /* !MODULE */
 
 /**
-- 
cgit v1.2.3


From 985eba3aba77a997f65109e1418837b1d8b8512a Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Wed, 3 Dec 2014 16:46:14 +0100
Subject: mfd: syscon: Add Atmel MC (Memory Controller) registers definition

The at91rm9200 SoC embeds a Memory Controller block which is used to
configure several aspects of the platform:
- AHB/APB Bus behavior
- SDRAM Controller
- EBI (External Bus Interface) and SMC (Static Memory Controller) config

Those registers might be accessed by different drivers, hence we need to
define it as a syscon device.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 include/linux/mfd/syscon/atmel-mc.h | 144 ++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 include/linux/mfd/syscon/atmel-mc.h

(limited to 'include/linux')

diff --git a/include/linux/mfd/syscon/atmel-mc.h b/include/linux/mfd/syscon/atmel-mc.h
new file mode 100644
index 000000000000..afd9b8f1e363
--- /dev/null
+++ b/include/linux/mfd/syscon/atmel-mc.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2005 Ivan Kokshaysky
+ * Copyright (C) SAN People
+ *
+ * Memory Controllers (MC, EBI, SMC, SDRAMC, BFC) - System peripherals
+ * registers.
+ * Based on AT91RM9200 datasheet revision E.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _LINUX_MFD_SYSCON_ATMEL_MC_H_
+#define _LINUX_MFD_SYSCON_ATMEL_MC_H_
+
+/* Memory Controller */
+#define AT91_MC_RCR			0x00
+#define AT91_MC_RCB			BIT(0)
+
+#define AT91_MC_ASR			0x04
+#define AT91_MC_UNADD			BIT(0)
+#define AT91_MC_MISADD			BIT(1)
+#define AT91_MC_ABTSZ			GENMASK(9, 8)
+#define AT91_MC_ABTSZ_BYTE		(0 << 8)
+#define AT91_MC_ABTSZ_HALFWORD		(1 << 8)
+#define AT91_MC_ABTSZ_WORD		(2 << 8)
+#define AT91_MC_ABTTYP			GENMASK(11, 10)
+#define AT91_MC_ABTTYP_DATAREAD		(0 << 10)
+#define AT91_MC_ABTTYP_DATAWRITE	(1 << 10)
+#define AT91_MC_ABTTYP_FETCH		(2 << 10)
+#define AT91_MC_MST(n)			BIT(16 + (n))
+#define AT91_MC_SVMST(n)		BIT(24 + (n))
+
+#define AT91_MC_AASR			0x08
+
+#define AT91_MC_MPR			0x0c
+#define AT91_MPR_MSTP(n)		GENMASK(2 + ((x) * 4), ((x) * 4))
+
+/* External Bus Interface (EBI) registers */
+#define AT91_MC_EBI_CSA			0x60
+#define AT91_MC_EBI_CS(n)		BIT(x)
+#define AT91_MC_EBI_NUM_CS		8
+
+#define AT91_MC_EBI_CFGR		0x64
+#define AT91_MC_EBI_DBPUC		BIT(0)
+
+/* Static Memory Controller (SMC) registers */
+#define AT91_MC_SMC_CSR(n)		(0x70 + ((n) * 4))
+#define AT91_MC_SMC_NWS			GENMASK(6, 0)
+#define AT91_MC_SMC_NWS_(x)		((x) << 0)
+#define AT91_MC_SMC_WSEN		BIT(7)
+#define AT91_MC_SMC_TDF			GENMASK(11, 8)
+#define AT91_MC_SMC_TDF_(x)		((x) << 8)
+#define AT91_MC_SMC_TDF_MAX		0xf
+#define AT91_MC_SMC_BAT			BIT(12)
+#define AT91_MC_SMC_DBW			GENMASK(14, 13)
+#define AT91_MC_SMC_DBW_16		(1 << 13)
+#define AT91_MC_SMC_DBW_8		(2 << 13)
+#define AT91_MC_SMC_DPR			BIT(15)
+#define AT91_MC_SMC_ACSS		GENMASK(17, 16)
+#define AT91_MC_SMC_ACSS_(x)		((x) << 16)
+#define AT91_MC_SMC_ACSS_MAX		3
+#define AT91_MC_SMC_RWSETUP		GENMASK(26, 24)
+#define AT91_MC_SMC_RWSETUP_(x)		((x) << 24)
+#define AT91_MC_SMC_RWHOLD		GENMASK(30, 28)
+#define AT91_MC_SMC_RWHOLD_(x)		((x) << 28)
+#define AT91_MC_SMC_RWHOLDSETUP_MAX	7
+
+/* SDRAM Controller registers */
+#define AT91_MC_SDRAMC_MR		0x90
+#define AT91_MC_SDRAMC_MODE		GENMASK(3, 0)
+#define AT91_MC_SDRAMC_MODE_NORMAL	(0 << 0)
+#define AT91_MC_SDRAMC_MODE_NOP		(1 << 0)
+#define AT91_MC_SDRAMC_MODE_PRECHARGE	(2 << 0)
+#define AT91_MC_SDRAMC_MODE_LMR		(3 << 0)
+#define AT91_MC_SDRAMC_MODE_REFRESH	(4 << 0)
+#define AT91_MC_SDRAMC_DBW_16		BIT(4)
+
+#define AT91_MC_SDRAMC_TR		0x94
+#define AT91_MC_SDRAMC_COUNT		GENMASK(11, 0)
+
+#define AT91_MC_SDRAMC_CR		0x98
+#define AT91_MC_SDRAMC_NC		GENMASK(1, 0)
+#define AT91_MC_SDRAMC_NC_8		(0 << 0)
+#define AT91_MC_SDRAMC_NC_9		(1 << 0)
+#define AT91_MC_SDRAMC_NC_10		(2 << 0)
+#define AT91_MC_SDRAMC_NC_11		(3 << 0)
+#define AT91_MC_SDRAMC_NR		GENMASK(3, 2)
+#define AT91_MC_SDRAMC_NR_11		(0 << 2)
+#define AT91_MC_SDRAMC_NR_12		(1 << 2)
+#define AT91_MC_SDRAMC_NR_13		(2 << 2)
+#define AT91_MC_SDRAMC_NB		BIT(4)
+#define AT91_MC_SDRAMC_NB_2		(0 << 4)
+#define AT91_MC_SDRAMC_NB_4		(1 << 4)
+#define AT91_MC_SDRAMC_CAS		GENMASK(6, 5)
+#define AT91_MC_SDRAMC_CAS_2		(2 << 5)
+#define AT91_MC_SDRAMC_TWR		GENMASK(10,  7)
+#define AT91_MC_SDRAMC_TRC		GENMASK(14, 11)
+#define AT91_MC_SDRAMC_TRP		GENMASK(18, 15)
+#define AT91_MC_SDRAMC_TRCD		GENMASK(22, 19)
+#define AT91_MC_SDRAMC_TRAS		GENMASK(26, 23)
+#define AT91_MC_SDRAMC_TXSR		GENMASK(30, 27)
+
+#define AT91_MC_SDRAMC_SRR		0x9c
+#define AT91_MC_SDRAMC_SRCB		BIT(0)
+
+#define AT91_MC_SDRAMC_LPR		0xa0
+#define AT91_MC_SDRAMC_LPCB		BIT(0)
+
+#define AT91_MC_SDRAMC_IER		0xa4
+#define AT91_MC_SDRAMC_IDR		0xa8
+#define AT91_MC_SDRAMC_IMR		0xac
+#define AT91_MC_SDRAMC_ISR		0xb0
+#define AT91_MC_SDRAMC_RES		BIT(0)
+
+/* Burst Flash Controller register */
+#define AT91_MC_BFC_MR			0xc0
+#define AT91_MC_BFC_BFCOM		GENMASK(1, 0)
+#define AT91_MC_BFC_BFCOM_DISABLED	(0 << 0)
+#define AT91_MC_BFC_BFCOM_ASYNC		(1 << 0)
+#define AT91_MC_BFC_BFCOM_BURST		(2 << 0)
+#define AT91_MC_BFC_BFCC		GENMASK(3, 2)
+#define AT91_MC_BFC_BFCC_MCK		(1 << 2)
+#define AT91_MC_BFC_BFCC_DIV2		(2 << 2)
+#define AT91_MC_BFC_BFCC_DIV4		(3 << 2)
+#define AT91_MC_BFC_AVL			GENMASK(7,  4)
+#define AT91_MC_BFC_PAGES		GENMASK(10, 8)
+#define AT91_MC_BFC_PAGES_NO_PAGE	(0 << 8)
+#define AT91_MC_BFC_PAGES_16		(1 << 8)
+#define AT91_MC_BFC_PAGES_32		(2 << 8)
+#define AT91_MC_BFC_PAGES_64		(3 << 8)
+#define AT91_MC_BFC_PAGES_128		(4 << 8)
+#define AT91_MC_BFC_PAGES_256		(5 << 8)
+#define AT91_MC_BFC_PAGES_512		(6 << 8)
+#define AT91_MC_BFC_PAGES_1024		(7 << 8)
+#define AT91_MC_BFC_OEL			GENMASK(13, 12)
+#define AT91_MC_BFC_BAAEN		BIT(16)
+#define AT91_MC_BFC_BFOEH		BIT(17)
+#define AT91_MC_BFC_MUXEN		BIT(18)
+#define AT91_MC_BFC_RDYEN		BIT(19)
+
+#endif /* _LINUX_MFD_SYSCON_ATMEL_MC_H_ */
-- 
cgit v1.2.3


From be32417796c2b8a83fe4cbece83bea96ab9e378f Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Wed, 6 May 2015 12:26:22 +0800
Subject: block: export blkdev_reread_part() and __blkdev_reread_part()

This patch exports blkdev_reread_part() for block drivers, also
introduce __blkdev_reread_part().

For some drivers, such as loop, reread of partitions can be run
from the release path, and bd_mutex may already be held prior to
calling ioctl_by_bdev(bdev, BLKRRPART, 0), so introduce
__blkdev_reread_part for use in such cases.

CC: Christoph Hellwig <hch@lst.de>
CC: Jens Axboe <axboe@kernel.dk>
CC: Tejun Heo <tj@kernel.org>
CC: Alexander Viro <viro@zeniv.linux.org.uk>
CC: Markus Pargmann <mpa@pengutronix.de>
CC: Stefan Weinhuber <wein@de.ibm.com>
CC: Stefan Haberland <stefan.haberland@de.ibm.com>
CC: Sebastian Ott <sebott@linux.vnet.ibm.com>
CC: Fabian Frederick <fabf@skynet.be>
CC: Ming Lei <ming.lei@canonical.com>
CC: David Herrmann <dh.herrmann@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: nbd-general@lists.sourceforge.net
CC: linux-s390@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/ioctl.c      | 28 +++++++++++++++++++++++++---
 include/linux/fs.h |  3 +++
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befde2aca..203cb4aeea8b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -150,21 +150,43 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	}
 }
 
-static int blkdev_reread_part(struct block_device *bdev)
+/*
+ * This is an exported API for the block driver, and will not
+ * acquire bd_mutex. This API should be used in case that
+ * caller has held bd_mutex already.
+ */
+int __blkdev_reread_part(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	int res;
 
 	if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
+
+	lockdep_assert_held(&bdev->bd_mutex);
+
+	return rescan_partitions(disk, bdev);
+}
+EXPORT_SYMBOL(__blkdev_reread_part);
+
+/*
+ * This is an exported API for the block driver, and will
+ * try to acquire bd_mutex. If bd_mutex has been held already
+ * in current context, please call __blkdev_reread_part().
+ */
+int blkdev_reread_part(struct block_device *bdev)
+{
+	int res;
+
 	if (!mutex_trylock(&bdev->bd_mutex))
 		return -EBUSY;
-	res = rescan_partitions(disk, bdev);
+	res = __blkdev_reread_part(bdev);
 	mutex_unlock(&bdev->bd_mutex);
+
 	return res;
 }
+EXPORT_SYMBOL(blkdev_reread_part);
 
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len, int secure)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 35ec87e490b1..1ef63900243c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2279,6 +2279,9 @@ extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
 extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
 					      void *holder);
 extern void blkdev_put(struct block_device *bdev, fmode_t mode);
+extern int __blkdev_reread_part(struct block_device *bdev);
+extern int blkdev_reread_part(struct block_device *bdev);
+
 #ifdef CONFIG_SYSFS
 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
 extern void bd_unlink_disk_holder(struct block_device *bdev,
-- 
cgit v1.2.3


From 6e844b035360294636271f4f0fd4a5a685e03c06 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 17 Apr 2015 10:45:55 -0700
Subject: ARM: BCM63xx: Add Broadcom BCM63xx PMB controller helpers

This patch adds both common register definitions and helper functions
used to issue read/write commands to the Broadcom BCM63xx PMB controller
which is used to power on and release from reset internal on-chip
peripherals such as the integrated Ethernet switch, AHCI, USB, as well
as the secondary CPU core.

This is going to be utilized by the BCM63138 SMP code, as well as by the
BCM63138 reset controller later.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 include/linux/reset/bcm63xx_pmb.h | 88 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 include/linux/reset/bcm63xx_pmb.h

(limited to 'include/linux')

diff --git a/include/linux/reset/bcm63xx_pmb.h b/include/linux/reset/bcm63xx_pmb.h
new file mode 100644
index 000000000000..bb4af7b5eb36
--- /dev/null
+++ b/include/linux/reset/bcm63xx_pmb.h
@@ -0,0 +1,88 @@
+/*
+ * Broadcom BCM63xx Processor Monitor Bus shared routines (SMP and reset)
+ *
+ * Copyright (C) 2015, Broadcom Corporation
+ * Author: Florian Fainelli <f.fainelli@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef __BCM63XX_PMB_H
+#define __BCM63XX_PMB_H
+
+#include <linux/io.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+
+/* PMB Master controller register */
+#define PMB_CTRL		0x00
+#define  PMC_PMBM_START		(1 << 31)
+#define  PMC_PMBM_TIMEOUT	(1 << 30)
+#define  PMC_PMBM_SLAVE_ERR	(1 << 29)
+#define  PMC_PMBM_BUSY		(1 << 28)
+#define  PMC_PMBM_READ		(0 << 20)
+#define  PMC_PMBM_WRITE		(1 << 20)
+#define PMB_WR_DATA		0x04
+#define PMB_TIMEOUT		0x08
+#define PMB_RD_DATA		0x0C
+
+#define PMB_BUS_ID_SHIFT	8
+
+/* Perform the low-level PMB master operation, shared between reads and
+ * writes.
+ */
+static inline int __bpcm_do_op(void __iomem *master, unsigned int addr,
+			       u32 off, u32 op)
+{
+	unsigned int timeout = 1000;
+	u32 cmd;
+
+	cmd = (PMC_PMBM_START | op | (addr & 0xff) << 12 | off);
+	writel(cmd, master + PMB_CTRL);
+	do {
+		cmd = readl(master + PMB_CTRL);
+		if (!(cmd & PMC_PMBM_START))
+			return 0;
+
+		if (cmd & PMC_PMBM_SLAVE_ERR)
+			return -EIO;
+
+		if (cmd & PMC_PMBM_TIMEOUT)
+			return -ETIMEDOUT;
+
+		udelay(1);
+	} while (timeout-- > 0);
+
+	return -ETIMEDOUT;
+}
+
+static inline int bpcm_rd(void __iomem *master, unsigned int addr,
+			  u32 off, u32 *val)
+{
+	int ret = 0;
+
+	ret = __bpcm_do_op(master, addr, off >> 2, PMC_PMBM_READ);
+	*val = readl(master + PMB_RD_DATA);
+
+	return ret;
+}
+
+static inline int bpcm_wr(void __iomem *master, unsigned int addr,
+			  u32 off, u32 val)
+{
+	int ret = 0;
+
+	writel(val, master + PMB_WR_DATA);
+	ret = __bpcm_do_op(master, addr, off >> 2, PMC_PMBM_WRITE);
+
+	return ret;
+}
+
+#endif /* __BCM63XX_PMB_H */
-- 
cgit v1.2.3


From 7f1a57fdd6cb6e7be2ed31878a34655df38e1861 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Tue, 19 May 2015 16:13:02 +0900
Subject: power_supply: Fix possible NULL pointer dereference on early uevent

Don't call the power_supply_changed() from power_supply_register() when
parent is still probing because it may lead to accessing parent too
early.

In bq27x00_battery this caused NULL pointer exception because uevent of
power_supply_changed called back the the get_property() method provided
by the driver. The get_property() method accessed pointer which should
be returned by power_supply_register().

Starting from bq27x00_battery_probe():
  di->bat = power_supply_register()
    power_supply_changed()
      kobject_uevent()
        power_supply_uevent()
          power_supply_show_property()
            power_supply_get_property()
              bq27x00_battery_get_property()
                dereference of di->bat which is NULL here

The dereference of di->bat (value returned by power_supply_register())
is the currently visible problem. However calling back the methods
provided by driver before ending the probe may lead to accessing other
driver-related data which is not yet initialized.

The call to power_supply_changed() is postponed till probing ends -
mutex of parent device is released.

Reported-by: H. Nikolaus Schaller <hns@goldelico.com>
Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Fixes: 297d716f6260 ("power_supply: Change ownership from driver to core")
Tested-By: Dr. H. Nikolaus Schaller <hns@goldelico.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/power_supply_core.c | 50 +++++++++++++++++++++++++++++++++++----
 include/linux/power_supply.h      |  1 +
 2 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c
index 15da277e0e8d..4bc0c7f459a5 100644
--- a/drivers/power/power_supply_core.c
+++ b/drivers/power/power_supply_core.c
@@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(power_supply_notifier);
 
 static struct device_type power_supply_dev_type;
 
+#define POWER_SUPPLY_DEFERRED_REGISTER_TIME	msecs_to_jiffies(10)
+
 static bool __power_supply_is_supplied_by(struct power_supply *supplier,
 					 struct power_supply *supply)
 {
@@ -121,6 +123,30 @@ void power_supply_changed(struct power_supply *psy)
 }
 EXPORT_SYMBOL_GPL(power_supply_changed);
 
+/*
+ * Notify that power supply was registered after parent finished the probing.
+ *
+ * Often power supply is registered from driver's probe function. However
+ * calling power_supply_changed() directly from power_supply_register()
+ * would lead to execution of get_property() function provided by the driver
+ * too early - before the probe ends.
+ *
+ * Avoid that by waiting on parent's mutex.
+ */
+static void power_supply_deferred_register_work(struct work_struct *work)
+{
+	struct power_supply *psy = container_of(work, struct power_supply,
+						deferred_register_work.work);
+
+	if (psy->dev.parent)
+		mutex_lock(&psy->dev.parent->mutex);
+
+	power_supply_changed(psy);
+
+	if (psy->dev.parent)
+		mutex_unlock(&psy->dev.parent->mutex);
+}
+
 #ifdef CONFIG_OF
 #include <linux/of.h>
 
@@ -645,6 +671,10 @@ __power_supply_register(struct device *parent,
 	struct power_supply *psy;
 	int rc;
 
+	if (!parent)
+		pr_warn("%s: Expected proper parent device for '%s'\n",
+			__func__, desc->name);
+
 	psy = kzalloc(sizeof(*psy), GFP_KERNEL);
 	if (!psy)
 		return ERR_PTR(-ENOMEM);
@@ -671,6 +701,8 @@ __power_supply_register(struct device *parent,
 		goto dev_set_name_failed;
 
 	INIT_WORK(&psy->changed_work, power_supply_changed_work);
+	INIT_DELAYED_WORK(&psy->deferred_register_work,
+			  power_supply_deferred_register_work);
 
 	rc = power_supply_check_supplies(psy);
 	if (rc) {
@@ -709,7 +741,10 @@ __power_supply_register(struct device *parent,
 	 *    after calling power_supply_register()).
 	 */
 	atomic_inc(&psy->use_cnt);
-	power_supply_changed(psy);
+
+	queue_delayed_work(system_power_efficient_wq,
+			   &psy->deferred_register_work,
+			   POWER_SUPPLY_DEFERRED_REGISTER_TIME);
 
 	return psy;
 
@@ -729,7 +764,8 @@ dev_set_name_failed:
 
 /**
  * power_supply_register() - Register new power supply
- * @parent:	Device to be a parent of power supply's device
+ * @parent:	Device to be a parent of power supply's device, usually
+ *		the device which probe function calls this
  * @desc:	Description of power supply, must be valid through whole
  *		lifetime of this power supply
  * @cfg:	Run-time specific configuration accessed during registering,
@@ -750,7 +786,8 @@ EXPORT_SYMBOL_GPL(power_supply_register);
 
 /**
  * power_supply_register() - Register new non-waking-source power supply
- * @parent:	Device to be a parent of power supply's device
+ * @parent:	Device to be a parent of power supply's device, usually
+ *		the device which probe function calls this
  * @desc:	Description of power supply, must be valid through whole
  *		lifetime of this power supply
  * @cfg:	Run-time specific configuration accessed during registering,
@@ -779,7 +816,8 @@ static void devm_power_supply_release(struct device *dev, void *res)
 
 /**
  * power_supply_register() - Register managed power supply
- * @parent:	Device to be a parent of power supply's device
+ * @parent:	Device to be a parent of power supply's device, usually
+ *		the device which probe function calls this
  * @desc:	Description of power supply, must be valid through whole
  *		lifetime of this power supply
  * @cfg:	Run-time specific configuration accessed during registering,
@@ -814,7 +852,8 @@ EXPORT_SYMBOL_GPL(devm_power_supply_register);
 
 /**
  * power_supply_register() - Register managed non-waking-source power supply
- * @parent:	Device to be a parent of power supply's device
+ * @parent:	Device to be a parent of power supply's device, usually
+ *		the device which probe function calls this
  * @desc:	Description of power supply, must be valid through whole
  *		lifetime of this power supply
  * @cfg:	Run-time specific configuration accessed during registering,
@@ -858,6 +897,7 @@ void power_supply_unregister(struct power_supply *psy)
 {
 	WARN_ON(atomic_dec_return(&psy->use_cnt));
 	cancel_work_sync(&psy->changed_work);
+	cancel_delayed_work_sync(&psy->deferred_register_work);
 	sysfs_remove_link(&psy->dev.kobj, "powers");
 	power_supply_remove_triggers(psy);
 	psy_unregister_cooler(psy);
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 75a1dd8dc56e..a80f1fd01ddb 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -237,6 +237,7 @@ struct power_supply {
 	/* private */
 	struct device dev;
 	struct work_struct changed_work;
+	struct delayed_work deferred_register_work;
 	spinlock_t changed_lock;
 	bool changed;
 	atomic_t use_cnt;
-- 
cgit v1.2.3


From 04fd61ab36ec065e194ab5e74ae34a5240d992bb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 19 May 2015 16:59:03 -0700
Subject: bpf: allow bpf programs to tail-call other bpf programs

introduce bpf_tail_call(ctx, &jmp_table, index) helper function
which can be used from BPF programs like:
int bpf_prog(struct pt_regs *ctx)
{
  ...
  bpf_tail_call(ctx, &jmp_table, index);
  ...
}
that is roughly equivalent to:
int bpf_prog(struct pt_regs *ctx)
{
  ...
  if (jmp_table[index])
    return (*jmp_table[index])(ctx);
  ...
}
The important detail that it's not a normal call, but a tail call.
The kernel stack is precious, so this helper reuses the current
stack frame and jumps into another BPF program without adding
extra call frame.
It's trivially done in interpreter and a bit trickier in JITs.
In case of x64 JIT the bigger part of generated assembler prologue
is common for all programs, so it is simply skipped while jumping.
Other JITs can do similar prologue-skipping optimization or
do stack unwind before jumping into the next program.

bpf_tail_call() arguments:
ctx - context pointer
jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table
index - index in the jump table

Since all BPF programs are idenitified by file descriptor, user space
need to populate the jmp_table with FDs of other BPF programs.
If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere
and program execution continues as normal.

New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can
populate this jmp_table array with FDs of other bpf programs.
Programs can share the same jmp_table array or use multiple jmp_tables.

The chain of tail calls can form unpredictable dynamic loops therefore
tail_call_cnt is used to limit the number of calls and currently is set to 32.

Use cases:
Acked-by: Daniel Borkmann <daniel@iogearbox.net>

==========
- simplify complex programs by splitting them into a sequence of small programs

- dispatch routine
  For tracing and future seccomp the program may be triggered on all system
  calls, but processing of syscall arguments will be different. It's more
  efficient to implement them as:
  int syscall_entry(struct seccomp_data *ctx)
  {
     bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */);
     ... default: process unknown syscall ...
  }
  int sys_write_event(struct seccomp_data *ctx) {...}
  int sys_read_event(struct seccomp_data *ctx) {...}
  syscall_jmp_table[__NR_write] = sys_write_event;
  syscall_jmp_table[__NR_read] = sys_read_event;

  For networking the program may call into different parsers depending on
  packet format, like:
  int packet_parser(struct __sk_buff *skb)
  {
     ... parse L2, L3 here ...
     __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol));
     bpf_tail_call(skb, &ipproto_jmp_table, ipproto);
     ... default: process unknown protocol ...
  }
  int parse_tcp(struct __sk_buff *skb) {...}
  int parse_udp(struct __sk_buff *skb) {...}
  ipproto_jmp_table[IPPROTO_TCP] = parse_tcp;
  ipproto_jmp_table[IPPROTO_UDP] = parse_udp;

- for TC use case, bpf_tail_call() allows to implement reclassify-like logic

- bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table
  are atomic, so user space can build chains of BPF programs on the fly

Implementation details:
=======================
- high performance of bpf_tail_call() is the goal.
  It could have been implemented without JIT changes as a wrapper on top of
  BPF_PROG_RUN() macro, but with two downsides:
  . all programs would have to pay performance penalty for this feature and
    tail call itself would be slower, since mandatory stack unwind, return,
    stack allocate would be done for every tailcall.
  . tailcall would be limited to programs running preempt_disabled, since
    generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would
    need to be either global per_cpu variable accessed by helper and by wrapper
    or global variable protected by locks.

  In this implementation x64 JIT bypasses stack unwind and jumps into the
  callee program after prologue.

- bpf_prog_array_compatible() ensures that prog_type of callee and caller
  are the same and JITed/non-JITed flag is the same, since calling JITed
  program from non-JITed is invalid, since stack frames are different.
  Similarly calling kprobe type program from socket type program is invalid.

- jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map'
  abstraction, its user space API and all of verifier logic.
  It's in the existing arraymap.c file, since several functions are
  shared with regular array map.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  22 +++++++++
 include/linux/filter.h   |   2 +-
 include/uapi/linux/bpf.h |  10 +++++
 kernel/bpf/arraymap.c    | 113 ++++++++++++++++++++++++++++++++++++++++++++---
 kernel/bpf/core.c        |  73 +++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c     |  23 +++++++++-
 kernel/bpf/verifier.c    |  17 +++++++
 kernel/trace/bpf_trace.c |   2 +
 net/core/filter.c        |   2 +
 9 files changed, 255 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d5cda067115a..8821b9a8689e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -126,6 +126,27 @@ struct bpf_prog_aux {
 	struct work_struct work;
 };
 
+struct bpf_array {
+	struct bpf_map map;
+	u32 elem_size;
+	/* 'ownership' of prog_array is claimed by the first program that
+	 * is going to use this map or by the first program which FD is stored
+	 * in the map to make sure that all callers and callees have the same
+	 * prog_type and JITed flag
+	 */
+	enum bpf_prog_type owner_prog_type;
+	bool owner_jited;
+	union {
+		char value[0] __aligned(8);
+		struct bpf_prog *prog[0] __aligned(8);
+	};
+};
+#define MAX_TAIL_CALL_CNT 32
+
+u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+void bpf_prog_array_map_clear(struct bpf_map *map);
+bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+
 #ifdef CONFIG_BPF_SYSCALL
 void bpf_register_prog_type(struct bpf_prog_type_list *tl);
 void bpf_register_map_type(struct bpf_map_type_list *tl);
@@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
+extern const struct bpf_func_proto bpf_tail_call_proto;
 
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 200be4a74a33..17724f6ea983 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 
 int sk_filter(struct sock *sk, struct sk_buff *skb);
 
-void bpf_prog_select_runtime(struct bpf_prog *fp);
+int bpf_prog_select_runtime(struct bpf_prog *fp);
 void bpf_prog_free(struct bpf_prog *fp);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9ebdf5701e8..f0a9af8b4dae 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -113,6 +113,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
 	BPF_MAP_TYPE_HASH,
 	BPF_MAP_TYPE_ARRAY,
+	BPF_MAP_TYPE_PROG_ARRAY,
 };
 
 enum bpf_prog_type {
@@ -210,6 +211,15 @@ enum bpf_func_id {
 	 * Return: 0 on success
 	 */
 	BPF_FUNC_l4_csum_replace,
+
+	/**
+	 * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
+	 * @ctx: context pointer passed to next program
+	 * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
+	 * @index: index inside array that selects specific program to run
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_tail_call,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8a6616583f38..614bcd4c1d74 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -14,12 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-
-struct bpf_array {
-	struct bpf_map map;
-	u32 elem_size;
-	char value[0] __aligned(8);
-};
+#include <linux/filter.h>
 
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
 	return 0;
 }
 late_initcall(register_array_map);
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+	/* only bpf_prog file descriptors can be stored in prog_array map */
+	if (attr->value_size != sizeof(u32))
+		return ERR_PTR(-EINVAL);
+	return array_map_alloc(attr);
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* make sure it's empty */
+	for (i = 0; i < array->map.max_entries; i++)
+		BUG_ON(array->prog[i] != NULL);
+	kvfree(array);
+}
+
+static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* only called from syscall */
+static int prog_array_map_update_elem(struct bpf_map *map, void *key,
+				      void *value, u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog, *old_prog;
+	u32 index = *(u32 *)key, ufd;
+
+	if (map_flags != BPF_ANY)
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	ufd = *(u32 *)value;
+	prog = bpf_prog_get(ufd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	old_prog = xchg(array->prog + index, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *old_prog;
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return -E2BIG;
+
+	old_prog = xchg(array->prog + index, NULL);
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		return 0;
+	} else {
+		return -ENOENT;
+	}
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+void bpf_prog_array_map_clear(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		prog_array_map_delete_elem(map, &i);
+}
+
+static const struct bpf_map_ops prog_array_ops = {
+	.map_alloc = prog_array_map_alloc,
+	.map_free = prog_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = prog_array_map_lookup_elem,
+	.map_update_elem = prog_array_map_update_elem,
+	.map_delete_elem = prog_array_map_delete_elem,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+	.ops = &prog_array_ops,
+	.type = BPF_MAP_TYPE_PROG_ARRAY,
+};
+
+static int __init register_prog_array_map(void)
+{
+	bpf_register_map_type(&prog_array_type);
+	return 0;
+}
+late_initcall(register_prog_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 54f0e7fcd0e2..d44b25cbe460 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -176,6 +176,15 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return 0;
 }
 
+const struct bpf_func_proto bpf_tail_call_proto = {
+	.func = NULL,
+	.gpl_only = false,
+	.ret_type = RET_VOID,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+};
+
 /**
  *	__bpf_prog_run - run eBPF program on a given context
  *	@ctx: is the data we are operating on
@@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
+		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
 		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
+	u32 tail_call_cnt = 0;
 	void *ptr;
 	int off;
 
@@ -431,6 +442,30 @@ select_insn:
 						       BPF_R4, BPF_R5);
 		CONT;
 
+	JMP_TAIL_CALL: {
+		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+		struct bpf_array *array = container_of(map, struct bpf_array, map);
+		struct bpf_prog *prog;
+		u64 index = BPF_R3;
+
+		if (unlikely(index >= array->map.max_entries))
+			goto out;
+
+		if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+			goto out;
+
+		tail_call_cnt++;
+
+		prog = READ_ONCE(array->prog[index]);
+		if (unlikely(!prog))
+			goto out;
+
+		ARG1 = BPF_R1;
+		insn = prog->insnsi;
+		goto select_insn;
+out:
+		CONT;
+	}
 	/* JMP */
 	JMP_JA:
 		insn += insn->off;
@@ -619,6 +654,40 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
 }
 
+bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp)
+{
+	if (array->owner_prog_type) {
+		if (array->owner_prog_type != fp->type)
+			return false;
+		if (array->owner_jited != fp->jited)
+			return false;
+	} else {
+		array->owner_prog_type = fp->type;
+		array->owner_jited = fp->jited;
+	}
+	return true;
+}
+
+static int check_tail_call(const struct bpf_prog *fp)
+{
+	struct bpf_prog_aux *aux = fp->aux;
+	int i;
+
+	for (i = 0; i < aux->used_map_cnt; i++) {
+		struct bpf_array *array;
+		struct bpf_map *map;
+
+		map = aux->used_maps[i];
+		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+			continue;
+		array = container_of(map, struct bpf_array, map);
+		if (!bpf_prog_array_compatible(array, fp))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 /**
  *	bpf_prog_select_runtime - select execution runtime for BPF program
  *	@fp: bpf_prog populated with internal BPF program
@@ -626,7 +695,7 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
  * try to JIT internal BPF program, if JIT is not available select interpreter
  * BPF program will be executed via BPF_PROG_RUN() macro
  */
-void bpf_prog_select_runtime(struct bpf_prog *fp)
+int bpf_prog_select_runtime(struct bpf_prog *fp)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
@@ -634,6 +703,8 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
 	bpf_int_jit_compile(fp);
 	/* Lock whole bpf_prog as read-only */
 	bpf_prog_lock_ro(fp);
+
+	return check_tail_call(fp);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3bae6c591914..98a69bd83069 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_map *map = filp->private_data;
 
+	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+		/* prog_array stores refcnt-ed bpf_prog pointers
+		 * release them all when user space closes prog_array_fd
+		 */
+		bpf_prog_array_map_clear(map);
+
 	bpf_map_put(map);
 	return 0;
 }
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			 */
 			BUG_ON(!prog->aux->ops->get_func_proto);
 
+			if (insn->imm == BPF_FUNC_tail_call) {
+				/* mark bpf_tail_call as different opcode
+				 * to avoid conditional branch in
+				 * interpeter for every normal call
+				 * and to prevent accidental JITing by
+				 * JIT compiler that doesn't support
+				 * bpf_tail_call yet
+				 */
+				insn->imm = 0;
+				insn->code |= BPF_X;
+				continue;
+			}
+
 			fn = prog->aux->ops->get_func_proto(insn->imm);
 			/* all functions that have prototype and verifier allowed
 			 * programs to call them, must be real in-kernel functions
@@ -532,7 +551,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	bpf_prog_select_runtime(prog);
+	err = bpf_prog_select_runtime(prog);
+	if (err < 0)
+		goto free_used_maps;
 
 	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
 	if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47dcd3aa6e23..cfd9a40b9a5a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
 			fn->ret_type, func_id);
 		return -EINVAL;
 	}
+
+	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
+	    func_id != BPF_FUNC_tail_call)
+		/* prog_array map type needs extra care:
+		 * only allow to pass it into bpf_tail_call() for now.
+		 * bpf_map_delete_elem() can be allowed in the future,
+		 * while bpf_map_update_elem() must only be done via syscall
+		 */
+		return -EINVAL;
+
+	if (func_id == BPF_FUNC_tail_call &&
+	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+		/* don't allow any other map type to be passed into
+		 * bpf_tail_call()
+		 */
+		return -EINVAL;
+
 	return 0;
 }
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 2d56ce501632..646445e41bd4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_probe_read_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
 
 	case BPF_FUNC_trace_printk:
 		/*
diff --git a/net/core/filter.c b/net/core/filter.c
index 6805717be614..3adcca6f17a4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1421,6 +1421,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_tail_call:
+		return &bpf_tail_call_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 37b1ef31a568fc02e53587620226e5f3c66454c8 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 20 May 2015 14:41:19 +0800
Subject: workqueue: move flush_scheduled_work() to workqueue.h

flush_scheduled_work() is just a simple call to flush_work().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 30 +++++++++++++++++++++++++++++-
 kernel/workqueue.c        | 30 ------------------------------
 2 files changed, 29 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 4618dd672d1b..738b30b39b68 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -435,7 +435,6 @@ extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
 
 extern void flush_workqueue(struct workqueue_struct *wq);
 extern void drain_workqueue(struct workqueue_struct *wq);
-extern void flush_scheduled_work(void);
 
 extern int schedule_on_each_cpu(work_func_t func);
 
@@ -531,6 +530,35 @@ static inline bool schedule_work(struct work_struct *work)
 	return queue_work(system_wq, work);
 }
 
+/**
+ * flush_scheduled_work - ensure that any scheduled work has run to completion.
+ *
+ * Forces execution of the kernel-global workqueue and blocks until its
+ * completion.
+ *
+ * Think twice before calling this function!  It's very easy to get into
+ * trouble if you don't take great care.  Either of the following situations
+ * will lead to deadlock:
+ *
+ *	One of the work items currently on the workqueue needs to acquire
+ *	a lock held by your code or its caller.
+ *
+ *	Your code is running in the context of a work routine.
+ *
+ * They will be detected by lockdep when they occur, but the first might not
+ * occur very often.  It depends on what work items are on the workqueue and
+ * what locks they need, which you have no control over.
+ *
+ * In most situations flushing the entire workqueue is overkill; you merely
+ * need to know that a particular work item isn't queued and isn't running.
+ * In such cases you should use cancel_delayed_work_sync() or
+ * cancel_work_sync() instead.
+ */
+static inline void flush_scheduled_work(void)
+{
+	flush_workqueue(system_wq);
+}
+
 /**
  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
  * @cpu: cpu to use
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ad8dc2b9efc3..c9eaa4e5c867 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2958,36 +2958,6 @@ int schedule_on_each_cpu(work_func_t func)
 	return 0;
 }
 
-/**
- * flush_scheduled_work - ensure that any scheduled work has run to completion.
- *
- * Forces execution of the kernel-global workqueue and blocks until its
- * completion.
- *
- * Think twice before calling this function!  It's very easy to get into
- * trouble if you don't take great care.  Either of the following situations
- * will lead to deadlock:
- *
- *	One of the work items currently on the workqueue needs to acquire
- *	a lock held by your code or its caller.
- *
- *	Your code is running in the context of a work routine.
- *
- * They will be detected by lockdep when they occur, but the first might not
- * occur very often.  It depends on what work items are on the workqueue and
- * what locks they need, which you have no control over.
- *
- * In most situations flushing the entire workqueue is overkill; you merely
- * need to know that a particular work item isn't queued and isn't running.
- * In such cases you should use cancel_delayed_work_sync() or
- * cancel_work_sync() instead.
- */
-void flush_scheduled_work(void)
-{
-	flush_workqueue(system_wq);
-}
-EXPORT_SYMBOL(flush_scheduled_work);
-
 /**
  * execute_in_process_context - reliably execute the routine with user context
  * @fn:		the function to execute
-- 
cgit v1.2.3


From 2efd055c53c06b7e89c167c98069bab9afce7e59 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <mleitner@redhat.com>
Date: Wed, 20 May 2015 16:35:41 -0700
Subject: tcp: add tcpi_segs_in and tcpi_segs_out to tcp_info

This patch tracks the total number of inbound and outbound segments on a
TCP socket. One may use this number to have an idea on connection
quality when compared against the retransmissions.

RFC4898 named these : tcpEStatsPerfSegsIn and tcpEStatsPerfSegsOut

These are a 32bit field each and can be fetched both from TCP_INFO
getsockopt() if one has a handle on a TCP socket, or from inet_diag
netlink facility (iproute2/ss patch will follow)

Note that tp->segs_out was placed near tp->snd_nxt for good data
locality and minimal performance impact, while tp->segs_in was placed
near tp->bytes_received for the same reason.

Join work with Eric Dumazet.

Note that received SYN are accounted on the listener, but sent SYNACK
are not accounted.

Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 7 ++++++-
 include/uapi/linux/tcp.h | 4 +++-
 net/ipv4/tcp.c           | 2 ++
 net/ipv4/tcp_ipv4.c      | 1 +
 net/ipv4/tcp_minisocks.c | 1 +
 net/ipv4/tcp_output.c    | 1 +
 net/ipv6/tcp_ipv6.c      | 1 +
 7 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e6fb5df22db1..f0212026c77f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -149,11 +149,16 @@ struct tcp_sock {
 				 * sum(delta(rcv_nxt)), or how many bytes
 				 * were acked.
 				 */
+	u32	segs_in;	/* RFC4898 tcpEStatsPerfSegsIn
+				 * total number of segments in.
+				 */
  	u32	rcv_nxt;	/* What we want to receive next 	*/
 	u32	copied_seq;	/* Head of yet unread data		*/
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
  	u32	snd_nxt;	/* Next sequence we send		*/
-
+	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
+				 * The total number of segments sent.
+				 */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 51ebedba577f..65a77b071e22 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -192,8 +192,10 @@ struct tcp_info {
 
 	__u64	tcpi_pacing_rate;
 	__u64	tcpi_max_pacing_rate;
-	__u64	tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+	__u64	tcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
 	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
+	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0a3f9a00565b..7f3e721b9e69 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2695,6 +2695,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	spin_lock_bh(&sk->sk_lock.slock);
 	info->tcpi_bytes_acked = tp->bytes_acked;
 	info->tcpi_bytes_received = tp->bytes_received;
+	info->tcpi_segs_out = tp->segs_out;
+	info->tcpi_segs_in = tp->segs_in;
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0cc4b5a630cd..feb875769b8d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1626,6 +1626,7 @@ process:
 	skb->dev = NULL;
 
 	bh_lock_sock_nested(sk);
+	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ebe2ab2596ed..b62d15c86946 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -448,6 +448,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 
 		newtp->rcv_wup = newtp->copied_seq =
 		newtp->rcv_nxt = treq->rcv_isn + 1;
+		newtp->segs_in = 0;
 
 		newtp->snd_sml = newtp->snd_una =
 		newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3aebe0157dfa..534e5fdb04c1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1027,6 +1027,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
 			      tcp_skb_pcount(skb));
 
+	tp->segs_out += tcp_skb_pcount(skb);
 	/* OK, its time to fill skb_shinfo(skb)->gso_segs */
 	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b6575d665568..beac6bf840b9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1421,6 +1421,7 @@ process:
 	skb->dev = NULL;
 
 	bh_lock_sock_nested(sk);
+	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
-- 
cgit v1.2.3


From 2d0f230fe0649758394466cb69b553c0e8184df9 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 21 May 2015 15:11:02 +0800
Subject: crypto: aead - Rename aead_alg to old_aead_alg

This patch is the first step in the introduction of a new AEAD
alg type.  Unlike normal conversions this patch only renames the
existing aead_alg structure because there are external references
to it.

Those references will be removed after this patch.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/aead.c                  | 25 +++++++++++++------------
 include/crypto/aead.h          |  2 ++
 include/crypto/internal/aead.h |  5 +++++
 include/linux/crypto.h         |  6 +++---
 4 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/aead.c b/crypto/aead.c
index c2bf3b313354..ebc91ea89c91 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -33,7 +33,7 @@ static int aead_null_givdecrypt(struct aead_givcrypt_request *req);
 static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 			    unsigned int keylen)
 {
-	struct aead_alg *aead = crypto_aead_alg(tfm);
+	struct old_aead_alg *aead = crypto_old_aead_alg(tfm);
 	unsigned long alignmask = crypto_aead_alignmask(tfm);
 	int ret;
 	u8 *buffer, *alignbuffer;
@@ -55,7 +55,7 @@ static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 int crypto_aead_setkey(struct crypto_aead *tfm,
 		       const u8 *key, unsigned int keylen)
 {
-	struct aead_alg *aead = crypto_aead_alg(tfm);
+	struct old_aead_alg *aead = crypto_old_aead_alg(tfm);
 	unsigned long alignmask = crypto_aead_alignmask(tfm);
 
 	tfm = tfm->child;
@@ -71,11 +71,12 @@ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
 {
 	int err;
 
-	if (authsize > crypto_aead_alg(tfm)->maxauthsize)
+	if (authsize > crypto_old_aead_alg(tfm)->maxauthsize)
 		return -EINVAL;
 
-	if (crypto_aead_alg(tfm)->setauthsize) {
-		err = crypto_aead_alg(tfm)->setauthsize(tfm->child, authsize);
+	if (crypto_old_aead_alg(tfm)->setauthsize) {
+		err = crypto_old_aead_alg(tfm)->setauthsize(
+			tfm->child, authsize);
 		if (err)
 			return err;
 	}
@@ -126,7 +127,7 @@ static int old_crypt(struct aead_request *req,
 static int old_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct aead_alg *alg = crypto_aead_alg(aead);
+	struct old_aead_alg *alg = crypto_old_aead_alg(aead);
 
 	return old_crypt(req, alg->encrypt);
 }
@@ -134,7 +135,7 @@ static int old_encrypt(struct aead_request *req)
 static int old_decrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct aead_alg *alg = crypto_aead_alg(aead);
+	struct old_aead_alg *alg = crypto_old_aead_alg(aead);
 
 	return old_crypt(req, alg->decrypt);
 }
@@ -146,7 +147,7 @@ static int no_givcrypt(struct aead_givcrypt_request *req)
 
 static int crypto_aead_init_tfm(struct crypto_tfm *tfm)
 {
-	struct aead_alg *alg = &tfm->__crt_alg->cra_aead;
+	struct old_aead_alg *alg = &tfm->__crt_alg->cra_aead;
 	struct crypto_aead *crt = __crypto_aead_cast(tfm);
 
 	if (max(alg->maxauthsize, alg->ivsize) > PAGE_SIZE / 8)
@@ -172,7 +173,7 @@ static int crypto_aead_init_tfm(struct crypto_tfm *tfm)
 static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_aead raead;
-	struct aead_alg *aead = &alg->cra_aead;
+	struct old_aead_alg *aead = &alg->cra_aead;
 
 	strncpy(raead.type, "aead", sizeof(raead.type));
 	strncpy(raead.geniv, aead->geniv ?: "<built-in>", sizeof(raead.geniv));
@@ -200,7 +201,7 @@ static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
 	__attribute__ ((unused));
 static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
 {
-	struct aead_alg *aead = &alg->cra_aead;
+	struct old_aead_alg *aead = &alg->cra_aead;
 
 	seq_printf(m, "type         : aead\n");
 	seq_printf(m, "async        : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ?
@@ -240,7 +241,7 @@ static int aead_null_givdecrypt(struct aead_givcrypt_request *req)
 static int crypto_nivaead_report(struct sk_buff *skb, struct crypto_alg *alg)
 {
 	struct crypto_report_aead raead;
-	struct aead_alg *aead = &alg->cra_aead;
+	struct old_aead_alg *aead = &alg->cra_aead;
 
 	strncpy(raead.type, "nivaead", sizeof(raead.type));
 	strncpy(raead.geniv, aead->geniv, sizeof(raead.geniv));
@@ -269,7 +270,7 @@ static void crypto_nivaead_show(struct seq_file *m, struct crypto_alg *alg)
 	__attribute__ ((unused));
 static void crypto_nivaead_show(struct seq_file *m, struct crypto_alg *alg)
 {
-	struct aead_alg *aead = &alg->cra_aead;
+	struct old_aead_alg *aead = &alg->cra_aead;
 
 	seq_printf(m, "type         : nivaead\n");
 	seq_printf(m, "async        : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ?
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index e2d2c3c62e68..aebf57dfb903 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -17,6 +17,8 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 
+#define aead_alg old_aead_alg
+
 /**
  * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API
  *
diff --git a/include/crypto/internal/aead.h b/include/crypto/internal/aead.h
index a2d104aa3430..84c17bb92b6a 100644
--- a/include/crypto/internal/aead.h
+++ b/include/crypto/internal/aead.h
@@ -26,6 +26,11 @@ struct crypto_aead_spawn {
 extern const struct crypto_type crypto_aead_type;
 extern const struct crypto_type crypto_nivaead_type;
 
+static inline struct old_aead_alg *crypto_old_aead_alg(struct crypto_aead *tfm)
+{
+	return &crypto_aead_tfm(tfm)->__crt_alg->cra_aead;
+}
+
 static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm)
 {
 	return &crypto_aead_tfm(tfm)->__crt_alg->cra_aead;
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 59ca4086ce6a..7d290a91c6f9 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -268,7 +268,7 @@ struct ablkcipher_alg {
 };
 
 /**
- * struct aead_alg - AEAD cipher definition
+ * struct old_aead_alg - AEAD cipher definition
  * @maxauthsize: Set the maximum authentication tag size supported by the
  *		 transformation. A transformation may support smaller tag sizes.
  *		 As the authentication tag is a message digest to ensure the
@@ -293,7 +293,7 @@ struct ablkcipher_alg {
  * All fields except @givencrypt , @givdecrypt , @geniv and @ivsize are
  * mandatory and must be filled.
  */
-struct aead_alg {
+struct old_aead_alg {
 	int (*setkey)(struct crypto_aead *tfm, const u8 *key,
 	              unsigned int keylen);
 	int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize);
@@ -501,7 +501,7 @@ struct crypto_alg {
 
 	union {
 		struct ablkcipher_alg ablkcipher;
-		struct aead_alg aead;
+		struct old_aead_alg aead;
 		struct blkcipher_alg blkcipher;
 		struct cipher_alg cipher;
 		struct compress_alg compress;
-- 
cgit v1.2.3


From 2a9de9c0f08d61fbe3764a21d22d0b72df97d6ae Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Fri, 24 Apr 2015 19:16:05 +0900
Subject: extcon: Use the unique id for external connector instead of string

This patch uses the unique id to identify the type of external connector instead
of string name. The string name have the many potential issues. So, this patch
defines the 'extcon' enumeration which includes all supported external connector
on EXTCON subsystem. If new external connector is necessary, the unique id of
new connector have to be added in 'extcon' enumeration. There are current
supported external connector in 'enum extcon' as following:

enum extcon {
	EXTCON_NONE		= 0x0,

	/* USB external connector */
	EXTCON_USB		= 0x1,
	EXTCON_USB_HOST		= 0x2,

	/* Charger external connector */
	EXTCON_TA		= 0x10,
	EXTCON_FAST_CHARGER	= 0x11,
	EXTCON_SLOW_CHARGER	= 0x12,
	EXTCON_CHARGE_DOWNSTREAM = 0x13,

	/* Audio and video external connector */
	EXTCON_LINE_IN		= 0x20,
	EXTCON_LINE_OUT		= 0x21,
	EXTCON_MICROPHONE	= 0x22,
	EXTCON_HEADPHONE	= 0x23,

	EXTCON_HDMI		= 0x30,
	EXTCON_MHL		= 0x31,
	EXTCON_DVI		= 0x32,
	EXTCON_VGA		= 0x33,
	EXTCON_SPDIF_IN		= 0x34,
	EXTCON_SPDIF_OUT	= 0x35,
	EXTCON_VIDEO_IN		= 0x36,
	EXTCON_VIDEO_OUT	= 0x37,

	/* Miscellaneous external connector */
	EXTCON_DOCK		= 0x50,
	EXTCON_JIG		= 0x51,
	EXTCON_MECHANICAL	= 0x52,

	EXTCON_END,
};

For example in extcon-arizona.c:
To use unique id removes the potential issue about handling
the inconsistent name of external connector with string.
- Previously, use the string to register the type of arizona jack connector
static const char *arizona_cable[] = {
	"Mechanical",
	"Microphone",
	"Headphone",
	"Line-out",
};
- Newly, use the unique id to register the type of arizona jack connector
static const enum extcon arizona_cable[] = {
	EXTCON_MECHANICAL,
	EXTCON_MICROPHONE,
	EXTCON_HEADPHONE,
	EXTCON_LINE_OUT,

	EXTCON_NONE,
};

And this patch modify the prototype of extcon_{get|set}_cable_state_() which
uses the 'enum extcon id' instead of 'cable_index'. Because although one more
extcon drivers support USB cable, each extcon driver might has the differnt
'cable_index' for USB cable. All extcon drivers can use the unique id number
for same external connector with modified extcon_{get|set}_cable_state_().

- Previously, use 'cable_index' on these functions:
extcon_get_cable_state_(struct extcon_dev*, int cable_index)
extcon_set_cable_state_(struct extcon_dev*, int cable_index, bool state)

-Newly, use 'enum extcon id' on these functions:
extcon_get_cable_state_(struct extcon_dev*, enum extcon id)
extcon_set_cable_state_(struct extcon_dev*, enum extcon id, bool state)

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Felipe Balbi <balbi@ti.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Acked-by: Roger Quadros <rogerq@ti.com>
Acked-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Acked-by: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
Reviewed-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
[arnd: Report the build break about drivers/usb/phy/phy-tahvo.c after using the
unique id for external connector insteadf of string]
Reported-by: Arnd Bergmann <arnd@arndb.de>
[dan.carpenter: Report the build warning of extcon_{set|get}_cable_state_()]
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 drivers/extcon/extcon-arizona.c        |  38 +++-----
 drivers/extcon/extcon-axp288.c         |  24 ++---
 drivers/extcon/extcon-max14577.c       |  45 ++++-----
 drivers/extcon/extcon-max77693.c       |  95 +++++++++----------
 drivers/extcon/extcon-max77843.c       |  56 +++++------
 drivers/extcon/extcon-max8997.c        |  59 +++++-------
 drivers/extcon/extcon-palmas.c         |  22 +++--
 drivers/extcon/extcon-rt8973a.c        |  40 +++-----
 drivers/extcon/extcon-sm5502.c         |  32 ++-----
 drivers/extcon/extcon-usb-gpio.c       |  32 ++-----
 drivers/extcon/extcon.c                | 163 ++++++++++++++++++---------------
 drivers/usb/phy/phy-tahvo.c            |   9 +-
 include/linux/extcon.h                 | 111 ++++++++++------------
 include/linux/extcon/extcon-adc-jack.h |   5 +-
 14 files changed, 320 insertions(+), 411 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c
index 1ec06b4b4875..9262b45a4484 100644
--- a/drivers/extcon/extcon-arizona.c
+++ b/drivers/extcon/extcon-arizona.c
@@ -118,17 +118,12 @@ static const int arizona_micd_levels[] = {
 	1257,
 };
 
-#define ARIZONA_CABLE_MECHANICAL 0
-#define ARIZONA_CABLE_MICROPHONE 1
-#define ARIZONA_CABLE_HEADPHONE  2
-#define ARIZONA_CABLE_LINEOUT    3
-
-static const char *arizona_cable[] = {
-	"Mechanical",
-	"Microphone",
-	"Headphone",
-	"Line-out",
-	NULL,
+static const enum extcon arizona_cable[] = {
+	EXTCON_MECHANICAL,
+	EXTCON_MICROPHONE,
+	EXTCON_HEADPHONE,
+	EXTCON_LINE_OUT,
+	EXTCON_NONE,
 };
 
 static void arizona_start_hpdet_acc_id(struct arizona_extcon_info *info);
@@ -557,7 +552,7 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data)
 	struct arizona_extcon_info *info = data;
 	struct arizona *arizona = info->arizona;
 	int id_gpio = arizona->pdata.hpdet_id_gpio;
-	int report = ARIZONA_CABLE_HEADPHONE;
+	enum extcon report = EXTCON_HEADPHONE;
 	int ret, reading;
 	bool mic = false;
 
@@ -571,7 +566,7 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data)
 	}
 
 	/* If the cable was removed while measuring ignore the result */
-	ret = extcon_get_cable_state_(info->edev, ARIZONA_CABLE_MECHANICAL);
+	ret = extcon_get_cable_state_(info->edev, EXTCON_MECHANICAL);
 	if (ret < 0) {
 		dev_err(arizona->dev, "Failed to check cable state: %d\n",
 			ret);
@@ -602,9 +597,9 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data)
 
 	/* Report high impedence cables as line outputs */
 	if (reading >= 5000)
-		report = ARIZONA_CABLE_LINEOUT;
+		report = EXTCON_LINE_OUT;
 	else
-		report = ARIZONA_CABLE_HEADPHONE;
+		report = EXTCON_HEADPHONE;
 
 	ret = extcon_set_cable_state_(info->edev, report, true);
 	if (ret != 0)
@@ -689,8 +684,7 @@ err:
 			   ARIZONA_ACCDET_MODE_MASK, ARIZONA_ACCDET_MODE_MIC);
 
 	/* Just report headphone */
-	ret = extcon_set_cable_state_(info->edev,
-				      ARIZONA_CABLE_HEADPHONE, true);
+	ret = extcon_set_cable_state_(info->edev, EXTCON_HEADPHONE, true);
 	if (ret != 0)
 		dev_err(arizona->dev, "Failed to report headphone: %d\n", ret);
 
@@ -747,8 +741,7 @@ err:
 			   ARIZONA_ACCDET_MODE_MASK, ARIZONA_ACCDET_MODE_MIC);
 
 	/* Just report headphone */
-	ret = extcon_set_cable_state_(info->edev,
-				      ARIZONA_CABLE_HEADPHONE, true);
+	ret = extcon_set_cable_state_(info->edev, EXTCON_HEADPHONE, true);
 	if (ret != 0)
 		dev_err(arizona->dev, "Failed to report headphone: %d\n", ret);
 
@@ -787,7 +780,7 @@ static void arizona_micd_detect(struct work_struct *work)
 	mutex_lock(&info->lock);
 
 	/* If the cable was removed while measuring ignore the result */
-	ret = extcon_get_cable_state_(info->edev, ARIZONA_CABLE_MECHANICAL);
+	ret = extcon_get_cable_state_(info->edev, EXTCON_MECHANICAL);
 	if (ret < 0) {
 		dev_err(arizona->dev, "Failed to check cable state: %d\n",
 				ret);
@@ -836,8 +829,7 @@ static void arizona_micd_detect(struct work_struct *work)
 		arizona_identify_headphone(info);
 
 		ret = extcon_set_cable_state_(info->edev,
-					      ARIZONA_CABLE_MICROPHONE, true);
-
+					      EXTCON_MICROPHONE, true);
 		if (ret != 0)
 			dev_err(arizona->dev, "Headset report failed: %d\n",
 				ret);
@@ -1028,7 +1020,7 @@ static irqreturn_t arizona_jackdet(int irq, void *data)
 	if (info->last_jackdet == present) {
 		dev_dbg(arizona->dev, "Detected jack\n");
 		ret = extcon_set_cable_state_(info->edev,
-					      ARIZONA_CABLE_MECHANICAL, true);
+					      EXTCON_MECHANICAL, true);
 
 		if (ret != 0)
 			dev_err(arizona->dev, "Mechanical report failed: %d\n",
diff --git a/drivers/extcon/extcon-axp288.c b/drivers/extcon/extcon-axp288.c
index 8299adb9b676..3605aa96c25a 100644
--- a/drivers/extcon/extcon-axp288.c
+++ b/drivers/extcon/extcon-axp288.c
@@ -77,10 +77,6 @@
 /* IRQ enable-6 register */
 #define BC12_IRQ_CFG_MASK		BIT(1)
 
-#define AXP288_EXTCON_SLOW_CHARGER		"SLOW-CHARGER"
-#define AXP288_EXTCON_DOWNSTREAM_CHARGER	"CHARGE-DOWNSTREAM"
-#define AXP288_EXTCON_FAST_CHARGER		"FAST-CHARGER"
-
 enum axp288_extcon_reg {
 	AXP288_PS_STAT_REG		= 0x00,
 	AXP288_PS_BOOT_REASON_REG	= 0x02,
@@ -105,11 +101,11 @@ enum axp288_extcon_irq {
 	EXTCON_IRQ_END,
 };
 
-static const char *axp288_extcon_cables[] = {
-	AXP288_EXTCON_SLOW_CHARGER,
-	AXP288_EXTCON_DOWNSTREAM_CHARGER,
-	AXP288_EXTCON_FAST_CHARGER,
-	NULL,
+static const enum extcon axp288_extcon_cables[] = {
+	EXTCON_SLOW_CHARGER,
+	EXTCON_CHARGE_DOWNSTREAM,
+	EXTCON_FAST_CHARGER,
+	EXTCON_NONE,
 };
 
 struct axp288_extcon_info {
@@ -161,7 +157,7 @@ static void axp288_extcon_log_rsi(struct axp288_extcon_info *info)
 static int axp288_handle_chrg_det_event(struct axp288_extcon_info *info)
 {
 	static bool notify_otg, notify_charger;
-	static char *cable;
+	static enum extcon cable;
 	int ret, stat, cfg, pwr_stat;
 	u8 chrg_type;
 	bool vbus_attach = false;
@@ -196,18 +192,18 @@ static int axp288_handle_chrg_det_event(struct axp288_extcon_info *info)
 		dev_dbg(info->dev, "sdp cable is connecetd\n");
 		notify_otg = true;
 		notify_charger = true;
-		cable = AXP288_EXTCON_SLOW_CHARGER;
+		cable = EXTCON_SLOW_CHARGER;
 		break;
 	case DET_STAT_CDP:
 		dev_dbg(info->dev, "cdp cable is connecetd\n");
 		notify_otg = true;
 		notify_charger = true;
-		cable = AXP288_EXTCON_DOWNSTREAM_CHARGER;
+		cable = EXTCON_CHARGE_DOWNSTREAM;
 		break;
 	case DET_STAT_DCP:
 		dev_dbg(info->dev, "dcp cable is connecetd\n");
 		notify_charger = true;
-		cable = AXP288_EXTCON_FAST_CHARGER;
+		cable = EXTCON_FAST_CHARGER;
 		break;
 	default:
 		dev_warn(info->dev,
@@ -230,7 +226,7 @@ notify_otg:
 	}
 
 	if (notify_charger)
-		extcon_set_cable_state(info->edev, cable, vbus_attach);
+		extcon_set_cable_state_(info->edev, cable, vbus_attach);
 
 	/* Clear the flags on disconnect event */
 	if (!vbus_attach)
diff --git a/drivers/extcon/extcon-max14577.c b/drivers/extcon/extcon-max14577.c
index ad8f8ddc8e51..e7c3edb5bd4b 100644
--- a/drivers/extcon/extcon-max14577.c
+++ b/drivers/extcon/extcon-max14577.c
@@ -148,27 +148,14 @@ enum max14577_muic_acc_type {
 	MAX14577_MUIC_ADC_OPEN,
 };
 
-/* max14577 MUIC device support below list of accessories(external connector) */
-enum {
-	EXTCON_CABLE_USB = 0,
-	EXTCON_CABLE_TA,
-	EXTCON_CABLE_FAST_CHARGER,
-	EXTCON_CABLE_SLOW_CHARGER,
-	EXTCON_CABLE_CHARGE_DOWNSTREAM,
-	EXTCON_CABLE_JIG,
-
-	_EXTCON_CABLE_NUM,
-};
-
-static const char *max14577_extcon_cable[] = {
-	[EXTCON_CABLE_USB]			= "USB",
-	[EXTCON_CABLE_TA]			= "TA",
-	[EXTCON_CABLE_FAST_CHARGER]		= "Fast-charger",
-	[EXTCON_CABLE_SLOW_CHARGER]		= "Slow-charger",
-	[EXTCON_CABLE_CHARGE_DOWNSTREAM]	= "Charge-downstream",
-	[EXTCON_CABLE_JIG]			= "JIG",
-
-	NULL,
+static const enum extcon max14577_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_TA,
+	EXTCON_FAST_CHARGER,
+	EXTCON_SLOW_CHARGER,
+	EXTCON_CHARGE_DOWNSTREAM,
+	EXTCON_JIG,
+	EXTCON_NONE,
 };
 
 /*
@@ -369,7 +356,7 @@ static int max14577_muic_jig_handler(struct max14577_muic_info *info,
 	if (ret < 0)
 		return ret;
 
-	extcon_set_cable_state(info->edev, "JIG", attached);
+	extcon_set_cable_state_(info->edev, EXTCON_JIG, attached);
 
 	return 0;
 }
@@ -466,20 +453,22 @@ static int max14577_muic_chg_handler(struct max14577_muic_info *info)
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev, "USB", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_USB, attached);
 		break;
 	case MAX14577_CHARGER_TYPE_DEDICATED_CHG:
-		extcon_set_cable_state(info->edev, "TA", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_TA, attached);
 		break;
 	case MAX14577_CHARGER_TYPE_DOWNSTREAM_PORT:
-		extcon_set_cable_state(info->edev,
-				"Charge-downstream", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_CHARGE_DOWNSTREAM,
+					attached);
 		break;
 	case MAX14577_CHARGER_TYPE_SPECIAL_500MA:
-		extcon_set_cable_state(info->edev, "Slow-charger", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_SLOW_CHARGER,
+					attached);
 		break;
 	case MAX14577_CHARGER_TYPE_SPECIAL_1A:
-		extcon_set_cable_state(info->edev, "Fast-charger", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_FAST_CHARGER,
+					attached);
 		break;
 	case MAX14577_CHARGER_TYPE_NONE:
 	case MAX14577_CHARGER_TYPE_DEAD_BATTERY:
diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c
index c274249245ff..20e796e10e57 100644
--- a/drivers/extcon/extcon-max77693.c
+++ b/drivers/extcon/extcon-max77693.c
@@ -200,32 +200,17 @@ enum max77693_muic_acc_type {
 /*
  * MAX77693 MUIC device support below list of accessories(external connector)
  */
-enum {
-	EXTCON_CABLE_USB = 0,
-	EXTCON_CABLE_USB_HOST,
-	EXTCON_CABLE_TA,
-	EXTCON_CABLE_FAST_CHARGER,
-	EXTCON_CABLE_SLOW_CHARGER,
-	EXTCON_CABLE_CHARGE_DOWNSTREAM,
-	EXTCON_CABLE_MHL,
-	EXTCON_CABLE_JIG,
-	EXTCON_CABLE_DOCK,
-
-	_EXTCON_CABLE_NUM,
-};
-
-static const char *max77693_extcon_cable[] = {
-	[EXTCON_CABLE_USB]			= "USB",
-	[EXTCON_CABLE_USB_HOST]			= "USB-Host",
-	[EXTCON_CABLE_TA]			= "TA",
-	[EXTCON_CABLE_FAST_CHARGER]		= "Fast-charger",
-	[EXTCON_CABLE_SLOW_CHARGER]		= "Slow-charger",
-	[EXTCON_CABLE_CHARGE_DOWNSTREAM]	= "Charge-downstream",
-	[EXTCON_CABLE_MHL]			= "MHL",
-	[EXTCON_CABLE_JIG]			= "JIG",
-	[EXTCON_CABLE_DOCK]			= "DOCK",
-
-	NULL,
+static const enum extcon max77693_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_TA,
+	EXTCON_FAST_CHARGER,
+	EXTCON_SLOW_CHARGER,
+	EXTCON_CHARGE_DOWNSTREAM,
+	EXTCON_MHL,
+	EXTCON_JIG,
+	EXTCON_DOCK,
+	EXTCON_NONE,
 };
 
 /*
@@ -472,7 +457,7 @@ static int max77693_muic_dock_handler(struct max77693_muic_info *info,
 	int ret = 0;
 	int vbvolt;
 	bool cable_attached;
-	char dock_name[CABLE_NAME_MAX];
+	enum extcon dock_id;
 
 	dev_info(info->dev,
 		"external connector is %s (adc:0x%02x)\n",
@@ -517,16 +502,16 @@ static int max77693_muic_dock_handler(struct max77693_muic_info *info,
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev, "DOCK", attached);
-		extcon_set_cable_state(info->edev, "MHL", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_DOCK, attached);
+		extcon_set_cable_state_(info->edev, EXTCON_MHL, attached);
 		goto out;
 	case MAX77693_MUIC_ADC_AUDIO_MODE_REMOTE:	/* Dock-Desk */
-		strcpy(dock_name, "DOCK");
+		dock_id = EXTCON_DOCK;
 		break;
 	case MAX77693_MUIC_ADC_AV_CABLE_NOLOAD:		/* Dock-Audio */
-		strcpy(dock_name, "DOCK");
+		dock_id = EXTCON_DOCK;
 		if (!attached)
-			extcon_set_cable_state(info->edev, "USB", false);
+			extcon_set_cable_state_(info->edev, EXTCON_USB, false);
 		break;
 	default:
 		dev_err(info->dev, "failed to detect %s dock device\n",
@@ -538,7 +523,7 @@ static int max77693_muic_dock_handler(struct max77693_muic_info *info,
 	ret = max77693_muic_set_path(info, CONTROL1_SW_AUDIO, attached);
 	if (ret < 0)
 		return ret;
-	extcon_set_cable_state(info->edev, dock_name, attached);
+	extcon_set_cable_state_(info->edev, dock_id, attached);
 
 out:
 	return 0;
@@ -603,20 +588,19 @@ static int max77693_muic_adc_ground_handler(struct max77693_muic_info *info)
 		ret = max77693_muic_set_path(info, CONTROL1_SW_USB, attached);
 		if (ret < 0)
 			return ret;
-		extcon_set_cable_state(info->edev, "USB-Host", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_USB_HOST, attached);
 		break;
 	case MAX77693_MUIC_GND_AV_CABLE_LOAD:
 		/* Audio Video Cable with load, PATH:AUDIO */
 		ret = max77693_muic_set_path(info, CONTROL1_SW_AUDIO, attached);
 		if (ret < 0)
 			return ret;
-		extcon_set_cable_state(info->edev,
-				"Audio-video-load", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_USB, attached);
 		break;
 	case MAX77693_MUIC_GND_MHL:
 	case MAX77693_MUIC_GND_MHL_VB:
 		/* MHL or MHL with USB/TA cable */
-		extcon_set_cable_state(info->edev, "MHL", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_MHL, attached);
 		break;
 	default:
 		dev_err(info->dev, "failed to detect %s cable of gnd type\n",
@@ -658,7 +642,7 @@ static int max77693_muic_jig_handler(struct max77693_muic_info *info,
 	if (ret < 0)
 		return ret;
 
-	extcon_set_cable_state(info->edev, "JIG", attached);
+	extcon_set_cable_state_(info->edev, EXTCON_JIG, attached);
 
 	return 0;
 }
@@ -812,10 +796,10 @@ static int max77693_muic_chg_handler(struct max77693_muic_info *info)
 			 * - Support charging through micro-usb port without
 			 *   data connection
 			 */
-			extcon_set_cable_state(info->edev, "TA", attached);
+			extcon_set_cable_state_(info->edev, EXTCON_TA, attached);
 			if (!cable_attached)
-				extcon_set_cable_state(info->edev,
-						      "MHL", cable_attached);
+				extcon_set_cable_state_(info->edev, EXTCON_MHL,
+							cable_attached);
 			break;
 		}
 
@@ -838,11 +822,12 @@ static int max77693_muic_chg_handler(struct max77693_muic_info *info)
 			 * - Support charging through micro-usb port without
 			 *   data connection.
 			 */
-			extcon_set_cable_state(info->edev, "USB", attached);
+			extcon_set_cable_state_(info->edev, EXTCON_USB,
+						attached);
 
 			if (!cable_attached)
-				extcon_set_cable_state(info->edev, "DOCK",
-						      cable_attached);
+				extcon_set_cable_state_(info->edev, EXTCON_DOCK,
+							cable_attached);
 			break;
 		case MAX77693_MUIC_ADC_RESERVED_ACC_3:		/* Dock-Smart */
 			/*
@@ -870,9 +855,10 @@ static int max77693_muic_chg_handler(struct max77693_muic_info *info)
 			if (ret < 0)
 				return ret;
 
-			extcon_set_cable_state(info->edev, "DOCK", attached);
-			extcon_set_cable_state(info->edev, "MHL", attached);
-
+			extcon_set_cable_state_(info->edev, EXTCON_DOCK,
+						attached);
+			extcon_set_cable_state_(info->edev, EXTCON_MHL,
+						attached);
 			break;
 		}
 
@@ -905,23 +891,26 @@ static int max77693_muic_chg_handler(struct max77693_muic_info *info)
 			if (ret < 0)
 				return ret;
 
-			extcon_set_cable_state(info->edev, "USB", attached);
+			extcon_set_cable_state_(info->edev, EXTCON_USB,
+						attached);
 			break;
 		case MAX77693_CHARGER_TYPE_DEDICATED_CHG:
 			/* Only TA cable */
-			extcon_set_cable_state(info->edev, "TA", attached);
+			extcon_set_cable_state_(info->edev, EXTCON_TA, attached);
 			break;
 		}
 		break;
 	case MAX77693_CHARGER_TYPE_DOWNSTREAM_PORT:
-		extcon_set_cable_state(info->edev,
-				"Charge-downstream", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_CHARGE_DOWNSTREAM,
+					attached);
 		break;
 	case MAX77693_CHARGER_TYPE_APPLE_500MA:
-		extcon_set_cable_state(info->edev, "Slow-charger", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_SLOW_CHARGER,
+					attached);
 		break;
 	case MAX77693_CHARGER_TYPE_APPLE_1A_2A:
-		extcon_set_cable_state(info->edev, "Fast-charger", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_FAST_CHARGER,
+					attached);
 		break;
 	case MAX77693_CHARGER_TYPE_DEAD_BATTERY:
 		break;
diff --git a/drivers/extcon/extcon-max77843.c b/drivers/extcon/extcon-max77843.c
index 5746d7bc165f..d78a64d7fc20 100644
--- a/drivers/extcon/extcon-max77843.c
+++ b/drivers/extcon/extcon-max77843.c
@@ -118,28 +118,16 @@ enum max77843_muic_charger_type {
 	MAX77843_MUIC_CHG_GND,
 };
 
-enum {
-	MAX77843_CABLE_USB = 0,
-	MAX77843_CABLE_USB_HOST,
-	MAX77843_CABLE_TA,
-	MAX77843_CABLE_CHARGE_DOWNSTREAM,
-	MAX77843_CABLE_FAST_CHARGER,
-	MAX77843_CABLE_SLOW_CHARGER,
-	MAX77843_CABLE_MHL,
-	MAX77843_CABLE_JIG,
-
-	MAX77843_CABLE_NUM,
-};
-
-static const char *max77843_extcon_cable[] = {
-	[MAX77843_CABLE_USB]			= "USB",
-	[MAX77843_CABLE_USB_HOST]		= "USB-HOST",
-	[MAX77843_CABLE_TA]			= "TA",
-	[MAX77843_CABLE_CHARGE_DOWNSTREAM]	= "CHARGER-DOWNSTREAM",
-	[MAX77843_CABLE_FAST_CHARGER]		= "FAST-CHARGER",
-	[MAX77843_CABLE_SLOW_CHARGER]		= "SLOW-CHARGER",
-	[MAX77843_CABLE_MHL]			= "MHL",
-	[MAX77843_CABLE_JIG]			= "JIG",
+static const enum extcon max77843_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_TA,
+	EXTCON_CHARGE_DOWNSTREAM,
+	EXTCON_FAST_CHARGER,
+	EXTCON_SLOW_CHARGER,
+	EXTCON_MHL,
+	EXTCON_JIG,
+	EXTCON_NONE,
 };
 
 struct max77843_muic_irq {
@@ -354,7 +342,7 @@ static int max77843_muic_adc_gnd_handler(struct max77843_muic_info *info)
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev, "USB-HOST", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_USB_HOST, attached);
 		break;
 	case MAX77843_MUIC_GND_MHL_VB:
 	case MAX77843_MUIC_GND_MHL:
@@ -362,7 +350,7 @@ static int max77843_muic_adc_gnd_handler(struct max77843_muic_info *info)
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev, "MHL", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_MHL, attached);
 		break;
 	default:
 		dev_err(info->dev, "failed to detect %s accessory(gnd:0x%x)\n",
@@ -398,7 +386,7 @@ static int max77843_muic_jig_handler(struct max77843_muic_info *info,
 	if (ret < 0)
 		return ret;
 
-	extcon_set_cable_state(info->edev, "JIG", attached);
+	extcon_set_cable_state_(info->edev, EXTCON_JIG, attached);
 
 	return 0;
 }
@@ -490,36 +478,38 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info)
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev, "USB", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_USB, attached);
 		break;
 	case MAX77843_MUIC_CHG_DOWNSTREAM:
 		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev,
-				"CHARGER-DOWNSTREAM", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_CHARGE_DOWNSTREAM,
+					attached);
 		break;
 	case MAX77843_MUIC_CHG_DEDICATED:
 		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev, "TA", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_TA, attached);
 		break;
 	case MAX77843_MUIC_CHG_SPECIAL_500MA:
 		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev, "SLOW-CHAREGER", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_SLOW_CHARGER,
+					attached);
 		break;
 	case MAX77843_MUIC_CHG_SPECIAL_1A:
 		ret = max77843_muic_set_path(info, CONTROL1_SW_OPEN, attached);
 		if (ret < 0)
 			return ret;
 
-		extcon_set_cable_state(info->edev, "FAST-CHARGER", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_FAST_CHARGER,
+					attached);
 		break;
 	case MAX77843_MUIC_CHG_GND:
 		gnd_type = max77843_muic_get_cable_type(info,
@@ -527,9 +517,9 @@ static int max77843_muic_chg_handler(struct max77843_muic_info *info)
 
 		/* Charger cable on MHL accessory is attach or detach */
 		if (gnd_type == MAX77843_MUIC_GND_MHL_VB)
-			extcon_set_cable_state(info->edev, "TA", true);
+			extcon_set_cable_state_(info->edev, EXTCON_TA, true);
 		else if (gnd_type == MAX77843_MUIC_GND_MHL)
-			extcon_set_cable_state(info->edev, "TA", false);
+			extcon_set_cable_state_(info->edev, EXTCON_TA, false);
 		break;
 	case MAX77843_MUIC_CHG_NONE:
 		break;
diff --git a/drivers/extcon/extcon-max8997.c b/drivers/extcon/extcon-max8997.c
index 33613c490d34..4d10949c6eb2 100644
--- a/drivers/extcon/extcon-max8997.c
+++ b/drivers/extcon/extcon-max8997.c
@@ -145,32 +145,17 @@ struct max8997_muic_info {
 	int path_uart;
 };
 
-enum {
-	EXTCON_CABLE_USB = 0,
-	EXTCON_CABLE_USB_HOST,
-	EXTCON_CABLE_TA,
-	EXTCON_CABLE_FAST_CHARGER,
-	EXTCON_CABLE_SLOW_CHARGER,
-	EXTCON_CABLE_CHARGE_DOWNSTREAM,
-	EXTCON_CABLE_MHL,
-	EXTCON_CABLE_DOCK,
-	EXTCON_CABLE_JIG,
-
-	_EXTCON_CABLE_NUM,
-};
-
-static const char *max8997_extcon_cable[] = {
-	[EXTCON_CABLE_USB]			= "USB",
-	[EXTCON_CABLE_USB_HOST]			= "USB-Host",
-	[EXTCON_CABLE_TA]			= "TA",
-	[EXTCON_CABLE_FAST_CHARGER]		= "Fast-charger",
-	[EXTCON_CABLE_SLOW_CHARGER]		= "Slow-charger",
-	[EXTCON_CABLE_CHARGE_DOWNSTREAM]	= "Charge-downstream",
-	[EXTCON_CABLE_MHL]			= "MHL",
-	[EXTCON_CABLE_DOCK]			= "DOCK",
-	[EXTCON_CABLE_JIG]			= "JIG",
-
-	NULL,
+static const enum extcon max8997_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_TA,
+	EXTCON_FAST_CHARGER,
+	EXTCON_SLOW_CHARGER,
+	EXTCON_CHARGE_DOWNSTREAM,
+	EXTCON_MHL,
+	EXTCON_DOCK,
+	EXTCON_JIG,
+	EXTCON_NONE,
 };
 
 /*
@@ -345,10 +330,10 @@ static int max8997_muic_handle_usb(struct max8997_muic_info *info,
 
 	switch (usb_type) {
 	case MAX8997_USB_HOST:
-		extcon_set_cable_state(info->edev, "USB-Host", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_USB_HOST, attached);
 		break;
 	case MAX8997_USB_DEVICE:
-		extcon_set_cable_state(info->edev, "USB", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_USB, attached);
 		break;
 	default:
 		dev_err(info->dev, "failed to detect %s usb cable\n",
@@ -373,7 +358,7 @@ static int max8997_muic_handle_dock(struct max8997_muic_info *info,
 	switch (cable_type) {
 	case MAX8997_MUIC_ADC_AV_CABLE_NOLOAD:
 	case MAX8997_MUIC_ADC_FACTORY_MODE_UART_ON:
-		extcon_set_cable_state(info->edev, "DOCK", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_DOCK, attached);
 		break;
 	default:
 		dev_err(info->dev, "failed to detect %s dock device\n",
@@ -396,7 +381,7 @@ static int max8997_muic_handle_jig_uart(struct max8997_muic_info *info,
 		return ret;
 	}
 
-	extcon_set_cable_state(info->edev, "JIG", attached);
+	extcon_set_cable_state_(info->edev, EXTCON_JIG, attached);
 
 	return 0;
 }
@@ -418,7 +403,7 @@ static int max8997_muic_adc_handler(struct max8997_muic_info *info)
 			return ret;
 		break;
 	case MAX8997_MUIC_ADC_MHL:
-		extcon_set_cable_state(info->edev, "MHL", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_MHL, attached);
 		break;
 	case MAX8997_MUIC_ADC_FACTORY_MODE_USB_OFF:
 	case MAX8997_MUIC_ADC_FACTORY_MODE_USB_ON:
@@ -501,17 +486,19 @@ static int max8997_muic_chg_handler(struct max8997_muic_info *info)
 		}
 		break;
 	case MAX8997_CHARGER_TYPE_DOWNSTREAM_PORT:
-		extcon_set_cable_state(info->edev,
-				      "Charge-downstream", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_CHARGE_DOWNSTREAM,
+					attached);
 		break;
 	case MAX8997_CHARGER_TYPE_DEDICATED_CHG:
-		extcon_set_cable_state(info->edev, "TA", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_TA, attached);
 		break;
 	case MAX8997_CHARGER_TYPE_500MA:
-		extcon_set_cable_state(info->edev, "Slow-charger", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_SLOW_CHARGER,
+					attached);
 		break;
 	case MAX8997_CHARGER_TYPE_1A:
-		extcon_set_cable_state(info->edev, "Fast-charger", attached);
+		extcon_set_cable_state_(info->edev, EXTCON_FAST_CHARGER,
+					attached);
 		break;
 	default:
 		dev_err(info->dev,
diff --git a/drivers/extcon/extcon-palmas.c b/drivers/extcon/extcon-palmas.c
index 9c8943d691b1..d68954045a33 100644
--- a/drivers/extcon/extcon-palmas.c
+++ b/drivers/extcon/extcon-palmas.c
@@ -29,10 +29,10 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 
-static const char *palmas_extcon_cable[] = {
-	[0] = "USB",
-	[1] = "USB-HOST",
-	NULL,
+static const enum extcon palmas_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_NONE,
 };
 
 static const int mutually_exclusive[] = {0x3, 0x0};
@@ -49,6 +49,7 @@ static void palmas_usb_wakeup(struct palmas *palmas, int enable)
 static irqreturn_t palmas_vbus_irq_handler(int irq, void *_palmas_usb)
 {
 	struct palmas_usb *palmas_usb = _palmas_usb;
+	struct extcon_dev *edev = palmas_usb->edev;
 	unsigned int vbus_line_state;
 
 	palmas_read(palmas_usb->palmas, PALMAS_INTERRUPT_BASE,
@@ -57,7 +58,7 @@ static irqreturn_t palmas_vbus_irq_handler(int irq, void *_palmas_usb)
 	if (vbus_line_state & PALMAS_INT3_LINE_STATE_VBUS) {
 		if (palmas_usb->linkstat != PALMAS_USB_STATE_VBUS) {
 			palmas_usb->linkstat = PALMAS_USB_STATE_VBUS;
-			extcon_set_cable_state(palmas_usb->edev, "USB", true);
+			extcon_set_cable_state_(edev, EXTCON_USB, true);
 			dev_info(palmas_usb->dev, "USB cable is attached\n");
 		} else {
 			dev_dbg(palmas_usb->dev,
@@ -66,7 +67,7 @@ static irqreturn_t palmas_vbus_irq_handler(int irq, void *_palmas_usb)
 	} else if (!(vbus_line_state & PALMAS_INT3_LINE_STATE_VBUS)) {
 		if (palmas_usb->linkstat == PALMAS_USB_STATE_VBUS) {
 			palmas_usb->linkstat = PALMAS_USB_STATE_DISCONNECT;
-			extcon_set_cable_state(palmas_usb->edev, "USB", false);
+			extcon_set_cable_state_(edev, EXTCON_USB, false);
 			dev_info(palmas_usb->dev, "USB cable is detached\n");
 		} else {
 			dev_dbg(palmas_usb->dev,
@@ -81,6 +82,7 @@ static irqreturn_t palmas_id_irq_handler(int irq, void *_palmas_usb)
 {
 	unsigned int set, id_src;
 	struct palmas_usb *palmas_usb = _palmas_usb;
+	struct extcon_dev *edev = palmas_usb->edev;
 
 	palmas_read(palmas_usb->palmas, PALMAS_USB_OTG_BASE,
 		PALMAS_USB_ID_INT_LATCH_SET, &set);
@@ -93,7 +95,7 @@ static irqreturn_t palmas_id_irq_handler(int irq, void *_palmas_usb)
 			PALMAS_USB_ID_INT_LATCH_CLR,
 			PALMAS_USB_ID_INT_EN_HI_CLR_ID_GND);
 		palmas_usb->linkstat = PALMAS_USB_STATE_ID;
-		extcon_set_cable_state(palmas_usb->edev, "USB-HOST", true);
+		extcon_set_cable_state_(edev, EXTCON_USB_HOST, true);
 		dev_info(palmas_usb->dev, "USB-HOST cable is attached\n");
 	} else if ((set & PALMAS_USB_ID_INT_SRC_ID_FLOAT) &&
 				(id_src & PALMAS_USB_ID_INT_SRC_ID_FLOAT)) {
@@ -101,17 +103,17 @@ static irqreturn_t palmas_id_irq_handler(int irq, void *_palmas_usb)
 			PALMAS_USB_ID_INT_LATCH_CLR,
 			PALMAS_USB_ID_INT_EN_HI_CLR_ID_FLOAT);
 		palmas_usb->linkstat = PALMAS_USB_STATE_DISCONNECT;
-		extcon_set_cable_state(palmas_usb->edev, "USB-HOST", false);
+		extcon_set_cable_state_(edev, EXTCON_USB_HOST, false);
 		dev_info(palmas_usb->dev, "USB-HOST cable is detached\n");
 	} else if ((palmas_usb->linkstat == PALMAS_USB_STATE_ID) &&
 				(!(set & PALMAS_USB_ID_INT_SRC_ID_GND))) {
 		palmas_usb->linkstat = PALMAS_USB_STATE_DISCONNECT;
-		extcon_set_cable_state(palmas_usb->edev, "USB-HOST", false);
+		extcon_set_cable_state_(edev, EXTCON_USB_HOST, false);
 		dev_info(palmas_usb->dev, "USB-HOST cable is detached\n");
 	} else if ((palmas_usb->linkstat == PALMAS_USB_STATE_DISCONNECT) &&
 				(id_src & PALMAS_USB_ID_INT_SRC_ID_GND)) {
 		palmas_usb->linkstat = PALMAS_USB_STATE_ID;
-		extcon_set_cable_state(palmas_usb->edev, "USB-HOST", true);
+		extcon_set_cable_state_(edev, EXTCON_USB_HOST, true);
 		dev_info(palmas_usb->dev, " USB-HOST cable is attached\n");
 	}
 
diff --git a/drivers/extcon/extcon-rt8973a.c b/drivers/extcon/extcon-rt8973a.c
index 04447f36c994..f2a8672cbf82 100644
--- a/drivers/extcon/extcon-rt8973a.c
+++ b/drivers/extcon/extcon-rt8973a.c
@@ -90,21 +90,12 @@ static struct reg_data rt8973a_reg_data[] = {
 };
 
 /* List of detectable cables */
-enum {
-	EXTCON_CABLE_USB = 0,
-	EXTCON_CABLE_USB_HOST,
-	EXTCON_CABLE_TA,
-	EXTCON_CABLE_JIG,
-
-	EXTCON_CABLE_END,
-};
-
-static const char *rt8973a_extcon_cable[] = {
-	[EXTCON_CABLE_USB]		= "USB",
-	[EXTCON_CABLE_USB_HOST]		= "USB-Host",
-	[EXTCON_CABLE_TA]		= "TA",
-	[EXTCON_CABLE_JIG]		= "JIG",
-	NULL,
+static const enum extcon rt8973a_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_TA,
+	EXTCON_JIG,
+	EXTCON_NONE,
 };
 
 /* Define OVP (Over Voltage Protection), OTP (Over Temperature Protection) */
@@ -307,14 +298,11 @@ static int rt8973a_muic_cable_handler(struct rt8973a_muic_info *info,
 					enum rt8973a_event_type event)
 {
 	static unsigned int prev_cable_type;
-	const char **cable_names = info->edev->supported_cable;
 	unsigned int con_sw = DM_DP_SWITCH_UART;
-	int ret, idx = 0, cable_type;
+	int ret, cable_type;
+	enum extcon id;
 	bool attached = false;
 
-	if (!cable_names)
-		return 0;
-
 	switch (event) {
 	case RT8973A_EVENT_ATTACH:
 		cable_type = rt8973a_muic_get_cable_type(info);
@@ -341,25 +329,25 @@ static int rt8973a_muic_cable_handler(struct rt8973a_muic_info *info,
 
 	switch (cable_type) {
 	case RT8973A_MUIC_ADC_OTG:
-		idx = EXTCON_CABLE_USB_HOST;
+		id = EXTCON_USB_HOST;
 		con_sw = DM_DP_SWITCH_USB;
 		break;
 	case RT8973A_MUIC_ADC_TA:
-		idx = EXTCON_CABLE_TA;
+		id = EXTCON_TA;
 		con_sw = DM_DP_SWITCH_OPEN;
 		break;
 	case RT8973A_MUIC_ADC_FACTORY_MODE_BOOT_OFF_USB:
 	case RT8973A_MUIC_ADC_FACTORY_MODE_BOOT_ON_USB:
-		idx = EXTCON_CABLE_JIG;
+		id = EXTCON_JIG;
 		con_sw = DM_DP_SWITCH_USB;
 		break;
 	case RT8973A_MUIC_ADC_FACTORY_MODE_BOOT_OFF_UART:
 	case RT8973A_MUIC_ADC_FACTORY_MODE_BOOT_ON_UART:
-		idx = EXTCON_CABLE_JIG;
+		id = EXTCON_JIG;
 		con_sw = DM_DP_SWITCH_UART;
 		break;
 	case RT8973A_MUIC_ADC_USB:
-		idx = EXTCON_CABLE_USB;
+		id = EXTCON_USB;
 		con_sw = DM_DP_SWITCH_USB;
 		break;
 	case RT8973A_MUIC_ADC_OPEN:
@@ -409,7 +397,7 @@ static int rt8973a_muic_cable_handler(struct rt8973a_muic_info *info,
 		return ret;
 
 	/* Change the state of external accessory */
-	extcon_set_cable_state(info->edev, cable_names[idx], attached);
+	extcon_set_cable_state_(info->edev, id, attached);
 
 	return 0;
 }
diff --git a/drivers/extcon/extcon-sm5502.c b/drivers/extcon/extcon-sm5502.c
index 6f1d11f8723b..520693d6fa8a 100644
--- a/drivers/extcon/extcon-sm5502.c
+++ b/drivers/extcon/extcon-sm5502.c
@@ -92,19 +92,11 @@ static struct reg_data sm5502_reg_data[] = {
 };
 
 /* List of detectable cables */
-enum {
-	EXTCON_CABLE_USB = 0,
-	EXTCON_CABLE_USB_HOST,
-	EXTCON_CABLE_TA,
-
-	EXTCON_CABLE_END,
-};
-
-static const char *sm5502_extcon_cable[] = {
-	[EXTCON_CABLE_USB]	= "USB",
-	[EXTCON_CABLE_USB_HOST]	= "USB-Host",
-	[EXTCON_CABLE_TA]	= "TA",
-	NULL,
+static const enum extcon sm5502_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_TA,
+	EXTCON_NONE,
 };
 
 /* Define supported accessory type */
@@ -377,16 +369,12 @@ static int sm5502_muic_cable_handler(struct sm5502_muic_info *info,
 				     bool attached)
 {
 	static unsigned int prev_cable_type = SM5502_MUIC_ADC_GROUND;
-	const char **cable_names = info->edev->supported_cable;
 	unsigned int cable_type = SM5502_MUIC_ADC_GROUND;
 	unsigned int con_sw = DM_DP_SWITCH_OPEN;
 	unsigned int vbus_sw = VBUSIN_SWITCH_OPEN;
-	unsigned int idx = 0;
+	enum extcon id;
 	int ret;
 
-	if (!cable_names)
-		return 0;
-
 	/* Get the type of attached or detached cable */
 	if (attached)
 		cable_type = sm5502_muic_get_cable_type(info);
@@ -396,17 +384,17 @@ static int sm5502_muic_cable_handler(struct sm5502_muic_info *info,
 
 	switch (cable_type) {
 	case SM5502_MUIC_ADC_OPEN_USB:
-		idx	= EXTCON_CABLE_USB;
+		id	= EXTCON_USB;
 		con_sw	= DM_DP_SWITCH_USB;
 		vbus_sw	= VBUSIN_SWITCH_VBUSOUT_WITH_USB;
 		break;
 	case SM5502_MUIC_ADC_OPEN_TA:
-		idx	= EXTCON_CABLE_TA;
+		id	= EXTCON_TA;
 		con_sw	= DM_DP_SWITCH_OPEN;
 		vbus_sw	= VBUSIN_SWITCH_VBUSOUT;
 		break;
 	case SM5502_MUIC_ADC_OPEN_USB_OTG:
-		idx	= EXTCON_CABLE_USB_HOST;
+		id	= EXTCON_USB_HOST;
 		con_sw	= DM_DP_SWITCH_USB;
 		vbus_sw	= VBUSIN_SWITCH_OPEN;
 		break;
@@ -422,7 +410,7 @@ static int sm5502_muic_cable_handler(struct sm5502_muic_info *info,
 		return ret;
 
 	/* Change the state of external accessory */
-	extcon_set_cable_state(info->edev, cable_names[idx], attached);
+	extcon_set_cable_state_(info->edev, id, attached);
 
 	return 0;
 }
diff --git a/drivers/extcon/extcon-usb-gpio.c b/drivers/extcon/extcon-usb-gpio.c
index 900744b978fc..14da94cb57fa 100644
--- a/drivers/extcon/extcon-usb-gpio.c
+++ b/drivers/extcon/extcon-usb-gpio.c
@@ -39,18 +39,10 @@ struct usb_extcon_info {
 	struct delayed_work wq_detcable;
 };
 
-/* List of detectable cables */
-enum {
-	EXTCON_CABLE_USB = 0,
-	EXTCON_CABLE_USB_HOST,
-
-	EXTCON_CABLE_END,
-};
-
-static const char *usb_extcon_cable[] = {
-	[EXTCON_CABLE_USB] = "USB",
-	[EXTCON_CABLE_USB_HOST] = "USB-HOST",
-	NULL,
+static const enum extcon usb_extcon_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+	EXTCON_NONE,
 };
 
 static void usb_extcon_detect_cable(struct work_struct *work)
@@ -68,24 +60,16 @@ static void usb_extcon_detect_cable(struct work_struct *work)
 		 * As we don't have event for USB peripheral cable attached,
 		 * we simulate USB peripheral attach here.
 		 */
-		extcon_set_cable_state(info->edev,
-				       usb_extcon_cable[EXTCON_CABLE_USB_HOST],
-				       false);
-		extcon_set_cable_state(info->edev,
-				       usb_extcon_cable[EXTCON_CABLE_USB],
-				       true);
+		extcon_set_cable_state_(info->edev, EXTCON_USB_HOST, false);
+		extcon_set_cable_state_(info->edev, EXTCON_USB, true);
 	} else {
 		/*
 		 * ID = 0 means USB HOST cable attached.
 		 * As we don't have event for USB peripheral cable detached,
 		 * we simulate USB peripheral detach here.
 		 */
-		extcon_set_cable_state(info->edev,
-				       usb_extcon_cable[EXTCON_CABLE_USB],
-				       false);
-		extcon_set_cable_state(info->edev,
-				       usb_extcon_cable[EXTCON_CABLE_USB_HOST],
-				       true);
+		extcon_set_cable_state_(info->edev, EXTCON_USB, false);
+		extcon_set_cable_state_(info->edev, EXTCON_USB_HOST, true);
 	}
 }
 
diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index 2fb5f757e811..a57355fc8fea 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -3,6 +3,9 @@
  *
  *  External connector (extcon) class driver
  *
+ * Copyright (C) 2015 Samsung Electronics
+ * Author: Chanwoo Choi <cw00.choi@samsung.com>
+ *
  * Copyright (C) 2012 Samsung Electronics
  * Author: Donggeun Kim <dg77.kim@samsung.com>
  * Author: MyungJoo Ham <myungjoo.ham@samsung.com>
@@ -32,36 +35,32 @@
 #include <linux/slab.h>
 #include <linux/sysfs.h>
 
-/*
- * extcon_cable_name suggests the standard cable names for commonly used
- * cable types.
- *
- * However, please do not use extcon_cable_name directly for extcon_dev
- * struct's supported_cable pointer unless your device really supports
- * every single port-type of the following cable names. Please choose cable
- * names that are actually used in your extcon device.
- */
-const char extcon_cable_name[][CABLE_NAME_MAX + 1] = {
+#define SUPPORTED_CABLE_MAX	32
+#define CABLE_NAME_MAX		30
+
+static const char *extcon_name[] =  {
 	[EXTCON_USB]		= "USB",
 	[EXTCON_USB_HOST]	= "USB-Host",
 	[EXTCON_TA]		= "TA",
 	[EXTCON_FAST_CHARGER]	= "Fast-charger",
 	[EXTCON_SLOW_CHARGER]	= "Slow-charger",
 	[EXTCON_CHARGE_DOWNSTREAM]	= "Charge-downstream",
+	[EXTCON_LINE_IN]	= "Line-in",
+	[EXTCON_LINE_OUT]	= "Line-out",
+	[EXTCON_MICROPHONE]	= "Microphone",
+	[EXTCON_HEADPHONE]	= "Headphone",
 	[EXTCON_HDMI]		= "HDMI",
 	[EXTCON_MHL]		= "MHL",
 	[EXTCON_DVI]		= "DVI",
 	[EXTCON_VGA]		= "VGA",
-	[EXTCON_DOCK]		= "Dock",
-	[EXTCON_LINE_IN]	= "Line-in",
-	[EXTCON_LINE_OUT]	= "Line-out",
-	[EXTCON_MIC_IN]		= "Microphone",
-	[EXTCON_HEADPHONE_OUT]	= "Headphone",
 	[EXTCON_SPDIF_IN]	= "SPDIF-in",
 	[EXTCON_SPDIF_OUT]	= "SPDIF-out",
 	[EXTCON_VIDEO_IN]	= "Video-in",
 	[EXTCON_VIDEO_OUT]	= "Video-out",
+	[EXTCON_DOCK]		= "Dock",
+	[EXTCON_JIG]		= "JIG",
 	[EXTCON_MECHANICAL]	= "Mechanical",
+	NULL,
 };
 
 static struct class *extcon_class;
@@ -101,6 +100,43 @@ static int check_mutually_exclusive(struct extcon_dev *edev, u32 new_state)
 	return 0;
 }
 
+static int find_cable_index_by_id(struct extcon_dev *edev, const enum extcon id)
+{
+	int i;
+
+	/* Find the the index of extcon cable in edev->supported_cable */
+	for (i = 0; i < edev->max_supported; i++) {
+		if (edev->supported_cable[i] == id)
+			return i;
+	}
+
+	return -EINVAL;
+}
+
+static int find_cable_index_by_name(struct extcon_dev *edev, const char *name)
+{
+	enum extcon id = EXTCON_NONE;
+	int i;
+
+	if (edev->max_supported == 0)
+		return -EINVAL;
+
+	/* Find the the number of extcon cable */
+	for (i = 0; i < EXTCON_END; i++) {
+		if (!extcon_name[i])
+			continue;
+		if (!strncmp(extcon_name[i], name, CABLE_NAME_MAX)) {
+			id = i;
+			break;
+		}
+	}
+
+	if (id == EXTCON_NONE)
+		return -EINVAL;
+
+	return find_cable_index_by_id(edev, id);
+}
+
 static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 			  char *buf)
 {
@@ -118,11 +154,9 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 	if (edev->max_supported == 0)
 		return sprintf(buf, "%u\n", edev->state);
 
-	for (i = 0; i < SUPPORTED_CABLE_MAX; i++) {
-		if (!edev->supported_cable[i])
-			break;
+	for (i = 0; i < edev->max_supported; i++) {
 		count += sprintf(buf + count, "%s=%d\n",
-				 edev->supported_cable[i],
+				extcon_name[edev->supported_cable[i]],
 				 !!(edev->state & (1 << i)));
 	}
 
@@ -171,9 +205,10 @@ static ssize_t cable_name_show(struct device *dev,
 {
 	struct extcon_cable *cable = container_of(attr, struct extcon_cable,
 						  attr_name);
+	int i = cable->cable_index;
 
 	return sprintf(buf, "%s\n",
-		       cable->edev->supported_cable[cable->cable_index]);
+			extcon_name[cable->edev->supported_cable[i]]);
 }
 
 static ssize_t cable_state_show(struct device *dev,
@@ -283,39 +318,19 @@ int extcon_set_state(struct extcon_dev *edev, u32 state)
 EXPORT_SYMBOL_GPL(extcon_set_state);
 
 /**
- * extcon_find_cable_index() - Get the cable index based on the cable name.
+ * extcon_get_cable_state_() - Get the status of a specific cable.
  * @edev:	the extcon device that has the cable.
- * @cable_name:	cable name to be searched.
- *
- * Note that accessing a cable state based on cable_index is faster than
- * cable_name because using cable_name induces a loop with strncmp().
- * Thus, when get/set_cable_state is repeatedly used, using cable_index
- * is recommended.
+ * @id:		the unique id of each external connector in extcon enumeration.
  */
-int extcon_find_cable_index(struct extcon_dev *edev, const char *cable_name)
+int extcon_get_cable_state_(struct extcon_dev *edev, const enum extcon id)
 {
-	int i;
-
-	if (edev->supported_cable) {
-		for (i = 0; edev->supported_cable[i]; i++) {
-			if (!strncmp(edev->supported_cable[i],
-				cable_name, CABLE_NAME_MAX))
-				return i;
-		}
-	}
+	int index;
 
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(extcon_find_cable_index);
+	index = find_cable_index_by_id(edev, id);
+	if (index < 0)
+		return index;
 
-/**
- * extcon_get_cable_state_() - Get the status of a specific cable.
- * @edev:	the extcon device that has the cable.
- * @index:	cable index that can be retrieved by extcon_find_cable_index().
- */
-int extcon_get_cable_state_(struct extcon_dev *edev, int index)
-{
-	if (index < 0 || (edev->max_supported && edev->max_supported <= index))
+	if (edev->max_supported && edev->max_supported <= index)
 		return -EINVAL;
 
 	return !!(edev->state & (1 << index));
@@ -331,7 +346,7 @@ EXPORT_SYMBOL_GPL(extcon_get_cable_state_);
  */
 int extcon_get_cable_state(struct extcon_dev *edev, const char *cable_name)
 {
-	return extcon_get_cable_state_(edev, extcon_find_cable_index
+	return extcon_get_cable_state_(edev, find_cable_index_by_name
 						(edev, cable_name));
 }
 EXPORT_SYMBOL_GPL(extcon_get_cable_state);
@@ -339,17 +354,22 @@ EXPORT_SYMBOL_GPL(extcon_get_cable_state);
 /**
  * extcon_set_cable_state_() - Set the status of a specific cable.
  * @edev:		the extcon device that has the cable.
- * @index:		cable index that can be retrieved by
- *			extcon_find_cable_index().
- * @cable_state:	the new cable status. The default semantics is
+ * @id:			the unique id of each external connector
+ *			in extcon enumeration.
+ * @state:		the new cable status. The default semantics is
  *			true: attached / false: detached.
  */
-int extcon_set_cable_state_(struct extcon_dev *edev,
-			int index, bool cable_state)
+int extcon_set_cable_state_(struct extcon_dev *edev, enum extcon id,
+				bool cable_state)
 {
 	u32 state;
+	int index;
 
-	if (index < 0 || (edev->max_supported && edev->max_supported <= index))
+	index = find_cable_index_by_id(edev, id);
+	if (index < 0)
+		return index;
+
+	if (edev->max_supported && edev->max_supported <= index)
 		return -EINVAL;
 
 	state = cable_state ? (1 << index) : 0;
@@ -369,7 +389,7 @@ EXPORT_SYMBOL_GPL(extcon_set_cable_state_);
 int extcon_set_cable_state(struct extcon_dev *edev,
 			const char *cable_name, bool cable_state)
 {
-	return extcon_set_cable_state_(edev, extcon_find_cable_index
+	return extcon_set_cable_state_(edev, find_cable_index_by_name
 					(edev, cable_name), cable_state);
 }
 EXPORT_SYMBOL_GPL(extcon_set_cable_state);
@@ -455,8 +475,8 @@ int extcon_register_interest(struct extcon_specific_cable_nb *obj,
 		if (!obj->edev)
 			return -ENODEV;
 
-		obj->cable_index = extcon_find_cable_index(obj->edev,
-							  cable_name);
+		obj->cable_index = find_cable_index_by_name(obj->edev,
+							cable_name);
 		if (obj->cable_index < 0)
 			return obj->cable_index;
 
@@ -479,7 +499,7 @@ int extcon_register_interest(struct extcon_specific_cable_nb *obj,
 		while ((dev = class_dev_iter_next(&iter))) {
 			extd = dev_get_drvdata(dev);
 
-			if (extcon_find_cable_index(extd, cable_name) < 0)
+			if (find_cable_index_by_name(extd, cable_name) < 0)
 				continue;
 
 			class_dev_iter_exit(&iter);
@@ -595,7 +615,7 @@ static void dummy_sysfs_dev_release(struct device *dev)
 
 /*
  * extcon_dev_allocate() - Allocate the memory of extcon device.
- * @supported_cable:	Array of supported cable names ending with NULL.
+ * @supported_cable:	Array of supported extcon ending with EXTCON_NONE.
  *			If supported_cable is NULL, cable name related APIs
  *			are disabled.
  *
@@ -605,7 +625,7 @@ static void dummy_sysfs_dev_release(struct device *dev)
  *
  * Return the pointer of extcon device if success or ERR_PTR(err) if fail
  */
-struct extcon_dev *extcon_dev_allocate(const char **supported_cable)
+struct extcon_dev *extcon_dev_allocate(const enum extcon *supported_cable)
 {
 	struct extcon_dev *edev;
 
@@ -647,7 +667,7 @@ static void devm_extcon_dev_release(struct device *dev, void *res)
 /**
  * devm_extcon_dev_allocate - Allocate managed extcon device
  * @dev:		device owning the extcon device being created
- * @supported_cable:	Array of supported cable names ending with NULL.
+ * @supported_cable:	Array of supported extcon ending with EXTCON_NONE.
  *			If supported_cable is NULL, cable name related APIs
  *			are disabled.
  *
@@ -659,7 +679,7 @@ static void devm_extcon_dev_release(struct device *dev, void *res)
  * or ERR_PTR(err) if fail
  */
 struct extcon_dev *devm_extcon_dev_allocate(struct device *dev,
-					    const char **supported_cable)
+					const enum extcon *supported_cable)
 {
 	struct extcon_dev **ptr, *edev;
 
@@ -709,17 +729,15 @@ int extcon_dev_register(struct extcon_dev *edev)
 			return ret;
 	}
 
-	if (edev->supported_cable) {
-		/* Get size of array */
-		for (index = 0; edev->supported_cable[index]; index++)
-			;
-		edev->max_supported = index;
-	} else {
-		edev->max_supported = 0;
-	}
+	if (!edev->supported_cable)
+		return -EINVAL;
+
+	for (; edev->supported_cable[index] != EXTCON_NONE; index++);
 
+	edev->max_supported = index;
 	if (index > SUPPORTED_CABLE_MAX) {
-		dev_err(&edev->dev, "extcon: maximum number of supported cables exceeded.\n");
+		dev_err(&edev->dev,
+			"exceed the maximum number of supported cables\n");
 		return -EINVAL;
 	}
 
@@ -1070,6 +1088,7 @@ static void __exit extcon_class_exit(void)
 }
 module_exit(extcon_class_exit);
 
+MODULE_AUTHOR("Chanwoo Choi <cw00.choi@samsung.com>");
 MODULE_AUTHOR("Mike Lockwood <lockwood@android.com>");
 MODULE_AUTHOR("Donggeun Kim <dg77.kim@samsung.com>");
 MODULE_AUTHOR("MyungJoo Ham <myungjoo.ham@samsung.com>");
diff --git a/drivers/usb/phy/phy-tahvo.c b/drivers/usb/phy/phy-tahvo.c
index 845f658276b1..1d1bb9ad8ccf 100644
--- a/drivers/usb/phy/phy-tahvo.c
+++ b/drivers/usb/phy/phy-tahvo.c
@@ -60,10 +60,11 @@ struct tahvo_usb {
 	struct extcon_dev	extcon;
 };
 
-static const char *tahvo_cable[] = {
-	"USB-HOST",
-	"USB",
-	NULL,
+static const enum extcon tahvo_cable[] = {
+	EXTCON_USB,
+	EXTCON_USB_HOST,
+
+	EXTCON_NONE,
 };
 
 static ssize_t vbus_state_show(struct device *device,
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 799474d9dc48..85c882f0029e 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -1,6 +1,9 @@
 /*
  *  External connector (extcon) class driver
  *
+ * Copyright (C) 2015 Samsung Electronics
+ * Author: Chanwoo Choi <cw00.choi@samsung.com>
+ *
  * Copyright (C) 2012 Samsung Electronics
  * Author: Donggeun Kim <dg77.kim@samsung.com>
  * Author: MyungJoo Ham <myungjoo.ham@samsung.com>
@@ -27,50 +30,41 @@
 #include <linux/notifier.h>
 #include <linux/sysfs.h>
 
-#define SUPPORTED_CABLE_MAX	32
-#define CABLE_NAME_MAX		30
-
-/*
- * The standard cable name is to help support general notifier
- * and notifiee device drivers to share the common names.
- * Please use standard cable names unless your notifier device has
- * a very unique and abnormal cable or
- * the cable type is supposed to be used with only one unique
- * pair of notifier/notifiee devices.
- *
- * Please add any other "standard" cables used with extcon dev.
- *
- * You may add a dot and number to specify version or specification
- * of the specific cable if it is required. (e.g., "Fast-charger.18"
- * and "Fast-charger.10" for 1.8A and 1.0A chargers)
- * However, the notifiee and notifier should be able to handle such
- * string and if the notifiee can negotiate the protocol or identify,
- * you don't need such convention. This convention is helpful when
- * notifier can distinguish but notifiee cannot.
- */
-enum extcon_cable_name {
-	EXTCON_USB = 0,
-	EXTCON_USB_HOST,
-	EXTCON_TA,			/* Travel Adaptor */
-	EXTCON_FAST_CHARGER,
-	EXTCON_SLOW_CHARGER,
-	EXTCON_CHARGE_DOWNSTREAM,	/* Charging an external device */
-	EXTCON_HDMI,
-	EXTCON_MHL,
-	EXTCON_DVI,
-	EXTCON_VGA,
-	EXTCON_DOCK,
-	EXTCON_LINE_IN,
-	EXTCON_LINE_OUT,
-	EXTCON_MIC_IN,
-	EXTCON_HEADPHONE_OUT,
-	EXTCON_SPDIF_IN,
-	EXTCON_SPDIF_OUT,
-	EXTCON_VIDEO_IN,
-	EXTCON_VIDEO_OUT,
-	EXTCON_MECHANICAL,
+enum extcon {
+	EXTCON_NONE		= 0x0,
+
+	/* USB external connector */
+	EXTCON_USB		= 0x1,
+	EXTCON_USB_HOST		= 0x2,
+
+	/* Charger external connector */
+	EXTCON_TA		= 0x10,
+	EXTCON_FAST_CHARGER	= 0x11,
+	EXTCON_SLOW_CHARGER	= 0x12,
+	EXTCON_CHARGE_DOWNSTREAM = 0x13,
+
+	/* Audio/Video external connector */
+	EXTCON_LINE_IN		= 0x20,
+	EXTCON_LINE_OUT		= 0x21,
+	EXTCON_MICROPHONE	= 0x22,
+	EXTCON_HEADPHONE	= 0x23,
+
+	EXTCON_HDMI		= 0x30,
+	EXTCON_MHL		= 0x31,
+	EXTCON_DVI		= 0x32,
+	EXTCON_VGA		= 0x33,
+	EXTCON_SPDIF_IN		= 0x34,
+	EXTCON_SPDIF_OUT	= 0x35,
+	EXTCON_VIDEO_IN		= 0x36,
+	EXTCON_VIDEO_OUT	= 0x37,
+
+	/* Etc external connector */
+	EXTCON_DOCK		= 0x50,
+	EXTCON_JIG		= 0x51,
+	EXTCON_MECHANICAL	= 0x52,
+
+	EXTCON_END,
 };
-extern const char extcon_cable_name[][CABLE_NAME_MAX + 1];
 
 struct extcon_cable;
 
@@ -78,7 +72,7 @@ struct extcon_cable;
  * struct extcon_dev - An extcon device represents one external connector.
  * @name:		The name of this extcon device. Parent device name is
  *			used if NULL.
- * @supported_cable:	Array of supported cable names ending with NULL.
+ * @supported_cable:	Array of supported cable names ending with EXTCON_NONE.
  *			If supported_cable is NULL, cable name related APIs
  *			are disabled.
  * @mutually_exclusive:	Array of mutually exclusive set of cables that cannot
@@ -113,7 +107,7 @@ struct extcon_cable;
 struct extcon_dev {
 	/* Optional user initializing data */
 	const char *name;
-	const char **supported_cable;
+	const enum extcon *supported_cable;
 	const u32 *mutually_exclusive;
 
 	/* Optional callbacks to override class functions */
@@ -194,10 +188,10 @@ extern struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name);
 /*
  * Following APIs control the memory of extcon device.
  */
-extern struct extcon_dev *extcon_dev_allocate(const char **cables);
+extern struct extcon_dev *extcon_dev_allocate(const enum extcon *cable);
 extern void extcon_dev_free(struct extcon_dev *edev);
 extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev,
-						   const char **cables);
+						   const enum extcon *cable);
 extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev);
 
 /*
@@ -216,13 +210,10 @@ extern int extcon_update_state(struct extcon_dev *edev, u32 mask, u32 state);
 
 /*
  * get/set_cable_state access each bit of the 32b encoded state value.
- * They are used to access the status of each cable based on the cable_name
- * or cable_index, which is retrieved by extcon_find_cable_index
+ * They are used to access the status of each cable based on the cable_name.
  */
-extern int extcon_find_cable_index(struct extcon_dev *sdev,
-				   const char *cable_name);
-extern int extcon_get_cable_state_(struct extcon_dev *edev, int cable_index);
-extern int extcon_set_cable_state_(struct extcon_dev *edev, int cable_index,
+extern int extcon_get_cable_state_(struct extcon_dev *edev, enum extcon id);
+extern int extcon_set_cable_state_(struct extcon_dev *edev, enum extcon id,
 				   bool cable_state);
 
 extern int extcon_get_cable_state(struct extcon_dev *edev,
@@ -281,7 +272,7 @@ static inline int devm_extcon_dev_register(struct device *dev,
 static inline void devm_extcon_dev_unregister(struct device *dev,
 					      struct extcon_dev *edev) { }
 
-static inline struct extcon_dev *extcon_dev_allocate(const char **cables)
+static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -289,7 +280,7 @@ static inline struct extcon_dev *extcon_dev_allocate(const char **cables)
 static inline void extcon_dev_free(struct extcon_dev *edev) { }
 
 static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev,
-							  const char **cables)
+						const enum extcon *cable)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -312,20 +303,14 @@ static inline int extcon_update_state(struct extcon_dev *edev, u32 mask,
 	return 0;
 }
 
-static inline int extcon_find_cable_index(struct extcon_dev *edev,
-					  const char *cable_name)
-{
-	return 0;
-}
-
 static inline int extcon_get_cable_state_(struct extcon_dev *edev,
-					  int cable_index)
+					  enum extcon id)
 {
 	return 0;
 }
 
 static inline int extcon_set_cable_state_(struct extcon_dev *edev,
-					  int cable_index, bool cable_state)
+					  enum extcon id, bool cable_state)
 {
 	return 0;
 }
diff --git a/include/linux/extcon/extcon-adc-jack.h b/include/linux/extcon/extcon-adc-jack.h
index 9ca958c4e94c..53c60806bcfb 100644
--- a/include/linux/extcon/extcon-adc-jack.h
+++ b/include/linux/extcon/extcon-adc-jack.h
@@ -44,7 +44,7 @@ struct adc_jack_cond {
  * @consumer_channel:	Unique name to identify the channel on the consumer
  *			side. This typically describes the channels used within
  *			the consumer. E.g. 'battery_voltage'
- * @cable_names:	array of cable names ending with null.
+ * @cable_names:	array of extcon id for supported cables.
  * @adc_contitions:	array of struct adc_jack_cond conditions ending
  *			with .state = 0 entry. This describes how to decode
  *			adc values into extcon state.
@@ -58,8 +58,7 @@ struct adc_jack_pdata {
 	const char *name;
 	const char *consumer_channel;
 
-	/* The last entry should be NULL */
-	const char **cable_names;
+	const enum extcon *cable_names;
 
 	/* The last entry's state should be 0 */
 	struct adc_jack_cond *adc_conditions;
-- 
cgit v1.2.3


From 046050f6e623e442e9c71c525462ebd395dae526 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Tue, 19 May 2015 20:01:12 +0900
Subject: extcon: Update the prototype of extcon_register_notifier() with enum
 extcon

Previously, extcon consumer driver used the extcon_register_interest()
to register the notifier chain and then to receive the notifier event
when external connector's state is changed. When registering the notifier chain
for specific external connector with extcon_register_interest(), it used the
the string name of external connector directly. There are potential problem
because of unclear, non-standard and inconsequent cable name. Namely,
it is not appropriate method to identify each external connector.

So, this patch modify the prototype of extcon_register_notifier() by using
the 'enum extcon' which are the unique id for each external connector
instead of unclear string method.

- Previously, the extcon consumer driver used the extcon_register_interest()
with 'cable_name' to point out the specific external connector. Also. it used
the un-needed structure (struct extcon_specific_cable_nb).
: int extcon_register_interest(struct extcon_specific_cable_nb *obj,
			     const char *extcon_name, const char *cable_name,
			     struct notifier_block *nb)

- Newly, the updated extcon_register_notifier() would definitely support
the same feature to detech the changed state of external connector without
any specific structure (struct extcon_specific_cable_nb).
: int extcon_register_notifier(struct extcon_dev *edev, enum extcon id,
			     struct notifier_block *nb)

This patch support the both extcon_register_interest() and new extcon_register_
notifier(). But the extcon_{register|unregister}_interest() will be deprecated
because extcon core would support the notifier event for extcon consumer driver
with only updated extcon_register_notifier() and 'extcon_specific_cable_nb'
will be removed if there are no extcon consumer driver with legacy
extcon_{register|unregister}_interest().

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Reviewed-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
---
 drivers/extcon/extcon.c | 91 ++++++++++++++++++++++++++-----------------------
 include/linux/extcon.h  | 17 +++++----
 2 files changed, 56 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index fce2687bde8e..5099c11d8833 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -146,6 +146,16 @@ static int find_cable_index_by_name(struct extcon_dev *edev, const char *name)
 	return find_cable_index_by_id(edev, id);
 }
 
+static bool is_extcon_changed(u32 prev, u32 new, int idx, bool *attached)
+{
+	if (((prev >> idx) & 0x1) != ((new >> idx) & 0x1)) {
+		*attached = new ? true : false;
+		return true;
+	}
+
+	return false;
+}
+
 static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 			  char *buf)
 {
@@ -254,23 +264,27 @@ int extcon_update_state(struct extcon_dev *edev, u32 mask, u32 state)
 	char *envp[3];
 	int env_offset = 0;
 	int length;
+	int index;
 	unsigned long flags;
+	bool attached;
 
 	spin_lock_irqsave(&edev->lock, flags);
 
 	if (edev->state != ((edev->state & ~mask) | (state & mask))) {
-		u32 old_state = edev->state;
-
 		if (check_mutually_exclusive(edev, (edev->state & ~mask) |
 						   (state & mask))) {
 			spin_unlock_irqrestore(&edev->lock, flags);
 			return -EPERM;
 		}
 
+		for (index = 0; index < edev->max_supported; index++) {
+			if (is_extcon_changed(edev->state, state, index, &attached))
+				raw_notifier_call_chain(&edev->nh[index], attached, edev);
+		}
+
 		edev->state &= ~mask;
 		edev->state |= state & mask;
 
-		raw_notifier_call_chain(&edev->nh, old_state, edev);
 		/* This could be in interrupt handler */
 		prop_buf = (char *)get_zeroed_page(GFP_ATOMIC);
 		if (prop_buf) {
@@ -423,29 +437,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(extcon_get_extcon_dev);
 
-static int _call_per_cable(struct notifier_block *nb, unsigned long val,
-			   void *ptr)
-{
-	struct extcon_specific_cable_nb *obj = container_of(nb,
-			struct extcon_specific_cable_nb, internal_nb);
-	struct extcon_dev *edev = ptr;
-
-	if ((val & (1 << obj->cable_index)) !=
-	    (edev->state & (1 << obj->cable_index))) {
-		bool cable_state = true;
-
-		obj->previous_value = val;
-
-		if (val & (1 << obj->cable_index))
-			cable_state = false;
-
-		return obj->user_nb->notifier_call(obj->user_nb,
-				cable_state, ptr);
-	}
-
-	return NOTIFY_OK;
-}
-
 /**
  * extcon_register_interest() - Register a notifier for a state change of a
  *				specific cable, not an entier set of cables of a
@@ -491,11 +482,10 @@ int extcon_register_interest(struct extcon_specific_cable_nb *obj,
 
 		obj->user_nb = nb;
 
-		obj->internal_nb.notifier_call = _call_per_cable;
-
 		spin_lock_irqsave(&obj->edev->lock, flags);
-		ret = raw_notifier_chain_register(&obj->edev->nh,
-						  &obj->internal_nb);
+		ret = raw_notifier_chain_register(
+					&obj->edev->nh[obj->cable_index],
+					obj->user_nb);
 		spin_unlock_irqrestore(&obj->edev->lock, flags);
 	} else {
 		struct class_dev_iter iter;
@@ -538,7 +528,8 @@ int extcon_unregister_interest(struct extcon_specific_cable_nb *obj)
 		return -EINVAL;
 
 	spin_lock_irqsave(&obj->edev->lock, flags);
-	ret = raw_notifier_chain_unregister(&obj->edev->nh, &obj->internal_nb);
+	ret = raw_notifier_chain_unregister(
+			&obj->edev->nh[obj->cable_index], obj->user_nb);
 	spin_unlock_irqrestore(&obj->edev->lock, flags);
 
 	return ret;
@@ -548,21 +539,24 @@ EXPORT_SYMBOL_GPL(extcon_unregister_interest);
 /**
  * extcon_register_notifier() - Register a notifiee to get notified by
  *				any attach status changes from the extcon.
- * @edev:	the extcon device.
+ * @edev:	the extcon device that has the external connecotr.
+ * @id:		the unique id of each external connector in extcon enumeration.
  * @nb:		a notifier block to be registered.
  *
  * Note that the second parameter given to the callback of nb (val) is
  * "old_state", not the current state. The current state can be retrieved
  * by looking at the third pameter (edev pointer)'s state value.
  */
-int extcon_register_notifier(struct extcon_dev *edev,
-			struct notifier_block *nb)
+int extcon_register_notifier(struct extcon_dev *edev, enum extcon id,
+			     struct notifier_block *nb)
 {
 	unsigned long flags;
-	int ret;
+	int ret, idx;
+
+	idx = find_cable_index_by_id(edev, id);
 
 	spin_lock_irqsave(&edev->lock, flags);
-	ret = raw_notifier_chain_register(&edev->nh, nb);
+	ret = raw_notifier_chain_register(&edev->nh[idx], nb);
 	spin_unlock_irqrestore(&edev->lock, flags);
 
 	return ret;
@@ -571,17 +565,20 @@ EXPORT_SYMBOL_GPL(extcon_register_notifier);
 
 /**
  * extcon_unregister_notifier() - Unregister a notifiee from the extcon device.
- * @edev:	the extcon device.
- * @nb:		a registered notifier block to be unregistered.
+ * @edev:	the extcon device that has the external connecotr.
+ * @id:		the unique id of each external connector in extcon enumeration.
+ * @nb:		a notifier block to be registered.
  */
-int extcon_unregister_notifier(struct extcon_dev *edev,
-			struct notifier_block *nb)
+int extcon_unregister_notifier(struct extcon_dev *edev, enum extcon id,
+				struct notifier_block *nb)
 {
 	unsigned long flags;
-	int ret;
+	int ret, idx;
+
+	idx = find_cable_index_by_id(edev, id);
 
 	spin_lock_irqsave(&edev->lock, flags);
-	ret = raw_notifier_chain_unregister(&edev->nh, nb);
+	ret = raw_notifier_chain_unregister(&edev->nh[idx], nb);
 	spin_unlock_irqrestore(&edev->lock, flags);
 
 	return ret;
@@ -893,7 +890,15 @@ int extcon_dev_register(struct extcon_dev *edev)
 
 	spin_lock_init(&edev->lock);
 
-	RAW_INIT_NOTIFIER_HEAD(&edev->nh);
+	edev->nh = devm_kzalloc(&edev->dev,
+			sizeof(*edev->nh) * edev->max_supported, GFP_KERNEL);
+	if (!edev->nh) {
+		ret = -ENOMEM;
+		goto err_dev;
+	}
+
+	for (index = 0; index < edev->max_supported; index++)
+		RAW_INIT_NOTIFIER_HEAD(&edev->nh[index]);
 
 	dev_set_drvdata(&edev->dev, edev);
 	edev->state = 0;
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 85c882f0029e..be9652b3a154 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -116,7 +116,7 @@ struct extcon_dev {
 
 	/* Internal data. Please do not set. */
 	struct device dev;
-	struct raw_notifier_head nh;
+	struct raw_notifier_head *nh;
 	struct list_head entry;
 	int max_supported;
 	spinlock_t lock;	/* could be called by irq handler */
@@ -155,8 +155,6 @@ struct extcon_cable {
 /**
  * struct extcon_specific_cable_nb - An internal data for
  *				     extcon_register_interest().
- * @internal_nb:	A notifier block bridging extcon notifier
- *			and cable notifier.
  * @user_nb:		user provided notifier block for events from
  *			a specific cable.
  * @cable_index:	the target cable.
@@ -164,7 +162,6 @@ struct extcon_cable {
  * @previous_value:	the saved previous event value.
  */
 struct extcon_specific_cable_nb {
-	struct notifier_block internal_nb;
 	struct notifier_block *user_nb;
 	int cable_index;
 	struct extcon_dev *edev;
@@ -240,10 +237,10 @@ extern int extcon_unregister_interest(struct extcon_specific_cable_nb *nb);
  * we do not recommend to use this for normal 'notifiee' device drivers who
  * want to be notified by a specific external port of the notifier.
  */
-extern int extcon_register_notifier(struct extcon_dev *edev,
+extern int extcon_register_notifier(struct extcon_dev *edev, enum extcon id,
+				    struct notifier_block *nb);
+extern int extcon_unregister_notifier(struct extcon_dev *edev, enum extcon id,
 				    struct notifier_block *nb);
-extern int extcon_unregister_notifier(struct extcon_dev *edev,
-				      struct notifier_block *nb);
 
 /*
  * Following API get the extcon device from devicetree.
@@ -333,13 +330,15 @@ static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name)
 }
 
 static inline int extcon_register_notifier(struct extcon_dev *edev,
-					   struct notifier_block *nb)
+					enum extcon id,
+					struct notifier_block *nb)
 {
 	return 0;
 }
 
 static inline int extcon_unregister_notifier(struct extcon_dev *edev,
-					     struct notifier_block *nb)
+					enum extcon id,
+					struct notifier_block *nb)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 668abc729fcb9d034eccadf63166d2c76cd645d1 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Thu, 21 May 2015 17:42:43 +0100
Subject: regmap: Introduce regmap_get_max_register

This patch introduces regmap_get_max_register() function which would be
used by the infrastructures like nvmem framework built on top of
regmap.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 12 ++++++++++++
 include/linux/regmap.h       |  7 +++++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 6273ff072f3e..d6c84047b957 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -2613,6 +2613,18 @@ int regmap_get_val_bytes(struct regmap *map)
 }
 EXPORT_SYMBOL_GPL(regmap_get_val_bytes);
 
+/**
+ * regmap_get_max_register(): Report the max register value
+ *
+ * Report the max register value, mainly intended to for use by
+ * generic infrastructure built on top of regmap.
+ */
+int regmap_get_max_register(struct regmap *map)
+{
+	return map->max_register ? map->max_register : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(regmap_get_max_register);
+
 int regmap_parse_val(struct regmap *map, const void *buf,
 			unsigned int *val)
 {
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 116655d92269..2d87deda79cd 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -433,6 +433,7 @@ int regmap_update_bits_check_async(struct regmap *map, unsigned int reg,
 				   unsigned int mask, unsigned int val,
 				   bool *change);
 int regmap_get_val_bytes(struct regmap *map);
+int regmap_get_max_register(struct regmap *map);
 int regmap_async_complete(struct regmap *map);
 bool regmap_can_raw_write(struct regmap *map);
 
@@ -676,6 +677,12 @@ static inline int regmap_get_val_bytes(struct regmap *map)
 	return -EINVAL;
 }
 
+static inline int regmap_get_max_register(struct regmap *map)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
 static inline int regcache_sync(struct regmap *map)
 {
 	WARN_ONCE(1, "regmap API is disabled");
-- 
cgit v1.2.3


From a2f776cbb8271d7149784207da0b0c51e8b1847c Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Date: Thu, 21 May 2015 17:42:54 +0100
Subject: regmap: Introduce regmap_get_reg_stride

This patch introduces regmap_get_reg_stride() function which would
be used by the infrastructures like nvmem framework built on top of
regmap. Mostly this function would be used for sanity checks on inputs
within such infrastructure.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 12 ++++++++++++
 include/linux/regmap.h       |  7 +++++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index d6c84047b957..e5eef6f1408a 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -2625,6 +2625,18 @@ int regmap_get_max_register(struct regmap *map)
 }
 EXPORT_SYMBOL_GPL(regmap_get_max_register);
 
+/**
+ * regmap_get_reg_stride(): Report the register address stride
+ *
+ * Report the register address stride, mainly intended to for use by
+ * generic infrastructure built on top of regmap.
+ */
+int regmap_get_reg_stride(struct regmap *map)
+{
+	return map->reg_stride;
+}
+EXPORT_SYMBOL_GPL(regmap_get_reg_stride);
+
 int regmap_parse_val(struct regmap *map, const void *buf,
 			unsigned int *val)
 {
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 2d87deda79cd..59c55ea0f0b5 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -434,6 +434,7 @@ int regmap_update_bits_check_async(struct regmap *map, unsigned int reg,
 				   bool *change);
 int regmap_get_val_bytes(struct regmap *map);
 int regmap_get_max_register(struct regmap *map);
+int regmap_get_reg_stride(struct regmap *map);
 int regmap_async_complete(struct regmap *map);
 bool regmap_can_raw_write(struct regmap *map);
 
@@ -683,6 +684,12 @@ static inline int regmap_get_max_register(struct regmap *map)
 	return -EINVAL;
 }
 
+static inline int regmap_get_reg_stride(struct regmap *map)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
 static inline int regcache_sync(struct regmap *map)
 {
 	WARN_ONCE(1, "regmap API is disabled");
-- 
cgit v1.2.3


From 69eb0980ab4ced06f7c2b4774575337ce32912fb Mon Sep 17 00:00:00 2001
From: Laxman Dewangan <ldewangan@nvidia.com>
Date: Thu, 23 Apr 2015 16:10:24 +0530
Subject: regulator: max8973: add mechanism to enable/disable through GPIO

MAX8973 supports the voltage output enable/disable through its EN
pin. This EN pin can be connected through GPIO from host processor.
Add support to provide GPIO number from platform/DT and if it is
valid GPIO then enable external control default.

Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 .../bindings/regulator/max8973-regulator.txt       |  2 ++
 drivers/regulator/max8973-regulator.c              | 29 +++++++++++++++++-----
 include/linux/regulator/max8973-regulator.h        |  4 +++
 3 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/regulator/max8973-regulator.txt b/Documentation/devicetree/bindings/regulator/max8973-regulator.txt
index f63de7d8dc23..201a26337387 100644
--- a/Documentation/devicetree/bindings/regulator/max8973-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/max8973-regulator.txt
@@ -12,6 +12,8 @@ Optional properties:
 
 -maxim,externally-enable: boolean, externally control the regulator output
 		enable/disable.
+-maxim,enable-gpio: GPIO for enable control. If the valid GPIO is provided
+		then externally enable control will be considered.
 -maxim,dvs-gpio: GPIO which is connected to DVS pin of device.
 -maxim,dvs-default-state: Default state of GPIO during initialisation.
 		1 for HIGH and 0 for LOW.
diff --git a/drivers/regulator/max8973-regulator.c b/drivers/regulator/max8973-regulator.c
index 1442f920c80e..00cf91cc4c85 100644
--- a/drivers/regulator/max8973-regulator.c
+++ b/drivers/regulator/max8973-regulator.c
@@ -96,6 +96,7 @@ struct max8973_chip {
 	struct regulator_desc desc;
 	struct regmap *regmap;
 	bool enable_external_control;
+	int enable_gpio;
 	int dvs_gpio;
 	int lru_index[MAX8973_MAX_VOUT_REG];
 	int curr_vout_val[MAX8973_MAX_VOUT_REG];
@@ -379,6 +380,7 @@ static struct max8973_regulator_platform_data *max8973_parse_dt(
 
 	pdata->enable_ext_control = of_property_read_bool(np,
 						"maxim,externally-enable");
+	pdata->enable_gpio = of_get_named_gpio(np, "maxim,enable-gpio", 0);
 	pdata->dvs_gpio = of_get_named_gpio(np, "maxim,dvs-gpio", 0);
 
 	ret = of_property_read_u32(np, "maxim,dvs-default-state", &pval);
@@ -409,6 +411,7 @@ static int max8973_probe(struct i2c_client *client,
 			 const struct i2c_device_id *id)
 {
 	struct max8973_regulator_platform_data *pdata;
+	struct regulator_init_data *ridata;
 	struct regulator_config config = { };
 	struct regulator_dev *rdev;
 	struct max8973_chip *max;
@@ -427,7 +430,8 @@ static int max8973_probe(struct i2c_client *client,
 		return -EIO;
 	}
 
-	if (pdata->dvs_gpio == -EPROBE_DEFER)
+	if ((pdata->dvs_gpio == -EPROBE_DEFER) ||
+		(pdata->enable_gpio == -EPROBE_DEFER))
 		return -EPROBE_DEFER;
 
 	max = devm_kzalloc(&client->dev, sizeof(*max), GFP_KERNEL);
@@ -453,6 +457,15 @@ static int max8973_probe(struct i2c_client *client,
 	max->desc.uV_step = MAX8973_VOLATGE_STEP;
 	max->desc.n_voltages = MAX8973_BUCK_N_VOLTAGE;
 
+	max->dvs_gpio = (pdata->dvs_gpio) ? pdata->dvs_gpio : -EINVAL;
+	max->enable_gpio = (pdata->enable_gpio) ? pdata->enable_gpio : -EINVAL;
+	max->enable_external_control = pdata->enable_ext_control;
+	max->curr_gpio_val = pdata->dvs_def_state;
+	max->curr_vout_reg = MAX8973_VOUT + pdata->dvs_def_state;
+
+	if (gpio_is_valid(max->enable_gpio))
+		max->enable_external_control = true;
+
 	if (!pdata->enable_ext_control) {
 		max->desc.enable_reg = MAX8973_VOUT;
 		max->desc.enable_mask = MAX8973_VOUT_ENABLE;
@@ -461,11 +474,6 @@ static int max8973_probe(struct i2c_client *client,
 		max->ops.is_enabled = regulator_is_enabled_regmap;
 	}
 
-	max->dvs_gpio = (pdata->dvs_gpio) ? pdata->dvs_gpio : -EINVAL;
-	max->enable_external_control = pdata->enable_ext_control;
-	max->curr_gpio_val = pdata->dvs_def_state;
-	max->curr_vout_reg = MAX8973_VOUT + pdata->dvs_def_state;
-
 	max->lru_index[0] = max->curr_vout_reg;
 
 	if (gpio_is_valid(max->dvs_gpio)) {
@@ -509,6 +517,15 @@ static int max8973_probe(struct i2c_client *client,
 	config.of_node = client->dev.of_node;
 	config.regmap = max->regmap;
 
+	if (gpio_is_valid(max->enable_gpio)) {
+		ridata = pdata->reg_init_data;
+		config.ena_gpio_flags = GPIOF_OUT_INIT_LOW;
+		if (ridata && (ridata->constraints.always_on ||
+			ridata->constraints.boot_on))
+			config.ena_gpio_flags = GPIOF_OUT_INIT_HIGH;
+		config.ena_gpio = max->enable_gpio;
+	}
+
 	/* Register the regulators */
 	rdev = devm_regulator_register(&client->dev, &max->desc, &config);
 	if (IS_ERR(rdev)) {
diff --git a/include/linux/regulator/max8973-regulator.h b/include/linux/regulator/max8973-regulator.h
index f8acc052e353..f6a8a16a0d4d 100644
--- a/include/linux/regulator/max8973-regulator.h
+++ b/include/linux/regulator/max8973-regulator.h
@@ -58,6 +58,9 @@
  *		control signal from EN input pin. If it is false then
  *		voltage output will be enabled/disabled through EN bit of
  *		device register.
+ * @enable_gpio: Enable GPIO. If EN pin is controlled through GPIO from host
+ *		then GPIO number can be provided. If no GPIO controlled then
+ *		it should be -1.
  * @dvs_gpio: GPIO for dvs. It should be -1 if this is tied with fixed logic.
  * @dvs_def_state: Default state of dvs. 1 if it is high else 0.
  */
@@ -65,6 +68,7 @@ struct max8973_regulator_platform_data {
 	struct regulator_init_data *reg_init_data;
 	unsigned long control_flags;
 	bool enable_ext_control;
+	int enable_gpio;
 	int dvs_gpio;
 	unsigned dvs_def_state:1;
 };
-- 
cgit v1.2.3


From f705f837c58ebe1ea69dfffff4dcc234e2fbc8dd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 May 2015 11:12:38 +0200
Subject: nvme: consolidate synchronous command submission helpers

Note that we keep the unused timeout argument, but allow callers to
pass 0 instead of a timeout if they want the default.  This will allow
adding a timeout to the pass through path later on.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 96 ++++++++++++++++-------------------------------
 drivers/block/nvme-scsi.c | 17 ++++-----
 include/linux/nvme.h      |  6 +--
 3 files changed, 42 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 00e641937a8e..e81b205ffd04 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -991,27 +991,40 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
  */
-static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
-						u32 *result, unsigned timeout)
+static int __nvme_submit_sync_cmd(struct request_queue *q,
+		struct nvme_command *cmd, u32 *result, unsigned timeout)
 {
 	struct sync_cmd_info cmdinfo;
-	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
-	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
+	struct nvme_cmd_info *cmd_rq;
+	struct request *req;
+	int res;
+
+	req = blk_mq_alloc_request(q, WRITE, GFP_KERNEL, false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	cmdinfo.task = current;
 	cmdinfo.status = -EINTR;
 
 	cmd->common.command_id = req->tag;
 
+	cmd_rq = blk_mq_rq_to_pdu(req);
 	nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
 
 	set_current_state(TASK_UNINTERRUPTIBLE);
-	nvme_submit_cmd(nvmeq, cmd);
+	nvme_submit_cmd(cmd_rq->nvmeq, cmd);
 	schedule();
 
 	if (result)
 		*result = cmdinfo.result;
-	return cmdinfo.status;
+	res = cmdinfo.status;
+	blk_mq_free_request(req);
+	return res;
+}
+
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd)
+{
+	return __nvme_submit_sync_cmd(q, cmd, NULL, 0);
 }
 
 static int nvme_submit_async_admin_req(struct nvme_dev *dev)
@@ -1060,41 +1073,6 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
 	return nvme_submit_cmd(nvmeq, cmd);
 }
 
-static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
-						u32 *result, unsigned timeout)
-{
-	int res;
-	struct request *req;
-
-	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-	res = nvme_submit_sync_cmd(req, cmd, result, timeout);
-	blk_mq_free_request(req);
-	return res;
-}
-
-int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
-								u32 *result)
-{
-	return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
-}
-
-int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
-					struct nvme_command *cmd, u32 *result)
-{
-	int res;
-	struct request *req;
-
-	req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT),
-									false);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-	res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
-	blk_mq_free_request(req);
-	return res;
-}
-
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 {
 	struct nvme_command c;
@@ -1103,7 +1081,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 	c.delete_queue.opcode = opcode;
 	c.delete_queue.qid = cpu_to_le16(id);
 
-	return nvme_submit_admin_cmd(dev, &c, NULL);
+	return nvme_submit_sync_cmd(dev->admin_q, &c);
 }
 
 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
@@ -1120,7 +1098,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.cq_flags = cpu_to_le16(flags);
 	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
 
-	return nvme_submit_admin_cmd(dev, &c, NULL);
+	return nvme_submit_sync_cmd(dev->admin_q, &c);
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
@@ -1137,7 +1115,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 	c.create_sq.sq_flags = cpu_to_le16(flags);
 	c.create_sq.cqid = cpu_to_le16(qid);
 
-	return nvme_submit_admin_cmd(dev, &c, NULL);
+	return nvme_submit_sync_cmd(dev->admin_q, &c);
 }
 
 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1161,7 +1139,7 @@ int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
 	c.identify.prp1 = cpu_to_le64(dma_addr);
 	c.identify.cns = cpu_to_le32(cns);
 
-	return nvme_submit_admin_cmd(dev, &c, NULL);
+	return nvme_submit_sync_cmd(dev->admin_q, &c);
 }
 
 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
@@ -1175,7 +1153,7 @@ int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
 	c.features.prp1 = cpu_to_le64(dma_addr);
 	c.features.fid = cpu_to_le32(fid);
 
-	return nvme_submit_admin_cmd(dev, &c, result);
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, result, 0);
 }
 
 int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
@@ -1189,7 +1167,7 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
 	c.features.fid = cpu_to_le32(fid);
 	c.features.dword11 = cpu_to_le32(dword11);
 
-	return nvme_submit_admin_cmd(dev, &c, result);
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, result, 0);
 }
 
 /**
@@ -1813,7 +1791,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
 	c.rw.prp2 = cpu_to_le64(iod->first_dma);
 	c.rw.metadata = cpu_to_le64(meta_dma);
-	status = nvme_submit_io_cmd(dev, ns, &c, NULL);
+	status = nvme_submit_sync_cmd(ns->queue, &c);
  unmap:
 	nvme_unmap_user_pages(dev, write, iod);
 	nvme_free_iod(dev, iod);
@@ -1869,23 +1847,15 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
 								ADMIN_TIMEOUT;
 
-	if (length != cmd.data_len)
+	if (length != cmd.data_len) {
 		status = -ENOMEM;
-	else if (ns) {
-		struct request *req;
-
-		req = blk_mq_alloc_request(ns->queue, WRITE,
-						(GFP_KERNEL|__GFP_WAIT), false);
-		if (IS_ERR(req))
-			status = PTR_ERR(req);
-		else {
-			status = nvme_submit_sync_cmd(req, &c, &cmd.result,
-								timeout);
-			blk_mq_free_request(req);
-		}
-	} else
-		status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
+		goto out;
+	}
+
+	status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
+					&cmd.result, timeout);
 
+out:
 	if (cmd.data_len) {
 		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
 		nvme_free_iod(dev, iod);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 6b736b00f63e..ba1809fbd49e 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -1053,7 +1053,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
 	c.common.prp1 = cpu_to_le64(dma_addr);
 	c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
 			BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
-	res = nvme_submit_admin_cmd(dev, &c, NULL);
+	res = nvme_submit_sync_cmd(dev->admin_q, &c);
 	if (res != NVME_SC_SUCCESS) {
 		temp_c = LOG_TEMP_UNKNOWN;
 	} else {
@@ -1121,7 +1121,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.common.prp1 = cpu_to_le64(dma_addr);
 	c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
 			BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
-	res = nvme_submit_admin_cmd(dev, &c, NULL);
+	res = nvme_submit_sync_cmd(dev->admin_q, &c);
 	if (res != NVME_SC_SUCCESS) {
 		temp_c_cur = LOG_TEMP_UNKNOWN;
 	} else {
@@ -1609,7 +1609,7 @@ static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		c.common.cdw10[0] = cpu_to_le32(cdw10);
 	}
 
-	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+	nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
 		goto out_unmap;
@@ -1971,7 +1971,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.format.nsid = cpu_to_le32(ns->ns_id);
 	c.format.cdw10 = cpu_to_le32(cdw10);
 
-	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+	nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
 		goto out_dma;
@@ -2139,7 +2139,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 
 		nvme_offset += unit_num_blocks;
 
-		nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
+		nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
 		if (nvme_sc != NVME_SC_SUCCESS) {
 			nvme_unmap_user_pages(dev,
 				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
@@ -2696,7 +2696,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 			c.common.opcode = nvme_cmd_flush;
 			c.common.nsid = cpu_to_le32(ns->ns_id);
 
-			nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
+			nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
 			res = nvme_trans_status_code(hdr, nvme_sc);
 			if (res)
 				goto out;
@@ -2724,8 +2724,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
 	c.common.opcode = nvme_cmd_flush;
 	c.common.nsid = cpu_to_le32(ns->ns_id);
 
-	nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
-
+	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
 		goto out;
@@ -2932,7 +2931,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.dsm.nr = cpu_to_le32(ndesc - 1);
 	c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
+	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 
 	dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 8dbd05e70f09..61488b2ae291 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -158,11 +158,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 				unsigned long addr, unsigned length);
 void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
 			struct nvme_iod *iod);
-int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *,
-						struct nvme_command *, u32 *);
-int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns);
-int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *,
-							u32 *result);
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd);
 int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns,
 							dma_addr_t dma_addr);
 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
-- 
cgit v1.2.3


From e75ec752d725b7b612c0b2db1bca50a9e53c0879 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 May 2015 11:12:39 +0200
Subject: nvme: store a struct device pointer in struct nvme_dev

Most users want the generic device, so store that in struct nvme_dev
instead of the pci_dev.  This also happens to be a nice step towards
making some code reusable for non-PCI transports.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 110 ++++++++++++++++++++++------------------------
 drivers/block/nvme-scsi.c |  63 +++++++++++---------------
 include/linux/nvme.h      |   2 +-
 3 files changed, 79 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e81b205ffd04..870a926e1ddc 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -610,17 +610,17 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 		req->errors = 0;
 
 	if (cmd_rq->aborted)
-		dev_warn(&nvmeq->dev->pci_dev->dev,
+		dev_warn(nvmeq->dev->dev,
 			"completing aborted command with status:%04x\n",
 			status);
 
 	if (iod->nents) {
-		dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
+		dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents,
 			rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 		if (blk_integrity_rq(req)) {
 			if (!rq_data_dir(req))
 				nvme_dif_remap(req, nvme_dif_complete);
-			dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1,
+			dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1,
 				rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 		}
 	}
@@ -861,7 +861,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 		if (blk_rq_bytes(req) !=
                     nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
-			dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg,
+			dma_unmap_sg(nvmeq->dev->dev, iod->sg,
 					iod->nents, dma_dir);
 			goto retry_cmd;
 		}
@@ -1192,8 +1192,7 @@ static void nvme_abort_req(struct request *req)
 		if (work_busy(&dev->reset_work))
 			goto out;
 		list_del_init(&dev->node);
-		dev_warn(&dev->pci_dev->dev,
-			"I/O %d QID %d timeout, reset controller\n",
+		dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n",
 							req->tag, nvmeq->qid);
 		dev->reset_workfn = nvme_reset_failed_dev;
 		queue_work(nvme_workq, &dev->reset_work);
@@ -1362,22 +1361,21 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 							int depth)
 {
-	struct device *dmadev = &dev->pci_dev->dev;
 	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
 	if (!nvmeq)
 		return NULL;
 
-	nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth),
+	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
 					  &nvmeq->cq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->cqes)
 		goto free_nvmeq;
 
-	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
+	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
 					&nvmeq->sq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->sq_cmds)
 		goto free_cqdma;
 
-	nvmeq->q_dmadev = dmadev;
+	nvmeq->q_dmadev = dev->dev;
 	nvmeq->dev = dev;
 	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
 			dev->instance, qid);
@@ -1393,7 +1391,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 	return nvmeq;
 
  free_cqdma:
-	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
+	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
 							nvmeq->cq_dma_addr);
  free_nvmeq:
 	kfree(nvmeq);
@@ -1465,7 +1463,7 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
 		if (fatal_signal_pending(current))
 			return -EINTR;
 		if (time_after(jiffies, timeout)) {
-			dev_err(&dev->pci_dev->dev,
+			dev_err(dev->dev,
 				"Device not ready; aborting %s\n", enabled ?
 						"initialisation" : "reset");
 			return -ENODEV;
@@ -1515,7 +1513,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
 		if (fatal_signal_pending(current))
 			return -EINTR;
 		if (time_after(jiffies, timeout)) {
-			dev_err(&dev->pci_dev->dev,
+			dev_err(dev->dev,
 				"Device shutdown incomplete; abort shutdown\n");
 			return -ENODEV;
 		}
@@ -1558,7 +1556,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
 		dev->admin_tagset.reserved_tags = 1;
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
-		dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
 		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
 		dev->admin_tagset.driver_data = dev;
 
@@ -1591,14 +1589,14 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
 
 	if (page_shift < dev_page_min) {
-		dev_err(&dev->pci_dev->dev,
+		dev_err(dev->dev,
 				"Minimum device page size (%u) too large for "
 				"host (%u)\n", 1 << dev_page_min,
 				1 << page_shift);
 		return -ENODEV;
 	}
 	if (page_shift > dev_page_max) {
-		dev_info(&dev->pci_dev->dev,
+		dev_info(dev->dev,
 				"Device maximum page size (%u) smaller than "
 				"host (%u); enabling work-around\n",
 				1 << dev_page_max, 1 << page_shift);
@@ -1689,7 +1687,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	sg_mark_end(&sg[i - 1]);
 	iod->nents = count;
 
-	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
+	nents = dma_map_sg(dev->dev, sg, count,
 				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 	if (!nents)
 		goto free_iod;
@@ -1711,7 +1709,7 @@ void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
 {
 	int i;
 
-	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+	dma_unmap_sg(dev->dev, iod->sg, iod->nents,
 				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 
 	for (i = 0; i < iod->nents; i++)
@@ -1762,7 +1760,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 		goto unmap;
 	}
 	if (meta_len) {
-		meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
+		meta = dma_alloc_coherent(dev->dev, meta_len,
 						&meta_dma, GFP_KERNEL);
 		if (!meta) {
 			status = -ENOMEM;
@@ -1801,7 +1799,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 								meta_len))
 				status = -EFAULT;
 		}
-		dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma);
+		dma_free_coherent(dev->dev, meta_len, meta, meta_dma);
 	}
 	return status;
 }
@@ -1961,15 +1959,13 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	u16 old_ms;
 	unsigned short bs;
 
-	id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
-								GFP_KERNEL);
+	id = dma_alloc_coherent(dev->dev, 4096, &dma_addr, GFP_KERNEL);
 	if (!id) {
-		dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n",
-								__func__);
+		dev_warn(dev->dev, "%s: Memory alocation failure\n", __func__);
 		return 0;
 	}
 	if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
-		dev_warn(&dev->pci_dev->dev,
+		dev_warn(dev->dev,
 			"identify failed ns:%d, setting capacity to 0\n",
 			ns->ns_id);
 		memset(id, 0, sizeof(*id));
@@ -2014,7 +2010,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	if (dev->oncs & NVME_CTRL_ONCS_DSM)
 		nvme_config_discard(ns);
 
-	dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
+	dma_free_coherent(dev->dev, 4096, id, dma_addr);
 	return 0;
 }
 
@@ -2041,7 +2037,7 @@ static int nvme_kthread(void *data)
 				if (work_busy(&dev->reset_work))
 					continue;
 				list_del_init(&dev->node);
-				dev_warn(&dev->pci_dev->dev,
+				dev_warn(dev->dev,
 					"Failed status: %x, reset controller\n",
 					readl(&dev->bar->csts));
 				dev->reset_workfn = nvme_reset_failed_dev;
@@ -2073,7 +2069,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
 {
 	struct nvme_ns *ns;
 	struct gendisk *disk;
-	int node = dev_to_node(&dev->pci_dev->dev);
+	int node = dev_to_node(dev->dev);
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
@@ -2156,8 +2152,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 	if (status < 0)
 		return status;
 	if (status > 0) {
-		dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
-									status);
+		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
 		return 0;
 	}
 	return min(result & 0xffff, result >> 16) + 1;
@@ -2171,7 +2166,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = dev->queues[0];
-	struct pci_dev *pdev = dev->pci_dev;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	int result, i, vecs, nr_io_queues, size;
 
 	nr_io_queues = num_possible_cpus();
@@ -2251,7 +2246,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
  */
 static int nvme_dev_add(struct nvme_dev *dev)
 {
-	struct pci_dev *pdev = dev->pci_dev;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	int res;
 	unsigned nn, i;
 	struct nvme_id_ctrl *ctrl;
@@ -2259,14 +2254,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	dma_addr_t dma_addr;
 	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
 
-	mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL);
+	mem = dma_alloc_coherent(dev->dev, 4096, &dma_addr, GFP_KERNEL);
 	if (!mem)
 		return -ENOMEM;
 
 	res = nvme_identify(dev, 0, 1, dma_addr);
 	if (res) {
-		dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
-		dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+		dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
+		dma_free_coherent(dev->dev, 4096, mem, dma_addr);
 		return -EIO;
 	}
 
@@ -2292,12 +2287,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		} else
 			dev->max_hw_sectors = max_hw_sectors;
 	}
-	dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+	dma_free_coherent(dev->dev, 4096, mem, dma_addr);
 
 	dev->tagset.ops = &nvme_mq_ops;
 	dev->tagset.nr_hw_queues = dev->online_queues - 1;
 	dev->tagset.timeout = NVME_IO_TIMEOUT;
-	dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+	dev->tagset.numa_node = dev_to_node(dev->dev);
 	dev->tagset.queue_depth =
 				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
 	dev->tagset.cmd_size = nvme_cmd_size(dev);
@@ -2317,7 +2312,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
 {
 	u64 cap;
 	int bars, result = -ENOMEM;
-	struct pci_dev *pdev = dev->pci_dev;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pci_enable_device_mem(pdev))
 		return result;
@@ -2331,8 +2326,8 @@ static int nvme_dev_map(struct nvme_dev *dev)
 	if (pci_request_selected_regions(pdev, bars, "nvme"))
 		goto disable_pci;
 
-	if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
-	    dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
+	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
+	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
 		goto disable;
 
 	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
@@ -2373,19 +2368,21 @@ static int nvme_dev_map(struct nvme_dev *dev)
 
 static void nvme_dev_unmap(struct nvme_dev *dev)
 {
-	if (dev->pci_dev->msi_enabled)
-		pci_disable_msi(dev->pci_dev);
-	else if (dev->pci_dev->msix_enabled)
-		pci_disable_msix(dev->pci_dev);
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	if (pdev->msi_enabled)
+		pci_disable_msi(pdev);
+	else if (pdev->msix_enabled)
+		pci_disable_msix(pdev);
 
 	if (dev->bar) {
 		iounmap(dev->bar);
 		dev->bar = NULL;
-		pci_release_regions(dev->pci_dev);
+		pci_release_regions(pdev);
 	}
 
-	if (pci_is_enabled(dev->pci_dev))
-		pci_disable_device(dev->pci_dev);
+	if (pci_is_enabled(pdev))
+		pci_disable_device(pdev);
 }
 
 struct nvme_delq_ctx {
@@ -2504,7 +2501,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 					&worker, "nvme%d", dev->instance);
 
 	if (IS_ERR(kworker_task)) {
-		dev_err(&dev->pci_dev->dev,
+		dev_err(dev->dev,
 			"Failed to create queue del task\n");
 		for (i = dev->queue_count - 1; i > 0; i--)
 			nvme_disable_queue(dev, i);
@@ -2622,14 +2619,13 @@ static void nvme_dev_remove(struct nvme_dev *dev)
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
 {
-	struct device *dmadev = &dev->pci_dev->dev;
-	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
+	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
 						PAGE_SIZE, PAGE_SIZE, 0);
 	if (!dev->prp_page_pool)
 		return -ENOMEM;
 
 	/* Optimisation for I/Os between 4k and 128k */
-	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
+	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
 						256, 256, 0);
 	if (!dev->prp_small_pool) {
 		dma_pool_destroy(dev->prp_page_pool);
@@ -2693,7 +2689,7 @@ static void nvme_free_dev(struct kref *kref)
 {
 	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
 
-	pci_dev_put(dev->pci_dev);
+	put_device(dev->dev);
 	put_device(dev->device);
 	nvme_free_namespaces(dev);
 	nvme_release_instance(dev);
@@ -2837,7 +2833,7 @@ static int nvme_dev_start(struct nvme_dev *dev)
 static int nvme_remove_dead_ctrl(void *arg)
 {
 	struct nvme_dev *dev = (struct nvme_dev *)arg;
-	struct pci_dev *pdev = dev->pci_dev;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pci_get_drvdata(pdev))
 		pci_stop_and_remove_bus_device_locked(pdev);
@@ -2876,11 +2872,11 @@ static void nvme_dev_reset(struct nvme_dev *dev)
 {
 	nvme_dev_shutdown(dev);
 	if (nvme_dev_resume(dev)) {
-		dev_warn(&dev->pci_dev->dev, "Device failed to resume\n");
+		dev_warn(dev->dev, "Device failed to resume\n");
 		kref_get(&dev->kref);
 		if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
 							dev->instance))) {
-			dev_err(&dev->pci_dev->dev,
+			dev_err(dev->dev,
 				"Failed to start controller remove task\n");
 			kref_put(&dev->kref, nvme_free_dev);
 		}
@@ -2924,7 +2920,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_LIST_HEAD(&dev->namespaces);
 	dev->reset_workfn = nvme_reset_failed_dev;
 	INIT_WORK(&dev->reset_work, nvme_reset_workfn);
-	dev->pci_dev = pci_dev_get(pdev);
+	dev->dev = get_device(&pdev->dev);
 	pci_set_drvdata(pdev, dev);
 	result = nvme_set_instance(dev);
 	if (result)
@@ -2954,7 +2950,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  release:
 	nvme_release_instance(dev);
  put_pci:
-	pci_dev_put(dev->pci_dev);
+	put_device(dev->dev);
  free:
 	kfree(dev->queues);
 	kfree(dev->entry);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index ba1809fbd49e..f1c90f273132 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -684,7 +684,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
 	u8 cmdque = 0x01 << 1;
 	u8 fw_offset = sizeof(dev->firmware_rev);
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
 				&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -728,8 +728,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
 	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 
  out_free:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
+	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
  out_dma:
 	return res;
 }
@@ -787,7 +786,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int xfer_len;
 	__be32 tmp_id = cpu_to_be32(ns->ns_id);
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
 					&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -842,7 +841,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		inq_response[6] = 0x00;    /* Rsvd */
 		inq_response[7] = 0x44;    /* Designator Length */
 
-		sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor);
+		sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor);
 		memcpy(&inq_response[12], dev->model, sizeof(dev->model));
 		sprintf(&inq_response[52], "%04x", tmp_id);
 		memcpy(&inq_response[56], dev->serial, sizeof(dev->serial));
@@ -851,8 +850,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 
  out_free:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
+	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
  out_dma:
 	return res;
 }
@@ -883,7 +881,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		goto out_mem;
 	}
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
 							&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -933,8 +931,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 
  out_free:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
+	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
  out_dma:
 	kfree(inq_response);
  out_mem:
@@ -1038,8 +1035,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
 		goto out_mem;
 	}
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev,
-					sizeof(struct nvme_smart_log),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_smart_log),
 					&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -1077,7 +1073,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
 	xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH);
 	res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
 
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log),
+	dma_free_coherent(dev->dev, sizeof(struct nvme_smart_log),
 			  mem, dma_addr);
  out_dma:
 	kfree(log_response);
@@ -1106,8 +1102,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		goto out_mem;
 	}
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev,
-					sizeof(struct nvme_smart_log),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_smart_log),
 					&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -1158,7 +1153,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH);
 	res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
 
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log),
+	dma_free_coherent(dev->dev, sizeof(struct nvme_smart_log),
 			  mem, dma_addr);
  out_dma:
 	kfree(log_response);
@@ -1209,7 +1204,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN)
 		return SNTI_INTERNAL_ERROR;
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
 							&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -1246,8 +1241,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	}
 
  out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
+	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
  out:
 	return res;
 }
@@ -1494,8 +1488,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	unsigned ps_desired = 0;
 
 	/* NVMe Controller Identify */
-	mem = dma_alloc_coherent(&dev->pci_dev->dev,
-				sizeof(struct nvme_id_ctrl),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ctrl),
 				&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -1556,8 +1549,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	if (nvme_sc)
 		res = nvme_sc;
  out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem,
-			  dma_addr);
+	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ctrl), mem, dma_addr);
  out:
 	return res;
 }
@@ -1820,7 +1812,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 	 */
 
 	if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) {
-		mem = dma_alloc_coherent(&dev->pci_dev->dev,
+		mem = dma_alloc_coherent(dev->dev,
 			sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL);
 		if (mem == NULL) {
 			res = -ENOMEM;
@@ -1845,7 +1837,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 						(1 << (id_ns->lbaf[flbas].ds));
 		}
  out_dma:
-		dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+		dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns),
 				  mem, dma_addr);
 	}
  out:
@@ -1928,7 +1920,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	struct nvme_command c;
 
 	/* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
 							&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -1979,8 +1971,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		res = nvme_sc;
 
  out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
+	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
  out:
 	return res;
 }
@@ -2485,7 +2476,7 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		resp_size = READ_CAP_16_RESP_SIZE;
 	}
 
-	mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
 							&dma_addr, GFP_KERNEL);
 	if (mem == NULL) {
 		res = -ENOMEM;
@@ -2514,8 +2505,7 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 
 	kfree(response);
  out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
-			  dma_addr);
+	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
  out:
 	return res;
 }
@@ -2548,8 +2538,7 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		goto out;
 	} else {
 		/* NVMe Controller Identify */
-		mem = dma_alloc_coherent(&dev->pci_dev->dev,
-					sizeof(struct nvme_id_ctrl),
+		mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ctrl),
 					&dma_addr, GFP_KERNEL);
 		if (mem == NULL) {
 			res = -ENOMEM;
@@ -2600,8 +2589,7 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 
 	kfree(response);
  out_dma:
-	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem,
-			  dma_addr);
+	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ctrl), mem, dma_addr);
  out:
 	return res;
 }
@@ -2913,7 +2901,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		goto out;
 	}
 
-	range = dma_alloc_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
+	range = dma_alloc_coherent(dev->dev, ndesc * sizeof(*range),
 							&dma_addr, GFP_KERNEL);
 	if (!range)
 		goto out;
@@ -2934,8 +2922,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 
-	dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
-							range, dma_addr);
+	dma_free_coherent(dev->dev, ndesc * sizeof(*range), range, dma_addr);
  out:
 	kfree(plist);
 	return res;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 61488b2ae291..de0e49a716b8 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -74,7 +74,7 @@ struct nvme_dev {
 	struct blk_mq_tag_set tagset;
 	struct blk_mq_tag_set admin_tagset;
 	u32 __iomem *dbs;
-	struct pci_dev *pci_dev;
+	struct device *dev;
 	struct dma_pool *prp_page_pool;
 	struct dma_pool *prp_small_pool;
 	int instance;
-- 
cgit v1.2.3


From d29ec8241c10eacf59c23b3828a88dbae06e7e3f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 May 2015 11:12:46 +0200
Subject: nvme: submit internal commands through the block layer

Use block layer queues with an internal cmd_type to submit internally
generated NVMe commands.  This both simplifies the code a lot and allow
for a better structure.  For example now the LighNVM code can construct
commands without knowing the details of the underlying I/O descriptors.
Or a future NVMe over network target could inject commands, as well as
could the SCSI translation and ioctl code be reused for such a beast.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 399 +++++++++++++++++++------------------------
 drivers/block/nvme-scsi.c | 422 +++++++++++++---------------------------------
 include/linux/nvme.h      |  24 +--
 3 files changed, 300 insertions(+), 545 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 870a926e1ddc..03bd638e76dd 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -445,7 +445,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
 				(unsigned long) rq, gfp);
 }
 
-void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
 	int i;
@@ -605,7 +605,12 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 			spin_unlock_irqrestore(req->q->queue_lock, flags);
 			return;
 		}
-		req->errors = nvme_error_status(status);
+		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+			req->sense_len = le32_to_cpup(&cqe->result);
+			req->errors = status;
+		} else {
+			req->errors = nvme_error_status(status);
+		}
 	} else
 		req->errors = 0;
 
@@ -630,8 +635,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 }
 
 /* length is in bytes.  gfp flags indicates whether we may sleep. */
-int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
-								gfp_t gfp)
+static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
+		int total_len, gfp_t gfp)
 {
 	struct dma_pool *pool;
 	int length = total_len;
@@ -709,6 +714,23 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
 	return total_len;
 }
 
+static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
+		struct nvme_iod *iod)
+{
+	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+	memcpy(cmnd, req->cmd, sizeof(struct nvme_command));
+	cmnd->rw.command_id = req->tag;
+	if (req->nr_phys_segments) {
+		cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+		cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
+	}
+
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+}
+
 /*
  * We reuse the small pool to allocate the 16-byte range here as it is not
  * worth having a special pool for these or additional cases to handle freeing
@@ -807,11 +829,15 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 	return 0;
 }
 
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
 static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			 const struct blk_mq_queue_data *bd)
 {
 	struct nvme_ns *ns = hctx->queue->queuedata;
 	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
@@ -822,7 +848,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * unless this namespace is formated such that the metadata can be
 	 * stripped/generated by the controller with PRACT=1.
 	 */
-	if (ns->ms && !blk_integrity_rq(req)) {
+	if (ns && ns->ms && !blk_integrity_rq(req)) {
 		if (!(ns->pi_type && ns->ms == 8)) {
 			req->errors = -EFAULT;
 			blk_mq_complete_request(req);
@@ -830,7 +856,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		}
 	}
 
-	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
 
@@ -841,8 +867,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		 * as it is not worth having a special pool for these or
 		 * additional cases to handle freeing the iod.
 		 */
-		range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
-						GFP_ATOMIC,
+		range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
 						&iod->first_dma);
 		if (!range)
 			goto retry_cmd;
@@ -860,9 +885,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 
 		if (blk_rq_bytes(req) !=
-                    nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
-			dma_unmap_sg(nvmeq->dev->dev, iod->sg,
-					iod->nents, dma_dir);
+                    nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
+			dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
 			goto retry_cmd;
 		}
 		if (blk_integrity_rq(req)) {
@@ -884,7 +908,9 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	nvme_set_info(cmd, iod, req_completion);
 	spin_lock_irq(&nvmeq->q_lock);
-	if (req->cmd_flags & REQ_DISCARD)
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		nvme_submit_priv(nvmeq, req, iod);
+	else if (req->cmd_flags & REQ_DISCARD)
 		nvme_submit_discard(nvmeq, ns, req, iod);
 	else if (req->cmd_flags & REQ_FLUSH)
 		nvme_submit_flush(nvmeq, ns, req->tag);
@@ -896,10 +922,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_OK;
 
  error_cmd:
-	nvme_free_iod(nvmeq->dev, iod);
+	nvme_free_iod(dev, iod);
 	return BLK_MQ_RQ_QUEUE_ERROR;
  retry_cmd:
-	nvme_free_iod(nvmeq->dev, iod);
+	nvme_free_iod(dev, iod);
 	return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
@@ -942,15 +968,6 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
 	return 1;
 }
 
-/* Admin queue isn't initialized as a request queue. If at some point this
- * happens anyway, make sure to notify the user */
-static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx,
-			       const struct blk_mq_queue_data *bd)
-{
-	WARN_ON_ONCE(1);
-	return BLK_MQ_RQ_QUEUE_ERROR;
-}
-
 static irqreturn_t nvme_irq(int irq, void *data)
 {
 	irqreturn_t result;
@@ -972,59 +989,61 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_WAKE_THREAD;
 }
 
-struct sync_cmd_info {
-	struct task_struct *task;
-	u32 result;
-	int status;
-};
-
-static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
-						struct nvme_completion *cqe)
-{
-	struct sync_cmd_info *cmdinfo = ctx;
-	cmdinfo->result = le32_to_cpup(&cqe->result);
-	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
-	wake_up_process(cmdinfo->task);
-}
-
 /*
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
  */
-static int __nvme_submit_sync_cmd(struct request_queue *q,
-		struct nvme_command *cmd, u32 *result, unsigned timeout)
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, void __user *ubuffer, unsigned bufflen,
+		u32 *result, unsigned timeout)
 {
-	struct sync_cmd_info cmdinfo;
-	struct nvme_cmd_info *cmd_rq;
+	bool write = cmd->common.opcode & 1;
+	struct bio *bio = NULL;
 	struct request *req;
-	int res;
+	int ret;
 
-	req = blk_mq_alloc_request(q, WRITE, GFP_KERNEL, false);
+	req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	cmdinfo.task = current;
-	cmdinfo.status = -EINTR;
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
+	req->__data_len = 0;
+	req->__sector = (sector_t) -1;
+	req->bio = req->biotail = NULL;
 
-	cmd->common.command_id = req->tag;
+	req->timeout = ADMIN_TIMEOUT;
 
-	cmd_rq = blk_mq_rq_to_pdu(req);
-	nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
+	req->cmd = (unsigned char *)cmd;
+	req->cmd_len = sizeof(struct nvme_command);
+	req->sense = NULL;
+	req->sense_len = 0;
 
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	nvme_submit_cmd(cmd_rq->nvmeq, cmd);
-	schedule();
+	if (buffer && bufflen) {
+		ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+		if (ret)
+			goto out;
+	} else if (ubuffer && bufflen) {
+		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+		if (ret)
+			goto out;
+		bio = req->bio;
+	}
 
+	blk_execute_rq(req->q, NULL, req, 0);
+	if (bio)
+		blk_rq_unmap_user(bio);
 	if (result)
-		*result = cmdinfo.result;
-	res = cmdinfo.status;
+		*result = req->sense_len;
+	ret = req->errors;
+ out:
 	blk_mq_free_request(req);
-	return res;
+	return ret;
 }
 
-int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd)
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, unsigned bufflen)
 {
-	return __nvme_submit_sync_cmd(q, cmd, NULL, 0);
+	return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
 }
 
 static int nvme_submit_async_admin_req(struct nvme_dev *dev)
@@ -1081,7 +1100,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 	c.delete_queue.opcode = opcode;
 	c.delete_queue.qid = cpu_to_le16(id);
 
-	return nvme_submit_sync_cmd(dev->admin_q, &c);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
@@ -1090,6 +1109,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
 
+	/*
+	 * Note: we (ab)use the fact the the prp fields survive if no data
+	 * is attached to the request.
+	 */
 	memset(&c, 0, sizeof(c));
 	c.create_cq.opcode = nvme_admin_create_cq;
 	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
@@ -1098,7 +1121,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.cq_flags = cpu_to_le16(flags);
 	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
 
-	return nvme_submit_sync_cmd(dev->admin_q, &c);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
@@ -1107,6 +1130,10 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
 
+	/*
+	 * Note: we (ab)use the fact the the prp fields survive if no data
+	 * is attached to the request.
+	 */
 	memset(&c, 0, sizeof(c));
 	c.create_sq.opcode = nvme_admin_create_sq;
 	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
@@ -1115,7 +1142,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 	c.create_sq.sq_flags = cpu_to_le16(flags);
 	c.create_sq.cqid = cpu_to_le16(qid);
 
-	return nvme_submit_sync_cmd(dev->admin_q, &c);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1128,18 +1155,43 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
-							dma_addr_t dma_addr)
+int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
 {
-	struct nvme_command c;
+	struct nvme_command c = {
+		.identify.opcode = nvme_admin_identify,
+		.identify.cns = cpu_to_le32(1),
+	};
+	int error;
 
-	memset(&c, 0, sizeof(c));
-	c.identify.opcode = nvme_admin_identify;
-	c.identify.nsid = cpu_to_le32(nsid);
-	c.identify.prp1 = cpu_to_le64(dma_addr);
-	c.identify.cns = cpu_to_le32(cns);
+	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
 
-	return nvme_submit_sync_cmd(dev->admin_q, &c);
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ctrl));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
+		struct nvme_id_ns **id)
+{
+	struct nvme_command c = {
+		.identify.opcode = nvme_admin_identify,
+		.identify.nsid = cpu_to_le32(nsid),
+	};
+	int error;
+
+	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ns));
+	if (error)
+		kfree(*id);
+	return error;
 }
 
 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
@@ -1153,7 +1205,8 @@ int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
 	c.features.prp1 = cpu_to_le64(dma_addr);
 	c.features.fid = cpu_to_le32(fid);
 
-	return __nvme_submit_sync_cmd(dev->admin_q, &c, result, 0);
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
+			result, 0);
 }
 
 int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
@@ -1167,7 +1220,30 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
 	c.features.fid = cpu_to_le32(fid);
 	c.features.dword11 = cpu_to_le32(dword11);
 
-	return __nvme_submit_sync_cmd(dev->admin_q, &c, result, 0);
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
+			result, 0);
+}
+
+int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
+{
+	struct nvme_command c = {
+		.common.opcode = nvme_admin_get_log_page,
+		.common.nsid = cpu_to_le32(0xFFFFFFFF),
+		.common.cdw10[0] = cpu_to_le32(
+			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
+			 NVME_LOG_SMART),
+	};
+	int error;
+
+	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
+	if (!*log)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
+			sizeof(struct nvme_smart_log));
+	if (error)
+		kfree(*log);
+	return error;
 }
 
 /**
@@ -1523,7 +1599,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
 }
 
 static struct blk_mq_ops nvme_mq_admin_ops = {
-	.queue_rq	= nvme_admin_queue_rq,
+	.queue_rq	= nvme_queue_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_admin_init_hctx,
 	.exit_hctx	= nvme_exit_hctx,
@@ -1644,122 +1720,41 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
-struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
-				unsigned long addr, unsigned length)
-{
-	int i, err, count, nents, offset;
-	struct scatterlist *sg;
-	struct page **pages;
-	struct nvme_iod *iod;
-
-	if (addr & 3)
-		return ERR_PTR(-EINVAL);
-	if (!length || length > INT_MAX - PAGE_SIZE)
-		return ERR_PTR(-EINVAL);
-
-	offset = offset_in_page(addr);
-	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
-	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	err = get_user_pages_fast(addr, count, 1, pages);
-	if (err < count) {
-		count = err;
-		err = -EFAULT;
-		goto put_pages;
-	}
-
-	err = -ENOMEM;
-	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
-	if (!iod)
-		goto put_pages;
-
-	sg = iod->sg;
-	sg_init_table(sg, count);
-	for (i = 0; i < count; i++) {
-		sg_set_page(&sg[i], pages[i],
-			    min_t(unsigned, length, PAGE_SIZE - offset),
-			    offset);
-		length -= (PAGE_SIZE - offset);
-		offset = 0;
-	}
-	sg_mark_end(&sg[i - 1]);
-	iod->nents = count;
-
-	nents = dma_map_sg(dev->dev, sg, count,
-				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-	if (!nents)
-		goto free_iod;
-
-	kfree(pages);
-	return iod;
-
- free_iod:
-	kfree(iod);
- put_pages:
-	for (i = 0; i < count; i++)
-		put_page(pages[i]);
-	kfree(pages);
-	return ERR_PTR(err);
-}
-
-void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
-			struct nvme_iod *iod)
-{
-	int i;
-
-	dma_unmap_sg(dev->dev, iod->sg, iod->nents,
-				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-
-	for (i = 0; i < iod->nents; i++)
-		put_page(sg_page(&iod->sg[i]));
-}
-
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_user_io io;
 	struct nvme_command c;
-	unsigned length, meta_len, prp_len;
+	unsigned length, meta_len;
 	int status, write;
-	struct nvme_iod *iod;
 	dma_addr_t meta_dma = 0;
 	void *meta = NULL;
 
 	if (copy_from_user(&io, uio, sizeof(io)))
 		return -EFAULT;
-	length = (io.nblocks + 1) << ns->lba_shift;
-	meta_len = (io.nblocks + 1) * ns->ms;
-
-	if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext)
-		return -EINVAL;
-	else if (meta_len && ns->ext) {
-		length += meta_len;
-		meta_len = 0;
-	}
-
-	write = io.opcode & 1;
 
 	switch (io.opcode) {
 	case nvme_cmd_write:
 	case nvme_cmd_read:
 	case nvme_cmd_compare:
-		iod = nvme_map_user_pages(dev, write, io.addr, length);
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	if (IS_ERR(iod))
-		return PTR_ERR(iod);
+	length = (io.nblocks + 1) << ns->lba_shift;
+	meta_len = (io.nblocks + 1) * ns->ms;
+	write = io.opcode & 1;
 
-	prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
-	if (length != prp_len) {
-		status = -ENOMEM;
-		goto unmap;
-	}
 	if (meta_len) {
+		if (((io.metadata & 3) || !io.metadata) && !ns->ext)
+			return -EINVAL;
+
+		if (ns->ext) {
+			length += meta_len;
+			meta_len = 0;
+		}
+
 		meta = dma_alloc_coherent(dev->dev, meta_len,
 						&meta_dma, GFP_KERNEL);
 		if (!meta) {
@@ -1786,13 +1781,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	c.rw.reftag = cpu_to_le32(io.reftag);
 	c.rw.apptag = cpu_to_le16(io.apptag);
 	c.rw.appmask = cpu_to_le16(io.appmask);
-	c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-	c.rw.prp2 = cpu_to_le64(iod->first_dma);
 	c.rw.metadata = cpu_to_le64(meta_dma);
-	status = nvme_submit_sync_cmd(ns->queue, &c);
+
+	status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
+			(void __user *)io.addr, length, NULL, 0);
  unmap:
-	nvme_unmap_user_pages(dev, write, iod);
-	nvme_free_iod(dev, iod);
 	if (meta) {
 		if (status == NVME_SC_SUCCESS && !write) {
 			if (copy_to_user((void __user *)io.metadata, meta,
@@ -1809,9 +1802,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 {
 	struct nvme_passthru_cmd cmd;
 	struct nvme_command c;
-	int status, length;
-	struct nvme_iod *uninitialized_var(iod);
-	unsigned timeout;
+	unsigned timeout = 0;
+	int status;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -1831,38 +1823,17 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
 	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
 
-	length = cmd.data_len;
-	if (cmd.data_len) {
-		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
-								length);
-		if (IS_ERR(iod))
-			return PTR_ERR(iod);
-		length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
-		c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-		c.common.prp2 = cpu_to_le64(iod->first_dma);
-	}
-
-	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
-								ADMIN_TIMEOUT;
-
-	if (length != cmd.data_len) {
-		status = -ENOMEM;
-		goto out;
-	}
+	if (cmd.timeout_ms)
+		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
 	status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
-					&cmd.result, timeout);
-
-out:
-	if (cmd.data_len) {
-		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
-		nvme_free_iod(dev, iod);
+			NULL, (void __user *)cmd.addr, cmd.data_len,
+			&cmd.result, timeout);
+	if (status >= 0) {
+		if (put_user(cmd.result, &ucmd->result))
+			return -EFAULT;
 	}
 
-	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
-							sizeof(cmd.result)))
-		status = -EFAULT;
-
 	return status;
 }
 
@@ -1954,22 +1925,14 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_id_ns *id;
-	dma_addr_t dma_addr;
 	u8 lbaf, pi_type;
 	u16 old_ms;
 	unsigned short bs;
 
-	id = dma_alloc_coherent(dev->dev, 4096, &dma_addr, GFP_KERNEL);
-	if (!id) {
-		dev_warn(dev->dev, "%s: Memory alocation failure\n", __func__);
+	if (nvme_identify_ns(dev, ns->ns_id, &id)) {
+		dev_warn(dev->dev, "%s: Identify failure\n", __func__);
 		return 0;
 	}
-	if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
-		dev_warn(dev->dev,
-			"identify failed ns:%d, setting capacity to 0\n",
-			ns->ns_id);
-		memset(id, 0, sizeof(*id));
-	}
 
 	old_ms = ns->ms;
 	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
@@ -2010,7 +1973,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	if (dev->oncs & NVME_CTRL_ONCS_DSM)
 		nvme_config_discard(ns);
 
-	dma_free_coherent(dev->dev, 4096, id, dma_addr);
+	kfree(id);
 	return 0;
 }
 
@@ -2250,22 +2213,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	int res;
 	unsigned nn, i;
 	struct nvme_id_ctrl *ctrl;
-	void *mem;
-	dma_addr_t dma_addr;
 	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
 
-	mem = dma_alloc_coherent(dev->dev, 4096, &dma_addr, GFP_KERNEL);
-	if (!mem)
-		return -ENOMEM;
-
-	res = nvme_identify(dev, 0, 1, dma_addr);
+	res = nvme_identify_ctrl(dev, &ctrl);
 	if (res) {
 		dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
-		dma_free_coherent(dev->dev, 4096, mem, dma_addr);
 		return -EIO;
 	}
 
-	ctrl = mem;
 	nn = le32_to_cpup(&ctrl->nn);
 	dev->oncs = le16_to_cpup(&ctrl->oncs);
 	dev->abort_limit = ctrl->acl + 1;
@@ -2287,7 +2242,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		} else
 			dev->max_hw_sectors = max_hw_sectors;
 	}
-	dma_free_coherent(dev->dev, 4096, mem, dma_addr);
+	kfree(ctrl);
 
 	dev->tagset.ops = &nvme_mq_ops;
 	dev->tagset.nr_hw_queues = dev->online_queues - 1;
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 342f5b7f840d..8e6223e5b670 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -525,8 +525,6 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
 					int alloc_len)
 {
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ns *id_ns;
 	int res;
 	int nvme_sc;
@@ -536,21 +534,17 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
 	u8 cmdque = 0x01 << 1;
 	u8 fw_offset = sizeof(dev->firmware_rev);
 
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
-				&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
-
 	/* nvme ns identify - use DPS value for PROTECT field */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_free;
+		return res;
 
-	id_ns = mem;
-	(id_ns->dps) ? (protect = 0x01) : (protect = 0);
+	if (id_ns->dps)
+		protect = 0x01;
+	else
+		protect = 0;
+	kfree(id_ns);
 
 	memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
 	inq_response[2] = VERSION_SPC_4;
@@ -567,12 +561,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
 	strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4);
 
 	xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
-	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
-
- out_free:
-	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
- out_dma:
-	return res;
+	return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 }
 
 static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns,
@@ -615,40 +604,35 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					u8 *inq_response, int alloc_len)
 {
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	int res;
 	int nvme_sc;
 	int xfer_len;
 	__be32 tmp_id = cpu_to_be32(ns->ns_id);
 
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
-					&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
-
 	memset(inq_response, 0, alloc_len);
 	inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE;    /* Page Code */
 	if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) {
-		struct nvme_id_ns *id_ns = mem;
-		void *eui = id_ns->eui64;
-		int len = sizeof(id_ns->eui64);
+		struct nvme_id_ns *id_ns;
+		void *eui;
+		int len;
 
-		nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+		nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
-			goto out_free;
+			return res;
 
+		eui = id_ns->eui64;
+		len = sizeof(id_ns->eui64);
 		if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) {
 			if (bitmap_empty(eui, len * 8)) {
 				eui = id_ns->nguid;
 				len = sizeof(id_ns->nguid);
 			}
 		}
-		if (bitmap_empty(eui, len * 8))
+		if (bitmap_empty(eui, len * 8)) {
+			kfree(id_ns);
 			goto scsi_string;
+		}
 
 		inq_response[3] = 4 + len; /* Page Length */
 		/* Designation Descriptor start */
@@ -657,14 +641,14 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		inq_response[6] = 0x00;    /* Rsvd */
 		inq_response[7] = len;     /* Designator Length */
 		memcpy(&inq_response[8], eui, len);
+		kfree(id_ns);
 	} else {
  scsi_string:
 		if (alloc_len < 72) {
-			res = nvme_trans_completion(hdr,
+			return nvme_trans_completion(hdr,
 					SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-			goto out_free;
 		}
 		inq_response[3] = 0x48;    /* Page Length */
 		/* Designation Descriptor start */
@@ -679,12 +663,7 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		memcpy(&inq_response[56], dev->serial, sizeof(dev->serial));
 	}
 	xfer_len = alloc_len;
-	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
-
- out_free:
-	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
- out_dma:
-	return res;
+	return nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 }
 
 static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
@@ -694,8 +673,6 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ctrl *id_ctrl;
 	struct nvme_id_ns *id_ns;
 	int xfer_len;
@@ -708,39 +685,32 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	u8 luiclr = 0x01;
 
 	inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
-	if (inq_response == NULL) {
-		res = -ENOMEM;
-		goto out_mem;
-	}
-
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
-							&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
+	if (inq_response == NULL)
+		return -ENOMEM;
 
-	/* nvme ns identify */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_free;
+		goto out_free_inq;
+
+	spt = spt_lut[id_ns->dpc & 0x07] << 3;
+	if (id_ns->dps)
+		protect = 0x01;
+	else
+		protect = 0;
+	kfree(id_ns);
 
-	id_ns = mem;
-	spt = spt_lut[(id_ns->dpc) & 0x07] << 3;
-	(id_ns->dps) ? (protect = 0x01) : (protect = 0);
 	grd_chk = protect << 2;
 	app_chk = protect << 1;
 	ref_chk = protect;
 
-	/* nvme controller identify */
-	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_free;
+		goto out_free_inq;
 
-	id_ctrl = mem;
 	v_sup = id_ctrl->vwc;
+	kfree(id_ctrl);
 
 	memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
 	inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE;    /* Page Code */
@@ -756,11 +726,8 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
 	res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
 
- out_free:
-	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
- out_dma:
+ out_free_inq:
 	kfree(inq_response);
- out_mem:
 	return res;
 }
 
@@ -847,43 +814,27 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
 	int res;
 	int xfer_len;
 	u8 *log_response;
-	struct nvme_command c;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_smart_log *smart_log;
-	dma_addr_t dma_addr;
-	void *mem;
 	u8 temp_c;
 	u16 temp_k;
 
 	log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL);
-	if (log_response == NULL) {
-		res = -ENOMEM;
-		goto out_mem;
-	}
+	if (log_response == NULL)
+		return -ENOMEM;
 
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_smart_log),
-					&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
+	res = nvme_get_log_page(dev, &smart_log);
+	if (res < 0)
+		goto out_free_response;
 
-	/* Get SMART Log Page */
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = nvme_admin_get_log_page;
-	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
-	c.common.prp1 = cpu_to_le64(dma_addr);
-	c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
-			BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
-	res = nvme_submit_sync_cmd(dev->admin_q, &c);
 	if (res != NVME_SC_SUCCESS) {
 		temp_c = LOG_TEMP_UNKNOWN;
 	} else {
-		smart_log = mem;
 		temp_k = (smart_log->temperature[1] << 8) +
 				(smart_log->temperature[0]);
 		temp_c = temp_k - KELVIN_TEMP_FACTOR;
 	}
+	kfree(smart_log);
 
 	log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE;
 	/* Subpage=0x00, Page Length MSB=0 */
@@ -899,11 +850,8 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
 	xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH);
 	res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
 
-	dma_free_coherent(dev->dev, sizeof(struct nvme_smart_log),
-			  mem, dma_addr);
- out_dma:
+ out_free_response:
 	kfree(log_response);
- out_mem:
 	return res;
 }
 
@@ -913,44 +861,28 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int res;
 	int xfer_len;
 	u8 *log_response;
-	struct nvme_command c;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_smart_log *smart_log;
-	dma_addr_t dma_addr;
-	void *mem;
 	u32 feature_resp;
 	u8 temp_c_cur, temp_c_thresh;
 	u16 temp_k;
 
 	log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL);
-	if (log_response == NULL) {
-		res = -ENOMEM;
-		goto out_mem;
-	}
+	if (log_response == NULL)
+		return -ENOMEM;
 
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_smart_log),
-					&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out_dma;
-	}
+	res = nvme_get_log_page(dev, &smart_log);
+	if (res < 0)
+		goto out_free_response;
 
-	/* Get SMART Log Page */
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = nvme_admin_get_log_page;
-	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
-	c.common.prp1 = cpu_to_le64(dma_addr);
-	c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
-			BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
-	res = nvme_submit_sync_cmd(dev->admin_q, &c);
 	if (res != NVME_SC_SUCCESS) {
 		temp_c_cur = LOG_TEMP_UNKNOWN;
 	} else {
-		smart_log = mem;
 		temp_k = (smart_log->temperature[1] << 8) +
 				(smart_log->temperature[0]);
 		temp_c_cur = temp_k - KELVIN_TEMP_FACTOR;
 	}
+	kfree(smart_log);
 
 	/* Get Features for Temp Threshold */
 	res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0,
@@ -979,11 +911,8 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH);
 	res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
 
-	dma_free_coherent(dev->dev, sizeof(struct nvme_smart_log),
-			  mem, dma_addr);
- out_dma:
+ out_free_response:
 	kfree(log_response);
- out_mem:
 	return res;
 }
 
@@ -1019,8 +948,6 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ns *id_ns;
 	u8 flbas;
 	u32 lba_length;
@@ -1030,20 +957,11 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN)
 		return -EINVAL;
 
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
-							&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out;
-	}
-
-	/* nvme ns identify */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_dma;
+		return res;
 
-	id_ns = mem;
 	flbas = (id_ns->flbas) & 0x0F;
 	lba_length = (1 << (id_ns->lbaf[flbas].ds));
 
@@ -1063,9 +981,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		memcpy(&resp[12], &tmp_len, sizeof(u32));
 	}
 
- out_dma:
-	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
- out:
+	kfree(id_ns);
 	return res;
 }
 
@@ -1291,26 +1207,17 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ctrl *id_ctrl;
 	int lowest_pow_st;	/* max npss = lowest power consumption */
 	unsigned ps_desired = 0;
 
-	/* NVMe Controller Identify */
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ctrl),
-				&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out;
-	}
-	nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+	nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_dma;
+		return res;
 
-	id_ctrl = mem;
 	lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1));
+	kfree(id_ctrl);
 
 	switch (pc) {
 	case NVME_POWER_STATE_START_VALID:
@@ -1350,12 +1257,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	}
 	nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0,
 				    NULL);
-	res = nvme_trans_status_code(hdr, nvme_sc);
-
- out_dma:
-	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ctrl), mem, dma_addr);
- out:
-	return res;
+	return nvme_trans_status_code(hdr, nvme_sc);
 }
 
 static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
@@ -1368,7 +1270,7 @@ static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr
 	c.common.opcode = nvme_admin_activate_fw;
 	c.common.cdw10[0] = cpu_to_le32(buffer_id | NVME_FWACT_REPL_ACTV);
 
-	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
+	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0);
 	return nvme_trans_status_code(hdr, nvme_sc);
 }
 
@@ -1376,15 +1278,9 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr
 					u8 opcode, u32 tot_len, u32 offset,
 					u8 buffer_id)
 {
-	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_command c;
-	struct nvme_iod *iod = NULL;
-	unsigned length;
-
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = nvme_admin_download_fw;
 
 	if (hdr->iovec_count > 0) {
 		/* Assuming SGL is not allowed for this command */
@@ -1394,28 +1290,15 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr
 					SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 	}
-	iod = nvme_map_user_pages(dev, DMA_TO_DEVICE,
-			(unsigned long)hdr->dxferp, tot_len);
-	if (IS_ERR(iod))
-		return PTR_ERR(iod);
-	length = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL);
-	if (length != tot_len) {
-		res = -ENOMEM;
-		goto out_unmap;
-	}
 
-	c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-	c.dlfw.prp2 = cpu_to_le64(iod->first_dma);
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = nvme_admin_download_fw;
 	c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1);
 	c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS);
 
-	nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c);
-	res = nvme_trans_status_code(hdr, nvme_sc);
-
- out_unmap:
-	nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod);
-	nvme_free_iod(dev, iod);
-	return res;
+	nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL,
+			hdr->dxferp, tot_len, NULL, 0);
+	return nvme_trans_status_code(hdr, nvme_sc);
 }
 
 /* Mode Select Helper Functions */
@@ -1590,9 +1473,6 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 	int res = 0;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
-	struct nvme_id_ns *id_ns;
 	u8 flbas;
 
 	/*
@@ -1603,19 +1483,12 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 	 */
 
 	if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) {
-		mem = dma_alloc_coherent(dev->dev,
-			sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL);
-		if (mem == NULL) {
-			res = -ENOMEM;
-			goto out;
-		}
-		/* nvme ns identify */
-		nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+		struct nvme_id_ns *id_ns;
+
+		nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
-			goto out_dma;
-
-		id_ns = mem;
+			return res;
 
 		if (ns->mode_select_num_blocks == 0)
 			ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap);
@@ -1624,12 +1497,11 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
 			ns->mode_select_block_len =
 						(1 << (id_ns->lbaf[flbas].ds));
 		}
- out_dma:
-		dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns),
-				  mem, dma_addr);
+
+		kfree(id_ns);
 	}
- out:
-	return res;
+
+	return 0;
 }
 
 static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len,
@@ -1698,8 +1570,6 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int res;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ns *id_ns;
 	u8 i;
 	u8 flbas, nlbaf;
@@ -1708,19 +1578,11 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	struct nvme_command c;
 
 	/* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
-							&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out;
-	}
-	/* nvme ns identify */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_dma;
+		return res;
 
-	id_ns = mem;
 	flbas = (id_ns->flbas) & 0x0F;
 	nlbaf = id_ns->nlbaf;
 
@@ -1748,12 +1610,10 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.format.nsid = cpu_to_le32(ns->ns_id);
 	c.format.cdw10 = cpu_to_le32(cdw10);
 
-	nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c);
+	nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 
- out_dma:
-	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
- out:
+	kfree(id_ns);
 	return res;
 }
 
@@ -1787,9 +1647,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 				struct nvme_trans_io_cdb *cdb_info, u8 is_write)
 {
 	int nvme_sc = NVME_SC_SUCCESS;
-	struct nvme_dev *dev = ns->dev;
 	u32 num_cmds;
-	struct nvme_iod *iod;
 	u64 unit_len;
 	u64 unit_num_blocks;	/* Number of blocks to xfer in each nvme cmd */
 	u32 retcode;
@@ -1840,35 +1698,17 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		control = nvme_trans_io_get_control(ns, cdb_info);
 		c.rw.control = cpu_to_le16(control);
 
-		iod = nvme_map_user_pages(dev,
-			(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
-			(unsigned long)next_mapping_addr, unit_len);
-		if (IS_ERR(iod))
-			return PTR_ERR(iod);
-
-		retcode = nvme_setup_prps(dev, iod, unit_len, GFP_KERNEL);
-		if (retcode != unit_len) {
-			nvme_unmap_user_pages(dev,
-				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
-				iod);
-			nvme_free_iod(dev, iod);
-			return -ENOMEM;
+		if (get_capacity(ns->disk) - unit_num_blocks <
+				cdb_info->lba + nvme_offset) {
+			nvme_sc = NVME_SC_LBA_RANGE;
+			break;
 		}
-		c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-		c.rw.prp2 = cpu_to_le64(iod->first_dma);
+		nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
+				next_mapping_addr, unit_len, NULL, 0);
+		if (nvme_sc)
+			break;
 
 		nvme_offset += unit_num_blocks;
-
-		nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
-
-		nvme_unmap_user_pages(dev,
-				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
-				iod);
-		nvme_free_iod(dev, iod);
-
-
-		if (nvme_sc != NVME_SC_SUCCESS)
-			break;
 	}
 
 	return nvme_trans_status_code(hdr, nvme_sc);
@@ -2199,8 +2039,6 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	u32 resp_size;
 	u32 xfer_len;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ns *id_ns;
 	u8 *response;
 
@@ -2212,24 +2050,15 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		resp_size = READ_CAP_10_RESP_SIZE;
 	}
 
-	mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ns),
-							&dma_addr, GFP_KERNEL);
-	if (mem == NULL) {
-		res = -ENOMEM;
-		goto out;
-	}
-	/* nvme ns identify */
-	nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+	nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns);
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
-		goto out_dma;
-
-	id_ns = mem;
+		return res;	
 
 	response = kzalloc(resp_size, GFP_KERNEL);
 	if (response == NULL) {
 		res = -ENOMEM;
-		goto out_dma;
+		goto out_free_id;
 	}
 	nvme_trans_fill_read_cap(response, id_ns, cdb16);
 
@@ -2237,9 +2066,8 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	res = nvme_trans_copy_to_user(hdr, response, xfer_len);
 
 	kfree(response);
- out_dma:
-	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ns), mem, dma_addr);
- out:
+ out_free_id:
+	kfree(id_ns);
 	return res;
 }
 
@@ -2251,8 +2079,6 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	u32 alloc_len, xfer_len, resp_size;
 	u8 *response;
 	struct nvme_dev *dev = ns->dev;
-	dma_addr_t dma_addr;
-	void *mem;
 	struct nvme_id_ctrl *id_ctrl;
 	u32 ll_length, lun_id;
 	u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET;
@@ -2266,19 +2092,11 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	case ALL_LUNS_RETURNED:
 	case ALL_WELL_KNOWN_LUNS_RETURNED:
 	case RESTRICTED_LUNS_RETURNED:
-		/* NVMe Controller Identify */
-		mem = dma_alloc_coherent(dev->dev, sizeof(struct nvme_id_ctrl),
-					&dma_addr, GFP_KERNEL);
-		if (mem == NULL) {
-			res = -ENOMEM;
-			goto out;
-		}
-		nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+		nvme_sc = nvme_identify_ctrl(dev, &id_ctrl);
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
-			goto out_dma;
+			return res;
 
-		id_ctrl = mem;
 		ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE;
 		resp_size = ll_length + LUN_DATA_HEADER_SIZE;
 
@@ -2288,13 +2106,13 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					SAM_STAT_CHECK_CONDITION,
 					ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
 					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
-			goto out_dma;
+			goto out_free_id;
 		}
 
 		response = kzalloc(resp_size, GFP_KERNEL);
 		if (response == NULL) {
 			res = -ENOMEM;
-			goto out_dma;
+			goto out_free_id;
 		}
 
 		/* The first LUN ID will always be 0 per the SAM spec */
@@ -2315,9 +2133,8 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	res = nvme_trans_copy_to_user(hdr, response, xfer_len);
 
 	kfree(response);
- out_dma:
-	dma_free_coherent(dev->dev, sizeof(struct nvme_id_ctrl), mem, dma_addr);
- out:
+ out_free_id:
+	kfree(id_ctrl);
 	return res;
 }
 
@@ -2379,12 +2196,23 @@ static int nvme_trans_security_protocol(struct nvme_ns *ns,
 				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
 }
 
-static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
-							u8 *cmd)
+static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
+					struct sg_io_hdr *hdr)
 {
-	int res;
 	int nvme_sc;
 	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = nvme_cmd_flush;
+	c.common.nsid = cpu_to_le32(ns->ns_id);
+
+	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0);
+	return nvme_trans_status_code(hdr, nvme_sc);
+}
+
+static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+							u8 *cmd)
+{
 	u8 immed, pcmod, pc, no_flush, start;
 
 	immed = cmd[1] & 0x01;
@@ -2400,12 +2228,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	} else {
 		if (no_flush == 0) {
 			/* Issue NVME FLUSH command prior to START STOP UNIT */
-			memset(&c, 0, sizeof(c));
-			c.common.opcode = nvme_cmd_flush;
-			c.common.nsid = cpu_to_le32(ns->ns_id);
-
-			nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
-			res = nvme_trans_status_code(hdr, nvme_sc);
+			int res = nvme_trans_synchronize_cache(ns, hdr);
 			if (res)
 				return res;
 		}
@@ -2414,20 +2237,6 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	}
 }
 
-static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
-					struct sg_io_hdr *hdr, u8 *cmd)
-{
-	int nvme_sc;
-	struct nvme_command c;
-
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = nvme_cmd_flush;
-	c.common.nsid = cpu_to_le32(ns->ns_id);
-
-	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
-	return nvme_trans_status_code(hdr, nvme_sc);
-}
-
 static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
@@ -2563,13 +2372,11 @@ struct scsi_unmap_parm_list {
 static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 							u8 *cmd)
 {
-	struct nvme_dev *dev = ns->dev;
 	struct scsi_unmap_parm_list *plist;
 	struct nvme_dsm_range *range;
 	struct nvme_command c;
 	int i, nvme_sc, res = -ENOMEM;
 	u16 ndesc, list_len;
-	dma_addr_t dma_addr;
 
 	list_len = get_unaligned_be16(&cmd[7]);
 	if (!list_len)
@@ -2589,8 +2396,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		goto out;
 	}
 
-	range = dma_alloc_coherent(dev->dev, ndesc * sizeof(*range),
-							&dma_addr, GFP_KERNEL);
+	range = kcalloc(ndesc, sizeof(*range), GFP_KERNEL);
 	if (!range)
 		goto out;
 
@@ -2603,14 +2409,14 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	memset(&c, 0, sizeof(c));
 	c.dsm.opcode = nvme_cmd_dsm;
 	c.dsm.nsid = cpu_to_le32(ns->ns_id);
-	c.dsm.prp1 = cpu_to_le64(dma_addr);
 	c.dsm.nr = cpu_to_le32(ndesc - 1);
 	c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c);
+	nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, range,
+			ndesc * sizeof(*range));
 	res = nvme_trans_status_code(hdr, nvme_sc);
 
-	dma_free_coherent(dev->dev, ndesc * sizeof(*range), range, dma_addr);
+	kfree(range);
  out:
 	kfree(plist);
 	return res;
@@ -2690,7 +2496,7 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr)
 		retcode = nvme_trans_start_stop(ns, hdr, cmd);
 		break;
 	case SYNCHRONIZE_CACHE:
-		retcode = nvme_trans_synchronize_cache(ns, hdr, cmd);
+		retcode = nvme_trans_synchronize_cache(ns, hdr);
 		break;
 	case FORMAT_UNIT:
 		retcode = nvme_trans_format_unit(ns, hdr, cmd);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index de0e49a716b8..986bf8ad8e93 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -146,21 +146,15 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
 	return (sector >> (ns->lba_shift - 9));
 }
 
-/**
- * nvme_free_iod - frees an nvme_iod
- * @dev: The device that the I/O was submitted to
- * @iod: The memory to free
- */
-void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod);
-
-int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t);
-struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
-				unsigned long addr, unsigned length);
-void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
-			struct nvme_iod *iod);
-int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd);
-int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns,
-							dma_addr_t dma_addr);
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buf, unsigned bufflen);
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, void __user *ubuffer, unsigned bufflen,
+		u32 *result, unsigned timeout);
+int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id);
+int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
+		struct nvme_id_ns **id);
+int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log);
 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
 			dma_addr_t dma_addr, u32 *result);
 int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
-- 
cgit v1.2.3


From 326e1dbb57368087a36607aaebe9795b8d5453e5 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 22 May 2015 09:14:03 -0400
Subject: block: remove management of bi_remaining when restoring original
 bi_end_io

Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for
non-chains") regressed all existing callers that followed this pattern:
 1) saving a bio's original bi_end_io
 2) wiring up an intermediate bi_end_io
 3) restoring the original bi_end_io from intermediate bi_end_io
 4) calling bio_endio() to execute the restored original bi_end_io

The regression was due to BIO_CHAIN only ever getting set if
bio_inc_remaining() is called.  For the above pattern it isn't set until
step 3 above (step 2 would've needed to establish BIO_CHAIN).  As such
the first bio_endio(), in step 2 above, never decremented __bi_remaining
before calling the intermediate bi_end_io -- leaving __bi_remaining with
the value 1 instead of 0.  When bio_inc_remaining() occurred during step
3 it brought it to a value of 2.  When the second bio_endio() was
called, in step 4 above, it should've called the original bi_end_io but
it didn't because there was an extra reference that wasn't dropped (due
to atomic operations being optimized away since BIO_CHAIN wasn't set
upfront).

Fix this issue by removing the __bi_remaining management complexity for
all callers that use the above pattern -- bio_chain() is the only
interface that _needs_ to be concerned with __bi_remaining.  For the
above pattern callers just expect the bi_end_io they set to get called!
Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls
that aren't associated with the bio_chain() interface.

Also, the bio_inc_remaining() interface has been moved local to bio.c.

Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c        |  4 ++--
 block/bio.c                  | 35 ++++++++++++++---------------------
 drivers/md/bcache/io.c       |  2 +-
 drivers/md/dm-cache-target.c |  6 ------
 drivers/md/dm-raid1.c        |  2 --
 drivers/md/dm-snap.c         |  1 -
 drivers/md/dm-thin.c         |  9 +++------
 drivers/md/dm-verity.c       |  2 +-
 fs/btrfs/disk-io.c           |  2 +-
 fs/btrfs/volumes.c           | 16 +++++-----------
 fs/btrfs/volumes.h           |  2 --
 include/linux/bio.h          | 12 ------------
 12 files changed, 27 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5cbd5d9ea61d..0436c21db7f2 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 /**
@@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error)
 	 */
 	if (error) {
 		bio->bi_end_io = bip->bip_end_io;
-		bio_endio_nodec(bio, error);
+		bio_endio(bio, error);
 
 		return;
 	}
diff --git a/block/bio.c b/block/bio.c
index c2ff8a88aef1..259197d97de1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error)
 	bio_put(bio);
 }
 
+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
 /**
  * bio_chain - chain bio completions
  * @bio: the target bio
@@ -1756,8 +1767,10 @@ static inline bool bio_remaining_done(struct bio *bio)
 
 	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
 
-	if (atomic_dec_and_test(&bio->__bi_remaining))
+	if (atomic_dec_and_test(&bio->__bi_remaining)) {
+		clear_bit(BIO_CHAIN, &bio->bi_flags);
 		return true;
+	}
 
 	return false;
 }
@@ -1808,26 +1821,6 @@ void bio_endio(struct bio *bio, int error)
 }
 EXPORT_SYMBOL(bio_endio);
 
-/**
- * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
- * @bio:	bio
- * @error:	error, if any
- *
- * For code that has saved and restored bi_end_io; thing hard before using this
- * function, probably you should've cloned the entire bio.
- **/
-void bio_endio_nodec(struct bio *bio, int error)
-{
-	/*
-	 * If it's not flagged as a chain, we are not going to dec the count
-	 */
-	if (bio_flagged(bio, BIO_CHAIN))
-		bio_inc_remaining(bio);
-
-	bio_endio(bio, error);
-}
-EXPORT_SYMBOL(bio_endio_nodec);
-
 /**
  * bio_split - split a bio
  * @bio:	bio to split
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fa028fa82df4..cb64e64a4789 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -55,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl)
 
 	s->bio->bi_end_io = s->bi_end_io;
 	s->bio->bi_private = s->bi_private;
-	bio_endio_nodec(s->bio, 0);
+	bio_endio(s->bio, 0);
 
 	closure_debug_destroy(&s->cl);
 	mempool_free(s, s->p->bio_split_hook);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 705eb7b99d69..41b2594a80c6 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -86,12 +86,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 {
 	bio->bi_end_io = h->bi_end_io;
 	bio->bi_private = h->bi_private;
-
-	/*
-	 * Must bump bi_remaining to allow bio to complete with
-	 * restored bi_end_io.
-	 */
-	bio_inc_remaining(bio);
 }
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d6a1c096b777..743fa9bbae9e 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1254,8 +1254,6 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 			dm_bio_restore(bd, bio);
 			bio_record->details.bi_bdev = NULL;
 
-			bio_inc_remaining(bio);
-
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
 		}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 8bfeae218531..7c82d3ccce87 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1478,7 +1478,6 @@ out:
 	if (full_bio) {
 		full_bio->bi_end_io = pe->full_bio_end_io;
 		full_bio->bi_private = pe->full_bio_private;
-		bio_inc_remaining(full_bio);
 	}
 	increment_pending_exceptions_done_count();
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 342dbdad6131..e852602c0091 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -793,10 +793,9 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-	if (m->bio) {
+	if (m->bio)
 		m->bio->bi_end_io = m->saved_bi_end_io;
-		bio_inc_remaining(m->bio);
-	}
+
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
@@ -810,10 +809,8 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	int r;
 
 	bio = m->bio;
-	if (bio) {
+	if (bio)
 		bio->bi_end_io = m->saved_bi_end_io;
-		bio_inc_remaining(bio);
-	}
 
 	if (m->err) {
 		cell_error(pool, m->cell);
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 66616db33e6f..bb9c6a00e4b0 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -459,7 +459,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
 	bio->bi_end_io = io->orig_bi_end_io;
 	bio->bi_private = io->orig_bi_private;
 
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 static void verity_work(struct work_struct *w)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e08a926fe12c..0bccf18dc1dc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1745,7 +1745,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 static int cleaner_kthread(void *arg)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8e8d1d1e28a5..dac77d42a9ab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5585,10 +5585,10 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
 {
-	if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
-		bio_endio_nodec(bio, err);
-	else
-		bio_endio(bio, err);
+	bio->bi_private = bbio->private;
+	bio->bi_end_io = bbio->end_io;
+	bio_endio(bio, err);
+
 	btrfs_put_bbio(bbio);
 }
 
@@ -5632,8 +5632,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
 			bio = bbio->orig_bio;
 		}
 
-		bio->bi_private = bbio->private;
-		bio->bi_end_io = bbio->end_io;
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the btrfs bio
@@ -5815,8 +5813,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 		/* Shoud be the original bio. */
 		WARN_ON(bio != bbio->orig_bio);
 
-		bio->bi_private = bbio->private;
-		bio->bi_end_io = bbio->end_io;
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		bio->bi_iter.bi_sector = logical >> 9;
 
@@ -5897,10 +5893,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		if (dev_nr < total_devs - 1) {
 			bio = btrfs_bio_clone(first_bio, GFP_NOFS);
 			BUG_ON(!bio); /* -ENOMEM */
-		} else {
+		} else
 			bio = first_bio;
-			bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
-		}
 
 		submit_stripe_bio(root, bbio, bio,
 				  bbio->stripes[dev_nr].physical, dev_nr, rw,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ebc31331a837..cedae0356558 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -292,8 +292,6 @@ struct btrfs_bio_stripe {
 struct btrfs_bio;
 typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED	(1 << 0)
-
 struct btrfs_bio {
 	atomic_t refs;
 	atomic_t stripes_pending;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7486ea103f6e..f0291cf64cc5 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -427,7 +427,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
 }
 
 extern void bio_endio(struct bio *, int);
-extern void bio_endio_nodec(struct bio *, int);
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
 
@@ -658,17 +657,6 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 	return bio;
 }
 
-/*
- * Increment chain count for the bio. Make sure the CHAIN flag update
- * is visible before the raised count.
- */
-static inline void bio_inc_remaining(struct bio *bio)
-{
-	bio->bi_flags |= (1 << BIO_CHAIN);
-	smp_mb__before_atomic();
-	atomic_inc(&bio->__bi_remaining);
-}
-
 /*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
-- 
cgit v1.2.3


From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 May 2015 09:14:04 -0400
Subject: block, dm: don't copy bios for request clones

Currently dm-multipath has to clone the bios for every request sent
to the lower devices, which wastes cpu cycles and ties down memory.

This patch instead adds a new REQ_CLONE flag that instructs req_bio_endio
to not complete bios attached to a request, which we set on clone
requests similar to bios in a flush sequence.  With this change I/O
errors on a path failure only get propagated to dm-multipath, which
can then either resubmit the I/O or complete the bios on the original
request.

I've done some basic testing of this on a Linux target with ALUA support,
and it survives path failures during I/O nicely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c          |  94 +++----------------------
 drivers/md/dm-table.c     |  25 ++++---
 drivers/md/dm.c           | 171 +++++++++++-----------------------------------
 drivers/md/dm.h           |   5 +-
 include/linux/blk_types.h |   2 +
 include/linux/blkdev.h    |   6 +-
 6 files changed, 73 insertions(+), 230 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index de474b5dee2b..aa819a58ea24 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	if (error)
+	if (error && !(rq->cmd_flags & REQ_CLONE))
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
@@ -128,7 +128,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+	if (bio->bi_iter.bi_size == 0 &&
+	    !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
 		bio_endio(bio, error);
 }
 
@@ -2909,95 +2910,22 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
 
-/**
- * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
- * @rq: the clone request to be cleaned up
- *
- * Description:
- *     Free all bios in @rq for a cloned request.
- */
-void blk_rq_unprep_clone(struct request *rq)
-{
-	struct bio *bio;
-
-	while ((bio = rq->bio) != NULL) {
-		rq->bio = bio->bi_next;
-
-		bio_put(bio);
-	}
-}
-EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
+void blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
+	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
+	dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
 	dst->ioprio = src->ioprio;
 	dst->extra_len = src->extra_len;
-}
-
-/**
- * blk_rq_prep_clone - Helper function to setup clone request
- * @rq: the request to be setup
- * @rq_src: original request to be cloned
- * @bs: bio_set that bios for clone are allocated from
- * @gfp_mask: memory allocation mask for bio
- * @bio_ctr: setup function to be called for each clone bio.
- *           Returns %0 for success, non %0 for failure.
- * @data: private data to be passed to @bio_ctr
- *
- * Description:
- *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- *     are not copied, and copying such parts is the caller's responsibility.
- *     Also, pages which the original bios are pointing to are not copied
- *     and the cloned bios just point same pages.
- *     So cloned bios must be completed before original bios, which means
- *     the caller must complete @rq before @rq_src.
- */
-int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-		      struct bio_set *bs, gfp_t gfp_mask,
-		      int (*bio_ctr)(struct bio *, struct bio *, void *),
-		      void *data)
-{
-	struct bio *bio, *bio_src;
-
-	if (!bs)
-		bs = fs_bio_set;
-
-	__rq_for_each_bio(bio_src, rq_src) {
-		bio = bio_clone_fast(bio_src, gfp_mask, bs);
-		if (!bio)
-			goto free_and_out;
-
-		if (bio_ctr && bio_ctr(bio, bio_src, data))
-			goto free_and_out;
-
-		if (rq->bio) {
-			rq->biotail->bi_next = bio;
-			rq->biotail = bio;
-		} else
-			rq->bio = rq->biotail = bio;
-	}
-
-	__blk_rq_prep_clone(rq, rq_src);
-
-	return 0;
-
-free_and_out:
-	if (bio)
-		bio_put(bio);
-	blk_rq_unprep_clone(rq);
-
-	return -ENOMEM;
+	dst->bio = src->bio;
+	dst->biotail = src->biotail;
+	dst->cmd = src->cmd;
+	dst->cmd_len = src->cmd_len;
+	dst->sense = src->sense;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d9b00b8565c6..3662b2e49b8d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -940,21 +940,28 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
-	struct dm_target *tgt;
 	unsigned i;
 
-	if (unlikely(type == DM_TYPE_NONE)) {
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
+		for (i = 0; i < t->num_targets; i++) {
+			struct dm_target *tgt = t->targets + i;
+
+			per_bio_data_size = max(per_bio_data_size,
+						tgt->per_bio_data_size);
+		}
+		t->mempools = dm_alloc_bio_mempools(t->integrity_supported,
+						    per_bio_data_size);
+		break;
+	case DM_TYPE_REQUEST_BASED:
+	case DM_TYPE_MQ_REQUEST_BASED:
+		t->mempools = dm_alloc_rq_mempools(md, type);
+		break;
+	default:
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
 
-	if (type == DM_TYPE_BIO_BASED)
-		for (i = 0; i < t->num_targets; i++) {
-			tgt = t->targets + i;
-			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
-		}
-
-	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a930b72314ac..38837f8ea327 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -990,57 +990,6 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
 
-/*
- * Partial completion handling for request-based dm
- */
-static void end_clone_bio(struct bio *clone, int error)
-{
-	struct dm_rq_clone_bio_info *info =
-		container_of(clone, struct dm_rq_clone_bio_info, clone);
-	struct dm_rq_target_io *tio = info->tio;
-	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-
-	bio_put(clone);
-
-	if (tio->error)
-		/*
-		 * An error has already been detected on the request.
-		 * Once error occurred, just let clone->end_io() handle
-		 * the remainder.
-		 */
-		return;
-	else if (error) {
-		/*
-		 * Don't notice the error to the upper layer yet.
-		 * The error handling decision is made by the target driver,
-		 * when the request is completed.
-		 */
-		tio->error = error;
-		return;
-	}
-
-	/*
-	 * I/O for the bio successfully completed.
-	 * Notice the data completion to the upper layer.
-	 */
-
-	/*
-	 * bios are processed from the head of the list.
-	 * So the completing bio should always be rq->bio.
-	 * If it's not, something wrong is happening.
-	 */
-	if (tio->orig->bio != bio)
-		DMERR("bio completion is going in the middle of the request");
-
-	/*
-	 * Update the original request.
-	 * Do not use blk_end_request() here, because it may complete
-	 * the original request before the clone, and break the ordering.
-	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
-}
-
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
@@ -1089,8 +1038,6 @@ static void free_rq_clone(struct request *clone, bool must_be_mapped)
 
 	WARN_ON_ONCE(must_be_mapped && !clone->q);
 
-	blk_rq_unprep_clone(clone);
-
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
@@ -1821,39 +1768,13 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 		dm_complete_request(rq, r);
 }
 
-static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
-				 void *data)
+static void setup_clone(struct request *clone, struct request *rq,
+		        struct dm_rq_target_io *tio)
 {
-	struct dm_rq_target_io *tio = data;
-	struct dm_rq_clone_bio_info *info =
-		container_of(bio, struct dm_rq_clone_bio_info, clone);
-
-	info->orig = bio_orig;
-	info->tio = tio;
-	bio->bi_end_io = end_clone_bio;
-
-	return 0;
-}
-
-static int setup_clone(struct request *clone, struct request *rq,
-		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	int r;
-
-	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
-			      dm_rq_bio_constructor, tio);
-	if (r)
-		return r;
-
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
+	blk_rq_prep_clone(clone, rq);
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
-
 	tio->clone = clone;
-
-	return 0;
 }
 
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
@@ -1874,12 +1795,7 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		if (alloc_clone)
-			free_clone_request(md, clone);
-		return NULL;
-	}
+	setup_clone(clone, rq, tio);
 
 	return clone;
 }
@@ -1973,11 +1889,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (IS_ERR(clone))
 			return DM_MAPIO_REQUEUE;
-		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
+		setup_clone(clone, rq, tio);
 	}
 
 	switch (r) {
@@ -2431,8 +2343,6 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
-	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
-
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
@@ -3536,48 +3446,23 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	struct kmem_cache *cachep = NULL;
-	unsigned int pool_size = 0;
+	struct dm_md_mempools *pools;
+	unsigned int pool_size = dm_get_reserved_bio_based_ios();
 	unsigned int front_pad;
 
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
 		return NULL;
 
-	type = filter_md_type(type, md);
+	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
+		offsetof(struct dm_target_io, clone);
 
-	switch (type) {
-	case DM_TYPE_BIO_BASED:
-		cachep = _io_cache;
-		pool_size = dm_get_reserved_bio_based_ios();
-		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
-		break;
-	case DM_TYPE_REQUEST_BASED:
-		cachep = _rq_tio_cache;
-		pool_size = dm_get_reserved_rq_based_ios();
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-		/* fall through to setup remaining rq-based pools */
-	case DM_TYPE_MQ_REQUEST_BASED:
-		if (!pool_size)
-			pool_size = dm_get_reserved_rq_based_ios();
-		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
-		/* per_bio_data_size is not used. See __bind_mempools(). */
-		WARN_ON(per_bio_data_size != 0);
-		break;
-	default:
-		BUG();
-	}
-
-	if (cachep) {
-		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-		if (!pools->io_pool)
-			goto out;
-	}
+	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
+	if (!pools->io_pool)
+		goto out;
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3587,10 +3472,34 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		goto out;
 
 	return pools;
-
 out:
 	dm_free_md_mempools(pools);
+	return NULL;
+}
+
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
+					    unsigned type)
+{
+	unsigned int pool_size = dm_get_reserved_rq_based_ios();
+	struct dm_md_mempools *pools;
 
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	if (!pools)
+		return NULL;
+
+	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+	}
+
+	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
+	if (!pools->io_pool)
+		goto out;
+
+	return pools;
+out:
+	dm_free_md_mempools(pools);
 	return NULL;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6123c2bf9150..e6e66d087b26 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -222,8 +222,9 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3f4ded0b1a34..45a6be89957c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -192,6 +192,7 @@ enum rq_flag_bits {
 	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NO_TIMEOUT,	/* requests may never expire */
+	__REQ_CLONE,		/* cloned bios */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -246,5 +247,6 @@ enum rq_flag_bits {
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 #define REQ_NO_TIMEOUT		(1ULL << __REQ_NO_TIMEOUT)
+#define REQ_CLONE		(1ULL << __REQ_CLONE)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc917956a6d0..9ded80da2c16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -775,11 +775,7 @@ extern void blk_add_request_payload(struct request *rq, struct page *page,
 		unsigned int len);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
-extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-			     struct bio_set *bs, gfp_t gfp_mask,
-			     int (*bio_ctr)(struct bio *, struct bio *, void *),
-			     void *data);
-extern void blk_rq_unprep_clone(struct request *rq);
+extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
-- 
cgit v1.2.3


From d0751b98dfa391f862e02dc36a233a54615e3f1d Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Thu, 21 May 2015 15:05:02 +0800
Subject: PCI: Add dev->has_secondary_link to track downstream PCIe links

A PCIe Port is an interface to a Link.  A Root Port is a PCI-PCI bridge in
a Root Complex and has a Link on its secondary (downstream) side.  For
other Ports, the Link may be on either the upstream (closer to the Root
Complex) or downstream side of the Port.

The usual topology has a Root Port connected to an Upstream Port.  We
previously assumed this was the only possible topology, and that a
Downstream Port's Link was always on its downstream side, like this:

                  +---------------------+
  +------+        |          Downstream |
  | Root |        | Upstream       Port +--Link--
  | Port +--Link--+ Port                |
  +------+        |          Downstream |
                  |                Port +--Link--
                  +---------------------+

But systems do exist (see URL below) where the Root Port is connected to a
Downstream Port.  In this case, a Downstream Port's Link may be on either
the upstream or downstream side:

                  +---------------------+
  +------+        |            Upstream |
  | Root |        | Downstream     Port +--Link--
  | Port +--Link--+ Port                |
  +------+        |          Downstream |
                  |                Port +--Link--
                  +---------------------+

We can't use the Port type to determine which side the Link is on, so add a
bit in struct pci_dev to keep track.

A Root Port's Link is always on the Port's secondary side.  A component
(Endpoint or Port) on the other end of the Link obviously has the Link on
its upstream side.  If that component is a Port, it is part of a Switch or
a Bridge.  A Bridge has a PCI or PCI-X bus on its secondary side, not a
Link.  The internal bus of a Switch connects the Port to another Port whose
Link is on the downstream side.

[bhelgaas: changelog, comment, cache "type", use if/else]
Link: http://lkml.kernel.org/r/54EB81B2.4050904@pobox.com
Link: https://bugzilla.kernel.org/show_bug.cgi?id=94361
Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/probe.c | 18 ++++++++++++++++++
 include/linux/pci.h |  1 +
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 6675a7a1b9fc..96dcd7b8303b 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -973,6 +973,8 @@ void set_pcie_port_type(struct pci_dev *pdev)
 {
 	int pos;
 	u16 reg16;
+	int type;
+	struct pci_dev *parent;
 
 	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
 	if (!pos)
@@ -982,6 +984,22 @@ void set_pcie_port_type(struct pci_dev *pdev)
 	pdev->pcie_flags_reg = reg16;
 	pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
 	pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
+
+	/*
+	 * A Root Port is always the upstream end of a Link.  No PCIe
+	 * component has two Links.  Two Links are connected by a Switch
+	 * that has a Port on each Link and internal logic to connect the
+	 * two Ports.
+	 */
+	type = pci_pcie_type(pdev);
+	if (type == PCI_EXP_TYPE_ROOT_PORT)
+		pdev->has_secondary_link = 1;
+	else if (type == PCI_EXP_TYPE_UPSTREAM ||
+		 type == PCI_EXP_TYPE_DOWNSTREAM) {
+		parent = pci_upstream_bridge(pdev);
+		if (!parent->has_secondary_link)
+			pdev->has_secondary_link = 1;
+	}
 }
 
 void set_pcie_hotplug_bridge(struct pci_dev *pdev)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 353db8dc4c6e..1989f6dc9618 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -355,6 +355,7 @@ struct pci_dev {
 	unsigned int	broken_intx_masking:1;
 	unsigned int	io_window_1k:1;	/* Intel P2P bridge 1K I/O windows */
 	unsigned int	irq_managed:1;
+	unsigned int	has_secondary_link:1;
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
-- 
cgit v1.2.3


From 6374f9124efea5fae9cba263108583c39e22f86b Mon Sep 17 00:00:00 2001
From: Harald Geyer <harald@ccbib.org>
Date: Tue, 7 Apr 2015 11:12:35 +0000
Subject: timekeeping: Provide new API to get the current time resolution

This patch series introduces a new function
u32 ktime_get_resolution_ns(void)
which allows to clean up some driver code.

In particular the IIO subsystem has a function to provide timestamps for
events but no means to get their resolution. So currently the dht11 driver
tries to guess the resolution in a rather messy and convoluted way. We
can do much better with the new code.

This API is not designed to be exposed to user space.

This has been tested on i386, sunxi and mxs.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Harald Geyer <harald@ccbib.org>
[jstultz: Tweaked to make it build after upstream changes]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h |  1 +
 kernel/time/timekeeping.c   | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 99176af216af..9af5c1214682 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -163,6 +163,7 @@ extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
 extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
 extern ktime_t ktime_get_raw(void);
+extern u32 ktime_get_resolution_ns(void);
 
 /**
  * ktime_get_real - get the real (wall-) time in ktime_t format
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3365e32dc208..85d376396313 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -702,6 +702,23 @@ ktime_t ktime_get(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get);
 
+u32 ktime_get_resolution_ns(void)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	unsigned int seq;
+	u32 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	return nsecs;
+}
+EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
+
 static ktime_t *offsets[TK_OFFS_MAX] = {
 	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
 	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
-- 
cgit v1.2.3


From 57d05a93ada77c4f8a6112cbc867a2948dce7991 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 13 May 2015 16:04:47 -0700
Subject: time: Rework debugging variables so they aren't global

Ingo suggested that the timekeeping debugging variables
recently added should not be global, and should be tied
to the timekeeper's read_base.

Thus this patch implements that suggestion.

This version is different from the earlier versions
as it keeps the variables in the timekeeper structure
rather then in the tkr.

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeper_internal.h | 15 +++++++++++++++
 kernel/time/timekeeping.c           | 33 +++++++++++----------------------
 2 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 6f8276ae579c..e1f5a1136554 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -61,6 +61,9 @@ struct tk_read_base {
  *			shifted nano seconds.
  * @ntp_error_shift:	Shift conversion between clock shifted nano seconds and
  *			ntp shifted nano seconds.
+ * @last_warning:	Warning ratelimiter (DEBUG_TIMEKEEPING)
+ * @underflow_seen:	Underflow warning flag (DEBUG_TIMEKEEPING)
+ * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffie times)
@@ -106,6 +109,18 @@ struct timekeeper {
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
+#ifdef CONFIG_DEBUG_TIMEKEEPING
+	long			last_warning;
+	/*
+	 * These simple flag variables are managed
+	 * without locks, which is racy, but they are
+	 * ok since we don't really care about being
+	 * super precise about how many events were
+	 * seen, just that a problem was observed.
+	 */
+	int			underflow_seen;
+	int			overflow_seen;
+#endif
 };
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 85d376396313..2f10b6557a1c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
 
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-/*
- * These simple flag variables are managed
- * without locks, which is racy, but ok since
- * we don't really care about being super
- * precise about how many events were seen,
- * just that a problem was observed.
- */
-static int timekeeping_underflow_seen;
-static int timekeeping_overflow_seen;
-
-/* last_warning is only modified under the timekeeping lock */
-static long timekeeping_last_warning;
 
 static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 {
@@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 		}
 	}
 
-	if (timekeeping_underflow_seen) {
-		if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+	if (tk->underflow_seen) {
+		if (jiffies - tk->last_warning > WARNING_FREQ) {
 			printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
 			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
 			printk_deferred("         Your kernel is probably still fine.\n");
-			timekeeping_last_warning = jiffies;
+			tk->last_warning = jiffies;
 		}
-		timekeeping_underflow_seen = 0;
+		tk->underflow_seen = 0;
 	}
 
-	if (timekeeping_overflow_seen) {
-		if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+	if (tk->overflow_seen) {
+		if (jiffies - tk->last_warning > WARNING_FREQ) {
 			printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
 			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
 			printk_deferred("         Your kernel is probably still fine.\n");
-			timekeeping_last_warning = jiffies;
+			tk->last_warning = jiffies;
 		}
-		timekeeping_overflow_seen = 0;
+		tk->overflow_seen = 0;
 	}
 }
 
 static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
 {
+	struct timekeeper *tk = &tk_core.timekeeper;
 	cycle_t now, last, mask, max, delta;
 	unsigned int seq;
 
@@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
 	 * mask-relative negative values.
 	 */
 	if (unlikely((~delta & mask) < (mask >> 3))) {
-		timekeeping_underflow_seen = 1;
+		tk->underflow_seen = 1;
 		delta = 0;
 	}
 
 	/* Cap delta value to the max_cycles values to avoid mult overflows */
 	if (unlikely(delta > max)) {
-		timekeeping_overflow_seen = 1;
+		tk->overflow_seen = 1;
 		delta = tkr->clock->max_cycles;
 	}
 
-- 
cgit v1.2.3


From 30f3b3f9836c7c7e1f42ca855bbe8127fff4b99a Mon Sep 17 00:00:00 2001
From: Xunlei Pang <pang.xunlei@linaro.org>
Date: Thu, 9 Apr 2015 09:04:40 +0800
Subject: time: Include math64.h in time64.h

On 32-bit systems, timespec64_add_ns() calls __iter_div_u64_rem()
which needs math64.h, and we want to include time64.h in some
cases.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/time64.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index a3831478d9cf..12d4e82b0276 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -2,6 +2,7 @@
 #define _LINUX_TIME64_H
 
 #include <uapi/linux/time.h>
+#include <linux/math64.h>
 
 typedef __s64 time64_t;
 
-- 
cgit v1.2.3


From e83d0a4106d81dd08b70318f078f3bad6acdc110 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <pang.xunlei@linaro.org>
Date: Thu, 9 Apr 2015 09:04:42 +0800
Subject: time: Remove read_boot_clock()

Now that we have a read_boot_clock64() function available on every
architecture, and converted all the users to it, it's time to remove
the (now unused) read_boot_clock() completely from the kernel.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
[jstultz: Minor commit message tweak suggested by Ingo]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h |  1 -
 kernel/time/timekeeping.c   | 14 +++-----------
 2 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 9af5c1214682..3aa72e648650 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -267,7 +267,6 @@ extern int persistent_clock_is_local;
 
 extern void read_persistent_clock(struct timespec *ts);
 extern void read_persistent_clock64(struct timespec64 *ts);
-extern void read_boot_clock(struct timespec *ts);
 extern void read_boot_clock64(struct timespec64 *ts);
 extern int update_persistent_clock(struct timespec now);
 extern int update_persistent_clock64(struct timespec64 now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2f10b6557a1c..90ed5db67c1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1188,28 +1188,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64)
 }
 
 /**
- * read_boot_clock -  Return time of the system start.
+ * read_boot_clock64 -  Return time of the system start.
  *
  * Weak dummy function for arches that do not yet support it.
  * Function to read the exact time the system has been started.
- * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported.
  *
  *  XXX - Do be sure to remove it once all arches implement it.
  */
-void __weak read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock64(struct timespec64 *ts)
 {
 	ts->tv_sec = 0;
 	ts->tv_nsec = 0;
 }
 
-void __weak read_boot_clock64(struct timespec64 *ts64)
-{
-	struct timespec ts;
-
-	read_boot_clock(&ts);
-	*ts64 = timespec_to_timespec64(ts);
-}
-
 /* Flag for if timekeeping_resume() has injected sleeptime */
 static bool sleeptime_injected;
 
-- 
cgit v1.2.3


From 25d238b2260973bfca0b82181824340c7aeae45a Mon Sep 17 00:00:00 2001
From: Rajeev Kumar <rajeevkumar.linux@gmail.com>
Date: Fri, 22 May 2015 09:58:30 -0700
Subject: Input: update email-id of Rajeev Kumar

rajeev-dlh.kumar@st.com email-id doesn't exist anymore as I have left the
company.

Signed-off-by: Rajeev Kumar <rajeevkumar.linux@gmail.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/keyboard/spear-keyboard.c      | 2 +-
 include/linux/platform_data/keyboard-spear.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/spear-keyboard.c b/drivers/input/keyboard/spear-keyboard.c
index f42a543db043..623d451767e3 100644
--- a/drivers/input/keyboard/spear-keyboard.c
+++ b/drivers/input/keyboard/spear-keyboard.c
@@ -3,7 +3,7 @@
  * Based on omap-keypad driver
  *
  * Copyright (C) 2010 ST Microelectronics
- * Rajeev Kumar<rajeev-dlh.kumar@st.com>
+ * Rajeev Kumar <rajeevkumar.linux@gmail.com>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/include/linux/platform_data/keyboard-spear.h b/include/linux/platform_data/keyboard-spear.h
index 9248e3a7e333..5e3ff653900c 100644
--- a/include/linux/platform_data/keyboard-spear.h
+++ b/include/linux/platform_data/keyboard-spear.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010 ST Microelectronics
- * Rajeev Kumar<rajeev-dlh.kumar@st.com>
+ * Rajeev Kumar <rajeevkumar.linux@gmail.com>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
-- 
cgit v1.2.3


From 9d16f207112f77711600fb0770182a06e056e5de Mon Sep 17 00:00:00 2001
From: Saravana Kannan <skannan@codeaurora.org>
Date: Mon, 18 May 2015 10:43:31 +0530
Subject: cpufreq: Track cpu managing sysfs kobjects separately

In order to prepare for the next few commits, that will stop migrating
sysfs files on cpu hotplug, this patch starts managing sysfs-cpu
separately.

The behavior is still the same as we are still migrating sysfs files on
hotplug, later commits would change that.

Signed-off-by: Saravana Kannan <skannan@codeaurora.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 11 +++++++----
 include/linux/cpufreq.h   |  4 +++-
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 870df9400d3f..5d780ff5a10a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -959,7 +959,7 @@ static int cpufreq_add_dev_symlink(struct cpufreq_policy *policy)
 	for_each_cpu(j, policy->cpus) {
 		struct device *cpu_dev;
 
-		if (j == policy->cpu)
+		if (j == policy->kobj_cpu)
 			continue;
 
 		pr_debug("Adding link for CPU: %u\n", j);
@@ -1178,6 +1178,7 @@ static int update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu,
 
 	down_write(&policy->rwsem);
 	policy->cpu = cpu;
+	policy->kobj_cpu = cpu;
 	up_write(&policy->rwsem);
 
 	return 0;
@@ -1235,10 +1236,12 @@ static int cpufreq_add_dev(struct device *dev, struct subsys_interface *sif)
 	 * the creation of a brand new one. So we need to perform this update
 	 * by invoking update_policy_cpu().
 	 */
-	if (recover_policy && cpu != policy->cpu)
+	if (recover_policy && cpu != policy->cpu) {
 		WARN_ON(update_policy_cpu(policy, cpu, dev));
-	else
+	} else {
 		policy->cpu = cpu;
+		policy->kobj_cpu = cpu;
+	}
 
 	cpumask_copy(policy->cpus, cpumask_of(cpu));
 
@@ -1417,7 +1420,7 @@ static int __cpufreq_remove_dev_prepare(struct device *dev,
 			CPUFREQ_NAME_LEN);
 	up_write(&policy->rwsem);
 
-	if (cpu != policy->cpu) {
+	if (cpu != policy->kobj_cpu) {
 		sysfs_remove_link(&dev->kobj, "cpufreq");
 	} else if (cpus > 1) {
 		/* Nominate new CPU */
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 48e37c07eb84..29ad97c34fd5 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -65,7 +65,9 @@ struct cpufreq_policy {
 
 	unsigned int		shared_type; /* ACPI: ANY or ALL affected CPUs
 						should set cpufreq */
-	unsigned int		cpu;    /* cpu nr of CPU managing this policy */
+	unsigned int		cpu;    /* cpu managing this policy, must be online */
+	unsigned int		kobj_cpu; /* cpu managing sysfs files, can be offline */
+
 	struct clk		*clk;
 	struct cpufreq_cpuinfo	cpuinfo;/* see above */
 
-- 
cgit v1.2.3


From 0824965140fff1bf640a987dc790d1594a8e0699 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 13 Apr 2015 16:23:36 +0200
Subject: PCI: Propagate the "ignore hotplug" setting to parent

Refine the mechanism introduced by commit f244d8b623da ("ACPIPHP / radeon /
nouveau: Fix VGA switcheroo problem related to hotplug") to propagate the
ignore_hotplug setting of the device to its parent bridge in case hotplug
notifications related to the graphics adapter switching are given for the
bridge rather than for the device itself (they need to be ignored in both
cases).

Link: https://bugzilla.kernel.org/show_bug.cgi?id=61891
Link: https://bugs.freedesktop.org/show_bug.cgi?id=88927
Fixes: b440bde74f04 ("PCI: Add pci_ignore_hotplug() to ignore hotplug events for a device")
Reported-and-tested-by: tiagdtd-lava <tiagdtd-lava@yahoo.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: stable@vger.kernel.org	# v3.17+
---
 drivers/pci/pci.c   | 11 +++++++++++
 include/linux/pci.h |  6 +-----
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index acc4b6ef78c4..c44393f26fd3 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4324,6 +4324,17 @@ bool pci_device_is_present(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL_GPL(pci_device_is_present);
 
+void pci_ignore_hotplug(struct pci_dev *dev)
+{
+	struct pci_dev *bridge = dev->bus->self;
+
+	dev->ignore_hotplug = 1;
+	/* Propagate the "ignore hotplug" setting to the parent bridge. */
+	if (bridge)
+		bridge->ignore_hotplug = 1;
+}
+EXPORT_SYMBOL_GPL(pci_ignore_hotplug);
+
 #define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE
 static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0};
 static DEFINE_SPINLOCK(resource_alignment_lock);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 353db8dc4c6e..ef45ffe9ca88 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1006,6 +1006,7 @@ int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
 bool pci_device_is_present(struct pci_dev *pdev);
+void pci_ignore_hotplug(struct pci_dev *dev);
 
 /* ROM control related routines */
 int pci_enable_rom(struct pci_dev *pdev);
@@ -1043,11 +1044,6 @@ bool pci_dev_run_wake(struct pci_dev *dev);
 bool pci_check_pme_status(struct pci_dev *dev);
 void pci_pme_wakeup_bus(struct pci_bus *bus);
 
-static inline void pci_ignore_hotplug(struct pci_dev *dev)
-{
-	dev->ignore_hotplug = 1;
-}
-
 static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state,
 				  bool enable)
 {
-- 
cgit v1.2.3


From edd4ab0559316a1efe0881a4e2ccaeb4fec73142 Mon Sep 17 00:00:00 2001
From: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
Date: Sun, 24 May 2015 09:11:58 +0530
Subject: power: max17042_battery: add HEALTH and TEMP_* properties support

This patch adds the support for following battery properties
to max17042 fuel gauge driver.

POWER_SUPPLY_PROP_TEMP_ALERT_MIN
POWER_SUPPLY_PROP_TEMP_ALERT_MAX
POWER_SUPPLY_PROP_TEMP_MIN
POWER_SUPPLY_PROP_TEMP_MAX
POWER_SUPPLY_PROP_HEALTH

Signed-off-by: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
Reviewed-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/max17042_battery.c       | 190 +++++++++++++++++++++++++++++++--
 include/linux/power/max17042_battery.h |   4 +
 2 files changed, 183 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/max17042_battery.c b/drivers/power/max17042_battery.c
index 6cc5e87ec031..748d9762e89b 100644
--- a/drivers/power/max17042_battery.c
+++ b/drivers/power/max17042_battery.c
@@ -63,6 +63,8 @@
 #define dP_ACC_100	0x1900
 #define dP_ACC_200	0x3200
 
+#define MAX17042_VMAX_TOLERANCE		50 /* 50 mV */
+
 struct max17042_chip {
 	struct i2c_client *client;
 	struct regmap *regmap;
@@ -85,10 +87,94 @@ static enum power_supply_property max17042_battery_props[] = {
 	POWER_SUPPLY_PROP_CHARGE_FULL,
 	POWER_SUPPLY_PROP_CHARGE_COUNTER,
 	POWER_SUPPLY_PROP_TEMP,
+	POWER_SUPPLY_PROP_TEMP_ALERT_MIN,
+	POWER_SUPPLY_PROP_TEMP_ALERT_MAX,
+	POWER_SUPPLY_PROP_TEMP_MIN,
+	POWER_SUPPLY_PROP_TEMP_MAX,
+	POWER_SUPPLY_PROP_HEALTH,
 	POWER_SUPPLY_PROP_CURRENT_NOW,
 	POWER_SUPPLY_PROP_CURRENT_AVG,
 };
 
+static int max17042_get_temperature(struct max17042_chip *chip, int *temp)
+{
+	int ret;
+	u32 data;
+	struct regmap *map = chip->regmap;
+
+	ret = regmap_read(map, MAX17042_TEMP, &data);
+	if (ret < 0)
+		return ret;
+
+	*temp = data;
+	/* The value is signed. */
+	if (*temp & 0x8000) {
+		*temp = (0x7fff & ~*temp) + 1;
+		*temp *= -1;
+	}
+
+	/* The value is converted into deci-centigrade scale */
+	/* Units of LSB = 1 / 256 degree Celsius */
+	*temp = *temp * 10 / 256;
+	return 0;
+}
+
+static int max17042_get_battery_health(struct max17042_chip *chip, int *health)
+{
+	int temp, vavg, vbatt, ret;
+	u32 val;
+
+	ret = regmap_read(chip->regmap, MAX17042_AvgVCELL, &val);
+	if (ret < 0)
+		goto health_error;
+
+	/* bits [0-3] unused */
+	vavg = val * 625 / 8;
+	/* Convert to millivolts */
+	vavg /= 1000;
+
+	ret = regmap_read(chip->regmap, MAX17042_VCELL, &val);
+	if (ret < 0)
+		goto health_error;
+
+	/* bits [0-3] unused */
+	vbatt = val * 625 / 8;
+	/* Convert to millivolts */
+	vbatt /= 1000;
+
+	if (vavg < chip->pdata->vmin) {
+		*health = POWER_SUPPLY_HEALTH_DEAD;
+		goto out;
+	}
+
+	if (vbatt > chip->pdata->vmax + MAX17042_VMAX_TOLERANCE) {
+		*health = POWER_SUPPLY_HEALTH_OVERVOLTAGE;
+		goto out;
+	}
+
+	ret = max17042_get_temperature(chip, &temp);
+	if (ret < 0)
+		goto health_error;
+
+	if (temp <= chip->pdata->temp_min) {
+		*health = POWER_SUPPLY_HEALTH_COLD;
+		goto out;
+	}
+
+	if (temp >= chip->pdata->temp_max) {
+		*health = POWER_SUPPLY_HEALTH_OVERHEAT;
+		goto out;
+	}
+
+	*health = POWER_SUPPLY_HEALTH_GOOD;
+
+out:
+	return 0;
+
+health_error:
+	return ret;
+}
+
 static int max17042_get_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
@@ -181,19 +267,34 @@ static int max17042_get_property(struct power_supply *psy,
 		val->intval = data * 1000 / 2;
 		break;
 	case POWER_SUPPLY_PROP_TEMP:
-		ret = regmap_read(map, MAX17042_TEMP, &data);
+		ret = max17042_get_temperature(chip, &val->intval);
+		if (ret < 0)
+			return ret;
+		break;
+	case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
+		ret = regmap_read(map, MAX17042_TALRT_Th, &data);
+		if (ret < 0)
+			return ret;
+		/* LSB is Alert Minimum. In deci-centigrade */
+		val->intval = (data & 0xff) * 10;
+		break;
+	case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
+		ret = regmap_read(map, MAX17042_TALRT_Th, &data);
+		if (ret < 0)
+			return ret;
+		/* MSB is Alert Maximum. In deci-centigrade */
+		val->intval = (data >> 8) * 10;
+		break;
+	case POWER_SUPPLY_PROP_TEMP_MIN:
+		val->intval = chip->pdata->temp_min;
+		break;
+	case POWER_SUPPLY_PROP_TEMP_MAX:
+		val->intval = chip->pdata->temp_max;
+		break;
+	case POWER_SUPPLY_PROP_HEALTH:
+		ret = max17042_get_battery_health(chip, &val->intval);
 		if (ret < 0)
 			return ret;
-
-		val->intval = data;
-		/* The value is signed. */
-		if (val->intval & 0x8000) {
-			val->intval = (0x7fff & ~val->intval) + 1;
-			val->intval *= -1;
-		}
-		/* The value is converted into deci-centigrade scale */
-		/* Units of LSB = 1 / 256 degree Celsius */
-		val->intval = val->intval * 10 / 256;
 		break;
 	case POWER_SUPPLY_PROP_CURRENT_NOW:
 		if (chip->pdata->enable_current_sense) {
@@ -237,6 +338,69 @@ static int max17042_get_property(struct power_supply *psy,
 	return 0;
 }
 
+static int max17042_set_property(struct power_supply *psy,
+			    enum power_supply_property psp,
+			    const union power_supply_propval *val)
+{
+	struct max17042_chip *chip = power_supply_get_drvdata(psy);
+	struct regmap *map = chip->regmap;
+	int ret = 0;
+	u32 data;
+	int8_t temp;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
+		ret = regmap_read(map, MAX17042_TALRT_Th, &data);
+		if (ret < 0)
+			return ret;
+
+		/* Input in deci-centigrade, convert to centigrade */
+		temp = val->intval / 10;
+		/* force min < max */
+		if (temp >= (int8_t)(data >> 8))
+			temp = (int8_t)(data >> 8) - 1;
+		/* Write both MAX and MIN ALERT */
+		data = (data & 0xff00) + temp;
+		ret = regmap_write(map, MAX17042_TALRT_Th, data);
+		break;
+	case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
+		ret = regmap_read(map, MAX17042_TALRT_Th, &data);
+		if (ret < 0)
+			return ret;
+
+		/* Input in Deci-Centigrade, convert to centigrade */
+		temp = val->intval / 10;
+		/* force max > min */
+		if (temp <= (int8_t)(data & 0xff))
+			temp = (int8_t)(data & 0xff) + 1;
+		/* Write both MAX and MIN ALERT */
+		data = (data & 0xff) + (temp << 8);
+		ret = regmap_write(map, MAX17042_TALRT_Th, data);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int max17042_property_is_writeable(struct power_supply *psy,
+		enum power_supply_property psp)
+{
+	int ret;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_TEMP_ALERT_MIN:
+	case POWER_SUPPLY_PROP_TEMP_ALERT_MAX:
+		ret = 1;
+		break;
+	default:
+		ret = 0;
+	}
+
+	return ret;
+}
+
 static int max17042_write_verify_reg(struct regmap *map, u8 reg, u32 value)
 {
 	int retries = 8;
@@ -665,6 +829,8 @@ static const struct power_supply_desc max17042_psy_desc = {
 	.name		= "max170xx_battery",
 	.type		= POWER_SUPPLY_TYPE_BATTERY,
 	.get_property	= max17042_get_property,
+	.set_property	= max17042_set_property,
+	.property_is_writeable	= max17042_property_is_writeable,
 	.properties	= max17042_battery_props,
 	.num_properties	= ARRAY_SIZE(max17042_battery_props),
 };
@@ -673,6 +839,8 @@ static const struct power_supply_desc max17042_no_current_sense_psy_desc = {
 	.name		= "max170xx_battery",
 	.type		= POWER_SUPPLY_TYPE_BATTERY,
 	.get_property	= max17042_get_property,
+	.set_property	= max17042_set_property,
+	.property_is_writeable	= max17042_property_is_writeable,
 	.properties	= max17042_battery_props,
 	.num_properties	= ARRAY_SIZE(max17042_battery_props) - 2,
 };
diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h
index cf112b4075c8..522757ac9cd4 100644
--- a/include/linux/power/max17042_battery.h
+++ b/include/linux/power/max17042_battery.h
@@ -215,6 +215,10 @@ struct max17042_platform_data {
 	 * the datasheet although it can be changed by board designers.
 	 */
 	unsigned int r_sns;
+	int         vmin;	/* in millivolts */
+	int         vmax;	/* in millivolts */
+	int         temp_min;	/* in tenths of degree Celsius */
+	int         temp_max;	/* in tenths of degree Celsius */
 };
 
 #endif /* __MAX17042_BATTERY_H_ */
-- 
cgit v1.2.3


From 843735b788a4e49c453f4aefdae80e6dfbe9ee85 Mon Sep 17 00:00:00 2001
From: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
Date: Mon, 4 May 2015 22:16:07 +0530
Subject: power: axp288_charger: axp288 charger driver

This patch adds new power supply charger driver support
for X-Power AXP288 PMIC integrated charger.

This driver interfaces with the axp20x mfd driver as a cell
and listens to extcon cable events for setting up charging.

Signed-off-by: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/Kconfig          |   7 +
 drivers/power/Makefile         |   1 +
 drivers/power/axp288_charger.c | 941 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h     |   7 +
 4 files changed, 956 insertions(+)
 create mode 100644 drivers/power/axp288_charger.c

(limited to 'include/linux')

diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index 0fe5b08055ba..a848b1a5273e 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -204,6 +204,13 @@ config CHARGER_DA9150
 	  This driver can also be built as a module. If so, the module will be
 	  called da9150-charger.
 
+config AXP288_CHARGER
+	tristate "X-Powers AXP288 Charger"
+	depends on MFD_AXP20X && EXTCON_AXP288
+	help
+	  Say yes here to have support X-Power AXP288 power management IC (PMIC)
+	  integrated charger.
+
 config AXP288_FUEL_GAUGE
 	tristate "X-Powers AXP288 Fuel Gauge"
 	depends on MFD_AXP20X && IIO
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index 03942e99776b..3572a72432d0 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -66,3 +66,4 @@ obj-$(CONFIG_CHARGER_SMB347)	+= smb347-charger.o
 obj-$(CONFIG_CHARGER_TPS65090)	+= tps65090-charger.o
 obj-$(CONFIG_POWER_RESET)	+= reset/
 obj-$(CONFIG_AXP288_FUEL_GAUGE) += axp288_fuel_gauge.o
+obj-$(CONFIG_AXP288_CHARGER)	+= axp288_charger.o
diff --git a/drivers/power/axp288_charger.c b/drivers/power/axp288_charger.c
new file mode 100644
index 000000000000..5680317f4823
--- /dev/null
+++ b/drivers/power/axp288_charger.c
@@ -0,0 +1,941 @@
+/*
+ * axp288_charger.c - X-power AXP288 PMIC Charger driver
+ *
+ * Copyright (C) 2014 Intel Corporation
+ * Author: Ramakrishna Pallala <ramakrishna.pallala@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/regmap.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/usb/otg.h>
+#include <linux/notifier.h>
+#include <linux/power_supply.h>
+#include <linux/notifier.h>
+#include <linux/property.h>
+#include <linux/mfd/axp20x.h>
+#include <linux/extcon.h>
+
+#define PS_STAT_VBUS_TRIGGER		(1 << 0)
+#define PS_STAT_BAT_CHRG_DIR		(1 << 2)
+#define PS_STAT_VBAT_ABOVE_VHOLD	(1 << 3)
+#define PS_STAT_VBUS_VALID		(1 << 4)
+#define PS_STAT_VBUS_PRESENT		(1 << 5)
+
+#define CHRG_STAT_BAT_SAFE_MODE		(1 << 3)
+#define CHRG_STAT_BAT_VALID		(1 << 4)
+#define CHRG_STAT_BAT_PRESENT		(1 << 5)
+#define CHRG_STAT_CHARGING		(1 << 6)
+#define CHRG_STAT_PMIC_OTP		(1 << 7)
+
+#define VBUS_ISPOUT_CUR_LIM_MASK	0x03
+#define VBUS_ISPOUT_CUR_LIM_BIT_POS	0
+#define VBUS_ISPOUT_CUR_LIM_900MA	0x0	/* 900mA */
+#define VBUS_ISPOUT_CUR_LIM_1500MA	0x1	/* 1500mA */
+#define VBUS_ISPOUT_CUR_LIM_2000MA	0x2	/* 2000mA */
+#define VBUS_ISPOUT_CUR_NO_LIM		0x3	/* 2500mA */
+#define VBUS_ISPOUT_VHOLD_SET_MASK	0x31
+#define VBUS_ISPOUT_VHOLD_SET_BIT_POS	0x3
+#define VBUS_ISPOUT_VHOLD_SET_OFFSET	4000	/* 4000mV */
+#define VBUS_ISPOUT_VHOLD_SET_LSB_RES	100	/* 100mV */
+#define VBUS_ISPOUT_VHOLD_SET_4300MV	0x3	/* 4300mV */
+#define VBUS_ISPOUT_VBUS_PATH_DIS	(1 << 7)
+
+#define CHRG_CCCV_CC_MASK		0xf		/* 4 bits */
+#define CHRG_CCCV_CC_BIT_POS		0
+#define CHRG_CCCV_CC_OFFSET		200		/* 200mA */
+#define CHRG_CCCV_CC_LSB_RES		200		/* 200mA */
+#define CHRG_CCCV_ITERM_20P		(1 << 4)	/* 20% of CC */
+#define CHRG_CCCV_CV_MASK		0x60		/* 2 bits */
+#define CHRG_CCCV_CV_BIT_POS		5
+#define CHRG_CCCV_CV_4100MV		0x0		/* 4.10V */
+#define CHRG_CCCV_CV_4150MV		0x1		/* 4.15V */
+#define CHRG_CCCV_CV_4200MV		0x2		/* 4.20V */
+#define CHRG_CCCV_CV_4350MV		0x3		/* 4.35V */
+#define CHRG_CCCV_CHG_EN		(1 << 7)
+
+#define CNTL2_CC_TIMEOUT_MASK		0x3	/* 2 bits */
+#define CNTL2_CC_TIMEOUT_OFFSET		6	/* 6 Hrs */
+#define CNTL2_CC_TIMEOUT_LSB_RES	2	/* 2 Hrs */
+#define CNTL2_CC_TIMEOUT_12HRS		0x3	/* 12 Hrs */
+#define CNTL2_CHGLED_TYPEB		(1 << 4)
+#define CNTL2_CHG_OUT_TURNON		(1 << 5)
+#define CNTL2_PC_TIMEOUT_MASK		0xC0
+#define CNTL2_PC_TIMEOUT_OFFSET		40	/* 40 mins */
+#define CNTL2_PC_TIMEOUT_LSB_RES	10	/* 10 mins */
+#define CNTL2_PC_TIMEOUT_70MINS		0x3
+
+#define CHRG_ILIM_TEMP_LOOP_EN		(1 << 3)
+#define CHRG_VBUS_ILIM_MASK		0xf0
+#define CHRG_VBUS_ILIM_BIT_POS		4
+#define CHRG_VBUS_ILIM_100MA		0x0	/* 100mA */
+#define CHRG_VBUS_ILIM_500MA		0x1	/* 500mA */
+#define CHRG_VBUS_ILIM_900MA		0x2	/* 900mA */
+#define CHRG_VBUS_ILIM_1500MA		0x3	/* 1500mA */
+#define CHRG_VBUS_ILIM_2000MA		0x4	/* 2000mA */
+#define CHRG_VBUS_ILIM_2500MA		0x5	/* 2500mA */
+#define CHRG_VBUS_ILIM_3000MA		0x6	/* 3000mA */
+
+#define CHRG_VLTFC_0C			0xA5	/* 0 DegC */
+#define CHRG_VHTFC_45C			0x1F	/* 45 DegC */
+
+#define BAT_IRQ_CFG_CHRG_DONE		(1 << 2)
+#define BAT_IRQ_CFG_CHRG_START		(1 << 3)
+#define BAT_IRQ_CFG_BAT_SAFE_EXIT	(1 << 4)
+#define BAT_IRQ_CFG_BAT_SAFE_ENTER	(1 << 5)
+#define BAT_IRQ_CFG_BAT_DISCON		(1 << 6)
+#define BAT_IRQ_CFG_BAT_CONN		(1 << 7)
+#define BAT_IRQ_CFG_BAT_MASK		0xFC
+
+#define TEMP_IRQ_CFG_QCBTU		(1 << 4)
+#define TEMP_IRQ_CFG_CBTU		(1 << 5)
+#define TEMP_IRQ_CFG_QCBTO		(1 << 6)
+#define TEMP_IRQ_CFG_CBTO		(1 << 7)
+#define TEMP_IRQ_CFG_MASK		0xF0
+
+#define FG_CNTL_OCV_ADJ_EN		(1 << 3)
+
+#define CV_4100MV			4100	/* 4100mV */
+#define CV_4150MV			4150	/* 4150mV */
+#define CV_4200MV			4200	/* 4200mV */
+#define CV_4350MV			4350	/* 4350mV */
+
+#define CC_200MA			200	/*  200mA */
+#define CC_600MA			600	/*  600mA */
+#define CC_800MA			800	/*  800mA */
+#define CC_1000MA			1000	/* 1000mA */
+#define CC_1600MA			1600	/* 1600mA */
+#define CC_2000MA			2000	/* 2000mA */
+
+#define ILIM_100MA			100	/* 100mA */
+#define ILIM_500MA			500	/* 500mA */
+#define ILIM_900MA			900	/* 900mA */
+#define ILIM_1500MA			1500	/* 1500mA */
+#define ILIM_2000MA			2000	/* 2000mA */
+#define ILIM_2500MA			2500	/* 2500mA */
+#define ILIM_3000MA			3000	/* 3000mA */
+
+#define AXP288_EXTCON_DEV_NAME		"axp288_extcon"
+
+#define AXP288_EXTCON_SLOW_CHARGER		"SLOW-CHARGER"
+#define AXP288_EXTCON_DOWNSTREAM_CHARGER	"CHARGE-DOWNSTREAM"
+#define AXP288_EXTCON_FAST_CHARGER		"FAST-CHARGER"
+
+enum {
+	VBUS_OV_IRQ = 0,
+	CHARGE_DONE_IRQ,
+	CHARGE_CHARGING_IRQ,
+	BAT_SAFE_QUIT_IRQ,
+	BAT_SAFE_ENTER_IRQ,
+	QCBTU_IRQ,
+	CBTU_IRQ,
+	QCBTO_IRQ,
+	CBTO_IRQ,
+	CHRG_INTR_END,
+};
+
+struct axp288_chrg_info {
+	struct platform_device *pdev;
+	struct axp20x_chrg_pdata *pdata;
+	struct regmap *regmap;
+	struct regmap_irq_chip_data *regmap_irqc;
+	int irq[CHRG_INTR_END];
+	struct power_supply *psy_usb;
+	struct mutex lock;
+
+	/* OTG/Host mode */
+	struct {
+		struct work_struct work;
+		struct extcon_specific_cable_nb cable;
+		struct notifier_block id_nb;
+		bool id_short;
+	} otg;
+
+	/* SDP/CDP/DCP USB charging cable notifications */
+	struct {
+		struct extcon_dev *edev;
+		bool connected;
+		enum power_supply_type chg_type;
+		struct notifier_block nb;
+		struct work_struct work;
+	} cable;
+
+	int health;
+	int inlmt;
+	int cc;
+	int cv;
+	int max_cc;
+	int max_cv;
+	bool online;
+	bool present;
+	bool enable_charger;
+	bool is_charger_enabled;
+};
+
+static inline int axp288_charger_set_cc(struct axp288_chrg_info *info, int cc)
+{
+	u8 reg_val;
+	int ret;
+
+	if (cc < CHRG_CCCV_CC_OFFSET)
+		cc = CHRG_CCCV_CC_OFFSET;
+	else if (cc > info->max_cc)
+		cc = info->max_cc;
+
+	reg_val = (cc - CHRG_CCCV_CC_OFFSET) / CHRG_CCCV_CC_LSB_RES;
+	cc = (reg_val * CHRG_CCCV_CC_LSB_RES) + CHRG_CCCV_CC_OFFSET;
+	reg_val = reg_val << CHRG_CCCV_CC_BIT_POS;
+
+	ret = regmap_update_bits(info->regmap,
+				AXP20X_CHRG_CTRL1,
+				CHRG_CCCV_CC_MASK, reg_val);
+	if (ret >= 0)
+		info->cc = cc;
+
+	return ret;
+}
+
+static inline int axp288_charger_set_cv(struct axp288_chrg_info *info, int cv)
+{
+	u8 reg_val;
+	int ret;
+
+	if (cv <= CV_4100MV) {
+		reg_val = CHRG_CCCV_CV_4100MV;
+		cv = CV_4100MV;
+	} else if (cv <= CV_4150MV) {
+		reg_val = CHRG_CCCV_CV_4150MV;
+		cv = CV_4150MV;
+	} else if (cv <= CV_4200MV) {
+		reg_val = CHRG_CCCV_CV_4200MV;
+		cv = CV_4200MV;
+	} else {
+		reg_val = CHRG_CCCV_CV_4350MV;
+		cv = CV_4350MV;
+	}
+
+	reg_val = reg_val << CHRG_CCCV_CV_BIT_POS;
+
+	ret = regmap_update_bits(info->regmap,
+				AXP20X_CHRG_CTRL1,
+				CHRG_CCCV_CV_MASK, reg_val);
+
+	if (ret >= 0)
+		info->cv = cv;
+
+	return ret;
+}
+
+static inline int axp288_charger_set_vbus_inlmt(struct axp288_chrg_info *info,
+					   int inlmt)
+{
+	int ret;
+	unsigned int val;
+	u8 reg_val;
+
+	/* Read in limit register */
+	ret = regmap_read(info->regmap, AXP20X_CHRG_BAK_CTRL, &val);
+	if (ret < 0)
+		goto set_inlmt_fail;
+
+	if (inlmt <= ILIM_100MA) {
+		reg_val = CHRG_VBUS_ILIM_100MA;
+		inlmt = ILIM_100MA;
+	} else if (inlmt <= ILIM_500MA) {
+		reg_val = CHRG_VBUS_ILIM_500MA;
+		inlmt = ILIM_500MA;
+	} else if (inlmt <= ILIM_900MA) {
+		reg_val = CHRG_VBUS_ILIM_900MA;
+		inlmt = ILIM_900MA;
+	} else if (inlmt <= ILIM_1500MA) {
+		reg_val = CHRG_VBUS_ILIM_1500MA;
+		inlmt = ILIM_1500MA;
+	} else if (inlmt <= ILIM_2000MA) {
+		reg_val = CHRG_VBUS_ILIM_2000MA;
+		inlmt = ILIM_2000MA;
+	} else if (inlmt <= ILIM_2500MA) {
+		reg_val = CHRG_VBUS_ILIM_2500MA;
+		inlmt = ILIM_2500MA;
+	} else {
+		reg_val = CHRG_VBUS_ILIM_3000MA;
+		inlmt = ILIM_3000MA;
+	}
+
+	reg_val = (val & ~CHRG_VBUS_ILIM_MASK)
+			| (reg_val << CHRG_VBUS_ILIM_BIT_POS);
+	ret = regmap_write(info->regmap, AXP20X_CHRG_BAK_CTRL, reg_val);
+	if (ret >= 0)
+		info->inlmt = inlmt;
+	else
+		dev_err(&info->pdev->dev, "charger BAK control %d\n", ret);
+
+
+set_inlmt_fail:
+	return ret;
+}
+
+static int axp288_charger_vbus_path_select(struct axp288_chrg_info *info,
+								bool enable)
+{
+	int ret;
+
+	if (enable)
+		ret = regmap_update_bits(info->regmap, AXP20X_VBUS_IPSOUT_MGMT,
+					VBUS_ISPOUT_VBUS_PATH_DIS, 0);
+	else
+		ret = regmap_update_bits(info->regmap, AXP20X_VBUS_IPSOUT_MGMT,
+			VBUS_ISPOUT_VBUS_PATH_DIS, VBUS_ISPOUT_VBUS_PATH_DIS);
+
+	if (ret < 0)
+		dev_err(&info->pdev->dev, "axp288 vbus path select %d\n", ret);
+
+
+	return ret;
+}
+
+static int axp288_charger_enable_charger(struct axp288_chrg_info *info,
+								bool enable)
+{
+	int ret;
+
+	if (enable)
+		ret = regmap_update_bits(info->regmap, AXP20X_CHRG_CTRL1,
+				CHRG_CCCV_CHG_EN, CHRG_CCCV_CHG_EN);
+	else
+		ret = regmap_update_bits(info->regmap, AXP20X_CHRG_CTRL1,
+				CHRG_CCCV_CHG_EN, 0);
+	if (ret < 0)
+		dev_err(&info->pdev->dev, "axp288 enable charger %d\n", ret);
+	else
+		info->is_charger_enabled = enable;
+
+	return ret;
+}
+
+static int axp288_charger_is_present(struct axp288_chrg_info *info)
+{
+	int ret, present = 0;
+	unsigned int val;
+
+	ret = regmap_read(info->regmap, AXP20X_PWR_INPUT_STATUS, &val);
+	if (ret < 0)
+		return ret;
+
+	if (val & PS_STAT_VBUS_PRESENT)
+		present = 1;
+	return present;
+}
+
+static int axp288_charger_is_online(struct axp288_chrg_info *info)
+{
+	int ret, online = 0;
+	unsigned int val;
+
+	ret = regmap_read(info->regmap, AXP20X_PWR_INPUT_STATUS, &val);
+	if (ret < 0)
+		return ret;
+
+	if (val & PS_STAT_VBUS_VALID)
+		online = 1;
+	return online;
+}
+
+static int axp288_get_charger_health(struct axp288_chrg_info *info)
+{
+	int ret, pwr_stat, chrg_stat;
+	int health = POWER_SUPPLY_HEALTH_UNKNOWN;
+	unsigned int val;
+
+	ret = regmap_read(info->regmap, AXP20X_PWR_INPUT_STATUS, &val);
+	if ((ret < 0) || !(val & PS_STAT_VBUS_PRESENT))
+		goto health_read_fail;
+	else
+		pwr_stat = val;
+
+	ret = regmap_read(info->regmap, AXP20X_PWR_OP_MODE, &val);
+	if (ret < 0)
+		goto health_read_fail;
+	else
+		chrg_stat = val;
+
+	if (!(pwr_stat & PS_STAT_VBUS_VALID))
+		health = POWER_SUPPLY_HEALTH_DEAD;
+	else if (chrg_stat & CHRG_STAT_PMIC_OTP)
+		health = POWER_SUPPLY_HEALTH_OVERHEAT;
+	else if (chrg_stat & CHRG_STAT_BAT_SAFE_MODE)
+		health = POWER_SUPPLY_HEALTH_SAFETY_TIMER_EXPIRE;
+	else
+		health = POWER_SUPPLY_HEALTH_GOOD;
+
+health_read_fail:
+	return health;
+}
+
+static int axp288_charger_usb_set_property(struct power_supply *psy,
+				    enum power_supply_property psp,
+				    const union power_supply_propval *val)
+{
+	struct axp288_chrg_info *info = power_supply_get_drvdata(psy);
+	int ret = 0;
+	int scaled_val;
+
+	mutex_lock(&info->lock);
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT:
+		scaled_val = min(val->intval, info->max_cc);
+		scaled_val = DIV_ROUND_CLOSEST(scaled_val, 1000);
+		ret = axp288_charger_set_cc(info, scaled_val);
+		if (ret < 0)
+			dev_warn(&info->pdev->dev, "set charge current failed\n");
+		break;
+	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE:
+		scaled_val = min(val->intval, info->max_cv);
+		scaled_val = DIV_ROUND_CLOSEST(scaled_val, 1000);
+		ret = axp288_charger_set_cv(info, scaled_val);
+		if (ret < 0)
+			dev_warn(&info->pdev->dev, "set charge voltage failed\n");
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&info->lock);
+	return ret;
+}
+
+static int axp288_charger_usb_get_property(struct power_supply *psy,
+				    enum power_supply_property psp,
+				    union power_supply_propval *val)
+{
+	struct axp288_chrg_info *info = power_supply_get_drvdata(psy);
+	int ret = 0;
+
+	mutex_lock(&info->lock);
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_PRESENT:
+		/* Check for OTG case first */
+		if (info->otg.id_short) {
+			val->intval = 0;
+			break;
+		}
+		ret = axp288_charger_is_present(info);
+		if (ret < 0)
+			goto psy_get_prop_fail;
+		info->present = ret;
+		val->intval = info->present;
+		break;
+	case POWER_SUPPLY_PROP_ONLINE:
+		/* Check for OTG case first */
+		if (info->otg.id_short) {
+			val->intval = 0;
+			break;
+		}
+		ret = axp288_charger_is_online(info);
+		if (ret < 0)
+			goto psy_get_prop_fail;
+		info->online = ret;
+		val->intval = info->online;
+		break;
+	case POWER_SUPPLY_PROP_HEALTH:
+		val->intval = axp288_get_charger_health(info);
+		break;
+	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT:
+		val->intval = info->cc * 1000;
+		break;
+	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX:
+		val->intval = info->max_cc * 1000;
+		break;
+	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE:
+		val->intval = info->cv * 1000;
+		break;
+	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX:
+		val->intval = info->max_cv * 1000;
+		break;
+	case POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT:
+		val->intval = info->inlmt * 1000;
+		break;
+	default:
+		ret = -EINVAL;
+		goto psy_get_prop_fail;
+	}
+
+psy_get_prop_fail:
+	mutex_unlock(&info->lock);
+	return ret;
+}
+
+static int axp288_charger_property_is_writeable(struct power_supply *psy,
+		enum power_supply_property psp)
+{
+	int ret;
+
+	switch (psp) {
+	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT:
+	case POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE:
+		ret = 1;
+		break;
+	default:
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static enum power_supply_property axp288_usb_props[] = {
+	POWER_SUPPLY_PROP_PRESENT,
+	POWER_SUPPLY_PROP_ONLINE,
+	POWER_SUPPLY_PROP_TYPE,
+	POWER_SUPPLY_PROP_HEALTH,
+	POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT,
+	POWER_SUPPLY_PROP_CONSTANT_CHARGE_CURRENT_MAX,
+	POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE,
+	POWER_SUPPLY_PROP_CONSTANT_CHARGE_VOLTAGE_MAX,
+	POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT,
+};
+
+static const struct power_supply_desc axp288_charger_desc = {
+	.name			= "axp288_charger",
+	.type			= POWER_SUPPLY_TYPE_USB,
+	.properties		= axp288_usb_props,
+	.num_properties		= ARRAY_SIZE(axp288_usb_props),
+	.get_property		= axp288_charger_usb_get_property,
+	.set_property		= axp288_charger_usb_set_property,
+	.property_is_writeable	= axp288_charger_property_is_writeable,
+};
+
+static irqreturn_t axp288_charger_irq_thread_handler(int irq, void *dev)
+{
+	struct axp288_chrg_info *info = dev;
+	int i;
+
+	for (i = 0; i < CHRG_INTR_END; i++) {
+		if (info->irq[i] == irq)
+			break;
+	}
+
+	if (i >= CHRG_INTR_END) {
+		dev_warn(&info->pdev->dev, "spurious interrupt!!\n");
+		return IRQ_NONE;
+	}
+
+	switch (i) {
+	case VBUS_OV_IRQ:
+		dev_dbg(&info->pdev->dev, "VBUS Over Voltage INTR\n");
+		break;
+	case CHARGE_DONE_IRQ:
+		dev_dbg(&info->pdev->dev, "Charging Done INTR\n");
+		break;
+	case CHARGE_CHARGING_IRQ:
+		dev_dbg(&info->pdev->dev, "Start Charging IRQ\n");
+		break;
+	case BAT_SAFE_QUIT_IRQ:
+		dev_dbg(&info->pdev->dev,
+			"Quit Safe Mode(restart timer) Charging IRQ\n");
+		break;
+	case BAT_SAFE_ENTER_IRQ:
+		dev_dbg(&info->pdev->dev,
+			"Enter Safe Mode(timer expire) Charging IRQ\n");
+		break;
+	case QCBTU_IRQ:
+		dev_dbg(&info->pdev->dev,
+			"Quit Battery Under Temperature(CHRG) INTR\n");
+		break;
+	case CBTU_IRQ:
+		dev_dbg(&info->pdev->dev,
+			"Hit Battery Under Temperature(CHRG) INTR\n");
+		break;
+	case QCBTO_IRQ:
+		dev_dbg(&info->pdev->dev,
+			"Quit Battery Over Temperature(CHRG) INTR\n");
+		break;
+	case CBTO_IRQ:
+		dev_dbg(&info->pdev->dev,
+			"Hit Battery Over Temperature(CHRG) INTR\n");
+		break;
+	default:
+		dev_warn(&info->pdev->dev, "Spurious Interrupt!!!\n");
+		goto out;
+	}
+
+	power_supply_changed(info->psy_usb);
+out:
+	return IRQ_HANDLED;
+}
+
+static void axp288_charger_extcon_evt_worker(struct work_struct *work)
+{
+	struct axp288_chrg_info *info =
+	    container_of(work, struct axp288_chrg_info, cable.work);
+	int ret, current_limit;
+	bool changed = false;
+	struct extcon_dev *edev = info->cable.edev;
+	bool old_connected = info->cable.connected;
+
+	/* Determine cable/charger type */
+	if (extcon_get_cable_state(edev, AXP288_EXTCON_SLOW_CHARGER) > 0) {
+		dev_dbg(&info->pdev->dev, "USB SDP charger  is connected");
+		info->cable.connected = true;
+		info->cable.chg_type = POWER_SUPPLY_TYPE_USB;
+	} else if (extcon_get_cable_state(edev,
+				AXP288_EXTCON_DOWNSTREAM_CHARGER) > 0) {
+		dev_dbg(&info->pdev->dev, "USB CDP charger is connected");
+		info->cable.connected = true;
+		info->cable.chg_type = POWER_SUPPLY_TYPE_USB_CDP;
+	} else if (extcon_get_cable_state(edev,
+					AXP288_EXTCON_FAST_CHARGER) > 0) {
+		dev_dbg(&info->pdev->dev, "USB DCP charger is connected");
+		info->cable.connected = true;
+		info->cable.chg_type = POWER_SUPPLY_TYPE_USB_DCP;
+	} else {
+		if (old_connected)
+			dev_dbg(&info->pdev->dev, "USB charger disconnected");
+		info->cable.connected = false;
+		info->cable.chg_type = POWER_SUPPLY_TYPE_USB;
+	}
+
+	/* Cable status changed */
+	if (old_connected != info->cable.connected)
+		changed = true;
+
+	if (!changed)
+		return;
+
+	mutex_lock(&info->lock);
+
+	if (info->is_charger_enabled && !info->cable.connected) {
+		info->enable_charger = false;
+		ret = axp288_charger_enable_charger(info, info->enable_charger);
+		if (ret < 0)
+			dev_err(&info->pdev->dev,
+				"cannot disable charger (%d)", ret);
+
+	} else if (!info->is_charger_enabled && info->cable.connected) {
+		switch (info->cable.chg_type) {
+		case POWER_SUPPLY_TYPE_USB:
+			current_limit = ILIM_500MA;
+			break;
+		case POWER_SUPPLY_TYPE_USB_CDP:
+			current_limit = ILIM_1500MA;
+			break;
+		case POWER_SUPPLY_TYPE_USB_DCP:
+			current_limit = ILIM_2000MA;
+			break;
+		default:
+			/* Unknown */
+			current_limit = 0;
+			break;
+		}
+
+		/* Set vbus current limit first, then enable charger */
+		ret = axp288_charger_set_vbus_inlmt(info, current_limit);
+		if (ret < 0) {
+			dev_err(&info->pdev->dev,
+				"error setting current limit (%d)", ret);
+		} else {
+			info->enable_charger = (current_limit > 0);
+			ret = axp288_charger_enable_charger(info,
+							info->enable_charger);
+			if (ret < 0)
+				dev_err(&info->pdev->dev,
+					"cannot enable charger (%d)", ret);
+		}
+	}
+
+	if (changed)
+		info->health = axp288_get_charger_health(info);
+
+	mutex_unlock(&info->lock);
+
+	if (changed)
+		power_supply_changed(info->psy_usb);
+}
+
+static int axp288_charger_handle_cable_evt(struct notifier_block *nb,
+					  unsigned long event, void *param)
+{
+	struct axp288_chrg_info *info =
+	    container_of(nb, struct axp288_chrg_info, cable.nb);
+
+	schedule_work(&info->cable.work);
+
+	return NOTIFY_OK;
+}
+
+static void axp288_charger_otg_evt_worker(struct work_struct *work)
+{
+	struct axp288_chrg_info *info =
+	    container_of(work, struct axp288_chrg_info, otg.work);
+	int ret;
+
+	/* Disable VBUS path before enabling the 5V boost */
+	ret = axp288_charger_vbus_path_select(info, !info->otg.id_short);
+	if (ret < 0)
+		dev_warn(&info->pdev->dev, "vbus path disable failed\n");
+}
+
+static int axp288_charger_handle_otg_evt(struct notifier_block *nb,
+				   unsigned long event, void *param)
+{
+	struct axp288_chrg_info *info =
+	    container_of(nb, struct axp288_chrg_info, otg.id_nb);
+	struct extcon_dev *edev = param;
+	int usb_host = extcon_get_cable_state(edev, "USB-Host");
+
+	dev_dbg(&info->pdev->dev, "external connector USB-Host is %s\n",
+				usb_host ? "attached" : "detached");
+
+	/*
+	 * Set usb_id_short flag to avoid running charger detection logic
+	 * in case usb host.
+	 */
+	info->otg.id_short = usb_host;
+	schedule_work(&info->otg.work);
+
+	return NOTIFY_OK;
+}
+
+static void charger_init_hw_regs(struct axp288_chrg_info *info)
+{
+	int ret, cc, cv;
+	unsigned int val;
+
+	/* Program temperature thresholds */
+	ret = regmap_write(info->regmap, AXP20X_V_LTF_CHRG, CHRG_VLTFC_0C);
+	if (ret < 0)
+		dev_warn(&info->pdev->dev, "register(%x) write error(%d)\n",
+							AXP20X_V_LTF_CHRG, ret);
+
+	ret = regmap_write(info->regmap, AXP20X_V_HTF_CHRG, CHRG_VHTFC_45C);
+	if (ret < 0)
+		dev_warn(&info->pdev->dev, "register(%x) write error(%d)\n",
+							AXP20X_V_HTF_CHRG, ret);
+
+	/* Do not turn-off charger o/p after charge cycle ends */
+	ret = regmap_update_bits(info->regmap,
+				AXP20X_CHRG_CTRL2,
+				CNTL2_CHG_OUT_TURNON, 1);
+	if (ret < 0)
+		dev_warn(&info->pdev->dev, "register(%x) write error(%d)\n",
+						AXP20X_CHRG_CTRL2, ret);
+
+	/* Enable interrupts */
+	ret = regmap_update_bits(info->regmap,
+				AXP20X_IRQ2_EN,
+				BAT_IRQ_CFG_BAT_MASK, 1);
+	if (ret < 0)
+		dev_warn(&info->pdev->dev, "register(%x) write error(%d)\n",
+						AXP20X_IRQ2_EN, ret);
+
+	ret = regmap_update_bits(info->regmap, AXP20X_IRQ3_EN,
+				TEMP_IRQ_CFG_MASK, 1);
+	if (ret < 0)
+		dev_warn(&info->pdev->dev, "register(%x) write error(%d)\n",
+						AXP20X_IRQ3_EN, ret);
+
+	/* Setup ending condition for charging to be 10% of I(chrg) */
+	ret = regmap_update_bits(info->regmap,
+				AXP20X_CHRG_CTRL1,
+				CHRG_CCCV_ITERM_20P, 0);
+	if (ret < 0)
+		dev_warn(&info->pdev->dev, "register(%x) write error(%d)\n",
+						AXP20X_CHRG_CTRL1, ret);
+
+	/* Disable OCV-SOC curve calibration */
+	ret = regmap_update_bits(info->regmap,
+				AXP20X_CC_CTRL,
+				FG_CNTL_OCV_ADJ_EN, 0);
+	if (ret < 0)
+		dev_warn(&info->pdev->dev, "register(%x) write error(%d)\n",
+						AXP20X_CC_CTRL, ret);
+
+	/* Init charging current and voltage */
+	info->max_cc = info->pdata->max_cc;
+	info->max_cv = info->pdata->max_cv;
+
+	/* Read current charge voltage and current limit */
+	ret = regmap_read(info->regmap, AXP20X_CHRG_CTRL1, &val);
+	if (ret < 0) {
+		/* Assume default if cannot read */
+		info->cc = info->pdata->def_cc;
+		info->cv = info->pdata->def_cv;
+	} else {
+		/* Determine charge voltage */
+		cv = (val & CHRG_CCCV_CV_MASK) >> CHRG_CCCV_CV_BIT_POS;
+		switch (cv) {
+		case CHRG_CCCV_CV_4100MV:
+			info->cv = CV_4100MV;
+			break;
+		case CHRG_CCCV_CV_4150MV:
+			info->cv = CV_4150MV;
+			break;
+		case CHRG_CCCV_CV_4200MV:
+			info->cv = CV_4200MV;
+			break;
+		case CHRG_CCCV_CV_4350MV:
+			info->cv = CV_4350MV;
+			break;
+		default:
+			info->cv = INT_MAX;
+			break;
+		}
+
+		/* Determine charge current limit */
+		cc = (ret & CHRG_CCCV_CC_MASK) >> CHRG_CCCV_CC_BIT_POS;
+		cc = (cc * CHRG_CCCV_CC_LSB_RES) + CHRG_CCCV_CC_OFFSET;
+		info->cc = cc;
+
+		/* Program default charging voltage and current */
+		cc = min(info->pdata->def_cc, info->max_cc);
+		cv = min(info->pdata->def_cv, info->max_cv);
+
+		ret = axp288_charger_set_cc(info, cc);
+		if (ret < 0)
+			dev_warn(&info->pdev->dev,
+					"error(%d) in setting CC\n", ret);
+
+		ret = axp288_charger_set_cv(info, cv);
+		if (ret < 0)
+			dev_warn(&info->pdev->dev,
+					"error(%d) in setting CV\n", ret);
+	}
+}
+
+static int axp288_charger_probe(struct platform_device *pdev)
+{
+	int ret, i, pirq;
+	struct axp288_chrg_info *info;
+	struct axp20x_dev *axp20x = dev_get_drvdata(pdev->dev.parent);
+	struct power_supply_config charger_cfg = {};
+
+	info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->pdev = pdev;
+	info->regmap = axp20x->regmap;
+	info->regmap_irqc = axp20x->regmap_irqc;
+	info->pdata = pdev->dev.platform_data;
+
+	if (!info->pdata) {
+		/* Try ACPI provided pdata via device properties */
+		if (!device_property_present(&pdev->dev,
+						"axp288_charger_data\n"))
+			dev_err(&pdev->dev, "failed to get platform data\n");
+		return -ENODEV;
+	}
+
+	info->cable.edev = extcon_get_extcon_dev(AXP288_EXTCON_DEV_NAME);
+	if (info->cable.edev == NULL) {
+		dev_dbg(&pdev->dev, "%s is not ready, probe deferred\n",
+			AXP288_EXTCON_DEV_NAME);
+		return -EPROBE_DEFER;
+	}
+
+	/* Register for extcon notification */
+	INIT_WORK(&info->cable.work, axp288_charger_extcon_evt_worker);
+	info->cable.nb.notifier_call = axp288_charger_handle_cable_evt;
+	ret = extcon_register_notifier(info->cable.edev, &info->cable.nb);
+	if (ret) {
+		dev_err(&info->pdev->dev,
+			"failed to register extcon notifier %d\n", ret);
+		return ret;
+	}
+
+	platform_set_drvdata(pdev, info);
+	mutex_init(&info->lock);
+
+	/* Register with power supply class */
+	charger_cfg.drv_data = info;
+	info->psy_usb = power_supply_register(&pdev->dev, &axp288_charger_desc,
+						&charger_cfg);
+	if (IS_ERR(info->psy_usb)) {
+		dev_err(&pdev->dev, "failed to register power supply charger\n");
+		ret = PTR_ERR(info->psy_usb);
+		goto psy_reg_failed;
+	}
+
+	/* Register for OTG notification */
+	INIT_WORK(&info->otg.work, axp288_charger_otg_evt_worker);
+	info->otg.id_nb.notifier_call = axp288_charger_handle_otg_evt;
+	ret = extcon_register_interest(&info->otg.cable, NULL, "USB-Host",
+				       &info->otg.id_nb);
+	if (ret)
+		dev_warn(&pdev->dev, "failed to register otg notifier\n");
+
+	if (info->otg.cable.edev)
+		info->otg.id_short = extcon_get_cable_state(
+					info->otg.cable.edev, "USB-Host");
+
+	/* Register charger interrupts */
+	for (i = 0; i < CHRG_INTR_END; i++) {
+		pirq = platform_get_irq(info->pdev, i);
+		info->irq[i] = regmap_irq_get_virq(info->regmap_irqc, pirq);
+		if (info->irq[i] < 0) {
+			dev_warn(&info->pdev->dev,
+				"failed to get virtual interrupt=%d\n", pirq);
+			ret = info->irq[i];
+			goto intr_reg_failed;
+		}
+		ret = devm_request_threaded_irq(&info->pdev->dev, info->irq[i],
+					NULL, axp288_charger_irq_thread_handler,
+					IRQF_ONESHOT, info->pdev->name, info);
+		if (ret) {
+			dev_err(&pdev->dev, "failed to request interrupt=%d\n",
+								info->irq[i]);
+			goto intr_reg_failed;
+		}
+	}
+
+	charger_init_hw_regs(info);
+
+	return 0;
+
+intr_reg_failed:
+	if (info->otg.cable.edev)
+		extcon_unregister_interest(&info->otg.cable);
+	power_supply_unregister(info->psy_usb);
+psy_reg_failed:
+	extcon_unregister_notifier(info->cable.edev, &info->cable.nb);
+	return ret;
+}
+
+static int axp288_charger_remove(struct platform_device *pdev)
+{
+	struct axp288_chrg_info *info =  dev_get_drvdata(&pdev->dev);
+
+	if (info->otg.cable.edev)
+		extcon_unregister_interest(&info->otg.cable);
+
+	extcon_unregister_notifier(info->cable.edev, &info->cable.nb);
+	power_supply_unregister(info->psy_usb);
+
+	return 0;
+}
+
+static struct platform_driver axp288_charger_driver = {
+	.probe = axp288_charger_probe,
+	.remove = axp288_charger_remove,
+	.driver = {
+		.name = "axp288_charger",
+	},
+};
+
+module_platform_driver(axp288_charger_driver);
+
+MODULE_AUTHOR("Ramakrishna Pallala <ramakrishna.pallala@intel.com>");
+MODULE_DESCRIPTION("X-power AXP288 Charger Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index dfabd6db7ddf..f9030df5acd1 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -275,4 +275,11 @@ struct axp20x_fg_pdata {
 	int thermistor_curve[MAX_THERM_CURVE_SIZE][2];
 };
 
+struct axp20x_chrg_pdata {
+	int max_cc;
+	int max_cv;
+	int def_cc;
+	int def_cv;
+};
+
 #endif /* __LINUX_MFD_AXP20X_H */
-- 
cgit v1.2.3


From c93b76b34b4d8dbe8e3443eb27e49ac60034342b Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 7 May 2015 15:54:02 +0300
Subject: mei: bus: report also uuid in module alias

In order to automate modules matching add device uuid
which is reported in client enumeration, keep also
the name that is needed in for nfc distinguishing radio vendor

Report mei:name:uuid

Cc: linux-api@vger.kernel.org
Cc: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-bus-mei |  2 +-
 drivers/misc/mei/bus.c                  | 42 ++++++++++++++++++++++++++-------
 drivers/misc/mei/mei_dev.h              |  2 ++
 drivers/nfc/mei_phy.h                   |  3 +++
 drivers/nfc/microread/mei.c             |  2 +-
 drivers/nfc/pn544/mei.c                 |  2 +-
 include/linux/mod_devicetable.h         | 13 ++++++++++
 scripts/mod/devicetable-offsets.c       |  1 +
 scripts/mod/file2alias.c                | 18 ++++++++++++--
 9 files changed, 72 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-mei b/Documentation/ABI/testing/sysfs-bus-mei
index 2066f0bbd453..91967a70313a 100644
--- a/Documentation/ABI/testing/sysfs-bus-mei
+++ b/Documentation/ABI/testing/sysfs-bus-mei
@@ -4,4 +4,4 @@ KernelVersion:	3.10
 Contact:	Samuel Ortiz <sameo@linux.intel.com>
 		linux-mei@linux.intel.com
 Description:	Stores the same MODALIAS value emitted by uevent
-		Format: mei:<mei device name>
+		Format: mei:<mei device name>:<device uuid>:
diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index 1101d6efaf27..17b00baa53b1 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -30,23 +30,40 @@
 #define to_mei_cl_driver(d) container_of(d, struct mei_cl_driver, driver)
 #define to_mei_cl_device(d) container_of(d, struct mei_cl_device, dev)
 
+static inline uuid_le uuid_le_cast(const __u8 uuid[16])
+{
+	return *(uuid_le *)uuid;
+}
+
 static int mei_cl_device_match(struct device *dev, struct device_driver *drv)
 {
 	struct mei_cl_device *device = to_mei_cl_device(dev);
 	struct mei_cl_driver *driver = to_mei_cl_driver(drv);
 	const struct mei_cl_device_id *id;
+	const uuid_le *uuid;
+	const char *name;
 
 	if (!device)
 		return 0;
 
+	uuid = mei_me_cl_uuid(device->me_cl);
+	name = device->name;
+
 	if (!driver || !driver->id_table)
 		return 0;
 
 	id = driver->id_table;
 
-	while (id->name[0]) {
-		if (!strncmp(dev_name(dev), id->name, sizeof(id->name)))
-			return 1;
+	while (uuid_le_cmp(NULL_UUID_LE, uuid_le_cast(id->uuid))) {
+
+		if (!uuid_le_cmp(*uuid, uuid_le_cast(id->uuid))) {
+			if (id->name[0]) {
+				if (!strncmp(name, id->name, sizeof(id->name)))
+					return 1;
+			} else {
+				return 1;
+			}
+		}
 
 		id++;
 	}
@@ -69,7 +86,7 @@ static int mei_cl_device_probe(struct device *dev)
 
 	dev_dbg(dev, "Device probe\n");
 
-	strlcpy(id.name, dev_name(dev), sizeof(id.name));
+	strlcpy(id.name, device->name, sizeof(id.name));
 
 	return driver->probe(device, &id);
 }
@@ -100,9 +117,12 @@ static int mei_cl_device_remove(struct device *dev)
 static ssize_t modalias_show(struct device *dev, struct device_attribute *a,
 			     char *buf)
 {
-	int len;
+	struct mei_cl_device *device = to_mei_cl_device(dev);
+	const uuid_le *uuid = mei_me_cl_uuid(device->me_cl);
+	size_t len;
 
-	len = snprintf(buf, PAGE_SIZE, "mei:%s\n", dev_name(dev));
+	len = snprintf(buf, PAGE_SIZE, "mei:%s:" MEI_CL_UUID_FMT ":",
+		device->name, MEI_CL_UUID_ARGS(uuid->b));
 
 	return (len >= PAGE_SIZE) ? (PAGE_SIZE - 1) : len;
 }
@@ -116,7 +136,11 @@ ATTRIBUTE_GROUPS(mei_cl_dev);
 
 static int mei_cl_uevent(struct device *dev, struct kobj_uevent_env *env)
 {
-	if (add_uevent_var(env, "MODALIAS=mei:%s", dev_name(dev)))
+	struct mei_cl_device *device = to_mei_cl_device(dev);
+	const uuid_le *uuid = mei_me_cl_uuid(device->me_cl);
+
+	if (add_uevent_var(env, "MODALIAS=mei:%s:" MEI_CL_UUID_FMT ":",
+		device->name, MEI_CL_UUID_ARGS(uuid->b)))
 		return -ENOMEM;
 
 	return 0;
@@ -185,7 +209,9 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *dev,
 	device->dev.bus = &mei_cl_bus_type;
 	device->dev.type = &mei_cl_device_type;
 
-	dev_set_name(&device->dev, "%s", name);
+	strlcpy(device->name, name, sizeof(device->name));
+
+	dev_set_name(&device->dev, "mei:%s:%pUl", name, mei_me_cl_uuid(me_cl));
 
 	status = device_register(&device->dev);
 	if (status) {
diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index 79ab78184523..ab719e674edf 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h
@@ -376,6 +376,7 @@ struct mei_cl *mei_cl_bus_find_cl_by_uuid(struct mei_device *dev, uuid_le uuid);
  * @dev: linux driver model device pointer
  * @me_cl: me client
  * @cl: mei client
+ * @name: device name
  * @ops: ME transport ops
  * @event_work: async work to execute event callback
  * @event_cb: Drivers register this callback to get asynchronous ME
@@ -389,6 +390,7 @@ struct mei_cl_device {
 
 	struct mei_me_client *me_cl;
 	struct mei_cl *cl;
+	char name[MEI_CL_NAME_SIZE];
 
 	const struct mei_cl_ops *ops;
 
diff --git a/drivers/nfc/mei_phy.h b/drivers/nfc/mei_phy.h
index d669900f8278..06608c28ff14 100644
--- a/drivers/nfc/mei_phy.h
+++ b/drivers/nfc/mei_phy.h
@@ -3,7 +3,10 @@
 
 #include <linux/mei_cl_bus.h>
 #include <net/nfc/hci.h>
+#include <linux/uuid.h>
 
+#define MEI_NFC_UUID __UUID_LE(0x0bb17a78, 0x2a8e, 0x4c50, \
+		0x94, 0xd4, 0x50, 0x26, 0x67, 0x23, 0x77, 0x5c)
 #define MEI_NFC_HEADER_SIZE 10
 #define MEI_NFC_MAX_HCI_PAYLOAD 300
 
diff --git a/drivers/nfc/microread/mei.c b/drivers/nfc/microread/mei.c
index 2d1395be64ae..f9f5fc97cdd7 100644
--- a/drivers/nfc/microread/mei.c
+++ b/drivers/nfc/microread/mei.c
@@ -67,7 +67,7 @@ static int microread_mei_remove(struct mei_cl_device *device)
 }
 
 static struct mei_cl_device_id microread_mei_tbl[] = {
-	{ MICROREAD_DRIVER_NAME },
+	{ MICROREAD_DRIVER_NAME, MEI_NFC_UUID},
 
 	/* required last entry */
 	{ }
diff --git a/drivers/nfc/pn544/mei.c b/drivers/nfc/pn544/mei.c
index 330cd4031009..101a37e12efa 100644
--- a/drivers/nfc/pn544/mei.c
+++ b/drivers/nfc/pn544/mei.c
@@ -67,7 +67,7 @@ static int pn544_mei_remove(struct mei_cl_device *device)
 }
 
 static struct mei_cl_device_id pn544_mei_tbl[] = {
-	{ PN544_DRIVER_NAME },
+	{ PN544_DRIVER_NAME, MEI_NFC_UUID},
 
 	/* required last entry */
 	{ }
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 3bfd56778c29..2d2b2b571d61 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -599,9 +599,22 @@ struct ipack_device_id {
 
 #define MEI_CL_MODULE_PREFIX "mei:"
 #define MEI_CL_NAME_SIZE 32
+#define MEI_CL_UUID_FMT "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x"
+#define MEI_CL_UUID_ARGS(_u) \
+	_u[0], _u[1], _u[2], _u[3], _u[4], _u[5], _u[6], _u[7], \
+	_u[8], _u[9], _u[10], _u[11], _u[12], _u[13], _u[14], _u[15]
 
+/**
+ * struct mei_cl_device_id - MEI client device identifier
+ * @name: helper name
+ * @uuid: client uuid
+ * @driver_info: information used by the driver.
+ *
+ * identifies mei client device by uuid and name
+ */
 struct mei_cl_device_id {
 	char name[MEI_CL_NAME_SIZE];
+	__u8 uuid[16];
 	kernel_ulong_t driver_info;
 };
 
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index fce36d0f6898..091f6290a651 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -182,6 +182,7 @@ int main(void)
 
 	DEVID(mei_cl_device_id);
 	DEVID_FIELD(mei_cl_device_id, name);
+	DEVID_FIELD(mei_cl_device_id, uuid);
 
 	DEVID(rio_device_id);
 	DEVID_FIELD(rio_device_id, did);
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 78691d51a479..62c517f4b592 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -131,6 +131,15 @@ static inline void add_wildcard(char *str)
 		strcat(str + len, "*");
 }
 
+static inline void add_uuid(char *str, __u8 uuid[16])
+{
+	int len = strlen(str);
+	int i;
+
+	for (i = 0; i < 16; i++)
+		sprintf(str + len + (i << 1), "%02x", uuid[i]);
+}
+
 /**
  * Check that sizeof(device_id type) are consistent with size of section
  * in .o file. If in-consistent then userspace and kernel does not agree
@@ -1160,13 +1169,18 @@ static int do_cpu_entry(const char *filename, void *symval, char *alias)
 }
 ADD_TO_DEVTABLE("cpu", cpu_feature, do_cpu_entry);
 
-/* Looks like: mei:S */
+/* Looks like: mei:S:uuid */
 static int do_mei_entry(const char *filename, void *symval,
 			char *alias)
 {
 	DEF_FIELD_ADDR(symval, mei_cl_device_id, name);
+	DEF_FIELD_ADDR(symval, mei_cl_device_id, uuid);
+
+	sprintf(alias, MEI_CL_MODULE_PREFIX);
+	sprintf(alias + strlen(alias), "%s:",  (*name)[0]  ? *name : "*");
+	add_uuid(alias, *uuid);
 
-	sprintf(alias, MEI_CL_MODULE_PREFIX "%s", *name);
+	strcat(alias, ":*");
 
 	return 1;
 }
-- 
cgit v1.2.3


From dbac993f6a6df24d5edc362667e524ba43543472 Mon Sep 17 00:00:00 2001
From: Tomas Winkler <tomas.winkler@intel.com>
Date: Thu, 7 May 2015 15:54:07 +0300
Subject: mei: export mei client device struct to external use

Cc: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/mei_dev.h | 35 -----------------------------------
 include/linux/mei_cl_bus.h | 38 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h
index 609a2d2c2dba..70a644f688b0 100644
--- a/drivers/misc/mei/mei_dev.h
+++ b/drivers/misc/mei/mei_dev.h
@@ -343,41 +343,6 @@ int mei_cl_bus_init(void);
 void mei_cl_bus_exit(void);
 struct mei_cl *mei_cl_bus_find_cl_by_uuid(struct mei_device *dev, uuid_le uuid);
 
-
-/**
- * struct mei_cl_device - MEI device handle
- * An mei_cl_device pointer is returned from mei_add_device()
- * and links MEI bus clients to their actual ME host client pointer.
- * Drivers for MEI devices will get an mei_cl_device pointer
- * when being probed and shall use it for doing ME bus I/O.
- *
- * @dev: linux driver model device pointer
- * @me_cl: me client
- * @cl: mei client
- * @name: device name
- * @event_work: async work to execute event callback
- * @event_cb: Drivers register this callback to get asynchronous ME
- *	events (e.g. Rx buffer pending) notifications.
- * @event_context: event callback run context
- * @events: Events bitmask sent to the driver.
- * @priv_data: client private data
- */
-struct mei_cl_device {
-	struct device dev;
-
-	struct mei_me_client *me_cl;
-	struct mei_cl *cl;
-	char name[MEI_CL_NAME_SIZE];
-
-	struct work_struct event_work;
-	mei_cl_event_cb_t event_cb;
-	void *event_context;
-	unsigned long events;
-
-	void *priv_data;
-};
-
-
 /**
  * enum mei_pg_event - power gating transition events
  *
diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h
index 0819d36a3a74..a16b1f9c1aca 100644
--- a/include/linux/mei_cl_bus.h
+++ b/include/linux/mei_cl_bus.h
@@ -7,6 +7,42 @@
 
 struct mei_cl_device;
 
+typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
+			       u32 events, void *context);
+
+/**
+ * struct mei_cl_device - MEI device handle
+ * An mei_cl_device pointer is returned from mei_add_device()
+ * and links MEI bus clients to their actual ME host client pointer.
+ * Drivers for MEI devices will get an mei_cl_device pointer
+ * when being probed and shall use it for doing ME bus I/O.
+ *
+ * @dev: linux driver model device pointer
+ * @me_cl: me client
+ * @cl: mei client
+ * @name: device name
+ * @event_work: async work to execute event callback
+ * @event_cb: Drivers register this callback to get asynchronous ME
+ *	events (e.g. Rx buffer pending) notifications.
+ * @event_context: event callback run context
+ * @events: Events bitmask sent to the driver.
+ * @priv_data: client private data
+ */
+struct mei_cl_device {
+	struct device dev;
+
+	struct mei_me_client *me_cl;
+	struct mei_cl *cl;
+	char name[MEI_CL_NAME_SIZE];
+
+	struct work_struct event_work;
+	mei_cl_event_cb_t event_cb;
+	void *event_context;
+	unsigned long events;
+
+	void *priv_data;
+};
+
 struct mei_cl_driver {
 	struct device_driver driver;
 	const char *name;
@@ -28,8 +64,6 @@ void mei_cl_driver_unregister(struct mei_cl_driver *driver);
 ssize_t mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length);
 ssize_t  mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length);
 
-typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device,
-			       u32 events, void *context);
 int mei_cl_register_event_cb(struct mei_cl_device *device,
 			  mei_cl_event_cb_t read_cb, void *context);
 
-- 
cgit v1.2.3


From 7df20f2d893db42eaa1ea1e30a2573c971ec9238 Mon Sep 17 00:00:00 2001
From: Sudeep Dutt <sudeep.dutt@intel.com>
Date: Wed, 29 Apr 2015 05:32:28 -0700
Subject: misc: mic: SCIF header file and IOCTL interface

This patch introduces the SCIF documentation in the header file
and describes the IOCTL interface for user mode. mic_overview.txt
is updated with documentation on SCIF and a new document
describing SCIF in more details is available in scif_overview.txt.

Reviewed-by: Nikhil Rao <nikhil.rao@intel.com>
Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Sudeep Dutt <sudeep.dutt@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/mic/mic_overview.txt  |  28 +-
 Documentation/mic/scif_overview.txt |  98 ++++
 include/linux/scif.h                | 993 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/Kbuild           |   1 +
 include/uapi/linux/scif_ioctl.h     | 130 +++++
 5 files changed, 1238 insertions(+), 12 deletions(-)
 create mode 100644 Documentation/mic/scif_overview.txt
 create mode 100644 include/linux/scif.h
 create mode 100644 include/uapi/linux/scif_ioctl.h

(limited to 'include/linux')

diff --git a/Documentation/mic/mic_overview.txt b/Documentation/mic/mic_overview.txt
index 77c541802ad9..1a2f2c8ec59e 100644
--- a/Documentation/mic/mic_overview.txt
+++ b/Documentation/mic/mic_overview.txt
@@ -24,6 +24,10 @@ a virtual bus called mic bus is created and virtual dma devices are
 created on it by the host/card drivers. On host the channels are private
 and used only by the host driver to transfer data for the virtio devices.
 
+The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a
+low level communications API across PCIe currently implemented for MIC.
+More details are available at scif_overview.txt.
+
 Here is a block diagram of the various components described above. The
 virtio backends are situated on the host rather than the card given better
 single threaded performance for the host compared to MIC, the ability of
@@ -47,18 +51,18 @@ the fact that the virtio block storage backend can only be on the host.
                       |               |       | Virtio over PCIe IOCTLs  |
                       |               |       +--------------------------+
 +-----------+         |               |                   |  +-----------+
-| MIC DMA   |         |               |                   |  | MIC DMA   |
-| Driver    |         |               |                   |  | Driver    |
-+-----------+         |               |                   |  +-----------+
-      |               |               |                   |        |
-+---------------+     |               |                   |  +----------------+
-|MIC virtual Bus|     |               |                   |  |MIC virtual Bus |
-+---------------+     |               |                   |  +----------------+
-      |               |               |                   |              |
-      |   +--------------+            |            +---------------+     |
-      |   |Intel MIC     |            |            |Intel MIC      |     |
-      +---|Card Driver   |            |            |Host Driver    |     |
-          +--------------+            |            +---------------+-----+
+| MIC DMA   |         |  +----------+ | +-----------+     |  | MIC DMA   |
+| Driver    |         |  |  SCIF    | | |   SCIF    |     |  | Driver    |
++-----------+         |  +----------+ | +-----------+     |  +-----------+
+      |               |       |       |       |           |        |
++---------------+     | +-----+-----+ | +-----+-----+     | +---------------+
+|MIC virtual Bus|     | |SCIF HW Bus| | |SCIF HW BUS|     | |MIC virtual Bus|
++---------------+     | +-----------+ | +-----+-----+     | +---------------+
+      |               |       |       |       |           |              |
+      |   +--------------+    |       |       |    +---------------+     |
+      |   |Intel MIC     |    |       |       |    |Intel MIC      |     |
+      +---|Card Driver   +----+       |       |    |Host Driver    |     |
+          +--------------+            |       +----+---------------+-----+
                       |               |                   |
              +-------------------------------------------------------------+
              |                                                             |
diff --git a/Documentation/mic/scif_overview.txt b/Documentation/mic/scif_overview.txt
new file mode 100644
index 000000000000..0a280d986731
--- /dev/null
+++ b/Documentation/mic/scif_overview.txt
@@ -0,0 +1,98 @@
+The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a low
+level communications API across PCIe currently implemented for MIC. Currently
+SCIF provides inter-node communication within a single host platform, where a
+node is a MIC Coprocessor or Xeon based host. SCIF abstracts the details of
+communicating over the PCIe bus while providing an API that is symmetric
+across all the nodes in the PCIe network. An important design objective for SCIF
+is to deliver the maximum possible performance given the communication
+abilities of the hardware. SCIF has been used to implement an offload compiler
+runtime and OFED support for MPI implementations for MIC coprocessors.
+
+==== SCIF API Components ====
+The SCIF API has the following parts:
+1. Connection establishment using a client server model
+2. Byte stream messaging intended for short messages
+3. Node enumeration to determine online nodes
+4. Poll semantics for detection of incoming connections and messages
+5. Memory registration to pin down pages
+6. Remote memory mapping for low latency CPU accesses via mmap
+7. Remote DMA (RDMA) for high bandwidth DMA transfers
+8. Fence APIs for RDMA synchronization
+
+SCIF exposes the notion of a connection which can be used by peer processes on
+nodes in a SCIF PCIe "network" to share memory "windows" and to communicate. A
+process in a SCIF node initiates a SCIF connection to a peer process on a
+different node via a SCIF "endpoint". SCIF endpoints support messaging APIs
+which are similar to connection oriented socket APIs. Connected SCIF endpoints
+can also register local memory which is followed by data transfer using either
+DMA, CPU copies or remote memory mapping via mmap. SCIF supports both user and
+kernel mode clients which are functionally equivalent.
+
+==== SCIF Performance for MIC ====
+DMA bandwidth comparison between the TCP (over ethernet over PCIe) stack versus
+SCIF shows the performance advantages of SCIF for HPC applications and runtimes.
+
+             Comparison of TCP and SCIF based BW
+
+  Throughput (GB/sec)
+    8 +                                             PCIe Bandwidth ******
+      +                                                        TCP ######
+    7 +    **************************************             SCIF %%%%%%
+      |                       %%%%%%%%%%%%%%%%%%%
+    6 +                   %%%%
+      |                 %%
+      |               %%%
+    5 +              %%
+      |            %%
+    4 +           %%
+      |          %%
+    3 +         %%
+      |        %
+    2 +      %%
+      |     %%
+      |    %
+    1 +
+      +    ######################################
+    0 +++---+++--+--+-+--+--+-++-+--+-++-+--+-++-+-
+      1       10     100      1000   10000   100000
+                   Transfer Size (KBytes)
+
+SCIF allows memory sharing via mmap(..) between processes on different PCIe
+nodes and thus provides bare-metal PCIe latency. The round trip SCIF mmap
+latency from the host to an x100 MIC for an 8 byte message is 0.44 usecs.
+
+SCIF has a user space library which is a thin IOCTL wrapper providing a user
+space API similar to the kernel API in scif.h. The SCIF user space library
+is distributed @ https://software.intel.com/en-us/mic-developer
+
+Here is some pseudo code for an example of how two applications on two PCIe
+nodes would typically use the SCIF API:
+
+Process A (on node A)			Process B (on node B)
+
+/* get online node information */
+scif_get_node_ids(..)			scif_get_node_ids(..)
+scif_open(..)				scif_open(..)
+scif_bind(..)				scif_bind(..)
+scif_listen(..)
+scif_accept(..)				scif_connect(..)
+/* SCIF connection established */
+
+/* Send and receive short messages */
+scif_send(..)/scif_recv(..)		scif_send(..)/scif_recv(..)
+
+/* Register memory */
+scif_register(..)			scif_register(..)
+
+/* RDMA */
+scif_readfrom(..)/scif_writeto(..)	scif_readfrom(..)/scif_writeto(..)
+
+/* Fence DMAs */
+scif_fence_signal(..)			scif_fence_signal(..)
+
+mmap(..)				mmap(..)
+
+/* Access remote registered memory */
+
+/* Close the endpoints */
+scif_close(..)				scif_close(..)
diff --git a/include/linux/scif.h b/include/linux/scif.h
new file mode 100644
index 000000000000..44f4f3898bbe
--- /dev/null
+++ b/include/linux/scif.h
@@ -0,0 +1,993 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel SCIF driver.
+ *
+ */
+#ifndef __SCIF_H__
+#define __SCIF_H__
+
+#include <linux/types.h>
+#include <linux/poll.h>
+#include <linux/scif_ioctl.h>
+
+#define SCIF_ACCEPT_SYNC	1
+#define SCIF_SEND_BLOCK		1
+#define SCIF_RECV_BLOCK		1
+
+enum {
+	SCIF_PROT_READ = (1 << 0),
+	SCIF_PROT_WRITE = (1 << 1)
+};
+
+enum {
+	SCIF_MAP_FIXED = 0x10,
+	SCIF_MAP_KERNEL	= 0x20,
+};
+
+enum {
+	SCIF_FENCE_INIT_SELF = (1 << 0),
+	SCIF_FENCE_INIT_PEER = (1 << 1),
+	SCIF_SIGNAL_LOCAL = (1 << 4),
+	SCIF_SIGNAL_REMOTE = (1 << 5)
+};
+
+enum {
+	SCIF_RMA_USECPU = (1 << 0),
+	SCIF_RMA_USECACHE = (1 << 1),
+	SCIF_RMA_SYNC = (1 << 2),
+	SCIF_RMA_ORDERED = (1 << 3)
+};
+
+/* End of SCIF Admin Reserved Ports */
+#define SCIF_ADMIN_PORT_END	1024
+
+/* End of SCIF Reserved Ports */
+#define SCIF_PORT_RSVD		1088
+
+typedef struct scif_endpt *scif_epd_t;
+
+#define SCIF_OPEN_FAILED ((scif_epd_t)-1)
+#define SCIF_REGISTER_FAILED ((off_t)-1)
+#define SCIF_MMAP_FAILED ((void *)-1)
+
+/**
+ * scif_open() - Create an endpoint
+ *
+ * Return:
+ * Upon successful completion, scif_open() returns an endpoint descriptor to
+ * be used in subsequent SCIF functions calls to refer to that endpoint;
+ * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is
+ * returned and errno is set to indicate the error; in kernel mode a NULL
+ * scif_epd_t is returned.
+ *
+ * Errors:
+ * ENOMEM - Insufficient kernel memory was available
+ */
+scif_epd_t scif_open(void);
+
+/**
+ * scif_bind() - Bind an endpoint to a port
+ * @epd:	endpoint descriptor
+ * @pn:		port number
+ *
+ * scif_bind() binds endpoint epd to port pn, where pn is a port number on the
+ * local node. If pn is zero, a port number greater than or equal to
+ * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to
+ * exactly one local port. Ports less than 1024 when requested can only be bound
+ * by system (or root) processes or by processes executed by privileged users.
+ *
+ * Return:
+ * Upon successful completion, scif_bind() returns the port number to which epd
+ * is bound; otherwise in user mode -1 is returned and errno is set to
+ * indicate the error; in kernel mode the negative of one of the following
+ * errors is returned.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * EINVAL - the endpoint or the port is already bound
+ * EISCONN - The endpoint is already connected
+ * ENOSPC - No port number available for assignment
+ * EACCES - The port requested is protected and the user is not the superuser
+ */
+int scif_bind(scif_epd_t epd, u16 pn);
+
+/**
+ * scif_listen() - Listen for connections on an endpoint
+ * @epd:	endpoint descriptor
+ * @backlog:	maximum pending connection requests
+ *
+ * scif_listen() marks the endpoint epd as a listening endpoint - that is, as
+ * an endpoint that will be used to accept incoming connection requests. Once
+ * so marked, the endpoint is said to be in the listening state and may not be
+ * used as the endpoint of a connection.
+ *
+ * The endpoint, epd, must have been bound to a port.
+ *
+ * The backlog argument defines the maximum length to which the queue of
+ * pending connections for epd may grow. If a connection request arrives when
+ * the queue is full, the client may receive an error with an indication that
+ * the connection was refused.
+ *
+ * Return:
+ * Upon successful completion, scif_listen() returns 0; otherwise in user mode
+ * -1 is returned and errno is set to indicate the error; in kernel mode the
+ * negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * EINVAL - the endpoint is not bound to a port
+ * EISCONN - The endpoint is already connected or listening
+ */
+int scif_listen(scif_epd_t epd, int backlog);
+
+/**
+ * scif_connect() - Initiate a connection on a port
+ * @epd:	endpoint descriptor
+ * @dst:	global id of port to which to connect
+ *
+ * The scif_connect() function requests the connection of endpoint epd to remote
+ * port dst. If the connection is successful, a peer endpoint, bound to dst, is
+ * created on node dst.node. On successful return, the connection is complete.
+ *
+ * If the endpoint epd has not already been bound to a port, scif_connect()
+ * will bind it to an unused local port.
+ *
+ * A connection is terminated when an endpoint of the connection is closed,
+ * either explicitly by scif_close(), or when a process that owns one of the
+ * endpoints of the connection is terminated.
+ *
+ * In user space, scif_connect() supports an asynchronous connection mode
+ * if the application has set the O_NONBLOCK flag on the endpoint via the
+ * fcntl() system call. Setting this flag will result in the calling process
+ * not to wait during scif_connect().
+ *
+ * Return:
+ * Upon successful completion, scif_connect() returns the port ID to which the
+ * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is
+ * set to indicate the error; in kernel mode the negative of one of the
+ * following errors is returned.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNREFUSED - The destination was not listening for connections or refused
+ * the connection request
+ * EINVAL - dst.port is not a valid port ID
+ * EISCONN - The endpoint is already connected
+ * ENOMEM - No buffer space is available
+ * ENODEV - The destination node does not exist, or the node is lost or existed,
+ * but is not currently in the network since it may have crashed
+ * ENOSPC - No port number available for assignment
+ * EOPNOTSUPP - The endpoint is listening and cannot be connected
+ */
+int scif_connect(scif_epd_t epd, struct scif_port_id *dst);
+
+/**
+ * scif_accept() - Accept a connection on an endpoint
+ * @epd:	endpoint descriptor
+ * @peer:	global id of port to which connected
+ * @newepd:	new connected endpoint descriptor
+ * @flags:	flags
+ *
+ * The scif_accept() call extracts the first connection request from the queue
+ * of pending connections for the port on which epd is listening. scif_accept()
+ * creates a new endpoint, bound to the same port as epd, and allocates a new
+ * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new
+ * endpoint is connected to the endpoint through which the connection was
+ * requested. epd is unaffected by this call, and remains in the listening
+ * state.
+ *
+ * On successful return, peer holds the global port identifier (node id and
+ * local port number) of the port which requested the connection.
+ *
+ * A connection is terminated when an endpoint of the connection is closed,
+ * either explicitly by scif_close(), or when a process that owns one of the
+ * endpoints of the connection is terminated.
+ *
+ * The number of connections that can (subsequently) be accepted on epd is only
+ * limited by system resources (memory).
+ *
+ * The flags argument is formed by OR'ing together zero or more of the
+ * following values.
+ * SCIF_ACCEPT_SYNC - block until a connection request is presented. If
+ *			SCIF_ACCEPT_SYNC is not in flags, and no pending
+ *			connections are present on the queue, scif_accept()
+ *			fails with an EAGAIN error
+ *
+ * In user mode, the select() and poll() functions can be used to determine
+ * when there is a connection request. In kernel mode, the scif_poll()
+ * function may be used for this purpose. A readable event will be delivered
+ * when a connection is requested.
+ *
+ * Return:
+ * Upon successful completion, scif_accept() returns 0; otherwise in user mode
+ * -1 is returned and errno is set to indicate the error; in kernel mode the
+ *	negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be
+ * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete
+ * its connection request
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * EINTR - Interrupted function
+ * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is
+ * NULL, or newepd is NULL
+ * ENODEV - The requesting node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOMEM - Not enough space
+ * ENOENT - Secondary part of epd registration failed
+ */
+int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t
+		*newepd, int flags);
+
+/**
+ * scif_close() - Close an endpoint
+ * @epd:	endpoint descriptor
+ *
+ * scif_close() closes an endpoint and performs necessary teardown of
+ * facilities associated with that endpoint.
+ *
+ * If epd is a listening endpoint then it will no longer accept connection
+ * requests on the port to which it is bound. Any pending connection requests
+ * are rejected.
+ *
+ * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs
+ * which are in-process through epd or its peer endpoint will complete before
+ * scif_close() returns. Registered windows of the local and peer endpoints are
+ * released as if scif_unregister() was called against each window.
+ *
+ * Closing a SCIF endpoint does not affect local registered memory mapped by
+ * a SCIF endpoint on a remote node. The local memory remains mapped by the peer
+ * SCIF endpoint explicitly removed by calling munmap(..) by the peer.
+ *
+ * If the peer endpoint's receive queue is not empty at the time that epd is
+ * closed, then the peer endpoint can be passed as the endpoint parameter to
+ * scif_recv() until the receive queue is empty.
+ *
+ * epd is freed and may no longer be accessed.
+ *
+ * Return:
+ * Upon successful completion, scif_close() returns 0; otherwise in user mode
+ * -1 is returned and errno is set to indicate the error; in kernel mode the
+ * negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ */
+int scif_close(scif_epd_t epd);
+
+/**
+ * scif_send() - Send a message
+ * @epd:	endpoint descriptor
+ * @msg:	message buffer address
+ * @len:	message length
+ * @flags:	blocking mode flags
+ *
+ * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data
+ * are copied from memory starting at address msg. On successful execution the
+ * return value of scif_send() is the number of bytes that were sent, and is
+ * zero if no bytes were sent because len was zero. scif_send() may be called
+ * only when the endpoint is in a connected state.
+ *
+ * If a scif_send() call is non-blocking, then it sends only those bytes which
+ * can be sent without waiting, up to a maximum of len bytes.
+ *
+ * If a scif_send() call is blocking, then it normally returns after sending
+ * all len bytes. If a blocking call is interrupted or the connection is
+ * reset, the call is considered successful if some bytes were sent or len is
+ * zero, otherwise the call is considered unsuccessful.
+ *
+ * In user mode, the select() and poll() functions can be used to determine
+ * when the send queue is not full. In kernel mode, the scif_poll() function
+ * may be used for this purpose.
+ *
+ * It is recommended that scif_send()/scif_recv() only be used for short
+ * control-type message communication between SCIF endpoints. The SCIF RMA
+ * APIs are expected to provide better performance for transfer sizes of
+ * 1024 bytes or longer for the current MIC hardware and software
+ * implementation.
+ *
+ * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK
+ * is passed as the flags argument.
+ *
+ * Return:
+ * Upon successful completion, scif_send() returns the number of bytes sent;
+ * otherwise in user mode -1 is returned and errno is set to indicate the
+ * error; in kernel mode the negative of one of the following errors is
+ * returned.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EFAULT - An invalid address was specified for a parameter
+ * EINVAL - flags is invalid, or len is negative
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOMEM - Not enough space
+ * ENOTCONN - The endpoint is not connected
+ */
+int scif_send(scif_epd_t epd, void *msg, int len, int flags);
+
+/**
+ * scif_recv() - Receive a message
+ * @epd:	endpoint descriptor
+ * @msg:	message buffer address
+ * @len:	message buffer length
+ * @flags:	blocking mode flags
+ *
+ * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
+ * data are copied to memory starting at address msg. On successful execution
+ * the return value of scif_recv() is the number of bytes that were received,
+ * and is zero if no bytes were received because len was zero. scif_recv() may
+ * be called only when the endpoint is in a connected state.
+ *
+ * If a scif_recv() call is non-blocking, then it receives only those bytes
+ * which can be received without waiting, up to a maximum of len bytes.
+ *
+ * If a scif_recv() call is blocking, then it normally returns after receiving
+ * all len bytes. If the blocking call was interrupted due to a disconnection,
+ * subsequent calls to scif_recv() will copy all bytes received upto the point
+ * of disconnection.
+ *
+ * In user mode, the select() and poll() functions can be used to determine
+ * when data is available to be received. In kernel mode, the scif_poll()
+ * function may be used for this purpose.
+ *
+ * It is recommended that scif_send()/scif_recv() only be used for short
+ * control-type message communication between SCIF endpoints. The SCIF RMA
+ * APIs are expected to provide better performance for transfer sizes of
+ * 1024 bytes or longer for the current MIC hardware and software
+ * implementation.
+ *
+ * scif_recv() will block until the entire message is received if
+ * SCIF_RECV_BLOCK is passed as the flags argument.
+ *
+ * Return:
+ * Upon successful completion, scif_recv() returns the number of bytes
+ * received; otherwise in user mode -1 is returned and errno is set to
+ * indicate the error; in kernel mode the negative of one of the following
+ * errors is returned.
+ *
+ * Errors:
+ * EAGAIN - The destination node is returning from a low power state
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EFAULT - An invalid address was specified for a parameter
+ * EINVAL - flags is invalid, or len is negative
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOMEM - Not enough space
+ * ENOTCONN - The endpoint is not connected
+ */
+int scif_recv(scif_epd_t epd, void *msg, int len, int flags);
+
+/**
+ * scif_register() - Mark a memory region for remote access.
+ * @epd:		endpoint descriptor
+ * @addr:		starting virtual address
+ * @len:		length of range
+ * @offset:		offset of window
+ * @prot_flags:		read/write protection flags
+ * @map_flags:		mapping flags
+ *
+ * The scif_register() function opens a window, a range of whole pages of the
+ * registered address space of the endpoint epd, starting at offset po and
+ * continuing for len bytes. The value of po, further described below, is a
+ * function of the parameters offset and len, and the value of map_flags. Each
+ * page of the window represents the physical memory page which backs the
+ * corresponding page of the range of virtual address pages starting at addr
+ * and continuing for len bytes. addr and len are constrained to be multiples
+ * of the page size. A successful scif_register() call returns po.
+ *
+ * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
+ * exactly, and offset is constrained to be a multiple of the page size. The
+ * mapping established by scif_register() will not replace any existing
+ * registration; an error is returned if any page within the range [offset,
+ * offset + len - 1] intersects an existing window.
+ *
+ * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
+ * implementation-defined manner to arrive at po. The po value so chosen will
+ * be an area of the registered address space that the implementation deems
+ * suitable for a mapping of len bytes. An offset value of 0 is interpreted as
+ * granting the implementation complete freedom in selecting po, subject to
+ * constraints described below. A non-zero value of offset is taken to be a
+ * suggestion of an offset near which the mapping should be placed. When the
+ * implementation selects a value for po, it does not replace any extant
+ * window. In all cases, po will be a multiple of the page size.
+ *
+ * The physical pages which are so represented by a window are available for
+ * access in calls to mmap(), scif_readfrom(), scif_writeto(),
+ * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
+ * physical pages represented by the window will not be reused by the memory
+ * subsystem for any other purpose. Note that the same physical page may be
+ * represented by multiple windows.
+ *
+ * Subsequent operations which change the memory pages to which virtual
+ * addresses are mapped (such as mmap(), munmap()) have no effect on
+ * existing window.
+ *
+ * If the process will fork(), it is recommended that the registered
+ * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
+ * problems due to copy-on-write semantics.
+ *
+ * The prot_flags argument is formed by OR'ing together one or more of the
+ * following values.
+ * SCIF_PROT_READ - allow read operations from the window
+ * SCIF_PROT_WRITE - allow write operations to the window
+ *
+ * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a
+ * fixed offset.
+ *
+ * Return:
+ * Upon successful completion, scif_register() returns the offset at which the
+ * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that
+ * is (off_t *)-1) is returned and errno is set to indicate the error; in
+ * kernel mode the negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range
+ * [offset, offset + len -1] are already registered
+ * EAGAIN - The mapping could not be performed due to lack of resources
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
+ * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is
+ * set in flags, and offset is not a multiple of the page size, or addr is not a
+ * multiple of the page size, or len is not a multiple of the page size, or is
+ * 0, or offset is negative
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOMEM - Not enough space
+ * ENOTCONN -The endpoint is not connected
+ */
+off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
+		    int prot_flags, int map_flags);
+
+/**
+ * scif_unregister() - Mark a memory region for remote access.
+ * @epd:	endpoint descriptor
+ * @offset:	start of range to unregister
+ * @len:	length of range to unregister
+ *
+ * The scif_unregister() function closes those previously registered windows
+ * which are entirely within the range [offset, offset + len - 1]. It is an
+ * error to specify a range which intersects only a subrange of a window.
+ *
+ * On a successful return, pages within the window may no longer be specified
+ * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
+ * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window,
+ * however, continues to exist until all previous references against it are
+ * removed. A window is referenced if there is a mapping to it created by
+ * mmap(), or if scif_get_pages() was called against the window
+ * (and the pages have not been returned via scif_put_pages()). A window is
+ * also referenced while an RMA, in which some range of the window is a source
+ * or destination, is in progress. Finally a window is referenced while some
+ * offset in that window was specified to scif_fence_signal(), and the RMAs
+ * marked by that call to scif_fence_signal() have not completed. While a
+ * window is in this state, its registered address space pages are not
+ * available for use in a new registered window.
+ *
+ * When all such references to the window have been removed, its references to
+ * all the physical pages which it represents are removed. Similarly, the
+ * registered address space pages of the window become available for
+ * registration in a new window.
+ *
+ * Return:
+ * Upon successful completion, scif_unregister() returns 0; otherwise in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned. In the event of an
+ * error, no windows are unregistered.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a
+ * window, or offset is negative
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOTCONN - The endpoint is not connected
+ * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the
+ * registered address space of epd
+ */
+int scif_unregister(scif_epd_t epd, off_t offset, size_t len);
+
+/**
+ * scif_readfrom() - Copy from a remote address space
+ * @epd:	endpoint descriptor
+ * @loffset:	offset in local registered address space to
+ *		which to copy
+ * @len:	length of range to copy
+ * @roffset:	offset in remote registered address space
+ *		from which to copy
+ * @rma_flags:	transfer mode flags
+ *
+ * scif_readfrom() copies len bytes from the remote registered address space of
+ * the peer of endpoint epd, starting at the offset roffset to the local
+ * registered address space of epd, starting at the offset loffset.
+ *
+ * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
+ * roffset + len - 1] must be within some registered window or windows of the
+ * local and remote nodes. A range may intersect multiple registered windows,
+ * but only if those windows are contiguous in the registered address space.
+ *
+ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
+ * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
+ * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the
+ * transfer is complete. Otherwise, the transfer may be performed asynchron-
+ * ously. The order in which any two asynchronous RMA operations complete
+ * is non-deterministic. The synchronization functions, scif_fence_mark()/
+ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
+ * the completion of asynchronous RMA operations on the same endpoint.
+ *
+ * The DMA transfer of individual bytes is not guaranteed to complete in
+ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
+ * cacheline or partial cacheline of the source range will become visible on
+ * the destination node after all other transferred data in the source
+ * range has become visible on the destination node.
+ *
+ * The optimal DMA performance will likely be realized if both
+ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
+ * performance will likely be realized if loffset and roffset are not
+ * cacheline aligned but are separated by some multiple of 64. The lowest level
+ * of performance is likely if loffset and roffset are not separated by a
+ * multiple of 64.
+ *
+ * The rma_flags argument is formed by ORing together zero or more of the
+ * following values.
+ * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
+ *	engine.
+ * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
+ *		transfer has completed. Passing this flag results in the
+ *		current implementation busy waiting and consuming CPU cycles
+ *		while the DMA transfer is in progress for best performance by
+ *		avoiding the interrupt latency.
+ * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
+ *		the source range becomes visible on the destination node
+ *		after all other transferred data in the source range has
+ *		become visible on the destination
+ *
+ * Return:
+ * Upon successful completion, scif_readfrom() returns 0; otherwise in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EACCESS - Attempt to write to a read-only range
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EINVAL - rma_flags is invalid
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOTCONN - The endpoint is not connected
+ * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
+ * address space of epd, or, The range [roffset, roffset + len - 1] is invalid
+ * for the registered address space of the peer of epd, or loffset or roffset
+ * is negative
+ */
+int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
+		  roffset, int rma_flags);
+
+/**
+ * scif_writeto() - Copy to a remote address space
+ * @epd:	endpoint descriptor
+ * @loffset:	offset in local registered address space
+ *		from which to copy
+ * @len:	length of range to copy
+ * @roffset:	offset in remote registered address space to
+ *		which to copy
+ * @rma_flags:	transfer mode flags
+ *
+ * scif_writeto() copies len bytes from the local registered address space of
+ * epd, starting at the offset loffset to the remote registered address space
+ * of the peer of endpoint epd, starting at the offset roffset.
+ *
+ * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
+ * roffset + len - 1] must be within some registered window or windows of the
+ * local and remote nodes. A range may intersect multiple registered windows,
+ * but only if those windows are contiguous in the registered address space.
+ *
+ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
+ * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
+ * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the
+ * transfer is complete. Otherwise, the transfer may be performed asynchron-
+ * ously. The order in which any two asynchronous RMA operations complete
+ * is non-deterministic. The synchronization functions, scif_fence_mark()/
+ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
+ * the completion of asynchronous RMA operations on the same endpoint.
+ *
+ * The DMA transfer of individual bytes is not guaranteed to complete in
+ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
+ * cacheline or partial cacheline of the source range will become visible on
+ * the destination node after all other transferred data in the source
+ * range has become visible on the destination node.
+ *
+ * The optimal DMA performance will likely be realized if both
+ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
+ * performance will likely be realized if loffset and roffset are not cacheline
+ * aligned but are separated by some multiple of 64. The lowest level of
+ * performance is likely if loffset and roffset are not separated by a multiple
+ * of 64.
+ *
+ * The rma_flags argument is formed by ORing together zero or more of the
+ * following values.
+ * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
+ *			engine.
+ * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
+ *		transfer has completed. Passing this flag results in the
+ *		current implementation busy waiting and consuming CPU cycles
+ *		while the DMA transfer is in progress for best performance by
+ *		avoiding the interrupt latency.
+ * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
+ *		the source range becomes visible on the destination node
+ *		after all other transferred data in the source range has
+ *		become visible on the destination
+ *
+ * Return:
+ * Upon successful completion, scif_readfrom() returns 0; otherwise in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EACCESS - Attempt to write to a read-only range
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EINVAL - rma_flags is invalid
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOTCONN - The endpoint is not connected
+ * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
+ * address space of epd, or, The range [roffset , roffset + len -1] is invalid
+ * for the registered address space of the peer of epd, or loffset or roffset
+ * is negative
+ */
+int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
+		 roffset, int rma_flags);
+
+/**
+ * scif_vreadfrom() - Copy from a remote address space
+ * @epd:	endpoint descriptor
+ * @addr:	address to which to copy
+ * @len:	length of range to copy
+ * @roffset:	offset in remote registered address space
+ *		from which to copy
+ * @rma_flags:	transfer mode flags
+ *
+ * scif_vreadfrom() copies len bytes from the remote registered address
+ * space of the peer of endpoint epd, starting at the offset roffset, to local
+ * memory, starting at addr.
+ *
+ * The specified range [roffset, roffset + len - 1] must be within some
+ * registered window or windows of the remote nodes. The range may
+ * intersect multiple registered windows, but only if those windows are
+ * contiguous in the registered address space.
+ *
+ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
+ * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
+ * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the
+ * transfer is complete. Otherwise, the transfer may be performed asynchron-
+ * ously. The order in which any two asynchronous RMA operations complete
+ * is non-deterministic. The synchronization functions, scif_fence_mark()/
+ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
+ * the completion of asynchronous RMA operations on the same endpoint.
+ *
+ * The DMA transfer of individual bytes is not guaranteed to complete in
+ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
+ * cacheline or partial cacheline of the source range will become visible on
+ * the destination node after all other transferred data in the source
+ * range has become visible on the destination node.
+ *
+ * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
+ * the specified local memory range may be remain in a pinned state even after
+ * the specified transfer completes. This may reduce overhead if some or all of
+ * the same virtual address range is referenced in a subsequent call of
+ * scif_vreadfrom() or scif_vwriteto().
+ *
+ * The optimal DMA performance will likely be realized if both
+ * addr and roffset are cacheline aligned (are a multiple of 64). Lower
+ * performance will likely be realized if addr and roffset are not
+ * cacheline aligned but are separated by some multiple of 64. The lowest level
+ * of performance is likely if addr and roffset are not separated by a
+ * multiple of 64.
+ *
+ * The rma_flags argument is formed by ORing together zero or more of the
+ * following values.
+ * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
+ *	engine.
+ * SCIF_RMA_USECACHE - enable registration caching
+ * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
+ *		transfer has completed. Passing this flag results in the
+ *		current implementation busy waiting and consuming CPU cycles
+ *		while the DMA transfer is in progress for best performance by
+ *		avoiding the interrupt latency.
+ * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
+ *	the source range becomes visible on the destination node
+ *	after all other transferred data in the source range has
+ *	become visible on the destination
+ *
+ * Return:
+ * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EACCESS - Attempt to write to a read-only range
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
+ * EINVAL - rma_flags is invalid
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOTCONN - The endpoint is not connected
+ * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
+ * registered address space of epd
+ */
+int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset,
+		   int rma_flags);
+
+/**
+ * scif_vwriteto() - Copy to a remote address space
+ * @epd:	endpoint descriptor
+ * @addr:	address from which to copy
+ * @len:	length of range to copy
+ * @roffset:	offset in remote registered address space to
+ *		which to copy
+ * @rma_flags:	transfer mode flags
+ *
+ * scif_vwriteto() copies len bytes from the local memory, starting at addr, to
+ * the remote registered address space of the peer of endpoint epd, starting at
+ * the offset roffset.
+ *
+ * The specified range [roffset, roffset + len - 1] must be within some
+ * registered window or windows of the remote nodes. The range may intersect
+ * multiple registered windows, but only if those windows are contiguous in the
+ * registered address space.
+ *
+ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
+ * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
+ * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the
+ * transfer is complete. Otherwise, the transfer may be performed asynchron-
+ * ously. The order in which any two asynchronous RMA operations complete
+ * is non-deterministic. The synchronization functions, scif_fence_mark()/
+ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
+ * the completion of asynchronous RMA operations on the same endpoint.
+ *
+ * The DMA transfer of individual bytes is not guaranteed to complete in
+ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
+ * cacheline or partial cacheline of the source range will become visible on
+ * the destination node after all other transferred data in the source
+ * range has become visible on the destination node.
+ *
+ * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
+ * the specified local memory range may be remain in a pinned state even after
+ * the specified transfer completes. This may reduce overhead if some or all of
+ * the same virtual address range is referenced in a subsequent call of
+ * scif_vreadfrom() or scif_vwriteto().
+ *
+ * The optimal DMA performance will likely be realized if both
+ * addr and offset are cacheline aligned (are a multiple of 64). Lower
+ * performance will likely be realized if addr and offset are not cacheline
+ * aligned but are separated by some multiple of 64. The lowest level of
+ * performance is likely if addr and offset are not separated by a multiple of
+ * 64.
+ *
+ * The rma_flags argument is formed by ORing together zero or more of the
+ * following values.
+ * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
+ *	engine.
+ * SCIF_RMA_USECACHE - allow registration caching
+ * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
+ *		transfer has completed. Passing this flag results in the
+ *		current implementation busy waiting and consuming CPU cycles
+ *		while the DMA transfer is in progress for best performance by
+ *		avoiding the interrupt latency.
+ * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
+ *		the source range becomes visible on the destination node
+ *		after all other transferred data in the source range has
+ *		become visible on the destination
+ *
+ * Return:
+ * Upon successful completion, scif_vwriteto() returns 0; otherwise in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EACCESS - Attempt to write to a read-only range
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
+ * EINVAL - rma_flags is invalid
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOTCONN - The endpoint is not connected
+ * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
+ * registered address space of epd
+ */
+int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
+		  int rma_flags);
+
+/**
+ * scif_fence_mark() - Mark previously issued RMAs
+ * @epd:	endpoint descriptor
+ * @flags:	control flags
+ * @mark:	marked value returned as output.
+ *
+ * scif_fence_mark() returns after marking the current set of all uncompleted
+ * RMAs initiated through the endpoint epd or the current set of all
+ * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
+ * marked with a value returned at mark. The application may subsequently call
+ * scif_fence_wait(), passing the value returned at mark, to await completion
+ * of all RMAs so marked.
+ *
+ * The flags argument has exactly one of the following values.
+ * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
+ *	epd are marked
+ * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
+ *	of endpoint epd are marked
+ *
+ * Return:
+ * Upon successful completion, scif_fence_mark() returns 0; otherwise in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EINVAL - flags is invalid
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOTCONN - The endpoint is not connected
+ * ENOMEM - Insufficient kernel memory was available
+ */
+int scif_fence_mark(scif_epd_t epd, int flags, int *mark);
+
+/**
+ * scif_fence_wait() - Wait for completion of marked RMAs
+ * @epd:	endpoint descriptor
+ * @mark:	mark request
+ *
+ * scif_fence_wait() returns after all RMAs marked with mark have completed.
+ * The value passed in mark must have been obtained in a previous call to
+ * scif_fence_mark().
+ *
+ * Return:
+ * Upon successful completion, scif_fence_wait() returns 0; otherwise in user
+ * mode -1 is returned and errno is set to indicate the error; in kernel mode
+ * the negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOTCONN - The endpoint is not connected
+ * ENOMEM - Insufficient kernel memory was available
+ */
+int scif_fence_wait(scif_epd_t epd, int mark);
+
+/**
+ * scif_fence_signal() - Request a memory update on completion of RMAs
+ * @epd:	endpoint descriptor
+ * @loff:	local offset
+ * @lval:	local value to write to loffset
+ * @roff:	remote offset
+ * @rval:	remote value to write to roffset
+ * @flags:	flags
+ *
+ * scif_fence_signal() returns after marking the current set of all uncompleted
+ * RMAs initiated through the endpoint epd or marking the current set of all
+ * uncompleted RMAs initiated through the peer of endpoint epd.
+ *
+ * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the
+ * marked set, lval is written to memory at the address corresponding to offset
+ * loff in the local registered address space of epd. loff must be within a
+ * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion
+ * of the RMAs in the marked set, rval is written to memory at the address
+ * corresponding to offset roff in the remote registered address space of epd.
+ * roff must be within a remote registered window of the peer of epd. Note
+ * that any specified offset must be DWORD (4 byte / 32 bit) aligned.
+ *
+ * The flags argument is formed by OR'ing together the following.
+ * Exactly one of the following values.
+ * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
+ *	epd are marked
+ * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
+ *	of endpoint epd are marked
+ * One or more of the following values.
+ * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to
+ *	memory at the address corresponding to offset loff in the local
+ *	registered address space of epd.
+ * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to
+ *	memory at the address corresponding to offset roff in the remote
+ *	registered address space of epd.
+ *
+ * Return:
+ * Upon successful completion, scif_fence_signal() returns 0; otherwise in
+ * user mode -1 is returned and errno is set to indicate the error; in kernel
+ * mode the negative of one of the following errors is returned.
+ *
+ * Errors:
+ * EBADF, ENOTTY - epd is not a valid endpoint descriptor
+ * ECONNRESET - Connection reset by peer
+ * EINVAL - flags is invalid, or loff or roff are not DWORD aligned
+ * ENODEV - The remote node is lost or existed, but is not currently in the
+ * network since it may have crashed
+ * ENOTCONN - The endpoint is not connected
+ * ENXIO - loff is invalid for the registered address of epd, or roff is invalid
+ * for the registered address space, of the peer of epd
+ */
+int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff,
+		      u64 rval, int flags);
+
+/**
+ * scif_get_node_ids() - Return information about online nodes
+ * @nodes:	array in which to return online node IDs
+ * @len:	number of entries in the nodes array
+ * @self:	address to place the node ID of the local node
+ *
+ * scif_get_node_ids() fills in the nodes array with up to len node IDs of the
+ * nodes in the SCIF network. If there is not enough space in nodes, as
+ * indicated by the len parameter, only len node IDs are returned in nodes. The
+ * return value of scif_get_node_ids() is the total number of nodes currently in
+ * the SCIF network. By checking the return value against the len parameter,
+ * the user may determine if enough space for nodes was allocated.
+ *
+ * The node ID of the local node is returned at self.
+ *
+ * Return:
+ * Upon successful completion, scif_get_node_ids() returns the actual number of
+ * online nodes in the SCIF network including 'self'; otherwise in user mode
+ * -1 is returned and errno is set to indicate the error; in kernel mode no
+ * errors are returned.
+ *
+ * Errors:
+ * EFAULT - Bad address
+ */
+int scif_get_node_ids(u16 *nodes, int len, u16 *self);
+
+#endif /* __SCIF_H__ */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1a0006a76b00..4ad65eebbff8 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -352,6 +352,7 @@ header-y += rtc.h
 header-y += rtnetlink.h
 header-y += scc.h
 header-y += sched.h
+header-y += scif_ioctl.h
 header-y += screen_info.h
 header-y += sctp.h
 header-y += sdla.h
diff --git a/include/uapi/linux/scif_ioctl.h b/include/uapi/linux/scif_ioctl.h
new file mode 100644
index 000000000000..4a94d917cf99
--- /dev/null
+++ b/include/uapi/linux/scif_ioctl.h
@@ -0,0 +1,130 @@
+/*
+ * Intel MIC Platform Software Stack (MPSS)
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel SCIF driver.
+ *
+ */
+/*
+ * -----------------------------------------
+ * SCIF IOCTL interface information
+ * -----------------------------------------
+ */
+#ifndef SCIF_IOCTL_H
+#define SCIF_IOCTL_H
+
+#include <linux/types.h>
+
+/**
+ * struct scif_port_id - SCIF port information
+ * @node:	node on which port resides
+ * @port:	local port number
+ */
+struct scif_port_id {
+	__u16 node;
+	__u16 port;
+};
+
+/**
+ * struct scifioctl_connect - used for SCIF_CONNECT IOCTL
+ * @self:	used to read back the assigned port_id
+ * @peer:	destination node and port to connect to
+ */
+struct scifioctl_connect {
+	struct scif_port_id	self;
+	struct scif_port_id	peer;
+};
+
+/**
+ * struct scifioctl_accept - used for SCIF_ACCEPTREQ IOCTL
+ * @flags:	flags
+ * @peer:	global id of peer endpoint
+ * @endpt:	new connected endpoint descriptor
+ */
+struct scifioctl_accept {
+	__s32			flags;
+	struct scif_port_id	peer;
+	__u64			endpt;
+};
+
+/**
+ * struct scifioctl_msg - used for SCIF_SEND/SCIF_RECV IOCTL
+ * @msg:	message buffer address
+ * @len:	message length
+ * @flags:	flags
+ * @out_len:	number of bytes sent/received
+ */
+struct scifioctl_msg {
+	__u64	msg;
+	__s32	len;
+	__s32	flags;
+	__s32	out_len;
+};
+
+/**
+ * struct scifioctl_node_ids - used for SCIF_GET_NODEIDS IOCTL
+ * @nodes:	pointer to an array of node_ids
+ * @self:	ID of the current node
+ * @len:	length of array
+ */
+struct scifioctl_node_ids {
+	__u64	nodes;
+	__u64	self;
+	__s32	len;
+};
+
+#define SCIF_BIND		_IOWR('s', 1, __u64)
+#define SCIF_LISTEN		_IOW('s', 2, __s32)
+#define SCIF_CONNECT		_IOWR('s', 3, struct scifioctl_connect)
+#define SCIF_ACCEPTREQ		_IOWR('s', 4, struct scifioctl_accept)
+#define SCIF_ACCEPTREG		_IOWR('s', 5, __u64)
+#define SCIF_SEND		_IOWR('s', 6, struct scifioctl_msg)
+#define SCIF_RECV		_IOWR('s', 7, struct scifioctl_msg)
+#define SCIF_GET_NODEIDS	_IOWR('s', 14, struct scifioctl_node_ids)
+
+#endif /* SCIF_IOCTL_H */
-- 
cgit v1.2.3


From 3647a83d9dcf00b8e17777ec8aa1e48f1ed4fe06 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Sat, 11 Apr 2015 18:07:39 -0700
Subject: Drivers: hv: util: move kvp/vss function declarations to
 hyperv_vmbus.h

These declarations are internal to hv_util module and hv_fcopy_* declarations
already reside there.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Tested-by: Alex Ng <alexng@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/hv_kvp.c       | 1 +
 drivers/hv/hv_snapshot.c  | 2 ++
 drivers/hv/hyperv_vmbus.h | 8 ++++++++
 include/linux/hyperv.h    | 7 -------
 4 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index beb8105c0e7b..a414c8397f88 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -28,6 +28,7 @@
 #include <linux/workqueue.h>
 #include <linux/hyperv.h>
 
+#include "hyperv_vmbus.h"
 
 /*
  * Pre win8 version numbers used in ws2008 and ws 2008 r2 (win7)
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index 9d5e0d1efdb5..c1a3604c261c 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -24,6 +24,8 @@
 #include <linux/workqueue.h>
 #include <linux/hyperv.h>
 
+#include "hyperv_vmbus.h"
+
 #define VSS_MAJOR  5
 #define VSS_MINOR  0
 #define VSS_VERSION    (VSS_MAJOR << 16 | VSS_MINOR)
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 887287ad411f..ddcf6c405195 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -730,6 +730,14 @@ int vmbus_set_event(struct vmbus_channel *channel);
 
 void vmbus_on_event(unsigned long data);
 
+int hv_kvp_init(struct hv_util_service *);
+void hv_kvp_deinit(void);
+void hv_kvp_onchannelcallback(void *);
+
+int hv_vss_init(struct hv_util_service *);
+void hv_vss_deinit(void);
+void hv_vss_onchannelcallback(void *);
+
 int hv_fcopy_init(struct hv_util_service *);
 void hv_fcopy_deinit(void);
 void hv_fcopy_onchannelcallback(void *);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 902c37aef67e..1744148a39f9 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1236,13 +1236,6 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *,
 					struct icmsg_negotiate *, u8 *, int,
 					int);
 
-int hv_kvp_init(struct hv_util_service *);
-void hv_kvp_deinit(void);
-void hv_kvp_onchannelcallback(void *);
-
-int hv_vss_init(struct hv_util_service *);
-void hv_vss_deinit(void);
-void hv_vss_onchannelcallback(void *);
 void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid);
 
 extern struct resource hyperv_mmio;
-- 
cgit v1.2.3


From db9ba2088f6507fee370904f02db1eb9b49bd088 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Wed, 22 Apr 2015 21:31:31 -0700
Subject: drivers: hv: vmbus: Get rid of some unused definitions

Get rid of some unused definitions.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/hyperv.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 1744148a39f9..e29ccddc6300 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -389,10 +389,6 @@ enum vmbus_channel_message_type {
 	CHANNELMSG_INITIATE_CONTACT		= 14,
 	CHANNELMSG_VERSION_RESPONSE		= 15,
 	CHANNELMSG_UNLOAD			= 16,
-#ifdef VMBUS_FEATURE_PARENT_OR_PEER_MEMORY_MAPPED_INTO_A_CHILD
-	CHANNELMSG_VIEWRANGE_ADD		= 17,
-	CHANNELMSG_VIEWRANGE_REMOVE		= 18,
-#endif
 	CHANNELMSG_COUNT
 };
 
@@ -549,21 +545,6 @@ struct vmbus_channel_gpadl_torndown {
 	u32 gpadl;
 } __packed;
 
-#ifdef VMBUS_FEATURE_PARENT_OR_PEER_MEMORY_MAPPED_INTO_A_CHILD
-struct vmbus_channel_view_range_add {
-	struct vmbus_channel_message_header header;
-	PHYSICAL_ADDRESS viewrange_base;
-	u64 viewrange_length;
-	u32 child_relid;
-} __packed;
-
-struct vmbus_channel_view_range_remove {
-	struct vmbus_channel_message_header header;
-	PHYSICAL_ADDRESS viewrange_base;
-	u32 child_relid;
-} __packed;
-#endif
-
 struct vmbus_channel_relid_released {
 	struct vmbus_channel_message_header header;
 	u32 child_relid;
-- 
cgit v1.2.3


From 2db84eff127e3f4b3635edc589cd6a56db8755a3 Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Wed, 22 Apr 2015 21:31:32 -0700
Subject: Drivers: hv: vmbus: Implement the protocol for tearing down vmbus
 state

Implement the protocol for tearing down the monitor state established with
the host.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Tested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c | 25 +++++++++++++++++++++++++
 drivers/hv/connection.c   |  5 +++++
 drivers/hv/hyperv_vmbus.h |  2 ++
 drivers/hv/vmbus_drv.c    |  2 +-
 include/linux/hyperv.h    |  1 +
 5 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 865a3afa1d86..4b9d89ab44d3 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -421,6 +421,30 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
 	channel->target_vp = hv_context.vp_index[cur_cpu];
 }
 
+/*
+ * vmbus_unload_response - Handler for the unload response.
+ */
+static void vmbus_unload_response(struct vmbus_channel_message_header *hdr)
+{
+	/*
+	 * This is a global event; just wakeup the waiting thread.
+	 * Once we successfully unload, we can cleanup the monitor state.
+	 */
+	complete(&vmbus_connection.unload_event);
+}
+
+void vmbus_initiate_unload(void)
+{
+	struct vmbus_channel_message_header hdr;
+
+	init_completion(&vmbus_connection.unload_event);
+	memset(&hdr, 0, sizeof(struct vmbus_channel_message_header));
+	hdr.msgtype = CHANNELMSG_UNLOAD;
+	vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header));
+
+	wait_for_completion(&vmbus_connection.unload_event);
+}
+
 /*
  * vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
  *
@@ -717,6 +741,7 @@ struct vmbus_channel_message_table_entry
 	{CHANNELMSG_INITIATE_CONTACT,		0, NULL},
 	{CHANNELMSG_VERSION_RESPONSE,		1, vmbus_onversion_response},
 	{CHANNELMSG_UNLOAD,			0, NULL},
+	{CHANNELMSG_UNLOAD_RESPONSE,		1, vmbus_unload_response},
 };
 
 /*
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index b27220a425f4..acd50e9d666a 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -227,6 +227,11 @@ cleanup:
 
 void vmbus_disconnect(void)
 {
+	/*
+	 * First send the unload request to the host.
+	 */
+	vmbus_initiate_unload();
+
 	if (vmbus_connection.work_queue) {
 		drain_workqueue(vmbus_connection.work_queue);
 		destroy_workqueue(vmbus_connection.work_queue);
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 138d6634c79d..cddc0c9f6bf9 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -647,6 +647,7 @@ struct vmbus_connection {
 
 	atomic_t next_gpadl_handle;
 
+	struct completion  unload_event;
 	/*
 	 * Represents channel interrupts. Each bit position represents a
 	 * channel.  When a channel sends an interrupt via VMBUS, it finds its
@@ -741,6 +742,7 @@ void hv_vss_onchannelcallback(void *);
 int hv_fcopy_init(struct hv_util_service *);
 void hv_fcopy_deinit(void);
 void hv_fcopy_onchannelcallback(void *);
+void vmbus_initiate_unload(void);
 
 static inline void hv_poll_channel(struct vmbus_channel *channel,
 				   void (*cb)(void *))
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 7870a906365e..2b56260b0eb4 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1106,6 +1106,7 @@ static void __exit vmbus_exit(void)
 
 	vmbus_connection.conn_state = DISCONNECTED;
 	hv_synic_clockevents_cleanup();
+	vmbus_disconnect();
 	hv_remove_vmbus_irq();
 	vmbus_free_channels();
 	if (ms_hyperv.features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
@@ -1118,7 +1119,6 @@ static void __exit vmbus_exit(void)
 		smp_call_function_single(cpu, hv_synic_cleanup, NULL, 1);
 	acpi_bus_unregister_driver(&vmbus_acpi_driver);
 	hv_cpu_hotplug_quirk(false);
-	vmbus_disconnect();
 }
 
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index e29ccddc6300..ea934864293d 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -389,6 +389,7 @@ enum vmbus_channel_message_type {
 	CHANNELMSG_INITIATE_CONTACT		= 14,
 	CHANNELMSG_VERSION_RESPONSE		= 15,
 	CHANNELMSG_UNLOAD			= 16,
+	CHANNELMSG_UNLOAD_RESPONSE		= 17,
 	CHANNELMSG_COUNT
 };
 
-- 
cgit v1.2.3


From fea844a2b0edd6540d5cde2cd54a8a3c86e9c53f Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <[mailto:vkuznets@redhat.com]>
Date: Wed, 6 May 2015 17:47:43 -0700
Subject: Drivers: hv: vmbus: briefly comment num_sc and next_oc

next_oc and num_sc fields of struct vmbus_channel deserve a description. Move
them closer to sc_list as these fields are related to it.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/hyperv.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index ea934864293d..3932a993ff5a 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -726,6 +726,15 @@ struct vmbus_channel {
 	 * All Sub-channels of a primary channel are linked here.
 	 */
 	struct list_head sc_list;
+	/*
+	 * Current number of sub-channels.
+	 */
+	int num_sc;
+	/*
+	 * Number of a sub-channel (position within sc_list) which is supposed
+	 * to be used as the next outgoing channel.
+	 */
+	int next_oc;
 	/*
 	 * The primary channel this sub-channel belongs to.
 	 * This will be NULL for the primary channel.
@@ -740,9 +749,6 @@ struct vmbus_channel {
 	 * link up channels based on their CPU affinity.
 	 */
 	struct list_head percpu_list;
-
-	int num_sc;
-	int next_oc;
 };
 
 static inline void set_channel_read_state(struct vmbus_channel *c, bool state)
-- 
cgit v1.2.3


From 80c6e1465948c2e91214f01764f427d31ebedb26 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 21 May 2015 15:49:37 -0700
Subject: driver-core: fix build for !CONFIG_MODULES

Commit f2411da74698 ("driver-core: add driver module asynchronous probe
support") broke build in case modules are disabled, because in this case
"struct module" is not defined and we can't dereference it. Let's define
module_requested_async_probing() helper and stub it out if modules are
disabled.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/dd.c      |  2 +-
 include/linux/module.h | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 42e97d90a59a..8da8e071f01d 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -427,7 +427,7 @@ bool driver_allows_async_probing(struct device_driver *drv)
 		return false;
 
 	default:
-		if (drv->owner && drv->owner->async_probe_requested)
+		if (module_requested_async_probing(drv->owner))
 			return true;
 
 		return false;
diff --git a/include/linux/module.h b/include/linux/module.h
index f46a47d3c0dc..57f5c0a804c0 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -510,6 +510,11 @@ int unregister_module_notifier(struct notifier_block *nb);
 
 extern void print_modules(void);
 
+static inline bool module_requested_async_probing(struct module *module)
+{
+	return module && module->async_probe_requested;
+}
+
 #else /* !CONFIG_MODULES... */
 
 /* Given an address, look for it in the exception tables. */
@@ -620,6 +625,12 @@ static inline int unregister_module_notifier(struct notifier_block *nb)
 static inline void print_modules(void)
 {
 }
+
+static inline bool module_requested_async_probing(struct module *module)
+{
+	return false;
+}
+
 #endif /* CONFIG_MODULES */
 
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From 2539b258ec028351af954c169ea1b0ff72023a9f Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 8 May 2015 14:45:34 +0100
Subject: drivers/base: cacheinfo: fix annoying typo when DT nodes are absent

s/hierarcy/hierarchy/

Maybe the typo will annoy people enough so that they add the missing
nodes to their device-tree files, but I still think this is better off
fixed.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu | 2 +-
 drivers/base/cacheinfo.c                           | 4 ++--
 include/linux/cacheinfo.h                          | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 99983e67c13c..0908fb43566d 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -243,7 +243,7 @@ Description:	Parameters for the CPU cache attributes
 		coherency_line_size: the minimum amount of data in bytes that gets
 				     transferred from memory to cache
 
-		level: the cache hierarcy in the multi-level cache configuration
+		level: the cache hierarchy in the multi-level cache configuration
 
 		number_of_sets: total number of sets in the cache, a set is a
 				collection of cache lines with the same cache index
diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
index 9c2ba1c97c42..c75bda89821b 100644
--- a/drivers/base/cacheinfo.c
+++ b/drivers/base/cacheinfo.c
@@ -191,12 +191,12 @@ static int detect_cache_attributes(unsigned int cpu)
 	if (ret)
 		goto free_ci;
 	/*
-	 * For systems using DT for cache hierarcy, of_node and shared_cpu_map
+	 * For systems using DT for cache hierarchy, of_node and shared_cpu_map
 	 * will be set up here only if they are not populated already
 	 */
 	ret = cache_shared_cpu_map_setup(cpu);
 	if (ret) {
-		pr_warn("Unable to detect cache hierarcy from DT for CPU %d\n",
+		pr_warn("Unable to detect cache hierarchy from DT for CPU %d\n",
 			cpu);
 		goto free_ci;
 	}
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index 3daf5ed392c9..2189935075b4 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -19,7 +19,7 @@ enum cache_type {
 /**
  * struct cacheinfo - represent a cache leaf node
  * @type: type of the cache - data, inst or unified
- * @level: represents the hierarcy in the multi-level cache
+ * @level: represents the hierarchy in the multi-level cache
  * @coherency_line_size: size of each cache line usually representing
  *	the minimum amount of data that gets transferred from memory
  * @number_of_sets: total number of sets, a set is a collection of cache
-- 
cgit v1.2.3


From f4445f8b204de44a8baa4326b0e56537be867427 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Thu, 14 May 2015 15:28:24 +0100
Subject: drivers: of/base: move of_init to driver_init

Commit 5590f3196b29 ("drivers/core/of: Add symlink to device-tree from
devices with an OF node") adds the symlink `of_node` for each device
pointing to it's device tree node while creating/initialising it.

However the devicetree sysfs is created and setup in of_init which is
executed at core_initcall level. For all the devices created before
of_init, the following error is thrown:
	"Error -2(-ENOENT) creating of_node link"

Like many other components in driver model, initialize the sysfs support
for OF/devicetree from driver_init so that it's ready before any devices
are created.

Fixes: 5590f3196b29 ("drivers/core/of: Add symlink to device-tree from
	devices with an OF node")
Suggested-by: Rob Herring <robh+dt@kernel.org>
Cc: Grant Likely <grant.likely@linaro.org>
Cc: Pawel Moll <pawel.moll@arm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Tested-by: Robert Schwebel <r.schwebel@pengutronix.de>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/init.c | 2 ++
 drivers/of/base.c   | 8 +++-----
 include/linux/of.h  | 6 ++++++
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/init.c b/drivers/base/init.c
index da033d3bab3c..48c0e220acc0 100644
--- a/drivers/base/init.c
+++ b/drivers/base/init.c
@@ -8,6 +8,7 @@
 #include <linux/device.h>
 #include <linux/init.h>
 #include <linux/memory.h>
+#include <linux/of.h>
 
 #include "base.h"
 
@@ -34,4 +35,5 @@ void __init driver_init(void)
 	cpu_dev_init();
 	memory_dev_init();
 	container_dev_init();
+	of_core_init();
 }
diff --git a/drivers/of/base.c b/drivers/of/base.c
index 99764db0875a..f0650265febf 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -189,7 +189,7 @@ int __of_attach_node_sysfs(struct device_node *np)
 	return 0;
 }
 
-static int __init of_init(void)
+void __init of_core_init(void)
 {
 	struct device_node *np;
 
@@ -198,7 +198,8 @@ static int __init of_init(void)
 	of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
 	if (!of_kset) {
 		mutex_unlock(&of_mutex);
-		return -ENOMEM;
+		pr_err("devicetree: failed to register existing nodes\n");
+		return;
 	}
 	for_each_of_allnodes(np)
 		__of_attach_node_sysfs(np);
@@ -207,10 +208,7 @@ static int __init of_init(void)
 	/* Symlink in /proc as required by userspace ABI */
 	if (of_root)
 		proc_symlink("device-tree", NULL, "/sys/firmware/devicetree/base");
-
-	return 0;
 }
-core_initcall(of_init);
 
 static struct property *__of_find_property(const struct device_node *np,
 					   const char *name, int *lenp)
diff --git a/include/linux/of.h b/include/linux/of.h
index ddeaae6d2083..b871ff9d81d7 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -121,6 +121,8 @@ extern struct device_node *of_stdout;
 extern raw_spinlock_t devtree_lock;
 
 #ifdef CONFIG_OF
+void of_core_init(void);
+
 static inline bool is_of_node(struct fwnode_handle *fwnode)
 {
 	return fwnode && fwnode->type == FWNODE_OF;
@@ -376,6 +378,10 @@ bool of_console_check(struct device_node *dn, char *name, int index);
 
 #else /* CONFIG_OF */
 
+static inline void of_core_init(void)
+{
+}
+
 static inline bool is_of_node(struct fwnode_handle *fwnode)
 {
 	return false;
-- 
cgit v1.2.3


From be12a1fe298e8be04d5215364f94654dff81b0bc Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Thu, 21 May 2015 16:59:58 +0200
Subject: net: skbuff: add skb_append_pagefrags and use it

Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  3 +++
 net/core/skbuff.c      | 18 ++++++++++++++++++
 net/ipv4/ip_output.c   | 15 ++++-----------
 3 files changed, 25 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b617095adb88..f708936cdd23 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -861,6 +861,9 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length);
 
+int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
+			 int offset, size_t size);
+
 struct skb_seq_state {
 	__u32		lower_offset;
 	__u32		upper_offset;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f3fe9bd9e672..4f2babeaf18d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2915,6 +2915,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_append_datato_frags);
 
+int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
+			 int offset, size_t size)
+{
+	int i = skb_shinfo(skb)->nr_frags;
+
+	if (skb_can_coalesce(skb, i, page, offset)) {
+		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+	} else if (i < MAX_SKB_FRAGS) {
+		get_page(page);
+		skb_fill_page_desc(skb, i, page, offset, size);
+	} else {
+		return -EMSGSIZE;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_append_pagefrags);
+
 /**
  *	skb_pull_rcsum - pull skb and update receive checksum
  *	@skb: buffer to update
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8d91b922fcfe..451b009dae75 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1233,11 +1233,9 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 	}
 
 	while (size > 0) {
-		int i;
-
-		if (skb_is_gso(skb))
+		if (skb_is_gso(skb)) {
 			len = size;
-		else {
+		} else {
 
 			/* Check if the remaining data fits into current packet. */
 			len = mtu - skb->len;
@@ -1289,15 +1287,10 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 			continue;
 		}
 
-		i = skb_shinfo(skb)->nr_frags;
 		if (len > size)
 			len = size;
-		if (skb_can_coalesce(skb, i, page, offset)) {
-			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
-		} else if (i < MAX_SKB_FRAGS) {
-			get_page(page);
-			skb_fill_page_desc(skb, i, page, offset, len);
-		} else {
+
+		if (skb_append_pagefrags(skb, page, offset, len)) {
 			err = -EMSGSIZE;
 			goto error;
 		}
-- 
cgit v1.2.3


From a60e3cc7c92973a31fad0fd04dc5cf4355d3d1ef Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Thu, 21 May 2015 17:00:00 +0200
Subject: net: make skb_splice_bits more configureable

Prepare skb_splice_bits to be able to deal with AF_UNIX sockets.

AF_UNIX sockets don't use lock_sock/release_sock and thus we have to
use a callback to make the locking and unlocking configureable.

Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 11 +++++++++--
 net/core/skbuff.c      | 45 ++++++++++++++++++++++++++++-----------------
 net/ipv4/tcp.c         |  5 +++--
 3 files changed, 40 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f708936cdd23..6b41c15efa27 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -35,6 +35,7 @@
 #include <linux/netdev_features.h>
 #include <linux/sched.h>
 #include <net/flow_dissector.h>
+#include <linux/splice.h>
 
 /* A. Checksumming of received packets by device.
  *
@@ -2699,9 +2700,15 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
 			      int len, __wsum csum);
-int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+ssize_t skb_socket_splice(struct sock *sk,
+			  struct pipe_inode_info *pipe,
+			  struct splice_pipe_desc *spd);
+int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int len,
-		    unsigned int flags);
+		    unsigned int flags,
+		    ssize_t (*splice_cb)(struct sock *,
+					 struct pipe_inode_info *,
+					 struct splice_pipe_desc *));
 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
 int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4f2babeaf18d..02769fa4f5c8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1870,15 +1870,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 	return false;
 }
 
+ssize_t skb_socket_splice(struct sock *sk,
+			  struct pipe_inode_info *pipe,
+			  struct splice_pipe_desc *spd)
+{
+	int ret;
+
+	/* Drop the socket lock, otherwise we have reverse
+	 * locking dependencies between sk_lock and i_mutex
+	 * here as compared to sendfile(). We enter here
+	 * with the socket lock held, and splice_to_pipe() will
+	 * grab the pipe inode lock. For sendfile() emulation,
+	 * we call into ->sendpage() with the i_mutex lock held
+	 * and networking will grab the socket lock.
+	 */
+	release_sock(sk);
+	ret = splice_to_pipe(pipe, spd);
+	lock_sock(sk);
+
+	return ret;
+}
+
 /*
  * Map data from the skb to a pipe. Should handle both the linear part,
  * the fragments, and the frag list. It does NOT handle frag lists within
  * the frag list, if such a thing exists. We'd probably need to recurse to
  * handle that cleanly.
  */
-int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int tlen,
-		    unsigned int flags)
+		    unsigned int flags,
+		    ssize_t (*splice_cb)(struct sock *,
+					 struct pipe_inode_info *,
+					 struct splice_pipe_desc *))
 {
 	struct partial_page partial[MAX_SKB_FRAGS];
 	struct page *pages[MAX_SKB_FRAGS];
@@ -1891,7 +1915,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 		.spd_release = sock_spd_release,
 	};
 	struct sk_buff *frag_iter;
-	struct sock *sk = skb->sk;
 	int ret = 0;
 
 	/*
@@ -1914,20 +1937,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 	}
 
 done:
-	if (spd.nr_pages) {
-		/*
-		 * Drop the socket lock, otherwise we have reverse
-		 * locking dependencies between sk_lock and i_mutex
-		 * here as compared to sendfile(). We enter here
-		 * with the socket lock held, and splice_to_pipe() will
-		 * grab the pipe inode lock. For sendfile() emulation,
-		 * we call into ->sendpage() with the i_mutex lock held
-		 * and networking will grab the socket lock.
-		 */
-		release_sock(sk);
-		ret = splice_to_pipe(pipe, &spd);
-		lock_sock(sk);
-	}
+	if (spd.nr_pages)
+		ret = splice_cb(sk, pipe, &spd);
 
 	return ret;
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 90afcec3f427..65f791f74845 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -695,8 +695,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 	struct tcp_splice_state *tss = rd_desc->arg.data;
 	int ret;
 
-	ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
-			      tss->flags);
+	ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
+			      min(rd_desc->count, len), tss->flags,
+			      skb_socket_splice);
 	if (ret > 0)
 		rd_desc->count -= ret;
 	return ret;
-- 
cgit v1.2.3


From 496e7ce2a46562938edcb74f65b26068ee8895f6 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 7 May 2015 01:08:08 -0700
Subject: gpiolib: Add missing dummies for the unified device properties
 interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If GPIOLIB=n:

    drivers/leds/leds-gpio.c: In function ‘gpio_leds_create’:
    drivers/leds/leds-gpio.c:187: error: implicit declaration of function ‘devm_get_gpiod_from_child’
    drivers/leds/leds-gpio.c:187: warning: assignment makes pointer from integer without a cast

Add dummies for fwnode_get_named_gpiod() and devm_get_gpiod_from_child()
for the !GPIOLIB case to fix this.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Fixes: 40b7318319281b1b ("gpio: Support for unified device properties interface")
Acked-by: Alexandre Courbot <acourbot@nvidia.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Bryan Wu <cooloney@gmail.com>
---
 include/linux/gpio/consumer.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 3a7c9ffd5ab9..da042657dc31 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -406,6 +406,21 @@ static inline int desc_to_gpio(const struct gpio_desc *desc)
 	return -EINVAL;
 }
 
+/* Child properties interface */
+struct fwnode_handle;
+
+static inline struct gpio_desc *fwnode_get_named_gpiod(
+	struct fwnode_handle *fwnode, const char *propname)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline struct gpio_desc *devm_get_gpiod_from_child(
+	struct device *dev, const char *con_id, struct fwnode_handle *child)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
 #endif /* CONFIG_GPIOLIB */
 
 /*
-- 
cgit v1.2.3


From a57e16cf03339c20b09642f46f60190069ff70c7 Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Mon, 25 May 2015 23:29:20 +0200
Subject: dmaengine: pxa: add pxa dmaengine driver

This is a new driver for pxa SoCs, which is also compatible with the former
mmp_pdma.

The rationale behind a new driver (as opposed to incremental patching) was :

 - the new driver relies on virt-dma, which obsoletes all the internal
   structures of mmp_pdma (sw_desc, hw_desc, ...), and by consequence all the
   functions

 - mmp_pdma allocates dma coherent descriptors containing not only hardware
   descriptors but linked list information
   The new driver only puts the dma hardware descriptors (ie. 4 u32) into the
   dma pool allocated memory. This changes completely the way descriptors are
   handled

 - the architecture behind the interrupt/tasklet management was rewritten to be
   more conforming to virt-dma

 - the buffers alignment is handled differently
   The former driver assumed that the DMA channel stopped between each
   descriptor. The new one chains descriptors to let the channel running. This
   is a necessary guarantee for real-time high bandwidth usecases such as video
   capture on "old" architectures such as pxa.

 - hot chaining / cold chaining / no chaining
   Whenever possible, submitting a descriptor "hot chains" it to a running
   channel. There is still no guarantee that the descriptor will be issued, as
   the channel might be stopped just before the descriptor is submitted. Yet
   this allows to submit several video buffers, and resubmit a buffer while
   another is under handling.
   As before, dma_async_issue_pending() is the only guarantee to have all the
   buffers issued.
   When an alignment issue is detected (ie. one address in a descriptor is not
   a multiple of 8), if the already running channel is in "aligned mode", the
   channel will stop, and restarted in "misaligned mode" to finished the issued
   list.

 - descriptors reusing
   A submitted, issued and completed descriptor can be reused, ie resubmitted if
   it was prepared with the proper flag (DMA_PREP_ACK).  Only a channel
   resources release will in this case release that buffer.
   This allows a rolling ring of buffers to be reused, where there are several
   thousands of hardware descriptors used (video buffer for example).

Additionally, a set of more casual features is introduced :
 - debugging traces
 - lockless way to know if a descriptor is terminated or not

The driver was tested on zylonite board (pxa3xx) and mioa701 (pxa27x),
with dmatest, pxa_camera and pxamci.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/Kconfig         |   11 +
 drivers/dma/Makefile        |    1 +
 drivers/dma/pxa_dma.c       | 1195 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/dma/pxa-dma.h |   27 +
 4 files changed, 1234 insertions(+)
 create mode 100644 drivers/dma/pxa_dma.c
 create mode 100644 include/linux/dma/pxa-dma.h

(limited to 'include/linux')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index bda2cb06dc7a..b37015e3ce4e 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -162,6 +162,17 @@ config MX3_IPU_IRQS
 	  To avoid bloating the irq_desc[] array we allocate a sufficient
 	  number of IRQ slots and map them dynamically to specific sources.
 
+config PXA_DMA
+	bool "PXA DMA support"
+	depends on (ARCH_MMP || ARCH_PXA)
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Support the DMA engine for PXA. It is also compatible with MMP PDMA
+	  platform. The internal DMA IP of all PXA variants is supported, with
+	  16 to 32 channels for peripheral to memory or memory to memory
+	  transfers.
+
 config TXX9_DMAC
 	tristate "Toshiba TXx9 SoC DMA support"
 	depends on MACH_TX49XX || MACH_TX39XX
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 69f77d5ba53b..cdf4cbd430bc 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_AMCC_PPC440SPE_ADMA) += ppc4xx/
 obj-$(CONFIG_IMX_SDMA) += imx-sdma.o
 obj-$(CONFIG_IMX_DMA) += imx-dma.o
 obj-$(CONFIG_MXS_DMA) += mxs-dma.o
+obj-$(CONFIG_PXA_DMA) += pxa_dma.o
 obj-$(CONFIG_TIMB_DMA) += timb_dma.o
 obj-$(CONFIG_SIRF_DMA) += sirf-dma.o
 obj-$(CONFIG_TI_EDMA) += edma.o
diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c
new file mode 100644
index 000000000000..80b68b4326fa
--- /dev/null
+++ b/drivers/dma/pxa_dma.c
@@ -0,0 +1,1195 @@
+/*
+ * Copyright 2015 Robert Jarzmik <robert.jarzmik@free.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/dmaengine.h>
+#include <linux/platform_device.h>
+#include <linux/device.h>
+#include <linux/platform_data/mmp_dma.h>
+#include <linux/dmapool.h>
+#include <linux/of_device.h>
+#include <linux/of_dma.h>
+#include <linux/of.h>
+#include <linux/dma/pxa-dma.h>
+
+#include "dmaengine.h"
+#include "virt-dma.h"
+
+#define DCSR(n)		(0x0000 + ((n) << 2))
+#define DALGN(n)	0x00a0
+#define DINT		0x00f0
+#define DDADR(n)	(0x0200 + ((n) << 4))
+#define DSADR(n)	(0x0204 + ((n) << 4))
+#define DTADR(n)	(0x0208 + ((n) << 4))
+#define DCMD(n)		(0x020c + ((n) << 4))
+
+#define PXA_DCSR_RUN		BIT(31)	/* Run Bit (read / write) */
+#define PXA_DCSR_NODESC		BIT(30)	/* No-Descriptor Fetch (read / write) */
+#define PXA_DCSR_STOPIRQEN	BIT(29)	/* Stop Interrupt Enable (R/W) */
+#define PXA_DCSR_REQPEND	BIT(8)	/* Request Pending (read-only) */
+#define PXA_DCSR_STOPSTATE	BIT(3)	/* Stop State (read-only) */
+#define PXA_DCSR_ENDINTR	BIT(2)	/* End Interrupt (read / write) */
+#define PXA_DCSR_STARTINTR	BIT(1)	/* Start Interrupt (read / write) */
+#define PXA_DCSR_BUSERR		BIT(0)	/* Bus Error Interrupt (read / write) */
+
+#define PXA_DCSR_EORIRQEN	BIT(28)	/* End of Receive IRQ Enable (R/W) */
+#define PXA_DCSR_EORJMPEN	BIT(27)	/* Jump to next descriptor on EOR */
+#define PXA_DCSR_EORSTOPEN	BIT(26)	/* STOP on an EOR */
+#define PXA_DCSR_SETCMPST	BIT(25)	/* Set Descriptor Compare Status */
+#define PXA_DCSR_CLRCMPST	BIT(24)	/* Clear Descriptor Compare Status */
+#define PXA_DCSR_CMPST		BIT(10)	/* The Descriptor Compare Status */
+#define PXA_DCSR_EORINTR	BIT(9)	/* The end of Receive */
+
+#define DRCMR_MAPVLD	BIT(7)	/* Map Valid (read / write) */
+#define DRCMR_CHLNUM	0x1f	/* mask for Channel Number (read / write) */
+
+#define DDADR_DESCADDR	0xfffffff0	/* Address of next descriptor (mask) */
+#define DDADR_STOP	BIT(0)	/* Stop (read / write) */
+
+#define PXA_DCMD_INCSRCADDR	BIT(31)	/* Source Address Increment Setting. */
+#define PXA_DCMD_INCTRGADDR	BIT(30)	/* Target Address Increment Setting. */
+#define PXA_DCMD_FLOWSRC	BIT(29)	/* Flow Control by the source. */
+#define PXA_DCMD_FLOWTRG	BIT(28)	/* Flow Control by the target. */
+#define PXA_DCMD_STARTIRQEN	BIT(22)	/* Start Interrupt Enable */
+#define PXA_DCMD_ENDIRQEN	BIT(21)	/* End Interrupt Enable */
+#define PXA_DCMD_ENDIAN		BIT(18)	/* Device Endian-ness. */
+#define PXA_DCMD_BURST8		(1 << 16)	/* 8 byte burst */
+#define PXA_DCMD_BURST16	(2 << 16)	/* 16 byte burst */
+#define PXA_DCMD_BURST32	(3 << 16)	/* 32 byte burst */
+#define PXA_DCMD_WIDTH1		(1 << 14)	/* 1 byte width */
+#define PXA_DCMD_WIDTH2		(2 << 14)	/* 2 byte width (HalfWord) */
+#define PXA_DCMD_WIDTH4		(3 << 14)	/* 4 byte width (Word) */
+#define PXA_DCMD_LENGTH		0x01fff		/* length mask (max = 8K - 1) */
+
+#define PDMA_ALIGNMENT		3
+#define PDMA_MAX_DESC_BYTES	(PXA_DCMD_LENGTH & ~((1 << PDMA_ALIGNMENT) - 1))
+
+struct pxad_desc_hw {
+	u32 ddadr;	/* Points to the next descriptor + flags */
+	u32 dsadr;	/* DSADR value for the current transfer */
+	u32 dtadr;	/* DTADR value for the current transfer */
+	u32 dcmd;	/* DCMD value for the current transfer */
+} __aligned(16);
+
+struct pxad_desc_sw {
+	struct virt_dma_desc	vd;		/* Virtual descriptor */
+	int			nb_desc;	/* Number of hw. descriptors */
+	size_t			len;		/* Number of bytes xfered */
+	dma_addr_t		first;		/* First descriptor's addr */
+
+	/* At least one descriptor has an src/dst address not multiple of 8 */
+	bool			misaligned;
+	bool			cyclic;
+	struct dma_pool		*desc_pool;	/* Channel's used allocator */
+
+	struct pxad_desc_hw	*hw_desc[];	/* DMA coherent descriptors */
+};
+
+struct pxad_phy {
+	int			idx;
+	void __iomem		*base;
+	struct pxad_chan	*vchan;
+};
+
+struct pxad_chan {
+	struct virt_dma_chan	vc;		/* Virtual channel */
+	u32			drcmr;		/* Requestor of the channel */
+	enum pxad_chan_prio	prio;		/* Required priority of phy */
+	/*
+	 * At least one desc_sw in submitted or issued transfers on this channel
+	 * has one address such as: addr % 8 != 0. This implies the DALGN
+	 * setting on the phy.
+	 */
+	bool			misaligned;
+	struct dma_slave_config	cfg;		/* Runtime config */
+
+	/* protected by vc->lock */
+	struct pxad_phy		*phy;
+	struct dma_pool		*desc_pool;	/* Descriptors pool */
+};
+
+struct pxad_device {
+	struct dma_device		slave;
+	int				nr_chans;
+	void __iomem			*base;
+	struct pxad_phy			*phys;
+	spinlock_t			phy_lock;	/* Phy association */
+};
+
+#define tx_to_pxad_desc(tx)					\
+	container_of(tx, struct pxad_desc_sw, async_tx)
+#define to_pxad_chan(dchan)					\
+	container_of(dchan, struct pxad_chan, vc.chan)
+#define to_pxad_dev(dmadev)					\
+	container_of(dmadev, struct pxad_device, slave)
+#define to_pxad_sw_desc(_vd)				\
+	container_of((_vd), struct pxad_desc_sw, vd)
+
+#define _phy_readl_relaxed(phy, _reg)					\
+	readl_relaxed((phy)->base + _reg((phy)->idx))
+#define phy_readl_relaxed(phy, _reg)					\
+	({								\
+		u32 _v;							\
+		_v = readl_relaxed((phy)->base + _reg((phy)->idx));	\
+		dev_vdbg(&phy->vchan->vc.chan.dev->device,		\
+			 "%s(): readl(%s): 0x%08x\n", __func__, #_reg,	\
+			  _v);						\
+		_v;							\
+	})
+#define phy_writel(phy, val, _reg)					\
+	do {								\
+		writel((val), (phy)->base + _reg((phy)->idx));		\
+		dev_vdbg(&phy->vchan->vc.chan.dev->device,		\
+			 "%s(): writel(0x%08x, %s)\n",			\
+			 __func__, (u32)(val), #_reg);			\
+	} while (0)
+#define phy_writel_relaxed(phy, val, _reg)				\
+	do {								\
+		writel_relaxed((val), (phy)->base + _reg((phy)->idx));	\
+		dev_vdbg(&phy->vchan->vc.chan.dev->device,		\
+			 "%s(): writel_relaxed(0x%08x, %s)\n",		\
+			 __func__, (u32)(val), #_reg);			\
+	} while (0)
+
+static unsigned int pxad_drcmr(unsigned int line)
+{
+	if (line < 64)
+		return 0x100 + line * 4;
+	return 0x1000 + line * 4;
+}
+static struct pxad_phy *lookup_phy(struct pxad_chan *pchan)
+{
+	int prio, i;
+	struct pxad_device *pdev = to_pxad_dev(pchan->vc.chan.device);
+	struct pxad_phy *phy, *found = NULL;
+	unsigned long flags;
+
+	/*
+	 * dma channel priorities
+	 * ch 0 - 3,  16 - 19  <--> (0)
+	 * ch 4 - 7,  20 - 23  <--> (1)
+	 * ch 8 - 11, 24 - 27  <--> (2)
+	 * ch 12 - 15, 28 - 31  <--> (3)
+	 */
+
+	spin_lock_irqsave(&pdev->phy_lock, flags);
+	for (prio = pchan->prio; prio >= PXAD_PRIO_HIGHEST; prio--) {
+		for (i = 0; i < pdev->nr_chans; i++) {
+			if (prio != (i & 0xf) >> 2)
+				continue;
+			phy = &pdev->phys[i];
+			if (!phy->vchan) {
+				phy->vchan = pchan;
+				found = phy;
+				goto out_unlock;
+			}
+		}
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&pdev->phy_lock, flags);
+	dev_dbg(&pchan->vc.chan.dev->device,
+		"%s(): phy=%p(%d)\n", __func__, found,
+		found ? found->idx : -1);
+
+	return found;
+}
+
+static void pxad_free_phy(struct pxad_chan *chan)
+{
+	struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device);
+	unsigned long flags;
+	u32 reg;
+
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): freeing\n", __func__);
+	if (!chan->phy)
+		return;
+
+	/* clear the channel mapping in DRCMR */
+	reg = pxad_drcmr(chan->drcmr);
+	writel_relaxed(0, chan->phy->base + reg);
+
+	spin_lock_irqsave(&pdev->phy_lock, flags);
+	chan->phy->vchan = NULL;
+	chan->phy = NULL;
+	spin_unlock_irqrestore(&pdev->phy_lock, flags);
+}
+
+static bool is_chan_running(struct pxad_chan *chan)
+{
+	u32 dcsr;
+	struct pxad_phy *phy = chan->phy;
+
+	if (!phy)
+		return false;
+	dcsr = phy_readl_relaxed(phy, DCSR);
+	return dcsr & PXA_DCSR_RUN;
+}
+
+static bool is_running_chan_misaligned(struct pxad_chan *chan)
+{
+	u32 dalgn;
+
+	BUG_ON(!chan->phy);
+	dalgn = phy_readl_relaxed(chan->phy, DALGN);
+	return dalgn & (BIT(chan->phy->idx));
+}
+
+static void phy_enable(struct pxad_phy *phy, bool misaligned)
+{
+	u32 reg, dalgn;
+
+	if (!phy->vchan)
+		return;
+
+	dev_dbg(&phy->vchan->vc.chan.dev->device,
+		"%s(); phy=%p(%d) misaligned=%d\n", __func__,
+		phy, phy->idx, misaligned);
+
+	reg = pxad_drcmr(phy->vchan->drcmr);
+	writel_relaxed(DRCMR_MAPVLD | phy->idx, phy->base + reg);
+
+	dalgn = phy_readl_relaxed(phy, DALGN);
+	if (misaligned)
+		dalgn |= BIT(phy->idx);
+	else
+		dalgn &= ~BIT(phy->idx);
+	phy_writel_relaxed(phy, dalgn, DALGN);
+
+	phy_writel(phy, PXA_DCSR_STOPIRQEN | PXA_DCSR_ENDINTR |
+		   PXA_DCSR_BUSERR | PXA_DCSR_RUN, DCSR);
+}
+
+static void phy_disable(struct pxad_phy *phy)
+{
+	u32 dcsr;
+
+	if (!phy)
+		return;
+
+	dcsr = phy_readl_relaxed(phy, DCSR);
+	dev_dbg(&phy->vchan->vc.chan.dev->device,
+		"%s(): phy=%p(%d)\n", __func__, phy, phy->idx);
+	phy_writel(phy, dcsr & ~PXA_DCSR_RUN & ~PXA_DCSR_STOPIRQEN, DCSR);
+}
+
+static void pxad_launch_chan(struct pxad_chan *chan,
+				 struct pxad_desc_sw *desc)
+{
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): desc=%p\n", __func__, desc);
+	if (!chan->phy) {
+		chan->phy = lookup_phy(chan);
+		if (!chan->phy) {
+			dev_dbg(&chan->vc.chan.dev->device,
+				"%s(): no free dma channel\n", __func__);
+			return;
+		}
+	}
+
+	/*
+	 * Program the descriptor's address into the DMA controller,
+	 * then start the DMA transaction
+	 */
+	phy_writel(chan->phy, desc->first, DDADR);
+	phy_enable(chan->phy, chan->misaligned);
+}
+
+static void set_updater_desc(struct pxad_desc_sw *sw_desc,
+			     unsigned long flags)
+{
+	struct pxad_desc_hw *updater =
+		sw_desc->hw_desc[sw_desc->nb_desc - 1];
+	dma_addr_t dma = sw_desc->hw_desc[sw_desc->nb_desc - 2]->ddadr;
+
+	updater->ddadr = DDADR_STOP;
+	updater->dsadr = dma;
+	updater->dtadr = dma + 8;
+	updater->dcmd = PXA_DCMD_WIDTH4 | PXA_DCMD_BURST32 |
+		(PXA_DCMD_LENGTH & sizeof(u32));
+	if (flags & DMA_PREP_INTERRUPT)
+		updater->dcmd |= PXA_DCMD_ENDIRQEN;
+}
+
+static bool is_desc_completed(struct virt_dma_desc *vd)
+{
+	struct pxad_desc_sw *sw_desc = to_pxad_sw_desc(vd);
+	struct pxad_desc_hw *updater =
+		sw_desc->hw_desc[sw_desc->nb_desc - 1];
+
+	return updater->dtadr != (updater->dsadr + 8);
+}
+
+static void pxad_desc_chain(struct virt_dma_desc *vd1,
+				struct virt_dma_desc *vd2)
+{
+	struct pxad_desc_sw *desc1 = to_pxad_sw_desc(vd1);
+	struct pxad_desc_sw *desc2 = to_pxad_sw_desc(vd2);
+	dma_addr_t dma_to_chain;
+
+	dma_to_chain = desc2->first;
+	desc1->hw_desc[desc1->nb_desc - 1]->ddadr = dma_to_chain;
+}
+
+static bool pxad_try_hotchain(struct virt_dma_chan *vc,
+				  struct virt_dma_desc *vd)
+{
+	struct virt_dma_desc *vd_last_issued = NULL;
+	struct pxad_chan *chan = to_pxad_chan(&vc->chan);
+
+	/*
+	 * Attempt to hot chain the tx if the phy is still running. This is
+	 * considered successful only if either the channel is still running
+	 * after the chaining, or if the chained transfer is completed after
+	 * having been hot chained.
+	 * A change of alignment is not allowed, and forbids hotchaining.
+	 */
+	if (is_chan_running(chan)) {
+		BUG_ON(list_empty(&vc->desc_issued));
+
+		if (!is_running_chan_misaligned(chan) &&
+		    to_pxad_sw_desc(vd)->misaligned)
+			return false;
+
+		vd_last_issued = list_entry(vc->desc_issued.prev,
+					    struct virt_dma_desc, node);
+		pxad_desc_chain(vd_last_issued, vd);
+		if (is_chan_running(chan) || is_desc_completed(vd_last_issued))
+			return true;
+	}
+
+	return false;
+}
+
+static unsigned int clear_chan_irq(struct pxad_phy *phy)
+{
+	u32 dcsr;
+	u32 dint = readl(phy->base + DINT);
+
+	if (!(dint & BIT(phy->idx)))
+		return PXA_DCSR_RUN;
+
+	/* clear irq */
+	dcsr = phy_readl_relaxed(phy, DCSR);
+	phy_writel(phy, dcsr, DCSR);
+	if ((dcsr & PXA_DCSR_BUSERR) && (phy->vchan))
+		dev_warn(&phy->vchan->vc.chan.dev->device,
+			 "%s(chan=%p): PXA_DCSR_BUSERR\n",
+			 __func__, &phy->vchan);
+
+	return dcsr & ~PXA_DCSR_RUN;
+}
+
+static irqreturn_t pxad_chan_handler(int irq, void *dev_id)
+{
+	struct pxad_phy *phy = dev_id;
+	struct pxad_chan *chan = phy->vchan;
+	struct virt_dma_desc *vd, *tmp;
+	unsigned int dcsr;
+	unsigned long flags;
+
+	BUG_ON(!chan);
+
+	dcsr = clear_chan_irq(phy);
+	if (dcsr & PXA_DCSR_RUN)
+		return IRQ_NONE;
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	list_for_each_entry_safe(vd, tmp, &chan->vc.desc_issued, node) {
+		dev_dbg(&chan->vc.chan.dev->device,
+			"%s(): checking txd %p[%x]: completed=%d\n",
+			__func__, vd, vd->tx.cookie, is_desc_completed(vd));
+		if (is_desc_completed(vd)) {
+			list_del(&vd->node);
+			vchan_cookie_complete(vd);
+		} else {
+			break;
+		}
+	}
+
+	if (dcsr & PXA_DCSR_STOPSTATE) {
+		dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): channel stopped, submitted_empty=%d issued_empty=%d",
+			__func__,
+			list_empty(&chan->vc.desc_submitted),
+			list_empty(&chan->vc.desc_issued));
+		phy_writel_relaxed(phy, dcsr & ~PXA_DCSR_STOPIRQEN, DCSR);
+
+		if (list_empty(&chan->vc.desc_issued)) {
+			chan->misaligned =
+				!list_empty(&chan->vc.desc_submitted);
+		} else {
+			vd = list_first_entry(&chan->vc.desc_issued,
+					      struct virt_dma_desc, node);
+			pxad_launch_chan(chan, to_pxad_sw_desc(vd));
+		}
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t pxad_int_handler(int irq, void *dev_id)
+{
+	struct pxad_device *pdev = dev_id;
+	struct pxad_phy *phy;
+	u32 dint = readl(pdev->base + DINT);
+	int i, ret = IRQ_NONE;
+
+	while (dint) {
+		i = __ffs(dint);
+		dint &= (dint - 1);
+		phy = &pdev->phys[i];
+		if (pxad_chan_handler(irq, phy) == IRQ_HANDLED)
+			ret = IRQ_HANDLED;
+	}
+
+	return ret;
+}
+
+static int pxad_alloc_chan_resources(struct dma_chan *dchan)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+	struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device);
+
+	if (chan->desc_pool)
+		return 1;
+
+	chan->desc_pool = dma_pool_create(dma_chan_name(dchan),
+					  pdev->slave.dev,
+					  sizeof(struct pxad_desc_hw),
+					  __alignof__(struct pxad_desc_hw),
+					  0);
+	if (!chan->desc_pool) {
+		dev_err(&chan->vc.chan.dev->device,
+			"%s(): unable to allocate descriptor pool\n",
+			__func__);
+		return -ENOMEM;
+	}
+
+	return 1;
+}
+
+static void pxad_free_chan_resources(struct dma_chan *dchan)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+
+	vchan_free_chan_resources(&chan->vc);
+	dma_pool_destroy(chan->desc_pool);
+	chan->desc_pool = NULL;
+
+}
+
+static void pxad_free_desc(struct virt_dma_desc *vd)
+{
+	int i;
+	dma_addr_t dma;
+	struct pxad_desc_sw *sw_desc = to_pxad_sw_desc(vd);
+
+	BUG_ON(sw_desc->nb_desc == 0);
+	for (i = sw_desc->nb_desc - 1; i >= 0; i--) {
+		if (i > 0)
+			dma = sw_desc->hw_desc[i - 1]->ddadr;
+		else
+			dma = sw_desc->first;
+		dma_pool_free(sw_desc->desc_pool,
+			      sw_desc->hw_desc[i], dma);
+	}
+	sw_desc->nb_desc = 0;
+	kfree(sw_desc);
+}
+
+static struct pxad_desc_sw *
+pxad_alloc_desc(struct pxad_chan *chan, unsigned int nb_hw_desc)
+{
+	struct pxad_desc_sw *sw_desc;
+	dma_addr_t dma;
+	int i;
+
+	sw_desc = kzalloc(sizeof(*sw_desc) +
+			  nb_hw_desc * sizeof(struct pxad_desc_hw *),
+			  GFP_NOWAIT);
+	if (!sw_desc)
+		return NULL;
+	sw_desc->desc_pool = chan->desc_pool;
+
+	for (i = 0; i < nb_hw_desc; i++) {
+		sw_desc->hw_desc[i] = dma_pool_alloc(sw_desc->desc_pool,
+						     GFP_NOWAIT, &dma);
+		if (!sw_desc->hw_desc[i]) {
+			dev_err(&chan->vc.chan.dev->device,
+				"%s(): Couldn't allocate the %dth hw_desc from dma_pool %p\n",
+				__func__, i, sw_desc->desc_pool);
+			goto err;
+		}
+
+		if (i == 0)
+			sw_desc->first = dma;
+		else
+			sw_desc->hw_desc[i - 1]->ddadr = dma;
+		sw_desc->nb_desc++;
+	}
+
+	return sw_desc;
+err:
+	pxad_free_desc(&sw_desc->vd);
+	return NULL;
+}
+
+static dma_cookie_t pxad_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct virt_dma_chan *vc = to_virt_chan(tx->chan);
+	struct pxad_chan *chan = to_pxad_chan(&vc->chan);
+	struct virt_dma_desc *vd_chained = NULL,
+		*vd = container_of(tx, struct virt_dma_desc, tx);
+	dma_cookie_t cookie;
+	unsigned long flags;
+
+	set_updater_desc(to_pxad_sw_desc(vd), tx->flags);
+
+	spin_lock_irqsave(&vc->lock, flags);
+	cookie = dma_cookie_assign(tx);
+
+	if (list_empty(&vc->desc_submitted) && pxad_try_hotchain(vc, vd)) {
+		list_move_tail(&vd->node, &vc->desc_issued);
+		dev_dbg(&chan->vc.chan.dev->device,
+			"%s(): txd %p[%x]: submitted (hot linked)\n",
+			__func__, vd, cookie);
+		goto out;
+	}
+
+	/*
+	 * Fallback to placing the tx in the submitted queue
+	 */
+	if (!list_empty(&vc->desc_submitted)) {
+		vd_chained = list_entry(vc->desc_submitted.prev,
+					struct virt_dma_desc, node);
+		/*
+		 * Only chain the descriptors if no new misalignment is
+		 * introduced. If a new misalignment is chained, let the channel
+		 * stop, and be relaunched in misalign mode from the irq
+		 * handler.
+		 */
+		if (chan->misaligned || !to_pxad_sw_desc(vd)->misaligned)
+			pxad_desc_chain(vd_chained, vd);
+		else
+			vd_chained = NULL;
+	}
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): txd %p[%x]: submitted (%s linked)\n",
+		__func__, vd, cookie, vd_chained ? "cold" : "not");
+	list_move_tail(&vd->node, &vc->desc_submitted);
+	chan->misaligned |= to_pxad_sw_desc(vd)->misaligned;
+
+out:
+	spin_unlock_irqrestore(&vc->lock, flags);
+	return cookie;
+}
+
+static void pxad_issue_pending(struct dma_chan *dchan)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+	struct virt_dma_desc *vd_first;
+	unsigned long flags;
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	if (list_empty(&chan->vc.desc_submitted))
+		goto out;
+
+	vd_first = list_first_entry(&chan->vc.desc_submitted,
+				    struct virt_dma_desc, node);
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): txd %p[%x]", __func__, vd_first, vd_first->tx.cookie);
+
+	vchan_issue_pending(&chan->vc);
+	if (!pxad_try_hotchain(&chan->vc, vd_first))
+		pxad_launch_chan(chan, to_pxad_sw_desc(vd_first));
+out:
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+}
+
+static inline struct dma_async_tx_descriptor *
+pxad_tx_prep(struct virt_dma_chan *vc, struct virt_dma_desc *vd,
+		 unsigned long tx_flags)
+{
+	struct dma_async_tx_descriptor *tx;
+	struct pxad_chan *chan = container_of(vc, struct pxad_chan, vc);
+
+	tx = vchan_tx_prep(vc, vd, tx_flags);
+	tx->tx_submit = pxad_tx_submit;
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): vc=%p txd=%p[%x] flags=0x%lx\n", __func__,
+		vc, vd, vd->tx.cookie,
+		tx_flags);
+
+	return tx;
+}
+
+static void pxad_get_config(struct pxad_chan *chan,
+			    enum dma_transfer_direction dir,
+			    u32 *dcmd, u32 *dev_src, u32 *dev_dst)
+{
+	u32 maxburst = 0, dev_addr = 0;
+	enum dma_slave_buswidth width = DMA_SLAVE_BUSWIDTH_UNDEFINED;
+
+	*dcmd = 0;
+	if (chan->cfg.direction == DMA_DEV_TO_MEM) {
+		maxburst = chan->cfg.src_maxburst;
+		width = chan->cfg.src_addr_width;
+		dev_addr = chan->cfg.src_addr;
+		*dev_src = dev_addr;
+		*dcmd |= PXA_DCMD_INCTRGADDR | PXA_DCMD_FLOWSRC;
+	}
+	if (chan->cfg.direction == DMA_MEM_TO_DEV) {
+		maxburst = chan->cfg.dst_maxburst;
+		width = chan->cfg.dst_addr_width;
+		dev_addr = chan->cfg.dst_addr;
+		*dev_dst = dev_addr;
+		*dcmd |= PXA_DCMD_INCSRCADDR | PXA_DCMD_FLOWTRG;
+	}
+	if (chan->cfg.direction == DMA_MEM_TO_MEM)
+		*dcmd |= PXA_DCMD_BURST32 | PXA_DCMD_INCTRGADDR |
+			PXA_DCMD_INCSRCADDR;
+
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): dev_addr=0x%x maxburst=%d width=%d  dir=%d\n",
+		__func__, dev_addr, maxburst, width, dir);
+
+	if (width == DMA_SLAVE_BUSWIDTH_1_BYTE)
+		*dcmd |= PXA_DCMD_WIDTH1;
+	else if (width == DMA_SLAVE_BUSWIDTH_2_BYTES)
+		*dcmd |= PXA_DCMD_WIDTH2;
+	else if (width == DMA_SLAVE_BUSWIDTH_4_BYTES)
+		*dcmd |= PXA_DCMD_WIDTH4;
+
+	if (maxburst == 8)
+		*dcmd |= PXA_DCMD_BURST8;
+	else if (maxburst == 16)
+		*dcmd |= PXA_DCMD_BURST16;
+	else if (maxburst == 32)
+		*dcmd |= PXA_DCMD_BURST32;
+
+	/* FIXME: drivers should be ported over to use the filter
+	 * function. Once that's done, the following two lines can
+	 * be removed.
+	 */
+	if (chan->cfg.slave_id)
+		chan->drcmr = chan->cfg.slave_id;
+}
+
+static struct dma_async_tx_descriptor *
+pxad_prep_memcpy(struct dma_chan *dchan,
+		 dma_addr_t dma_dst, dma_addr_t dma_src,
+		 size_t len, unsigned long flags)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+	struct pxad_desc_sw *sw_desc;
+	struct pxad_desc_hw *hw_desc;
+	u32 dcmd;
+	unsigned int i, nb_desc = 0;
+	size_t copy;
+
+	if (!dchan || !len)
+		return NULL;
+
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): dma_dst=0x%lx dma_src=0x%lx len=%zu flags=%lx\n",
+		__func__, (unsigned long)dma_dst, (unsigned long)dma_src,
+		len, flags);
+	pxad_get_config(chan, DMA_MEM_TO_MEM, &dcmd, NULL, NULL);
+
+	nb_desc = DIV_ROUND_UP(len, PDMA_MAX_DESC_BYTES);
+	sw_desc = pxad_alloc_desc(chan, nb_desc + 1);
+	if (!sw_desc)
+		return NULL;
+	sw_desc->len = len;
+
+	if (!IS_ALIGNED(dma_src, 1 << PDMA_ALIGNMENT) ||
+	    !IS_ALIGNED(dma_dst, 1 << PDMA_ALIGNMENT))
+		sw_desc->misaligned = true;
+
+	i = 0;
+	do {
+		hw_desc = sw_desc->hw_desc[i++];
+		copy = min_t(size_t, len, PDMA_MAX_DESC_BYTES);
+		hw_desc->dcmd = dcmd | (PXA_DCMD_LENGTH & copy);
+		hw_desc->dsadr = dma_src;
+		hw_desc->dtadr = dma_dst;
+		len -= copy;
+		dma_src += copy;
+		dma_dst += copy;
+	} while (len);
+	set_updater_desc(sw_desc, flags);
+
+	return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags);
+}
+
+static struct dma_async_tx_descriptor *
+pxad_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl,
+		   unsigned int sg_len, enum dma_transfer_direction dir,
+		   unsigned long flags, void *context)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+	struct pxad_desc_sw *sw_desc;
+	size_t len, avail;
+	struct scatterlist *sg;
+	dma_addr_t dma;
+	u32 dcmd, dsadr = 0, dtadr = 0;
+	unsigned int nb_desc = 0, i, j = 0;
+
+	if ((sgl == NULL) || (sg_len == 0))
+		return NULL;
+
+	pxad_get_config(chan, dir, &dcmd, &dsadr, &dtadr);
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): dir=%d flags=%lx\n", __func__, dir, flags);
+
+	for_each_sg(sgl, sg, sg_len, i)
+		nb_desc += DIV_ROUND_UP(sg_dma_len(sg), PDMA_MAX_DESC_BYTES);
+	sw_desc = pxad_alloc_desc(chan, nb_desc + 1);
+	if (!sw_desc)
+		return NULL;
+
+	for_each_sg(sgl, sg, sg_len, i) {
+		dma = sg_dma_address(sg);
+		avail = sg_dma_len(sg);
+		sw_desc->len += avail;
+
+		do {
+			len = min_t(size_t, avail, PDMA_MAX_DESC_BYTES);
+			if (dma & 0x7)
+				sw_desc->misaligned = true;
+
+			sw_desc->hw_desc[j]->dcmd =
+				dcmd | (PXA_DCMD_LENGTH & len);
+			sw_desc->hw_desc[j]->dsadr = dsadr ? dsadr : dma;
+			sw_desc->hw_desc[j++]->dtadr = dtadr ? dtadr : dma;
+
+			dma += len;
+			avail -= len;
+		} while (avail);
+	}
+	set_updater_desc(sw_desc, flags);
+
+	return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags);
+}
+
+static struct dma_async_tx_descriptor *
+pxad_prep_dma_cyclic(struct dma_chan *dchan,
+		     dma_addr_t buf_addr, size_t len, size_t period_len,
+		     enum dma_transfer_direction dir, unsigned long flags)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+	struct pxad_desc_sw *sw_desc;
+	struct pxad_desc_hw **phw_desc;
+	dma_addr_t dma;
+	u32 dcmd, dsadr = 0, dtadr = 0;
+	unsigned int nb_desc = 0;
+
+	if (!dchan || !len || !period_len)
+		return NULL;
+	if ((dir != DMA_DEV_TO_MEM) && (dir != DMA_MEM_TO_DEV)) {
+		dev_err(&chan->vc.chan.dev->device,
+			"Unsupported direction for cyclic DMA\n");
+		return NULL;
+	}
+	/* the buffer length must be a multiple of period_len */
+	if (len % period_len != 0 || period_len > PDMA_MAX_DESC_BYTES ||
+	    !IS_ALIGNED(period_len, 1 << PDMA_ALIGNMENT))
+		return NULL;
+
+	pxad_get_config(chan, dir, &dcmd, &dsadr, &dtadr);
+	dcmd |= PXA_DCMD_ENDIRQEN | (PXA_DCMD_LENGTH | period_len);
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): buf_addr=0x%lx len=%zu period=%zu dir=%d flags=%lx\n",
+		__func__, (unsigned long)buf_addr, len, period_len, dir, flags);
+
+	nb_desc = DIV_ROUND_UP(period_len, PDMA_MAX_DESC_BYTES);
+	nb_desc *= DIV_ROUND_UP(len, period_len);
+	sw_desc = pxad_alloc_desc(chan, nb_desc + 1);
+	if (!sw_desc)
+		return NULL;
+	sw_desc->cyclic = true;
+	sw_desc->len = len;
+
+	phw_desc = sw_desc->hw_desc;
+	dma = buf_addr;
+	do {
+		phw_desc[0]->dsadr = dsadr ? dsadr : dma;
+		phw_desc[0]->dtadr = dtadr ? dtadr : dma;
+		phw_desc[0]->dcmd = dcmd;
+		phw_desc++;
+		dma += period_len;
+		len -= period_len;
+	} while (len);
+	set_updater_desc(sw_desc, flags);
+
+	return pxad_tx_prep(&chan->vc, &sw_desc->vd, flags);
+}
+
+static int pxad_config(struct dma_chan *dchan,
+		       struct dma_slave_config *cfg)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+
+	if (!dchan)
+		return -EINVAL;
+
+	chan->cfg = *cfg;
+	return 0;
+}
+
+static int pxad_terminate_all(struct dma_chan *dchan)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+	struct pxad_device *pdev = to_pxad_dev(chan->vc.chan.device);
+	struct virt_dma_desc *vd = NULL;
+	unsigned long flags;
+	struct pxad_phy *phy;
+	LIST_HEAD(head);
+
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): vchan %p: terminate all\n", __func__, &chan->vc);
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	vchan_get_all_descriptors(&chan->vc, &head);
+
+	list_for_each_entry(vd, &head, node) {
+		dev_dbg(&chan->vc.chan.dev->device,
+			"%s(): cancelling txd %p[%x] (completed=%d)", __func__,
+			vd, vd->tx.cookie, is_desc_completed(vd));
+	}
+
+	phy = chan->phy;
+	if (phy) {
+		phy_disable(chan->phy);
+		pxad_free_phy(chan);
+		chan->phy = NULL;
+		spin_lock(&pdev->phy_lock);
+		phy->vchan = NULL;
+		spin_unlock(&pdev->phy_lock);
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+	vchan_dma_desc_free_list(&chan->vc, &head);
+
+	return 0;
+}
+
+static unsigned int pxad_residue(struct pxad_chan *chan,
+				 dma_cookie_t cookie)
+{
+	struct virt_dma_desc *vd = NULL;
+	struct pxad_desc_sw *sw_desc = NULL;
+	struct pxad_desc_hw *hw_desc = NULL;
+	u32 curr, start, len, end, residue = 0;
+	unsigned long flags;
+	bool passed = false;
+	int i;
+
+	/*
+	 * If the channel does not have a phy pointer anymore, it has already
+	 * been completed. Therefore, its residue is 0.
+	 */
+	if (!chan->phy)
+		return 0;
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+
+	vd = vchan_find_desc(&chan->vc, cookie);
+	if (!vd)
+		goto out;
+
+	sw_desc = to_pxad_sw_desc(vd);
+	if (sw_desc->hw_desc[0]->dcmd & PXA_DCMD_INCSRCADDR)
+		curr = phy_readl_relaxed(chan->phy, DSADR);
+	else
+		curr = phy_readl_relaxed(chan->phy, DTADR);
+
+	for (i = 0; i < sw_desc->nb_desc - 1; i++) {
+		hw_desc = sw_desc->hw_desc[i];
+		if (sw_desc->hw_desc[0]->dcmd & PXA_DCMD_INCSRCADDR)
+			start = hw_desc->dsadr;
+		else
+			start = hw_desc->dtadr;
+		len = hw_desc->dcmd & PXA_DCMD_LENGTH;
+		end = start + len;
+
+		/*
+		 * 'passed' will be latched once we found the descriptor
+		 * which lies inside the boundaries of the curr
+		 * pointer. All descriptors that occur in the list
+		 * _after_ we found that partially handled descriptor
+		 * are still to be processed and are hence added to the
+		 * residual bytes counter.
+		 */
+
+		if (passed) {
+			residue += len;
+		} else if (curr >= start && curr <= end) {
+			residue += end - curr;
+			passed = true;
+		}
+	}
+	if (!passed)
+		residue = sw_desc->len;
+
+out:
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+	dev_dbg(&chan->vc.chan.dev->device,
+		"%s(): txd %p[%x] sw_desc=%p: %d\n",
+		__func__, vd, cookie, sw_desc, residue);
+	return residue;
+}
+
+static enum dma_status pxad_tx_status(struct dma_chan *dchan,
+				      dma_cookie_t cookie,
+				      struct dma_tx_state *txstate)
+{
+	struct pxad_chan *chan = to_pxad_chan(dchan);
+	enum dma_status ret;
+
+	ret = dma_cookie_status(dchan, cookie, txstate);
+	if (likely(txstate && (ret != DMA_ERROR)))
+		dma_set_residue(txstate, pxad_residue(chan, cookie));
+
+	return ret;
+}
+
+static void pxad_free_channels(struct dma_device *dmadev)
+{
+	struct pxad_chan *c, *cn;
+
+	list_for_each_entry_safe(c, cn, &dmadev->channels,
+				 vc.chan.device_node) {
+		list_del(&c->vc.chan.device_node);
+		tasklet_kill(&c->vc.task);
+	}
+}
+
+static int pxad_remove(struct platform_device *op)
+{
+	struct pxad_device *pdev = platform_get_drvdata(op);
+
+	pxad_free_channels(&pdev->slave);
+	dma_async_device_unregister(&pdev->slave);
+	return 0;
+}
+
+static int pxad_init_phys(struct platform_device *op,
+			  struct pxad_device *pdev,
+			  unsigned int nb_phy_chans)
+{
+	int irq0, irq, nr_irq = 0, i, ret;
+	struct pxad_phy *phy;
+
+	irq0 = platform_get_irq(op, 0);
+	if (irq0 < 0)
+		return irq0;
+
+	pdev->phys = devm_kcalloc(&op->dev, nb_phy_chans,
+				  sizeof(pdev->phys[0]), GFP_KERNEL);
+	if (!pdev->phys)
+		return -ENOMEM;
+
+	for (i = 0; i < nb_phy_chans; i++)
+		if (platform_get_irq(op, i) > 0)
+			nr_irq++;
+
+	for (i = 0; i < nb_phy_chans; i++) {
+		phy = &pdev->phys[i];
+		phy->base = pdev->base;
+		phy->idx = i;
+		irq = platform_get_irq(op, i);
+		if ((nr_irq > 1) && (irq > 0))
+			ret = devm_request_irq(&op->dev, irq,
+					       pxad_chan_handler,
+					       IRQF_SHARED, "pxa-dma", phy);
+		if ((nr_irq == 1) && (i == 0))
+			ret = devm_request_irq(&op->dev, irq0,
+					       pxad_int_handler,
+					       IRQF_SHARED, "pxa-dma", pdev);
+		if (ret) {
+			dev_err(pdev->slave.dev,
+				"%s(): can't request irq %d:%d\n", __func__,
+				irq, ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static const struct of_device_id const pxad_dt_ids[] = {
+	{ .compatible = "marvell,pdma-1.0", },
+	{}
+};
+MODULE_DEVICE_TABLE(of, pxad_dt_ids);
+
+static struct dma_chan *pxad_dma_xlate(struct of_phandle_args *dma_spec,
+					   struct of_dma *ofdma)
+{
+	struct pxad_device *d = ofdma->of_dma_data;
+	struct dma_chan *chan;
+
+	chan = dma_get_any_slave_channel(&d->slave);
+	if (!chan)
+		return NULL;
+
+	to_pxad_chan(chan)->drcmr = dma_spec->args[0];
+	to_pxad_chan(chan)->prio = dma_spec->args[1];
+
+	return chan;
+}
+
+static int pxad_init_dmadev(struct platform_device *op,
+			    struct pxad_device *pdev,
+			    unsigned int nr_phy_chans)
+{
+	int ret;
+	unsigned int i;
+	struct pxad_chan *c;
+
+	pdev->nr_chans = nr_phy_chans;
+	INIT_LIST_HEAD(&pdev->slave.channels);
+	pdev->slave.device_alloc_chan_resources = pxad_alloc_chan_resources;
+	pdev->slave.device_free_chan_resources = pxad_free_chan_resources;
+	pdev->slave.device_tx_status = pxad_tx_status;
+	pdev->slave.device_issue_pending = pxad_issue_pending;
+	pdev->slave.device_config = pxad_config;
+	pdev->slave.device_terminate_all = pxad_terminate_all;
+
+	if (op->dev.coherent_dma_mask)
+		dma_set_mask(&op->dev, op->dev.coherent_dma_mask);
+	else
+		dma_set_mask(&op->dev, DMA_BIT_MASK(32));
+
+	ret = pxad_init_phys(op, pdev, nr_phy_chans);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_phy_chans; i++) {
+		c = devm_kzalloc(&op->dev, sizeof(*c), GFP_KERNEL);
+		if (!c)
+			return -ENOMEM;
+		c->vc.desc_free = pxad_free_desc;
+		vchan_init(&c->vc, &pdev->slave);
+	}
+
+	return dma_async_device_register(&pdev->slave);
+}
+
+static int pxad_probe(struct platform_device *op)
+{
+	struct pxad_device *pdev;
+	const struct of_device_id *of_id;
+	struct mmp_dma_platdata *pdata = dev_get_platdata(&op->dev);
+	struct resource *iores;
+	int ret, dma_channels = 0;
+	const enum dma_slave_buswidth widths =
+		DMA_SLAVE_BUSWIDTH_1_BYTE   | DMA_SLAVE_BUSWIDTH_2_BYTES |
+		DMA_SLAVE_BUSWIDTH_4_BYTES;
+
+	pdev = devm_kzalloc(&op->dev, sizeof(*pdev), GFP_KERNEL);
+	if (!pdev)
+		return -ENOMEM;
+
+	spin_lock_init(&pdev->phy_lock);
+
+	iores = platform_get_resource(op, IORESOURCE_MEM, 0);
+	pdev->base = devm_ioremap_resource(&op->dev, iores);
+	if (IS_ERR(pdev->base))
+		return PTR_ERR(pdev->base);
+
+	of_id = of_match_device(pxad_dt_ids, &op->dev);
+	if (of_id)
+		of_property_read_u32(op->dev.of_node, "#dma-channels",
+				     &dma_channels);
+	else if (pdata && pdata->dma_channels)
+		dma_channels = pdata->dma_channels;
+	else
+		dma_channels = 32;	/* default 32 channel */
+
+	dma_cap_set(DMA_SLAVE, pdev->slave.cap_mask);
+	dma_cap_set(DMA_MEMCPY, pdev->slave.cap_mask);
+	dma_cap_set(DMA_CYCLIC, pdev->slave.cap_mask);
+	dma_cap_set(DMA_PRIVATE, pdev->slave.cap_mask);
+	pdev->slave.device_prep_dma_memcpy = pxad_prep_memcpy;
+	pdev->slave.device_prep_slave_sg = pxad_prep_slave_sg;
+	pdev->slave.device_prep_dma_cyclic = pxad_prep_dma_cyclic;
+
+	pdev->slave.copy_align = PDMA_ALIGNMENT;
+	pdev->slave.src_addr_widths = widths;
+	pdev->slave.dst_addr_widths = widths;
+	pdev->slave.directions = BIT(DMA_MEM_TO_DEV) | BIT(DMA_DEV_TO_MEM);
+	pdev->slave.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
+
+	pdev->slave.dev = &op->dev;
+	ret = pxad_init_dmadev(op, pdev, dma_channels);
+	if (ret) {
+		dev_err(pdev->slave.dev, "unable to register\n");
+		return ret;
+	}
+
+	if (op->dev.of_node) {
+		/* Device-tree DMA controller registration */
+		ret = of_dma_controller_register(op->dev.of_node,
+						 pxad_dma_xlate, pdev);
+		if (ret < 0) {
+			dev_err(pdev->slave.dev,
+				"of_dma_controller_register failed\n");
+			return ret;
+		}
+	}
+
+	platform_set_drvdata(op, pdev);
+	dev_info(pdev->slave.dev, "initialized %d channels\n", dma_channels);
+	return 0;
+}
+
+static const struct platform_device_id pxad_id_table[] = {
+	{ "pxa-dma", },
+	{ },
+};
+
+static struct platform_driver pxad_driver = {
+	.driver		= {
+		.name	= "pxa-dma",
+		.of_match_table = pxad_dt_ids,
+	},
+	.id_table	= pxad_id_table,
+	.probe		= pxad_probe,
+	.remove		= pxad_remove,
+};
+
+bool pxad_filter_fn(struct dma_chan *chan, void *param)
+{
+	struct pxad_chan *c = to_pxad_chan(chan);
+	struct pxad_param *p = param;
+
+	if (chan->device->dev->driver != &pxad_driver.driver)
+		return false;
+
+	c->drcmr = p->drcmr;
+	c->prio = p->prio;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(pxad_filter_fn);
+
+module_platform_driver(pxad_driver);
+
+MODULE_DESCRIPTION("Marvell PXA Peripheral DMA Driver");
+MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/dma/pxa-dma.h b/include/linux/dma/pxa-dma.h
new file mode 100644
index 000000000000..3edc99294bf6
--- /dev/null
+++ b/include/linux/dma/pxa-dma.h
@@ -0,0 +1,27 @@
+#ifndef _PXA_DMA_H_
+#define _PXA_DMA_H_
+
+enum pxad_chan_prio {
+	PXAD_PRIO_HIGHEST = 0,
+	PXAD_PRIO_NORMAL,
+	PXAD_PRIO_LOW,
+	PXAD_PRIO_LOWEST,
+};
+
+struct pxad_param {
+	unsigned int drcmr;
+	enum pxad_chan_prio prio;
+};
+
+struct dma_chan;
+
+#ifdef CONFIG_PXA_DMA
+bool pxad_filter_fn(struct dma_chan *chan, void *param);
+#else
+static inline bool pxad_filter_fn(struct dma_chan *chan, void *param)
+{
+	return false;
+}
+#endif
+
+#endif /* _PXA_DMA_H_ */
-- 
cgit v1.2.3


From 09170a49422bd786be3eac5cec1955257c5a34b7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 May 2015 13:59:39 +0200
Subject: KVM: const-ify uses of struct kvm_userspace_memory_region

Architecture-specific helpers are not supposed to muck with
struct kvm_userspace_memory_region contents.  Add const to
enforce this.

In order to eliminate the only write in __kvm_set_memory_region,
the cleaning of deleted slots is pulled up from update_memslots
to __kvm_set_memory_region.

Reviewed-by: Takuya Yoshikawa <yoshikawa_takuya_b1@lab.ntt.co.jp>
Reviewed-by: Radim Krcmar <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/kvm/mmu.c                 |  4 ++--
 arch/mips/kvm/mips.c               |  4 ++--
 arch/powerpc/include/asm/kvm_ppc.h |  8 ++++----
 arch/powerpc/kvm/book3s.c          |  4 ++--
 arch/powerpc/kvm/book3s_hv.c       |  4 ++--
 arch/powerpc/kvm/book3s_pr.c       |  4 ++--
 arch/powerpc/kvm/booke.c           |  4 ++--
 arch/powerpc/kvm/powerpc.c         |  4 ++--
 arch/s390/kvm/kvm-s390.c           |  4 ++--
 arch/x86/kvm/x86.c                 |  4 ++--
 include/linux/kvm_host.h           |  8 ++++----
 virt/kvm/kvm_main.c                | 22 +++++++++++-----------
 12 files changed, 37 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 6f0f8f3ac7df..b5c023a37aec 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1719,7 +1719,7 @@ out:
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
 				   enum kvm_mr_change change)
 {
@@ -1734,7 +1734,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   enum kvm_mr_change change)
 {
 	hva_t hva = mem->userspace_addr;
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bc5ddd973b44..5963e2e8a6d7 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -198,14 +198,14 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   enum kvm_mr_change change)
 {
 	return 0;
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
 				   enum kvm_mr_change change)
 {
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index b8475daad884..aff563b5f001 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -182,9 +182,9 @@ extern int kvmppc_core_create_memslot(struct kvm *kvm,
 				      unsigned long npages);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
-				struct kvm_userspace_memory_region *mem);
+				const struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
 				      struct kvm_ppc_smmu_info *info);
@@ -243,9 +243,9 @@ struct kvmppc_ops {
 	void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot);
 	int (*prepare_memory_region)(struct kvm *kvm,
 				     struct kvm_memory_slot *memslot,
-				     struct kvm_userspace_memory_region *mem);
+				     const struct kvm_userspace_memory_region *mem);
 	void (*commit_memory_region)(struct kvm *kvm,
-				     struct kvm_userspace_memory_region *mem,
+				     const struct kvm_userspace_memory_region *mem,
 				     const struct kvm_memory_slot *old);
 	int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
 	int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 453a8a47a467..60aa0726dccc 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -757,13 +757,13 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
-				struct kvm_userspace_memory_region *mem)
+				const struct kvm_userspace_memory_region *mem)
 {
 	return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old)
 {
 	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6aff5a990492..ed493d123268 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2376,13 +2376,13 @@ static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
 
 static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
 					struct kvm_memory_slot *memslot,
-					struct kvm_userspace_memory_region *mem)
+					const struct kvm_userspace_memory_region *mem)
 {
 	return 0;
 }
 
 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old)
 {
 	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index c01dfc798c66..0873e766df1b 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1573,13 +1573,13 @@ static void kvmppc_core_flush_memslot_pr(struct kvm *kvm,
 
 static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
 					struct kvm_memory_slot *memslot,
-					struct kvm_userspace_memory_region *mem)
+					const struct kvm_userspace_memory_region *mem)
 {
 	return 0;
 }
 
 static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old)
 {
 	return;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 3872ab31c80a..518e3a8b351f 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1784,13 +1784,13 @@ int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot,
-				      struct kvm_userspace_memory_region *mem)
+				      const struct kvm_userspace_memory_region *mem)
 {
 	return 0;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old)
 {
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 8cd1f80fdc70..5985bb2a332b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -595,14 +595,14 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   enum kvm_mr_change change)
 {
 	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
 				   enum kvm_mr_change change)
 {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index a05107e9b2bf..994f9c37f25f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2582,7 +2582,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   enum kvm_mr_change change)
 {
 	/* A few sanity checks. We can have memory slots which have to be
@@ -2600,7 +2600,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
 				enum kvm_mr_change change)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8918e23e0e8e..30854ea218e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7700,7 +7700,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm)
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				enum kvm_mr_change change)
 {
 	/*
@@ -7778,7 +7778,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
 				enum kvm_mr_change change)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 87fd74a04005..fbced7015ebd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -501,9 +501,9 @@ enum kvm_mr_change {
 };
 
 int kvm_set_memory_region(struct kvm *kvm,
-			  struct kvm_userspace_memory_region *mem);
+			  const struct kvm_userspace_memory_region *mem);
 int __kvm_set_memory_region(struct kvm *kvm,
-			    struct kvm_userspace_memory_region *mem);
+			    const struct kvm_userspace_memory_region *mem);
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -511,10 +511,10 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 void kvm_arch_memslots_updated(struct kvm *kvm);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				enum kvm_mr_change change);
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
 				enum kvm_mr_change change);
 bool kvm_largepages_enabled(void);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 42df724071c0..fc2dbe1c34fc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -676,8 +676,6 @@ static void update_memslots(struct kvm_memslots *slots,
 	WARN_ON(mslots[i].id != id);
 	if (!new->npages) {
 		WARN_ON(!mslots[i].npages);
-		new->base_gfn = 0;
-		new->flags = 0;
 		if (mslots[i].npages)
 			slots->used_slots--;
 	} else {
@@ -717,7 +715,7 @@ static void update_memslots(struct kvm_memslots *slots,
 	slots->id_to_index[mslots[i].id] = i;
 }
 
-static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
@@ -767,7 +765,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
  * Must be called holding kvm->slots_lock for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
-			    struct kvm_userspace_memory_region *mem)
+			    const struct kvm_userspace_memory_region *mem)
 {
 	int r;
 	gfn_t base_gfn;
@@ -806,9 +804,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	if (npages > KVM_MEM_MAX_NR_PAGES)
 		goto out;
 
-	if (!npages)
-		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
-
 	new = old = *slot;
 
 	new.id = mem->slot;
@@ -834,10 +829,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
 				goto out;
 			}
 		}
-	} else if (old.npages) {
+	} else {
+		if (!old.npages)
+			goto out;
+
 		change = KVM_MR_DELETE;
-	} else /* Modify a non-existent slot: disallowed. */
-		goto out;
+		new.base_gfn = 0;
+		new.flags = 0;
+	}
 
 	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
 		/* Check for overlaps */
@@ -944,7 +943,7 @@ out:
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 
 int kvm_set_memory_region(struct kvm *kvm,
-			  struct kvm_userspace_memory_region *mem)
+			  const struct kvm_userspace_memory_region *mem)
 {
 	int r;
 
@@ -960,6 +959,7 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 {
 	if (mem->slot >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
+
 	return kvm_set_memory_region(kvm, mem);
 }
 
-- 
cgit v1.2.3


From 15f46015ee17681b542432df21747f5c51857156 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sun, 17 May 2015 21:26:08 +0200
Subject: KVM: add memslots argument to kvm_arch_memslots_updated

Prepare for the case of multiple address spaces.

Reviewed-by: Radim Krcmar <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/kvm/mmu.c                  | 2 +-
 arch/mips/include/asm/kvm_host.h    | 2 +-
 arch/powerpc/include/asm/kvm_host.h | 2 +-
 arch/s390/include/asm/kvm_host.h    | 2 +-
 arch/x86/kvm/x86.c                  | 2 +-
 include/linux/kvm_host.h            | 2 +-
 include/linux/kvm_types.h           | 1 +
 virt/kvm/kvm_main.c                 | 2 +-
 8 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index b5c023a37aec..e9ac084d21ea 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1839,7 +1839,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 	return 0;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
 {
 }
 
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 4c25823563fe..e8c8d9d0c45f 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -839,7 +839,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a193a13cf08b..d91f65b28e32 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -698,7 +698,7 @@ struct kvm_vcpu_arch {
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_exit(void) {}
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 444c412307cb..3024acbe1f9d 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -636,7 +636,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 30854ea218e7..f0aec85f8cdd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7689,7 +7689,7 @@ out_free:
 	return -ENOMEM;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
 {
 	/*
 	 * memslots->generation has been incremented.
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fbced7015ebd..8815f1dffb77 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -508,7 +508,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			    unsigned long npages);
-void kvm_arch_memslots_updated(struct kvm *kvm);
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				const struct kvm_userspace_memory_region *mem,
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 931da7e917cf..1b47a185c2f0 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -28,6 +28,7 @@ struct kvm_run;
 struct kvm_userspace_memory_region;
 struct kvm_vcpu;
 struct kvm_vcpu_init;
+struct kvm_memslots;
 
 enum kvm_mr_change;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fc2dbe1c34fc..4361204a6348 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -751,7 +751,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	 */
 	slots->generation++;
 
-	kvm_arch_memslots_updated(kvm);
+	kvm_arch_memslots_updated(kvm, slots);
 
 	return old_memslots;
 }
-- 
cgit v1.2.3


From cacce073bfd2b4091fbc58e906e7a9a21f538ff6 Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Thu, 14 May 2015 23:05:49 +0200
Subject: bcma: add module_bcma_driver()

This makes it possible to save some lines of code in drivers with an
simple bcma driver registration.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 include/linux/bcma/bcma.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h
index e34f906647d3..2ff4a9961e1d 100644
--- a/include/linux/bcma/bcma.h
+++ b/include/linux/bcma/bcma.h
@@ -305,6 +305,15 @@ int __bcma_driver_register(struct bcma_driver *drv, struct module *owner);
 
 extern void bcma_driver_unregister(struct bcma_driver *drv);
 
+/* module_bcma_driver() - Helper macro for drivers that don't do
+ * anything special in module init/exit.  This eliminates a lot of
+ * boilerplate.  Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit()
+ */
+#define module_bcma_driver(__bcma_driver) \
+	module_driver(__bcma_driver, bcma_driver_register, \
+			bcma_driver_unregister)
+
 /* Set a fallback SPROM.
  * See kdoc at the function definition for complete documentation. */
 extern int bcma_arch_register_fallback_sprom(
-- 
cgit v1.2.3


From 069d4a7b583274e3fd8712c92a035626e0ebf7be Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 3 Mar 2015 11:58:14 +0100
Subject: netfilter: ebtables: fix comment grammar

s/stongly inspired on/strongly inspired by/

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/netfilter_bridge/ebtables.h      | 2 +-
 include/uapi/linux/netfilter_bridge/ebtables.h | 2 +-
 net/bridge/netfilter/ebtables.c                | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 34e7a2b7f867..9ac6f263956b 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -6,7 +6,7 @@
  *
  *  ebtables.c,v 2.0, April, 2002
  *
- *  This code is stongly inspired on the iptables code which is
+ *  This code is strongly inspired by the iptables code which is
  *  Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
  */
 #ifndef __LINUX_BRIDGE_EFF_H
diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h
index ba993360dbe9..ab46c805c455 100644
--- a/include/uapi/linux/netfilter_bridge/ebtables.h
+++ b/include/uapi/linux/netfilter_bridge/ebtables.h
@@ -6,7 +6,7 @@
  *
  *  ebtables.c,v 2.0, April, 2002
  *
- *  This code is stongly inspired on the iptables code which is
+ *  This code is strongly inspired by the iptables code which is
  *  Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
  */
 
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index d9a8c05d995d..54df89edcf20 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -6,7 +6,7 @@
  *
  *  ebtables.c,v 2.0, July, 2002
  *
- *  This code is stongly inspired on the iptables code which is
+ *  This code is strongly inspired by the iptables code which is
  *  Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
  *
  *  This program is free software; you can redistribute it and/or
-- 
cgit v1.2.3


From 7667928601d2981b20011e357904bcb96c365427 Mon Sep 17 00:00:00 2001
From: Masanari Iida <standby24x7@gmail.com>
Date: Fri, 8 May 2015 00:02:27 +0900
Subject: rapidio: Fix kerneldoc and comment

This patch fix spelling typos found in DocBook/rapidio.xml
Ths file was generated from comments in the source files,
I had to fix them, instead of the xml file.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/rapidio/rio-scan.c | 2 +-
 include/linux/rio.h        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index 47a1b2ea76c4..d6a126c17c03 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -83,7 +83,7 @@ static u16 rio_destid_alloc(struct rio_net *net)
  * @destid: destID to reserve
  *
  * Tries to reserve the specified destID.
- * Returns 0 if successfull.
+ * Returns 0 if successful.
  */
 static int rio_destid_reserve(struct rio_net *net, u16 destid)
 {
diff --git a/include/linux/rio.h b/include/linux/rio.h
index 6bda06f21930..cde976e86b48 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -298,7 +298,7 @@ struct rio_id_table {
  * struct rio_net - RIO network info
  * @node: Node in global list of RIO networks
  * @devices: List of devices in this network
- * @switches: List of switches in this netowrk
+ * @switches: List of switches in this network
  * @mports: List of master ports accessing this network
  * @hport: Default port for accessing this network
  * @id: RIO network ID
-- 
cgit v1.2.3


From 94268fcd9aea736d24cbdade16e7f7c9419c489e Mon Sep 17 00:00:00 2001
From: Antonio Ospite <ao2@ao2.it>
Date: Tue, 28 Apr 2015 13:11:28 +0200
Subject: lib: crc-itu-t.[ch] fix 0x0x prefix in integer constants

Signed-off-by: Antonio Ospite <ao2@ao2.it>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/crc-itu-t.h | 2 +-
 lib/crc-itu-t.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crc-itu-t.h b/include/linux/crc-itu-t.h
index 84920f3cc83e..a9953c762eee 100644
--- a/include/linux/crc-itu-t.h
+++ b/include/linux/crc-itu-t.h
@@ -3,7 +3,7 @@
  *
  * Implements the standard CRC ITU-T V.41:
  *   Width 16
- *   Poly  0x0x1021 (x^16 + x^12 + x^15 + 1)
+ *   Poly  0x1021 (x^16 + x^12 + x^15 + 1)
  *   Init  0
  *
  * This source code is licensed under the GNU General Public License,
diff --git a/lib/crc-itu-t.c b/lib/crc-itu-t.c
index a63472b82416..b3219d0abfb4 100644
--- a/lib/crc-itu-t.c
+++ b/lib/crc-itu-t.c
@@ -9,7 +9,7 @@
 #include <linux/module.h>
 #include <linux/crc-itu-t.h>
 
-/** CRC table for the CRC ITU-T V.41 0x0x1021 (x^16 + x^12 + x^15 + 1) */
+/** CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^15 + 1) */
 const u16 crc_itu_t_table[256] = {
 	0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
 	0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef,
-- 
cgit v1.2.3


From e0213bc5467ca5fe44ab04527f0e47998f30c046 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Date: Mon, 18 May 2015 20:04:14 +0900
Subject: usb: renesas_usbhs: Change USBHS_TYPE_R8A779x to USBHS_TYPE_RCAR_GEN2

Since the HSUSB controllers of R-Car Gen2 are the same specification
(they have 16 pipes and usb-dmac), this patch changes USBHS_TYPE_R8A7790
and USBHS_TYPE_R8A7791 to USBHS_TYPE_RCAR_GEN2.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/renesas_usbhs/common.c | 15 ++++-----------
 include/linux/usb/renesas_usbhs.h  |  3 +--
 2 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 0f7e850fd4aa..b56bb9d882fc 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -466,11 +466,11 @@ static int usbhsc_drvcllbck_notify_hotplug(struct platform_device *pdev)
 static const struct of_device_id usbhs_of_match[] = {
 	{
 		.compatible = "renesas,usbhs-r8a7790",
-		.data = (void *)USBHS_TYPE_R8A7790,
+		.data = (void *)USBHS_TYPE_RCAR_GEN2,
 	},
 	{
 		.compatible = "renesas,usbhs-r8a7791",
-		.data = (void *)USBHS_TYPE_R8A7791,
+		.data = (void *)USBHS_TYPE_RCAR_GEN2,
 	},
 	{ },
 };
@@ -497,14 +497,8 @@ static struct renesas_usbhs_platform_info *usbhs_parse_dt(struct device *dev)
 	if (gpio > 0)
 		dparam->enable_gpio = gpio;
 
-	switch (dparam->type) {
-	case USBHS_TYPE_R8A7790:
-	case USBHS_TYPE_R8A7791:
+	if (dparam->type == USBHS_TYPE_RCAR_GEN2)
 		dparam->has_usb_dmac = 1;
-		break;
-	default:
-		break;
-	}
 
 	return info;
 }
@@ -559,8 +553,7 @@ static int usbhs_probe(struct platform_device *pdev)
 	       sizeof(struct renesas_usbhs_driver_param));
 
 	switch (priv->dparam.type) {
-	case USBHS_TYPE_R8A7790:
-	case USBHS_TYPE_R8A7791:
+	case USBHS_TYPE_RCAR_GEN2:
 		priv->pfunc = usbhs_rcar2_ops;
 		if (!priv->dparam.pipe_type) {
 			priv->dparam.pipe_type = usbhsc_new_pipe_type;
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index f06529c14141..3dd5a781da99 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -169,8 +169,7 @@ struct renesas_usbhs_driver_param {
 #define USBHS_USB_DMAC_XFER_SIZE	32	/* hardcode the xfer size */
 };
 
-#define USBHS_TYPE_R8A7790 1
-#define USBHS_TYPE_R8A7791 2
+#define USBHS_TYPE_RCAR_GEN2	1
 
 /*
  * option:
-- 
cgit v1.2.3


From a09e23f53e2c14a65a3b14a00060fea163081e1f Mon Sep 17 00:00:00 2001
From: Mian Yousaf Kaukab <yousaf.kaukab@intel.com>
Date: Sat, 16 May 2015 22:33:35 +0200
Subject: usb: gadget: net2280: check interrupts for all endpoints

USB3380 in enhanced mode has 4 IN and 4 OUT endpoints. Check
interrupts for all of them.

Tested-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Signed-off-by: Mian Yousaf Kaukab <yousaf.kaukab@intel.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/udc/net2280.c | 57 +++++++++++++++++++++++++++++++---------
 include/linux/usb/net2280.h      |  3 +++
 2 files changed, 47 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/udc/net2280.c b/drivers/usb/gadget/udc/net2280.c
index 878a98ed84f5..a78a9c048b87 100644
--- a/drivers/usb/gadget/udc/net2280.c
+++ b/drivers/usb/gadget/udc/net2280.c
@@ -2883,6 +2883,26 @@ next_endpoints3:
 	return;
 }
 
+static void usb338x_handle_ep_intr(struct net2280 *dev, u32 stat0)
+{
+	u32 index;
+	u32 bit;
+
+	for (index = 0; index < ARRAY_SIZE(ep_bit); index++) {
+		bit = BIT(ep_bit[index]);
+
+		if (!stat0)
+			break;
+
+		if (!(stat0 & bit))
+			continue;
+
+		stat0 &= ~bit;
+
+		handle_ep_small(&dev->ep[index]);
+	}
+}
+
 static void handle_stat0_irqs(struct net2280 *dev, u32 stat)
 {
 	struct net2280_ep	*ep;
@@ -3107,20 +3127,31 @@ do_stall:
 #undef	w_length
 
 next_endpoints:
-	/* endpoint data irq ? */
-	scratch = stat & 0x7f;
-	stat &= ~0x7f;
-	for (num = 0; scratch; num++) {
-		u32		t;
-
-		/* do this endpoint's FIFO and queue need tending? */
-		t = BIT(num);
-		if ((scratch & t) == 0)
-			continue;
-		scratch ^= t;
+	if ((dev->quirks & PLX_SUPERSPEED) && dev->enhanced_mode) {
+		u32 mask = (BIT(ENDPOINT_0_INTERRUPT) |
+			USB3380_IRQSTAT0_EP_INTR_MASK_IN |
+			USB3380_IRQSTAT0_EP_INTR_MASK_OUT);
+
+		if (stat & mask) {
+			usb338x_handle_ep_intr(dev, stat & mask);
+			stat &= ~mask;
+		}
+	} else {
+		/* endpoint data irq ? */
+		scratch = stat & 0x7f;
+		stat &= ~0x7f;
+		for (num = 0; scratch; num++) {
+			u32		t;
+
+			/* do this endpoint's FIFO and queue need tending? */
+			t = BIT(num);
+			if ((scratch & t) == 0)
+				continue;
+			scratch ^= t;
 
-		ep = &dev->ep[num];
-		handle_ep_small(ep);
+			ep = &dev->ep[num];
+			handle_ep_small(ep);
+		}
 	}
 
 	if (stat)
diff --git a/include/linux/usb/net2280.h b/include/linux/usb/net2280.h
index 148b8fa5b1a2..725120224472 100644
--- a/include/linux/usb/net2280.h
+++ b/include/linux/usb/net2280.h
@@ -168,6 +168,9 @@ struct net2280_regs {
 #define     ENDPOINT_B_INTERRUPT                                2
 #define     ENDPOINT_A_INTERRUPT                                1
 #define     ENDPOINT_0_INTERRUPT                                0
+#define     USB3380_IRQSTAT0_EP_INTR_MASK_IN (0xF << 17)
+#define     USB3380_IRQSTAT0_EP_INTR_MASK_OUT (0xF << 1)
+
 	u32		irqstat1;
 #define     POWER_STATE_CHANGE_INTERRUPT                        27
 #define     PCI_ARBITER_TIMEOUT_INTERRUPT                       26
-- 
cgit v1.2.3


From c65c4f052bc3b67989bf54914798513685c54988 Mon Sep 17 00:00:00 2001
From: Mian Yousaf Kaukab <yousaf.kaukab@intel.com>
Date: Sat, 16 May 2015 22:33:36 +0200
Subject: usb: gadget: net2280: fix use of GPEP in both directions

USB3380 enhanced mode allows GPEP to be used in both IN and OUT
directions. However, IN and OUT endpoints must use same USB endpoint
address (bEndpointAddress). Fix this by setting the ep_cfg.ep_number
during initialization and keep it in net2280_enable()

Tested-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
Signed-off-by: Mian Yousaf Kaukab <yousaf.kaukab@intel.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/gadget/udc/net2280.c | 41 +++++++++++++++++++++++++++++++---------
 include/linux/usb/usb338x.h      |  4 ++++
 2 files changed, 36 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/udc/net2280.c b/drivers/usb/gadget/udc/net2280.c
index a78a9c048b87..779e6fe0005f 100644
--- a/drivers/usb/gadget/udc/net2280.c
+++ b/drivers/usb/gadget/udc/net2280.c
@@ -144,7 +144,9 @@ net2280_enable(struct usb_ep *_ep, const struct usb_endpoint_descriptor *desc)
 {
 	struct net2280		*dev;
 	struct net2280_ep	*ep;
-	u32			max, tmp;
+	u32			max;
+	u32 tmp = 0;
+	u32 type;
 	unsigned long		flags;
 	static const u32 ep_key[9] = { 1, 0, 1, 0, 1, 1, 0, 1, 0 };
 	int ret = 0;
@@ -200,15 +202,29 @@ net2280_enable(struct usb_ep *_ep, const struct usb_endpoint_descriptor *desc)
 
 	/* set type, direction, address; reset fifo counters */
 	writel(BIT(FIFO_FLUSH), &ep->regs->ep_stat);
-	tmp = (desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK);
-	if (tmp == USB_ENDPOINT_XFER_INT) {
+
+	if ((dev->quirks & PLX_SUPERSPEED) && dev->enhanced_mode) {
+		tmp = readl(&ep->cfg->ep_cfg);
+		/* If USB ep number doesn't match hardware ep number */
+		if ((tmp & 0xf) != usb_endpoint_num(desc)) {
+			ret = -EINVAL;
+			spin_unlock_irqrestore(&dev->lock, flags);
+			goto print_err;
+		}
+		if (ep->is_in)
+			tmp &= ~USB3380_EP_CFG_MASK_IN;
+		else
+			tmp &= ~USB3380_EP_CFG_MASK_OUT;
+	}
+	type = (desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK);
+	if (type == USB_ENDPOINT_XFER_INT) {
 		/* erratum 0105 workaround prevents hs NYET */
 		if (dev->chiprev == 0100 &&
 				dev->gadget.speed == USB_SPEED_HIGH &&
 				!(desc->bEndpointAddress & USB_DIR_IN))
 			writel(BIT(CLEAR_NAK_OUT_PACKETS_MODE),
 				&ep->regs->ep_rsp);
-	} else if (tmp == USB_ENDPOINT_XFER_BULK) {
+	} else if (type == USB_ENDPOINT_XFER_BULK) {
 		/* catch some particularly blatant driver bugs */
 		if ((dev->gadget.speed == USB_SPEED_SUPER && max != 1024) ||
 		    (dev->gadget.speed == USB_SPEED_HIGH && max != 512) ||
@@ -218,10 +234,10 @@ net2280_enable(struct usb_ep *_ep, const struct usb_endpoint_descriptor *desc)
 			goto print_err;
 		}
 	}
-	ep->is_iso = (tmp == USB_ENDPOINT_XFER_ISOC);
+	ep->is_iso = (type == USB_ENDPOINT_XFER_ISOC);
 	/* Enable this endpoint */
 	if (dev->quirks & PLX_LEGACY) {
-		tmp <<= ENDPOINT_TYPE;
+		tmp |= type << ENDPOINT_TYPE;
 		tmp |= desc->bEndpointAddress;
 		/* default full fifo lines */
 		tmp |= (4 << ENDPOINT_BYTE_COUNT);
@@ -230,16 +246,17 @@ net2280_enable(struct usb_ep *_ep, const struct usb_endpoint_descriptor *desc)
 	} else {
 		/* In Legacy mode, only OUT endpoints are used */
 		if (dev->enhanced_mode && ep->is_in) {
-			tmp <<= IN_ENDPOINT_TYPE;
+			tmp |= type << IN_ENDPOINT_TYPE;
 			tmp |= BIT(IN_ENDPOINT_ENABLE);
 		} else {
-			tmp <<= OUT_ENDPOINT_TYPE;
+			tmp |= type << OUT_ENDPOINT_TYPE;
 			tmp |= BIT(OUT_ENDPOINT_ENABLE);
 			tmp |= (ep->is_in << ENDPOINT_DIRECTION);
 		}
 
 		tmp |= (4 << ENDPOINT_BYTE_COUNT);
-		tmp |= usb_endpoint_num(desc);
+		if (!dev->enhanced_mode)
+			tmp |= usb_endpoint_num(desc);
 		tmp |= (ep->ep.maxburst << MAX_BURST_SIZE);
 	}
 
@@ -2074,6 +2091,12 @@ static void usb_reinit_338x(struct net2280 *dev)
 
 		if (dev->enhanced_mode) {
 			ep->cfg = &dev->epregs[ne[i]];
+			/*
+			 * Set USB endpoint number, hardware allows same number
+			 * in both directions.
+			 */
+			 if (i > 0 && i < 5)
+				writel(ne[i], &ep->cfg->ep_cfg);
 			ep->regs = (struct net2280_ep_regs __iomem *)
 				(((void __iomem *)&dev->epregs[ne[i]]) +
 				ep_reg_addr[i]);
diff --git a/include/linux/usb/usb338x.h b/include/linux/usb/usb338x.h
index f92eb635b9d3..11525d8d89a7 100644
--- a/include/linux/usb/usb338x.h
+++ b/include/linux/usb/usb338x.h
@@ -43,6 +43,10 @@
 #define     IN_ENDPOINT_TYPE                    12
 #define     OUT_ENDPOINT_ENABLE                 10
 #define     OUT_ENDPOINT_TYPE                    8
+#define USB3380_EP_CFG_MASK_IN ((0x3 << IN_ENDPOINT_TYPE) | \
+				BIT(IN_ENDPOINT_ENABLE))
+#define USB3380_EP_CFG_MASK_OUT ((0x3 << OUT_ENDPOINT_TYPE) | \
+				BIT(OUT_ENDPOINT_ENABLE))
 
 struct usb338x_usb_ext_regs {
 	u32     usbclass;
-- 
cgit v1.2.3


From e842b84c8e7221c45c8dbd7de09185c6149e1cf9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 23 Mar 2015 09:52:48 +1100
Subject: usb: phy: Add interface to get phy give of device_node.

Split the "get phy from device_node" functionality out of
"get phy by phandle" so it can be used directly.

This is useful when a battery-charger is intimately associated with a
particular phy but handled by a separate driver.  The charger
can find the device_node based on sibling relationships
without the need for a redundant declaration in the devicetree
description.

As a peripheral that gets a phy will often want to register a
notifier block, and de-register it later, that functionality
is included so the de-registration is automatic.

Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/usb/phy/phy.c   | 97 +++++++++++++++++++++++++++++++++++--------------
 include/linux/usb/phy.h |  2 +
 2 files changed, 72 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/phy/phy.c b/drivers/usb/phy/phy.c
index d1cd6b50f520..98f75d2842b7 100644
--- a/drivers/usb/phy/phy.c
+++ b/drivers/usb/phy/phy.c
@@ -22,6 +22,11 @@ static LIST_HEAD(phy_list);
 static LIST_HEAD(phy_bind_list);
 static DEFINE_SPINLOCK(phy_lock);
 
+struct phy_devm {
+	struct usb_phy *phy;
+	struct notifier_block *nb;
+};
+
 static struct usb_phy *__usb_find_phy(struct list_head *list,
 	enum usb_phy_type type)
 {
@@ -79,6 +84,15 @@ static void devm_usb_phy_release(struct device *dev, void *res)
 	usb_put_phy(phy);
 }
 
+static void devm_usb_phy_release2(struct device *dev, void *_res)
+{
+	struct phy_devm *res = _res;
+
+	if (res->nb)
+		usb_unregister_notifier(res->phy, res->nb);
+	usb_put_phy(res->phy);
+}
+
 static int devm_usb_phy_match(struct device *dev, void *res, void *match_data)
 {
 	struct usb_phy **phy = res;
@@ -153,40 +167,30 @@ err0:
 EXPORT_SYMBOL_GPL(usb_get_phy);
 
 /**
- * devm_usb_get_phy_by_phandle - find the USB PHY by phandle
+ * devm_usb_get_phy_by_node - find the USB PHY by device_node
  * @dev - device that requests this phy
- * @phandle - name of the property holding the phy phandle value
- * @index - the index of the phy
+ * @node - the device_node for the phy device.
+ * @nb - a notifier_block to register with the phy.
  *
- * Returns the phy driver associated with the given phandle value,
+ * Returns the phy driver associated with the given device_node,
  * after getting a refcount to it, -ENODEV if there is no such phy or
- * -EPROBE_DEFER if there is a phandle to the phy, but the device is
- * not yet loaded. While at that, it also associates the device with
+ * -EPROBE_DEFER if the device is not yet loaded. While at that, it
+ * also associates the device with
  * the phy using devres. On driver detach, release function is invoked
  * on the devres data, then, devres data is freed.
  *
- * For use by USB host and peripheral drivers.
+ * For use by peripheral drivers for devices related to a phy,
+ * such as a charger.
  */
-struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev,
-	const char *phandle, u8 index)
+struct  usb_phy *devm_usb_get_phy_by_node(struct device *dev,
+					  struct device_node *node,
+					  struct notifier_block *nb)
 {
-	struct usb_phy	*phy = ERR_PTR(-ENOMEM), **ptr;
+	struct usb_phy	*phy = ERR_PTR(-ENOMEM);
+	struct phy_devm	*ptr;
 	unsigned long	flags;
-	struct device_node *node;
 
-	if (!dev->of_node) {
-		dev_dbg(dev, "device does not have a device node entry\n");
-		return ERR_PTR(-EINVAL);
-	}
-
-	node = of_parse_phandle(dev->of_node, phandle, index);
-	if (!node) {
-		dev_dbg(dev, "failed to get %s phandle in %s node\n", phandle,
-			dev->of_node->full_name);
-		return ERR_PTR(-ENODEV);
-	}
-
-	ptr = devres_alloc(devm_usb_phy_release, sizeof(*ptr), GFP_KERNEL);
+	ptr = devres_alloc(devm_usb_phy_release2, sizeof(*ptr), GFP_KERNEL);
 	if (!ptr) {
 		dev_dbg(dev, "failed to allocate memory for devres\n");
 		goto err0;
@@ -205,8 +209,10 @@ struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev,
 		devres_free(ptr);
 		goto err1;
 	}
-
-	*ptr = phy;
+	if (nb)
+		usb_register_notifier(phy, nb);
+	ptr->phy = phy;
+	ptr->nb = nb;
 	devres_add(dev, ptr);
 
 	get_device(phy->dev);
@@ -215,10 +221,47 @@ err1:
 	spin_unlock_irqrestore(&phy_lock, flags);
 
 err0:
-	of_node_put(node);
 
 	return phy;
 }
+EXPORT_SYMBOL_GPL(devm_usb_get_phy_by_node);
+
+/**
+ * devm_usb_get_phy_by_phandle - find the USB PHY by phandle
+ * @dev - device that requests this phy
+ * @phandle - name of the property holding the phy phandle value
+ * @index - the index of the phy
+ *
+ * Returns the phy driver associated with the given phandle value,
+ * after getting a refcount to it, -ENODEV if there is no such phy or
+ * -EPROBE_DEFER if there is a phandle to the phy, but the device is
+ * not yet loaded. While at that, it also associates the device with
+ * the phy using devres. On driver detach, release function is invoked
+ * on the devres data, then, devres data is freed.
+ *
+ * For use by USB host and peripheral drivers.
+ */
+struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev,
+	const char *phandle, u8 index)
+{
+	struct device_node *node;
+	struct usb_phy	*phy;
+
+	if (!dev->of_node) {
+		dev_dbg(dev, "device does not have a device node entry\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	node = of_parse_phandle(dev->of_node, phandle, index);
+	if (!node) {
+		dev_dbg(dev, "failed to get %s phandle in %s node\n", phandle,
+			dev->of_node->full_name);
+		return ERR_PTR(-ENODEV);
+	}
+	phy = devm_usb_get_phy_by_node(dev, node, NULL);
+	of_node_put(node);
+	return phy;
+}
 EXPORT_SYMBOL_GPL(devm_usb_get_phy_by_phandle);
 
 /**
diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h
index bc91b5d380fd..8ed1e29ef329 100644
--- a/include/linux/usb/phy.h
+++ b/include/linux/usb/phy.h
@@ -205,6 +205,8 @@ extern struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index);
 extern struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index);
 extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev,
 	const char *phandle, u8 index);
+extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev,
+	struct device_node *node, struct notifier_block *nb);
 extern void usb_put_phy(struct usb_phy *);
 extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x);
 extern int usb_bind_phy(const char *dev_name, u8 index,
-- 
cgit v1.2.3


From e4b88e19897f1039fd83f1630517becafc0dd163 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 22 May 2015 13:44:33 -0700
Subject: Input: stmpe-ts - enforce device tree only mode

The STMPE MFD is only used with device tree configured systems (and STMPE
MFD core depends on OF), so force the configuration to come from device
tree only.

Tested-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Marek Vasut <marex@denx.de>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/Kconfig    |  1 +
 drivers/input/touchscreen/stmpe-ts.c | 29 ++++--------------------
 include/linux/mfd/stmpe.h            | 44 ------------------------------------
 3 files changed, 6 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index 547f67d65372..a398d636ca39 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -958,6 +958,7 @@ config TOUCHSCREEN_ST1232
 config TOUCHSCREEN_STMPE
 	tristate "STMicroelectronics STMPE touchscreens"
 	depends on MFD_STMPE
+	depends on (OF || COMPILE_TEST)
 	help
 	  Say Y here if you want support for STMicroelectronics
 	  STMPE touchscreen controllers.
diff --git a/drivers/input/touchscreen/stmpe-ts.c b/drivers/input/touchscreen/stmpe-ts.c
index e4977c6250f3..e414d43e5159 100644
--- a/drivers/input/touchscreen/stmpe-ts.c
+++ b/drivers/input/touchscreen/stmpe-ts.c
@@ -267,27 +267,10 @@ static void stmpe_ts_close(struct input_dev *dev)
 static void stmpe_ts_get_platform_info(struct platform_device *pdev,
 					struct stmpe_touch *ts)
 {
-	struct stmpe *stmpe = dev_get_drvdata(pdev->dev.parent);
 	struct device_node *np = pdev->dev.of_node;
-	struct stmpe_ts_platform_data *ts_pdata = NULL;
-
-	ts->stmpe = stmpe;
-
-	if (stmpe->pdata && stmpe->pdata->ts) {
-		ts_pdata = stmpe->pdata->ts;
-
-		ts->sample_time = ts_pdata->sample_time;
-		ts->mod_12b = ts_pdata->mod_12b;
-		ts->ref_sel = ts_pdata->ref_sel;
-		ts->adc_freq = ts_pdata->adc_freq;
-		ts->ave_ctrl = ts_pdata->ave_ctrl;
-		ts->touch_det_delay = ts_pdata->touch_det_delay;
-		ts->settling = ts_pdata->settling;
-		ts->fraction_z = ts_pdata->fraction_z;
-		ts->i_drive = ts_pdata->i_drive;
-	} else if (np) {
-		u32 val;
+	u32 val;
 
+	if (np) {
 		if (!of_property_read_u32(np, "st,sample-time", &val))
 			ts->sample_time = val;
 		if (!of_property_read_u32(np, "st,mod-12b", &val))
@@ -311,6 +294,7 @@ static void stmpe_ts_get_platform_info(struct platform_device *pdev,
 
 static int stmpe_input_probe(struct platform_device *pdev)
 {
+	struct stmpe *stmpe = dev_get_drvdata(pdev->dev.parent);
 	struct stmpe_touch *ts;
 	struct input_dev *idev;
 	int error;
@@ -329,6 +313,7 @@ static int stmpe_input_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	platform_set_drvdata(pdev, ts);
+	ts->stmpe = stmpe;
 	ts->idev = idev;
 	ts->dev = &pdev->dev;
 
@@ -351,14 +336,13 @@ static int stmpe_input_probe(struct platform_device *pdev)
 	idev->name = STMPE_TS_NAME;
 	idev->phys = STMPE_TS_NAME"/input0";
 	idev->id.bustype = BUS_I2C;
-	idev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS);
-	idev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH);
 
 	idev->open = stmpe_ts_open;
 	idev->close = stmpe_ts_close;
 
 	input_set_drvdata(idev, ts);
 
+	input_set_capability(idev, EV_KEY, BTN_TOUCH);
 	input_set_abs_params(idev, ABS_X, 0, XY_MASK, 0, 0);
 	input_set_abs_params(idev, ABS_Y, 0, XY_MASK, 0, 0);
 	input_set_abs_params(idev, ABS_PRESSURE, 0x0, 0xff, 0, 0);
@@ -390,15 +374,12 @@ static struct platform_driver stmpe_ts_driver = {
 };
 module_platform_driver(stmpe_ts_driver);
 
-#ifdef CONFIG_OF
 static const struct of_device_id stmpe_ts_ids[] = {
 	{ .compatible = "st,stmpe-ts", },
 	{ },
 };
 MODULE_DEVICE_TABLE(of, stmpe_ts_ids);
-#endif
 
 MODULE_AUTHOR("Luotao Fu <l.fu@pengutronix.de>");
 MODULE_DESCRIPTION("STMPEXXX touchscreen driver");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:" STMPE_TS_NAME);
diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h
index c9d869027300..cb83883918a7 100644
--- a/include/linux/mfd/stmpe.h
+++ b/include/linux/mfd/stmpe.h
@@ -117,47 +117,6 @@ extern int stmpe_disable(struct stmpe *stmpe, unsigned int blocks);
 
 #define STMPE_GPIO_NOREQ_811_TOUCH	(0xf0)
 
-/**
- * struct stmpe_ts_platform_data - stmpe811 touch screen controller platform
- * data
- * @sample_time: ADC converstion time in number of clock.
- * (0 -> 36 clocks, 1 -> 44 clocks, 2 -> 56 clocks, 3 -> 64 clocks,
- * 4 -> 80 clocks, 5 -> 96 clocks, 6 -> 144 clocks),
- * recommended is 4.
- * @mod_12b: ADC Bit mode (0 -> 10bit ADC, 1 -> 12bit ADC)
- * @ref_sel: ADC reference source
- * (0 -> internal reference, 1 -> external reference)
- * @adc_freq: ADC Clock speed
- * (0 -> 1.625 MHz, 1 -> 3.25 MHz, 2 || 3 -> 6.5 MHz)
- * @ave_ctrl: Sample average control
- * (0 -> 1 sample, 1 -> 2 samples, 2 -> 4 samples, 3 -> 8 samples)
- * @touch_det_delay: Touch detect interrupt delay
- * (0 -> 10 us, 1 -> 50 us, 2 -> 100 us, 3 -> 500 us,
- * 4-> 1 ms, 5 -> 5 ms, 6 -> 10 ms, 7 -> 50 ms)
- * recommended is 3
- * @settling: Panel driver settling time
- * (0 -> 10 us, 1 -> 100 us, 2 -> 500 us, 3 -> 1 ms,
- * 4 -> 5 ms, 5 -> 10 ms, 6 for 50 ms, 7 -> 100 ms)
- * recommended is 2
- * @fraction_z: Length of the fractional part in z
- * (fraction_z ([0..7]) = Count of the fractional part)
- * recommended is 7
- * @i_drive: current limit value of the touchscreen drivers
- * (0 -> 20 mA typical 35 mA max, 1 -> 50 mA typical 80 mA max)
- *
- * */
-struct stmpe_ts_platform_data {
-       u8 sample_time;
-       u8 mod_12b;
-       u8 ref_sel;
-       u8 adc_freq;
-       u8 ave_ctrl;
-       u8 touch_det_delay;
-       u8 settling;
-       u8 fraction_z;
-       u8 i_drive;
-};
-
 /**
  * struct stmpe_platform_data - STMPE platform data
  * @id: device id to distinguish between multiple STMPEs on the same board
@@ -168,7 +127,6 @@ struct stmpe_ts_platform_data {
  * @irq_over_gpio: true if gpio is used to get irq
  * @irq_gpio: gpio number over which irq will be requested (significant only if
  *	      irq_over_gpio is true)
- * @ts: touchscreen-specific platform data
  */
 struct stmpe_platform_data {
 	int id;
@@ -178,8 +136,6 @@ struct stmpe_platform_data {
 	bool irq_over_gpio;
 	int irq_gpio;
 	int autosleep_timeout;
-
-	struct stmpe_ts_platform_data *ts;
 };
 
 #endif
-- 
cgit v1.2.3


From 7d7efec368d537226142cbe559f45797f18672f9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 13 May 2015 16:35:16 -0400
Subject: sched, cgroup: reorganize threadgroup locking

threadgroup_change_begin/end() are used to mark the beginning and end
of threadgroup modifying operations to allow code paths which require
a threadgroup to stay stable across blocking operations to synchronize
against those sections using threadgroup_lock/unlock().

It's currently implemented as a general mechanism in sched.h using
per-signal_struct rwsem; however, this never grew non-cgroup use cases
and becomes noop if !CONFIG_CGROUPS.  It turns out that cgroups is
gonna be better served with a different sycnrhonization scheme and is
a bit silly to keep cgroups specific details as a general mechanism.

What's general here is identifying the places where threadgroups are
modified.  This patch restructures threadgroup locking so that
threadgroup_change_begin/end() become a place where subsystems which
need to sycnhronize against threadgroup changes can hook into.

cgroup_threadgroup_change_begin/end() which operate on the
per-signal_struct rwsem are created and threadgroup_lock/unlock() are
moved to cgroup.c and made static.

This is pure reorganization which doesn't cause any functional
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/cgroup-defs.h | 10 +++++++++
 include/linux/sched.h       | 53 +++++++++++++++------------------------------
 kernel/cgroup.c             | 42 +++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 55f3120fb952..1b8c93806dbd 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -14,6 +14,7 @@
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
 
 #ifdef CONFIG_CGROUPS
@@ -460,5 +461,14 @@ struct cgroup_subsys {
 	unsigned int depends_on;
 };
 
+void cgroup_threadgroup_change_begin(struct task_struct *tsk);
+void cgroup_threadgroup_change_end(struct task_struct *tsk);
+
+#else	/* CONFIG_CGROUPS */
+
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
+
 #endif	/* CONFIG_CGROUPS */
+
 #endif	/* _LINUX_CGROUP_DEFS_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8222ae40ecb0..5ee290003470 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -58,6 +58,7 @@ struct sched_param {
 #include <linux/uidgid.h>
 #include <linux/gfp.h>
 #include <linux/magic.h>
+#include <linux/cgroup-defs.h>
 
 #include <asm/processor.h>
 
@@ -2648,53 +2649,33 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
 	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
 }
 
-#ifdef CONFIG_CGROUPS
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
-	down_read(&tsk->signal->group_rwsem);
-}
-static inline void threadgroup_change_end(struct task_struct *tsk)
-{
-	up_read(&tsk->signal->group_rwsem);
-}
-
 /**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid.  This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
+ * threadgroup_change_begin - mark the beginning of changes to a threadgroup
+ * @tsk: task causing the changes
  *
- * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
- * synchronization.  While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
+ * All operations which modify a threadgroup - a new thread joining the
+ * group, death of a member thread (the assertion of PF_EXITING) and
+ * exec(2) dethreading the process and replacing the leader - are wrapped
+ * by threadgroup_change_{begin|end}().  This is to provide a place which
+ * subsystems needing threadgroup stability can hook into for
+ * synchronization.
  */
-static inline void threadgroup_lock(struct task_struct *tsk)
+static inline void threadgroup_change_begin(struct task_struct *tsk)
 {
-	down_write(&tsk->signal->group_rwsem);
+	might_sleep();
+	cgroup_threadgroup_change_begin(tsk);
 }
 
 /**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
+ * threadgroup_change_end - mark the end of changes to a threadgroup
+ * @tsk: task causing the changes
  *
- * Reverse threadgroup_lock().
+ * See threadgroup_change_begin().
  */
-static inline void threadgroup_unlock(struct task_struct *tsk)
+static inline void threadgroup_change_end(struct task_struct *tsk)
 {
-	up_write(&tsk->signal->group_rwsem);
+	cgroup_threadgroup_change_end(tsk);
 }
-#else
-static inline void threadgroup_change_begin(struct task_struct *tsk) {}
-static inline void threadgroup_change_end(struct task_struct *tsk) {}
-static inline void threadgroup_lock(struct task_struct *tsk) {}
-static inline void threadgroup_unlock(struct task_struct *tsk) {}
-#endif
 
 #ifndef __HAVE_THREAD_FUNCTIONS
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b91177f93416..980b1f52f39f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -848,6 +848,48 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
+void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+	down_read(&tsk->signal->group_rwsem);
+}
+
+void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+	up_read(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_lock - lock threadgroup
+ * @tsk: member task of the threadgroup to lock
+ *
+ * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
+ * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
+ * change ->group_leader/pid.  This is useful for cases where the threadgroup
+ * needs to stay stable across blockable operations.
+ *
+ * fork and exit explicitly call threadgroup_change_{begin|end}() for
+ * synchronization.  While held, no new task will be added to threadgroup
+ * and no existing live task will have its PF_EXITING set.
+ *
+ * de_thread() does threadgroup_change_{begin|end}() when a non-leader
+ * sub-thread becomes a new leader.
+ */
+static void threadgroup_lock(struct task_struct *tsk)
+{
+	down_write(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_unlock - unlock threadgroup
+ * @tsk: member task of the threadgroup to unlock
+ *
+ * Reverse threadgroup_lock().
+ */
+static inline void threadgroup_unlock(struct task_struct *tsk)
+{
+	up_write(&tsk->signal->group_rwsem);
+}
+
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
-- 
cgit v1.2.3


From d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 13 May 2015 16:35:17 -0400
Subject: sched, cgroup: replace signal_struct->group_rwsem with a global
 percpu_rwsem

The cgroup side of threadgroup locking uses signal_struct->group_rwsem
to synchronize against threadgroup changes.  This per-process rwsem
adds small overhead to thread creation, exit and exec paths, forces
cgroup code paths to do lock-verify-unlock-retry dance in a couple
places and makes it impossible to atomically perform operations across
multiple processes.

This patch replaces signal_struct->group_rwsem with a global
percpu_rwsem cgroup_threadgroup_rwsem which is cheaper on the reader
side and contained in cgroups proper.  This patch converts one-to-one.

This does make writer side heavier and lower the granularity; however,
cgroup process migration is a fairly cold path, we do want to optimize
thread operations over it and cgroup migration operations don't take
enough time for the lower granularity to matter.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/cgroup-defs.h | 27 ++++++++++++++--
 include/linux/init_task.h   |  8 -----
 include/linux/sched.h       | 12 -------
 init/Kconfig                |  1 +
 kernel/cgroup.c             | 77 ++++++++++++---------------------------------
 kernel/fork.c               |  4 ---
 6 files changed, 46 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b8c93806dbd..7d83d7f73420 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -461,8 +461,31 @@ struct cgroup_subsys {
 	unsigned int depends_on;
 };
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk);
-void cgroup_threadgroup_change_end(struct task_struct *tsk);
+extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+/**
+ * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_begin() and allows cgroup operations to
+ * synchronize against threadgroup changes using a percpu_rw_semaphore.
+ */
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+	percpu_down_read(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_end().  Counterpart of
+ * cgroup_threadcgroup_change_begin().
+ */
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+	percpu_up_read(&cgroup_threadgroup_rwsem);
+}
 
 #else	/* CONFIG_CGROUPS */
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d22312b31..0cc0bbf20022 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 
-#ifdef CONFIG_CGROUPS
-#define INIT_GROUP_RWSEM(sig)						\
-	.group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
-#else
-#define INIT_GROUP_RWSEM(sig)
-#endif
-
 #ifdef CONFIG_CPUSETS
 #define INIT_CPUSET_SEQ(tsk)							\
 	.mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -56,7 +49,6 @@ extern struct fs_struct init_fs;
 	},								\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
-	INIT_GROUP_RWSEM(sig)						\
 }
 
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ee290003470..add524a910bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -743,18 +743,6 @@ struct signal_struct {
 	unsigned audit_tty_log_passwd;
 	struct tty_audit_buf *tty_audit_buf;
 #endif
-#ifdef CONFIG_CGROUPS
-	/*
-	 * group_rwsem prevents new tasks from entering the threadgroup and
-	 * member tasks from exiting,a more specifically, setting of
-	 * PF_EXITING.  fork and exit paths are protected with this rwsem
-	 * using threadgroup_change_begin/end().  Users which require
-	 * threadgroup to remain stable should use threadgroup_[un]lock()
-	 * which also takes care of exec path.  Currently, cgroup is the
-	 * only user.
-	 */
-	struct rw_semaphore group_rwsem;
-#endif
 
 	oom_flags_t oom_flags;
 	short oom_score_adj;		/* OOM kill score adjustment */
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec60232..b9b824bf8f6b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -938,6 +938,7 @@ config NUMA_BALANCING_DEFAULT_ENABLED
 menuconfig CGROUPS
 	bool "Control Group support"
 	select KERNFS
+	select PERCPU_RWSEM
 	help
 	  This option adds support for grouping sets of processes together, for
 	  use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 980b1f52f39f..77578a169b8c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
 			   lockdep_is_held(&cgroup_mutex),		\
@@ -848,48 +851,6 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk)
-{
-	down_read(&tsk->signal->group_rwsem);
-}
-
-void cgroup_threadgroup_change_end(struct task_struct *tsk)
-{
-	up_read(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid.  This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit explicitly call threadgroup_change_{begin|end}() for
- * synchronization.  While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static void threadgroup_lock(struct task_struct *tsk)
-{
-	down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
-	up_write(&tsk->signal->group_rwsem);
-}
-
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -2095,9 +2056,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	lockdep_assert_held(&css_set_rwsem);
 
 	/*
-	 * We are synchronized through threadgroup_lock() against PF_EXITING
-	 * setting such that we can't race against cgroup_exit() changing the
-	 * css_set to init_css_set and dropping the old one.
+	 * We are synchronized through cgroup_threadgroup_rwsem against
+	 * PF_EXITING setting such that we can't race against cgroup_exit()
+	 * changing the css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
@@ -2154,10 +2115,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process.  Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process.  Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2260,7 +2222,7 @@ err:
  * @threadgroup: whether @leader points to the whole process or a single task
  *
  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding threadgroup_lock of @leader.  The
+ * process, the caller must be holding cgroup_threadgroup_rwsem.  The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
@@ -2388,7 +2350,7 @@ out_release_tset:
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2481,7 +2443,7 @@ retry_find_task:
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	threadgroup_lock(tsk);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	if (threadgroup) {
 		if (!thread_group_leader(tsk)) {
 			/*
@@ -2491,7 +2453,7 @@ retry_find_task:
 			 * try again; this is
 			 * "double-double-toil-and-trouble-check locking".
 			 */
-			threadgroup_unlock(tsk);
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
@@ -2499,7 +2461,7 @@ retry_find_task:
 
 	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	threadgroup_unlock(tsk);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 
 	put_task_struct(tsk);
 out_unlock_cgroup:
@@ -2704,17 +2666,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 				goto out_finish;
 			last_task = task;
 
-			threadgroup_lock(task);
+			percpu_down_write(&cgroup_threadgroup_rwsem);
 			/* raced against de_thread() from another thread? */
 			if (!thread_group_leader(task)) {
-				threadgroup_unlock(task);
+				percpu_up_write(&cgroup_threadgroup_rwsem);
 				put_task_struct(task);
 				continue;
 			}
 
 			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
-			threadgroup_unlock(task);
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(task);
 
 			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -5032,6 +4994,7 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int ssid, err;
 
+	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..9531275e12a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1144,10 +1144,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
-#ifdef CONFIG_CGROUPS
-	init_rwsem(&sig->group_rwsem);
-#endif
-
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
-- 
cgit v1.2.3


From e463d88c36d42211aa72ed76d32fb8bf37820ef1 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 26 May 2015 12:19:58 -0700
Subject: net: phy: Add phy_interface_is_rgmii helper

RGMII interfaces come in 4 different flavors that the PHY library needs
to care about: regular RGMII (no delays), RGMII with either RX or TX
delay, and both. In order to avoid errors of checking only for one type
of RGMII interface and miss the 3 others, introduce a convenience
function which tests for all values.

Suggested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 701c7a3946e0..a26c3f84b8dd 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -677,6 +677,17 @@ static inline bool phy_is_internal(struct phy_device *phydev)
 	return phydev->is_internal;
 }
 
+/**
+ * phy_interface_is_rgmii - Convenience function for testing if a PHY interface
+ * is RGMII (all variants)
+ * @phydev: the phy_device struct
+ */
+static inline bool phy_interface_is_rgmii(struct phy_device *phydev)
+{
+	return phydev->interface >= PHY_INTERFACE_MODE_RGMII &&
+		phydev->interface <= PHY_INTERFACE_MODE_RGMII_TXID;
+}
+
 /**
  * phy_write_mmd - Convenience function for writing a register
  * on an MMD on a given PHY.
-- 
cgit v1.2.3


From b3df4ec4424f27e55d754cfe586195fecca1c4e4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 19 May 2015 00:00:51 +0000
Subject: perf/x86/intel/cqm: Use proper data types

'int' is really not a proper data type for an MSR. Use u32 to make it
clear that we are dealing with a 32-bit unsigned hardware value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Matt Fleming <matt.fleming@intel.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: Will Auld <will.auld@intel.com>
Link: http://lkml.kernel.org/r/20150518235149.919350144@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 4 ++--
 include/linux/perf_event.h                 | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 572582e2143e..3e9a7fbfce58 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -18,7 +18,7 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */
 
 struct intel_cqm_state {
 	raw_spinlock_t		lock;
-	int			rmid;
+	u32			rmid;
 	int			cnt;
 };
 
@@ -962,7 +962,7 @@ out:
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-	unsigned int rmid = event->hw.cqm_rmid;
+	u32 rmid = event->hw.cqm_rmid;
 	unsigned long flags;
 
 	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 248f7829ce41..06580028cee6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -120,7 +120,7 @@ struct hw_perf_event {
 		};
 		struct { /* intel_cqm */
 			int			cqm_state;
-			int			cqm_rmid;
+			u32			cqm_rmid;
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
-- 
cgit v1.2.3


From 16b369a91d0dd80be214b7f7801fbc51875454cc Mon Sep 17 00:00:00 2001
From: Stephan Mueller <smueller@chronox.de>
Date: Mon, 25 May 2015 15:08:47 +0200
Subject: random: Blocking API for accessing nonblocking_pool

The added API calls provide a synchronous function call
get_blocking_random_bytes where the caller is blocked until
the nonblocking_pool is initialized.

CC: Andreas Steffen <andreas.steffen@strongswan.org>
CC: Theodore Ts'o <tytso@mit.edu>
CC: Sandy Harris <sandyinchina@gmail.com>
Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/random.c  | 12 ++++++++++++
 include/linux/random.h |  1 +
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 8b8c46b5fd5c..159d0700f7d8 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1244,6 +1244,18 @@ void get_random_bytes(void *buf, int nbytes)
 }
 EXPORT_SYMBOL(get_random_bytes);
 
+/*
+ * Equivalent function to get_random_bytes with the difference that this
+ * function blocks the request until the nonblocking_pool is initialized.
+ */
+void get_blocking_random_bytes(void *buf, int nbytes)
+{
+	if (unlikely(nonblocking_pool.initialized == 0))
+		wait_event(urandom_init_wait, nonblocking_pool.initialized);
+	extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
+}
+EXPORT_SYMBOL(get_blocking_random_bytes);
+
 /*
  * This function will use the architecture-specific hardware random
  * number generator if it is available.  The arch-specific hw RNG will
diff --git a/include/linux/random.h b/include/linux/random.h
index b05856e16b75..796267d56901 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -14,6 +14,7 @@ extern void add_input_randomness(unsigned int type, unsigned int code,
 extern void add_interrupt_randomness(int irq, int irq_flags);
 
 extern void get_random_bytes(void *buf, int nbytes);
+extern void get_blocking_random_bytes(void *buf, int nbytes);
 extern void get_random_bytes_arch(void *buf, int nbytes);
 void generate_random_uuid(unsigned char uuid_out[16]);
 extern int random_int_secret_init(void);
-- 
cgit v1.2.3


From 7d010fdf299929f9583ce5e17da629dcd83c36ef Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Tue, 26 May 2015 10:28:13 +0200
Subject: x86/mm/mtrr: Avoid #ifdeffery with phys_wc_to_mtrr_index()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is only one user but since we're going to bury MTRR next
out of access to drivers, expose this last piece of API to
drivers in a general fashion only needing io.h for access to
helpers.

Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Abhilash Kesavan <a.kesavan@samsung.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Antonino Daplas <adaplas@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Cristian Stoica <cristian.stoica@freescale.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Thierry Reding <treding@nvidia.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomi Valkeinen <tomi.valkeinen@ti.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Ville Syrjälä <syrjala@sci.fi>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will.deacon@arm.com>
Cc: dri-devel@lists.freedesktop.org
Link: http://lkml.kernel.org/r/1429722736-4473-1-git-send-email-mcgrof@do-not-panic.com
Link: http://lkml.kernel.org/r/1432628901-18044-11-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/io.h       |  3 +++
 arch/x86/include/asm/mtrr.h     |  5 -----
 arch/x86/kernel/cpu/mtrr/main.c |  6 +++---
 drivers/gpu/drm/drm_ioctl.c     | 14 +-------------
 include/linux/io.h              |  7 +++++++
 5 files changed, 14 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 4afc05ffa566..a2b97404922d 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -339,6 +339,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 #define IO_SPACE_LIMIT 0xffff
 
 #ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
 extern int __must_check arch_phys_wc_add(unsigned long base,
 					 unsigned long size);
 extern void arch_phys_wc_del(int handle);
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index a31759e1edd9..b94f6f64e23d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -48,7 +48,6 @@ extern void mtrr_aps_init(void);
 extern void mtrr_bp_restore(void);
 extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
 extern int amd_special_default_mtrr(void);
-extern int phys_wc_to_mtrr_index(int handle);
 #  else
 static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
 {
@@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
 static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
 {
 }
-static inline int phys_wc_to_mtrr_index(int handle)
-{
-	return -1;
-}
 
 #define mtrr_ap_init() do {} while (0)
 #define mtrr_bp_init() do {} while (0)
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 04aceb7e6443..81baf5fee0e1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -580,7 +580,7 @@ void arch_phys_wc_del(int handle)
 EXPORT_SYMBOL(arch_phys_wc_del);
 
 /*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
  * @handle: Return value from arch_phys_wc_add
  *
  * This will turn the return value from arch_phys_wc_add into an mtrr
@@ -590,14 +590,14 @@ EXPORT_SYMBOL(arch_phys_wc_del);
  * in printk line.  Alas there is an illegitimate use in some ancient
  * drm ioctls.
  */
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
 {
 	if (handle < MTRR_TO_PHYS_WC_OFFSET)
 		return -1;
 	else
 		return handle - MTRR_TO_PHYS_WC_OFFSET;
 }
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
 
 /*
  * HACK ALERT!
diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index 266dcd6cdf3b..0a957828b3bd 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -36,9 +36,6 @@
 
 #include <linux/pci.h>
 #include <linux/export.h>
-#ifdef CONFIG_X86
-#include <asm/mtrr.h>
-#endif
 
 static int drm_version(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
@@ -197,16 +194,7 @@ static int drm_getmap(struct drm_device *dev, void *data,
 	map->type = r_list->map->type;
 	map->flags = r_list->map->flags;
 	map->handle = (void *)(unsigned long) r_list->user_token;
-
-#ifdef CONFIG_X86
-	/*
-	 * There appears to be exactly one user of the mtrr index: dritest.
-	 * It's easy enough to keep it working on non-PAT systems.
-	 */
-	map->mtrr = phys_wc_to_mtrr_index(r_list->map->mtrr);
-#else
-	map->mtrr = -1;
-#endif
+	map->mtrr = arch_phys_wc_index(r_list->map->mtrr);
 
 	mutex_unlock(&dev->struct_mutex);
 
diff --git a/include/linux/io.h b/include/linux/io.h
index 986f2bffea1e..04cce4da3685 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -111,6 +111,13 @@ static inline void arch_phys_wc_del(int handle)
 }
 
 #define arch_phys_wc_add arch_phys_wc_add
+#ifndef arch_phys_wc_index
+static inline int arch_phys_wc_index(int handle)
+{
+	return -1;
+}
+#define arch_phys_wc_index arch_phys_wc_index
+#endif
 #endif
 
 #endif /* _LINUX_IO_H */
-- 
cgit v1.2.3


From 06931e62246844c73fba24d7aeb4a5dc897a2739 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Date: Tue, 26 May 2015 15:11:28 +0200
Subject: sched/topology: Rename topology_thread_cpumask() to
 topology_sibling_cpumask()

Rename topology_thread_cpumask() to topology_sibling_cpumask()
for more consistency with scheduler code.

Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Benoit Cousson <bcousson@baylibre.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Jean Delvare <jdelvare@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Drokin <oleg.drokin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/1432645896-12588-2-git-send-email-bgolaszewski@baylibre.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/cputopology.txt                          | 2 +-
 arch/arm/include/asm/topology.h                        | 2 +-
 arch/arm64/include/asm/topology.h                      | 2 +-
 arch/ia64/include/asm/topology.h                       | 2 +-
 arch/mips/include/asm/topology.h                       | 2 +-
 arch/powerpc/include/asm/topology.h                    | 2 +-
 arch/powerpc/mm/tlb_nohash.c                           | 2 +-
 arch/s390/include/asm/topology.h                       | 3 ++-
 arch/sparc/include/asm/topology_64.h                   | 2 +-
 arch/tile/include/asm/topology.h                       | 2 +-
 arch/x86/include/asm/topology.h                        | 2 +-
 arch/x86/kernel/cpu/perf_event_intel.c                 | 6 +++---
 block/blk-mq-cpumap.c                                  | 2 +-
 drivers/acpi/acpi_pad.c                                | 2 +-
 drivers/base/topology.c                                | 2 +-
 drivers/net/ethernet/sfc/efx.c                         | 2 +-
 drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c | 2 +-
 drivers/staging/lustre/lustre/ptlrpc/service.c         | 4 ++--
 include/linux/topology.h                               | 6 +++---
 lib/cpu_rmap.c                                         | 2 +-
 20 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 0aad6deb2d96..428a961ff063 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -44,7 +44,7 @@ these macros in include/asm-XXX/topology.h:
 #define topology_physical_package_id(cpu)
 #define topology_core_id(cpu)
 #define topology_book_id(cpu)
-#define topology_thread_cpumask(cpu)
+#define topology_sibling_cpumask(cpu)
 #define topology_core_cpumask(cpu)
 #define topology_book_cpumask(cpu)
 
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 2fe85fff5cca..370f7a732900 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].socket_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 7ebcd31ce51c..225ec3524fbf 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].cluster_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 6437ca21f61b..3ad8f6988363 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -53,7 +53,7 @@ void build_cpu_to_node_map(void);
 #define topology_physical_package_id(cpu)	(cpu_data(cpu)->socket_id)
 #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #endif
 
 extern void arch_fix_phys_package_id(int num, u32 slot);
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h
index 3e307ec2afba..7afda4150a59 100644
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -15,7 +15,7 @@
 #define topology_physical_package_id(cpu)	(cpu_data[cpu].package)
 #define topology_core_id(cpu)			(cpu_data[cpu].core)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&cpu_sibling_map[cpu])
+#define topology_sibling_cpumask(cpu)		(&cpu_sibling_map[cpu])
 #endif
 
 #endif /* __ASM_TOPOLOGY_H */
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 5f1048eaa5b6..8b3b46b7b0f2 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -87,7 +87,7 @@ static inline int prrn_is_enabled(void)
 #include <asm/smp.h>
 
 #define topology_physical_package_id(cpu)	(cpu_to_chip_id(cpu))
-#define topology_thread_cpumask(cpu)	(per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)	(per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)	(per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)		(cpu_to_core_id(cpu))
 #endif
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index cbd3d069897f..723a099f6be3 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -217,7 +217,7 @@ static DEFINE_RAW_SPINLOCK(tlbivax_lock);
 static int mm_is_core_local(struct mm_struct *mm)
 {
 	return cpumask_subset(mm_cpumask(mm),
-			      topology_thread_cpumask(smp_processor_id()));
+			      topology_sibling_cpumask(smp_processor_id()));
 }
 
 struct tlb_flush_param {
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index b1453a2ae1ca..4990f6c66288 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -22,7 +22,8 @@ DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
 
 #define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id)
 #define topology_thread_id(cpu)		  (per_cpu(cpu_topology, cpu).thread_id)
-#define topology_thread_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).thread_mask)
+#define topology_sibling_cpumask(cpu) \
+		(&per_cpu(cpu_topology, cpu).thread_mask)
 #define topology_core_id(cpu)		  (per_cpu(cpu_topology, cpu).core_id)
 #define topology_core_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).core_mask)
 #define topology_book_id(cpu)		  (per_cpu(cpu_topology, cpu).book_id)
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index ed8f071132e4..9a928fcb7a9b 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -41,7 +41,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).proc_id)
 #define topology_core_id(cpu)			(cpu_data(cpu).core_id)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #endif /* CONFIG_SMP */
 
 extern cpumask_t cpu_core_map[NR_CPUS];
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 938311844233..76b0d0ebb244 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -55,7 +55,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
 #define topology_physical_package_id(cpu)       ((void)(cpu), 0)
 #define topology_core_id(cpu)                   (cpu)
 #define topology_core_cpumask(cpu)              ((void)(cpu), cpu_online_mask)
-#define topology_thread_cpumask(cpu)            cpumask_of(cpu)
+#define topology_sibling_cpumask(cpu)           cpumask_of(cpu)
 #endif
 
 #endif /* _ASM_TILE_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 0e8f04f2c26f..5a77593fdace 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #ifdef ENABLE_TOPO_DEFINES
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
-#define topology_thread_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3998131d1a68..324817735771 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2621,7 +2621,7 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
 		void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
 
-		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+		for_each_cpu(i, topology_sibling_cpumask(cpu)) {
 			struct intel_shared_regs *pc;
 
 			pc = per_cpu(cpu_hw_events, i).shared_regs;
@@ -2641,7 +2641,7 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
 		int h = x86_pmu.num_counters >> 1;
 
-		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+		for_each_cpu(i, topology_sibling_cpumask(cpu)) {
 			struct intel_excl_cntrs *c;
 
 			c = per_cpu(cpu_hw_events, i).excl_cntrs;
@@ -3403,7 +3403,7 @@ static __init int fixup_ht_bug(void)
 	if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
 		return 0;
 
-	w = cpumask_weight(topology_thread_cpumask(cpu));
+	w = cpumask_weight(topology_sibling_cpumask(cpu));
 	if (w > 1) {
 		pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
 		return 0;
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5f13f4d0bcce..1e28ddb656b8 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu)
 {
 	unsigned int ret;
 
-	ret = cpumask_first(topology_thread_cpumask(cpu));
+	ret = cpumask_first(topology_sibling_cpumask(cpu));
 	if (ret < nr_cpu_ids)
 		return ret;
 
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 6bc9cbc01ad6..00b39802d7ec 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -105,7 +105,7 @@ static void round_robin_cpu(unsigned int tsk_index)
 	mutex_lock(&round_robin_lock);
 	cpumask_clear(tmp);
 	for_each_cpu(cpu, pad_busy_cpus)
-		cpumask_or(tmp, tmp, topology_thread_cpumask(cpu));
+		cpumask_or(tmp, tmp, topology_sibling_cpumask(cpu));
 	cpumask_andnot(tmp, cpu_online_mask, tmp);
 	/* avoid HT sibilings if possible */
 	if (cpumask_empty(tmp))
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 6491f45200a7..8b7d7f8e5851 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -61,7 +61,7 @@ static DEVICE_ATTR_RO(physical_package_id);
 define_id_show_func(core_id);
 static DEVICE_ATTR_RO(core_id);
 
-define_siblings_show_func(thread_siblings, thread_cpumask);
+define_siblings_show_func(thread_siblings, sibling_cpumask);
 static DEVICE_ATTR_RO(thread_siblings);
 static DEVICE_ATTR_RO(thread_siblings_list);
 
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 4b00545a3ace..65944dd8bf6b 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -1304,7 +1304,7 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
 			if (!cpumask_test_cpu(cpu, thread_mask)) {
 				++count;
 				cpumask_or(thread_mask, thread_mask,
-					   topology_thread_cpumask(cpu));
+					   topology_sibling_cpumask(cpu));
 			}
 		}
 
diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
index cc3ab351943e..f9262243f935 100644
--- a/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
+++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-cpu.c
@@ -87,7 +87,7 @@ static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
 /* return cpumask of HTs in the same core */
 static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
 {
-	cpumask_copy(mask, topology_thread_cpumask(cpu));
+	cpumask_copy(mask, topology_sibling_cpumask(cpu));
 }
 
 static void cfs_node_to_cpumask(int node, cpumask_t *mask)
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
index 8e61421515cb..344189ac5698 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -557,7 +557,7 @@ ptlrpc_server_nthreads_check(struct ptlrpc_service *svc,
 		 * there are.
 		 */
 		/* weight is # of HTs */
-		if (cpumask_weight(topology_thread_cpumask(0)) > 1) {
+		if (cpumask_weight(topology_sibling_cpumask(0)) > 1) {
 			/* depress thread factor for hyper-thread */
 			factor = factor - (factor >> 1) + (factor >> 3);
 		}
@@ -2768,7 +2768,7 @@ int ptlrpc_hr_init(void)
 
 	init_waitqueue_head(&ptlrpc_hr.hr_waitq);
 
-	weight = cpumask_weight(topology_thread_cpumask(0));
+	weight = cpumask_weight(topology_sibling_cpumask(0));
 
 	cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) {
 		hrp->hrp_cpt = i;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 909b6e43b694..73ddad1e0fa3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -191,8 +191,8 @@ static inline int cpu_to_mem(int cpu)
 #ifndef topology_core_id
 #define topology_core_id(cpu)			((void)(cpu), 0)
 #endif
-#ifndef topology_thread_cpumask
-#define topology_thread_cpumask(cpu)		cpumask_of(cpu)
+#ifndef topology_sibling_cpumask
+#define topology_sibling_cpumask(cpu)		cpumask_of(cpu)
 #endif
 #ifndef topology_core_cpumask
 #define topology_core_cpumask(cpu)		cpumask_of(cpu)
@@ -201,7 +201,7 @@ static inline int cpu_to_mem(int cpu)
 #ifdef CONFIG_SCHED_SMT
 static inline const struct cpumask *cpu_smt_mask(int cpu)
 {
-	return topology_thread_cpumask(cpu);
+	return topology_sibling_cpumask(cpu);
 }
 #endif
 
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index 4f134d8907a7..f610b2a10b3e 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -191,7 +191,7 @@ int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
 	/* Update distances based on topology */
 	for_each_cpu(cpu, update_mask) {
 		if (cpu_rmap_copy_neigh(rmap, cpu,
-					topology_thread_cpumask(cpu), 1))
+					topology_sibling_cpumask(cpu), 1))
 			continue;
 		if (cpu_rmap_copy_neigh(rmap, cpu,
 					topology_core_cpumask(cpu), 2))
-- 
cgit v1.2.3


From 66eb579e66ecfea55e2007be0594869ea9e453d4 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 13 May 2015 17:12:23 +0100
Subject: perf: allow for PMU-specific event filtering

In certain circumstances it may not be possible to schedule particular
events due to constraints other than a lack of hardware counters (e.g.
on big.LITTLE systems where CPUs support different events). The core
perf event code does not distinguish these cases and pessimistically
assumes that any failure to schedule an event means that it is not worth
attempting to schedule later events, even if some hardware counters are
still unused.

When an event a pmu cannot schedule exists in a flexible group list it
can unnecessarily prevent event groups following it in the list from
being scheduled (until it is rotated to the end of the list). This means
some events are scheduled for only a portion of the time they could be,
and for short running programs no events may be scheduled if the list is
initially sorted in an unfortunate order.

This patch adds a new (optional) filter_match function pointer to struct
pmu which a pmu driver can use to tell perf core when an event matches
pmu-specific scheduling requirements. This plugs into the existing
event_filter_match logic, and makes it possible to avoid the scheduling
problem described above. When no filter is provided by the PMU, the
existing behaviour is retained.

Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Acked-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 include/linux/perf_event.h | 5 +++++
 kernel/events/core.c       | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61992cf2e977..67c719cc91aa 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -304,6 +304,11 @@ struct pmu {
 	 * Free pmu-private AUX data structures
 	 */
 	void (*free_aux)		(void *aux); /* optional */
+
+	/*
+	 * Filter events for PMU-specific reasons.
+	 */
+	int (*filter_match)		(struct perf_event *event); /* optional */
 };
 
 /**
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81aa3a4ece9f..aaeb44939db0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1506,11 +1506,17 @@ static int __init perf_workqueue_init(void)
 
 core_initcall(perf_workqueue_init);
 
+static inline int pmu_filter_match(struct perf_event *event)
+{
+	struct pmu *pmu = event->pmu;
+	return pmu->filter_match ? pmu->filter_match(event) : 1;
+}
+
 static inline int
 event_filter_match(struct perf_event *event)
 {
 	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event);
+	    && perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
-- 
cgit v1.2.3


From 24fe86a617c550fb9bdc6c8bd7cf647d3955f8ba Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 29 Mar 2015 12:50:46 +0200
Subject: phy: sun4i-usb: Add a sunxi specific function for setting
 squelch-detect

The sunxi otg phy has a bug where it wrongly detects a high speed squelch
when reset on the root port gets de-asserted with a lo-speed device.

The workaround for this is to disable squelch detect before de-asserting
reset, and re-enabling it after the reset de-assert is done. Add a sunxi
specific phy function to allow the sunxi-musb glue to do this.

Acked-by: Kishon Vijay Abraham I <kishon@ti.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 drivers/phy/phy-sun4i-usb.c       |  9 +++++++++
 include/linux/phy/phy-sun4i-usb.h | 26 ++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100644 include/linux/phy/phy-sun4i-usb.h

(limited to 'include/linux')

diff --git a/drivers/phy/phy-sun4i-usb.c b/drivers/phy/phy-sun4i-usb.c
index a2b08f3ccb03..e17c539e4f6f 100644
--- a/drivers/phy/phy-sun4i-usb.c
+++ b/drivers/phy/phy-sun4i-usb.c
@@ -30,6 +30,7 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/phy/phy.h>
+#include <linux/phy/phy-sun4i-usb.h>
 #include <linux/platform_device.h>
 #include <linux/regulator/consumer.h>
 #include <linux/reset.h>
@@ -58,6 +59,7 @@
 #define PHY_OTG_FUNC_EN			0x28
 #define PHY_VBUS_DET_EN			0x29
 #define PHY_DISCON_TH_SEL		0x2a
+#define PHY_SQUELCH_DETECT		0x3c
 
 #define MAX_PHYS			3
 
@@ -204,6 +206,13 @@ static int sun4i_usb_phy_power_off(struct phy *_phy)
 	return 0;
 }
 
+void sun4i_usb_phy_set_squelch_detect(struct phy *_phy, bool enabled)
+{
+	struct sun4i_usb_phy *phy = phy_get_drvdata(_phy);
+
+	sun4i_usb_phy_write(phy, PHY_SQUELCH_DETECT, enabled ? 0 : 2, 2);
+}
+
 static struct phy_ops sun4i_usb_phy_ops = {
 	.init		= sun4i_usb_phy_init,
 	.exit		= sun4i_usb_phy_exit,
diff --git a/include/linux/phy/phy-sun4i-usb.h b/include/linux/phy/phy-sun4i-usb.h
new file mode 100644
index 000000000000..50aed92ea89c
--- /dev/null
+++ b/include/linux/phy/phy-sun4i-usb.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2015 Hans de Goede <hdegoede@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef PHY_SUN4I_USB_H_
+#define PHY_SUN4I_USB_H_
+
+#include "phy.h"
+
+/**
+ * sun4i_usb_phy_set_squelch_detect() - Enable/disable squelch detect
+ * @phy: reference to a sun4i usb phy
+ * @enabled: wether to enable or disable squelch detect
+ */
+void sun4i_usb_phy_set_squelch_detect(struct phy *phy, bool enabled);
+
+#endif
-- 
cgit v1.2.3


From e5c4708b2b6fec0300db60ee3cf4b4ae96430a12 Mon Sep 17 00:00:00 2001
From: Sunil Goutham <sgoutham@cavium.com>
Date: Tue, 26 May 2015 19:20:14 -0700
Subject: pci: Add Cavium PCI vendor id

This vendor id will be used by network (vNIC), USB (xHCI),
SATA (AHCI), GPIO, I2C, MMC and maybe other drivers
for ThunderX SoC.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Sunil Goutham <sgoutham@cavium.com>
Signed-off-by: Aleksey Makarov <aleksey.makarov@caviumnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 2f7b9a40f627..2972c7f3aa1d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2329,6 +2329,8 @@
 #define PCI_DEVICE_ID_ALTIMA_AC9100	0x03ea
 #define PCI_DEVICE_ID_ALTIMA_AC1003	0x03eb
 
+#define PCI_VENDOR_ID_CAVIUM		0x177d
+
 #define PCI_VENDOR_ID_BELKIN		0x1799
 #define PCI_DEVICE_ID_BELKIN_F5D7010V7	0x701f
 
-- 
cgit v1.2.3


From 4612c715a6ea6b3af2aee0163c0721375b2548d7 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Mon, 18 May 2015 12:58:40 +0200
Subject: mtd: cfi: deinline large functions

With this .config: http://busybox.net/~vda/kernel_config,
after uninlining these functions have sizes and callsite counts
as follows:

cfi_udelay(): 74 bytes, 26 callsites
cfi_send_gen_cmd(): 153 bytes, 95 callsites
cfi_build_cmd(): 274 bytes, 123 callsites
cfi_build_cmd_addr(): 49 bytes, 15 callsites
cfi_merge_status(): 230 bytes, 3 callsites

Reduction in code size is about 50,000:

    text     data      bss       dec     hex filename
85842882 22294584 20627456 128764922 7accbfa vmlinux.before
85789648 22294616 20627456 128711720 7abfc28 vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Dan Carpenter <dan.carpenter@oracle.com>
CC: Jingoo Han <jg1.han@samsung.com>
CC: Brian Norris <computersforpeace@gmail.com>
CC: Aaron Sierra <asierra@xes-inc.com>
CC: Artem Bityutskiy <Artem.Bityutskiy@linux.intel.com>
CC: David Woodhouse <David.Woodhouse@intel.com>
CC: linux-mtd@lists.infradead.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Brian Norris <computersforpeace@gmail.com>
---
 drivers/mtd/chips/cfi_util.c | 188 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/cfi.h      | 188 ++-----------------------------------------
 2 files changed, 196 insertions(+), 180 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_util.c b/drivers/mtd/chips/cfi_util.c
index 09c79bd0b4f4..6f16552cd59f 100644
--- a/drivers/mtd/chips/cfi_util.c
+++ b/drivers/mtd/chips/cfi_util.c
@@ -23,6 +23,194 @@
 #include <linux/mtd/map.h>
 #include <linux/mtd/cfi.h>
 
+void cfi_udelay(int us)
+{
+	if (us >= 1000) {
+		msleep((us+999)/1000);
+	} else {
+		udelay(us);
+		cond_resched();
+	}
+}
+EXPORT_SYMBOL(cfi_udelay);
+
+/*
+ * Returns the command address according to the given geometry.
+ */
+uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
+				struct map_info *map, struct cfi_private *cfi)
+{
+	unsigned bankwidth = map_bankwidth(map);
+	unsigned interleave = cfi_interleave(cfi);
+	unsigned type = cfi->device_type;
+	uint32_t addr;
+
+	addr = (cmd_ofs * type) * interleave;
+
+	/* Modify the unlock address if we are in compatibility mode.
+	 * For 16bit devices on 8 bit busses
+	 * and 32bit devices on 16 bit busses
+	 * set the low bit of the alternating bit sequence of the address.
+	 */
+	if (((type * interleave) > bankwidth) && ((cmd_ofs & 0xff) == 0xaa))
+		addr |= (type >> 1)*interleave;
+
+	return  addr;
+}
+EXPORT_SYMBOL(cfi_build_cmd_addr);
+
+/*
+ * Transforms the CFI command for the given geometry (bus width & interleave).
+ * It looks too long to be inline, but in the common case it should almost all
+ * get optimised away.
+ */
+map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private *cfi)
+{
+	map_word val = { {0} };
+	int wordwidth, words_per_bus, chip_mode, chips_per_word;
+	unsigned long onecmd;
+	int i;
+
+	/* We do it this way to give the compiler a fighting chance
+	   of optimising away all the crap for 'bankwidth' larger than
+	   an unsigned long, in the common case where that support is
+	   disabled */
+	if (map_bankwidth_is_large(map)) {
+		wordwidth = sizeof(unsigned long);
+		words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. normally 1
+	} else {
+		wordwidth = map_bankwidth(map);
+		words_per_bus = 1;
+	}
+
+	chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
+	chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
+
+	/* First, determine what the bit-pattern should be for a single
+	   device, according to chip mode and endianness... */
+	switch (chip_mode) {
+	default: BUG();
+	case 1:
+		onecmd = cmd;
+		break;
+	case 2:
+		onecmd = cpu_to_cfi16(map, cmd);
+		break;
+	case 4:
+		onecmd = cpu_to_cfi32(map, cmd);
+		break;
+	}
+
+	/* Now replicate it across the size of an unsigned long, or
+	   just to the bus width as appropriate */
+	switch (chips_per_word) {
+	default: BUG();
+#if BITS_PER_LONG >= 64
+	case 8:
+		onecmd |= (onecmd << (chip_mode * 32));
+#endif
+	case 4:
+		onecmd |= (onecmd << (chip_mode * 16));
+	case 2:
+		onecmd |= (onecmd << (chip_mode * 8));
+	case 1:
+		;
+	}
+
+	/* And finally, for the multi-word case, replicate it
+	   in all words in the structure */
+	for (i=0; i < words_per_bus; i++) {
+		val.x[i] = onecmd;
+	}
+
+	return val;
+}
+EXPORT_SYMBOL(cfi_build_cmd);
+
+unsigned long cfi_merge_status(map_word val, struct map_info *map,
+					   struct cfi_private *cfi)
+{
+	int wordwidth, words_per_bus, chip_mode, chips_per_word;
+	unsigned long onestat, res = 0;
+	int i;
+
+	/* We do it this way to give the compiler a fighting chance
+	   of optimising away all the crap for 'bankwidth' larger than
+	   an unsigned long, in the common case where that support is
+	   disabled */
+	if (map_bankwidth_is_large(map)) {
+		wordwidth = sizeof(unsigned long);
+		words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. normally 1
+	} else {
+		wordwidth = map_bankwidth(map);
+		words_per_bus = 1;
+	}
+
+	chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
+	chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
+
+	onestat = val.x[0];
+	/* Or all status words together */
+	for (i=1; i < words_per_bus; i++) {
+		onestat |= val.x[i];
+	}
+
+	res = onestat;
+	switch(chips_per_word) {
+	default: BUG();
+#if BITS_PER_LONG >= 64
+	case 8:
+		res |= (onestat >> (chip_mode * 32));
+#endif
+	case 4:
+		res |= (onestat >> (chip_mode * 16));
+	case 2:
+		res |= (onestat >> (chip_mode * 8));
+	case 1:
+		;
+	}
+
+	/* Last, determine what the bit-pattern should be for a single
+	   device, according to chip mode and endianness... */
+	switch (chip_mode) {
+	case 1:
+		break;
+	case 2:
+		res = cfi16_to_cpu(map, res);
+		break;
+	case 4:
+		res = cfi32_to_cpu(map, res);
+		break;
+	default: BUG();
+	}
+	return res;
+}
+EXPORT_SYMBOL(cfi_merge_status);
+
+/*
+ * Sends a CFI command to a bank of flash for the given geometry.
+ *
+ * Returns the offset in flash where the command was written.
+ * If prev_val is non-null, it will be set to the value at the command address,
+ * before the command was written.
+ */
+uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t base,
+				struct map_info *map, struct cfi_private *cfi,
+				int type, map_word *prev_val)
+{
+	map_word val;
+	uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, map, cfi);
+	val = cfi_build_cmd(cmd, map, cfi);
+
+	if (prev_val)
+		*prev_val = map_read(map, addr);
+
+	map_write(map, val, addr);
+
+	return addr - base;
+}
+EXPORT_SYMBOL(cfi_send_gen_cmd);
+
 int __xipram cfi_qry_present(struct map_info *map, __u32 base,
 			     struct cfi_private *cfi)
 {
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index 299d7d31fe53..9b57a9b1b081 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -296,183 +296,19 @@ struct cfi_private {
 	struct flchip chips[0];  /* per-chip data structure for each chip */
 };
 
-/*
- * Returns the command address according to the given geometry.
- */
-static inline uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
-				struct map_info *map, struct cfi_private *cfi)
-{
-	unsigned bankwidth = map_bankwidth(map);
-	unsigned interleave = cfi_interleave(cfi);
-	unsigned type = cfi->device_type;
-	uint32_t addr;
-	
-	addr = (cmd_ofs * type) * interleave;
-
-	/* Modify the unlock address if we are in compatibility mode.
-	 * For 16bit devices on 8 bit busses
-	 * and 32bit devices on 16 bit busses
-	 * set the low bit of the alternating bit sequence of the address.
-	 */
-	if (((type * interleave) > bankwidth) && ((cmd_ofs & 0xff) == 0xaa))
-		addr |= (type >> 1)*interleave;
-
-	return  addr;
-}
-
-/*
- * Transforms the CFI command for the given geometry (bus width & interleave).
- * It looks too long to be inline, but in the common case it should almost all
- * get optimised away.
- */
-static inline map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private *cfi)
-{
-	map_word val = { {0} };
-	int wordwidth, words_per_bus, chip_mode, chips_per_word;
-	unsigned long onecmd;
-	int i;
-
-	/* We do it this way to give the compiler a fighting chance
-	   of optimising away all the crap for 'bankwidth' larger than
-	   an unsigned long, in the common case where that support is
-	   disabled */
-	if (map_bankwidth_is_large(map)) {
-		wordwidth = sizeof(unsigned long);
-		words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. normally 1
-	} else {
-		wordwidth = map_bankwidth(map);
-		words_per_bus = 1;
-	}
-
-	chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
-	chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
-
-	/* First, determine what the bit-pattern should be for a single
-	   device, according to chip mode and endianness... */
-	switch (chip_mode) {
-	default: BUG();
-	case 1:
-		onecmd = cmd;
-		break;
-	case 2:
-		onecmd = cpu_to_cfi16(map, cmd);
-		break;
-	case 4:
-		onecmd = cpu_to_cfi32(map, cmd);
-		break;
-	}
-
-	/* Now replicate it across the size of an unsigned long, or
-	   just to the bus width as appropriate */
-	switch (chips_per_word) {
-	default: BUG();
-#if BITS_PER_LONG >= 64
-	case 8:
-		onecmd |= (onecmd << (chip_mode * 32));
-#endif
-	case 4:
-		onecmd |= (onecmd << (chip_mode * 16));
-	case 2:
-		onecmd |= (onecmd << (chip_mode * 8));
-	case 1:
-		;
-	}
+uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
+				struct map_info *map, struct cfi_private *cfi);
 
-	/* And finally, for the multi-word case, replicate it
-	   in all words in the structure */
-	for (i=0; i < words_per_bus; i++) {
-		val.x[i] = onecmd;
-	}
-
-	return val;
-}
+map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private *cfi);
 #define CMD(x)  cfi_build_cmd((x), map, cfi)
 
-
-static inline unsigned long cfi_merge_status(map_word val, struct map_info *map,
-					   struct cfi_private *cfi)
-{
-	int wordwidth, words_per_bus, chip_mode, chips_per_word;
-	unsigned long onestat, res = 0;
-	int i;
-
-	/* We do it this way to give the compiler a fighting chance
-	   of optimising away all the crap for 'bankwidth' larger than
-	   an unsigned long, in the common case where that support is
-	   disabled */
-	if (map_bankwidth_is_large(map)) {
-		wordwidth = sizeof(unsigned long);
-		words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. normally 1
-	} else {
-		wordwidth = map_bankwidth(map);
-		words_per_bus = 1;
-	}
-
-	chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
-	chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
-
-	onestat = val.x[0];
-	/* Or all status words together */
-	for (i=1; i < words_per_bus; i++) {
-		onestat |= val.x[i];
-	}
-
-	res = onestat;
-	switch(chips_per_word) {
-	default: BUG();
-#if BITS_PER_LONG >= 64
-	case 8:
-		res |= (onestat >> (chip_mode * 32));
-#endif
-	case 4:
-		res |= (onestat >> (chip_mode * 16));
-	case 2:
-		res |= (onestat >> (chip_mode * 8));
-	case 1:
-		;
-	}
-
-	/* Last, determine what the bit-pattern should be for a single
-	   device, according to chip mode and endianness... */
-	switch (chip_mode) {
-	case 1:
-		break;
-	case 2:
-		res = cfi16_to_cpu(map, res);
-		break;
-	case 4:
-		res = cfi32_to_cpu(map, res);
-		break;
-	default: BUG();
-	}
-	return res;
-}
-
+unsigned long cfi_merge_status(map_word val, struct map_info *map,
+					   struct cfi_private *cfi);
 #define MERGESTATUS(x) cfi_merge_status((x), map, cfi)
 
-
-/*
- * Sends a CFI command to a bank of flash for the given geometry.
- *
- * Returns the offset in flash where the command was written.
- * If prev_val is non-null, it will be set to the value at the command address,
- * before the command was written.
- */
-static inline uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t base,
+uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t base,
 				struct map_info *map, struct cfi_private *cfi,
-				int type, map_word *prev_val)
-{
-	map_word val;
-	uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, map, cfi);
-	val = cfi_build_cmd(cmd, map, cfi);
-
-	if (prev_val)
-		*prev_val = map_read(map, addr);
-
-	map_write(map, val, addr);
-
-	return addr - base;
-}
+				int type, map_word *prev_val);
 
 static inline uint8_t cfi_read_query(struct map_info *map, uint32_t addr)
 {
@@ -506,15 +342,7 @@ static inline uint16_t cfi_read_query16(struct map_info *map, uint32_t addr)
 	}
 }
 
-static inline void cfi_udelay(int us)
-{
-	if (us >= 1000) {
-		msleep((us+999)/1000);
-	} else {
-		udelay(us);
-		cond_resched();
-	}
-}
+void cfi_udelay(int us);
 
 int __xipram cfi_qry_present(struct map_info *map, __u32 base,
 			     struct cfi_private *cfi);
-- 
cgit v1.2.3


From 7d0ae8086b828311250c6afdf800b568ac9bd693 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 3 Mar 2015 14:57:58 -0800
Subject: rcu: Convert ACCESS_ONCE() to READ_ONCE() and WRITE_ONCE()

This commit moves from the old ACCESS_ONCE() API to the new READ_ONCE()
and WRITE_ONCE() APIs.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck:  Updated to include kernel/torture.c as suggested by Jason Low. ]
---
 include/linux/rculist.h  |   6 +-
 include/linux/rcupdate.h |  16 ++---
 kernel/rcu/rcutorture.c  |   2 +-
 kernel/rcu/srcu.c        |  10 +--
 kernel/rcu/tiny_plugin.h |  12 ++--
 kernel/rcu/tree.c        | 184 +++++++++++++++++++++++------------------------
 kernel/rcu/tree_plugin.h |  93 ++++++++++++------------
 kernel/rcu/tree_trace.c  |   6 +-
 kernel/rcu/update.c      |  30 ++++----
 kernel/torture.c         |  26 +++----
 10 files changed, 193 insertions(+), 192 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index a18b16f1dc0e..665397247e82 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -29,8 +29,8 @@
  */
 static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
 {
-	ACCESS_ONCE(list->next) = list;
-	ACCESS_ONCE(list->prev) = list;
+	WRITE_ONCE(list->next, list);
+	WRITE_ONCE(list->prev, list);
 }
 
 /*
@@ -288,7 +288,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
 #define list_first_or_null_rcu(ptr, type, member) \
 ({ \
 	struct list_head *__ptr = (ptr); \
-	struct list_head *__next = ACCESS_ONCE(__ptr->next); \
+	struct list_head *__next = READ_ONCE(__ptr->next); \
 	likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
 })
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 573a5afd5ed8..87bb0eee665b 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -364,8 +364,8 @@ extern struct srcu_struct tasks_rcu_exit_srcu;
 #define rcu_note_voluntary_context_switch(t) \
 	do { \
 		rcu_all_qs(); \
-		if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
-			ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
+		if (READ_ONCE((t)->rcu_tasks_holdout)) \
+			WRITE_ONCE((t)->rcu_tasks_holdout, false); \
 	} while (0)
 #else /* #ifdef CONFIG_TASKS_RCU */
 #define TASKS_RCU(x) do { } while (0)
@@ -609,7 +609,7 @@ static inline void rcu_preempt_sleep_check(void)
 
 #define __rcu_access_pointer(p, space) \
 ({ \
-	typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \
+	typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \
 	rcu_dereference_sparse(p, space); \
 	((typeof(*p) __force __kernel *)(_________p1)); \
 })
@@ -630,7 +630,7 @@ static inline void rcu_preempt_sleep_check(void)
 
 #define __rcu_access_index(p, space) \
 ({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
+	typeof(p) _________p1 = READ_ONCE(p); \
 	rcu_dereference_sparse(p, space); \
 	(_________p1); \
 })
@@ -659,7 +659,7 @@ static inline void rcu_preempt_sleep_check(void)
  */
 #define lockless_dereference(p) \
 ({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
+	typeof(p) _________p1 = READ_ONCE(p); \
 	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
 	(_________p1); \
 })
@@ -702,7 +702,7 @@ static inline void rcu_preempt_sleep_check(void)
  * @p: The pointer to read
  *
  * Return the value of the specified RCU-protected pointer, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
+ * smp_read_barrier_depends() and keep the READ_ONCE().  This is useful
  * when the value of this pointer is accessed, but the pointer is not
  * dereferenced, for example, when testing an RCU-protected pointer against
  * NULL.  Although rcu_access_pointer() may also be used in cases where
@@ -791,7 +791,7 @@ static inline void rcu_preempt_sleep_check(void)
  * @p: The index to read
  *
  * Return the value of the specified RCU-protected index, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
+ * smp_read_barrier_depends() and keep the READ_ONCE().  This is useful
  * when the value of this index is accessed, but the index is not
  * dereferenced, for example, when testing an RCU-protected index against
  * -1.  Although rcu_access_index() may also be used in cases where
@@ -827,7 +827,7 @@ static inline void rcu_preempt_sleep_check(void)
  * @c: The conditions under which the dereference will take place
  *
  * Return the value of the specified RCU-protected pointer, but omit
- * both the smp_read_barrier_depends() and the ACCESS_ONCE().  This
+ * both the smp_read_barrier_depends() and the READ_ONCE().  This
  * is useful in cases where update-side locks prevent the value of the
  * pointer from changing.  Please note that this primitive does -not-
  * prevent the compiler from repeating this reference or combining it
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8dbe27611ec3..a67ef6ff86b0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1413,7 +1413,7 @@ static int rcu_torture_barrier_cbs(void *arg)
 	do {
 		wait_event(barrier_cbs_wq[myid],
 			   (newphase =
-			    ACCESS_ONCE(barrier_phase)) != lastphase ||
+			    READ_ONCE(barrier_phase)) != lastphase ||
 			   torture_must_stop());
 		lastphase = newphase;
 		smp_mb(); /* ensure barrier_phase load before ->call(). */
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index cad76e76b4e7..fb33d35ee0b7 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -151,7 +151,7 @@ static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
 		sum += t;
 	}
 	return sum;
@@ -168,7 +168,7 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 	unsigned long t;
 
 	for_each_possible_cpu(cpu) {
-		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+		t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
 		sum += t;
 	}
 	return sum;
@@ -265,8 +265,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
 	unsigned long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
-		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
+		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
+		sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
 	}
 	return sum;
 }
@@ -296,7 +296,7 @@ int __srcu_read_lock(struct srcu_struct *sp)
 {
 	int idx;
 
-	idx = ACCESS_ONCE(sp->completed) & 0x1;
+	idx = READ_ONCE(sp->completed) & 0x1;
 	preempt_disable();
 	__this_cpu_inc(sp->per_cpu_ref->c[idx]);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index f94e209a10d6..e492a5253e0f 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,16 +144,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 		return;
 	rcp->ticks_this_gp++;
 	j = jiffies;
-	js = ACCESS_ONCE(rcp->jiffies_stall);
+	js = READ_ONCE(rcp->jiffies_stall);
 	if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
 		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
 		       rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
 		       jiffies - rcp->gp_start, rcp->qlen);
 		dump_stack();
-		ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
-			3 * rcu_jiffies_till_stall_check() + 3;
+		WRITE_ONCE(rcp->jiffies_stall,
+			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	} else if (ULONG_CMP_GE(j, js)) {
-		ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+		WRITE_ONCE(rcp->jiffies_stall,
+			   jiffies + rcu_jiffies_till_stall_check());
 	}
 }
 
@@ -161,7 +162,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
 {
 	rcp->ticks_this_gp = 0;
 	rcp->gp_start = jiffies;
-	ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
+	WRITE_ONCE(rcp->jiffies_stall,
+		   jiffies + rcu_jiffies_till_stall_check());
 }
 
 static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8cf7304b2867..0628df155970 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -191,17 +191,17 @@ unsigned long rcutorture_vernum;
  */
 unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
 {
-	return ACCESS_ONCE(rnp->qsmaskinitnext);
+	return READ_ONCE(rnp->qsmaskinitnext);
 }
 
 /*
- * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
+ * Return true if an RCU grace period is in progress.  The READ_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
  * structure's ->lock, but of course results can be subject to change.
  */
 static int rcu_gp_in_progress(struct rcu_state *rsp)
 {
-	return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+	return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
 }
 
 /*
@@ -278,8 +278,8 @@ static void rcu_momentary_dyntick_idle(void)
 		if (!(resched_mask & rsp->flavor_mask))
 			continue;
 		smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
-		if (ACCESS_ONCE(rdp->mynode->completed) !=
-		    ACCESS_ONCE(rdp->cond_resched_completed))
+		if (READ_ONCE(rdp->mynode->completed) !=
+		    READ_ONCE(rdp->cond_resched_completed))
 			continue;
 
 		/*
@@ -491,9 +491,9 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
 		break;
 	}
 	if (rsp != NULL) {
-		*flags = ACCESS_ONCE(rsp->gp_flags);
-		*gpnum = ACCESS_ONCE(rsp->gpnum);
-		*completed = ACCESS_ONCE(rsp->completed);
+		*flags = READ_ONCE(rsp->gp_flags);
+		*gpnum = READ_ONCE(rsp->gpnum);
+		*completed = READ_ONCE(rsp->completed);
 		return;
 	}
 	*flags = 0;
@@ -539,10 +539,10 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 static int rcu_future_needs_gp(struct rcu_state *rsp)
 {
 	struct rcu_node *rnp = rcu_get_root(rsp);
-	int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+	int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
 	int *fp = &rnp->need_future_gp[idx];
 
-	return ACCESS_ONCE(*fp);
+	return READ_ONCE(*fp);
 }
 
 /*
@@ -565,7 +565,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 		return 1;  /* Yes, this CPU has newly registered callbacks. */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
 		if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
-		    ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+		    ULONG_CMP_LT(READ_ONCE(rsp->completed),
 				 rdp->nxtcompleted[i]))
 			return 1;  /* Yes, CBs for future grace period. */
 	return 0; /* No grace period needed. */
@@ -1011,9 +1011,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		return 1;
 	} else {
-		if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
 				 rdp->mynode->gpnum))
-			ACCESS_ONCE(rdp->gpwrap) = true;
+			WRITE_ONCE(rdp->gpwrap, true);
 		return 0;
 	}
 }
@@ -1093,12 +1093,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 	if (ULONG_CMP_GE(jiffies,
 			 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
 	    ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
-		if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
-			ACCESS_ONCE(rdp->cond_resched_completed) =
-				ACCESS_ONCE(rdp->mynode->completed);
+		if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+			WRITE_ONCE(rdp->cond_resched_completed,
+				   READ_ONCE(rdp->mynode->completed));
 			smp_mb(); /* ->cond_resched_completed before *rcrmp. */
-			ACCESS_ONCE(*rcrmp) =
-				ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+			WRITE_ONCE(*rcrmp,
+				   READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
 			resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
 			rdp->rsp->jiffies_resched += 5; /* Enable beating. */
 		} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
@@ -1119,9 +1119,9 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
 	rsp->gp_start = j;
 	smp_wmb(); /* Record start time before stall time. */
 	j1 = rcu_jiffies_till_stall_check();
-	ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
+	WRITE_ONCE(rsp->jiffies_stall, j + j1);
 	rsp->jiffies_resched = j + j1 / 2;
-	rsp->n_force_qs_gpstart = ACCESS_ONCE(rsp->n_force_qs);
+	rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
 }
 
 /*
@@ -1133,7 +1133,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
 	unsigned long j;
 
 	j = jiffies;
-	gpa = ACCESS_ONCE(rsp->gp_activity);
+	gpa = READ_ONCE(rsp->gp_activity);
 	if (j - gpa > 2 * HZ)
 		pr_err("%s kthread starved for %ld jiffies!\n",
 		       rsp->name, j - gpa);
@@ -1173,12 +1173,13 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 	/* Only let one CPU complain about others per time interval. */
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+	delta = jiffies - READ_ONCE(rsp->jiffies_stall);
 	if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
-	ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+	WRITE_ONCE(rsp->jiffies_stall,
+		   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
@@ -1212,12 +1213,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 	if (ndetected) {
 		rcu_dump_cpu_stacks(rsp);
 	} else {
-		if (ACCESS_ONCE(rsp->gpnum) != gpnum ||
-		    ACCESS_ONCE(rsp->completed) == gpnum) {
+		if (READ_ONCE(rsp->gpnum) != gpnum ||
+		    READ_ONCE(rsp->completed) == gpnum) {
 			pr_err("INFO: Stall ended before state dump start\n");
 		} else {
 			j = jiffies;
-			gpa = ACCESS_ONCE(rsp->gp_activity);
+			gpa = READ_ONCE(rsp->gp_activity);
 			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
 			       rsp->name, j - gpa, j, gpa,
 			       jiffies_till_next_fqs,
@@ -1262,9 +1263,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	rcu_dump_cpu_stacks(rsp);
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
-		ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
-				     3 * rcu_jiffies_till_stall_check() + 3;
+	if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
+		WRITE_ONCE(rsp->jiffies_stall,
+			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/*
@@ -1307,20 +1308,20 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
 	 * and rsp->gp_start suffice to forestall false positives.
 	 */
-	gpnum = ACCESS_ONCE(rsp->gpnum);
+	gpnum = READ_ONCE(rsp->gpnum);
 	smp_rmb(); /* Pick up ->gpnum first... */
-	js = ACCESS_ONCE(rsp->jiffies_stall);
+	js = READ_ONCE(rsp->jiffies_stall);
 	smp_rmb(); /* ...then ->jiffies_stall before the rest... */
-	gps = ACCESS_ONCE(rsp->gp_start);
+	gps = READ_ONCE(rsp->gp_start);
 	smp_rmb(); /* ...and finally ->gp_start before ->completed. */
-	completed = ACCESS_ONCE(rsp->completed);
+	completed = READ_ONCE(rsp->completed);
 	if (ULONG_CMP_GE(completed, gpnum) ||
 	    ULONG_CMP_LT(j, js) ||
 	    ULONG_CMP_GE(gps, js))
 		return; /* No stall or GP completed since entering function. */
 	rnp = rdp->mynode;
 	if (rcu_gp_in_progress(rsp) &&
-	    (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
+	    (READ_ONCE(rnp->qsmask) & rdp->grpmask)) {
 
 		/* We haven't checked in, so go dump stack. */
 		print_cpu_stall(rsp);
@@ -1347,7 +1348,7 @@ void rcu_cpu_stall_reset(void)
 	struct rcu_state *rsp;
 
 	for_each_rcu_flavor(rsp)
-		ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
+		WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2);
 }
 
 /*
@@ -1457,7 +1458,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 	 * doing some extra useless work.
 	 */
 	if (rnp->gpnum != rnp->completed ||
-	    ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+	    READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
 		rnp->need_future_gp[c & 0x1]++;
 		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
 		goto out;
@@ -1542,7 +1543,7 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
 static void rcu_gp_kthread_wake(struct rcu_state *rsp)
 {
 	if (current == rsp->gp_kthread ||
-	    !ACCESS_ONCE(rsp->gp_flags) ||
+	    !READ_ONCE(rsp->gp_flags) ||
 	    !rsp->gp_kthread)
 		return;
 	wake_up(&rsp->gp_wq);
@@ -1677,7 +1678,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 
 	/* Handle the ends of any preceding grace periods first. */
 	if (rdp->completed == rnp->completed &&
-	    !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+	    !unlikely(READ_ONCE(rdp->gpwrap))) {
 
 		/* No grace period end, so just accelerate recent callbacks. */
 		ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1692,7 +1693,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
 	}
 
-	if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
+	if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) {
 		/*
 		 * If the current grace period is waiting for this CPU,
 		 * set up to detect a quiescent state, otherwise don't
@@ -1704,7 +1705,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
 		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
-		ACCESS_ONCE(rdp->gpwrap) = false;
+		WRITE_ONCE(rdp->gpwrap, false);
 	}
 	return ret;
 }
@@ -1717,9 +1718,9 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	local_irq_save(flags);
 	rnp = rdp->mynode;
-	if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-	     rdp->completed == ACCESS_ONCE(rnp->completed) &&
-	     !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
+	if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
+	     rdp->completed == READ_ONCE(rnp->completed) &&
+	     !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
 	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
 		local_irq_restore(flags);
 		return;
@@ -1740,15 +1741,15 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	raw_spin_lock_irq(&rnp->lock);
 	smp_mb__after_unlock_lock();
-	if (!ACCESS_ONCE(rsp->gp_flags)) {
+	if (!READ_ONCE(rsp->gp_flags)) {
 		/* Spurious wakeup, tell caller to go back to sleep.  */
 		raw_spin_unlock_irq(&rnp->lock);
 		return 0;
 	}
-	ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
+	WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
 
 	if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
 		/*
@@ -1834,9 +1835,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
 		rdp = this_cpu_ptr(rsp->rda);
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
-		ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
+		WRITE_ONCE(rnp->gpnum, rsp->gpnum);
 		if (WARN_ON_ONCE(rnp->completed != rsp->completed))
-			ACCESS_ONCE(rnp->completed) = rsp->completed;
+			WRITE_ONCE(rnp->completed, rsp->completed);
 		if (rnp == rdp->mynode)
 			(void)__note_gp_changes(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);
@@ -1845,7 +1846,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched_rcu_qs();
-		ACCESS_ONCE(rsp->gp_activity) = jiffies;
+		WRITE_ONCE(rsp->gp_activity, jiffies);
 		if (gp_init_delay > 0 &&
 		    !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
 			schedule_timeout_uninterruptible(gp_init_delay);
@@ -1864,7 +1865,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 	unsigned long maxj;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	rsp->n_force_qs++;
 	if (fqs_state == RCU_SAVE_DYNTICK) {
 		/* Collect dyntick-idle snapshots. */
@@ -1882,11 +1883,11 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
 	}
 	/* Clear flag to prevent immediate re-entry. */
-	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 		raw_spin_lock_irq(&rnp->lock);
 		smp_mb__after_unlock_lock();
-		ACCESS_ONCE(rsp->gp_flags) =
-			ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
+		WRITE_ONCE(rsp->gp_flags,
+			   READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
 		raw_spin_unlock_irq(&rnp->lock);
 	}
 	return fqs_state;
@@ -1903,7 +1904,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	ACCESS_ONCE(rsp->gp_activity) = jiffies;
+	WRITE_ONCE(rsp->gp_activity, jiffies);
 	raw_spin_lock_irq(&rnp->lock);
 	smp_mb__after_unlock_lock();
 	gp_duration = jiffies - rsp->gp_start;
@@ -1934,7 +1935,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		smp_mb__after_unlock_lock();
 		WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
 		WARN_ON_ONCE(rnp->qsmask);
-		ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+		WRITE_ONCE(rnp->completed, rsp->gpnum);
 		rdp = this_cpu_ptr(rsp->rda);
 		if (rnp == rdp->mynode)
 			needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
@@ -1942,7 +1943,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		nocb += rcu_future_gp_cleanup(rsp, rnp);
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched_rcu_qs();
-		ACCESS_ONCE(rsp->gp_activity) = jiffies;
+		WRITE_ONCE(rsp->gp_activity, jiffies);
 	}
 	rnp = rcu_get_root(rsp);
 	raw_spin_lock_irq(&rnp->lock);
@@ -1950,16 +1951,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	rcu_nocb_gp_set(rnp, nocb);
 
 	/* Declare grace period done. */
-	ACCESS_ONCE(rsp->completed) = rsp->gpnum;
+	WRITE_ONCE(rsp->completed, rsp->gpnum);
 	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
 	rsp->fqs_state = RCU_GP_IDLE;
 	rdp = this_cpu_ptr(rsp->rda);
 	/* Advance CBs to reduce false positives below. */
 	needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
 	if (needgp || cpu_needs_another_gp(rsp, rdp)) {
-		ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
+		WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
 		trace_rcu_grace_period(rsp->name,
-				       ACCESS_ONCE(rsp->gpnum),
+				       READ_ONCE(rsp->gpnum),
 				       TPS("newreq"));
 	}
 	raw_spin_unlock_irq(&rnp->lock);
@@ -1983,20 +1984,20 @@ static int __noreturn rcu_gp_kthread(void *arg)
 		/* Handle grace-period start. */
 		for (;;) {
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwait"));
 			rsp->gp_state = RCU_GP_WAIT_GPS;
 			wait_event_interruptible(rsp->gp_wq,
-						 ACCESS_ONCE(rsp->gp_flags) &
+						 READ_ONCE(rsp->gp_flags) &
 						 RCU_GP_FLAG_INIT);
 			/* Locking provides needed memory barrier. */
 			if (rcu_gp_init(rsp))
 				break;
 			cond_resched_rcu_qs();
-			ACCESS_ONCE(rsp->gp_activity) = jiffies;
+			WRITE_ONCE(rsp->gp_activity, jiffies);
 			WARN_ON(signal_pending(current));
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("reqwaitsig"));
 		}
 
@@ -2012,39 +2013,39 @@ static int __noreturn rcu_gp_kthread(void *arg)
 			if (!ret)
 				rsp->jiffies_force_qs = jiffies + j;
 			trace_rcu_grace_period(rsp->name,
-					       ACCESS_ONCE(rsp->gpnum),
+					       READ_ONCE(rsp->gpnum),
 					       TPS("fqswait"));
 			rsp->gp_state = RCU_GP_WAIT_FQS;
 			ret = wait_event_interruptible_timeout(rsp->gp_wq,
-					((gf = ACCESS_ONCE(rsp->gp_flags)) &
+					((gf = READ_ONCE(rsp->gp_flags)) &
 					 RCU_GP_FLAG_FQS) ||
-					(!ACCESS_ONCE(rnp->qsmask) &&
+					(!READ_ONCE(rnp->qsmask) &&
 					 !rcu_preempt_blocked_readers_cgp(rnp)),
 					j);
 			/* Locking provides needed memory barriers. */
 			/* If grace period done, leave loop. */
-			if (!ACCESS_ONCE(rnp->qsmask) &&
+			if (!READ_ONCE(rnp->qsmask) &&
 			    !rcu_preempt_blocked_readers_cgp(rnp))
 				break;
 			/* If time for quiescent-state forcing, do it. */
 			if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
 			    (gf & RCU_GP_FLAG_FQS)) {
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsstart"));
 				fqs_state = rcu_gp_fqs(rsp, fqs_state);
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsend"));
 				cond_resched_rcu_qs();
-				ACCESS_ONCE(rsp->gp_activity) = jiffies;
+				WRITE_ONCE(rsp->gp_activity, jiffies);
 			} else {
 				/* Deal with stray signal. */
 				cond_resched_rcu_qs();
-				ACCESS_ONCE(rsp->gp_activity) = jiffies;
+				WRITE_ONCE(rsp->gp_activity, jiffies);
 				WARN_ON(signal_pending(current));
 				trace_rcu_grace_period(rsp->name,
-						       ACCESS_ONCE(rsp->gpnum),
+						       READ_ONCE(rsp->gpnum),
 						       TPS("fqswaitsig"));
 			}
 			j = jiffies_till_next_fqs;
@@ -2086,8 +2087,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 		 */
 		return false;
 	}
-	ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
-	trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
+	WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
+	trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
 			       TPS("newreq"));
 
 	/*
@@ -2359,7 +2360,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 		rsp->qlen += rdp->qlen;
 		rdp->n_cbs_orphaned += rdp->qlen;
 		rdp->qlen_lazy = 0;
-		ACCESS_ONCE(rdp->qlen) = 0;
+		WRITE_ONCE(rdp->qlen, 0);
 	}
 
 	/*
@@ -2580,7 +2581,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	/* If no callbacks are ready, just return. */
 	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
 		trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
-		trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+		trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist),
 				    need_resched(), is_idle_task(current),
 				    rcu_is_callbacks_kthread());
 		return;
@@ -2636,7 +2637,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 	smp_mb(); /* List handling before counting for rcu_barrier(). */
 	rdp->qlen_lazy -= count_lazy;
-	ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
+	WRITE_ONCE(rdp->qlen, rdp->qlen - count);
 	rdp->n_cbs_invoked += count;
 
 	/* Reinstate batch limit if we have worked down the excess. */
@@ -2793,7 +2794,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
 	/* Funnel through hierarchy to reduce memory contention. */
 	rnp = __this_cpu_read(rsp->rda->mynode);
 	for (; rnp != NULL; rnp = rnp->parent) {
-		ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+		ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
 		      !raw_spin_trylock(&rnp->fqslock);
 		if (rnp_old != NULL)
 			raw_spin_unlock(&rnp_old->fqslock);
@@ -2809,13 +2810,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
 	raw_spin_lock_irqsave(&rnp_old->lock, flags);
 	smp_mb__after_unlock_lock();
 	raw_spin_unlock(&rnp_old->fqslock);
-	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
 		rsp->n_force_qs_lh++;
 		raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 		return;  /* Someone beat us to it. */
 	}
-	ACCESS_ONCE(rsp->gp_flags) =
-		ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
+	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 	rcu_gp_kthread_wake(rsp);
 }
@@ -2881,7 +2881,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  */
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
+	if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
 		return;
 	if (likely(!rsp->boost)) {
 		rcu_do_batch(rsp, rdp);
@@ -2972,7 +2972,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
 	if (debug_rcu_head_queue(head)) {
 		/* Probable double call_rcu(), so leak the callback. */
-		ACCESS_ONCE(head->func) = rcu_leak_callback;
+		WRITE_ONCE(head->func, rcu_leak_callback);
 		WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
 		return;
 	}
@@ -3011,7 +3011,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 		if (!likely(rdp->nxtlist))
 			init_default_callback_list(rdp);
 	}
-	ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
+	WRITE_ONCE(rdp->qlen, rdp->qlen + 1);
 	if (lazy)
 		rdp->qlen_lazy++;
 	else
@@ -3450,14 +3450,14 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* Has another RCU grace period completed?  */
-	if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
+	if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
 		rdp->n_rp_gp_completed++;
 		return 1;
 	}
 
 	/* Has a new RCU grace period started? */
-	if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
-	    unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
+	if (READ_ONCE(rnp->gpnum) != rdp->gpnum ||
+	    unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */
 		rdp->n_rp_gp_started++;
 		return 1;
 	}
@@ -3564,7 +3564,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 {
 	int cpu;
 	struct rcu_data *rdp;
-	unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+	unsigned long snap = READ_ONCE(rsp->n_barrier_done);
 	unsigned long snap_done;
 
 	_rcu_barrier_trace(rsp, "Begin", -1, snap);
@@ -3606,10 +3606,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
 
 	/*
 	 * Increment ->n_barrier_done to avoid duplicate work.  Use
-	 * ACCESS_ONCE() to prevent the compiler from speculating
+	 * WRITE_ONCE() to prevent the compiler from speculating
 	 * the increment to precede the early-exit check.
 	 */
-	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+	WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
 	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
 	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
 	smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3645,7 +3645,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 				__call_rcu(&rdp->barrier_head,
 					   rcu_barrier_callback, rsp, cpu, 0);
 			}
-		} else if (ACCESS_ONCE(rdp->qlen)) {
+		} else if (READ_ONCE(rdp->qlen)) {
 			_rcu_barrier_trace(rsp, "OnlineQ", cpu,
 					   rsp->n_barrier_done);
 			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
@@ -3665,7 +3665,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
 
 	/* Increment ->n_barrier_done to prevent duplicate work. */
 	smp_mb(); /* Keep increment after above mechanism. */
-	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
+	WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
 	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
 	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
 	smp_mb(); /* Keep increment before caller's subsequent code. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8c0ec0f5a027..58b1ebdc4387 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -570,7 +570,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
 	return !rcu_preempted_readers_exp(rnp) &&
-	       ACCESS_ONCE(rnp->expmask) == 0;
+	       READ_ONCE(rnp->expmask) == 0;
 }
 
 /*
@@ -716,7 +716,7 @@ void synchronize_rcu_expedited(void)
 	int trycount = 0;
 
 	smp_mb(); /* Caller's modifications seen first by other CPUs. */
-	snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
+	snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
 	smp_mb(); /* Above access cannot bleed into critical section. */
 
 	/*
@@ -740,7 +740,7 @@ void synchronize_rcu_expedited(void)
 	 */
 	while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
 		if (ULONG_CMP_LT(snap,
-		    ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+		    READ_ONCE(sync_rcu_preempt_exp_count))) {
 			put_online_cpus();
 			goto mb_ret; /* Others did our work for us. */
 		}
@@ -752,7 +752,7 @@ void synchronize_rcu_expedited(void)
 			return;
 		}
 	}
-	if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+	if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
 		put_online_cpus();
 		goto unlock_mb_ret; /* Others did our work for us. */
 	}
@@ -780,8 +780,7 @@ void synchronize_rcu_expedited(void)
 
 	/* Clean up and exit. */
 	smp_mb(); /* ensure expedited GP seen before counter increment. */
-	ACCESS_ONCE(sync_rcu_preempt_exp_count) =
-					sync_rcu_preempt_exp_count + 1;
+	WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
 unlock_mb_ret:
 	mutex_unlock(&sync_rcu_preempt_exp_mutex);
 mb_ret:
@@ -994,8 +993,8 @@ static int rcu_boost(struct rcu_node *rnp)
 	struct task_struct *t;
 	struct list_head *tb;
 
-	if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
-	    ACCESS_ONCE(rnp->boost_tasks) == NULL)
+	if (READ_ONCE(rnp->exp_tasks) == NULL &&
+	    READ_ONCE(rnp->boost_tasks) == NULL)
 		return 0;  /* Nothing left to boost. */
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1048,8 +1047,8 @@ static int rcu_boost(struct rcu_node *rnp)
 	rt_mutex_lock(&rnp->boost_mtx);
 	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
 
-	return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
-	       ACCESS_ONCE(rnp->boost_tasks) != NULL;
+	return READ_ONCE(rnp->exp_tasks) != NULL ||
+	       READ_ONCE(rnp->boost_tasks) != NULL;
 }
 
 /*
@@ -1462,7 +1461,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
 		 * callbacks not yet ready to invoke.
 		 */
 		if ((rdp->completed != rnp->completed ||
-		     unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
+		     unlikely(READ_ONCE(rdp->gpwrap))) &&
 		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
 			note_gp_changes(rsp, rdp);
 
@@ -1534,7 +1533,7 @@ static void rcu_prepare_for_idle(void)
 	int tne;
 
 	/* Handle nohz enablement switches conservatively. */
-	tne = ACCESS_ONCE(tick_nohz_active);
+	tne = READ_ONCE(tick_nohz_active);
 	if (tne != rdtp->tick_nohz_enabled_snap) {
 		if (rcu_cpu_has_callbacks(NULL))
 			invoke_rcu_core(); /* force nohz to see update. */
@@ -1760,7 +1759,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 	       atomic_read(&rdtp->dynticks) & 0xfff,
 	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
 	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
-	       ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
+	       READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
 	       fast_no_hz);
 }
 
@@ -1898,11 +1897,11 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 {
 	struct rcu_data *rdp_leader = rdp->nocb_leader;
 
-	if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+	if (!READ_ONCE(rdp_leader->nocb_kthread))
 		return;
-	if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+	if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 		/* Prior smp_mb__after_atomic() orders against prior enqueue. */
-		ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
+		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
 		wake_up(&rdp_leader->nocb_wq);
 	}
 }
@@ -1934,14 +1933,14 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 	ret = atomic_long_read(&rdp->nocb_q_count);
 
 #ifdef CONFIG_PROVE_RCU
-	rhp = ACCESS_ONCE(rdp->nocb_head);
+	rhp = READ_ONCE(rdp->nocb_head);
 	if (!rhp)
-		rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+		rhp = READ_ONCE(rdp->nocb_gp_head);
 	if (!rhp)
-		rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+		rhp = READ_ONCE(rdp->nocb_follower_head);
 
 	/* Having no rcuo kthread but CBs after scheduler starts is bad! */
-	if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+	if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
 	    rcu_scheduler_fully_active) {
 		/* RCU callback enqueued before CPU first came online??? */
 		pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
@@ -1975,12 +1974,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 	atomic_long_add(rhcount, &rdp->nocb_q_count);
 	/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
 	old_rhpp = xchg(&rdp->nocb_tail, rhtp);
-	ACCESS_ONCE(*old_rhpp) = rhp;
+	WRITE_ONCE(*old_rhpp, rhp);
 	atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
 	smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
 
 	/* If we are not being polled and there is a kthread, awaken it ... */
-	t = ACCESS_ONCE(rdp->nocb_kthread);
+	t = READ_ONCE(rdp->nocb_kthread);
 	if (rcu_nocb_poll || !t) {
 		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 				    TPS("WakeNotPoll"));
@@ -2118,7 +2117,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	for (;;) {
 		wait_event_interruptible(
 			rnp->nocb_gp_wq[c & 0x1],
-			(d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+			(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
 		if (likely(d))
 			break;
 		WARN_ON(signal_pending(current));
@@ -2145,7 +2144,7 @@ wait_again:
 	if (!rcu_nocb_poll) {
 		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
 		wait_event_interruptible(my_rdp->nocb_wq,
-				!ACCESS_ONCE(my_rdp->nocb_leader_sleep));
+				!READ_ONCE(my_rdp->nocb_leader_sleep));
 		/* Memory barrier handled by smp_mb() calls below and repoll. */
 	} else if (firsttime) {
 		firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2159,12 +2158,12 @@ wait_again:
 	 */
 	gotcbs = false;
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-		rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+		rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
 		if (!rdp->nocb_gp_head)
 			continue;  /* No CBs here, try next follower. */
 
 		/* Move callbacks to wait-for-GP list, which is empty. */
-		ACCESS_ONCE(rdp->nocb_head) = NULL;
+		WRITE_ONCE(rdp->nocb_head, NULL);
 		rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
 		gotcbs = true;
 	}
@@ -2184,7 +2183,7 @@ wait_again:
 		my_rdp->nocb_leader_sleep = true;
 		smp_mb();  /* Ensure _sleep true before scan. */
 		for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
-			if (ACCESS_ONCE(rdp->nocb_head)) {
+			if (READ_ONCE(rdp->nocb_head)) {
 				/* Found CB, so short-circuit next wait. */
 				my_rdp->nocb_leader_sleep = false;
 				break;
@@ -2205,7 +2204,7 @@ wait_again:
 
 	/* Each pass through the following loop wakes a follower, if needed. */
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
-		if (ACCESS_ONCE(rdp->nocb_head))
+		if (READ_ONCE(rdp->nocb_head))
 			my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
 		if (!rdp->nocb_gp_head)
 			continue; /* No CBs, so no need to wake follower. */
@@ -2241,7 +2240,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    "FollowerSleep");
 			wait_event_interruptible(rdp->nocb_wq,
-						 ACCESS_ONCE(rdp->nocb_follower_head));
+						 READ_ONCE(rdp->nocb_follower_head));
 		} else if (firsttime) {
 			/* Don't drown trace log with "Poll"! */
 			firsttime = false;
@@ -2282,10 +2281,10 @@ static int rcu_nocb_kthread(void *arg)
 			nocb_follower_wait(rdp);
 
 		/* Pull the ready-to-invoke callbacks onto local list. */
-		list = ACCESS_ONCE(rdp->nocb_follower_head);
+		list = READ_ONCE(rdp->nocb_follower_head);
 		BUG_ON(!list);
 		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
-		ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+		WRITE_ONCE(rdp->nocb_follower_head, NULL);
 		tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
 
 		/* Each pass through the following loop invokes a callback. */
@@ -2324,7 +2323,7 @@ static int rcu_nocb_kthread(void *arg)
 /* Is a deferred wakeup of rcu_nocb_kthread() required? */
 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
 {
-	return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+	return READ_ONCE(rdp->nocb_defer_wakeup);
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread(). */
@@ -2334,8 +2333,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 
 	if (!rcu_nocb_need_deferred_wakeup(rdp))
 		return;
-	ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
-	ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
+	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
+	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
 	wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
 	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
 }
@@ -2448,7 +2447,7 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
 	t = kthread_run(rcu_nocb_kthread, rdp_spawn,
 			"rcuo%c/%d", rsp->abbr, cpu);
 	BUG_ON(IS_ERR(t));
-	ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
+	WRITE_ONCE(rdp_spawn->nocb_kthread, t);
 }
 
 /*
@@ -2663,7 +2662,7 @@ static void rcu_sysidle_enter(int irq)
 
 	/* Record start of fully idle period. */
 	j = jiffies;
-	ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
+	WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
 	smp_mb__before_atomic();
 	atomic_inc(&rdtp->dynticks_idle);
 	smp_mb__after_atomic();
@@ -2681,7 +2680,7 @@ static void rcu_sysidle_enter(int irq)
  */
 void rcu_sysidle_force_exit(void)
 {
-	int oldstate = ACCESS_ONCE(full_sysidle_state);
+	int oldstate = READ_ONCE(full_sysidle_state);
 	int newoldstate;
 
 	/*
@@ -2794,7 +2793,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
 	smp_mb(); /* Read counters before timestamps. */
 
 	/* Pick up timestamps. */
-	j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
+	j = READ_ONCE(rdtp->dynticks_idle_jiffies);
 	/* If this CPU entered idle more recently, update maxj timestamp. */
 	if (ULONG_CMP_LT(*maxj, j))
 		*maxj = j;
@@ -2831,11 +2830,11 @@ static unsigned long rcu_sysidle_delay(void)
 static void rcu_sysidle(unsigned long j)
 {
 	/* Check the current state. */
-	switch (ACCESS_ONCE(full_sysidle_state)) {
+	switch (READ_ONCE(full_sysidle_state)) {
 	case RCU_SYSIDLE_NOT:
 
 		/* First time all are idle, so note a short idle period. */
-		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
+		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
 		break;
 
 	case RCU_SYSIDLE_SHORT:
@@ -2873,7 +2872,7 @@ static void rcu_sysidle_cancel(void)
 {
 	smp_mb();
 	if (full_sysidle_state > RCU_SYSIDLE_SHORT)
-		ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
 }
 
 /*
@@ -2925,7 +2924,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
 	smp_mb();  /* grace period precedes setting inuse. */
 
 	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
-	ACCESS_ONCE(rshp->inuse) = 0;
+	WRITE_ONCE(rshp->inuse, 0);
 }
 
 /*
@@ -2936,7 +2935,7 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
 bool rcu_sys_is_idle(void)
 {
 	static struct rcu_sysidle_head rsh;
-	int rss = ACCESS_ONCE(full_sysidle_state);
+	int rss = READ_ONCE(full_sysidle_state);
 
 	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
 		return false;
@@ -2964,7 +2963,7 @@ bool rcu_sys_is_idle(void)
 			}
 			rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
 			oldrss = rss;
-			rss = ACCESS_ONCE(full_sysidle_state);
+			rss = READ_ONCE(full_sysidle_state);
 		}
 	}
 
@@ -3048,7 +3047,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_cpu(smp_processor_id()) &&
 	    (!rcu_gp_in_progress(rsp) ||
-	     ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
+	     ULONG_CMP_LT(jiffies, READ_ONCE(rsp->gp_start) + HZ)))
 		return 1;
 #endif /* #ifdef CONFIG_NO_HZ_FULL */
 	return 0;
@@ -3077,7 +3076,7 @@ static void rcu_bind_gp_kthread(void)
 static void rcu_dynticks_task_enter(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-	ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
+	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
 }
 
@@ -3085,6 +3084,6 @@ static void rcu_dynticks_task_enter(void)
 static void rcu_dynticks_task_exit(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
-	ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
+	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
 }
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index f92361efd0f5..3ea7ffc7d5c4 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -277,7 +277,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 	seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
 		   rsp->n_force_qs, rsp->n_force_qs_ngp,
 		   rsp->n_force_qs - rsp->n_force_qs_ngp,
-		   ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
+		   READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
 	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
 		if (rnp->level != level) {
 			seq_puts(m, "\n");
@@ -323,8 +323,8 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
 	struct rcu_node *rnp = &rsp->node[0];
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	completed = ACCESS_ONCE(rsp->completed);
-	gpnum = ACCESS_ONCE(rsp->gpnum);
+	completed = READ_ONCE(rsp->completed);
+	gpnum = READ_ONCE(rsp->gpnum);
 	if (completed == gpnum)
 		gpage = 0;
 	else
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1f133350da01..afaecb7a799a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -150,14 +150,14 @@ void __rcu_read_unlock(void)
 		barrier();  /* critical section before exit code. */
 		t->rcu_read_lock_nesting = INT_MIN;
 		barrier();  /* assign before ->rcu_read_unlock_special load */
-		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
+		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
 			rcu_read_unlock_special(t);
 		barrier();  /* ->rcu_read_unlock_special load before assign */
 		t->rcu_read_lock_nesting = 0;
 	}
 #ifdef CONFIG_PROVE_LOCKING
 	{
-		int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
+		int rrln = READ_ONCE(t->rcu_read_lock_nesting);
 
 		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
 	}
@@ -389,17 +389,17 @@ module_param(rcu_cpu_stall_timeout, int, 0644);
 
 int rcu_jiffies_till_stall_check(void)
 {
-	int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
 
 	/*
 	 * Limit check must be consistent with the Kconfig limits
 	 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
 	 */
 	if (till_stall_check < 3) {
-		ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+		WRITE_ONCE(rcu_cpu_stall_timeout, 3);
 		till_stall_check = 3;
 	} else if (till_stall_check > 300) {
-		ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+		WRITE_ONCE(rcu_cpu_stall_timeout, 300);
 		till_stall_check = 300;
 	}
 	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
@@ -550,12 +550,12 @@ static void check_holdout_task(struct task_struct *t,
 {
 	int cpu;
 
-	if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
-	    t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
-	    !ACCESS_ONCE(t->on_rq) ||
+	if (!READ_ONCE(t->rcu_tasks_holdout) ||
+	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+	    !READ_ONCE(t->on_rq) ||
 	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
 	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
-		ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+		WRITE_ONCE(t->rcu_tasks_holdout, false);
 		list_del_init(&t->rcu_tasks_holdout_list);
 		put_task_struct(t);
 		return;
@@ -639,11 +639,11 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 		 */
 		rcu_read_lock();
 		for_each_process_thread(g, t) {
-			if (t != current && ACCESS_ONCE(t->on_rq) &&
+			if (t != current && READ_ONCE(t->on_rq) &&
 			    !is_idle_task(t)) {
 				get_task_struct(t);
-				t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
-				ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+				t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+				WRITE_ONCE(t->rcu_tasks_holdout, true);
 				list_add(&t->rcu_tasks_holdout_list,
 					 &rcu_tasks_holdouts);
 			}
@@ -672,7 +672,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 			struct task_struct *t1;
 
 			schedule_timeout_interruptible(HZ);
-			rtst = ACCESS_ONCE(rcu_task_stall_timeout);
+			rtst = READ_ONCE(rcu_task_stall_timeout);
 			needreport = rtst > 0 &&
 				     time_after(jiffies, lastreport + rtst);
 			if (needreport)
@@ -728,7 +728,7 @@ static void rcu_spawn_tasks_kthread(void)
 	static struct task_struct *rcu_tasks_kthread_ptr;
 	struct task_struct *t;
 
-	if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
+	if (READ_ONCE(rcu_tasks_kthread_ptr)) {
 		smp_mb(); /* Ensure caller sees full kthread. */
 		return;
 	}
@@ -740,7 +740,7 @@ static void rcu_spawn_tasks_kthread(void)
 	t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
 	BUG_ON(IS_ERR(t));
 	smp_mb(); /* Ensure others see full kthread. */
-	ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
+	WRITE_ONCE(rcu_tasks_kthread_ptr, t);
 	mutex_unlock(&rcu_tasks_kthread_mutex);
 }
 
diff --git a/kernel/torture.c b/kernel/torture.c
index dd70993c266c..3e4840633d3e 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -409,7 +409,7 @@ static void (*torture_shutdown_hook)(void);
  */
 void torture_shutdown_absorb(const char *title)
 {
-	while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+	while (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
 		pr_notice("torture thread %s parking due to system shutdown\n",
 			  title);
 		schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -480,9 +480,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1,
 				   unsigned long unused2, void *unused3)
 {
 	mutex_lock(&fullstop_mutex);
-	if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+	if (READ_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
 		VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
-		ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+		WRITE_ONCE(fullstop, FULLSTOP_SHUTDOWN);
 	} else {
 		pr_warn("Concurrent rmmod and shutdown illegal!\n");
 	}
@@ -523,13 +523,13 @@ static int stutter;
  */
 void stutter_wait(const char *title)
 {
-	while (ACCESS_ONCE(stutter_pause_test) ||
-	       (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+	while (READ_ONCE(stutter_pause_test) ||
+	       (torture_runnable && !READ_ONCE(*torture_runnable))) {
 		if (stutter_pause_test)
-			if (ACCESS_ONCE(stutter_pause_test) == 1)
+			if (READ_ONCE(stutter_pause_test) == 1)
 				schedule_timeout_interruptible(1);
 			else
-				while (ACCESS_ONCE(stutter_pause_test))
+				while (READ_ONCE(stutter_pause_test))
 					cond_resched();
 		else
 			schedule_timeout_interruptible(round_jiffies_relative(HZ));
@@ -549,14 +549,14 @@ static int torture_stutter(void *arg)
 		if (!torture_must_stop()) {
 			if (stutter > 1) {
 				schedule_timeout_interruptible(stutter - 1);
-				ACCESS_ONCE(stutter_pause_test) = 2;
+				WRITE_ONCE(stutter_pause_test, 2);
 			}
 			schedule_timeout_interruptible(1);
-			ACCESS_ONCE(stutter_pause_test) = 1;
+			WRITE_ONCE(stutter_pause_test, 1);
 		}
 		if (!torture_must_stop())
 			schedule_timeout_interruptible(stutter);
-		ACCESS_ONCE(stutter_pause_test) = 0;
+		WRITE_ONCE(stutter_pause_test, 0);
 		torture_shutdown_absorb("torture_stutter");
 	} while (!torture_must_stop());
 	torture_kthread_stopping("torture_stutter");
@@ -642,13 +642,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
 bool torture_cleanup_begin(void)
 {
 	mutex_lock(&fullstop_mutex);
-	if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+	if (READ_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
 		pr_warn("Concurrent rmmod and shutdown illegal!\n");
 		mutex_unlock(&fullstop_mutex);
 		schedule_timeout_uninterruptible(10);
 		return true;
 	}
-	ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+	WRITE_ONCE(fullstop, FULLSTOP_RMMOD);
 	mutex_unlock(&fullstop_mutex);
 	torture_shutdown_cleanup();
 	torture_shuffle_cleanup();
@@ -681,7 +681,7 @@ EXPORT_SYMBOL_GPL(torture_must_stop);
  */
 bool torture_must_stop_irq(void)
 {
-	return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+	return READ_ONCE(fullstop) != FULLSTOP_DONTSTOP;
 }
 EXPORT_SYMBOL_GPL(torture_must_stop_irq);
 
-- 
cgit v1.2.3


From 1ebee8017d84ec8a0ba893cf7b8be3f70ead088b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 19 Apr 2015 18:21:47 -0700
Subject: rcu: Eliminate array-index-based RCU primitives

Now that rcu_access_index() and rcu_dereference_index_check() are no
longer used, the commit removes them from the RCU API.  This means that
RCU's data dependencies now involve only pointers, give or take the
occasional cast to and then back from an integer type to do pointer
arithmetic.  This in turn eliminates the need for a number of operations
on values carrying RCU data dependencies.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: linux-edac@vger.kernel.org
Cc: Tony Luck <tony.luck@intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
---
 include/linux/rcupdate.h | 50 ------------------------------------------------
 1 file changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 87bb0eee665b..b97842ff71d2 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -628,21 +628,6 @@ static inline void rcu_preempt_sleep_check(void)
 	((typeof(*p) __force __kernel *)(p)); \
 })
 
-#define __rcu_access_index(p, space) \
-({ \
-	typeof(p) _________p1 = READ_ONCE(p); \
-	rcu_dereference_sparse(p, space); \
-	(_________p1); \
-})
-#define __rcu_dereference_index_check(p, c) \
-({ \
-	/* Dependency order vs. p above. */ \
-	typeof(p) _________p1 = lockless_dereference(p); \
-	rcu_lockdep_assert(c, \
-			   "suspicious rcu_dereference_index_check() usage"); \
-	(_________p1); \
-})
-
 /**
  * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
  * @v: The value to statically initialize with.
@@ -786,41 +771,6 @@ static inline void rcu_preempt_sleep_check(void)
  */
 #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
 
-/**
- * rcu_access_index() - fetch RCU index with no dereferencing
- * @p: The index to read
- *
- * Return the value of the specified RCU-protected index, but omit the
- * smp_read_barrier_depends() and keep the READ_ONCE().  This is useful
- * when the value of this index is accessed, but the index is not
- * dereferenced, for example, when testing an RCU-protected index against
- * -1.  Although rcu_access_index() may also be used in cases where
- * update-side locks prevent the value of the index from changing, you
- * should instead use rcu_dereference_index_protected() for this use case.
- */
-#define rcu_access_index(p) __rcu_access_index((p), __rcu)
-
-/**
- * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * Similar to rcu_dereference_check(), but omits the sparse checking.
- * This allows rcu_dereference_index_check() to be used on integers,
- * which can then be used as array indices.  Attempting to use
- * rcu_dereference_check() on an integer will give compiler warnings
- * because the sparse address-space mechanism relies on dereferencing
- * the RCU-protected pointer.  Dereferencing integers is not something
- * that even gcc will put up with.
- *
- * Note that this function does not implicitly check for RCU read-side
- * critical sections.  If this function gains lots of uses, it might
- * make sense to provide versions for each flavor of RCU, but it does
- * not make sense as of early 2010.
- */
-#define rcu_dereference_index_check(p, c) \
-	__rcu_dereference_index_check((p), (c))
-
 /**
  * rcu_dereference_protected() - fetch RCU pointer when updates prevented
  * @p: The pointer to read, prior to dereferencing
-- 
cgit v1.2.3


From d956028e99b30726b0bce0ca684b40b1ad67b514 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 31 Mar 2015 09:39:41 +0100
Subject: documentation: memory-barriers: Fix smp_mb__before_spinlock()
 semantics

Our current documentation claims that, when followed by an ACQUIRE,
smp_mb__before_spinlock() orders prior loads against subsequent loads
and stores, which isn't the intent.  This commit therefore fixes the
documentation to state that this sequence orders only prior stores
against subsequent loads and stores.

In addition, the original intent of smp_mb__before_spinlock() was to only
order prior loads against subsequent stores, however, people have started
using it as if it ordered prior loads against subsequent loads and stores.
This commit therefore also updates smp_mb__before_spinlock()'s header
comment to reflect this new reality.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/memory-barriers.txt | 7 +++----
 include/linux/spinlock.h          | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f95746189b5d..1f362fd2ecb4 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1784,10 +1784,9 @@ for each construct.  These operations all imply certain barriers:
 
      Memory operations issued before the ACQUIRE may be completed after
      the ACQUIRE operation has completed.  An smp_mb__before_spinlock(),
-     combined with a following ACQUIRE, orders prior loads against
-     subsequent loads and stores and also orders prior stores against
-     subsequent stores.  Note that this is weaker than smp_mb()!  The
-     smp_mb__before_spinlock() primitive is free on many architectures.
+     combined with a following ACQUIRE, orders prior stores against
+     subsequent loads and stores. Note that this is weaker than smp_mb()!
+     The smp_mb__before_spinlock() primitive is free on many architectures.
 
  (2) RELEASE operation implication:
 
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 3e18379dfa6f..0063b24b4f36 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -120,7 +120,7 @@ do {								\
 /*
  * Despite its name it doesn't necessarily has to be a full barrier.
  * It should only guarantee that a STORE before the critical section
- * can not be reordered with a LOAD inside this section.
+ * can not be reordered with LOADs and STOREs inside this section.
  * spin_lock() is the one-way barrier, this LOAD can not escape out
  * of the region. So the default implementation simply ensures that
  * a STORE can not move into the critical section, smp_wmb() should
-- 
cgit v1.2.3


From 3382adbc1bb8c80ea512243acf6059564287620b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 4 Mar 2015 15:41:24 -0800
Subject: rcu: Eliminate a few CONFIG_RCU_NOCB_CPU_ALL #ifdefs

This commit converts several CONFIG_RCU_NOCB_CPU_ALL #ifdefs to
instead use IS_ENABLED().  This change should help avoid hiding
code from compiler diagnostics.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |  4 ++--
 include/linux/rcutree.h  |  2 --
 kernel/rcu/tree_plugin.h | 22 ++++++++++++----------
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 87bb0eee665b..5ec20bc4af76 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1153,13 +1153,13 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 #define kfree_rcu(ptr, rcu_head)					\
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
 
-#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
+#ifdef CONFIG_TINY_RCU
 static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
 {
 	*delta_jiffies = ULONG_MAX;
 	return 0;
 }
-#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
+#endif /* #ifdef CONFIG_TINY_RCU */
 
 #if defined(CONFIG_RCU_NOCB_CPU_ALL)
 static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d2e583a6aaca..0bd400b02430 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -31,9 +31,7 @@
 #define __LINUX_RCUTREE_H
 
 void rcu_note_context_switch(void);
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(unsigned long *delta_jiffies);
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 void rcu_cpu_stall_reset(void);
 
 /*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 5c0122f09ed0..0730bfcf65db 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1372,13 +1372,12 @@ static void rcu_prepare_kthreads(int cpu)
  * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
  * any flavor of RCU.
  */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(unsigned long *delta_jiffies)
 {
 	*delta_jiffies = ULONG_MAX;
-	return rcu_cpu_has_callbacks(NULL);
+	return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
+	       ? 0 : rcu_cpu_has_callbacks(NULL);
 }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1485,11 +1484,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
  *
  * The caller must have disabled interrupts.
  */
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
 int rcu_needs_cpu(unsigned long *dj)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
+		*dj = ULONG_MAX;
+		return 0;
+	}
+
 	/* Snapshot to detect later posting of non-lazy callback. */
 	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
 
@@ -1516,7 +1519,6 @@ int rcu_needs_cpu(unsigned long *dj)
 	}
 	return 0;
 }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Prepare a CPU for idle from an RCU perspective.  The first major task
@@ -1530,7 +1532,6 @@ int rcu_needs_cpu(unsigned long *dj)
  */
 static void rcu_prepare_for_idle(void)
 {
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
 	bool needwake;
 	struct rcu_data *rdp;
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
@@ -1538,6 +1539,9 @@ static void rcu_prepare_for_idle(void)
 	struct rcu_state *rsp;
 	int tne;
 
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+		return;
+
 	/* Handle nohz enablement switches conservatively. */
 	tne = READ_ONCE(tick_nohz_active);
 	if (tne != rdtp->tick_nohz_enabled_snap) {
@@ -1585,7 +1589,6 @@ static void rcu_prepare_for_idle(void)
 		if (needwake)
 			rcu_gp_kthread_wake(rsp);
 	}
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
@@ -1595,12 +1598,11 @@ static void rcu_prepare_for_idle(void)
  */
 static void rcu_cleanup_after_idle(void)
 {
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
-	if (rcu_is_nocb_cpu(smp_processor_id()))
+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+	    rcu_is_nocb_cpu(smp_processor_id()))
 		return;
 	if (rcu_try_advance_all_cbs())
 		invoke_rcu_core();
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 }
 
 /*
-- 
cgit v1.2.3


From 5af4692a75daf08dddc93dbb4cd2a1b3d3b617af Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 25 Apr 2015 12:48:29 -0700
Subject: smp: Make control dependencies work on Alpha, improve documentation

The current formulation of control dependencies fails on DEC Alpha,
which does not respect dependencies of any kind unless an explicit
memory barrier is provided.  This means that the current fomulation of
control dependencies fails on Alpha.  This commit therefore creates a
READ_ONCE_CTRL() that has the same overhead on non-Alpha systems, but
causes Alpha to produce the needed ordering.  This commit also applies
READ_ONCE_CTRL() to the one known use of control dependencies.

Use of READ_ONCE_CTRL() also has the beneficial effect of adding a bit
of self-documentation to control dependencies.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 Documentation/memory-barriers.txt | 55 +++++++++++++++++++++++----------------
 include/linux/compiler.h          | 16 ++++++++++++
 kernel/events/ring_buffer.c       |  2 +-
 3 files changed, 50 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f95746189b5d..a3014bcc5b08 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -617,16 +617,16 @@ case what's actually required is:
 However, stores are not speculated.  This means that ordering -is- provided
 for load-store control dependencies, as in the following example:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (q) {
 		ACCESS_ONCE(b) = p;
 	}
 
-Control dependencies pair normally with other types of barriers.
-That said, please note that ACCESS_ONCE() is not optional!  Without the
-ACCESS_ONCE(), might combine the load from 'a' with other loads from
-'a', and the store to 'b' with other stores to 'b', with possible highly
-counterintuitive effects on ordering.
+Control dependencies pair normally with other types of barriers.  That
+said, please note that READ_ONCE_CTRL() is not optional!  Without the
+READ_ONCE_CTRL(), the compiler might combine the load from 'a' with
+other loads from 'a', and the store to 'b' with other stores to 'b',
+with possible highly counterintuitive effects on ordering.
 
 Worse yet, if the compiler is able to prove (say) that the value of
 variable 'a' is always non-zero, it would be well within its rights
@@ -636,12 +636,15 @@ as follows:
 	q = a;
 	b = p;  /* BUG: Compiler and CPU can both reorder!!! */
 
-So don't leave out the ACCESS_ONCE().
+Finally, the READ_ONCE_CTRL() includes an smp_read_barrier_depends()
+that DEC Alpha needs in order to respect control depedencies.
+
+So don't leave out the READ_ONCE_CTRL().
 
 It is tempting to try to enforce ordering on identical stores on both
 branches of the "if" statement as follows:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (q) {
 		barrier();
 		ACCESS_ONCE(b) = p;
@@ -655,7 +658,7 @@ branches of the "if" statement as follows:
 Unfortunately, current compilers will transform this as follows at high
 optimization levels:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	barrier();
 	ACCESS_ONCE(b) = p;  /* BUG: No ordering vs. load from a!!! */
 	if (q) {
@@ -685,7 +688,7 @@ memory barriers, for example, smp_store_release():
 In contrast, without explicit memory barriers, two-legged-if control
 ordering is guaranteed only when the stores differ, for example:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (q) {
 		ACCESS_ONCE(b) = p;
 		do_something();
@@ -694,14 +697,14 @@ ordering is guaranteed only when the stores differ, for example:
 		do_something_else();
 	}
 
-The initial ACCESS_ONCE() is still required to prevent the compiler from
-proving the value of 'a'.
+The initial READ_ONCE_CTRL() is still required to prevent the compiler
+from proving the value of 'a'.
 
 In addition, you need to be careful what you do with the local variable 'q',
 otherwise the compiler might be able to guess the value and again remove
 the needed conditional.  For example:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (q % MAX) {
 		ACCESS_ONCE(b) = p;
 		do_something();
@@ -714,7 +717,7 @@ If MAX is defined to be 1, then the compiler knows that (q % MAX) is
 equal to zero, in which case the compiler is within its rights to
 transform the above code into the following:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	ACCESS_ONCE(b) = p;
 	do_something_else();
 
@@ -725,7 +728,7 @@ is gone, and the barrier won't bring it back.  Therefore, if you are
 relying on this ordering, you should make sure that MAX is greater than
 one, perhaps as follows:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */
 	if (q % MAX) {
 		ACCESS_ONCE(b) = p;
@@ -742,14 +745,15 @@ of the 'if' statement.
 You must also be careful not to rely too much on boolean short-circuit
 evaluation.  Consider this example:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	if (a || 1 > 0)
 		ACCESS_ONCE(b) = 1;
 
-Because the second condition is always true, the compiler can transform
-this example as following, defeating control dependency:
+Because the first condition cannot fault and the second condition is
+always true, the compiler can transform this example as following,
+defeating control dependency:
 
-	q = ACCESS_ONCE(a);
+	q = READ_ONCE_CTRL(a);
 	ACCESS_ONCE(b) = 1;
 
 This example underscores the need to ensure that the compiler cannot
@@ -762,8 +766,8 @@ demonstrated by two related examples, with the initial values of
 x and y both being zero:
 
 	CPU 0                     CPU 1
-	=====================     =====================
-	r1 = ACCESS_ONCE(x);      r2 = ACCESS_ONCE(y);
+	=======================   =======================
+	r1 = READ_ONCE_CTRL(x);   r2 = READ_ONCE_CTRL(y);
 	if (r1 > 0)               if (r2 > 0)
 	  ACCESS_ONCE(y) = 1;       ACCESS_ONCE(x) = 1;
 
@@ -783,7 +787,8 @@ But because control dependencies do -not- provide transitivity, the above
 assertion can fail after the combined three-CPU example completes.  If you
 need the three-CPU example to provide ordering, you will need smp_mb()
 between the loads and stores in the CPU 0 and CPU 1 code fragments,
-that is, just before or just after the "if" statements.
+that is, just before or just after the "if" statements.  Furthermore,
+the original two-CPU example is very fragile and should be avoided.
 
 These two examples are the LB and WWC litmus tests from this paper:
 http://www.cl.cam.ac.uk/users/pes20/ppc-supplemental/test6.pdf and this
@@ -791,6 +796,12 @@ site: https://www.cl.cam.ac.uk/~pes20/ppcmem/index.html.
 
 In summary:
 
+  (*) Control dependencies must be headed by READ_ONCE_CTRL().
+      Or, as a much less preferable alternative, interpose
+      be headed by READ_ONCE() or an ACCESS_ONCE() read and must
+      have smp_read_barrier_depends() between this read and the
+      control-dependent write.
+
   (*) Control dependencies can order prior loads against later stores.
       However, they do -not- guarantee any other sort of ordering:
       Not prior loads against later loads, nor prior stores against
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 867722591be2..5d66777914db 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -252,6 +252,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 #define WRITE_ONCE(x, val) \
 	({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
 
+/**
+ * READ_ONCE_CTRL - Read a value heading a control dependency
+ * @x: The value to be read, heading the control dependency
+ *
+ * Control dependencies are tricky.  See Documentation/memory-barriers.txt
+ * for important information on how to use them.  Note that in many cases,
+ * use of smp_load_acquire() will be much simpler.  Control dependencies
+ * should be avoided except on the hottest of hotpaths.
+ */
+#define READ_ONCE_CTRL(x) \
+({ \
+	typeof(x) __val = READ_ONCE(x); \
+	smp_read_barrier_depends(); /* Enforce control dependency. */ \
+	__val; \
+})
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 232f00f273cb..17fcb73c4a50 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_get_handle(handle);
 
 	do {
-		tail = ACCESS_ONCE(rb->user_page->data_tail);
+		tail = READ_ONCE_CTRL(rb->user_page->data_tail);
 		offset = head = local_read(&rb->head);
 		if (!rb->overwrite &&
 		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
-- 
cgit v1.2.3


From f517700cce37ffcb36e7afae0294fd11c72ed134 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Thu, 26 Mar 2015 13:27:08 +0800
Subject: rculist: Fix another sparse warning

This fixes the following sparse warnings:

make C=1 CF=-D__CHECK_ENDIAN__ net/tipc/name_table.o
net/tipc/name_table.c:977:17: error: incompatible types in comparison expression (different address spaces)
net/tipc/name_table.c:977:17: error: incompatible types in comparison expression (different address spaces)

To silence these spare complaints, an RCU annotation should be added to
"next" pointer of hlist_node structure through hlist_next_rcu() macro
when iterating over a hlist with hlist_for_each_entry_from_rcu().

Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rculist.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 665397247e82..17c6b1f84a77 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -549,8 +549,8 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
  */
 #define hlist_for_each_entry_from_rcu(pos, member)			\
 	for (; pos;							\
-	     pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\
-			typeof(*(pos)), member))
+	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(	\
+			&(pos)->member)), typeof(*(pos)), member))
 
 #endif	/* __KERNEL__ */
 #endif
-- 
cgit v1.2.3


From 51952bc633064311410b041fad38da1614f4539e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 21 Apr 2015 11:15:30 -0700
Subject: rcu: Further shrink Tiny RCU by making empty functions static inlines

The Tiny RCU counterparts to rcu_idle_enter(), rcu_idle_exit(),
rcu_irq_enter(), and rcu_irq_exit() are empty functions, but each has
EXPORT_SYMBOL_GPL(), which needlessly consumes extra memory, especially
in kernels built with module support.  This commit therefore moves these
functions to static inlines in rcutiny.h, removing the need for exports.

This won't affect the size of the tiniest kernels, which are likely
built without module support, but might help semi-tiny kernels that
might include module support.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/rcupdate.h |  4 ----
 include/linux/rcutiny.h  | 16 ++++++++++++++++
 include/linux/rcutree.h  |  5 +++++
 kernel/rcu/tiny.c        | 33 ---------------------------------
 4 files changed, 21 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 87bb0eee665b..1b3d7bcb3a6c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -292,10 +292,6 @@ void rcu_sched_qs(void);
 void rcu_bh_qs(void);
 void rcu_check_callbacks(int user);
 struct notifier_block;
-void rcu_idle_enter(void);
-void rcu_idle_exit(void);
-void rcu_irq_enter(void);
-void rcu_irq_exit(void);
 int rcu_cpu_notify(struct notifier_block *self,
 		   unsigned long action, void *hcpu);
 
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 937edaeb150d..3df6c1ec4e25 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -159,6 +159,22 @@ static inline void rcu_cpu_stall_reset(void)
 {
 }
 
+static inline void rcu_idle_enter(void)
+{
+}
+
+static inline void rcu_idle_exit(void)
+{
+}
+
+static inline void rcu_irq_enter(void)
+{
+}
+
+static inline void rcu_irq_exit(void)
+{
+}
+
 static inline void exit_rcu(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d2e583a6aaca..f22d83f49e56 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -93,6 +93,11 @@ void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
 
+void rcu_idle_enter(void);
+void rcu_idle_exit(void);
+void rcu_irq_enter(void);
+void rcu_irq_exit(void);
+
 void exit_rcu(void);
 
 void rcu_scheduler_starting(void);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 069742d61c68..a501b4ab9b1c 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -49,39 +49,6 @@ static void __call_rcu(struct rcu_head *head,
 
 #include "tiny_plugin.h"
 
-/*
- * Enter idle, which is an extended quiescent state if we have fully
- * entered that mode.
- */
-void rcu_idle_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-
-/*
- * Exit an interrupt handler towards idle.
- */
-void rcu_irq_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_exit);
-
-/*
- * Exit idle, so that we are no longer in an extended quiescent state.
- */
-void rcu_idle_exit(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
-/*
- * Enter an interrupt handler, moving away from idle.
- */
-void rcu_irq_enter(void)
-{
-}
-EXPORT_SYMBOL_GPL(rcu_irq_enter);
-
 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
 
 /*
-- 
cgit v1.2.3


From ad5fb870c486d932a1749d7853dd70f436a7e03f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 3 Apr 2015 12:05:28 -0400
Subject: e820, efi: add ACPI 6.0 persistent memory types

ACPI 6.0 formalizes e820-type-7 and efi-type-14 as persistent memory.
Mark it "reserved" and allow it to be claimed by a persistent memory
device driver.

This definition is in addition to the Linux kernel's existing type-12
definition that was recently added in support of shipping platforms with
NVDIMM support that predate ACPI 6.0 (which now classifies type-12 as
OEM reserved).

Note, /proc/iomem can be consulted for differentiating legacy
"Persistent Memory (legacy)" E820_PRAM vs standard "Persistent Memory"
E820_PMEM.

Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/arm64/kernel/efi.c          |  1 +
 arch/ia64/kernel/efi.c           |  4 ++++
 arch/x86/boot/compressed/eboot.c |  4 ++++
 arch/x86/include/uapi/asm/e820.h |  1 +
 arch/x86/kernel/e820.c           | 28 ++++++++++++++++++++++++----
 arch/x86/platform/efi/efi.c      |  3 +++
 include/linux/efi.h              |  3 ++-
 7 files changed, 39 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index ab21e0d58278..9d4aa18f2a82 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -158,6 +158,7 @@ static __init int is_reserve_region(efi_memory_desc_t *md)
 	case EFI_BOOT_SERVICES_CODE:
 	case EFI_BOOT_SERVICES_DATA:
 	case EFI_CONVENTIONAL_MEMORY:
+	case EFI_PERSISTENT_MEMORY:
 		return 0;
 	default:
 		break;
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index c52d7540dc05..5f6be9dd6968 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1223,6 +1223,10 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 				flags |= IORESOURCE_DISABLED;
 				break;
 
+			case EFI_PERSISTENT_MEMORY:
+				name = "Persistent Memory";
+				break;
+
 			case EFI_RESERVED_TYPE:
 			case EFI_RUNTIME_SERVICES_CODE:
 			case EFI_RUNTIME_SERVICES_DATA:
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 48304b89b601..2c82bd150d43 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1224,6 +1224,10 @@ static efi_status_t setup_e820(struct boot_params *params,
 			e820_type = E820_NVS;
 			break;
 
+		case EFI_PERSISTENT_MEMORY:
+			e820_type = E820_PMEM;
+			break;
+
 		default:
 			continue;
 		}
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 960a8a9dc4ab..0f457e6eab18 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -32,6 +32,7 @@
 #define E820_ACPI	3
 #define E820_NVS	4
 #define E820_UNUSABLE	5
+#define E820_PMEM	7
 
 /*
  * This is a non-standardized way to represent ADR or NVDIMM regions that
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e2ce85db2283..c857d53269dd 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,7 @@ static void __init e820_print_type(u32 type)
 	case E820_UNUSABLE:
 		printk(KERN_CONT "unusable");
 		break;
+	case E820_PMEM:
 	case E820_PRAM:
 		printk(KERN_CONT "persistent (type %u)", type);
 		break;
@@ -918,11 +919,32 @@ static inline const char *e820_type_to_string(int e820_type)
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
 	case E820_UNUSABLE:	return "Unusable memory";
-	case E820_PRAM: return "Persistent RAM";
+	case E820_PRAM: return "Persistent Memory (legacy)";
+	case E820_PMEM: return "Persistent Memory";
 	default:	return "reserved";
 	}
 }
 
+static bool do_mark_busy(u32 type, struct resource *res)
+{
+	/* this is the legacy bios/dos rom-shadow + mmio region */
+	if (res->start < (1ULL<<20))
+		return true;
+
+	/*
+	 * Treat persistent memory like device memory, i.e. reserve it
+	 * for exclusive use of a driver
+	 */
+	switch (type) {
+	case E820_RESERVED:
+	case E820_PRAM:
+	case E820_PMEM:
+		return false;
+	default:
+		return true;
+	}
+}
+
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
@@ -952,9 +974,7 @@ void __init e820_reserve_resources(void)
 		 * pci device BAR resource and insert them later in
 		 * pcibios_resource_survey()
 		 */
-		if (((e820.map[i].type != E820_RESERVED) &&
-		     (e820.map[i].type != E820_PRAM)) ||
-		     res->start < (1ULL<<20)) {
+		if (do_mark_busy(e820.map[i].type, res)) {
 			res->flags |= IORESOURCE_BUSY;
 			insert_resource(&iomem_resource, res);
 		}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 02744df576d5..fe01ae37a2a4 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -153,6 +153,9 @@ static void __init do_add_efi_memmap(void)
 		case EFI_UNUSABLE_MEMORY:
 			e820_type = E820_UNUSABLE;
 			break;
+		case EFI_PERSISTENT_MEMORY:
+			e820_type = E820_PMEM;
+			break;
 		default:
 			/*
 			 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
diff --git a/include/linux/efi.h b/include/linux/efi.h
index af5be0368dec..825b6e3d69cb 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -85,7 +85,8 @@ typedef	struct {
 #define EFI_MEMORY_MAPPED_IO		11
 #define EFI_MEMORY_MAPPED_IO_PORT_SPACE	12
 #define EFI_PAL_CODE			13
-#define EFI_MAX_MEMORY_TYPE		14
+#define EFI_PERSISTENT_MEMORY		14
+#define EFI_MAX_MEMORY_TYPE		15
 
 /* Attribute values: */
 #define EFI_MEMORY_UC		((u64)0x0000000000000001ULL)	/* uncached */
-- 
cgit v1.2.3


From 0be964be0d45084245673c971d72a4b51690231d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2015 11:09:35 +0930
Subject: module: Sanitize RCU usage and locking

Currently the RCU usage in module is an inconsistent mess of RCU and
RCU-sched, this is broken for CONFIG_PREEMPT where synchronize_rcu()
does not imply synchronize_sched().

Most usage sites use preempt_{dis,en}able() which is RCU-sched, but
(most of) the modification sites use synchronize_rcu(). With the
exception of the module bug list, which actually uses RCU.

Convert everything over to RCU-sched.

Furthermore add lockdep asserts to all sites, because it's not at all
clear to me the required locking is observed, esp. on exported
functions.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 12 ++++++++++--
 kernel/module.c        | 40 ++++++++++++++++++++++++++++++++--------
 lib/bug.c              |  7 +++++--
 3 files changed, 47 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index c883b86ea964..fb56dd85a862 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -421,14 +421,22 @@ struct symsearch {
 	bool unused;
 };
 
-/* Search for an exported symbol by name. */
+/*
+ * Search for an exported symbol by name.
+ *
+ * Must be called with module_mutex held or preemption disabled.
+ */
 const struct kernel_symbol *find_symbol(const char *name,
 					struct module **owner,
 					const unsigned long **crc,
 					bool gplok,
 					bool warn);
 
-/* Walk the exported symbol table */
+/*
+ * Walk the exported symbol table
+ *
+ * Must be called with module_mutex held or preemption disabled.
+ */
 bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
 				    struct module *owner,
 				    void *data), void *data);
diff --git a/kernel/module.c b/kernel/module.c
index 1150d5239205..a15899e00ca9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,6 +105,22 @@ static LIST_HEAD(modules);
 struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
 #endif /* CONFIG_KGDB_KDB */
 
+static void module_assert_mutex(void)
+{
+	lockdep_assert_held(&module_mutex);
+}
+
+static void module_assert_mutex_or_preempt(void)
+{
+#ifdef CONFIG_LOCKDEP
+	if (unlikely(!debug_locks))
+		return;
+
+	WARN_ON(!rcu_read_lock_sched_held() &&
+		!lockdep_is_held(&module_mutex));
+#endif
+}
+
 #ifdef CONFIG_MODULE_SIG
 #ifdef CONFIG_MODULE_SIG_FORCE
 static bool sig_enforce = true;
@@ -318,6 +334,8 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
 #endif
 	};
 
+	module_assert_mutex_or_preempt();
+
 	if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
 		return true;
 
@@ -457,6 +475,8 @@ static struct module *find_module_all(const char *name, size_t len,
 {
 	struct module *mod;
 
+	module_assert_mutex();
+
 	list_for_each_entry(mod, &modules, list) {
 		if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
 			continue;
@@ -1860,8 +1880,8 @@ static void free_module(struct module *mod)
 	list_del_rcu(&mod->list);
 	/* Remove this module from bug list, this uses list_del_rcu */
 	module_bug_cleanup(mod);
-	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
-	synchronize_rcu();
+	/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
+	synchronize_sched();
 	mutex_unlock(&module_mutex);
 
 	/* This may be NULL, but that's OK */
@@ -3133,11 +3153,11 @@ static noinline int do_init_module(struct module *mod)
 	mod->init_text_size = 0;
 	/*
 	 * We want to free module_init, but be aware that kallsyms may be
-	 * walking this with preempt disabled.  In all the failure paths,
-	 * we call synchronize_rcu/synchronize_sched, but we don't want
-	 * to slow down the success path, so use actual RCU here.
+	 * walking this with preempt disabled.  In all the failure paths, we
+	 * call synchronize_sched(), but we don't want to slow down the success
+	 * path, so use actual RCU here.
 	 */
-	call_rcu(&freeinit->rcu, do_free_init);
+	call_rcu_sched(&freeinit->rcu, do_free_init);
 	mutex_unlock(&module_mutex);
 	wake_up_all(&module_wq);
 
@@ -3395,8 +3415,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
 	wake_up_all(&module_wq);
-	/* Wait for RCU synchronizing before releasing mod->list. */
-	synchronize_rcu();
+	/* Wait for RCU-sched synchronizing before releasing mod->list. */
+	synchronize_sched();
 	mutex_unlock(&module_mutex);
  free_module:
 	/* Free lock-classes; relies on the preceding sync_rcu() */
@@ -3663,6 +3683,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
 	unsigned int i;
 	int ret;
 
+	module_assert_mutex();
+
 	list_for_each_entry(mod, &modules, list) {
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
@@ -3837,6 +3859,8 @@ struct module *__module_address(unsigned long addr)
 	if (addr < module_addr_min || addr > module_addr_max)
 		return NULL;
 
+	module_assert_mutex_or_preempt();
+
 	list_for_each_entry_rcu(mod, &modules, list) {
 		if (mod->state == MODULE_STATE_UNFORMED)
 			continue;
diff --git a/lib/bug.c b/lib/bug.c
index 0c3bd9552b6f..cff145f032a5 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -66,7 +66,7 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr)
 	struct module *mod;
 	const struct bug_entry *bug = NULL;
 
-	rcu_read_lock();
+	rcu_read_lock_sched();
 	list_for_each_entry_rcu(mod, &module_bug_list, bug_list) {
 		unsigned i;
 
@@ -77,7 +77,7 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr)
 	}
 	bug = NULL;
 out:
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 
 	return bug;
 }
@@ -88,6 +88,8 @@ void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
 	char *secstrings;
 	unsigned int i;
 
+	lockdep_assert_held(&module_mutex);
+
 	mod->bug_table = NULL;
 	mod->num_bugs = 0;
 
@@ -113,6 +115,7 @@ void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
 
 void module_bug_cleanup(struct module *mod)
 {
+	lockdep_assert_held(&module_mutex);
 	list_del_rcu(&mod->bug_list);
 }
 
-- 
cgit v1.2.3


From d72da4a4d973d8a0a0d3c97e7cdebf287fbe3a99 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2015 11:09:36 +0930
Subject: rbtree: Make lockless searches non-fatal

Change the insert and erase code such that lockless searches are
non-fatal.

In and of itself an rbtree cannot be correctly searched while
in-modification, we can however provide weaker guarantees that will
allow the rbtree to be used in conjunction with other techniques, such
as latches; see 9b0fd802e8c0 ("seqcount: Add raw_write_seqcount_latch()").

For this to work we need the following guarantees from the rbtree
code:

 1) a lockless reader must not see partial stores, this would allow it
    to observe nodes that are invalid memory.

 2) there must not be (temporary) loops in the tree structure in the
    modifier's program order, this would cause a lookup which
    interrupts the modifier to get stuck indefinitely.

For 1) we must use WRITE_ONCE() for all updates to the tree structure;
in particular this patch only does rb_{left,right} as those are the
only element required for simple searches.

It generates slightly worse code, probably because volatile. But in
pointer chasing heavy code a few instructions more should not matter.

For 2) I have carefully audited the code and drawn every intermediate
link state and not found a loop.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: Michel Lespinasse <walken@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/rbtree.h           | 16 +++++++--
 include/linux/rbtree_augmented.h | 21 +++++++----
 lib/rbtree.c                     | 76 ++++++++++++++++++++++++++++------------
 3 files changed, 81 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index fb31765e935a..830c4992088d 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -31,6 +31,7 @@
 
 #include <linux/kernel.h>
 #include <linux/stddef.h>
+#include <linux/rcupdate.h>
 
 struct rb_node {
 	unsigned long  __rb_parent_color;
@@ -73,11 +74,11 @@ extern struct rb_node *rb_first_postorder(const struct rb_root *);
 extern struct rb_node *rb_next_postorder(const struct rb_node *);
 
 /* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 			    struct rb_root *root);
 
-static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
-				struct rb_node ** rb_link)
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+				struct rb_node **rb_link)
 {
 	node->__rb_parent_color = (unsigned long)parent;
 	node->rb_left = node->rb_right = NULL;
@@ -85,6 +86,15 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
 	*rb_link = node;
 }
 
+static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
+				    struct rb_node **rb_link)
+{
+	node->__rb_parent_color = (unsigned long)parent;
+	node->rb_left = node->rb_right = NULL;
+
+	rcu_assign_pointer(*rb_link, node);
+}
+
 #define rb_entry_safe(ptr, type, member) \
 	({ typeof(ptr) ____ptr = (ptr); \
 	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 378c5ee75f78..14d7b831b63a 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -123,11 +123,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new,
 {
 	if (parent) {
 		if (parent->rb_left == old)
-			parent->rb_left = new;
+			WRITE_ONCE(parent->rb_left, new);
 		else
-			parent->rb_right = new;
+			WRITE_ONCE(parent->rb_right, new);
 	} else
-		root->rb_node = new;
+		WRITE_ONCE(root->rb_node, new);
 }
 
 extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
@@ -137,7 +137,8 @@ static __always_inline struct rb_node *
 __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		     const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *child = node->rb_right, *tmp = node->rb_left;
+	struct rb_node *child = node->rb_right;
+	struct rb_node *tmp = node->rb_left;
 	struct rb_node *parent, *rebalance;
 	unsigned long pc;
 
@@ -167,6 +168,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		tmp = parent;
 	} else {
 		struct rb_node *successor = child, *child2;
+
 		tmp = child->rb_left;
 		if (!tmp) {
 			/*
@@ -180,6 +182,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 			 */
 			parent = successor;
 			child2 = successor->rb_right;
+
 			augment->copy(node, successor);
 		} else {
 			/*
@@ -201,19 +204,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 				successor = tmp;
 				tmp = tmp->rb_left;
 			} while (tmp);
-			parent->rb_left = child2 = successor->rb_right;
-			successor->rb_right = child;
+			child2 = successor->rb_right;
+			WRITE_ONCE(parent->rb_left, child2);
+			WRITE_ONCE(successor->rb_right, child);
 			rb_set_parent(child, successor);
+
 			augment->copy(node, successor);
 			augment->propagate(parent, successor);
 		}
 
-		successor->rb_left = tmp = node->rb_left;
+		tmp = node->rb_left;
+		WRITE_ONCE(successor->rb_left, tmp);
 		rb_set_parent(tmp, successor);
 
 		pc = node->__rb_parent_color;
 		tmp = __rb_parent(pc);
 		__rb_change_child(node, successor, tmp, root);
+
 		if (child2) {
 			successor->__rb_parent_color = pc;
 			rb_set_parent_color(child2, parent, RB_BLACK);
diff --git a/lib/rbtree.c b/lib/rbtree.c
index c16c81a3d430..1356454e36de 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -44,6 +44,30 @@
  *  parentheses and have some accompanying text comment.
  */
 
+/*
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
+ * tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- does not get stuck in a loop.
+ *
+ * It also guarantees that if the lookup returns an element it is the 'correct'
+ * one. But not returning an element does _NOT_ mean it's not present.
+ *
+ * NOTE:
+ *
+ * Stores to __rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
+
 static inline void rb_set_black(struct rb_node *rb)
 {
 	rb->__rb_parent_color |= RB_BLACK;
@@ -129,8 +153,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 				 * This still leaves us in violation of 4), the
 				 * continuation into Case 3 will fix that.
 				 */
-				parent->rb_right = tmp = node->rb_left;
-				node->rb_left = parent;
+				tmp = node->rb_left;
+				WRITE_ONCE(parent->rb_right, tmp);
+				WRITE_ONCE(node->rb_left, parent);
 				if (tmp)
 					rb_set_parent_color(tmp, parent,
 							    RB_BLACK);
@@ -149,8 +174,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			 *     /                 \
 			 *    n                   U
 			 */
-			gparent->rb_left = tmp;  /* == parent->rb_right */
-			parent->rb_right = gparent;
+			WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
+			WRITE_ONCE(parent->rb_right, gparent);
 			if (tmp)
 				rb_set_parent_color(tmp, gparent, RB_BLACK);
 			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
@@ -171,8 +196,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			tmp = parent->rb_left;
 			if (node == tmp) {
 				/* Case 2 - right rotate at parent */
-				parent->rb_left = tmp = node->rb_right;
-				node->rb_right = parent;
+				tmp = node->rb_right;
+				WRITE_ONCE(parent->rb_left, tmp);
+				WRITE_ONCE(node->rb_right, parent);
 				if (tmp)
 					rb_set_parent_color(tmp, parent,
 							    RB_BLACK);
@@ -183,8 +209,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			}
 
 			/* Case 3 - left rotate at gparent */
-			gparent->rb_right = tmp;  /* == parent->rb_left */
-			parent->rb_left = gparent;
+			WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
+			WRITE_ONCE(parent->rb_left, gparent);
 			if (tmp)
 				rb_set_parent_color(tmp, gparent, RB_BLACK);
 			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
@@ -224,8 +250,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				 *      / \         / \
 				 *     Sl  Sr      N   Sl
 				 */
-				parent->rb_right = tmp1 = sibling->rb_left;
-				sibling->rb_left = parent;
+				tmp1 = sibling->rb_left;
+				WRITE_ONCE(parent->rb_right, tmp1);
+				WRITE_ONCE(sibling->rb_left, parent);
 				rb_set_parent_color(tmp1, parent, RB_BLACK);
 				__rb_rotate_set_parents(parent, sibling, root,
 							RB_RED);
@@ -275,9 +302,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				 *                       \
 				 *                        Sr
 				 */
-				sibling->rb_left = tmp1 = tmp2->rb_right;
-				tmp2->rb_right = sibling;
-				parent->rb_right = tmp2;
+				tmp1 = tmp2->rb_right;
+				WRITE_ONCE(sibling->rb_left, tmp1);
+				WRITE_ONCE(tmp2->rb_right, sibling);
+				WRITE_ONCE(parent->rb_right, tmp2);
 				if (tmp1)
 					rb_set_parent_color(tmp1, sibling,
 							    RB_BLACK);
@@ -297,8 +325,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 			 *        / \         / \
 			 *      (sl) sr      N  (sl)
 			 */
-			parent->rb_right = tmp2 = sibling->rb_left;
-			sibling->rb_left = parent;
+			tmp2 = sibling->rb_left;
+			WRITE_ONCE(parent->rb_right, tmp2);
+			WRITE_ONCE(sibling->rb_left, parent);
 			rb_set_parent_color(tmp1, sibling, RB_BLACK);
 			if (tmp2)
 				rb_set_parent(tmp2, parent);
@@ -310,8 +339,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 			sibling = parent->rb_left;
 			if (rb_is_red(sibling)) {
 				/* Case 1 - right rotate at parent */
-				parent->rb_left = tmp1 = sibling->rb_right;
-				sibling->rb_right = parent;
+				tmp1 = sibling->rb_right;
+				WRITE_ONCE(parent->rb_left, tmp1);
+				WRITE_ONCE(sibling->rb_right, parent);
 				rb_set_parent_color(tmp1, parent, RB_BLACK);
 				__rb_rotate_set_parents(parent, sibling, root,
 							RB_RED);
@@ -336,9 +366,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 					break;
 				}
 				/* Case 3 - right rotate at sibling */
-				sibling->rb_right = tmp1 = tmp2->rb_left;
-				tmp2->rb_left = sibling;
-				parent->rb_left = tmp2;
+				tmp1 = tmp2->rb_left;
+				WRITE_ONCE(sibling->rb_right, tmp1);
+				WRITE_ONCE(tmp2->rb_left, sibling);
+				WRITE_ONCE(parent->rb_left, tmp2);
 				if (tmp1)
 					rb_set_parent_color(tmp1, sibling,
 							    RB_BLACK);
@@ -347,8 +378,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				sibling = tmp2;
 			}
 			/* Case 4 - left rotate at parent + color flips */
-			parent->rb_left = tmp2 = sibling->rb_right;
-			sibling->rb_right = parent;
+			tmp2 = sibling->rb_right;
+			WRITE_ONCE(parent->rb_left, tmp2);
+			WRITE_ONCE(sibling->rb_right, parent);
 			rb_set_parent_color(tmp1, sibling, RB_BLACK);
 			if (tmp2)
 				rb_set_parent(tmp2, parent);
-- 
cgit v1.2.3


From 6695b92a60bc7160c92d6dc5b17cc79673017c2f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2015 11:09:36 +0930
Subject: seqlock: Better document raw_write_seqcount_latch()

Improve the documentation of the latch technique as used in the
current timekeeping code, such that it can be readily employed
elsewhere.

Borrow from the comments in timekeeping and replace those with a
reference to this more generic comment.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Michel Lespinasse <walken@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/seqlock.h   | 76 ++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/time/timekeeping.c | 27 +----------------
 2 files changed, 76 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5f68d0a391ce..1c0cf3102fdc 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -233,9 +233,83 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
 	s->sequence++;
 }
 
-/*
+/**
  * raw_write_seqcount_latch - redirect readers to even/odd copy
  * @s: pointer to seqcount_t
+ *
+ * The latch technique is a multiversion concurrency control method that allows
+ * queries during non-atomic modifications. If you can guarantee queries never
+ * interrupt the modification -- e.g. the concurrency is strictly between CPUs
+ * -- you most likely do not need this.
+ *
+ * Where the traditional RCU/lockless data structures rely on atomic
+ * modifications to ensure queries observe either the old or the new state the
+ * latch allows the same for non-atomic updates. The trade-off is doubling the
+ * cost of storage; we have to maintain two copies of the entire data
+ * structure.
+ *
+ * Very simply put: we first modify one copy and then the other. This ensures
+ * there is always one copy in a stable state, ready to give us an answer.
+ *
+ * The basic form is a data structure like:
+ *
+ * struct latch_struct {
+ *	seqcount_t		seq;
+ *	struct data_struct	data[2];
+ * };
+ *
+ * Where a modification, which is assumed to be externally serialized, does the
+ * following:
+ *
+ * void latch_modify(struct latch_struct *latch, ...)
+ * {
+ *	smp_wmb();	<- Ensure that the last data[1] update is visible
+ *	latch->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *
+ *	modify(latch->data[0], ...);
+ *
+ *	smp_wmb();	<- Ensure that the data[0] update is visible
+ *	latch->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *
+ *	modify(latch->data[1], ...);
+ * }
+ *
+ * The query will have a form like:
+ *
+ * struct entry *latch_query(struct latch_struct *latch, ...)
+ * {
+ *	struct entry *entry;
+ *	unsigned seq, idx;
+ *
+ *	do {
+ *		seq = latch->seq;
+ *		smp_rmb();
+ *
+ *		idx = seq & 0x01;
+ *		entry = data_query(latch->data[idx], ...);
+ *
+ *		smp_rmb();
+ *	} while (seq != latch->seq);
+ *
+ *	return entry;
+ * }
+ *
+ * So during the modification, queries are first redirected to data[1]. Then we
+ * modify data[0]. When that is complete, we redirect queries back to data[0]
+ * and we can modify data[1].
+ *
+ * NOTE: The non-requirement for atomic modifications does _NOT_ include
+ *       the publishing of new entries in the case where data is a dynamic
+ *       data structure.
+ *
+ *       An iteration might start in data[0] and get suspended long enough
+ *       to miss an entire modification sequence, once it resumes it might
+ *       observe the new entry.
+ *
+ * NOTE: When data is a dynamic data structure; one should use regular RCU
+ *       patterns to manage the lifetimes of the objects within.
  */
 static inline void raw_write_seqcount_latch(seqcount_t *s)
 {
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 946acb72179f..cbfedddbf0cb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -330,32 +330,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
  * We want to use this from any context including NMI and tracing /
  * instrumenting the timekeeping code itself.
  *
- * So we handle this differently than the other timekeeping accessor
- * functions which retry when the sequence count has changed. The
- * update side does:
- *
- * smp_wmb();	<- Ensure that the last base[1] update is visible
- * tkf->seq++;
- * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[0], tkr);
- * smp_wmb();	<- Ensure that the base[0] update is visible
- * tkf->seq++;
- * smp_wmb();	<- Ensure that the seqcount update is visible
- * update(tkf->base[1], tkr);
- *
- * The reader side does:
- *
- * do {
- *	seq = tkf->seq;
- *	smp_rmb();
- *	idx = seq & 0x01;
- *	now = now(tkf->base[idx]);
- *	smp_rmb();
- * } while (seq != tkf->seq)
- *
- * As long as we update base[0] readers are forced off to
- * base[1]. Once base[0] is updated readers are redirected to base[0]
- * and the base[1] update takes place.
+ * Employ the latch technique; see @raw_write_seqcount_latch.
  *
  * So if a NMI hits the update of base[0] then it will use base[1]
  * which is still consistent. In the worst case this can result is a
-- 
cgit v1.2.3


From 0a04b0166929405cd833c1cc40f99e862b965ddc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2015 11:09:36 +0930
Subject: rcu: Move lockless_dereference() out of rcupdate.h

I want to use lockless_dereference() from seqlock.h, which would mean
including rcupdate.h from it, however rcupdate.h already includes
seqlock.h.

Avoid this by moving lockless_dereference() into compiler.h. This is
somewhat tricky since it uses smp_read_barrier_depends() which isn't
available there, but its a CPP macro so we can get away with it.

The alternative would be moving it into asm/barrier.h, but that would
be updating each arch (I can do if people feel that is more
appropriate).

Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/compiler.h | 15 +++++++++++++++
 include/linux/rcupdate.h | 15 ---------------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 867722591be2..eae42c21d5fd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -457,6 +457,21 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	(volatile typeof(x) *)&(x); })
 #define ACCESS_ONCE(x) (*__ACCESS_ONCE(x))
 
+/**
+ * lockless_dereference() - safely load a pointer for later dereference
+ * @p: The pointer to load
+ *
+ * Similar to rcu_dereference(), but for situations where the pointed-to
+ * object's lifetime is managed by something other than RCU.  That
+ * "something other" might be reference counting or simple immortality.
+ */
+#define lockless_dereference(p) \
+({ \
+	typeof(p) _________p1 = ACCESS_ONCE(p); \
+	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
+	(_________p1); \
+})
+
 /* Ignore/forbid kprobes attach on very low level functions marked by this attribute: */
 #ifdef CONFIG_KPROBES
 # define __kprobes	__attribute__((__section__(".kprobes.text")))
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 573a5afd5ed8..0356ad954ea5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -649,21 +649,6 @@ static inline void rcu_preempt_sleep_check(void)
  */
 #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
 
-/**
- * lockless_dereference() - safely load a pointer for later dereference
- * @p: The pointer to load
- *
- * Similar to rcu_dereference(), but for situations where the pointed-to
- * object's lifetime is managed by something other than RCU.  That
- * "something other" might be reference counting or simple immortality.
- */
-#define lockless_dereference(p) \
-({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
-	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
-	(_________p1); \
-})
-
 /**
  * rcu_assign_pointer() - assign to RCU-protected pointer
  * @p: pointer to assign to
-- 
cgit v1.2.3


From 7fc26327b75685f37f58d64bdb061460f834f80d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2015 11:09:36 +0930
Subject: seqlock: Introduce raw_read_seqcount_latch()

Because with latches there is a strict data dependency on the seq load
we can avoid the rmb in favour of a read_barrier_depends.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/seqlock.h   | 9 +++++++--
 kernel/time/timekeeping.c | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 1c0cf3102fdc..890c7ef709d5 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -35,6 +35,7 @@
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
 #include <linux/lockdep.h>
+#include <linux/compiler.h>
 #include <asm/processor.h>
 
 /*
@@ -233,6 +234,11 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
 	s->sequence++;
 }
 
+static inline int raw_read_seqcount_latch(seqcount_t *s)
+{
+	return lockless_dereference(s->sequence);
+}
+
 /**
  * raw_write_seqcount_latch - redirect readers to even/odd copy
  * @s: pointer to seqcount_t
@@ -284,8 +290,7 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
  *	unsigned seq, idx;
  *
  *	do {
- *		seq = latch->seq;
- *		smp_rmb();
+ *		seq = lockless_dereference(latch->seq);
  *
  *		idx = seq & 0x01;
  *		entry = data_query(latch->data[idx], ...);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbfedddbf0cb..266dafe8f015 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -393,7 +393,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	u64 now;
 
 	do {
-		seq = raw_read_seqcount(&tkf->seq);
+		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
 		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
 	} while (read_seqcount_retry(&tkf->seq, seq));
-- 
cgit v1.2.3


From ade3f510f93a5613b672febe88eff8ea7f1c63b7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2015 11:09:37 +0930
Subject: rbtree: Implement generic latch_tree

Implement a latched RB-tree in order to get unconditional RCU/lockless
lookups.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/rbtree_latch.h | 212 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 include/linux/rbtree_latch.h

(limited to 'include/linux')

diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h
new file mode 100644
index 000000000000..4f3432c61d12
--- /dev/null
+++ b/include/linux/rbtree_latch.h
@@ -0,0 +1,212 @@
+/*
+ * Latched RB-trees
+ *
+ * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org>
+ *
+ * Since RB-trees have non-atomic modifications they're not immediately suited
+ * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for
+ * lockless lookups; we cannot guarantee they return a correct result.
+ *
+ * The simplest solution is a seqlock + RB-tree, this will allow lockless
+ * lookups; but has the constraint (inherent to the seqlock) that read sides
+ * cannot nest in write sides.
+ *
+ * If we need to allow unconditional lookups (say as required for NMI context
+ * usage) we need a more complex setup; this data structure provides this by
+ * employing the latch technique -- see @raw_write_seqcount_latch -- to
+ * implement a latched RB-tree which does allow for unconditional lookups by
+ * virtue of always having (at least) one stable copy of the tree.
+ *
+ * However, while we have the guarantee that there is at all times one stable
+ * copy, this does not guarantee an iteration will not observe modifications.
+ * What might have been a stable copy at the start of the iteration, need not
+ * remain so for the duration of the iteration.
+ *
+ * Therefore, this does require a lockless RB-tree iteration to be non-fatal;
+ * see the comment in lib/rbtree.c. Note however that we only require the first
+ * condition -- not seeing partial stores -- because the latch thing isolates
+ * us from loops. If we were to interrupt a modification the lookup would be
+ * pointed at the stable tree and complete while the modification was halted.
+ */
+
+#ifndef RB_TREE_LATCH_H
+#define RB_TREE_LATCH_H
+
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+
+struct latch_tree_node {
+	struct rb_node node[2];
+};
+
+struct latch_tree_root {
+	seqcount_t	seq;
+	struct rb_root	tree[2];
+};
+
+/**
+ * latch_tree_ops - operators to define the tree order
+ * @less: used for insertion; provides the (partial) order between two elements.
+ * @comp: used for lookups; provides the order between the search key and an element.
+ *
+ * The operators are related like:
+ *
+ *	comp(a->key,b) < 0  := less(a,b)
+ *	comp(a->key,b) > 0  := less(b,a)
+ *	comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
+ *
+ * If these operators define a partial order on the elements we make no
+ * guarantee on which of the elements matching the key is found. See
+ * latch_tree_find().
+ */
+struct latch_tree_ops {
+	bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b);
+	int  (*comp)(void *key,                 struct latch_tree_node *b);
+};
+
+static __always_inline struct latch_tree_node *
+__lt_from_rb(struct rb_node *node, int idx)
+{
+	return container_of(node, struct latch_tree_node, node[idx]);
+}
+
+static __always_inline void
+__lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx,
+	    bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b))
+{
+	struct rb_root *root = &ltr->tree[idx];
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *node = &ltn->node[idx];
+	struct rb_node *parent = NULL;
+	struct latch_tree_node *ltp;
+
+	while (*link) {
+		parent = *link;
+		ltp = __lt_from_rb(parent, idx);
+
+		if (less(ltn, ltp))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node_rcu(node, parent, link);
+	rb_insert_color(node, root);
+}
+
+static __always_inline void
+__lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx)
+{
+	rb_erase(&ltn->node[idx], &ltr->tree[idx]);
+}
+
+static __always_inline struct latch_tree_node *
+__lt_find(void *key, struct latch_tree_root *ltr, int idx,
+	  int (*comp)(void *key, struct latch_tree_node *node))
+{
+	struct rb_node *node = rcu_dereference_raw(ltr->tree[idx].rb_node);
+	struct latch_tree_node *ltn;
+	int c;
+
+	while (node) {
+		ltn = __lt_from_rb(node, idx);
+		c = comp(key, ltn);
+
+		if (c < 0)
+			node = rcu_dereference_raw(node->rb_left);
+		else if (c > 0)
+			node = rcu_dereference_raw(node->rb_right);
+		else
+			return ltn;
+	}
+
+	return NULL;
+}
+
+/**
+ * latch_tree_insert() - insert @node into the trees @root
+ * @node: nodes to insert
+ * @root: trees to insert @node into
+ * @ops: operators defining the node order
+ *
+ * It inserts @node into @root in an ordered fashion such that we can always
+ * observe one complete tree. See the comment for raw_write_seqcount_latch().
+ *
+ * The inserts use rcu_assign_pointer() to publish the element such that the
+ * tree structure is stored before we can observe the new @node.
+ *
+ * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
+ * serialized.
+ */
+static __always_inline void
+latch_tree_insert(struct latch_tree_node *node,
+		  struct latch_tree_root *root,
+		  const struct latch_tree_ops *ops)
+{
+	raw_write_seqcount_latch(&root->seq);
+	__lt_insert(node, root, 0, ops->less);
+	raw_write_seqcount_latch(&root->seq);
+	__lt_insert(node, root, 1, ops->less);
+}
+
+/**
+ * latch_tree_erase() - removes @node from the trees @root
+ * @node: nodes to remote
+ * @root: trees to remove @node from
+ * @ops: operators defining the node order
+ *
+ * Removes @node from the trees @root in an ordered fashion such that we can
+ * always observe one complete tree. See the comment for
+ * raw_write_seqcount_latch().
+ *
+ * It is assumed that @node will observe one RCU quiescent state before being
+ * reused of freed.
+ *
+ * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be
+ * serialized.
+ */
+static __always_inline void
+latch_tree_erase(struct latch_tree_node *node,
+		 struct latch_tree_root *root,
+		 const struct latch_tree_ops *ops)
+{
+	raw_write_seqcount_latch(&root->seq);
+	__lt_erase(node, root, 0);
+	raw_write_seqcount_latch(&root->seq);
+	__lt_erase(node, root, 1);
+}
+
+/**
+ * latch_tree_find() - find the node matching @key in the trees @root
+ * @key: search key
+ * @root: trees to search for @key
+ * @ops: operators defining the node order
+ *
+ * Does a lockless lookup in the trees @root for the node matching @key.
+ *
+ * It is assumed that this is called while holding the appropriate RCU read
+ * side lock.
+ *
+ * If the operators define a partial order on the elements (there are multiple
+ * elements which have the same key value) it is undefined which of these
+ * elements will be found. Nor is it possible to iterate the tree to find
+ * further elements with the same key value.
+ *
+ * Returns: a pointer to the node matching @key or NULL.
+ */
+static __always_inline struct latch_tree_node *
+latch_tree_find(void *key, struct latch_tree_root *root,
+		const struct latch_tree_ops *ops)
+{
+	struct latch_tree_node *node;
+	unsigned int seq;
+
+	do {
+		seq = raw_read_seqcount_latch(&root->seq);
+		node = __lt_find(key, root, seq & 1, ops->comp);
+	} while (read_seqcount_retry(&root->seq, seq));
+
+	return node;
+}
+
+#endif /* RB_TREE_LATCH_H */
-- 
cgit v1.2.3


From 93c2e105f6bcee231c951ba0e56e84505c4b0483 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2015 11:09:37 +0930
Subject: module: Optimize __module_address() using a latched RB-tree

Currently __module_address() is using a linear search through all
modules in order to find the module corresponding to the provided
address. With a lot of modules this can take a lot of time.

One of the users of this is kernel_text_address() which is employed
in many stack unwinders; which in turn are used by perf-callchain and
ftrace (possibly from NMI context).

So by optimizing __module_address() we optimize many stack unwinders
which are used by both perf and tracing in performance sensitive code.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h |  29 +++++++++++--
 kernel/module.c        | 115 ++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 136 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index fb56dd85a862..ddf35a3368fb 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,6 +17,7 @@
 #include <linux/moduleparam.h>
 #include <linux/jump_label.h>
 #include <linux/export.h>
+#include <linux/rbtree_latch.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -210,6 +211,13 @@ enum module_state {
 	MODULE_STATE_UNFORMED,	/* Still setting it up. */
 };
 
+struct module;
+
+struct mod_tree_node {
+	struct module *mod;
+	struct latch_tree_node node;
+};
+
 struct module {
 	enum module_state state;
 
@@ -269,8 +277,15 @@ struct module {
 	/* Startup function. */
 	int (*init)(void);
 
-	/* If this is non-NULL, vfree after init() returns */
-	void *module_init;
+	/*
+	 * If this is non-NULL, vfree() after init() returns.
+	 *
+	 * Cacheline align here, such that:
+	 *   module_init, module_core, init_size, core_size,
+	 *   init_text_size, core_text_size and ltn_core.node[0]
+	 * are on the same cacheline.
+	 */
+	void *module_init	____cacheline_aligned;
 
 	/* Here is the actual code + data, vfree'd on unload. */
 	void *module_core;
@@ -281,6 +296,14 @@ struct module {
 	/* The size of the executable code in each section.  */
 	unsigned int init_text_size, core_text_size;
 
+	/*
+	 * We want mtn_core::{mod,node[0]} to be in the same cacheline as the
+	 * above entries such that a regular lookup will only touch one
+	 * cacheline.
+	 */
+	struct mod_tree_node	mtn_core;
+	struct mod_tree_node	mtn_init;
+
 	/* Size of RO sections of the module (text+rodata) */
 	unsigned int init_ro_size, core_ro_size;
 
@@ -367,7 +390,7 @@ struct module {
 	ctor_fn_t *ctors;
 	unsigned int num_ctors;
 #endif
-};
+} ____cacheline_aligned;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
 #endif
diff --git a/kernel/module.c b/kernel/module.c
index a15899e00ca9..e0db5c31cb53 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -101,6 +101,108 @@
 DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
+
+/*
+ * Use a latched RB-tree for __module_address(); this allows us to use
+ * RCU-sched lookups of the address from any context.
+ *
+ * Because modules have two address ranges: init and core, we need two
+ * latch_tree_nodes entries. Therefore we need the back-pointer from
+ * mod_tree_node.
+ *
+ * Because init ranges are short lived we mark them unlikely and have placed
+ * them outside the critical cacheline in struct module.
+ */
+
+static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
+{
+	struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+	struct module *mod = mtn->mod;
+
+	if (unlikely(mtn == &mod->mtn_init))
+		return (unsigned long)mod->module_init;
+
+	return (unsigned long)mod->module_core;
+}
+
+static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
+{
+	struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node);
+	struct module *mod = mtn->mod;
+
+	if (unlikely(mtn == &mod->mtn_init))
+		return (unsigned long)mod->init_size;
+
+	return (unsigned long)mod->core_size;
+}
+
+static __always_inline bool
+mod_tree_less(struct latch_tree_node *a, struct latch_tree_node *b)
+{
+	return __mod_tree_val(a) < __mod_tree_val(b);
+}
+
+static __always_inline int
+mod_tree_comp(void *key, struct latch_tree_node *n)
+{
+	unsigned long val = (unsigned long)key;
+	unsigned long start, end;
+
+	start = __mod_tree_val(n);
+	if (val < start)
+		return -1;
+
+	end = start + __mod_tree_size(n);
+	if (val >= end)
+		return 1;
+
+	return 0;
+}
+
+static const struct latch_tree_ops mod_tree_ops = {
+	.less = mod_tree_less,
+	.comp = mod_tree_comp,
+};
+
+static struct latch_tree_root mod_tree __cacheline_aligned;
+
+/*
+ * These modifications: insert, remove_init and remove; are serialized by the
+ * module_mutex.
+ */
+static void mod_tree_insert(struct module *mod)
+{
+	mod->mtn_core.mod = mod;
+	mod->mtn_init.mod = mod;
+
+	latch_tree_insert(&mod->mtn_core.node, &mod_tree, &mod_tree_ops);
+	if (mod->init_size)
+		latch_tree_insert(&mod->mtn_init.node, &mod_tree, &mod_tree_ops);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+	if (mod->init_size)
+		latch_tree_erase(&mod->mtn_init.node, &mod_tree, &mod_tree_ops);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+	latch_tree_erase(&mod->mtn_core.node, &mod_tree, &mod_tree_ops);
+	mod_tree_remove_init(mod);
+}
+
+static struct module *mod_tree_find(unsigned long addr)
+{
+	struct latch_tree_node *ltn;
+
+	ltn = latch_tree_find((void *)addr, &mod_tree, &mod_tree_ops);
+	if (!ltn)
+		return NULL;
+
+	return container_of(ltn, struct mod_tree_node, node)->mod;
+}
+
 #ifdef CONFIG_KGDB_KDB
 struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
 #endif /* CONFIG_KGDB_KDB */
@@ -1878,6 +1980,7 @@ static void free_module(struct module *mod)
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	/* Remove this module from bug list, this uses list_del_rcu */
 	module_bug_cleanup(mod);
 	/* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
@@ -3145,6 +3248,7 @@ static noinline int do_init_module(struct module *mod)
 	mod->symtab = mod->core_symtab;
 	mod->strtab = mod->core_strtab;
 #endif
+	mod_tree_remove_init(mod);
 	unset_module_init_ro_nx(mod);
 	module_arch_freeing_init(mod);
 	mod->module_init = NULL;
@@ -3215,6 +3319,7 @@ again:
 		goto out;
 	}
 	list_add_rcu(&mod->list, &modules);
+	mod_tree_insert(mod);
 	err = 0;
 
 out:
@@ -3861,13 +3966,13 @@ struct module *__module_address(unsigned long addr)
 
 	module_assert_mutex_or_preempt();
 
-	list_for_each_entry_rcu(mod, &modules, list) {
+	mod = mod_tree_find(addr);
+	if (mod) {
+		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod))
-			return mod;
+			mod = NULL;
 	}
-	return NULL;
+	return mod;
 }
 EXPORT_SYMBOL_GPL(__module_address);
 
-- 
cgit v1.2.3


From 6c9692e2d6a2206d8fd75ea247daa47fb75e4a02 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2015 11:09:37 +0930
Subject: module: Make the mod_tree stuff conditional on PERF_EVENTS || TRACING

Andrew worried about the overhead on small systems; only use the fancy
code when either perf or tracing is enabled.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Requested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h |  4 +++-
 init/Kconfig           |  4 ++++
 kernel/module.c        | 30 ++++++++++++++++++++++++++++--
 3 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index ddf35a3368fb..4c1b02e1361d 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -282,7 +282,7 @@ struct module {
 	 *
 	 * Cacheline align here, such that:
 	 *   module_init, module_core, init_size, core_size,
-	 *   init_text_size, core_text_size and ltn_core.node[0]
+	 *   init_text_size, core_text_size and mtn_core::{mod,node[0]}
 	 * are on the same cacheline.
 	 */
 	void *module_init	____cacheline_aligned;
@@ -296,6 +296,7 @@ struct module {
 	/* The size of the executable code in each section.  */
 	unsigned int init_text_size, core_text_size;
 
+#ifdef CONFIG_MODULES_TREE_LOOKUP
 	/*
 	 * We want mtn_core::{mod,node[0]} to be in the same cacheline as the
 	 * above entries such that a regular lookup will only touch one
@@ -303,6 +304,7 @@ struct module {
 	 */
 	struct mod_tree_node	mtn_core;
 	struct mod_tree_node	mtn_init;
+#endif
 
 	/* Size of RO sections of the module (text+rodata) */
 	unsigned int init_ro_size, core_ro_size;
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec60232..968a001790af 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1989,6 +1989,10 @@ endchoice
 
 endif # MODULES
 
+config MODULES_TREE_LOOKUP
+	def_bool y
+	depends on PERF_EVENTS || TRACING
+
 config INIT_ALL_POSSIBLE
 	bool
 	help
diff --git a/kernel/module.c b/kernel/module.c
index e0db5c31cb53..ac3044ceca3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -102,6 +102,8 @@ DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
 
+#ifdef CONFIG_MODULES_TREE_LOOKUP
+
 /*
  * Use a latched RB-tree for __module_address(); this allows us to use
  * RCU-sched lookups of the address from any context.
@@ -112,6 +114,10 @@ static LIST_HEAD(modules);
  *
  * Because init ranges are short lived we mark them unlikely and have placed
  * them outside the critical cacheline in struct module.
+ *
+ * This is conditional on PERF_EVENTS || TRACING because those can really hit
+ * __module_address() hard by doing a lot of stack unwinding; potentially from
+ * NMI context.
  */
 
 static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
@@ -192,7 +198,7 @@ static void mod_tree_remove(struct module *mod)
 	mod_tree_remove_init(mod);
 }
 
-static struct module *mod_tree_find(unsigned long addr)
+static struct module *mod_find(unsigned long addr)
 {
 	struct latch_tree_node *ltn;
 
@@ -203,6 +209,26 @@ static struct module *mod_tree_find(unsigned long addr)
 	return container_of(ltn, struct mod_tree_node, node)->mod;
 }
 
+#else /* MODULES_TREE_LOOKUP */
+
+static void mod_tree_insert(struct module *mod) { }
+static void mod_tree_remove_init(struct module *mod) { }
+static void mod_tree_remove(struct module *mod) { }
+
+static struct module *mod_find(unsigned long addr)
+{
+	struct module *mod;
+
+	list_for_each_entry_rcu(mod, &modules, list) {
+		if (within_module(addr, mod))
+			return mod;
+	}
+
+	return NULL;
+}
+
+#endif /* MODULES_TREE_LOOKUP */
+
 #ifdef CONFIG_KGDB_KDB
 struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
 #endif /* CONFIG_KGDB_KDB */
@@ -3966,7 +3992,7 @@ struct module *__module_address(unsigned long addr)
 
 	module_assert_mutex_or_preempt();
 
-	mod = mod_tree_find(addr);
+	mod = mod_find(addr);
 	if (mod) {
 		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
-- 
cgit v1.2.3


From 28b8d0c8f560300836dff352348e513cdf328e50 Mon Sep 17 00:00:00 2001
From: Gobinda Charan Maji <gobinda.cemk07@gmail.com>
Date: Wed, 27 May 2015 11:09:38 +0930
Subject: sysfs: tightened sysfs permission checks

There were some inconsistency in restriction to VERIFY_OCTAL_PERMISSIONS().
Previously the test was "User perms >= group perms >= other perms". The
permission field of User, Group or Other consists of three bits. LSB is
EXECUTE permission, MSB is READ permission and the middle bit is WRITE
permission. But logically WRITE is "more privileged" than READ.

Say for example, permission value is "0430". Here User has only READ
permission whereas Group has both WRITE and EXECUTE permission.

So, the checks could be tightened and the tests are separated to
USER_READABLE >= GROUP_READABLE >= OTHER_READABLE,
USER_WRITABLE >= GROUP_WRITABLE and OTHER_WRITABLE is not permitted.

Signed-off-by: Gobinda Charan Maji <gobinda.cemk07@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/kernel.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3a5b48e52a9e..cd54b357c934 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -818,13 +818,15 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #endif
 
 /* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */
-#define VERIFY_OCTAL_PERMISSIONS(perms)					\
-	(BUILD_BUG_ON_ZERO((perms) < 0) +				\
-	 BUILD_BUG_ON_ZERO((perms) > 0777) +				\
-	 /* User perms >= group perms >= other perms */			\
-	 BUILD_BUG_ON_ZERO(((perms) >> 6) < (((perms) >> 3) & 7)) +	\
-	 BUILD_BUG_ON_ZERO((((perms) >> 3) & 7) < ((perms) & 7)) +	\
-	 /* Other writable?  Generally considered a bad idea. */	\
-	 BUILD_BUG_ON_ZERO((perms) & 2) +				\
+#define VERIFY_OCTAL_PERMISSIONS(perms)						\
+	(BUILD_BUG_ON_ZERO((perms) < 0) +					\
+	 BUILD_BUG_ON_ZERO((perms) > 0777) +					\
+	 /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */		\
+	 BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) +	\
+	 BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) +		\
+	 /* USER_WRITABLE >= GROUP_WRITABLE */					\
+	 BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) +	\
+	 /* OTHER_WRITABLE?  Generally considered a bad idea. */		\
+	 BUILD_BUG_ON_ZERO((perms) & 2) +					\
 	 (perms))
 #endif
-- 
cgit v1.2.3


From 9c27847dda9cfae7c273cde62becf364f9fa9ea3 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Wed, 27 May 2015 11:09:38 +0930
Subject: kernel/params: constify struct kernel_param_ops uses

Most code already uses consts for the struct kernel_param_ops,
sweep the kernel for the last offending stragglers. Other than
include/linux/moduleparam.h and kernel/params.c all other changes
were generated with the following Coccinelle SmPL patch. Merge
conflicts between trees can be handled with Coccinelle.

In the future git could get Coccinelle merge support to deal with
patch --> fail --> grammar --> Coccinelle --> new patch conflicts
automatically for us on patches where the grammar is available and
the patch is of high confidence. Consider this a feature request.

Test compiled on x86_64 against:

	* allnoconfig
	* allmodconfig
	* allyesconfig

@ const_found @
identifier ops;
@@

const struct kernel_param_ops ops = {
};

@ const_not_found depends on !const_found @
identifier ops;
@@

-struct kernel_param_ops ops = {
+const struct kernel_param_ops ops = {
};

Generated-by: Coccinelle SmPL
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Junio C Hamano <gitster@pobox.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: cocci@systeme.lip6.fr
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/s390/kernel/perf_cpum_sf.c         |  2 +-
 arch/x86/kvm/mmu_audit.c                |  2 +-
 arch/x86/platform/uv/uv_nmi.c           |  2 +-
 drivers/block/null_blk.c                |  4 ++--
 drivers/char/ipmi/ipmi_watchdog.c       |  6 +++---
 drivers/dma/dmatest.c                   |  4 ++--
 drivers/ide/ide.c                       |  2 +-
 drivers/infiniband/ulp/srp/ib_srp.c     |  4 ++--
 drivers/input/misc/ati_remote2.c        |  4 ++--
 drivers/input/mouse/psmouse-base.c      |  2 +-
 drivers/misc/lis3lv02d/lis3lv02d.c      |  2 +-
 drivers/mtd/ubi/block.c                 |  2 +-
 drivers/net/wireless/ath/wil6210/main.c |  4 ++--
 drivers/power/test_power.c              | 16 ++++++++--------
 drivers/thermal/intel_powerclamp.c      |  4 ++--
 drivers/tty/hvc/hvc_iucv.c              |  2 +-
 drivers/tty/sysrq.c                     |  2 +-
 drivers/video/fbdev/uvesafb.c           |  2 +-
 drivers/virtio/virtio_mmio.c            |  2 +-
 fs/nfs/super.c                          |  2 +-
 include/linux/moduleparam.h             | 30 +++++++++++++++---------------
 kernel/params.c                         | 14 +++++++-------
 net/sunrpc/auth.c                       |  2 +-
 net/sunrpc/xprtsock.c                   |  6 +++---
 security/apparmor/lsm.c                 |  6 +++---
 security/integrity/ima/ima_crypto.c     |  2 +-
 sound/pci/hda/hda_intel.c               |  2 +-
 27 files changed, 66 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index e6a1578fc000..afe05bfb7e00 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1572,7 +1572,7 @@ static int param_set_sfb_size(const char *val, const struct kernel_param *kp)
 }
 
 #define param_check_sfb_size(name, p) __param_check(name, p, void)
-static struct kernel_param_ops param_ops_sfb_size = {
+static const struct kernel_param_ops param_ops_sfb_size = {
 	.set = param_set_sfb_size,
 	.get = param_get_sfb_size,
 };
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 9ade5cfb5a4c..87393e3ae087 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -291,7 +291,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp)
 	return 0;
 }
 
-static struct kernel_param_ops audit_param_ops = {
+static const struct kernel_param_ops audit_param_ops = {
 	.set = mmu_audit_set,
 	.get = param_get_bool,
 };
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 7488cafab955..020c101c255f 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -104,7 +104,7 @@ static int param_set_local64(const char *val, const struct kernel_param *kp)
 	return 0;
 }
 
-static struct kernel_param_ops param_ops_local64 = {
+static const struct kernel_param_ops param_ops_local64 = {
 	.get = param_get_local64,
 	.set = param_set_local64,
 };
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 65cd61a4145e..0b4b25617bb7 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -99,7 +99,7 @@ static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
 	return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ);
 }
 
-static struct kernel_param_ops null_queue_mode_param_ops = {
+static const struct kernel_param_ops null_queue_mode_param_ops = {
 	.set	= null_set_queue_mode,
 	.get	= param_get_int,
 };
@@ -127,7 +127,7 @@ static int null_set_irqmode(const char *str, const struct kernel_param *kp)
 					NULL_IRQ_TIMER);
 }
 
-static struct kernel_param_ops null_irqmode_param_ops = {
+static const struct kernel_param_ops null_irqmode_param_ops = {
 	.set	= null_set_irqmode,
 	.get	= param_get_int,
 };
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 37b8be7cba95..0ac3bd1a5497 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -208,7 +208,7 @@ static int set_param_timeout(const char *val, const struct kernel_param *kp)
 	return rv;
 }
 
-static struct kernel_param_ops param_ops_timeout = {
+static const struct kernel_param_ops param_ops_timeout = {
 	.set = set_param_timeout,
 	.get = param_get_int,
 };
@@ -270,14 +270,14 @@ static int set_param_wdog_ifnum(const char *val, const struct kernel_param *kp)
 	return 0;
 }
 
-static struct kernel_param_ops param_ops_wdog_ifnum = {
+static const struct kernel_param_ops param_ops_wdog_ifnum = {
 	.set = set_param_wdog_ifnum,
 	.get = param_get_int,
 };
 
 #define param_check_wdog_ifnum param_check_int
 
-static struct kernel_param_ops param_ops_str = {
+static const struct kernel_param_ops param_ops_str = {
 	.set = set_param_str,
 	.get = get_param_str,
 };
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index 220ee49633e4..b8576fd6bd0e 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -120,7 +120,7 @@ static struct dmatest_info {
 
 static int dmatest_run_set(const char *val, const struct kernel_param *kp);
 static int dmatest_run_get(char *val, const struct kernel_param *kp);
-static struct kernel_param_ops run_ops = {
+static const struct kernel_param_ops run_ops = {
 	.set = dmatest_run_set,
 	.get = dmatest_run_get,
 };
@@ -195,7 +195,7 @@ static int dmatest_wait_get(char *val, const struct kernel_param *kp)
 	return param_get_bool(val, kp);
 }
 
-static struct kernel_param_ops wait_ops = {
+static const struct kernel_param_ops wait_ops = {
 	.get = dmatest_wait_get,
 	.set = param_set_bool,
 };
diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c
index e29b02ca9e91..f086ef387475 100644
--- a/drivers/ide/ide.c
+++ b/drivers/ide/ide.c
@@ -199,7 +199,7 @@ static int ide_set_dev_param_mask(const char *s, const struct kernel_param *kp)
 	return 0;
 }
 
-static struct kernel_param_ops param_ops_ide_dev_mask = {
+static const struct kernel_param_ops param_ops_ide_dev_mask = {
 	.set = ide_set_dev_param_mask
 };
 
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 918814cd0f80..e4eec9da5ea9 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -98,7 +98,7 @@ module_param(register_always, bool, 0444);
 MODULE_PARM_DESC(register_always,
 		 "Use memory registration even for contiguous memory regions");
 
-static struct kernel_param_ops srp_tmo_ops;
+static const struct kernel_param_ops srp_tmo_ops;
 
 static int srp_reconnect_delay = 10;
 module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay,
@@ -183,7 +183,7 @@ out:
 	return res;
 }
 
-static struct kernel_param_ops srp_tmo_ops = {
+static const struct kernel_param_ops srp_tmo_ops = {
 	.get = srp_tmo_get,
 	.set = srp_tmo_set,
 };
diff --git a/drivers/input/misc/ati_remote2.c b/drivers/input/misc/ati_remote2.c
index f63341f20b91..cfd58e87da26 100644
--- a/drivers/input/misc/ati_remote2.c
+++ b/drivers/input/misc/ati_remote2.c
@@ -94,7 +94,7 @@ static int ati_remote2_get_mode_mask(char *buffer,
 
 static unsigned int channel_mask = ATI_REMOTE2_MAX_CHANNEL_MASK;
 #define param_check_channel_mask(name, p) __param_check(name, p, unsigned int)
-static struct kernel_param_ops param_ops_channel_mask = {
+static const struct kernel_param_ops param_ops_channel_mask = {
 	.set = ati_remote2_set_channel_mask,
 	.get = ati_remote2_get_channel_mask,
 };
@@ -103,7 +103,7 @@ MODULE_PARM_DESC(channel_mask, "Bitmask of channels to accept <15:Channel16>...<
 
 static unsigned int mode_mask = ATI_REMOTE2_MAX_MODE_MASK;
 #define param_check_mode_mask(name, p) __param_check(name, p, unsigned int)
-static struct kernel_param_ops param_ops_mode_mask = {
+static const struct kernel_param_ops param_ops_mode_mask = {
 	.set = ati_remote2_set_mode_mask,
 	.get = ati_remote2_get_mode_mask,
 };
diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c
index 5bb1658f60c7..f8286b625ec4 100644
--- a/drivers/input/mouse/psmouse-base.c
+++ b/drivers/input/mouse/psmouse-base.c
@@ -47,7 +47,7 @@ MODULE_LICENSE("GPL");
 static unsigned int psmouse_max_proto = PSMOUSE_AUTO;
 static int psmouse_set_maxproto(const char *val, const struct kernel_param *);
 static int psmouse_get_maxproto(char *buffer, const struct kernel_param *kp);
-static struct kernel_param_ops param_ops_proto_abbrev = {
+static const struct kernel_param_ops param_ops_proto_abbrev = {
 	.set = psmouse_set_maxproto,
 	.get = psmouse_get_maxproto,
 };
diff --git a/drivers/misc/lis3lv02d/lis3lv02d.c b/drivers/misc/lis3lv02d/lis3lv02d.c
index 4739689d23ad..fb8705fc3aca 100644
--- a/drivers/misc/lis3lv02d/lis3lv02d.c
+++ b/drivers/misc/lis3lv02d/lis3lv02d.c
@@ -115,7 +115,7 @@ static int param_set_axis(const char *val, const struct kernel_param *kp)
 	return ret;
 }
 
-static struct kernel_param_ops param_ops_axis = {
+static const struct kernel_param_ops param_ops_axis = {
 	.set = param_set_axis,
 	.get = param_get_int,
 };
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index c9eb78f10a0d..4968c071f399 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -161,7 +161,7 @@ static int __init ubiblock_set_param(const char *val,
 	return 0;
 }
 
-static struct kernel_param_ops ubiblock_param_ops = {
+static const struct kernel_param_ops ubiblock_param_ops = {
 	.set    = ubiblock_set_param,
 };
 module_param_cb(block, &ubiblock_param_ops, NULL, 0);
diff --git a/drivers/net/wireless/ath/wil6210/main.c b/drivers/net/wireless/ath/wil6210/main.c
index c2a238426425..790c3df0bb03 100644
--- a/drivers/net/wireless/ath/wil6210/main.c
+++ b/drivers/net/wireless/ath/wil6210/main.c
@@ -58,7 +58,7 @@ static int mtu_max_set(const char *val, const struct kernel_param *kp)
 	return ret;
 }
 
-static struct kernel_param_ops mtu_max_ops = {
+static const struct kernel_param_ops mtu_max_ops = {
 	.set = mtu_max_set,
 	.get = param_get_uint,
 };
@@ -87,7 +87,7 @@ static int ring_order_set(const char *val, const struct kernel_param *kp)
 	return 0;
 }
 
-static struct kernel_param_ops ring_order_ops = {
+static const struct kernel_param_ops ring_order_ops = {
 	.set = ring_order_set,
 	.get = param_get_uint,
 };
diff --git a/drivers/power/test_power.c b/drivers/power/test_power.c
index f986e0cca7ac..83c42ea88f2b 100644
--- a/drivers/power/test_power.c
+++ b/drivers/power/test_power.c
@@ -448,42 +448,42 @@ static int param_set_battery_voltage(const char *key,
 
 #define param_get_battery_voltage param_get_int
 
-static struct kernel_param_ops param_ops_ac_online = {
+static const struct kernel_param_ops param_ops_ac_online = {
 	.set = param_set_ac_online,
 	.get = param_get_ac_online,
 };
 
-static struct kernel_param_ops param_ops_usb_online = {
+static const struct kernel_param_ops param_ops_usb_online = {
 	.set = param_set_usb_online,
 	.get = param_get_usb_online,
 };
 
-static struct kernel_param_ops param_ops_battery_status = {
+static const struct kernel_param_ops param_ops_battery_status = {
 	.set = param_set_battery_status,
 	.get = param_get_battery_status,
 };
 
-static struct kernel_param_ops param_ops_battery_present = {
+static const struct kernel_param_ops param_ops_battery_present = {
 	.set = param_set_battery_present,
 	.get = param_get_battery_present,
 };
 
-static struct kernel_param_ops param_ops_battery_technology = {
+static const struct kernel_param_ops param_ops_battery_technology = {
 	.set = param_set_battery_technology,
 	.get = param_get_battery_technology,
 };
 
-static struct kernel_param_ops param_ops_battery_health = {
+static const struct kernel_param_ops param_ops_battery_health = {
 	.set = param_set_battery_health,
 	.get = param_get_battery_health,
 };
 
-static struct kernel_param_ops param_ops_battery_capacity = {
+static const struct kernel_param_ops param_ops_battery_capacity = {
 	.set = param_set_battery_capacity,
 	.get = param_get_battery_capacity,
 };
 
-static struct kernel_param_ops param_ops_battery_voltage = {
+static const struct kernel_param_ops param_ops_battery_voltage = {
 	.set = param_set_battery_voltage,
 	.get = param_get_battery_voltage,
 };
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 725718e97a0b..500f8f551817 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -119,7 +119,7 @@ exit:
 	return ret;
 }
 
-static struct kernel_param_ops duration_ops = {
+static const struct kernel_param_ops duration_ops = {
 	.set = duration_set,
 	.get = param_get_int,
 };
@@ -167,7 +167,7 @@ exit_win:
 	return ret;
 }
 
-static struct kernel_param_ops window_size_ops = {
+static const struct kernel_param_ops window_size_ops = {
 	.set = window_size_set,
 	.get = param_get_int,
 };
diff --git a/drivers/tty/hvc/hvc_iucv.c b/drivers/tty/hvc/hvc_iucv.c
index f78a87b07872..bb809cf36617 100644
--- a/drivers/tty/hvc/hvc_iucv.c
+++ b/drivers/tty/hvc/hvc_iucv.c
@@ -1345,7 +1345,7 @@ static int param_get_vmidfilter(char *buffer, const struct kernel_param *kp)
 
 #define param_check_vmidfilter(name, p) __param_check(name, p, void)
 
-static struct kernel_param_ops param_ops_vmidfilter = {
+static const struct kernel_param_ops param_ops_vmidfilter = {
 	.set = param_set_vmidfilter,
 	.get = param_get_vmidfilter,
 };
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 843f2cdc280b..03b02c37d247 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -1002,7 +1002,7 @@ static int sysrq_reset_seq_param_set(const char *buffer,
 	return 0;
 }
 
-static struct kernel_param_ops param_ops_sysrq_reset_seq = {
+static const struct kernel_param_ops param_ops_sysrq_reset_seq = {
 	.get	= param_get_ushort,
 	.set	= sysrq_reset_seq_param_set,
 };
diff --git a/drivers/video/fbdev/uvesafb.c b/drivers/video/fbdev/uvesafb.c
index d32d1c4d1b99..178ae93b7ebd 100644
--- a/drivers/video/fbdev/uvesafb.c
+++ b/drivers/video/fbdev/uvesafb.c
@@ -1977,7 +1977,7 @@ static int param_set_scroll(const char *val, const struct kernel_param *kp)
 
 	return 0;
 }
-static struct kernel_param_ops param_ops_scroll = {
+static const struct kernel_param_ops param_ops_scroll = {
 	.set = param_set_scroll,
 };
 #define param_check_scroll(name, p) __param_check(name, p, void)
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 7a5e60dea6c5..10189b5b627f 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -691,7 +691,7 @@ static int vm_cmdline_get(char *buffer, const struct kernel_param *kp)
 	return strlen(buffer) + 1;
 }
 
-static struct kernel_param_ops vm_cmdline_param_ops = {
+static const struct kernel_param_ops vm_cmdline_param_ops = {
 	.set = vm_cmdline_set,
 	.get = vm_cmdline_get,
 };
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f175b833b6ba..aa62004f1706 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2847,7 +2847,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp)
 	*((unsigned int *)kp->arg) = num;
 	return 0;
 }
-static struct kernel_param_ops param_ops_portnr = {
+static const struct kernel_param_ops param_ops_portnr = {
 	.set = param_set_portnr,
 	.get = param_get_uint,
 };
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 1c9effa25e26..5d0f4d97997f 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -225,7 +225,7 @@ struct kparam_array
 
 /* Obsolete - use module_param_cb() */
 #define module_param_call(name, set, get, arg, perm)			\
-	static struct kernel_param_ops __param_ops_##name =		\
+	static const struct kernel_param_ops __param_ops_##name =		\
 		{ .flags = 0, (void *)set, (void *)get };		\
 	__module_param_call(MODULE_PARAM_PREFIX,			\
 			    name, &__param_ops_##name, arg,		\
@@ -376,64 +376,64 @@ static inline void destroy_params(const struct kernel_param *params,
 #define __param_check(name, p, type) \
 	static inline type __always_unused *__check_##name(void) { return(p); }
 
-extern struct kernel_param_ops param_ops_byte;
+extern const struct kernel_param_ops param_ops_byte;
 extern int param_set_byte(const char *val, const struct kernel_param *kp);
 extern int param_get_byte(char *buffer, const struct kernel_param *kp);
 #define param_check_byte(name, p) __param_check(name, p, unsigned char)
 
-extern struct kernel_param_ops param_ops_short;
+extern const struct kernel_param_ops param_ops_short;
 extern int param_set_short(const char *val, const struct kernel_param *kp);
 extern int param_get_short(char *buffer, const struct kernel_param *kp);
 #define param_check_short(name, p) __param_check(name, p, short)
 
-extern struct kernel_param_ops param_ops_ushort;
+extern const struct kernel_param_ops param_ops_ushort;
 extern int param_set_ushort(const char *val, const struct kernel_param *kp);
 extern int param_get_ushort(char *buffer, const struct kernel_param *kp);
 #define param_check_ushort(name, p) __param_check(name, p, unsigned short)
 
-extern struct kernel_param_ops param_ops_int;
+extern const struct kernel_param_ops param_ops_int;
 extern int param_set_int(const char *val, const struct kernel_param *kp);
 extern int param_get_int(char *buffer, const struct kernel_param *kp);
 #define param_check_int(name, p) __param_check(name, p, int)
 
-extern struct kernel_param_ops param_ops_uint;
+extern const struct kernel_param_ops param_ops_uint;
 extern int param_set_uint(const char *val, const struct kernel_param *kp);
 extern int param_get_uint(char *buffer, const struct kernel_param *kp);
 #define param_check_uint(name, p) __param_check(name, p, unsigned int)
 
-extern struct kernel_param_ops param_ops_long;
+extern const struct kernel_param_ops param_ops_long;
 extern int param_set_long(const char *val, const struct kernel_param *kp);
 extern int param_get_long(char *buffer, const struct kernel_param *kp);
 #define param_check_long(name, p) __param_check(name, p, long)
 
-extern struct kernel_param_ops param_ops_ulong;
+extern const struct kernel_param_ops param_ops_ulong;
 extern int param_set_ulong(const char *val, const struct kernel_param *kp);
 extern int param_get_ulong(char *buffer, const struct kernel_param *kp);
 #define param_check_ulong(name, p) __param_check(name, p, unsigned long)
 
-extern struct kernel_param_ops param_ops_ullong;
+extern const struct kernel_param_ops param_ops_ullong;
 extern int param_set_ullong(const char *val, const struct kernel_param *kp);
 extern int param_get_ullong(char *buffer, const struct kernel_param *kp);
 #define param_check_ullong(name, p) __param_check(name, p, unsigned long long)
 
-extern struct kernel_param_ops param_ops_charp;
+extern const struct kernel_param_ops param_ops_charp;
 extern int param_set_charp(const char *val, const struct kernel_param *kp);
 extern int param_get_charp(char *buffer, const struct kernel_param *kp);
 #define param_check_charp(name, p) __param_check(name, p, char *)
 
 /* We used to allow int as well as bool.  We're taking that away! */
-extern struct kernel_param_ops param_ops_bool;
+extern const struct kernel_param_ops param_ops_bool;
 extern int param_set_bool(const char *val, const struct kernel_param *kp);
 extern int param_get_bool(char *buffer, const struct kernel_param *kp);
 #define param_check_bool(name, p) __param_check(name, p, bool)
 
-extern struct kernel_param_ops param_ops_invbool;
+extern const struct kernel_param_ops param_ops_invbool;
 extern int param_set_invbool(const char *val, const struct kernel_param *kp);
 extern int param_get_invbool(char *buffer, const struct kernel_param *kp);
 #define param_check_invbool(name, p) __param_check(name, p, bool)
 
 /* An int, which can only be set like a bool (though it shows as an int). */
-extern struct kernel_param_ops param_ops_bint;
+extern const struct kernel_param_ops param_ops_bint;
 extern int param_set_bint(const char *val, const struct kernel_param *kp);
 #define param_get_bint param_get_int
 #define param_check_bint param_check_int
@@ -477,9 +477,9 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp);
 			    perm, -1, 0);				\
 	__MODULE_PARM_TYPE(name, "array of " #type)
 
-extern struct kernel_param_ops param_array_ops;
+extern const struct kernel_param_ops param_array_ops;
 
-extern struct kernel_param_ops param_ops_string;
+extern const struct kernel_param_ops param_ops_string;
 extern int param_set_copystring(const char *val, const struct kernel_param *);
 extern int param_get_string(char *buffer, const struct kernel_param *kp);
 
diff --git a/kernel/params.c b/kernel/params.c
index a22d6a759b1a..b7635c025e9b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -251,7 +251,7 @@ char *parse_args(const char *doing,
 		return scnprintf(buffer, PAGE_SIZE, format,		\
 				*((type *)kp->arg));			\
 	}								\
-	struct kernel_param_ops param_ops_##name = {			\
+	const struct kernel_param_ops param_ops_##name = {			\
 		.set = param_set_##name,				\
 		.get = param_get_##name,				\
 	};								\
@@ -303,7 +303,7 @@ static void param_free_charp(void *arg)
 	maybe_kfree_parameter(*((char **)arg));
 }
 
-struct kernel_param_ops param_ops_charp = {
+const struct kernel_param_ops param_ops_charp = {
 	.set = param_set_charp,
 	.get = param_get_charp,
 	.free = param_free_charp,
@@ -328,7 +328,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_get_bool);
 
-struct kernel_param_ops param_ops_bool = {
+const struct kernel_param_ops param_ops_bool = {
 	.flags = KERNEL_PARAM_OPS_FL_NOARG,
 	.set = param_set_bool,
 	.get = param_get_bool,
@@ -356,7 +356,7 @@ int param_get_invbool(char *buffer, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_get_invbool);
 
-struct kernel_param_ops param_ops_invbool = {
+const struct kernel_param_ops param_ops_invbool = {
 	.set = param_set_invbool,
 	.get = param_get_invbool,
 };
@@ -379,7 +379,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_set_bint);
 
-struct kernel_param_ops param_ops_bint = {
+const struct kernel_param_ops param_ops_bint = {
 	.flags = KERNEL_PARAM_OPS_FL_NOARG,
 	.set = param_set_bint,
 	.get = param_get_int,
@@ -476,7 +476,7 @@ static void param_array_free(void *arg)
 			arr->ops->free(arr->elem + arr->elemsize * i);
 }
 
-struct kernel_param_ops param_array_ops = {
+const struct kernel_param_ops param_array_ops = {
 	.set = param_array_set,
 	.get = param_array_get,
 	.free = param_array_free,
@@ -504,7 +504,7 @@ int param_get_string(char *buffer, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL(param_get_string);
 
-struct kernel_param_ops param_ops_string = {
+const struct kernel_param_ops param_ops_string = {
 	.set = param_set_copystring,
 	.get = param_get_string,
 };
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 47f38be4155f..02f53674dc39 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -72,7 +72,7 @@ static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp)
 
 #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int);
 
-static struct kernel_param_ops param_ops_hashtbl_sz = {
+static const struct kernel_param_ops param_ops_hashtbl_sz = {
 	.set = param_set_hashtbl_sz,
 	.get = param_get_hashtbl_sz,
 };
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 66891e32c5e3..b0517287075b 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2982,7 +2982,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp)
 			RPC_MAX_RESVPORT);
 }
 
-static struct kernel_param_ops param_ops_portnr = {
+static const struct kernel_param_ops param_ops_portnr = {
 	.set = param_set_portnr,
 	.get = param_get_uint,
 };
@@ -3001,7 +3001,7 @@ static int param_set_slot_table_size(const char *val,
 			RPC_MAX_SLOT_TABLE);
 }
 
-static struct kernel_param_ops param_ops_slot_table_size = {
+static const struct kernel_param_ops param_ops_slot_table_size = {
 	.set = param_set_slot_table_size,
 	.get = param_get_uint,
 };
@@ -3017,7 +3017,7 @@ static int param_set_max_slot_table_size(const char *val,
 			RPC_MAX_SLOT_TABLE_LIMIT);
 }
 
-static struct kernel_param_ops param_ops_max_slot_table_size = {
+static const struct kernel_param_ops param_ops_max_slot_table_size = {
 	.set = param_set_max_slot_table_size,
 	.get = param_get_uint,
 };
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index e5f1561439db..45eb96d1e1d9 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -667,7 +667,7 @@ static struct security_operations apparmor_ops = {
 static int param_set_aabool(const char *val, const struct kernel_param *kp);
 static int param_get_aabool(char *buffer, const struct kernel_param *kp);
 #define param_check_aabool param_check_bool
-static struct kernel_param_ops param_ops_aabool = {
+static const struct kernel_param_ops param_ops_aabool = {
 	.flags = KERNEL_PARAM_OPS_FL_NOARG,
 	.set = param_set_aabool,
 	.get = param_get_aabool
@@ -676,7 +676,7 @@ static struct kernel_param_ops param_ops_aabool = {
 static int param_set_aauint(const char *val, const struct kernel_param *kp);
 static int param_get_aauint(char *buffer, const struct kernel_param *kp);
 #define param_check_aauint param_check_uint
-static struct kernel_param_ops param_ops_aauint = {
+static const struct kernel_param_ops param_ops_aauint = {
 	.set = param_set_aauint,
 	.get = param_get_aauint
 };
@@ -684,7 +684,7 @@ static struct kernel_param_ops param_ops_aauint = {
 static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp);
 static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp);
 #define param_check_aalockpolicy param_check_bool
-static struct kernel_param_ops param_ops_aalockpolicy = {
+static const struct kernel_param_ops param_ops_aalockpolicy = {
 	.flags = KERNEL_PARAM_OPS_FL_NOARG,
 	.set = param_set_aalockpolicy,
 	.get = param_get_aalockpolicy
diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 686355fea7fd..e24121afb2f2 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -55,7 +55,7 @@ static int param_set_bufsize(const char *val, const struct kernel_param *kp)
 	return 0;
 }
 
-static struct kernel_param_ops param_ops_bufsize = {
+static const struct kernel_param_ops param_ops_bufsize = {
 	.set = param_set_bufsize,
 	.get = param_get_uint,
 };
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 34040d26c94f..23a1d2ffa7f3 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -166,7 +166,7 @@ MODULE_PARM_DESC(beep_mode, "Select HDA Beep registration mode "
 
 #ifdef CONFIG_PM
 static int param_set_xint(const char *val, const struct kernel_param *kp);
-static struct kernel_param_ops param_ops_xint = {
+static const struct kernel_param_ops param_ops_xint = {
 	.set = param_set_xint,
 	.get = param_get_int,
 };
-- 
cgit v1.2.3


From d19f05d8a8fa221e5d5f4eaca0f3ca5874c990d3 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@suse.com>
Date: Wed, 27 May 2015 11:09:38 +0930
Subject: kernel/params.c: generalize bool_enable_only

This takes out the bool_enable_only implementation from
the module loading code and generalizes it so that others
can make use of it.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: linux-kernel@vger.kernel.org
Cc: cocci@systeme.lip6.fr
Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/moduleparam.h |  6 ++++++
 kernel/module.c             | 31 -------------------------------
 kernel/params.c             | 30 ++++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 5d0f4d97997f..7e0079936396 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -427,6 +427,12 @@ extern int param_set_bool(const char *val, const struct kernel_param *kp);
 extern int param_get_bool(char *buffer, const struct kernel_param *kp);
 #define param_check_bool(name, p) __param_check(name, p, bool)
 
+extern const struct kernel_param_ops param_ops_bool_enable_only;
+extern int param_set_bool_enable_only(const char *val,
+				      const struct kernel_param *kp);
+/* getter is the same as for the regular bool */
+#define param_check_bool_enable_only param_check_bool
+
 extern const struct kernel_param_ops param_ops_invbool;
 extern int param_set_invbool(const char *val, const struct kernel_param *kp);
 extern int param_get_invbool(char *buffer, const struct kernel_param *kp);
diff --git a/kernel/module.c b/kernel/module.c
index 9e8c9305bba9..9b0e36145474 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -298,37 +298,6 @@ static bool sig_enforce = true;
 #else
 static bool sig_enforce = false;
 
-static int param_set_bool_enable_only(const char *val,
-				      const struct kernel_param *kp)
-{
-	int err = 0;
-	bool new_value;
-	bool orig_value = *(bool *)kp->arg;
-	struct kernel_param dummy_kp = *kp;
-
-	dummy_kp.arg = &new_value;
-
-	err = param_set_bool(val, &dummy_kp);
-	if (err)
-		return err;
-
-	/* Don't let them unset it once it's set! */
-	if (!new_value && orig_value)
-		return -EROFS;
-
-	if (new_value)
-		err = param_set_bool(val, kp);
-
-	return err;
-}
-
-static const struct kernel_param_ops param_ops_bool_enable_only = {
-	.flags = KERNEL_PARAM_OPS_FL_NOARG,
-	.set = param_set_bool_enable_only,
-	.get = param_get_bool,
-};
-#define param_check_bool_enable_only param_check_bool
-
 module_param(sig_enforce, bool_enable_only, 0644);
 #endif /* !CONFIG_MODULE_SIG_FORCE */
 #endif /* CONFIG_MODULE_SIG */
diff --git a/kernel/params.c b/kernel/params.c
index b7635c025e9b..324624ed620f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -335,6 +335,36 @@ const struct kernel_param_ops param_ops_bool = {
 };
 EXPORT_SYMBOL(param_ops_bool);
 
+int param_set_bool_enable_only(const char *val, const struct kernel_param *kp)
+{
+	int err = 0;
+	bool new_value;
+	bool orig_value = *(bool *)kp->arg;
+	struct kernel_param dummy_kp = *kp;
+
+	dummy_kp.arg = &new_value;
+
+	err = param_set_bool(val, &dummy_kp);
+	if (err)
+		return err;
+
+	/* Don't let them unset it once it's set! */
+	if (!new_value && orig_value)
+		return -EROFS;
+
+	if (new_value)
+		err = param_set_bool(val, kp);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(param_set_bool_enable_only);
+
+const struct kernel_param_ops param_ops_bool_enable_only = {
+	.flags = KERNEL_PARAM_OPS_FL_NOARG,
+	.set = param_set_bool_enable_only,
+	.get = param_get_bool,
+};
+
 /* This one must be bool. */
 int param_set_invbool(const char *val, const struct kernel_param *kp)
 {
-- 
cgit v1.2.3


From f36f3f2846b5578d62910ee0b6dbef59fdd1cfa4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 18 May 2015 13:20:23 +0200
Subject: KVM: add "new" argument to kvm_arch_commit_memory_region

This lets the function access the new memory slot without going through
kvm_memslots and id_to_memslot.  It will simplify the code when more
than one address space will be supported.

Unfortunately, the "const"ness of the new argument must be casted
away in two places.  Fixing KVM to accept const struct kvm_memory_slot
pointers would require modifications in pretty much all architectures,
and is left for later.

Reviewed-by: Radim Krcmar <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm/kvm/mmu.c                 |  1 +
 arch/mips/kvm/mips.c               |  1 +
 arch/powerpc/include/asm/kvm_ppc.h |  6 ++++--
 arch/powerpc/kvm/book3s.c          |  5 +++--
 arch/powerpc/kvm/book3s_hv.c       |  3 ++-
 arch/powerpc/kvm/book3s_pr.c       |  3 ++-
 arch/powerpc/kvm/booke.c           |  3 ++-
 arch/powerpc/kvm/powerpc.c         |  3 ++-
 arch/s390/kvm/kvm-s390.c           |  1 +
 arch/x86/include/asm/kvm_host.h    |  2 +-
 arch/x86/kvm/mmu.c                 |  6 ++++--
 arch/x86/kvm/x86.c                 | 13 +++++--------
 include/linux/kvm_host.h           |  1 +
 virt/kvm/kvm_main.c                |  2 +-
 14 files changed, 30 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index e9ac084d21ea..7f473e6d3bf5 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1721,6 +1721,7 @@ out:
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
+				   const struct kvm_memory_slot *new,
 				   enum kvm_mr_change change)
 {
 	/*
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 5963e2e8a6d7..cd4c129ce743 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -207,6 +207,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
+				   const struct kvm_memory_slot *new,
 				   enum kvm_mr_change change)
 {
 	unsigned long npages = 0;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index aff563b5f001..c6ef05bd0765 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -185,7 +185,8 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old);
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
 				      struct kvm_ppc_smmu_info *info);
 extern void kvmppc_core_flush_memslot(struct kvm *kvm,
@@ -246,7 +247,8 @@ struct kvmppc_ops {
 				     const struct kvm_userspace_memory_region *mem);
 	void (*commit_memory_region)(struct kvm *kvm,
 				     const struct kvm_userspace_memory_region *mem,
-				     const struct kvm_memory_slot *old);
+				     const struct kvm_memory_slot *old,
+				     const struct kvm_memory_slot *new);
 	int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
 	int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
 			   unsigned long end);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 60aa0726dccc..05ea8fc7f829 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -764,9 +764,10 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old)
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new)
 {
-	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old);
+	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new);
 }
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ed493d123268..68d067ad4222 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2383,7 +2383,8 @@ static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
 
 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old)
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new)
 {
 	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
 	struct kvm_memslots *slots;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 0873e766df1b..64891b081ad5 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1580,7 +1580,8 @@ static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
 
 static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old)
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new)
 {
 	return;
 }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 518e3a8b351f..cc5842657161 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1791,7 +1791,8 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old)
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new)
 {
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 5985bb2a332b..e5dde32fe71f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -604,9 +604,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
+				   const struct kvm_memory_slot *new,
 				   enum kvm_mr_change change)
 {
-	kvmppc_core_commit_memory_region(kvm, mem, old);
+	kvmppc_core_commit_memory_region(kvm, mem, old, new);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 994f9c37f25f..8ad4b9a5667f 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2602,6 +2602,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
 	int rc;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1a4d6a054749..7276107b35df 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -874,7 +874,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-					struct kvm_memory_slot *memslot);
+				   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 49c34e632b91..1bf2ae9ca521 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4621,10 +4621,12 @@ restart:
 }
 
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-			struct kvm_memory_slot *memslot)
+				   const struct kvm_memory_slot *memslot)
 {
+	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
 	spin_lock(&kvm->mmu_lock);
-	slot_handle_leaf(kvm, memslot, kvm_mmu_zap_collapsible_spte, true);
+	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
+			 kvm_mmu_zap_collapsible_spte, true);
 	spin_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f0aec85f8cdd..ba7b0cc52fed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7780,13 +7780,12 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *new;
 	int nr_mmu_pages = 0;
 
-	if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
+	if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
 		int ret;
 
 		ret = vm_munmap(old->userspace_addr,
@@ -7803,10 +7802,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	if (nr_mmu_pages)
 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
 
-	/* It's OK to get 'new' slot here as it has already been installed */
-	slots = kvm_memslots(kvm);
-	new = id_to_memslot(slots, mem->slot);
-
 	/*
 	 * Dirty logging tracks sptes in 4k granularity, meaning that large
 	 * sptes have to be split.  If live migration is successful, the guest
@@ -7831,9 +7826,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * been zapped so no dirty logging staff is needed for old slot. For
 	 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
 	 * new and it's also covered when dealing with the new slot.
+	 *
+	 * FIXME: const-ify all uses of struct kvm_memory_slot.
 	 */
 	if (change != KVM_MR_DELETE)
-		kvm_mmu_slot_apply_flags(kvm, new);
+		kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8815f1dffb77..9bd3bc16be87 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -516,6 +516,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change);
 bool kvm_largepages_enabled(void);
 void kvm_disable_largepages(void);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4361204a6348..9f67c942d8ee 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -912,7 +912,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	update_memslots(slots, &new);
 	old_memslots = install_new_memslots(kvm, slots);
 
-	kvm_arch_commit_memory_region(kvm, mem, &old, change);
+	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
 
 	kvm_free_memslot(kvm, &old, &new);
 	kvfree(old_memslots);
-- 
cgit v1.2.3


From d9ef13c2b3983de8dd1373ef670799dbb6498122 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 19 May 2015 16:01:50 +0200
Subject: KVM: pass kvm_memory_slot to gfn_to_page_many_atomic

The memory slot is already available from gfn_to_memslot_dirty_bitmap.
Isn't it a shame to look it up again?  Plus, it makes gfn_to_page_many_atomic
agnostic of multiple VCPU address spaces.

Reviewed-by: Radim Krcmar <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c       | 6 ++++--
 include/linux/kvm_host.h | 4 ++--
 virt/kvm/kvm_main.c      | 6 +++---
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1bf2ae9ca521..6a7e5b6246b1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2728,15 +2728,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    u64 *start, u64 *end)
 {
 	struct page *pages[PTE_PREFETCH_NUM];
+	struct kvm_memory_slot *slot;
 	unsigned access = sp->role.access;
 	int i, ret;
 	gfn_t gfn;
 
 	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
-	if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
+	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+	if (!slot)
 		return -1;
 
-	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
 	if (ret <= 0)
 		return -1;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9bd3bc16be87..a8bcbc9c6078 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -526,8 +526,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm);
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 				   struct kvm_memory_slot *slot);
 
-int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
-			    int nr_pages);
+int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+			    struct page **pages, int nr_pages);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9f67c942d8ee..c57f44216a4e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1428,13 +1428,13 @@ pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
-int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
-								  int nr_pages)
+int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+			    struct page **pages, int nr_pages)
 {
 	unsigned long addr;
 	gfn_t entry;
 
-	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
+	addr = gfn_to_hva_many(slot, gfn, &entry);
 	if (kvm_is_error_hva(addr))
 		return -1;
 
-- 
cgit v1.2.3


From bfa1ce5f38938cc9e6c7f2d1011f88eba2b9e2b2 Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Thu, 28 May 2015 11:40:54 +0200
Subject: bus: mvebu-mbus: add mv_mbus_dram_info_nooverlap()

This commit introduces a variant of the mv_mbus_dram_info() function
called mv_mbus_dram_info_nooverlap(). Both functions are used by
Marvell drivers supporting devices doing DMA, and provide them a
description the DRAM ranges that they need to configure their DRAM
windows.

The ranges provided by the mv_mbus_dram_info() function may overlap
with the I/O windows if there is a lot (>= 4 GB) of RAM
installed. This is not a problem for most of the DMA masters, except
for the upcoming new CESA crypto driver because it does DMA to the
SRAM, which is mapped through an I/O window. For this unit, we need to
have DRAM ranges that do not overlap with the I/O windows.

A first implementation done in commit 1737cac69369 ("bus: mvebu-mbus:
make sure SDRAM CS for DMA don't overlap the MBus bridge window"),
changed the information returned by mv_mbus_dram_info() to match this
requirement. However, it broke the requirement of the other DMA
masters than the DRAM ranges should have power of two sizes.

To solve this situation, this commit introduces a new
mv_mbus_dram_info_nooverlap() function, which returns the same
information as mv_mbus_dram_info(), but guaranteed to not overlap with
the I/O windows.

In the end, it gives us two variants of the mv_mbus_dram_info*()
functions:

 - The normal one, mv_mbus_dram_info(), which has been around for many
   years. This function returns the raw DRAM ranges, which are
   guaranteed to use power of two sizes, but will overlap with I/O
   windows. This function will therefore be used by all DMA masters
   (SATA, XOR, Ethernet, etc.) except the CESA crypto driver.

 - The new 'nooverlap' variant, mv_mbus_dram_info_nooverlap(). This
   function returns DRAM ranges after they have been "tweaked" to make
   sure they don't overlap with I/O windows. By doing this tweaking,
   we remove the power of two size guarantee. This variant will be
   used by the new CESA crypto driver.

Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
---
 drivers/bus/mvebu-mbus.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mbus.h     |   5 ++
 2 files changed, 122 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c
index 6f047dcb94c2..c43c3d2baf73 100644
--- a/drivers/bus/mvebu-mbus.c
+++ b/drivers/bus/mvebu-mbus.c
@@ -57,6 +57,7 @@
 #include <linux/of_address.h>
 #include <linux/debugfs.h>
 #include <linux/log2.h>
+#include <linux/memblock.h>
 #include <linux/syscore_ops.h>
 
 /*
@@ -152,13 +153,39 @@ struct mvebu_mbus_state {
 
 static struct mvebu_mbus_state mbus_state;
 
+/*
+ * We provide two variants of the mv_mbus_dram_info() function:
+ *
+ * - The normal one, where the described DRAM ranges may overlap with
+ *   the I/O windows, but for which the DRAM ranges are guaranteed to
+ *   have a power of two size. Such ranges are suitable for the DMA
+ *   masters that only DMA between the RAM and the device, which is
+ *   actually all devices except the crypto engines.
+ *
+ * - The 'nooverlap' one, where the described DRAM ranges are
+ *   guaranteed to not overlap with the I/O windows, but for which the
+ *   DRAM ranges will not have power of two sizes. They will only be
+ *   aligned on a 64 KB boundary, and have a size multiple of 64
+ *   KB. Such ranges are suitable for the DMA masters that DMA between
+ *   the crypto SRAM (which is mapped through an I/O window) and a
+ *   device. This is the case for the crypto engines.
+ */
+
 static struct mbus_dram_target_info mvebu_mbus_dram_info;
+static struct mbus_dram_target_info mvebu_mbus_dram_info_nooverlap;
+
 const struct mbus_dram_target_info *mv_mbus_dram_info(void)
 {
 	return &mvebu_mbus_dram_info;
 }
 EXPORT_SYMBOL_GPL(mv_mbus_dram_info);
 
+const struct mbus_dram_target_info *mv_mbus_dram_info_nooverlap(void)
+{
+	return &mvebu_mbus_dram_info_nooverlap;
+}
+EXPORT_SYMBOL_GPL(mv_mbus_dram_info_nooverlap);
+
 /* Checks whether the given window has remap capability */
 static bool mvebu_mbus_window_is_remappable(struct mvebu_mbus_state *mbus,
 					    const int win)
@@ -576,6 +603,95 @@ static unsigned int armada_xp_mbus_win_remap_offset(int win)
 		return MVEBU_MBUS_NO_REMAP;
 }
 
+/*
+ * Use the memblock information to find the MBus bridge hole in the
+ * physical address space.
+ */
+static void __init
+mvebu_mbus_find_bridge_hole(uint64_t *start, uint64_t *end)
+{
+	struct memblock_region *r;
+	uint64_t s = 0;
+
+	for_each_memblock(memory, r) {
+		/*
+		 * This part of the memory is above 4 GB, so we don't
+		 * care for the MBus bridge hole.
+		 */
+		if (r->base >= 0x100000000ULL)
+			continue;
+
+		/*
+		 * The MBus bridge hole is at the end of the RAM under
+		 * the 4 GB limit.
+		 */
+		if (r->base + r->size > s)
+			s = r->base + r->size;
+	}
+
+	*start = s;
+	*end = 0x100000000ULL;
+}
+
+/*
+ * This function fills in the mvebu_mbus_dram_info_nooverlap data
+ * structure, by looking at the mvebu_mbus_dram_info data, and
+ * removing the parts of it that overlap with I/O windows.
+ */
+static void __init
+mvebu_mbus_setup_cpu_target_nooverlap(struct mvebu_mbus_state *mbus)
+{
+	uint64_t mbus_bridge_base, mbus_bridge_end;
+	int cs_nooverlap = 0;
+	int i;
+
+	mvebu_mbus_find_bridge_hole(&mbus_bridge_base, &mbus_bridge_end);
+
+	for (i = 0; i < mvebu_mbus_dram_info.num_cs; i++) {
+		struct mbus_dram_window *w;
+		u64 base, size, end;
+
+		w = &mvebu_mbus_dram_info.cs[i];
+		base = w->base;
+		size = w->size;
+		end = base + size;
+
+		/*
+		 * The CS is fully enclosed inside the MBus bridge
+		 * area, so ignore it.
+		 */
+		if (base >= mbus_bridge_base && end <= mbus_bridge_end)
+			continue;
+
+		/*
+		 * Beginning of CS overlaps with end of MBus, raise CS
+		 * base address, and shrink its size.
+		 */
+		if (base >= mbus_bridge_base && end > mbus_bridge_end) {
+			size -= mbus_bridge_end - base;
+			base = mbus_bridge_end;
+		}
+
+		/*
+		 * End of CS overlaps with beginning of MBus, shrink
+		 * CS size.
+		 */
+		if (base < mbus_bridge_base && end > mbus_bridge_base)
+			size -= end - mbus_bridge_base;
+
+		w = &mvebu_mbus_dram_info_nooverlap.cs[cs_nooverlap++];
+		w->cs_index = i;
+		w->mbus_attr = 0xf & ~(1 << i);
+		if (mbus->hw_io_coherency)
+			w->mbus_attr |= ATTR_HW_COHERENCY;
+		w->base = base;
+		w->size = size;
+	}
+
+	mvebu_mbus_dram_info_nooverlap.mbus_dram_target_id = TARGET_DDR;
+	mvebu_mbus_dram_info_nooverlap.num_cs = cs_nooverlap;
+}
+
 static void __init
 mvebu_mbus_default_setup_cpu_target(struct mvebu_mbus_state *mbus)
 {
@@ -964,6 +1080,7 @@ static int __init mvebu_mbus_common_init(struct mvebu_mbus_state *mbus,
 		mvebu_mbus_disable_window(mbus, win);
 
 	mbus->soc->setup_cpu_target(mbus);
+	mvebu_mbus_setup_cpu_target_nooverlap(mbus);
 
 	if (is_coherent)
 		writel(UNIT_SYNC_BARRIER_ALL,
diff --git a/include/linux/mbus.h b/include/linux/mbus.h
index 611b69fa8594..1f7bc630d225 100644
--- a/include/linux/mbus.h
+++ b/include/linux/mbus.h
@@ -54,11 +54,16 @@ struct mbus_dram_target_info
  */
 #ifdef CONFIG_PLAT_ORION
 extern const struct mbus_dram_target_info *mv_mbus_dram_info(void);
+extern const struct mbus_dram_target_info *mv_mbus_dram_info_nooverlap(void);
 #else
 static inline const struct mbus_dram_target_info *mv_mbus_dram_info(void)
 {
 	return NULL;
 }
+static inline const struct mbus_dram_target_info *mv_mbus_dram_info_nooverlap(void)
+{
+	return NULL;
+}
 #endif
 
 int mvebu_mbus_save_cpu_target(u32 *store_addr);
-- 
cgit v1.2.3


From 85e6f09785bd06f0510bdb00c40772c7f8da3c43 Mon Sep 17 00:00:00 2001
From: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Date: Tue, 19 May 2015 16:16:14 +0100
Subject: ARM: 8367/1: sa1100: prepare for moving irq driver to drivers/irqchip

Prepare for moving sa1100 irq driver to irqchip infrastructure - split
sa1100_init_irq into helper code and irq parts.

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mach-sa1100/generic.c     | 13 +++++++++++++
 arch/arm/mach-sa1100/irq.c         | 21 ++++++---------------
 include/linux/irqchip/irq-sa11x0.h | 17 +++++++++++++++++
 3 files changed, 36 insertions(+), 15 deletions(-)
 create mode 100644 include/linux/irqchip/irq-sa11x0.h

(limited to 'include/linux')

diff --git a/arch/arm/mach-sa1100/generic.c b/arch/arm/mach-sa1100/generic.c
index c651f6eb863c..345e63f4eb71 100644
--- a/arch/arm/mach-sa1100/generic.c
+++ b/arch/arm/mach-sa1100/generic.c
@@ -20,6 +20,7 @@
 #include <linux/ioport.h>
 #include <linux/platform_device.h>
 #include <linux/reboot.h>
+#include <linux/irqchip/irq-sa11x0.h>
 
 #include <video/sa1100fb.h>
 
@@ -377,6 +378,18 @@ void __init sa1100_timer_init(void)
 	pxa_timer_nodt_init(IRQ_OST0, io_p2v(0x90000000), 3686400);
 }
 
+static struct resource irq_resource =
+	DEFINE_RES_MEM_NAMED(0x90050000, SZ_64K, "irqs");
+
+void __init sa1100_init_irq(void)
+{
+	request_resource(&iomem_resource, &irq_resource);
+
+	sa11x0_init_irq_nodt(IRQ_GPIO0_SC, irq_resource.start);
+
+	sa1100_init_gpio();
+}
+
 /*
  * Disable the memory bus request/grant signals on the SA1110 to
  * ensure that we don't receive spurious memory requests.  We set
diff --git a/arch/arm/mach-sa1100/irq.c b/arch/arm/mach-sa1100/irq.c
index 08f929efddbc..fdec5edbbfd5 100644
--- a/arch/arm/mach-sa1100/irq.c
+++ b/arch/arm/mach-sa1100/irq.c
@@ -1,9 +1,10 @@
 /*
  * linux/arch/arm/mach-sa1100/irq.c
  *
+ * Copyright (C) 2015 Dmitry Eremin-Solenikov
  * Copyright (C) 1999-2001 Nicolas Pitre
  *
- * Generic IRQ handling for the SA11x0, GPIO 11-27 IRQ demultiplexing.
+ * Generic IRQ handling for the SA11x0.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -15,16 +16,13 @@
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
-#include <linux/ioport.h>
 #include <linux/syscore_ops.h>
+#include <linux/irqchip/irq-sa11x0.h>
 
 #include <soc/sa1100/pwer.h>
 
-#include <mach/irqs.h>
 #include <asm/exception.h>
 
-#include "generic.h"
-
 #define ICIP	0x00  /* IC IRQ Pending reg. */
 #define ICMR	0x04  /* IC Mask Reg.        */
 #define ICLR	0x08  /* IC Level Reg.       */
@@ -86,9 +84,6 @@ static struct irq_domain_ops sa1100_normal_irqdomain_ops = {
 
 static struct irq_domain *sa1100_normal_irqdomain;
 
-static struct resource irq_resource =
-	DEFINE_RES_MEM_NAMED(0x90050000, SZ_64K, "irqs");
-
 static struct sa1100irq_state {
 	unsigned int	saved;
 	unsigned int	icmr;
@@ -156,11 +151,9 @@ sa1100_handle_irq(struct pt_regs *regs)
 	} while (1);
 }
 
-void __init sa1100_init_irq(void)
+void __init sa11x0_init_irq_nodt(int irq_start, resource_size_t io_start)
 {
-	request_resource(&iomem_resource, &irq_resource);
-
-	iobase = ioremap(irq_resource.start, SZ_64K);
+	iobase = ioremap(io_start, SZ_64K);
 	if (WARN_ON(!iobase))
 		return;
 
@@ -177,10 +170,8 @@ void __init sa1100_init_irq(void)
 	writel_relaxed(1, iobase + ICCR);
 
 	sa1100_normal_irqdomain = irq_domain_add_simple(NULL,
-			32, IRQ_GPIO0_SC,
+			32, irq_start,
 			&sa1100_normal_irqdomain_ops, NULL);
 
 	set_handle_irq(sa1100_handle_irq);
-
-	sa1100_init_gpio();
 }
diff --git a/include/linux/irqchip/irq-sa11x0.h b/include/linux/irqchip/irq-sa11x0.h
new file mode 100644
index 000000000000..15db6829c1e4
--- /dev/null
+++ b/include/linux/irqchip/irq-sa11x0.h
@@ -0,0 +1,17 @@
+/*
+ * Generic IRQ handling for the SA11x0.
+ *
+ * Copyright (C) 2015 Dmitry Eremin-Solenikov
+ * Copyright (C) 1999-2001 Nicolas Pitre
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __INCLUDE_LINUX_IRQCHIP_IRQ_SA11x0_H
+#define __INCLUDE_LINUX_IRQCHIP_IRQ_SA11x0_H
+
+void __init sa11x0_init_irq_nodt(int irq_start, resource_size_t io_start);
+
+#endif
-- 
cgit v1.2.3


From 391b459d7623b26153c7d8c200e8d213440a7d48 Mon Sep 17 00:00:00 2001
From: Pantelis Antoniou <pantelis.antoniou@konsulko.com>
Date: Fri, 24 Apr 2015 12:41:56 +0300
Subject: of: Move OF flags to be visible even when !CONFIG_OF

We need those to be visible even when compiling with CONFIG_OF
disabled, since even the empty of_node_*_flag() method use the
flag.

Signed-off-by: Pantelis Antoniou <pantelis.antoniou@konsulko.com>
Acked-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index ddeaae6d2083..9b32cbdf0230 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -120,6 +120,12 @@ extern struct device_node *of_aliases;
 extern struct device_node *of_stdout;
 extern raw_spinlock_t devtree_lock;
 
+/* flag descriptions (need to be visible even when !CONFIG_OF) */
+#define OF_DYNAMIC	1 /* node and properties were allocated via kmalloc */
+#define OF_DETACHED	2 /* node has been detached from the device tree */
+#define OF_POPULATED	3 /* device already created for the node */
+#define OF_POPULATED_BUS	4 /* of_platform_populate recursed to children of this node */
+
 #ifdef CONFIG_OF
 static inline bool is_of_node(struct fwnode_handle *fwnode)
 {
@@ -217,12 +223,6 @@ static inline unsigned long of_read_ulong(const __be32 *cell, int size)
 #define of_node_cmp(s1, s2)		strcasecmp((s1), (s2))
 #endif
 
-/* flag descriptions */
-#define OF_DYNAMIC	1 /* node and properties were allocated via kmalloc */
-#define OF_DETACHED	2 /* node has been detached from the device tree */
-#define OF_POPULATED	3 /* device already created for the node */
-#define OF_POPULATED_BUS	4 /* of_platform_populate recursed to children of this node */
-
 #define OF_IS_DYNAMIC(x) test_bit(OF_DYNAMIC, &x->_flags)
 #define OF_MARK_DYNAMIC(x) set_bit(OF_DYNAMIC, &x->_flags)
 
-- 
cgit v1.2.3


From 31712c98f8180c7bd3f33eefa461f0e1142e18f5 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 4 May 2015 19:42:19 +0200
Subject: of: Grammar s/property exist/property exists/

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index 9b32cbdf0230..a3eb9d1e5800 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -815,7 +815,7 @@ static inline int of_property_read_string_index(struct device_node *np,
  * @propname:	name of the property to be searched.
  *
  * Search for a property in a device node.
- * Returns true if the property exist false otherwise.
+ * Returns true if the property exists false otherwise.
  */
 static inline bool of_property_read_bool(const struct device_node *np,
 					 const char *propname)
-- 
cgit v1.2.3


From 307c858bc245da5229b30186546590e1d472fc8f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 28 May 2015 16:08:02 +0200
Subject: usb: phy: add static inline wrapper for devm_usb_get_phy_by_node

The newly introduced devm_usb_get_phy_by_node function only has
an extern declaration, but no alternative for the case that
CONFIG_USB_PHY is disabled, which leads to a build error when
it is used anyway:

drivers/power/twl4030_charger.c: In function 'twl4030_bci_probe':
drivers/power/twl4030_charger.c:648:23: error: implicit declaration of function 'devm_usb_get_phy_by_node' [-Werror=implicit-function-declaration]
    bci->transceiver = devm_usb_get_phy_by_node(

This adds the wrapper in the same way that we have one for
all other usb-phy API functions.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: e842b84c8e7 ("usb: phy: Add interface to get phy give of device_node.")
Signed-off-by: Felipe Balbi <balbi@ti.com>
---
 include/linux/usb/phy.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h
index 8ed1e29ef329..e39f251cf861 100644
--- a/include/linux/usb/phy.h
+++ b/include/linux/usb/phy.h
@@ -240,6 +240,12 @@ static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev,
 	return ERR_PTR(-ENXIO);
 }
 
+static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev,
+	struct device_node *node, struct notifier_block *nb)
+{
+	return ERR_PTR(-ENXIO);
+}
+
 static inline void usb_put_phy(struct usb_phy *x)
 {
 }
-- 
cgit v1.2.3


From 9626b6993b2e6faf047d2d96958e8474edc9c7a5 Mon Sep 17 00:00:00 2001
From: jilai wang <jilaiw@codeaurora.org>
Date: Fri, 10 Apr 2015 16:15:59 -0400
Subject: firmware: qcom: scm: Add HDCP Support

HDCP driver needs to check if secure environment supports HDCP.  If it's
supported, then it requires to program some registers through SCM.
Add qcom_scm_hdcp_available and qcom_scm_hdcp_req to support these
requirements.

Signed-off-by: Jilai Wang <jilaiw@codeaurora.org>
Signed-off-by: Kumar Gala <galak@codeaurora.org>
---
 drivers/firmware/qcom_scm-32.c | 25 ++++++++++++++++++++++++-
 drivers/firmware/qcom_scm.c    | 32 +++++++++++++++++++++++++++++++-
 drivers/firmware/qcom_scm.h    | 11 ++++++++++-
 include/linux/qcom_scm.h       | 13 ++++++++++++-
 4 files changed, 77 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/qcom_scm-32.c b/drivers/firmware/qcom_scm-32.c
index b08b822ebafa..1bd6f9c34331 100644
--- a/drivers/firmware/qcom_scm-32.c
+++ b/drivers/firmware/qcom_scm-32.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010,2015, The Linux Foundation. All rights reserved.
  * Copyright (C) 2015 Linaro Ltd.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -478,3 +478,26 @@ void __qcom_scm_cpu_power_down(u32 flags)
 	qcom_scm_call_atomic1(QCOM_SCM_SVC_BOOT, QCOM_SCM_CMD_TERMINATE_PC,
 			flags & QCOM_SCM_FLUSH_FLAG_MASK);
 }
+
+int __qcom_scm_is_call_available(u32 svc_id, u32 cmd_id)
+{
+	int ret;
+	u32 svc_cmd = (svc_id << 10) | cmd_id;
+	u32 ret_val = 0;
+
+	ret = qcom_scm_call(QCOM_SCM_SVC_INFO, QCOM_IS_CALL_AVAIL_CMD, &svc_cmd,
+			sizeof(svc_cmd), &ret_val, sizeof(ret_val));
+	if (ret)
+		return ret;
+
+	return ret_val;
+}
+
+int __qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp)
+{
+	if (req_cnt > QCOM_SCM_HDCP_MAX_REQ_CNT)
+		return -ERANGE;
+
+	return qcom_scm_call(QCOM_SCM_SVC_HDCP, QCOM_SCM_CMD_HDCP,
+		req, req_cnt * sizeof(*req), resp, sizeof(*resp));
+}
diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c
index 99892415299b..45c008d68891 100644
--- a/drivers/firmware/qcom_scm.c
+++ b/drivers/firmware/qcom_scm.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+/* Copyright (c) 2010,2015, The Linux Foundation. All rights reserved.
  * Copyright (C) 2015 Linaro Ltd.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -64,3 +64,33 @@ void qcom_scm_cpu_power_down(u32 flags)
 	__qcom_scm_cpu_power_down(flags);
 }
 EXPORT_SYMBOL(qcom_scm_cpu_power_down);
+
+/**
+ * qcom_scm_hdcp_available() - Check if secure environment supports HDCP.
+ *
+ * Return true if HDCP is supported, false if not.
+ */
+bool qcom_scm_hdcp_available(void)
+{
+	int ret;
+
+	ret = __qcom_scm_is_call_available(QCOM_SCM_SVC_HDCP,
+		QCOM_SCM_CMD_HDCP);
+
+	return (ret > 0) ? true : false;
+}
+EXPORT_SYMBOL(qcom_scm_hdcp_available);
+
+/**
+ * qcom_scm_hdcp_req() - Send HDCP request.
+ * @req: HDCP request array
+ * @req_cnt: HDCP request array count
+ * @resp: response buffer passed to SCM
+ *
+ * Write HDCP register(s) through SCM.
+ */
+int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp)
+{
+	return __qcom_scm_hdcp_req(req, req_cnt, resp);
+}
+EXPORT_SYMBOL(qcom_scm_hdcp_req);
diff --git a/drivers/firmware/qcom_scm.h b/drivers/firmware/qcom_scm.h
index 1172be917307..2cce75c08b99 100644
--- a/drivers/firmware/qcom_scm.h
+++ b/drivers/firmware/qcom_scm.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2014, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2015, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -27,6 +27,15 @@ extern int __qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus);
 #define QCOM_SCM_CMD_CORE_HOTPLUGGED	0x10
 extern void __qcom_scm_cpu_power_down(u32 flags);
 
+#define QCOM_SCM_SVC_INFO		0x6
+#define QCOM_IS_CALL_AVAIL_CMD		0x1
+extern int __qcom_scm_is_call_available(u32 svc_id, u32 cmd_id);
+
+#define QCOM_SCM_SVC_HDCP		0x11
+#define QCOM_SCM_CMD_HDCP		0x01
+extern int __qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
+		u32 *resp);
+
 /* common error codes */
 #define QCOM_SCM_ENOMEM		-5
 #define QCOM_SCM_EOPNOTSUPP	-4
diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h
index d7a974d5f57c..6e7d5ec65838 100644
--- a/include/linux/qcom_scm.h
+++ b/include/linux/qcom_scm.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2014, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2010-2015, The Linux Foundation. All rights reserved.
  * Copyright (C) 2015 Linaro Ltd.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -16,6 +16,17 @@
 extern int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus);
 extern int qcom_scm_set_warm_boot_addr(void *entry, const cpumask_t *cpus);
 
+#define QCOM_SCM_HDCP_MAX_REQ_CNT	5
+
+struct qcom_scm_hdcp_req {
+	u32 addr;
+	u32 val;
+};
+
+extern bool qcom_scm_hdcp_available(void);
+extern int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt,
+		u32 *resp);
+
 #define QCOM_SCM_CPU_PWR_DOWN_L2_ON	0x0
 #define QCOM_SCM_CPU_PWR_DOWN_L2_OFF	0x1
 
-- 
cgit v1.2.3


From 3386e0fa905c31bf1dafa2cbe2ecceb7e5ce2e66 Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Wed, 6 May 2015 20:09:09 +0200
Subject: of: add helper function to retrive match data

It's a common operation for device drivers to retrive the data
member from of_device_id struct in their probe function.

Most driver end up doing:
    const struct of_device_id *match;
    match = of_match_device(driver_of_match, &pdev->dev);
    driver->data = match->data;

With the of_device_get_match_data helper function all this can
done in one go.

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
[robh: add missing inline to dummmy declaration]
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/device.c       | 12 ++++++++++++
 include/linux/of_device.h |  7 +++++++
 2 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/device.c b/drivers/of/device.c
index 20c1332a0018..8b91ea241b10 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -163,6 +163,18 @@ void of_device_unregister(struct platform_device *ofdev)
 }
 EXPORT_SYMBOL(of_device_unregister);
 
+const void *of_device_get_match_data(const struct device *dev)
+{
+	const struct of_device_id *match;
+
+	match = of_match_device(dev->driver->of_match_table, dev);
+	if (!match)
+		return NULL;
+
+	return match->data;
+}
+EXPORT_SYMBOL(of_device_get_match_data);
+
 ssize_t of_device_get_modalias(struct device *dev, char *str, ssize_t len)
 {
 	const char *compat;
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 22801b10cef5..4c508549833a 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -33,6 +33,8 @@ extern int of_device_add(struct platform_device *pdev);
 extern int of_device_register(struct platform_device *ofdev);
 extern void of_device_unregister(struct platform_device *ofdev);
 
+extern const void *of_device_get_match_data(const struct device *dev);
+
 extern ssize_t of_device_get_modalias(struct device *dev,
 					char *str, ssize_t len);
 
@@ -65,6 +67,11 @@ static inline int of_driver_match_device(struct device *dev,
 static inline void of_device_uevent(struct device *dev,
 			struct kobj_uevent_env *env) { }
 
+static inline const void *of_device_get_match_data(const struct device *dev)
+{
+	return NULL;
+}
+
 static inline int of_device_get_modalias(struct device *dev,
 				   char *str, ssize_t len)
 {
-- 
cgit v1.2.3


From 3b1a2c97210b1edd1a999ff8c1f72ab28f7609f7 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 13 May 2015 16:33:56 +0200
Subject: of/fdt: Make fdt blob input parameters of unflatten functions const

Operations to unflatten fdt blobs never modify the input blobs, hence
make them const. Now we no longer need to cast arbitrary const data to
"void *" when calling of_fdt_unflatten_tree().

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 drivers/of/fdt.c       | 6 +++---
 include/linux/of_fdt.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index cde35c5d0191..9628c4a77f76 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -168,7 +168,7 @@ static void *unflatten_dt_alloc(void **mem, unsigned long size,
  * @dad: Parent struct device_node
  * @fpsize: Size of the node path up at the current depth.
  */
-static void * unflatten_dt_node(void *blob,
+static void * unflatten_dt_node(const void *blob,
 				void *mem,
 				int *poffset,
 				struct device_node *dad,
@@ -378,7 +378,7 @@ static void * unflatten_dt_node(void *blob,
  * @dt_alloc: An allocator that provides a virtual address to memory
  * for the resulting tree
  */
-static void __unflatten_device_tree(void *blob,
+static void __unflatten_device_tree(const void *blob,
 			     struct device_node **mynodes,
 			     void * (*dt_alloc)(u64 size, u64 align))
 {
@@ -441,7 +441,7 @@ static void *kernel_tree_alloc(u64 size, u64 align)
  * pointers of the nodes so the normal device-tree walking functions
  * can be used.
  */
-void of_fdt_unflatten_tree(unsigned long *blob,
+void of_fdt_unflatten_tree(const unsigned long *blob,
 			struct device_node **mynodes)
 {
 	__unflatten_device_tree(blob, mynodes, &kernel_tree_alloc);
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 587ee507965d..0cf217d404ce 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -37,7 +37,7 @@ extern bool of_fdt_is_big_endian(const void *blob,
 				 unsigned long node);
 extern int of_fdt_match(const void *blob, unsigned long node,
 			const char *const *compat);
-extern void of_fdt_unflatten_tree(unsigned long *blob,
+extern void of_fdt_unflatten_tree(const unsigned long *blob,
 			       struct device_node **mynodes);
 
 /* TBD: Temporary export of fdt globals - remove when code fully merged */
-- 
cgit v1.2.3


From 3c6296f716ebef704b76070d90567ab4faa8462c Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 28 May 2015 13:21:34 -0400
Subject: ring-buffer: Remove useless unused tracing_off_permanent()

The tracing_off_permanent() call is a way to disable all ring_buffers.
Nothing uses it and nothing should use it, as tracing_off() and
friends are better, as they disable the ring buffers related to
tracing. The tracing_off_permanent() even disabled non tracing
ring buffers. This is a bit drastic, and was added to handle NMIs
doing outputs that could corrupt the ring buffer when only tracing
used them. It is now obsolete and adds a little overhead, it should
be removed.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/kernel.h     |  6 -----
 kernel/trace/ring_buffer.c | 61 ----------------------------------------------
 2 files changed, 67 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3a5b48e52a9e..d948718a83d7 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -532,12 +532,6 @@ bool mac_pton(const char *s, u8 *mac);
  *
  * Most likely, you want to use tracing_on/tracing_off.
  */
-#ifdef CONFIG_RING_BUFFER
-/* trace_off_permanent stops recording with no way to bring it back */
-void tracing_off_permanent(void);
-#else
-static inline void tracing_off_permanent(void) { }
-#endif
 
 enum ftrace_dump_mode {
 	DUMP_NONE,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e9420fdc7409..0fc5add6423b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -115,63 +115,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
  *
  */
 
-/*
- * A fast way to enable or disable all ring buffers is to
- * call tracing_on or tracing_off. Turning off the ring buffers
- * prevents all ring buffers from being recorded to.
- * Turning this switch on, makes it OK to write to the
- * ring buffer, if the ring buffer is enabled itself.
- *
- * There's three layers that must be on in order to write
- * to the ring buffer.
- *
- * 1) This global flag must be set.
- * 2) The ring buffer must be enabled for recording.
- * 3) The per cpu buffer must be enabled for recording.
- *
- * In case of an anomaly, this global flag has a bit set that
- * will permantly disable all ring buffers.
- */
-
-/*
- * Global flag to disable all recording to ring buffers
- *  This has two bits: ON, DISABLED
- *
- *  ON   DISABLED
- * ---- ----------
- *   0      0        : ring buffers are off
- *   1      0        : ring buffers are on
- *   X      1        : ring buffers are permanently disabled
- */
-
-enum {
-	RB_BUFFERS_ON_BIT	= 0,
-	RB_BUFFERS_DISABLED_BIT	= 1,
-};
-
-enum {
-	RB_BUFFERS_ON		= 1 << RB_BUFFERS_ON_BIT,
-	RB_BUFFERS_DISABLED	= 1 << RB_BUFFERS_DISABLED_BIT,
-};
-
-static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
-
 /* Used for individual buffers (after the counter) */
 #define RB_BUFFER_OFF		(1 << 20)
 
 #define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
 
-/**
- * tracing_off_permanent - permanently disable ring buffers
- *
- * This function, once called, will disable all ring buffers
- * permanently.
- */
-void tracing_off_permanent(void)
-{
-	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
-}
-
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -2728,9 +2676,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	struct ring_buffer_event *event;
 	int cpu;
 
-	if (ring_buffer_flags != RB_BUFFERS_ON)
-		return NULL;
-
 	/* If we are tracing schedule, we don't want to recurse */
 	preempt_disable_notrace();
 
@@ -2992,9 +2937,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	int ret = -EBUSY;
 	int cpu;
 
-	if (ring_buffer_flags != RB_BUFFERS_ON)
-		return -EBUSY;
-
 	preempt_disable_notrace();
 
 	if (atomic_read(&buffer->record_disabled))
@@ -4350,9 +4292,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 
 	ret = -EAGAIN;
 
-	if (ring_buffer_flags != RB_BUFFERS_ON)
-		goto out;
-
 	if (atomic_read(&buffer_a->record_disabled))
 		goto out;
 
-- 
cgit v1.2.3


From 0040b933187b11f78e83dd162a31d64a46be4e37 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 20 Apr 2015 11:52:23 -0700
Subject: f2fs: add missing version info in superblock

The mkfs.f2fs remains kernel version in superblock, but f2fs module has not
added that so far.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 591f8c3ef410..8d345c24bcf7 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -50,6 +50,8 @@
 #define MAX_ACTIVE_NODE_LOGS	8
 #define MAX_ACTIVE_DATA_LOGS	8
 
+#define VERSION_LEN	256
+
 /*
  * For superblock
  */
@@ -86,6 +88,9 @@ struct f2fs_super_block {
 	__le32 extension_count;		/* # of extensions below */
 	__u8 extension_list[F2FS_MAX_EXTENSION][8];	/* extension array */
 	__le32 cp_payload;
+	__u8 version[VERSION_LEN];	/* the kernel version */
+	__u8 init_version[VERSION_LEN];	/* the initial kernel version */
+	__u8 reserved[892];		/* valid reserved region */
 } __packed;
 
 /*
-- 
cgit v1.2.3


From 76f105a2dbcd47509bac6ba8d94cb3759a3e6e9d Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 13 Apr 2015 15:10:36 -0700
Subject: f2fs: add feature facility in superblock

This patch introduces a feature in superblock, which will indicate any new
features for f2fs.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h          | 7 +++++++
 include/linux/f2fs_fs.h | 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cd9748af1b63..e1dd986262cb 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -70,6 +70,13 @@ struct f2fs_mount_info {
 	unsigned int	opt;
 };
 
+#define F2FS_HAS_FEATURE(sb, mask)					\
+	((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
+#define F2FS_SET_FEATURE(sb, mask)					\
+	F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)
+#define F2FS_CLEAR_FEATURE(sb, mask)					\
+	F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)
+
 #define CRCPOLY_LE 0xedb88320
 
 static inline __u32 f2fs_crc32(void *buf, size_t len)
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 8d345c24bcf7..d44e97f2b98e 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -90,7 +90,8 @@ struct f2fs_super_block {
 	__le32 cp_payload;
 	__u8 version[VERSION_LEN];	/* the kernel version */
 	__u8 init_version[VERSION_LEN];	/* the initial kernel version */
-	__u8 reserved[892];		/* valid reserved region */
+	__le32 feature;			/* defined features */
+	__u8 reserved[888];		/* valid reserved region */
 } __packed;
 
 /*
-- 
cgit v1.2.3


From cde4de1205770514005663d70a9a7d81cb555085 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Mon, 20 Apr 2015 13:57:51 -0700
Subject: f2fs crypto: declare some definitions for f2fs encryption feature

This definitions will be used by inode and superblock for encyption.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h          |  55 ++++++++++++++++++
 fs/f2fs/f2fs_crypto.h   | 147 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/f2fs_fs.h |   4 +-
 3 files changed, 205 insertions(+), 1 deletion(-)
 create mode 100644 fs/f2fs/f2fs_crypto.h

(limited to 'include/linux')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 477e65fe7665..2c2948fbc35f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -70,6 +70,8 @@ struct f2fs_mount_info {
 	unsigned int	opt;
 };
 
+#define F2FS_FEATURE_ENCRYPT	0x0001
+
 #define F2FS_HAS_FEATURE(sb, mask)					\
 	((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
 #define F2FS_SET_FEATURE(sb, mask)					\
@@ -346,6 +348,7 @@ struct f2fs_map_blocks {
  */
 #define FADVISE_COLD_BIT	0x01
 #define FADVISE_LOST_PINO_BIT	0x02
+#define FADVISE_ENCRYPT_BIT	0x04
 
 #define file_is_cold(inode)	is_file(inode, FADVISE_COLD_BIT)
 #define file_wrong_pino(inode)	is_file(inode, FADVISE_LOST_PINO_BIT)
@@ -353,6 +356,16 @@ struct f2fs_map_blocks {
 #define file_lost_pino(inode)	set_file(inode, FADVISE_LOST_PINO_BIT)
 #define file_clear_cold(inode)	clear_file(inode, FADVISE_COLD_BIT)
 #define file_got_pino(inode)	clear_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_is_encrypt(inode)	is_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_set_encrypt(inode)	set_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
+
+/* Encryption algorithms */
+#define F2FS_ENCRYPTION_MODE_INVALID		0
+#define F2FS_ENCRYPTION_MODE_AES_256_XTS	1
+#define F2FS_ENCRYPTION_MODE_AES_256_GCM	2
+#define F2FS_ENCRYPTION_MODE_AES_256_CBC	3
+#define F2FS_ENCRYPTION_MODE_AES_256_CTS	4
 
 #define DEF_DIR_LEVEL		0
 
@@ -380,6 +393,11 @@ struct f2fs_inode_info {
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
 	struct mutex inmem_lock;	/* lock for inmemory pages */
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	/* Encryption params */
+	struct f2fs_crypt_info *i_crypt_info;
+#endif
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -1891,4 +1909,41 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
 						struct inode *, struct inode *);
 bool f2fs_empty_inline_dir(struct inode *);
 int f2fs_read_inline_dir(struct file *, struct dir_context *);
+
+/*
+ * crypto support
+ */
+static inline int f2fs_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	return file_is_encrypt(inode);
+#else
+	return 0;
+#endif
+}
+
+static inline void f2fs_set_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	file_set_encrypt(inode);
+#endif
+}
+
+static inline bool f2fs_bio_encrypted(struct bio *bio)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	return unlikely(bio->bi_private != NULL);
+#else
+	return false;
+#endif
+}
+
+static inline int f2fs_sb_has_crypto(struct super_block *sb)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+	return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
+#else
+	return 0;
+#endif
+}
 #endif
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
new file mode 100644
index 000000000000..3323266e7e2b
--- /dev/null
+++ b/fs/f2fs/f2fs_crypto.h
@@ -0,0 +1,147 @@
+/*
+ * linux/fs/f2fs/f2fs_crypto.h
+ *
+ * Copied from linux/fs/ext4/ext4_crypto.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption header content for f2fs
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#ifndef _F2FS_CRYPTO_H
+#define _F2FS_CRYPTO_H
+
+#include <linux/fs.h>
+
+#define F2FS_KEY_DESCRIPTOR_SIZE	8
+
+/* Policy provided via an ioctl on the topmost directory */
+struct f2fs_encryption_policy {
+	char version;
+	char contents_encryption_mode;
+	char filenames_encryption_mode;
+	char flags;
+	char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
+} __attribute__((__packed__));
+
+#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1	1
+#define F2FS_KEY_DERIVATION_NONCE_SIZE		16
+
+#define F2FS_POLICY_FLAGS_PAD_4		0x00
+#define F2FS_POLICY_FLAGS_PAD_8		0x01
+#define F2FS_POLICY_FLAGS_PAD_16	0x02
+#define F2FS_POLICY_FLAGS_PAD_32	0x03
+#define F2FS_POLICY_FLAGS_PAD_MASK	0x03
+#define F2FS_POLICY_FLAGS_VALID		0x03
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ *  1 byte: Protector format (1 = this version)
+ *  1 byte: File contents encryption mode
+ *  1 byte: File names encryption mode
+ *  1 byte: Flags
+ *  8 bytes: Master Key descriptor
+ *  16 bytes: Encryption Key derivation nonce
+ */
+struct f2fs_encryption_context {
+	char format;
+	char contents_encryption_mode;
+	char filenames_encryption_mode;
+	char flags;
+	char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
+	char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
+} __attribute__((__packed__));
+
+/* Encryption parameters */
+#define F2FS_XTS_TWEAK_SIZE 16
+#define F2FS_AES_128_ECB_KEY_SIZE 16
+#define F2FS_AES_256_GCM_KEY_SIZE 32
+#define F2FS_AES_256_CBC_KEY_SIZE 32
+#define F2FS_AES_256_CTS_KEY_SIZE 32
+#define F2FS_AES_256_XTS_KEY_SIZE 64
+#define F2FS_MAX_KEY_SIZE 64
+
+struct f2fs_encryption_key {
+	__u32 mode;
+	char raw[F2FS_MAX_KEY_SIZE];
+	__u32 size;
+} __attribute__((__packed__));
+
+struct f2fs_crypt_info {
+	unsigned char	ci_mode;
+	unsigned char	ci_size;
+	char		ci_data_mode;
+	char		ci_filename_mode;
+	char		ci_flags;
+	struct crypto_ablkcipher *ci_ctfm;
+	struct key	*ci_keyring_key;
+	char		ci_raw[F2FS_MAX_KEY_SIZE];
+	char		ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL             0x00000001
+#define F2FS_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL     0x00000002
+
+struct f2fs_crypto_ctx {
+	struct crypto_tfm *tfm;         /* Crypto API context */
+	struct page *bounce_page;       /* Ciphertext page on write path */
+	struct page *control_page;      /* Original page on write path */
+	struct bio *bio;                /* The bio for this context */
+	struct work_struct work;        /* Work queue for read complete path */
+	struct list_head free_list;     /* Free list */
+	int flags;                      /* Flags */
+	int mode;                       /* Encryption mode for tfm */
+};
+
+struct f2fs_completion_result {
+	struct completion completion;
+	int res;
+};
+
+#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \
+	struct f2fs_completion_result ecr = { \
+		COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+static inline int f2fs_encryption_key_size(int mode)
+{
+	switch (mode) {
+	case F2FS_ENCRYPTION_MODE_AES_256_XTS:
+		return F2FS_AES_256_XTS_KEY_SIZE;
+	case F2FS_ENCRYPTION_MODE_AES_256_GCM:
+		return F2FS_AES_256_GCM_KEY_SIZE;
+	case F2FS_ENCRYPTION_MODE_AES_256_CBC:
+		return F2FS_AES_256_CBC_KEY_SIZE;
+	case F2FS_ENCRYPTION_MODE_AES_256_CTS:
+		return F2FS_AES_256_CTS_KEY_SIZE;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+#define F2FS_FNAME_NUM_SCATTER_ENTRIES	4
+#define F2FS_CRYPTO_BLOCK_SIZE		16
+#define F2FS_FNAME_CRYPTO_DIGEST_SIZE	32
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct f2fs_encrypted_symlink_data {
+	__le16 len;
+	char encrypted_path[1];
+} __attribute__((__packed__));
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 encrypted_symlink_data_len(u32 l)
+{
+	return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1);
+}
+#endif	/* _F2FS_CRYPTO_H */
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index d44e97f2b98e..920408a21ffd 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -91,7 +91,9 @@ struct f2fs_super_block {
 	__u8 version[VERSION_LEN];	/* the kernel version */
 	__u8 init_version[VERSION_LEN];	/* the initial kernel version */
 	__le32 feature;			/* defined features */
-	__u8 reserved[888];		/* valid reserved region */
+	__u8 encryption_level;		/* versioning level for encryption */
+	__u8 encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
+	__u8 reserved[871];		/* valid reserved region */
 } __packed;
 
 /*
-- 
cgit v1.2.3


From f8df88081183bd4d3c461c617c3519445eb85642 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Wed, 27 May 2015 23:06:30 +0900
Subject: extcon: Remove optional print_name() function pointer of extcon_dev

This patch removes the optional print_name() function pointer included in
'struct extcon_dev' because the extcon must maintain the consistent name of
extcon device on sysfs instead of inconsistent name. After merged patch[1],
extcon can maintain the consistent name of extcon device without any hard-coded
device name.
[1] https://lkml.org/lkml/2015/4/27/258

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/extcon/extcon.c | 8 --------
 include/linux/extcon.h  | 3 ---
 2 files changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index 5099c11d8833..fafd428cae7f 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -207,14 +207,6 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr,
 {
 	struct extcon_dev *edev = dev_get_drvdata(dev);
 
-	/* Optional callback given by the user */
-	if (edev->print_name) {
-		int ret = edev->print_name(edev, buf);
-
-		if (ret >= 0)
-			return ret;
-	}
-
 	return sprintf(buf, "%s\n", edev->name);
 }
 static DEVICE_ATTR_RO(name);
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index be9652b3a154..a7b224b20ecc 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -83,8 +83,6 @@ struct extcon_cable;
  *			be attached simulataneously. {0x7, 0} is equivalent to
  *			{0x3, 0x6, 0x5, 0}. If it is {0xFFFFFFFF, 0}, there
  *			can be no simultaneous connections.
- * @print_name:		An optional callback to override the method to print the
- *			name of the extcon device.
  * @print_state:	An optional callback to override the method to print the
  *			status of the extcon device.
  * @dev:		Device of this extcon.
@@ -111,7 +109,6 @@ struct extcon_dev {
 	const u32 *mutually_exclusive;
 
 	/* Optional callbacks to override class functions */
-	ssize_t	(*print_name)(struct extcon_dev *edev, char *buf);
 	ssize_t	(*print_state)(struct extcon_dev *edev, char *buf);
 
 	/* Internal data. Please do not set. */
-- 
cgit v1.2.3


From c80ef9e0c021ff86771fdd72583c75d8f7b6a720 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 29 May 2015 10:52:59 +0200
Subject: cgroup: add seq_file forward declaration for struct cftype

Recent header file changes for cgroup caused lots of warnings
about a missing struct seq_file form declaration for every
inclusion of include/linux/cgroup-defs.h.

As some files are built with -Werror, this leads to build
failure like:

                 from /git/arm-soc/drivers/gpu/drm/tilcdc/tilcdc_crtc.c:18:
/git/arm-soc/include/linux/cgroup-defs.h:354:25: error: 'struct seq_file' declared inside parameter list [-Werror]
cc1: all warnings being treated as errors
make[6]: *** [drivers/gpu/drm/tilcdc/tilcdc_crtc.o] Error 1

This patch adds the declaration, which resolves both the
warnings and the drm failure.

tj: Moved it where other type declarations are.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: b4a04ab7a37b ("cgroup: separate out include/linux/cgroup-defs.h")
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 7d83d7f73420..26d1cea7929f 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -26,6 +26,7 @@ struct cgroup_taskset;
 struct kernfs_node;
 struct kernfs_ops;
 struct kernfs_open_file;
+struct seq_file;
 
 #define MAX_CGROUP_TYPE_NAMELEN 32
 #define MAX_CGROUP_ROOT_NAMELEN 64
-- 
cgit v1.2.3


From e548ca4ee4595f65b262661d166310ad8a149bec Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 29 May 2015 13:11:32 -0600
Subject: block: don't honor chunk sizes for data-less IO

We don't need to honor chunk sizes for IO that doesn't carry any
data.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9ded80da2c16..ccaa9aecd593 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -903,7 +903,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
 	if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
 		return q->limits.max_hw_sectors;
 
-	if (!q->limits.chunk_sectors)
+	if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD))
 		return blk_queue_get_max_sectors(q, rq->cmd_flags);
 
 	return min(blk_max_size_offset(q, blk_rq_pos(rq)),
-- 
cgit v1.2.3


From 19bdb6e4ec071bc49a9871b41e6a59a1657ed365 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 26 May 2015 15:11:44 -0600
Subject: PCI: Move pci_ari_enabled() to global header

pci_ari_enabled() is useful outside of drivers/pci, particularly for
deriving INTx routing via ACPI _PRT, so move it to the global header.
Also convert to bool return.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Don Dutile <ddutile@redhat.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pci/pci.h   | 11 -----------
 include/linux/pci.h | 11 +++++++++++
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9bd762c237ab..c1b2a433ca04 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -216,17 +216,6 @@ void __pci_bus_assign_resources(const struct pci_bus *bus,
 				struct list_head *fail_head);
 bool pci_bus_clip_resource(struct pci_dev *dev, int idx);
 
-/**
- * pci_ari_enabled - query ARI forwarding status
- * @bus: the PCI bus
- *
- * Returns 1 if ARI forwarding is enabled, or 0 if not enabled;
- */
-static inline int pci_ari_enabled(struct pci_bus *bus)
-{
-	return bus->self && bus->self->ari_enabled;
-}
-
 void pci_reassigndev_resource_alignment(struct pci_dev *dev);
 void pci_disable_bridge_window(struct pci_dev *dev);
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 353db8dc4c6e..2925561a8f1e 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1905,4 +1905,15 @@ static inline bool pci_is_dev_assigned(struct pci_dev *pdev)
 {
 	return (pdev->dev_flags & PCI_DEV_FLAGS_ASSIGNED) == PCI_DEV_FLAGS_ASSIGNED;
 }
+
+/**
+ * pci_ari_enabled - query ARI forwarding status
+ * @bus: the PCI bus
+ *
+ * Returns true if ARI forwarding is enabled.
+ */
+static inline bool pci_ari_enabled(struct pci_bus *bus)
+{
+	return bus->self && bus->self->ari_enabled;
+}
 #endif /* LINUX_PCI_H */
-- 
cgit v1.2.3


From 3a9ad0b4fdcd57f775d3615004c8c64c021a9e7d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 27 May 2015 17:23:51 -0700
Subject: PCI: Add pci_bus_addr_t

David Ahern reported that d63e2e1f3df9 ("sparc/PCI: Clip bridge windows
to fit in upstream windows") fails to boot on sparc/T5-8:

  pci 0000:06:00.0: reg 0x184: can't handle BAR above 4GB (bus address 0x110204000)

The problem is that sparc64 assumed that dma_addr_t only needed to hold DMA
addresses, i.e., bus addresses returned via the DMA API (dma_map_single(),
etc.), while the PCI core assumed dma_addr_t could hold *any* bus address,
including raw BAR values.  On sparc64, all DMA addresses fit in 32 bits, so
dma_addr_t is a 32-bit type.  However, BAR values can be 64 bits wide, so
they don't fit in a dma_addr_t.  d63e2e1f3df9 added new checking that
tripped over this mismatch.

Add pci_bus_addr_t, which is wide enough to hold any PCI bus address,
including both raw BAR values and DMA addresses.  This will be 64 bits
on 64-bit platforms and on platforms with a 64-bit dma_addr_t.  Then
dma_addr_t only needs to be wide enough to hold addresses from the DMA API.

[bhelgaas: changelog, bugzilla, Kconfig to ensure pci_bus_addr_t is at
least as wide as dma_addr_t, documentation]
Fixes: d63e2e1f3df9 ("sparc/PCI: Clip bridge windows to fit in upstream windows")
Fixes: 23b13bc76f35 ("PCI: Fail safely if we can't handle BARs larger than 4GB")
Link: http://lkml.kernel.org/r/CAE9FiQU1gJY1LYrxs+ma5LCTEEe4xmtjRG0aXJ9K_Tsu+m9Wuw@mail.gmail.com
Link: http://lkml.kernel.org/r/1427857069-6789-1-git-send-email-yinghai@kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=96231
Reported-by: David Ahern <david.ahern@oracle.com>
Tested-by: David Ahern <david.ahern@oracle.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: David S. Miller <davem@davemloft.net>
CC: stable@vger.kernel.org	# v3.19+
---
 Documentation/DMA-API-HOWTO.txt | 29 +++++++++++++++++------------
 Documentation/DMA-API.txt       | 30 +++++++++++++++---------------
 drivers/pci/Kconfig             |  4 ++++
 drivers/pci/bus.c               | 10 +++++-----
 drivers/pci/probe.c             | 12 ++++++------
 include/linux/pci.h             | 12 +++++++++---
 include/linux/types.h           | 12 ++++++++++--
 7 files changed, 66 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt
index 0f7afb2bb442..aef8cc5a677b 100644
--- a/Documentation/DMA-API-HOWTO.txt
+++ b/Documentation/DMA-API-HOWTO.txt
@@ -25,13 +25,18 @@ physical addresses.  These are the addresses in /proc/iomem.  The physical
 address is not directly useful to a driver; it must use ioremap() to map
 the space and produce a virtual address.
 
-I/O devices use a third kind of address: a "bus address" or "DMA address".
-If a device has registers at an MMIO address, or if it performs DMA to read
-or write system memory, the addresses used by the device are bus addresses.
-In some systems, bus addresses are identical to CPU physical addresses, but
-in general they are not.  IOMMUs and host bridges can produce arbitrary
+I/O devices use a third kind of address: a "bus address".  If a device has
+registers at an MMIO address, or if it performs DMA to read or write system
+memory, the addresses used by the device are bus addresses.  In some
+systems, bus addresses are identical to CPU physical addresses, but in
+general they are not.  IOMMUs and host bridges can produce arbitrary
 mappings between physical and bus addresses.
 
+From a device's point of view, DMA uses the bus address space, but it may
+be restricted to a subset of that space.  For example, even if a system
+supports 64-bit addresses for main memory and PCI BARs, it may use an IOMMU
+so devices only need to use 32-bit DMA addresses.
+
 Here's a picture and some examples:
 
                CPU                  CPU                  Bus
@@ -72,11 +77,11 @@ can use virtual address X to access the buffer, but the device itself
 cannot because DMA doesn't go through the CPU virtual memory system.
 
 In some simple systems, the device can do DMA directly to physical address
-Y.  But in many others, there is IOMMU hardware that translates bus
+Y.  But in many others, there is IOMMU hardware that translates DMA
 addresses to physical addresses, e.g., it translates Z to Y.  This is part
 of the reason for the DMA API: the driver can give a virtual address X to
 an interface like dma_map_single(), which sets up any required IOMMU
-mapping and returns the bus address Z.  The driver then tells the device to
+mapping and returns the DMA address Z.  The driver then tells the device to
 do DMA to Z, and the IOMMU maps it to the buffer at address Y in system
 RAM.
 
@@ -98,7 +103,7 @@ First of all, you should make sure
 #include <linux/dma-mapping.h>
 
 is in your driver, which provides the definition of dma_addr_t.  This type
-can hold any valid DMA or bus address for the platform and should be used
+can hold any valid DMA address for the platform and should be used
 everywhere you hold a DMA address returned from the DMA mapping functions.
 
 			 What memory is DMA'able?
@@ -316,7 +321,7 @@ There are two types of DMA mappings:
   Think of "consistent" as "synchronous" or "coherent".
 
   The current default is to return consistent memory in the low 32
-  bits of the bus space.  However, for future compatibility you should
+  bits of the DMA space.  However, for future compatibility you should
   set the consistent mask even if this default is fine for your
   driver.
 
@@ -403,7 +408,7 @@ dma_alloc_coherent() returns two values: the virtual address which you
 can use to access it from the CPU and dma_handle which you pass to the
 card.
 
-The CPU virtual address and the DMA bus address are both
+The CPU virtual address and the DMA address are both
 guaranteed to be aligned to the smallest PAGE_SIZE order which
 is greater than or equal to the requested size.  This invariant
 exists (for example) to guarantee that if you allocate a chunk
@@ -645,8 +650,8 @@ PLEASE NOTE:  The 'nents' argument to the dma_unmap_sg call must be
               dma_map_sg call.
 
 Every dma_map_{single,sg}() call should have its dma_unmap_{single,sg}()
-counterpart, because the bus address space is a shared resource and
-you could render the machine unusable by consuming all bus addresses.
+counterpart, because the DMA address space is a shared resource and
+you could render the machine unusable by consuming all DMA addresses.
 
 If you need to use the same streaming DMA region multiple times and touch
 the data in between the DMA transfers, the buffer needs to be synced
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 52088408668a..7eba542eff7c 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -18,10 +18,10 @@ Part I - dma_ API
 To get the dma_ API, you must #include <linux/dma-mapping.h>.  This
 provides dma_addr_t and the interfaces described below.
 
-A dma_addr_t can hold any valid DMA or bus address for the platform.  It
-can be given to a device to use as a DMA source or target.  A CPU cannot
-reference a dma_addr_t directly because there may be translation between
-its physical address space and the bus address space.
+A dma_addr_t can hold any valid DMA address for the platform.  It can be
+given to a device to use as a DMA source or target.  A CPU cannot reference
+a dma_addr_t directly because there may be translation between its physical
+address space and the DMA address space.
 
 Part Ia - Using large DMA-coherent buffers
 ------------------------------------------
@@ -42,7 +42,7 @@ It returns a pointer to the allocated region (in the processor's virtual
 address space) or NULL if the allocation failed.
 
 It also returns a <dma_handle> which may be cast to an unsigned integer the
-same width as the bus and given to the device as the bus address base of
+same width as the bus and given to the device as the DMA address base of
 the region.
 
 Note: consistent memory can be expensive on some platforms, and the
@@ -193,7 +193,7 @@ dma_map_single(struct device *dev, void *cpu_addr, size_t size,
 		      enum dma_data_direction direction)
 
 Maps a piece of processor virtual memory so it can be accessed by the
-device and returns the bus address of the memory.
+device and returns the DMA address of the memory.
 
 The direction for both APIs may be converted freely by casting.
 However the dma_ API uses a strongly typed enumerator for its
@@ -212,20 +212,20 @@ contiguous piece of memory.  For this reason, memory to be mapped by
 this API should be obtained from sources which guarantee it to be
 physically contiguous (like kmalloc).
 
-Further, the bus address of the memory must be within the
+Further, the DMA address of the memory must be within the
 dma_mask of the device (the dma_mask is a bit mask of the
-addressable region for the device, i.e., if the bus address of
-the memory ANDed with the dma_mask is still equal to the bus
+addressable region for the device, i.e., if the DMA address of
+the memory ANDed with the dma_mask is still equal to the DMA
 address, then the device can perform DMA to the memory).  To
 ensure that the memory allocated by kmalloc is within the dma_mask,
 the driver may specify various platform-dependent flags to restrict
-the bus address range of the allocation (e.g., on x86, GFP_DMA
-guarantees to be within the first 16MB of available bus addresses,
+the DMA address range of the allocation (e.g., on x86, GFP_DMA
+guarantees to be within the first 16MB of available DMA addresses,
 as required by ISA devices).
 
 Note also that the above constraints on physical contiguity and
 dma_mask may not apply if the platform has an IOMMU (a device which
-maps an I/O bus address to a physical memory address).  However, to be
+maps an I/O DMA address to a physical memory address).  However, to be
 portable, device driver writers may *not* assume that such an IOMMU
 exists.
 
@@ -296,7 +296,7 @@ reduce current DMA mapping usage or delay and try again later).
 	dma_map_sg(struct device *dev, struct scatterlist *sg,
 		int nents, enum dma_data_direction direction)
 
-Returns: the number of bus address segments mapped (this may be shorter
+Returns: the number of DMA address segments mapped (this may be shorter
 than <nents> passed in if some elements of the scatter/gather list are
 physically or virtually adjacent and an IOMMU maps them with a single
 entry).
@@ -340,7 +340,7 @@ must be the same as those and passed in to the scatter/gather mapping
 API.
 
 Note: <nents> must be the number you passed in, *not* the number of
-bus address entries returned.
+DMA address entries returned.
 
 void
 dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
@@ -507,7 +507,7 @@ it's asked for coherent memory for this device.
 phys_addr is the CPU physical address to which the memory is currently
 assigned (this will be ioremapped so the CPU can access the region).
 
-device_addr is the bus address the device needs to be programmed
+device_addr is the DMA address the device needs to be programmed
 with to actually address this memory (this will be handed out as the
 dma_addr_t in dma_alloc_coherent()).
 
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 7a8f1c5e65af..73de4efcbe6e 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -1,6 +1,10 @@
 #
 # PCI configuration
 #
+config PCI_BUS_ADDR_T_64BIT
+	def_bool y if (ARCH_DMA_ADDR_T_64BIT || 64BIT)
+	depends on PCI
+
 config PCI_MSI
 	bool "Message Signaled Interrupts (MSI and MSI-X)"
 	depends on PCI
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 90fa3a78fb7c..6fbd3f2b5992 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -92,11 +92,11 @@ void pci_bus_remove_resources(struct pci_bus *bus)
 }
 
 static struct pci_bus_region pci_32_bit = {0, 0xffffffffULL};
-#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
 static struct pci_bus_region pci_64_bit = {0,
-				(dma_addr_t) 0xffffffffffffffffULL};
-static struct pci_bus_region pci_high = {(dma_addr_t) 0x100000000ULL,
-				(dma_addr_t) 0xffffffffffffffffULL};
+				(pci_bus_addr_t) 0xffffffffffffffffULL};
+static struct pci_bus_region pci_high = {(pci_bus_addr_t) 0x100000000ULL,
+				(pci_bus_addr_t) 0xffffffffffffffffULL};
 #endif
 
 /*
@@ -200,7 +200,7 @@ int pci_bus_alloc_resource(struct pci_bus *bus, struct resource *res,
 					  resource_size_t),
 		void *alignf_data)
 {
-#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
 	int rc;
 
 	if (res->flags & IORESOURCE_MEM_64) {
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 6675a7a1b9fc..c91185721345 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -254,8 +254,8 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 	}
 
 	if (res->flags & IORESOURCE_MEM_64) {
-		if ((sizeof(dma_addr_t) < 8 || sizeof(resource_size_t) < 8) &&
-		    sz64 > 0x100000000ULL) {
+		if ((sizeof(pci_bus_addr_t) < 8 || sizeof(resource_size_t) < 8)
+		    && sz64 > 0x100000000ULL) {
 			res->flags |= IORESOURCE_UNSET | IORESOURCE_DISABLED;
 			res->start = 0;
 			res->end = 0;
@@ -264,7 +264,7 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 			goto out;
 		}
 
-		if ((sizeof(dma_addr_t) < 8) && l) {
+		if ((sizeof(pci_bus_addr_t) < 8) && l) {
 			/* Above 32-bit boundary; try to reallocate */
 			res->flags |= IORESOURCE_UNSET;
 			res->start = 0;
@@ -399,7 +399,7 @@ static void pci_read_bridge_mmio_pref(struct pci_bus *child)
 	struct pci_dev *dev = child->self;
 	u16 mem_base_lo, mem_limit_lo;
 	u64 base64, limit64;
-	dma_addr_t base, limit;
+	pci_bus_addr_t base, limit;
 	struct pci_bus_region region;
 	struct resource *res;
 
@@ -426,8 +426,8 @@ static void pci_read_bridge_mmio_pref(struct pci_bus *child)
 		}
 	}
 
-	base = (dma_addr_t) base64;
-	limit = (dma_addr_t) limit64;
+	base = (pci_bus_addr_t) base64;
+	limit = (pci_bus_addr_t) limit64;
 
 	if (base != base64) {
 		dev_err(&dev->dev, "can't handle bridge window above 4GB (bus address %#010llx)\n",
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 353db8dc4c6e..956f74bad37a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -577,9 +577,15 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
 int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
 		  int reg, int len, u32 val);
 
+#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+typedef u64 pci_bus_addr_t;
+#else
+typedef u32 pci_bus_addr_t;
+#endif
+
 struct pci_bus_region {
-	dma_addr_t start;
-	dma_addr_t end;
+	pci_bus_addr_t start;
+	pci_bus_addr_t end;
 };
 
 struct pci_dynids {
@@ -1128,7 +1134,7 @@ int __must_check pci_bus_alloc_resource(struct pci_bus *bus,
 
 int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr);
 
-static inline dma_addr_t pci_bus_address(struct pci_dev *pdev, int bar)
+static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar)
 {
 	struct pci_bus_region region;
 
diff --git a/include/linux/types.h b/include/linux/types.h
index 59698be03490..8715287c3b1f 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -139,12 +139,20 @@ typedef unsigned long blkcnt_t;
  */
 #define pgoff_t unsigned long
 
-/* A dma_addr_t can hold any valid DMA or bus address for the platform */
+/*
+ * A dma_addr_t can hold any valid DMA address, i.e., any address returned
+ * by the DMA API.
+ *
+ * If the DMA API only uses 32-bit addresses, dma_addr_t need only be 32
+ * bits wide.  Bus addresses, e.g., PCI BARs, may be wider than 32 bits,
+ * but drivers do memory-mapped I/O to ioremapped kernel virtual addresses,
+ * so they don't care about the size of the actual bus addresses.
+ */
 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 typedef u64 dma_addr_t;
 #else
 typedef u32 dma_addr_t;
-#endif /* dma_addr_t */
+#endif
 
 typedef unsigned __bitwise__ gfp_t;
 typedef unsigned __bitwise__ fmode_t;
-- 
cgit v1.2.3


From db874c7e10557f8f1af9a6fb1ec6589ae06f349c Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Fri, 29 May 2015 09:54:02 -0700
Subject: PM / wakeirq: Fix typo in prototype for dev_pm_set_dedicated_wake_irq

Looks like I only built test the dev_pm_set_wake_irq and not the
dev_pm_set_dedicated_wake_irq case on x86.

Turns out there's a typo for the dev_pm_set_dedicated_wake_irq
prototype that causes a build error if CONFIG_COMPILE_TEST
and CONFIG_MMC_OMAP_HS are selected.

Reported-by: Jim Davis <jim.epost@gmail.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Reviewed-by: Felipe Balbi <balbi@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_wakeirq.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h
index 4046fa1b7d25..cd5b62db9084 100644
--- a/include/linux/pm_wakeirq.h
+++ b/include/linux/pm_wakeirq.h
@@ -30,8 +30,7 @@ static inline int dev_pm_set_wake_irq(struct device *dev, int irq)
 	return 0;
 }
 
-static inline int dev_pm_set_dedicated__wake_irq(struct device *dev,
-						 int irq)
+static inline int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 5790cf3c00c2f92aacba348e13f8a9a8f5dd96bd Mon Sep 17 00:00:00 2001
From: Mathieu Olivari <mathieu@codeaurora.org>
Date: Wed, 27 May 2015 11:02:47 -0700
Subject: stmmac: add phy-handle support to the platform layer

On stmmac driver, PHY specification in device-tree was done using the
non-standard property "snps,phy-addr". Specifying a PHY on a different
MDIO bus that the one within the stmmac controller doesn't seem to be
possible when device-tree is used.

This change adds support for the phy-handle property, as specified in
Documentation/devicetree/bindings/net/ethernet.txt.

Signed-off-by: Mathieu Olivari <mathieu@codeaurora.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 28 ++++++++++++++--------
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  |  6 ++++-
 include/linux/stmmac.h                             |  1 +
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index e4f273976071..31c64169f2ec 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -52,6 +52,7 @@
 #include "stmmac_ptp.h"
 #include "stmmac.h"
 #include <linux/reset.h>
+#include <linux/of_mdio.h>
 
 #define STMMAC_ALIGN(x)	L1_CACHE_ALIGN(x)
 
@@ -816,18 +817,25 @@ static int stmmac_init_phy(struct net_device *dev)
 	priv->speed = 0;
 	priv->oldduplex = -1;
 
-	if (priv->plat->phy_bus_name)
-		snprintf(bus_id, MII_BUS_ID_SIZE, "%s-%x",
-			 priv->plat->phy_bus_name, priv->plat->bus_id);
-	else
-		snprintf(bus_id, MII_BUS_ID_SIZE, "stmmac-%x",
-			 priv->plat->bus_id);
+	if (priv->plat->phy_node) {
+		phydev = of_phy_connect(dev, priv->plat->phy_node,
+					&stmmac_adjust_link, 0, interface);
+	} else {
+		if (priv->plat->phy_bus_name)
+			snprintf(bus_id, MII_BUS_ID_SIZE, "%s-%x",
+				 priv->plat->phy_bus_name, priv->plat->bus_id);
+		else
+			snprintf(bus_id, MII_BUS_ID_SIZE, "stmmac-%x",
+				 priv->plat->bus_id);
 
-	snprintf(phy_id_fmt, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, bus_id,
-		 priv->plat->phy_addr);
-	pr_debug("stmmac_init_phy:  trying to attach to %s\n", phy_id_fmt);
+		snprintf(phy_id_fmt, MII_BUS_ID_SIZE + 3, PHY_ID_FMT, bus_id,
+			 priv->plat->phy_addr);
+		pr_debug("stmmac_init_phy:  trying to attach to %s\n",
+			 phy_id_fmt);
 
-	phydev = phy_connect(dev, phy_id_fmt, &stmmac_adjust_link, interface);
+		phydev = phy_connect(dev, phy_id_fmt, &stmmac_adjust_link,
+				     interface);
+	}
 
 	if (IS_ERR(phydev)) {
 		pr_err("%s: Could not attach to PHY\n", dev->name);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 1664c0186f5b..8d23155a1a7e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -28,6 +28,7 @@
 #include <linux/of.h>
 #include <linux/of_net.h>
 #include <linux/of_device.h>
+#include <linux/of_mdio.h>
 
 #include "stmmac.h"
 #include "stmmac_platform.h"
@@ -144,13 +145,16 @@ static int stmmac_probe_config_dt(struct platform_device *pdev,
 	/* Default to phy auto-detection */
 	plat->phy_addr = -1;
 
+	/* If we find a phy-handle property, use it as the PHY */
+	plat->phy_node = of_parse_phandle(np, "phy-handle", 0);
+
 	/* "snps,phy-addr" is not a standard property. Mark it as deprecated
 	 * and warn of its use. Remove this when phy node support is added.
 	 */
 	if (of_property_read_u32(np, "snps,phy-addr", &plat->phy_addr) == 0)
 		dev_warn(&pdev->dev, "snps,phy-addr property is deprecated\n");
 
-	if (plat->phy_bus_name)
+	if (plat->phy_node || plat->phy_bus_name)
 		plat->mdio_bus_data = NULL;
 	else
 		plat->mdio_bus_data =
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 7f484a239f53..c735f5c91eea 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -99,6 +99,7 @@ struct plat_stmmacenet_data {
 	int phy_addr;
 	int interface;
 	struct stmmac_mdio_bus_data *mdio_bus_data;
+	struct device_node *phy_node;
 	struct stmmac_dma_cfg *dma_cfg;
 	int clk_csr;
 	int has_gmac;
-- 
cgit v1.2.3


From f4fb874cf076f9eafdd15c0a88cd0f0397b95e43 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Wed, 27 May 2015 21:07:26 -0400
Subject: if_vlan: fix vlaue -> value typo

Fixes "vlaue" for "value" in include/linux/if_vlan.h.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index b9ab677c0c0a..a40d29846ac2 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -416,7 +416,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
 /**
  * __vlan_get_tag - get the VLAN ID that is part of the payload
  * @skb: skbuff to query
- * @vlan_tci: buffer to store vlaue
+ * @vlan_tci: buffer to store value
  *
  * Returns error if the skb is not of VLAN type
  */
@@ -435,7 +435,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 /**
  * __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->cb[]
  * @skb: skbuff to query
- * @vlan_tci: buffer to store vlaue
+ * @vlan_tci: buffer to store value
  *
  * Returns error if @skb->vlan_tci is not set correctly
  */
@@ -456,7 +456,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
 /**
  * vlan_get_tag - get the VLAN ID from the skb
  * @skb: skbuff to query
- * @vlan_tci: buffer to store vlaue
+ * @vlan_tci: buffer to store value
  *
  * Returns error if the skb is not VLAN tagged
  */
-- 
cgit v1.2.3


From 64ffaa2159b752e6c263dc57eaaaed7367d37493 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirv@mellanox.com>
Date: Thu, 28 May 2015 22:28:38 +0300
Subject: net/mlx5_core,mlx5_ib: Do not use vmap() on coherent memory

As David Daney pointed in mlx4_core driver [1], mlx5_core is also
misusing the DMA-API.

This patch is removing the code that vmap() memory allocated by
dma_alloc_coherent().

After this patch, users of this drivers might fail allocating resources
on memory fragmeneted systems.  This will be fixed later on.

[1] - https://patchwork.ozlabs.org/patch/458531/

CC: David Daney <david.daney@cavium.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/cq.c                 |  3 +-
 drivers/infiniband/hw/mlx5/qp.c                 |  2 +-
 drivers/infiniband/hw/mlx5/srq.c                |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 96 +++++--------------------
 drivers/net/ethernet/mellanox/mlx5/core/eq.c    |  3 +-
 include/linux/mlx5/driver.h                     |  9 +--
 6 files changed, 22 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 2ee6b1051975..4e88b18cf62e 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -590,8 +590,7 @@ static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf,
 {
 	int err;
 
-	err = mlx5_buf_alloc(dev->mdev, nent * cqe_size,
-			     PAGE_SIZE * 2, &buf->buf);
+	err = mlx5_buf_alloc(dev->mdev, nent * cqe_size, &buf->buf);
 	if (err)
 		return err;
 
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index d35f62d4f4c5..426eb88dfa49 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -768,7 +768,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev,
 	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 	qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
 
-	err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf);
+	err = mlx5_buf_alloc(dev->mdev, qp->buf_size, &qp->buf);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
 		goto err_uuar;
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 02d77a29764d..4242e1ded868 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -165,7 +165,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
 		return err;
 	}
 
-	if (mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+	if (mlx5_buf_alloc(dev->mdev, buf_size, &srq->buf)) {
 		mlx5_ib_dbg(dev, "buf alloc failed\n");
 		err = -ENOMEM;
 		goto err_db;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index ac0f7bf4be95..0715b497511f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -42,95 +42,36 @@
 #include "mlx5_core.h"
 
 /* Handling for queue buffers -- we allocate a bunch of memory and
- * register it in a memory region at HCA virtual address 0.  If the
- * requested size is > max_direct, we split the allocation into
- * multiple pages, so we don't require too much contiguous memory.
+ * register it in a memory region at HCA virtual address 0.
  */
 
-int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct,
-		   struct mlx5_buf *buf)
+int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf)
 {
 	dma_addr_t t;
 
 	buf->size = size;
-	if (size <= max_direct) {
-		buf->nbufs        = 1;
-		buf->npages       = 1;
-		buf->page_shift   = (u8)get_order(size) + PAGE_SHIFT;
-		buf->direct.buf   = dma_zalloc_coherent(&dev->pdev->dev,
-							size, &t, GFP_KERNEL);
-		if (!buf->direct.buf)
-			return -ENOMEM;
-
-		buf->direct.map = t;
-
-		while (t & ((1 << buf->page_shift) - 1)) {
-			--buf->page_shift;
-			buf->npages *= 2;
-		}
-	} else {
-		int i;
-
-		buf->direct.buf  = NULL;
-		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
-		buf->npages      = buf->nbufs;
-		buf->page_shift  = PAGE_SHIFT;
-		buf->page_list   = kcalloc(buf->nbufs, sizeof(*buf->page_list),
-					   GFP_KERNEL);
-		if (!buf->page_list)
-			return -ENOMEM;
-
-		for (i = 0; i < buf->nbufs; i++) {
-			buf->page_list[i].buf =
-				dma_zalloc_coherent(&dev->pdev->dev, PAGE_SIZE,
-						    &t, GFP_KERNEL);
-			if (!buf->page_list[i].buf)
-				goto err_free;
-
-			buf->page_list[i].map = t;
-		}
-
-		if (BITS_PER_LONG == 64) {
-			struct page **pages;
-			pages = kmalloc(sizeof(*pages) * buf->nbufs, GFP_KERNEL);
-			if (!pages)
-				goto err_free;
-			for (i = 0; i < buf->nbufs; i++)
-				pages[i] = virt_to_page(buf->page_list[i].buf);
-			buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
-			kfree(pages);
-			if (!buf->direct.buf)
-				goto err_free;
-		}
-	}
+	buf->npages       = 1;
+	buf->page_shift   = (u8)get_order(size) + PAGE_SHIFT;
+	buf->direct.buf   = dma_zalloc_coherent(&dev->pdev->dev,
+						size, &t, GFP_KERNEL);
+	if (!buf->direct.buf)
+		return -ENOMEM;
 
-	return 0;
+	buf->direct.map = t;
 
-err_free:
-	mlx5_buf_free(dev, buf);
+	while (t & ((1 << buf->page_shift) - 1)) {
+		--buf->page_shift;
+		buf->npages *= 2;
+	}
 
-	return -ENOMEM;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx5_buf_alloc);
 
 void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf)
 {
-	int i;
-
-	if (buf->nbufs == 1)
-		dma_free_coherent(&dev->pdev->dev, buf->size, buf->direct.buf,
-				  buf->direct.map);
-	else {
-		if (BITS_PER_LONG == 64)
-			vunmap(buf->direct.buf);
-
-		for (i = 0; i < buf->nbufs; i++)
-			if (buf->page_list[i].buf)
-				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
-						  buf->page_list[i].buf,
-						  buf->page_list[i].map);
-		kfree(buf->page_list);
-	}
+	dma_free_coherent(&dev->pdev->dev, buf->size, buf->direct.buf,
+			  buf->direct.map);
 }
 EXPORT_SYMBOL_GPL(mlx5_buf_free);
 
@@ -230,10 +171,7 @@ void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas)
 	int i;
 
 	for (i = 0; i < buf->npages; i++) {
-		if (buf->nbufs == 1)
-			addr = buf->direct.map + (i << buf->page_shift);
-		else
-			addr = buf->page_list[i].map;
+		addr = buf->direct.map + (i << buf->page_shift);
 
 		pas[i] = cpu_to_be64(addr);
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 58800e4f3958..3f511bd84489 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -346,8 +346,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	int inlen;
 
 	eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
-	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, 2 * PAGE_SIZE,
-			     &eq->buf);
+	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
 	if (err)
 		return err;
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9a90e7523dc2..c4cf25ffcc16 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -334,8 +334,6 @@ struct mlx5_buf_list {
 
 struct mlx5_buf {
 	struct mlx5_buf_list	direct;
-	struct mlx5_buf_list   *page_list;
-	int			nbufs;
 	int			npages;
 	int			size;
 	u8			page_shift;
@@ -586,11 +584,7 @@ struct mlx5_pas {
 
 static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset)
 {
-	if (likely(BITS_PER_LONG == 64 || buf->nbufs == 1))
 		return buf->direct.buf + offset;
-	else
-		return buf->page_list[offset >> PAGE_SHIFT].buf +
-			(offset & (PAGE_SIZE - 1));
 }
 
 extern struct workqueue_struct *mlx5_core_wq;
@@ -669,8 +663,7 @@ void mlx5_health_cleanup(void);
 void  __init mlx5_health_init(void);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
-int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct,
-		   struct mlx5_buf *buf);
+int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);
 void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf);
 struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 						      gfp_t flags, int npages);
-- 
cgit v1.2.3


From db058a186f98b057c19c42f7b10d9a96fd3b5d59 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 28 May 2015 22:28:39 +0300
Subject: net/mlx5_core: Set irq affinity hints

Preparation for upcoming ethernet driver.
- Move msix array from eq_table struct to priv since its not related to
  eq_table
- Intorduce irq_info struct to hold all irq information
- Move name from mlx5_eq to irq_info struct since it is irq property.
- Set IRQ affinity hints

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c   |  16 ++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 111 ++++++++++++++++++++++---
 include/linux/mlx5/driver.h                    |  11 ++-
 3 files changed, 117 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 3f511bd84489..516efc25fc4f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -339,7 +339,7 @@ static void init_eq_buf(struct mlx5_eq *eq)
 int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 		       int nent, u64 mask, const char *name, struct mlx5_uar *uar)
 {
-	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_priv *priv = &dev->priv;
 	struct mlx5_create_eq_mbox_in *in;
 	struct mlx5_create_eq_mbox_out out;
 	int err;
@@ -377,14 +377,15 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 		goto err_in;
 	}
 
-	snprintf(eq->name, MLX5_MAX_EQ_NAME, "%s@pci:%s",
+	snprintf(priv->irq_info[vecidx].name, MLX5_MAX_IRQ_NAME, "%s@pci:%s",
 		 name, pci_name(dev->pdev));
+
 	eq->eqn = out.eq_number;
 	eq->irqn = vecidx;
 	eq->dev = dev;
 	eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET;
-	err = request_irq(table->msix_arr[vecidx].vector, mlx5_msix_handler, 0,
-			  eq->name, eq);
+	err = request_irq(priv->msix_arr[vecidx].vector, mlx5_msix_handler, 0,
+			  priv->irq_info[vecidx].name, eq);
 	if (err)
 		goto err_eq;
 
@@ -400,7 +401,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
 	return 0;
 
 err_irq:
-	free_irq(table->msix_arr[vecidx].vector, eq);
+	free_irq(priv->msix_arr[vecidx].vector, eq);
 
 err_eq:
 	mlx5_cmd_destroy_eq(dev, eq->eqn);
@@ -416,16 +417,15 @@ EXPORT_SYMBOL_GPL(mlx5_create_map_eq);
 
 int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 {
-	struct mlx5_eq_table *table = &dev->priv.eq_table;
 	int err;
 
 	mlx5_debug_eq_remove(dev, eq);
-	free_irq(table->msix_arr[eq->irqn].vector, eq);
+	free_irq(dev->priv.msix_arr[eq->irqn].vector, eq);
 	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
 	if (err)
 		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
 			       eq->eqn);
-	synchronize_irq(table->msix_arr[eq->irqn].vector);
+	synchronize_irq(dev->priv.msix_arr[eq->irqn].vector);
 	mlx5_buf_free(dev, &eq->buf);
 
 	return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 28425e5ea91f..55085b01b4ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -38,6 +38,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/io-mapping.h>
+#include <linux/interrupt.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cq.h>
 #include <linux/mlx5/qp.h>
@@ -208,7 +209,8 @@ static void release_bar(struct pci_dev *pdev)
 
 static int mlx5_enable_msix(struct mlx5_core_dev *dev)
 {
-	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_priv *priv = &dev->priv;
+	struct mlx5_eq_table *table = &priv->eq_table;
 	int num_eqs = 1 << dev->caps.gen.log_max_eq;
 	int nvec;
 	int i;
@@ -218,14 +220,16 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev)
 	if (nvec <= MLX5_EQ_VEC_COMP_BASE)
 		return -ENOMEM;
 
-	table->msix_arr = kzalloc(nvec * sizeof(*table->msix_arr), GFP_KERNEL);
-	if (!table->msix_arr)
-		return -ENOMEM;
+	priv->msix_arr = kcalloc(nvec, sizeof(*priv->msix_arr), GFP_KERNEL);
+
+	priv->irq_info = kcalloc(nvec, sizeof(*priv->irq_info), GFP_KERNEL);
+	if (!priv->msix_arr || !priv->irq_info)
+		goto err_free_msix;
 
 	for (i = 0; i < nvec; i++)
-		table->msix_arr[i].entry = i;
+		priv->msix_arr[i].entry = i;
 
-	nvec = pci_enable_msix_range(dev->pdev, table->msix_arr,
+	nvec = pci_enable_msix_range(dev->pdev, priv->msix_arr,
 				     MLX5_EQ_VEC_COMP_BASE + 1, nvec);
 	if (nvec < 0)
 		return nvec;
@@ -233,14 +237,20 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev)
 	table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE;
 
 	return 0;
+
+err_free_msix:
+	kfree(priv->irq_info);
+	kfree(priv->msix_arr);
+	return -ENOMEM;
 }
 
 static void mlx5_disable_msix(struct mlx5_core_dev *dev)
 {
-	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_priv *priv = &dev->priv;
 
 	pci_disable_msix(dev->pdev);
-	kfree(table->msix_arr);
+	kfree(priv->irq_info);
+	kfree(priv->msix_arr);
 }
 
 struct mlx5_reg_host_endianess {
@@ -507,6 +517,77 @@ static int mlx5_core_disable_hca(struct mlx5_core_dev *dev)
 	return 0;
 }
 
+static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
+{
+	struct mlx5_priv *priv  = &mdev->priv;
+	struct msix_entry *msix = priv->msix_arr;
+	int irq                 = msix[i + MLX5_EQ_VEC_COMP_BASE].vector;
+	int numa_node           = dev_to_node(&mdev->pdev->dev);
+	int err;
+
+	if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
+		mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
+		return -ENOMEM;
+	}
+
+	err = cpumask_set_cpu_local_first(i, numa_node, priv->irq_info[i].mask);
+	if (err) {
+		mlx5_core_warn(mdev, "cpumask_set_cpu_local_first failed");
+		goto err_clear_mask;
+	}
+
+	err = irq_set_affinity_hint(irq, priv->irq_info[i].mask);
+	if (err) {
+		mlx5_core_warn(mdev, "irq_set_affinity_hint failed,irq 0x%.4x",
+			       irq);
+		goto err_clear_mask;
+	}
+
+	return 0;
+
+err_clear_mask:
+	free_cpumask_var(priv->irq_info[i].mask);
+	return err;
+}
+
+static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
+{
+	struct mlx5_priv *priv  = &mdev->priv;
+	struct msix_entry *msix = priv->msix_arr;
+	int irq                 = msix[i + MLX5_EQ_VEC_COMP_BASE].vector;
+
+	irq_set_affinity_hint(irq, NULL);
+	free_cpumask_var(priv->irq_info[i].mask);
+}
+
+static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
+		err = mlx5_irq_set_affinity_hint(mdev, i);
+		if (err)
+			goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	for (i--; i >= 0; i--)
+		mlx5_irq_clear_affinity_hint(mdev, i);
+
+	return err;
+}
+
+static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
+{
+	int i;
+
+	for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
+		mlx5_irq_clear_affinity_hint(mdev, i);
+}
+
 int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, int *irqn)
 {
 	struct mlx5_eq_table *table = &dev->priv.eq_table;
@@ -549,7 +630,7 @@ static void free_comp_eqs(struct mlx5_core_dev *dev)
 static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = &dev->priv.eq_table;
-	char name[MLX5_MAX_EQ_NAME];
+	char name[MLX5_MAX_IRQ_NAME];
 	struct mlx5_eq *eq;
 	int ncomp_vec;
 	int nent;
@@ -566,7 +647,7 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev)
 			goto clean;
 		}
 
-		snprintf(name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i);
+		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i);
 		err = mlx5_create_map_eq(dev, eq,
 					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
 					 name, &dev->priv.uuari.uars[0]);
@@ -730,6 +811,12 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 		goto err_stop_eqs;
 	}
 
+	err = mlx5_irq_set_affinity_hints(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
+		goto err_free_comp_eqs;
+	}
+
 	MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
 
 	mlx5_init_cq_table(dev);
@@ -739,6 +826,9 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 
 	return 0;
 
+err_free_comp_eqs:
+	free_comp_eqs(dev);
+
 err_stop_eqs:
 	mlx5_stop_eqs(dev);
 
@@ -793,6 +883,7 @@ static void mlx5_dev_cleanup(struct mlx5_core_dev *dev)
 	mlx5_cleanup_srq_table(dev);
 	mlx5_cleanup_qp_table(dev);
 	mlx5_cleanup_cq_table(dev);
+	mlx5_irq_clear_affinity_hints(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
 	mlx5_free_uuars(dev, &priv->uuari);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index c4cf25ffcc16..9e8979502826 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -85,7 +85,7 @@ enum {
 };
 
 enum {
-	MLX5_MAX_EQ_NAME	= 32
+	MLX5_MAX_IRQ_NAME	= 32
 };
 
 enum {
@@ -349,7 +349,6 @@ struct mlx5_eq {
 	u8			eqn;
 	int			nent;
 	u64			mask;
-	char			name[MLX5_MAX_EQ_NAME];
 	struct list_head	list;
 	int			index;
 	struct mlx5_rsc_debug	*dbg;
@@ -412,7 +411,6 @@ struct mlx5_eq_table {
 	struct mlx5_eq		pages_eq;
 	struct mlx5_eq		async_eq;
 	struct mlx5_eq		cmd_eq;
-	struct msix_entry	*msix_arr;
 	int			num_comp_vectors;
 	/* protect EQs list
 	 */
@@ -465,9 +463,16 @@ struct mlx5_mr_table {
 	struct radix_tree_root	tree;
 };
 
+struct mlx5_irq_info {
+	cpumask_var_t mask;
+	char name[MLX5_MAX_IRQ_NAME];
+};
+
 struct mlx5_priv {
 	char			name[MLX5_MAX_NAME_LEN];
 	struct mlx5_eq_table	eq_table;
+	struct msix_entry	*msix_arr;
+	struct mlx5_irq_info	*irq_info;
 	struct mlx5_uuar_info	uuari;
 	MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
 
-- 
cgit v1.2.3


From e281682bf29438848daac11627216bceb1507b71 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 28 May 2015 22:28:40 +0300
Subject: net/mlx5_core: HW data structs/types definitions cleanup

mlx5_ifc.h was heavily modified here since it is now generated by a
script from the device specification (PRM rev 0.25). This specification
is backward compatible to existing hardware.

Some structures/fields were added here in order to enable the Ethernet
functionality of the driver.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c  |   17 +-
 drivers/net/ethernet/mellanox/mlx5/core/fw.c   |    2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c |    7 +-
 drivers/net/ethernet/mellanox/mlx5/core/mcg.c  |    2 +-
 drivers/net/ethernet/mellanox/mlx5/core/uar.c  |   37 +
 include/linux/mlx5/device.h                    |  113 +-
 include/linux/mlx5/driver.h                    |    4 +-
 include/linux/mlx5/mlx5_ifc.h                  | 6608 +++++++++++++++++++++++-
 include/linux/mlx5/qp.h                        |   25 +
 9 files changed, 6705 insertions(+), 110 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index e3273faf4568..2f22cd2de2b0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -390,8 +390,17 @@ const char *mlx5_command_str(int command)
 	case MLX5_CMD_OP_ARM_RQ:
 		return "ARM_RQ";
 
-	case MLX5_CMD_OP_RESIZE_SRQ:
-		return "RESIZE_SRQ";
+	case MLX5_CMD_OP_CREATE_XRC_SRQ:
+		return "CREATE_XRC_SRQ";
+
+	case MLX5_CMD_OP_DESTROY_XRC_SRQ:
+		return "DESTROY_XRC_SRQ";
+
+	case MLX5_CMD_OP_QUERY_XRC_SRQ:
+		return "QUERY_XRC_SRQ";
+
+	case MLX5_CMD_OP_ARM_XRC_SRQ:
+		return "ARM_XRC_SRQ";
 
 	case MLX5_CMD_OP_ALLOC_PD:
 		return "ALLOC_PD";
@@ -408,8 +417,8 @@ const char *mlx5_command_str(int command)
 	case MLX5_CMD_OP_ATTACH_TO_MCG:
 		return "ATTACH_TO_MCG";
 
-	case MLX5_CMD_OP_DETACH_FROM_MCG:
-		return "DETACH_FROM_MCG";
+	case MLX5_CMD_OP_DETTACH_FROM_MCG:
+		return "DETTACH_FROM_MCG";
 
 	case MLX5_CMD_OP_ALLOC_XRCD:
 		return "ALLOC_XRCD";
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 4b4cda3bcc5f..ef9b7695decd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -95,7 +95,7 @@ int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps)
 		goto out;
 	}
 
-	memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct),
+	memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability),
 	       sizeof(*caps));
 
 	mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 55085b01b4ce..a652cb93ceaa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -319,8 +319,7 @@ static void fw2drv_caps(struct mlx5_caps *caps, void *out)
 	gen->max_srq_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_srq_sz);
 	gen->max_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_qp_sz);
 	gen->log_max_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_qp);
-	gen->log_max_strq = MLX5_GET_PR(cmd_hca_cap, out, log_max_strq_sz);
-	gen->log_max_srq = MLX5_GET_PR(cmd_hca_cap, out, log_max_srqs);
+	gen->log_max_srq = MLX5_GET_PR(cmd_hca_cap, out, log_max_srq);
 	gen->max_cqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_cq_sz);
 	gen->log_max_cq = MLX5_GET_PR(cmd_hca_cap, out, log_max_cq);
 	gen->max_eqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_eq_sz);
@@ -391,7 +390,7 @@ int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps,
 		goto query_ex;
 	}
 	mlx5_core_dbg(dev, "%s\n", caps_opmod_str(opmod));
-	fw2drv_caps(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct));
+	fw2drv_caps(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability));
 
 query_ex:
 	kfree(out);
@@ -453,7 +452,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 	/* disable checksum */
 	cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM;
 
-	copy_rw_fields(MLX5_ADDR_OF(set_hca_cap_in, set_ctx, hca_capability_struct),
+	copy_rw_fields(MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability),
 		       cur_caps);
 	err = set_caps(dev, set_ctx, set_sz);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
index d79fd85d1dd5..d5a0c2d61a18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
@@ -91,7 +91,7 @@ int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn)
 
 	memset(&in, 0, sizeof(in));
 	memset(&out, 0, sizeof(out));
-	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DETACH_FROM_MCG);
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DETTACH_FROM_MCG);
 	memcpy(in.gid, mgid, sizeof(*mgid));
 	in.qpn = cpu_to_be32(qpn);
 	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index 5a89bb1d678a..ba58f6d7c761 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -223,3 +223,40 @@ int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
 
 	return 0;
 }
+
+int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
+{
+	phys_addr_t pfn;
+	phys_addr_t uar_bar_start;
+	int err;
+
+	err = mlx5_cmd_alloc_uar(mdev, &uar->index);
+	if (err) {
+		mlx5_core_warn(mdev, "mlx5_cmd_alloc_uar() failed, %d\n", err);
+		return err;
+	}
+
+	uar_bar_start = pci_resource_start(mdev->pdev, 0);
+	pfn           = (uar_bar_start >> PAGE_SHIFT) + uar->index;
+	uar->map      = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!uar->map) {
+		mlx5_core_warn(mdev, "ioremap() failed, %d\n", err);
+		err = -ENOMEM;
+		goto err_free_uar;
+	}
+
+	return 0;
+
+err_free_uar:
+	mlx5_cmd_free_uar(mdev, uar->index);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_alloc_map_uar);
+
+void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
+{
+	iounmap(uar->map);
+	mlx5_cmd_free_uar(mdev, uar->index);
+}
+EXPORT_SYMBOL(mlx5_unmap_free_uar);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index abf65c790421..feebed7b392b 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -35,6 +35,7 @@
 
 #include <linux/types.h>
 #include <rdma/ib_verbs.h>
+#include <linux/mlx5/mlx5_ifc.h>
 
 #if defined(__LITTLE_ENDIAN)
 #define MLX5_SET_HOST_ENDIANNESS	0
@@ -70,6 +71,14 @@
 		     << __mlx5_dw_bit_off(typ, fld))); \
 } while (0)
 
+#define MLX5_SET_TO_ONES(typ, p, fld) do { \
+	BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32);             \
+	*((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \
+	cpu_to_be32((be32_to_cpu(*((__be32 *)(p) + __mlx5_dw_off(typ, fld))) & \
+		     (~__mlx5_dw_mask(typ, fld))) | ((__mlx5_mask(typ, fld)) \
+		     << __mlx5_dw_bit_off(typ, fld))); \
+} while (0)
+
 #define MLX5_GET(typ, p, fld) ((be32_to_cpu(*((__be32 *)(p) +\
 __mlx5_dw_off(typ, fld))) >> __mlx5_dw_bit_off(typ, fld)) & \
 __mlx5_mask(typ, fld))
@@ -264,6 +273,7 @@ enum {
 	MLX5_OPCODE_RDMA_WRITE_IMM	= 0x09,
 	MLX5_OPCODE_SEND		= 0x0a,
 	MLX5_OPCODE_SEND_IMM		= 0x0b,
+	MLX5_OPCODE_LSO			= 0x0e,
 	MLX5_OPCODE_RDMA_READ		= 0x10,
 	MLX5_OPCODE_ATOMIC_CS		= 0x11,
 	MLX5_OPCODE_ATOMIC_FA		= 0x12,
@@ -541,6 +551,10 @@ struct mlx5_cmd_prot_block {
 	u8		sig;
 };
 
+enum {
+	MLX5_CQE_SYND_FLUSHED_IN_ERROR = 5,
+};
+
 struct mlx5_err_cqe {
 	u8	rsvd0[32];
 	__be32	srqn;
@@ -554,13 +568,22 @@ struct mlx5_err_cqe {
 };
 
 struct mlx5_cqe64 {
-	u8		rsvd0[17];
+	u8		rsvd0[4];
+	u8		lro_tcppsh_abort_dupack;
+	u8		lro_min_ttl;
+	__be16		lro_tcp_win;
+	__be32		lro_ack_seq_num;
+	__be32		rss_hash_result;
+	u8		rss_hash_type;
 	u8		ml_path;
-	u8		rsvd20[4];
+	u8		rsvd20[2];
+	__be16		check_sum;
 	__be16		slid;
 	__be32		flags_rqpn;
-	u8		rsvd28[4];
-	__be32		srqn;
+	u8		hds_ip_ext;
+	u8		l4_hdr_type_etc;
+	__be16		vlan_info;
+	__be32		srqn; /* [31:24]: lro_num_seg, [23:0]: srqn */
 	__be32		imm_inval_pkey;
 	u8		rsvd40[4];
 	__be32		byte_cnt;
@@ -571,6 +594,40 @@ struct mlx5_cqe64 {
 	u8		op_own;
 };
 
+static inline int get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe)
+{
+	return (cqe->lro_tcppsh_abort_dupack >> 6) & 1;
+}
+
+static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe)
+{
+	return (cqe->l4_hdr_type_etc >> 4) & 0x7;
+}
+
+static inline int cqe_has_vlan(struct mlx5_cqe64 *cqe)
+{
+	return !!(cqe->l4_hdr_type_etc & 0x1);
+}
+
+enum {
+	CQE_L4_HDR_TYPE_NONE			= 0x0,
+	CQE_L4_HDR_TYPE_TCP_NO_ACK		= 0x1,
+	CQE_L4_HDR_TYPE_UDP			= 0x2,
+	CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA		= 0x3,
+	CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA	= 0x4,
+};
+
+enum {
+	CQE_RSS_HTYPE_IP	= 0x3 << 6,
+	CQE_RSS_HTYPE_L4	= 0x3 << 2,
+};
+
+enum {
+	CQE_L2_OK	= 1 << 0,
+	CQE_L3_OK	= 1 << 1,
+	CQE_L4_OK	= 1 << 2,
+};
+
 struct mlx5_sig_err_cqe {
 	u8		rsvd0[16];
 	__be32		expected_trans_sig;
@@ -996,4 +1053,52 @@ struct mlx5_destroy_psv_out {
 	u8                      rsvd[8];
 };
 
+#define MLX5_CMD_OP_MAX 0x920
+
+enum {
+	VPORT_STATE_DOWN		= 0x0,
+	VPORT_STATE_UP			= 0x1,
+};
+
+enum {
+	MLX5_L3_PROT_TYPE_IPV4		= 0,
+	MLX5_L3_PROT_TYPE_IPV6		= 1,
+};
+
+enum {
+	MLX5_L4_PROT_TYPE_TCP		= 0,
+	MLX5_L4_PROT_TYPE_UDP		= 1,
+};
+
+enum {
+	MLX5_HASH_FIELD_SEL_SRC_IP	= 1 << 0,
+	MLX5_HASH_FIELD_SEL_DST_IP	= 1 << 1,
+	MLX5_HASH_FIELD_SEL_L4_SPORT	= 1 << 2,
+	MLX5_HASH_FIELD_SEL_L4_DPORT	= 1 << 3,
+	MLX5_HASH_FIELD_SEL_IPSEC_SPI	= 1 << 4,
+};
+
+enum {
+	MLX5_MATCH_OUTER_HEADERS	= 1 << 0,
+	MLX5_MATCH_MISC_PARAMETERS	= 1 << 1,
+	MLX5_MATCH_INNER_HEADERS	= 1 << 2,
+
+};
+
+enum {
+	MLX5_FLOW_TABLE_TYPE_NIC_RCV	= 0,
+	MLX5_FLOW_TABLE_TYPE_ESWITCH	= 4,
+};
+
+enum {
+	MLX5_FLOW_CONTEXT_DEST_TYPE_VPORT	= 0,
+	MLX5_FLOW_CONTEXT_DEST_TYPE_FLOW_TABLE	= 1,
+	MLX5_FLOW_CONTEXT_DEST_TYPE_TIR		= 2,
+};
+
+enum {
+	MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE = 0x0,
+	MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM    = 0x1,
+};
+
 #endif /* MLX5_DEVICE_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9e8979502826..3fd4fdc1ba16 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -44,7 +44,6 @@
 
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
-#include <linux/mlx5/mlx5_ifc.h>
 
 enum {
 	MLX5_BOARD_ID_LEN = 64,
@@ -278,7 +277,6 @@ struct mlx5_general_caps {
 	u8	log_max_mkey;
 	u8	log_max_pd;
 	u8	log_max_srq;
-	u8	log_max_strq;
 	u8	log_max_mrw_sz;
 	u8	log_max_bsf_list_size;
 	u8	log_max_klm_list_size;
@@ -664,6 +662,8 @@ int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
 int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
 int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
 int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
+int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
+void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar);
 void mlx5_health_cleanup(void);
 void  __init mlx5_health_init(void);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cb3ad17edd1f..b27e9f6e090a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -28,11 +28,44 @@
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
- */
-
+*/
 #ifndef MLX5_IFC_H
 #define MLX5_IFC_H
 
+enum {
+	MLX5_EVENT_TYPE_CODING_COMPLETION_EVENTS                   = 0x0,
+	MLX5_EVENT_TYPE_CODING_PATH_MIGRATED_SUCCEEDED             = 0x1,
+	MLX5_EVENT_TYPE_CODING_COMMUNICATION_ESTABLISHED           = 0x2,
+	MLX5_EVENT_TYPE_CODING_SEND_QUEUE_DRAINED                  = 0x3,
+	MLX5_EVENT_TYPE_CODING_LAST_WQE_REACHED                    = 0x13,
+	MLX5_EVENT_TYPE_CODING_SRQ_LIMIT                           = 0x14,
+	MLX5_EVENT_TYPE_CODING_DCT_ALL_CONNECTIONS_CLOSED          = 0x1c,
+	MLX5_EVENT_TYPE_CODING_DCT_ACCESS_KEY_VIOLATION            = 0x1d,
+	MLX5_EVENT_TYPE_CODING_CQ_ERROR                            = 0x4,
+	MLX5_EVENT_TYPE_CODING_LOCAL_WQ_CATASTROPHIC_ERROR         = 0x5,
+	MLX5_EVENT_TYPE_CODING_PATH_MIGRATION_FAILED               = 0x7,
+	MLX5_EVENT_TYPE_CODING_PAGE_FAULT_EVENT                    = 0xc,
+	MLX5_EVENT_TYPE_CODING_INVALID_REQUEST_LOCAL_WQ_ERROR      = 0x10,
+	MLX5_EVENT_TYPE_CODING_LOCAL_ACCESS_VIOLATION_WQ_ERROR     = 0x11,
+	MLX5_EVENT_TYPE_CODING_LOCAL_SRQ_CATASTROPHIC_ERROR        = 0x12,
+	MLX5_EVENT_TYPE_CODING_INTERNAL_ERROR                      = 0x8,
+	MLX5_EVENT_TYPE_CODING_PORT_STATE_CHANGE                   = 0x9,
+	MLX5_EVENT_TYPE_CODING_GPIO_EVENT                          = 0x15,
+	MLX5_EVENT_TYPE_CODING_REMOTE_CONFIGURATION_PROTOCOL_EVENT = 0x19,
+	MLX5_EVENT_TYPE_CODING_DOORBELL_BLUEFLAME_CONGESTION_EVENT = 0x1a,
+	MLX5_EVENT_TYPE_CODING_STALL_VL_EVENT                      = 0x1b,
+	MLX5_EVENT_TYPE_CODING_DROPPED_PACKET_LOGGED_EVENT         = 0x1f,
+	MLX5_EVENT_TYPE_CODING_COMMAND_INTERFACE_COMPLETION        = 0xa,
+	MLX5_EVENT_TYPE_CODING_PAGE_REQUEST                        = 0xb
+};
+
+enum {
+	MLX5_MODIFY_TIR_BITMASK_LRO                   = 0x0,
+	MLX5_MODIFY_TIR_BITMASK_INDIRECT_TABLE        = 0x1,
+	MLX5_MODIFY_TIR_BITMASK_HASH                  = 0x2,
+	MLX5_MODIFY_TIR_BITMASK_TUNNELED_OFFLOAD_EN   = 0x3
+};
+
 enum {
 	MLX5_CMD_OP_QUERY_HCA_CAP                 = 0x100,
 	MLX5_CMD_OP_QUERY_ADAPTER                 = 0x101,
@@ -43,6 +76,8 @@ enum {
 	MLX5_CMD_OP_QUERY_PAGES                   = 0x107,
 	MLX5_CMD_OP_MANAGE_PAGES                  = 0x108,
 	MLX5_CMD_OP_SET_HCA_CAP                   = 0x109,
+	MLX5_CMD_OP_QUERY_ISSI                    = 0x10a,
+	MLX5_CMD_OP_SET_ISSI                      = 0x10b,
 	MLX5_CMD_OP_CREATE_MKEY                   = 0x200,
 	MLX5_CMD_OP_QUERY_MKEY                    = 0x201,
 	MLX5_CMD_OP_DESTROY_MKEY                  = 0x202,
@@ -66,6 +101,7 @@ enum {
 	MLX5_CMD_OP_2ERR_QP                       = 0x507,
 	MLX5_CMD_OP_2RST_QP                       = 0x50a,
 	MLX5_CMD_OP_QUERY_QP                      = 0x50b,
+	MLX5_CMD_OP_SQD_RTS_QP                    = 0x50c,
 	MLX5_CMD_OP_INIT2INIT_QP                  = 0x50e,
 	MLX5_CMD_OP_CREATE_PSV                    = 0x600,
 	MLX5_CMD_OP_DESTROY_PSV                   = 0x601,
@@ -73,7 +109,10 @@ enum {
 	MLX5_CMD_OP_DESTROY_SRQ                   = 0x701,
 	MLX5_CMD_OP_QUERY_SRQ                     = 0x702,
 	MLX5_CMD_OP_ARM_RQ                        = 0x703,
-	MLX5_CMD_OP_RESIZE_SRQ                    = 0x704,
+	MLX5_CMD_OP_CREATE_XRC_SRQ                = 0x705,
+	MLX5_CMD_OP_DESTROY_XRC_SRQ               = 0x706,
+	MLX5_CMD_OP_QUERY_XRC_SRQ                 = 0x707,
+	MLX5_CMD_OP_ARM_XRC_SRQ                   = 0x708,
 	MLX5_CMD_OP_CREATE_DCT                    = 0x710,
 	MLX5_CMD_OP_DESTROY_DCT                   = 0x711,
 	MLX5_CMD_OP_DRAIN_DCT                     = 0x712,
@@ -85,8 +124,12 @@ enum {
 	MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT      = 0x753,
 	MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT       = 0x754,
 	MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT      = 0x755,
-	MLX5_CMD_OP_QUERY_RCOE_ADDRESS            = 0x760,
+	MLX5_CMD_OP_QUERY_ROCE_ADDRESS            = 0x760,
 	MLX5_CMD_OP_SET_ROCE_ADDRESS              = 0x761,
+	MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT       = 0x762,
+	MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT      = 0x763,
+	MLX5_CMD_OP_QUERY_HCA_VPORT_GID           = 0x764,
+	MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY          = 0x765,
 	MLX5_CMD_OP_QUERY_VPORT_COUNTER           = 0x770,
 	MLX5_CMD_OP_ALLOC_Q_COUNTER               = 0x771,
 	MLX5_CMD_OP_DEALLOC_Q_COUNTER             = 0x772,
@@ -98,7 +141,7 @@ enum {
 	MLX5_CMD_OP_CONFIG_INT_MODERATION         = 0x804,
 	MLX5_CMD_OP_ACCESS_REG                    = 0x805,
 	MLX5_CMD_OP_ATTACH_TO_MCG                 = 0x806,
-	MLX5_CMD_OP_DETACH_FROM_MCG               = 0x807,
+	MLX5_CMD_OP_DETTACH_FROM_MCG              = 0x807,
 	MLX5_CMD_OP_GET_DROPPED_PACKET_LOG        = 0x80a,
 	MLX5_CMD_OP_MAD_IFC                       = 0x50d,
 	MLX5_CMD_OP_QUERY_MAD_DEMUX               = 0x80b,
@@ -106,23 +149,22 @@ enum {
 	MLX5_CMD_OP_NOP                           = 0x80d,
 	MLX5_CMD_OP_ALLOC_XRCD                    = 0x80e,
 	MLX5_CMD_OP_DEALLOC_XRCD                  = 0x80f,
-	MLX5_CMD_OP_SET_BURST_SIZE                = 0x812,
-	MLX5_CMD_OP_QUERY_BURST_SZIE              = 0x813,
-	MLX5_CMD_OP_ACTIVATE_TRACER               = 0x814,
-	MLX5_CMD_OP_DEACTIVATE_TRACER             = 0x815,
-	MLX5_CMD_OP_CREATE_SNIFFER_RULE           = 0x820,
-	MLX5_CMD_OP_DESTROY_SNIFFER_RULE          = 0x821,
-	MLX5_CMD_OP_QUERY_CONG_PARAMS             = 0x822,
-	MLX5_CMD_OP_MODIFY_CONG_PARAMS            = 0x823,
-	MLX5_CMD_OP_QUERY_CONG_STATISTICS         = 0x824,
+	MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN        = 0x816,
+	MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN      = 0x817,
+	MLX5_CMD_OP_QUERY_CONG_STATUS             = 0x822,
+	MLX5_CMD_OP_MODIFY_CONG_STATUS            = 0x823,
+	MLX5_CMD_OP_QUERY_CONG_PARAMS             = 0x824,
+	MLX5_CMD_OP_MODIFY_CONG_PARAMS            = 0x825,
+	MLX5_CMD_OP_QUERY_CONG_STATISTICS         = 0x826,
+	MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT           = 0x827,
+	MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT        = 0x828,
+	MLX5_CMD_OP_SET_L2_TABLE_ENTRY            = 0x829,
+	MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY          = 0x82a,
+	MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY         = 0x82b,
 	MLX5_CMD_OP_CREATE_TIR                    = 0x900,
 	MLX5_CMD_OP_MODIFY_TIR                    = 0x901,
 	MLX5_CMD_OP_DESTROY_TIR                   = 0x902,
 	MLX5_CMD_OP_QUERY_TIR                     = 0x903,
-	MLX5_CMD_OP_CREATE_TIS                    = 0x912,
-	MLX5_CMD_OP_MODIFY_TIS                    = 0x913,
-	MLX5_CMD_OP_DESTROY_TIS                   = 0x914,
-	MLX5_CMD_OP_QUERY_TIS                     = 0x915,
 	MLX5_CMD_OP_CREATE_SQ                     = 0x904,
 	MLX5_CMD_OP_MODIFY_SQ                     = 0x905,
 	MLX5_CMD_OP_DESTROY_SQ                    = 0x906,
@@ -135,9 +177,430 @@ enum {
 	MLX5_CMD_OP_MODIFY_RMP                    = 0x90d,
 	MLX5_CMD_OP_DESTROY_RMP                   = 0x90e,
 	MLX5_CMD_OP_QUERY_RMP                     = 0x90f,
-	MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY          = 0x910,
-	MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY        = 0x911,
-	MLX5_CMD_OP_MAX				  = 0x911
+	MLX5_CMD_OP_CREATE_TIS                    = 0x912,
+	MLX5_CMD_OP_MODIFY_TIS                    = 0x913,
+	MLX5_CMD_OP_DESTROY_TIS                   = 0x914,
+	MLX5_CMD_OP_QUERY_TIS                     = 0x915,
+	MLX5_CMD_OP_CREATE_RQT                    = 0x916,
+	MLX5_CMD_OP_MODIFY_RQT                    = 0x917,
+	MLX5_CMD_OP_DESTROY_RQT                   = 0x918,
+	MLX5_CMD_OP_QUERY_RQT                     = 0x919,
+	MLX5_CMD_OP_CREATE_FLOW_TABLE             = 0x930,
+	MLX5_CMD_OP_DESTROY_FLOW_TABLE            = 0x931,
+	MLX5_CMD_OP_QUERY_FLOW_TABLE              = 0x932,
+	MLX5_CMD_OP_CREATE_FLOW_GROUP             = 0x933,
+	MLX5_CMD_OP_DESTROY_FLOW_GROUP            = 0x934,
+	MLX5_CMD_OP_QUERY_FLOW_GROUP              = 0x935,
+	MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY          = 0x936,
+	MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY        = 0x937,
+	MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY       = 0x938
+};
+
+struct mlx5_ifc_flow_table_fields_supported_bits {
+	u8         outer_dmac[0x1];
+	u8         outer_smac[0x1];
+	u8         outer_ether_type[0x1];
+	u8         reserved_0[0x1];
+	u8         outer_first_prio[0x1];
+	u8         outer_first_cfi[0x1];
+	u8         outer_first_vid[0x1];
+	u8         reserved_1[0x1];
+	u8         outer_second_prio[0x1];
+	u8         outer_second_cfi[0x1];
+	u8         outer_second_vid[0x1];
+	u8         reserved_2[0x1];
+	u8         outer_sip[0x1];
+	u8         outer_dip[0x1];
+	u8         outer_frag[0x1];
+	u8         outer_ip_protocol[0x1];
+	u8         outer_ip_ecn[0x1];
+	u8         outer_ip_dscp[0x1];
+	u8         outer_udp_sport[0x1];
+	u8         outer_udp_dport[0x1];
+	u8         outer_tcp_sport[0x1];
+	u8         outer_tcp_dport[0x1];
+	u8         outer_tcp_flags[0x1];
+	u8         outer_gre_protocol[0x1];
+	u8         outer_gre_key[0x1];
+	u8         outer_vxlan_vni[0x1];
+	u8         reserved_3[0x5];
+	u8         source_eswitch_port[0x1];
+
+	u8         inner_dmac[0x1];
+	u8         inner_smac[0x1];
+	u8         inner_ether_type[0x1];
+	u8         reserved_4[0x1];
+	u8         inner_first_prio[0x1];
+	u8         inner_first_cfi[0x1];
+	u8         inner_first_vid[0x1];
+	u8         reserved_5[0x1];
+	u8         inner_second_prio[0x1];
+	u8         inner_second_cfi[0x1];
+	u8         inner_second_vid[0x1];
+	u8         reserved_6[0x1];
+	u8         inner_sip[0x1];
+	u8         inner_dip[0x1];
+	u8         inner_frag[0x1];
+	u8         inner_ip_protocol[0x1];
+	u8         inner_ip_ecn[0x1];
+	u8         inner_ip_dscp[0x1];
+	u8         inner_udp_sport[0x1];
+	u8         inner_udp_dport[0x1];
+	u8         inner_tcp_sport[0x1];
+	u8         inner_tcp_dport[0x1];
+	u8         inner_tcp_flags[0x1];
+	u8         reserved_7[0x9];
+
+	u8         reserved_8[0x40];
+};
+
+struct mlx5_ifc_flow_table_prop_layout_bits {
+	u8         ft_support[0x1];
+	u8         reserved_0[0x1f];
+
+	u8         reserved_1[0x2];
+	u8         log_max_ft_size[0x6];
+	u8         reserved_2[0x10];
+	u8         max_ft_level[0x8];
+
+	u8         reserved_3[0x20];
+
+	u8         reserved_4[0x18];
+	u8         log_max_ft_num[0x8];
+
+	u8         reserved_5[0x18];
+	u8         log_max_destination[0x8];
+
+	u8         reserved_6[0x18];
+	u8         log_max_flow[0x8];
+
+	u8         reserved_7[0x40];
+
+	struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support;
+
+	struct mlx5_ifc_flow_table_fields_supported_bits ft_field_bitmask_support;
+};
+
+struct mlx5_ifc_odp_per_transport_service_cap_bits {
+	u8         send[0x1];
+	u8         receive[0x1];
+	u8         write[0x1];
+	u8         read[0x1];
+	u8         reserved_0[0x1];
+	u8         srq_receive[0x1];
+	u8         reserved_1[0x1a];
+};
+
+struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
+	u8         smac_47_16[0x20];
+
+	u8         smac_15_0[0x10];
+	u8         ethertype[0x10];
+
+	u8         dmac_47_16[0x20];
+
+	u8         dmac_15_0[0x10];
+	u8         first_prio[0x3];
+	u8         first_cfi[0x1];
+	u8         first_vid[0xc];
+
+	u8         ip_protocol[0x8];
+	u8         ip_dscp[0x6];
+	u8         ip_ecn[0x2];
+	u8         vlan_tag[0x1];
+	u8         reserved_0[0x1];
+	u8         frag[0x1];
+	u8         reserved_1[0x4];
+	u8         tcp_flags[0x9];
+
+	u8         tcp_sport[0x10];
+	u8         tcp_dport[0x10];
+
+	u8         reserved_2[0x20];
+
+	u8         udp_sport[0x10];
+	u8         udp_dport[0x10];
+
+	u8         src_ip[4][0x20];
+
+	u8         dst_ip[4][0x20];
+};
+
+struct mlx5_ifc_fte_match_set_misc_bits {
+	u8         reserved_0[0x20];
+
+	u8         reserved_1[0x10];
+	u8         source_port[0x10];
+
+	u8         outer_second_prio[0x3];
+	u8         outer_second_cfi[0x1];
+	u8         outer_second_vid[0xc];
+	u8         inner_second_prio[0x3];
+	u8         inner_second_cfi[0x1];
+	u8         inner_second_vid[0xc];
+
+	u8         outer_second_vlan_tag[0x1];
+	u8         inner_second_vlan_tag[0x1];
+	u8         reserved_2[0xe];
+	u8         gre_protocol[0x10];
+
+	u8         gre_key_h[0x18];
+	u8         gre_key_l[0x8];
+
+	u8         vxlan_vni[0x18];
+	u8         reserved_3[0x8];
+
+	u8         reserved_4[0x20];
+
+	u8         reserved_5[0xc];
+	u8         outer_ipv6_flow_label[0x14];
+
+	u8         reserved_6[0xc];
+	u8         inner_ipv6_flow_label[0x14];
+
+	u8         reserved_7[0xe0];
+};
+
+struct mlx5_ifc_cmd_pas_bits {
+	u8         pa_h[0x20];
+
+	u8         pa_l[0x14];
+	u8         reserved_0[0xc];
+};
+
+struct mlx5_ifc_uint64_bits {
+	u8         hi[0x20];
+
+	u8         lo[0x20];
+};
+
+enum {
+	MLX5_ADS_STAT_RATE_NO_LIMIT  = 0x0,
+	MLX5_ADS_STAT_RATE_2_5GBPS   = 0x7,
+	MLX5_ADS_STAT_RATE_10GBPS    = 0x8,
+	MLX5_ADS_STAT_RATE_30GBPS    = 0x9,
+	MLX5_ADS_STAT_RATE_5GBPS     = 0xa,
+	MLX5_ADS_STAT_RATE_20GBPS    = 0xb,
+	MLX5_ADS_STAT_RATE_40GBPS    = 0xc,
+	MLX5_ADS_STAT_RATE_60GBPS    = 0xd,
+	MLX5_ADS_STAT_RATE_80GBPS    = 0xe,
+	MLX5_ADS_STAT_RATE_120GBPS   = 0xf,
+};
+
+struct mlx5_ifc_ads_bits {
+	u8         fl[0x1];
+	u8         free_ar[0x1];
+	u8         reserved_0[0xe];
+	u8         pkey_index[0x10];
+
+	u8         reserved_1[0x8];
+	u8         grh[0x1];
+	u8         mlid[0x7];
+	u8         rlid[0x10];
+
+	u8         ack_timeout[0x5];
+	u8         reserved_2[0x3];
+	u8         src_addr_index[0x8];
+	u8         reserved_3[0x4];
+	u8         stat_rate[0x4];
+	u8         hop_limit[0x8];
+
+	u8         reserved_4[0x4];
+	u8         tclass[0x8];
+	u8         flow_label[0x14];
+
+	u8         rgid_rip[16][0x8];
+
+	u8         reserved_5[0x4];
+	u8         f_dscp[0x1];
+	u8         f_ecn[0x1];
+	u8         reserved_6[0x1];
+	u8         f_eth_prio[0x1];
+	u8         ecn[0x2];
+	u8         dscp[0x6];
+	u8         udp_sport[0x10];
+
+	u8         dei_cfi[0x1];
+	u8         eth_prio[0x3];
+	u8         sl[0x4];
+	u8         port[0x8];
+	u8         rmac_47_32[0x10];
+
+	u8         rmac_31_0[0x20];
+};
+
+struct mlx5_ifc_flow_table_nic_cap_bits {
+	u8         reserved_0[0x200];
+
+	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive;
+
+	u8         reserved_1[0x200];
+
+	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer;
+
+	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit;
+
+	u8         reserved_2[0x200];
+
+	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer;
+
+	u8         reserved_3[0x7200];
+};
+
+struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
+	u8         csum_cap[0x1];
+	u8         vlan_cap[0x1];
+	u8         lro_cap[0x1];
+	u8         lro_psh_flag[0x1];
+	u8         lro_time_stamp[0x1];
+	u8         reserved_0[0x6];
+	u8         max_lso_cap[0x5];
+	u8         reserved_1[0x4];
+	u8         rss_ind_tbl_cap[0x4];
+	u8         reserved_2[0x3];
+	u8         tunnel_lso_const_out_ip_id[0x1];
+	u8         reserved_3[0x2];
+	u8         tunnel_statless_gre[0x1];
+	u8         tunnel_stateless_vxlan[0x1];
+
+	u8         reserved_4[0x20];
+
+	u8         reserved_5[0x10];
+	u8         lro_min_mss_size[0x10];
+
+	u8         reserved_6[0x120];
+
+	u8         lro_timer_supported_periods[4][0x20];
+
+	u8         reserved_7[0x600];
+};
+
+struct mlx5_ifc_roce_cap_bits {
+	u8         roce_apm[0x1];
+	u8         reserved_0[0x1f];
+
+	u8         reserved_1[0x60];
+
+	u8         reserved_2[0xc];
+	u8         l3_type[0x4];
+	u8         reserved_3[0x8];
+	u8         roce_version[0x8];
+
+	u8         reserved_4[0x10];
+	u8         r_roce_dest_udp_port[0x10];
+
+	u8         r_roce_max_src_udp_port[0x10];
+	u8         r_roce_min_src_udp_port[0x10];
+
+	u8         reserved_5[0x10];
+	u8         roce_address_table_size[0x10];
+
+	u8         reserved_6[0x700];
+};
+
+enum {
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE     = 0x0,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES    = 0x2,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_4_BYTES    = 0x4,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_8_BYTES    = 0x8,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_16_BYTES   = 0x10,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_32_BYTES   = 0x20,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_64_BYTES   = 0x40,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_128_BYTES  = 0x80,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_256_BYTES  = 0x100,
+};
+
+enum {
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_1_BYTE     = 0x1,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_2_BYTES    = 0x2,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_4_BYTES    = 0x4,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_8_BYTES    = 0x8,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_16_BYTES   = 0x10,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_32_BYTES   = 0x20,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_64_BYTES   = 0x40,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_128_BYTES  = 0x80,
+	MLX5_ATOMIC_CAPS_ATOMIC_SIZE_DC_256_BYTES  = 0x100,
+};
+
+struct mlx5_ifc_atomic_caps_bits {
+	u8         reserved_0[0x40];
+
+	u8         atomic_req_endianness[0x1];
+	u8         reserved_1[0x1f];
+
+	u8         reserved_2[0x20];
+
+	u8         reserved_3[0x10];
+	u8         atomic_operations[0x10];
+
+	u8         reserved_4[0x10];
+	u8         atomic_size_qp[0x10];
+
+	u8         reserved_5[0x10];
+	u8         atomic_size_dc[0x10];
+
+	u8         reserved_6[0x720];
+};
+
+struct mlx5_ifc_odp_cap_bits {
+	u8         reserved_0[0x40];
+
+	u8         sig[0x1];
+	u8         reserved_1[0x1f];
+
+	u8         reserved_2[0x20];
+
+	struct mlx5_ifc_odp_per_transport_service_cap_bits rc_odp_caps;
+
+	struct mlx5_ifc_odp_per_transport_service_cap_bits uc_odp_caps;
+
+	struct mlx5_ifc_odp_per_transport_service_cap_bits ud_odp_caps;
+
+	u8         reserved_3[0x720];
+};
+
+enum {
+	MLX5_WQ_TYPE_LINKED_LIST  = 0x0,
+	MLX5_WQ_TYPE_CYCLIC       = 0x1,
+	MLX5_WQ_TYPE_STRQ         = 0x2,
+};
+
+enum {
+	MLX5_WQ_END_PAD_MODE_NONE   = 0x0,
+	MLX5_WQ_END_PAD_MODE_ALIGN  = 0x1,
+};
+
+enum {
+	MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_8_GID_ENTRIES    = 0x0,
+	MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_16_GID_ENTRIES   = 0x1,
+	MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_32_GID_ENTRIES   = 0x2,
+	MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_64_GID_ENTRIES   = 0x3,
+	MLX5_CMD_HCA_CAP_GID_TABLE_SIZE_128_GID_ENTRIES  = 0x4,
+};
+
+enum {
+	MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_128_ENTRIES  = 0x0,
+	MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_256_ENTRIES  = 0x1,
+	MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_512_ENTRIES  = 0x2,
+	MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_1K_ENTRIES   = 0x3,
+	MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_2K_ENTRIES   = 0x4,
+	MLX5_CMD_HCA_CAP_PKEY_TABLE_SIZE_4K_ENTRIES   = 0x5,
+};
+
+enum {
+	MLX5_CMD_HCA_CAP_PORT_TYPE_IB        = 0x0,
+	MLX5_CMD_HCA_CAP_PORT_TYPE_ETHERNET  = 0x1,
+};
+
+enum {
+	MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_DISABLED       = 0x0,
+	MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_INITIAL_STATE  = 0x1,
+	MLX5_CMD_HCA_CAP_CMDIF_CHECKSUM_ENABLED        = 0x3,
+};
+
+enum {
+	MLX5_CAP_PORT_TYPE_IB  = 0x0,
+	MLX5_CAP_PORT_TYPE_ETH = 0x1,
 };
 
 struct mlx5_ifc_cmd_hca_cap_bits {
@@ -148,9 +611,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_1[0xb];
 	u8         log_max_qp[0x5];
 
-	u8         log_max_strq_sz[0x8];
-	u8         reserved_2[0x3];
-	u8         log_max_srqs[0x5];
+	u8         reserved_2[0xb];
+	u8         log_max_srq[0x5];
 	u8         reserved_3[0x10];
 
 	u8         reserved_4[0x8];
@@ -185,165 +647,6123 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         pad_cap[0x1];
 	u8         cc_query_allowed[0x1];
 	u8         cc_modify_allowed[0x1];
-	u8         reserved_15[0x1d];
+	u8         reserved_15[0xd];
+	u8         gid_table_size[0x10];
 
-	u8         reserved_16[0x6];
+	u8         out_of_seq_cnt[0x1];
+	u8         vport_counters[0x1];
+	u8         reserved_16[0x4];
 	u8         max_qp_cnt[0xa];
 	u8         pkey_table_size[0x10];
 
-	u8         eswitch_owner[0x1];
-	u8         reserved_17[0xa];
+	u8         vport_group_manager[0x1];
+	u8         vhca_group_manager[0x1];
+	u8         ib_virt[0x1];
+	u8         eth_virt[0x1];
+	u8         reserved_17[0x1];
+	u8         ets[0x1];
+	u8         nic_flow_table[0x1];
+	u8         reserved_18[0x4];
 	u8         local_ca_ack_delay[0x5];
-	u8         reserved_18[0x8];
+	u8         reserved_19[0x6];
+	u8         port_type[0x2];
 	u8         num_ports[0x8];
 
-	u8         reserved_19[0x3];
+	u8         reserved_20[0x3];
 	u8         log_max_msg[0x5];
-	u8         reserved_20[0x18];
+	u8         reserved_21[0x18];
 
 	u8         stat_rate_support[0x10];
-	u8         reserved_21[0x10];
+	u8         reserved_22[0xc];
+	u8         cqe_version[0x4];
 
-	u8         reserved_22[0x10];
+	u8         compact_address_vector[0x1];
+	u8         reserved_23[0xe];
+	u8         drain_sigerr[0x1];
 	u8         cmdif_checksum[0x2];
 	u8         sigerr_cqe[0x1];
-	u8         reserved_23[0x1];
+	u8         reserved_24[0x1];
 	u8         wq_signature[0x1];
 	u8         sctr_data_cqe[0x1];
-	u8         reserved_24[0x1];
+	u8         reserved_25[0x1];
 	u8         sho[0x1];
 	u8         tph[0x1];
 	u8         rf[0x1];
-	u8         dc[0x1];
-	u8         reserved_25[0x2];
+	u8         dct[0x1];
+	u8         reserved_26[0x1];
+	u8         eth_net_offloads[0x1];
 	u8         roce[0x1];
 	u8         atomic[0x1];
-	u8         rsz_srq[0x1];
+	u8         reserved_27[0x1];
 
 	u8         cq_oi[0x1];
 	u8         cq_resize[0x1];
 	u8         cq_moderation[0x1];
-	u8         sniffer_rule_flow[0x1];
-	u8         sniffer_rule_vport[0x1];
-	u8         sniffer_rule_phy[0x1];
-	u8         reserved_26[0x1];
+	u8         reserved_28[0x3];
+	u8         cq_eq_remap[0x1];
 	u8         pg[0x1];
 	u8         block_lb_mc[0x1];
-	u8         reserved_27[0x3];
+	u8         reserved_29[0x1];
+	u8         scqe_break_moderation[0x1];
+	u8         reserved_30[0x1];
 	u8         cd[0x1];
-	u8         reserved_28[0x1];
+	u8         reserved_31[0x1];
 	u8         apm[0x1];
-	u8         reserved_29[0x7];
+	u8         reserved_32[0x7];
 	u8         qkv[0x1];
 	u8         pkv[0x1];
-	u8         reserved_30[0x4];
+	u8         reserved_33[0x4];
 	u8         xrc[0x1];
 	u8         ud[0x1];
 	u8         uc[0x1];
 	u8         rc[0x1];
 
-	u8         reserved_31[0xa];
+	u8         reserved_34[0xa];
 	u8         uar_sz[0x6];
-	u8         reserved_32[0x8];
+	u8         reserved_35[0x8];
 	u8         log_pg_sz[0x8];
 
 	u8         bf[0x1];
-	u8         reserved_33[0xa];
+	u8         reserved_36[0x1];
+	u8         pad_tx_eth_packet[0x1];
+	u8         reserved_37[0x8];
 	u8         log_bf_reg_size[0x5];
-	u8         reserved_34[0x10];
+	u8         reserved_38[0x10];
 
-	u8         reserved_35[0x10];
+	u8         reserved_39[0x10];
 	u8         max_wqe_sz_sq[0x10];
 
-	u8         reserved_36[0x10];
+	u8         reserved_40[0x10];
 	u8         max_wqe_sz_rq[0x10];
 
-	u8         reserved_37[0x10];
+	u8         reserved_41[0x10];
 	u8         max_wqe_sz_sq_dc[0x10];
 
-	u8         reserved_38[0x7];
+	u8         reserved_42[0x7];
 	u8         max_qp_mcg[0x19];
 
-	u8         reserved_39[0x18];
+	u8         reserved_43[0x18];
 	u8         log_max_mcg[0x8];
 
-	u8         reserved_40[0xb];
+	u8         reserved_44[0x3];
+	u8         log_max_transport_domain[0x5];
+	u8         reserved_45[0x3];
 	u8         log_max_pd[0x5];
-	u8         reserved_41[0xb];
+	u8         reserved_46[0xb];
 	u8         log_max_xrcd[0x5];
 
-	u8         reserved_42[0x20];
+	u8         reserved_47[0x20];
 
-	u8         reserved_43[0x3];
+	u8         reserved_48[0x3];
 	u8         log_max_rq[0x5];
-	u8         reserved_44[0x3];
+	u8         reserved_49[0x3];
 	u8         log_max_sq[0x5];
-	u8         reserved_45[0x3];
+	u8         reserved_50[0x3];
 	u8         log_max_tir[0x5];
-	u8         reserved_46[0x3];
+	u8         reserved_51[0x3];
 	u8         log_max_tis[0x5];
 
-	u8         reserved_47[0x13];
-	u8         log_max_rq_per_tir[0x5];
-	u8         reserved_48[0x3];
+	u8         basic_cyclic_rcv_wqe[0x1];
+	u8         reserved_52[0x2];
+	u8         log_max_rmp[0x5];
+	u8         reserved_53[0x3];
+	u8         log_max_rqt[0x5];
+	u8         reserved_54[0x3];
+	u8         log_max_rqt_size[0x5];
+	u8         reserved_55[0x3];
 	u8         log_max_tis_per_sq[0x5];
 
-	u8         reserved_49[0xe0];
+	u8         reserved_56[0x3];
+	u8         log_max_stride_sz_rq[0x5];
+	u8         reserved_57[0x3];
+	u8         log_min_stride_sz_rq[0x5];
+	u8         reserved_58[0x3];
+	u8         log_max_stride_sz_sq[0x5];
+	u8         reserved_59[0x3];
+	u8         log_min_stride_sz_sq[0x5];
+
+	u8         reserved_60[0x1b];
+	u8         log_max_wq_sz[0x5];
+
+	u8         reserved_61[0xa0];
 
-	u8         reserved_50[0x10];
+	u8         reserved_62[0x3];
+	u8         log_max_l2_table[0x5];
+	u8         reserved_63[0x8];
 	u8         log_uar_page_sz[0x10];
 
-	u8         reserved_51[0x100];
+	u8         reserved_64[0x100];
 
-	u8         reserved_52[0x1f];
+	u8         reserved_65[0x1f];
 	u8         cqe_zip[0x1];
 
 	u8         cqe_zip_timeout[0x10];
 	u8         cqe_zip_max_num[0x10];
 
-	u8         reserved_53[0x220];
+	u8         reserved_66[0x220];
 };
 
-struct mlx5_ifc_set_hca_cap_in_bits {
-	u8         opcode[0x10];
-	u8         reserved_0[0x10];
+enum {
+	MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_FLOW_TABLE_  = 0x1,
+	MLX5_DEST_FORMAT_STRUCT_DESTINATION_TYPE_TIR          = 0x2,
+};
 
-	u8         reserved_1[0x10];
-	u8         op_mod[0x10];
+struct mlx5_ifc_dest_format_struct_bits {
+	u8         destination_type[0x8];
+	u8         destination_id[0x18];
 
-	u8         reserved_2[0x40];
+	u8         reserved_0[0x20];
+};
+
+struct mlx5_ifc_fte_match_param_bits {
+	struct mlx5_ifc_fte_match_set_lyr_2_4_bits outer_headers;
+
+	struct mlx5_ifc_fte_match_set_misc_bits misc_parameters;
+
+	struct mlx5_ifc_fte_match_set_lyr_2_4_bits inner_headers;
 
-	struct mlx5_ifc_cmd_hca_cap_bits hca_capability_struct;
+	u8         reserved_0[0xa00];
 };
 
-struct mlx5_ifc_query_hca_cap_in_bits {
-	u8         opcode[0x10];
-	u8         reserved_0[0x10];
+enum {
+	MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP     = 0x0,
+	MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP     = 0x1,
+	MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT   = 0x2,
+	MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT   = 0x3,
+	MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_IPSEC_SPI  = 0x4,
+};
 
-	u8         reserved_1[0x10];
-	u8         op_mod[0x10];
+struct mlx5_ifc_rx_hash_field_select_bits {
+	u8         l3_prot_type[0x1];
+	u8         l4_prot_type[0x1];
+	u8         selected_fields[0x1e];
+};
 
-	u8         reserved_2[0x40];
+enum {
+	MLX5_WQ_WQ_TYPE_WQ_LINKED_LIST  = 0x0,
+	MLX5_WQ_WQ_TYPE_WQ_CYCLIC       = 0x1,
 };
 
-struct mlx5_ifc_query_hca_cap_out_bits {
-	u8         status[0x8];
+enum {
+	MLX5_WQ_END_PADDING_MODE_END_PAD_NONE   = 0x0,
+	MLX5_WQ_END_PADDING_MODE_END_PAD_ALIGN  = 0x1,
+};
+
+struct mlx5_ifc_wq_bits {
+	u8         wq_type[0x4];
+	u8         wq_signature[0x1];
+	u8         end_padding_mode[0x2];
+	u8         cd_slave[0x1];
 	u8         reserved_0[0x18];
 
-	u8         syndrome[0x20];
+	u8         hds_skip_first_sge[0x1];
+	u8         log2_hds_buf_size[0x3];
+	u8         reserved_1[0x7];
+	u8         page_offset[0x5];
+	u8         lwm[0x10];
 
-	u8         reserved_1[0x40];
+	u8         reserved_2[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_3[0x8];
+	u8         uar_page[0x18];
+
+	u8         dbr_addr[0x40];
+
+	u8         hw_counter[0x20];
+
+	u8         sw_counter[0x20];
+
+	u8         reserved_4[0xc];
+	u8         log_wq_stride[0x4];
+	u8         reserved_5[0x3];
+	u8         log_wq_pg_sz[0x5];
+	u8         reserved_6[0x3];
+	u8         log_wq_sz[0x5];
+
+	u8         reserved_7[0x4e0];
 
-	u8         capability_struct[256][0x8];
+	struct mlx5_ifc_cmd_pas_bits pas[0];
 };
 
-struct mlx5_ifc_set_hca_cap_out_bits {
-	u8         status[0x8];
-	u8         reserved_0[0x18];
+struct mlx5_ifc_rq_num_bits {
+	u8         reserved_0[0x8];
+	u8         rq_num[0x18];
+};
 
-	u8         syndrome[0x20];
+struct mlx5_ifc_mac_address_layout_bits {
+	u8         reserved_0[0x10];
+	u8         mac_addr_47_32[0x10];
 
-	u8         reserved_1[0x40];
+	u8         mac_addr_31_0[0x20];
+};
+
+struct mlx5_ifc_cong_control_r_roce_ecn_np_bits {
+	u8         reserved_0[0xa0];
+
+	u8         min_time_between_cnps[0x20];
+
+	u8         reserved_1[0x12];
+	u8         cnp_dscp[0x6];
+	u8         reserved_2[0x5];
+	u8         cnp_802p_prio[0x3];
+
+	u8         reserved_3[0x720];
+};
+
+struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits {
+	u8         reserved_0[0x60];
+
+	u8         reserved_1[0x4];
+	u8         clamp_tgt_rate[0x1];
+	u8         reserved_2[0x3];
+	u8         clamp_tgt_rate_after_time_inc[0x1];
+	u8         reserved_3[0x17];
+
+	u8         reserved_4[0x20];
+
+	u8         rpg_time_reset[0x20];
+
+	u8         rpg_byte_reset[0x20];
+
+	u8         rpg_threshold[0x20];
+
+	u8         rpg_max_rate[0x20];
+
+	u8         rpg_ai_rate[0x20];
+
+	u8         rpg_hai_rate[0x20];
+
+	u8         rpg_gd[0x20];
+
+	u8         rpg_min_dec_fac[0x20];
+
+	u8         rpg_min_rate[0x20];
+
+	u8         reserved_5[0xe0];
+
+	u8         rate_to_set_on_first_cnp[0x20];
+
+	u8         dce_tcp_g[0x20];
+
+	u8         dce_tcp_rtt[0x20];
+
+	u8         rate_reduce_monitor_period[0x20];
+
+	u8         reserved_6[0x20];
+
+	u8         initial_alpha_value[0x20];
+
+	u8         reserved_7[0x4a0];
+};
+
+struct mlx5_ifc_cong_control_802_1qau_rp_bits {
+	u8         reserved_0[0x80];
+
+	u8         rppp_max_rps[0x20];
+
+	u8         rpg_time_reset[0x20];
+
+	u8         rpg_byte_reset[0x20];
+
+	u8         rpg_threshold[0x20];
+
+	u8         rpg_max_rate[0x20];
+
+	u8         rpg_ai_rate[0x20];
+
+	u8         rpg_hai_rate[0x20];
+
+	u8         rpg_gd[0x20];
+
+	u8         rpg_min_dec_fac[0x20];
+
+	u8         rpg_min_rate[0x20];
+
+	u8         reserved_1[0x640];
+};
+
+enum {
+	MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_LOG_CQ_SIZE    = 0x1,
+	MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_PAGE_OFFSET    = 0x2,
+	MLX5_RESIZE_FIELD_SELECT_RESIZE_FIELD_SELECT_LOG_PAGE_SIZE  = 0x4,
+};
+
+struct mlx5_ifc_resize_field_select_bits {
+	u8         resize_field_select[0x20];
+};
+
+enum {
+	MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_CQ_PERIOD     = 0x1,
+	MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_CQ_MAX_COUNT  = 0x2,
+	MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_OI            = 0x4,
+	MLX5_MODIFY_FIELD_SELECT_MODIFY_FIELD_SELECT_C_EQN         = 0x8,
+};
+
+struct mlx5_ifc_modify_field_select_bits {
+	u8         modify_field_select[0x20];
+};
+
+struct mlx5_ifc_field_select_r_roce_np_bits {
+	u8         field_select_r_roce_np[0x20];
+};
+
+struct mlx5_ifc_field_select_r_roce_rp_bits {
+	u8         field_select_r_roce_rp[0x20];
+};
+
+enum {
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPPP_MAX_RPS     = 0x4,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_TIME_RESET   = 0x8,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_BYTE_RESET   = 0x10,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_THRESHOLD    = 0x20,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MAX_RATE     = 0x40,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_AI_RATE      = 0x80,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_HAI_RATE     = 0x100,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_GD           = 0x200,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MIN_DEC_FAC  = 0x400,
+	MLX5_FIELD_SELECT_802_1QAU_RP_FIELD_SELECT_8021QAURP_RPG_MIN_RATE     = 0x800,
+};
+
+struct mlx5_ifc_field_select_802_1qau_rp_bits {
+	u8         field_select_8021qaurp[0x20];
+};
+
+struct mlx5_ifc_phys_layer_cntrs_bits {
+	u8         time_since_last_clear_high[0x20];
+
+	u8         time_since_last_clear_low[0x20];
+
+	u8         symbol_errors_high[0x20];
+
+	u8         symbol_errors_low[0x20];
+
+	u8         sync_headers_errors_high[0x20];
+
+	u8         sync_headers_errors_low[0x20];
+
+	u8         edpl_bip_errors_lane0_high[0x20];
+
+	u8         edpl_bip_errors_lane0_low[0x20];
+
+	u8         edpl_bip_errors_lane1_high[0x20];
+
+	u8         edpl_bip_errors_lane1_low[0x20];
+
+	u8         edpl_bip_errors_lane2_high[0x20];
+
+	u8         edpl_bip_errors_lane2_low[0x20];
+
+	u8         edpl_bip_errors_lane3_high[0x20];
+
+	u8         edpl_bip_errors_lane3_low[0x20];
+
+	u8         fc_fec_corrected_blocks_lane0_high[0x20];
+
+	u8         fc_fec_corrected_blocks_lane0_low[0x20];
+
+	u8         fc_fec_corrected_blocks_lane1_high[0x20];
+
+	u8         fc_fec_corrected_blocks_lane1_low[0x20];
+
+	u8         fc_fec_corrected_blocks_lane2_high[0x20];
+
+	u8         fc_fec_corrected_blocks_lane2_low[0x20];
+
+	u8         fc_fec_corrected_blocks_lane3_high[0x20];
+
+	u8         fc_fec_corrected_blocks_lane3_low[0x20];
+
+	u8         fc_fec_uncorrectable_blocks_lane0_high[0x20];
+
+	u8         fc_fec_uncorrectable_blocks_lane0_low[0x20];
+
+	u8         fc_fec_uncorrectable_blocks_lane1_high[0x20];
+
+	u8         fc_fec_uncorrectable_blocks_lane1_low[0x20];
+
+	u8         fc_fec_uncorrectable_blocks_lane2_high[0x20];
+
+	u8         fc_fec_uncorrectable_blocks_lane2_low[0x20];
+
+	u8         fc_fec_uncorrectable_blocks_lane3_high[0x20];
+
+	u8         fc_fec_uncorrectable_blocks_lane3_low[0x20];
+
+	u8         rs_fec_corrected_blocks_high[0x20];
+
+	u8         rs_fec_corrected_blocks_low[0x20];
+
+	u8         rs_fec_uncorrectable_blocks_high[0x20];
+
+	u8         rs_fec_uncorrectable_blocks_low[0x20];
+
+	u8         rs_fec_no_errors_blocks_high[0x20];
+
+	u8         rs_fec_no_errors_blocks_low[0x20];
+
+	u8         rs_fec_single_error_blocks_high[0x20];
+
+	u8         rs_fec_single_error_blocks_low[0x20];
+
+	u8         rs_fec_corrected_symbols_total_high[0x20];
+
+	u8         rs_fec_corrected_symbols_total_low[0x20];
+
+	u8         rs_fec_corrected_symbols_lane0_high[0x20];
+
+	u8         rs_fec_corrected_symbols_lane0_low[0x20];
+
+	u8         rs_fec_corrected_symbols_lane1_high[0x20];
+
+	u8         rs_fec_corrected_symbols_lane1_low[0x20];
+
+	u8         rs_fec_corrected_symbols_lane2_high[0x20];
+
+	u8         rs_fec_corrected_symbols_lane2_low[0x20];
+
+	u8         rs_fec_corrected_symbols_lane3_high[0x20];
+
+	u8         rs_fec_corrected_symbols_lane3_low[0x20];
+
+	u8         link_down_events[0x20];
+
+	u8         successful_recovery_events[0x20];
+
+	u8         reserved_0[0x180];
+};
+
+struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits {
+	u8         transmit_queue_high[0x20];
+
+	u8         transmit_queue_low[0x20];
+
+	u8         reserved_0[0x780];
+};
+
+struct mlx5_ifc_eth_per_prio_grp_data_layout_bits {
+	u8         rx_octets_high[0x20];
+
+	u8         rx_octets_low[0x20];
+
+	u8         reserved_0[0xc0];
+
+	u8         rx_frames_high[0x20];
+
+	u8         rx_frames_low[0x20];
+
+	u8         tx_octets_high[0x20];
+
+	u8         tx_octets_low[0x20];
+
+	u8         reserved_1[0xc0];
+
+	u8         tx_frames_high[0x20];
+
+	u8         tx_frames_low[0x20];
+
+	u8         rx_pause_high[0x20];
+
+	u8         rx_pause_low[0x20];
+
+	u8         rx_pause_duration_high[0x20];
+
+	u8         rx_pause_duration_low[0x20];
+
+	u8         tx_pause_high[0x20];
+
+	u8         tx_pause_low[0x20];
+
+	u8         tx_pause_duration_high[0x20];
+
+	u8         tx_pause_duration_low[0x20];
+
+	u8         rx_pause_transition_high[0x20];
+
+	u8         rx_pause_transition_low[0x20];
+
+	u8         reserved_2[0x400];
+};
+
+struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits {
+	u8         port_transmit_wait_high[0x20];
+
+	u8         port_transmit_wait_low[0x20];
+
+	u8         reserved_0[0x780];
+};
+
+struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits {
+	u8         dot3stats_alignment_errors_high[0x20];
+
+	u8         dot3stats_alignment_errors_low[0x20];
+
+	u8         dot3stats_fcs_errors_high[0x20];
+
+	u8         dot3stats_fcs_errors_low[0x20];
+
+	u8         dot3stats_single_collision_frames_high[0x20];
+
+	u8         dot3stats_single_collision_frames_low[0x20];
+
+	u8         dot3stats_multiple_collision_frames_high[0x20];
+
+	u8         dot3stats_multiple_collision_frames_low[0x20];
+
+	u8         dot3stats_sqe_test_errors_high[0x20];
+
+	u8         dot3stats_sqe_test_errors_low[0x20];
+
+	u8         dot3stats_deferred_transmissions_high[0x20];
+
+	u8         dot3stats_deferred_transmissions_low[0x20];
+
+	u8         dot3stats_late_collisions_high[0x20];
+
+	u8         dot3stats_late_collisions_low[0x20];
+
+	u8         dot3stats_excessive_collisions_high[0x20];
+
+	u8         dot3stats_excessive_collisions_low[0x20];
+
+	u8         dot3stats_internal_mac_transmit_errors_high[0x20];
+
+	u8         dot3stats_internal_mac_transmit_errors_low[0x20];
+
+	u8         dot3stats_carrier_sense_errors_high[0x20];
+
+	u8         dot3stats_carrier_sense_errors_low[0x20];
+
+	u8         dot3stats_frame_too_longs_high[0x20];
+
+	u8         dot3stats_frame_too_longs_low[0x20];
+
+	u8         dot3stats_internal_mac_receive_errors_high[0x20];
+
+	u8         dot3stats_internal_mac_receive_errors_low[0x20];
+
+	u8         dot3stats_symbol_errors_high[0x20];
+
+	u8         dot3stats_symbol_errors_low[0x20];
+
+	u8         dot3control_in_unknown_opcodes_high[0x20];
+
+	u8         dot3control_in_unknown_opcodes_low[0x20];
+
+	u8         dot3in_pause_frames_high[0x20];
+
+	u8         dot3in_pause_frames_low[0x20];
+
+	u8         dot3out_pause_frames_high[0x20];
+
+	u8         dot3out_pause_frames_low[0x20];
+
+	u8         reserved_0[0x3c0];
+};
+
+struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits {
+	u8         ether_stats_drop_events_high[0x20];
+
+	u8         ether_stats_drop_events_low[0x20];
+
+	u8         ether_stats_octets_high[0x20];
+
+	u8         ether_stats_octets_low[0x20];
+
+	u8         ether_stats_pkts_high[0x20];
+
+	u8         ether_stats_pkts_low[0x20];
+
+	u8         ether_stats_broadcast_pkts_high[0x20];
+
+	u8         ether_stats_broadcast_pkts_low[0x20];
+
+	u8         ether_stats_multicast_pkts_high[0x20];
+
+	u8         ether_stats_multicast_pkts_low[0x20];
+
+	u8         ether_stats_crc_align_errors_high[0x20];
+
+	u8         ether_stats_crc_align_errors_low[0x20];
+
+	u8         ether_stats_undersize_pkts_high[0x20];
+
+	u8         ether_stats_undersize_pkts_low[0x20];
+
+	u8         ether_stats_oversize_pkts_high[0x20];
+
+	u8         ether_stats_oversize_pkts_low[0x20];
+
+	u8         ether_stats_fragments_high[0x20];
+
+	u8         ether_stats_fragments_low[0x20];
+
+	u8         ether_stats_jabbers_high[0x20];
+
+	u8         ether_stats_jabbers_low[0x20];
+
+	u8         ether_stats_collisions_high[0x20];
+
+	u8         ether_stats_collisions_low[0x20];
+
+	u8         ether_stats_pkts64octets_high[0x20];
+
+	u8         ether_stats_pkts64octets_low[0x20];
+
+	u8         ether_stats_pkts65to127octets_high[0x20];
+
+	u8         ether_stats_pkts65to127octets_low[0x20];
+
+	u8         ether_stats_pkts128to255octets_high[0x20];
+
+	u8         ether_stats_pkts128to255octets_low[0x20];
+
+	u8         ether_stats_pkts256to511octets_high[0x20];
+
+	u8         ether_stats_pkts256to511octets_low[0x20];
+
+	u8         ether_stats_pkts512to1023octets_high[0x20];
+
+	u8         ether_stats_pkts512to1023octets_low[0x20];
+
+	u8         ether_stats_pkts1024to1518octets_high[0x20];
+
+	u8         ether_stats_pkts1024to1518octets_low[0x20];
+
+	u8         ether_stats_pkts1519to2047octets_high[0x20];
+
+	u8         ether_stats_pkts1519to2047octets_low[0x20];
+
+	u8         ether_stats_pkts2048to4095octets_high[0x20];
+
+	u8         ether_stats_pkts2048to4095octets_low[0x20];
+
+	u8         ether_stats_pkts4096to8191octets_high[0x20];
+
+	u8         ether_stats_pkts4096to8191octets_low[0x20];
+
+	u8         ether_stats_pkts8192to10239octets_high[0x20];
+
+	u8         ether_stats_pkts8192to10239octets_low[0x20];
+
+	u8         reserved_0[0x280];
+};
+
+struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits {
+	u8         if_in_octets_high[0x20];
+
+	u8         if_in_octets_low[0x20];
+
+	u8         if_in_ucast_pkts_high[0x20];
+
+	u8         if_in_ucast_pkts_low[0x20];
+
+	u8         if_in_discards_high[0x20];
+
+	u8         if_in_discards_low[0x20];
+
+	u8         if_in_errors_high[0x20];
+
+	u8         if_in_errors_low[0x20];
+
+	u8         if_in_unknown_protos_high[0x20];
+
+	u8         if_in_unknown_protos_low[0x20];
+
+	u8         if_out_octets_high[0x20];
+
+	u8         if_out_octets_low[0x20];
+
+	u8         if_out_ucast_pkts_high[0x20];
+
+	u8         if_out_ucast_pkts_low[0x20];
+
+	u8         if_out_discards_high[0x20];
+
+	u8         if_out_discards_low[0x20];
+
+	u8         if_out_errors_high[0x20];
+
+	u8         if_out_errors_low[0x20];
+
+	u8         if_in_multicast_pkts_high[0x20];
+
+	u8         if_in_multicast_pkts_low[0x20];
+
+	u8         if_in_broadcast_pkts_high[0x20];
+
+	u8         if_in_broadcast_pkts_low[0x20];
+
+	u8         if_out_multicast_pkts_high[0x20];
+
+	u8         if_out_multicast_pkts_low[0x20];
+
+	u8         if_out_broadcast_pkts_high[0x20];
+
+	u8         if_out_broadcast_pkts_low[0x20];
+
+	u8         reserved_0[0x480];
+};
+
+struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits {
+	u8         a_frames_transmitted_ok_high[0x20];
+
+	u8         a_frames_transmitted_ok_low[0x20];
+
+	u8         a_frames_received_ok_high[0x20];
+
+	u8         a_frames_received_ok_low[0x20];
+
+	u8         a_frame_check_sequence_errors_high[0x20];
+
+	u8         a_frame_check_sequence_errors_low[0x20];
+
+	u8         a_alignment_errors_high[0x20];
+
+	u8         a_alignment_errors_low[0x20];
+
+	u8         a_octets_transmitted_ok_high[0x20];
+
+	u8         a_octets_transmitted_ok_low[0x20];
+
+	u8         a_octets_received_ok_high[0x20];
+
+	u8         a_octets_received_ok_low[0x20];
+
+	u8         a_multicast_frames_xmitted_ok_high[0x20];
+
+	u8         a_multicast_frames_xmitted_ok_low[0x20];
+
+	u8         a_broadcast_frames_xmitted_ok_high[0x20];
+
+	u8         a_broadcast_frames_xmitted_ok_low[0x20];
+
+	u8         a_multicast_frames_received_ok_high[0x20];
+
+	u8         a_multicast_frames_received_ok_low[0x20];
+
+	u8         a_broadcast_frames_received_ok_high[0x20];
+
+	u8         a_broadcast_frames_received_ok_low[0x20];
+
+	u8         a_in_range_length_errors_high[0x20];
+
+	u8         a_in_range_length_errors_low[0x20];
+
+	u8         a_out_of_range_length_field_high[0x20];
+
+	u8         a_out_of_range_length_field_low[0x20];
+
+	u8         a_frame_too_long_errors_high[0x20];
+
+	u8         a_frame_too_long_errors_low[0x20];
+
+	u8         a_symbol_error_during_carrier_high[0x20];
+
+	u8         a_symbol_error_during_carrier_low[0x20];
+
+	u8         a_mac_control_frames_transmitted_high[0x20];
+
+	u8         a_mac_control_frames_transmitted_low[0x20];
+
+	u8         a_mac_control_frames_received_high[0x20];
+
+	u8         a_mac_control_frames_received_low[0x20];
+
+	u8         a_unsupported_opcodes_received_high[0x20];
+
+	u8         a_unsupported_opcodes_received_low[0x20];
+
+	u8         a_pause_mac_ctrl_frames_received_high[0x20];
+
+	u8         a_pause_mac_ctrl_frames_received_low[0x20];
+
+	u8         a_pause_mac_ctrl_frames_transmitted_high[0x20];
+
+	u8         a_pause_mac_ctrl_frames_transmitted_low[0x20];
+
+	u8         reserved_0[0x300];
+};
+
+struct mlx5_ifc_cmd_inter_comp_event_bits {
+	u8         command_completion_vector[0x20];
+
+	u8         reserved_0[0xc0];
+};
+
+struct mlx5_ifc_stall_vl_event_bits {
+	u8         reserved_0[0x18];
+	u8         port_num[0x1];
+	u8         reserved_1[0x3];
+	u8         vl[0x4];
+
+	u8         reserved_2[0xa0];
+};
+
+struct mlx5_ifc_db_bf_congestion_event_bits {
+	u8         event_subtype[0x8];
+	u8         reserved_0[0x8];
+	u8         congestion_level[0x8];
+	u8         reserved_1[0x8];
+
+	u8         reserved_2[0xa0];
+};
+
+struct mlx5_ifc_gpio_event_bits {
+	u8         reserved_0[0x60];
+
+	u8         gpio_event_hi[0x20];
+
+	u8         gpio_event_lo[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_port_state_change_event_bits {
+	u8         reserved_0[0x40];
+
+	u8         port_num[0x4];
+	u8         reserved_1[0x1c];
+
+	u8         reserved_2[0x80];
+};
+
+struct mlx5_ifc_dropped_packet_logged_bits {
+	u8         reserved_0[0xe0];
+};
+
+enum {
+	MLX5_CQ_ERROR_SYNDROME_CQ_OVERRUN                 = 0x1,
+	MLX5_CQ_ERROR_SYNDROME_CQ_ACCESS_VIOLATION_ERROR  = 0x2,
+};
+
+struct mlx5_ifc_cq_error_bits {
+	u8         reserved_0[0x8];
+	u8         cqn[0x18];
+
+	u8         reserved_1[0x20];
+
+	u8         reserved_2[0x18];
+	u8         syndrome[0x8];
+
+	u8         reserved_3[0x80];
+};
+
+struct mlx5_ifc_rdma_page_fault_event_bits {
+	u8         bytes_committed[0x20];
+
+	u8         r_key[0x20];
+
+	u8         reserved_0[0x10];
+	u8         packet_len[0x10];
+
+	u8         rdma_op_len[0x20];
+
+	u8         rdma_va[0x40];
+
+	u8         reserved_1[0x5];
+	u8         rdma[0x1];
+	u8         write[0x1];
+	u8         requestor[0x1];
+	u8         qp_number[0x18];
+};
+
+struct mlx5_ifc_wqe_associated_page_fault_event_bits {
+	u8         bytes_committed[0x20];
+
+	u8         reserved_0[0x10];
+	u8         wqe_index[0x10];
+
+	u8         reserved_1[0x10];
+	u8         len[0x10];
+
+	u8         reserved_2[0x60];
+
+	u8         reserved_3[0x5];
+	u8         rdma[0x1];
+	u8         write_read[0x1];
+	u8         requestor[0x1];
+	u8         qpn[0x18];
+};
+
+struct mlx5_ifc_qp_events_bits {
+	u8         reserved_0[0xa0];
+
+	u8         type[0x8];
+	u8         reserved_1[0x18];
+
+	u8         reserved_2[0x8];
+	u8         qpn_rqn_sqn[0x18];
+};
+
+struct mlx5_ifc_dct_events_bits {
+	u8         reserved_0[0xc0];
+
+	u8         reserved_1[0x8];
+	u8         dct_number[0x18];
+};
+
+struct mlx5_ifc_comp_event_bits {
+	u8         reserved_0[0xc0];
+
+	u8         reserved_1[0x8];
+	u8         cq_number[0x18];
+};
+
+enum {
+	MLX5_QPC_STATE_RST        = 0x0,
+	MLX5_QPC_STATE_INIT       = 0x1,
+	MLX5_QPC_STATE_RTR        = 0x2,
+	MLX5_QPC_STATE_RTS        = 0x3,
+	MLX5_QPC_STATE_SQER       = 0x4,
+	MLX5_QPC_STATE_ERR        = 0x6,
+	MLX5_QPC_STATE_SQD        = 0x7,
+	MLX5_QPC_STATE_SUSPENDED  = 0x9,
+};
+
+enum {
+	MLX5_QPC_ST_RC            = 0x0,
+	MLX5_QPC_ST_UC            = 0x1,
+	MLX5_QPC_ST_UD            = 0x2,
+	MLX5_QPC_ST_XRC           = 0x3,
+	MLX5_QPC_ST_DCI           = 0x5,
+	MLX5_QPC_ST_QP0           = 0x7,
+	MLX5_QPC_ST_QP1           = 0x8,
+	MLX5_QPC_ST_RAW_DATAGRAM  = 0x9,
+	MLX5_QPC_ST_REG_UMR       = 0xc,
+};
+
+enum {
+	MLX5_QPC_PM_STATE_ARMED     = 0x0,
+	MLX5_QPC_PM_STATE_REARM     = 0x1,
+	MLX5_QPC_PM_STATE_RESERVED  = 0x2,
+	MLX5_QPC_PM_STATE_MIGRATED  = 0x3,
+};
+
+enum {
+	MLX5_QPC_END_PADDING_MODE_SCATTER_AS_IS                = 0x0,
+	MLX5_QPC_END_PADDING_MODE_PAD_TO_CACHE_LINE_ALIGNMENT  = 0x1,
+};
+
+enum {
+	MLX5_QPC_MTU_256_BYTES        = 0x1,
+	MLX5_QPC_MTU_512_BYTES        = 0x2,
+	MLX5_QPC_MTU_1K_BYTES         = 0x3,
+	MLX5_QPC_MTU_2K_BYTES         = 0x4,
+	MLX5_QPC_MTU_4K_BYTES         = 0x5,
+	MLX5_QPC_MTU_RAW_ETHERNET_QP  = 0x7,
+};
+
+enum {
+	MLX5_QPC_ATOMIC_MODE_IB_SPEC     = 0x1,
+	MLX5_QPC_ATOMIC_MODE_ONLY_8B     = 0x2,
+	MLX5_QPC_ATOMIC_MODE_UP_TO_8B    = 0x3,
+	MLX5_QPC_ATOMIC_MODE_UP_TO_16B   = 0x4,
+	MLX5_QPC_ATOMIC_MODE_UP_TO_32B   = 0x5,
+	MLX5_QPC_ATOMIC_MODE_UP_TO_64B   = 0x6,
+	MLX5_QPC_ATOMIC_MODE_UP_TO_128B  = 0x7,
+	MLX5_QPC_ATOMIC_MODE_UP_TO_256B  = 0x8,
+};
+
+enum {
+	MLX5_QPC_CS_REQ_DISABLE    = 0x0,
+	MLX5_QPC_CS_REQ_UP_TO_32B  = 0x11,
+	MLX5_QPC_CS_REQ_UP_TO_64B  = 0x22,
+};
+
+enum {
+	MLX5_QPC_CS_RES_DISABLE    = 0x0,
+	MLX5_QPC_CS_RES_UP_TO_32B  = 0x1,
+	MLX5_QPC_CS_RES_UP_TO_64B  = 0x2,
+};
+
+struct mlx5_ifc_qpc_bits {
+	u8         state[0x4];
+	u8         reserved_0[0x4];
+	u8         st[0x8];
+	u8         reserved_1[0x3];
+	u8         pm_state[0x2];
+	u8         reserved_2[0x7];
+	u8         end_padding_mode[0x2];
+	u8         reserved_3[0x2];
+
+	u8         wq_signature[0x1];
+	u8         block_lb_mc[0x1];
+	u8         atomic_like_write_en[0x1];
+	u8         latency_sensitive[0x1];
+	u8         reserved_4[0x1];
+	u8         drain_sigerr[0x1];
+	u8         reserved_5[0x2];
+	u8         pd[0x18];
+
+	u8         mtu[0x3];
+	u8         log_msg_max[0x5];
+	u8         reserved_6[0x1];
+	u8         log_rq_size[0x4];
+	u8         log_rq_stride[0x3];
+	u8         no_sq[0x1];
+	u8         log_sq_size[0x4];
+	u8         reserved_7[0x6];
+	u8         rlky[0x1];
+	u8         reserved_8[0x4];
+
+	u8         counter_set_id[0x8];
+	u8         uar_page[0x18];
+
+	u8         reserved_9[0x8];
+	u8         user_index[0x18];
+
+	u8         reserved_10[0x3];
+	u8         log_page_size[0x5];
+	u8         remote_qpn[0x18];
+
+	struct mlx5_ifc_ads_bits primary_address_path;
+
+	struct mlx5_ifc_ads_bits secondary_address_path;
+
+	u8         log_ack_req_freq[0x4];
+	u8         reserved_11[0x4];
+	u8         log_sra_max[0x3];
+	u8         reserved_12[0x2];
+	u8         retry_count[0x3];
+	u8         rnr_retry[0x3];
+	u8         reserved_13[0x1];
+	u8         fre[0x1];
+	u8         cur_rnr_retry[0x3];
+	u8         cur_retry_count[0x3];
+	u8         reserved_14[0x5];
+
+	u8         reserved_15[0x20];
+
+	u8         reserved_16[0x8];
+	u8         next_send_psn[0x18];
+
+	u8         reserved_17[0x8];
+	u8         cqn_snd[0x18];
+
+	u8         reserved_18[0x40];
+
+	u8         reserved_19[0x8];
+	u8         last_acked_psn[0x18];
+
+	u8         reserved_20[0x8];
+	u8         ssn[0x18];
+
+	u8         reserved_21[0x8];
+	u8         log_rra_max[0x3];
+	u8         reserved_22[0x1];
+	u8         atomic_mode[0x4];
+	u8         rre[0x1];
+	u8         rwe[0x1];
+	u8         rae[0x1];
+	u8         reserved_23[0x1];
+	u8         page_offset[0x6];
+	u8         reserved_24[0x3];
+	u8         cd_slave_receive[0x1];
+	u8         cd_slave_send[0x1];
+	u8         cd_master[0x1];
+
+	u8         reserved_25[0x3];
+	u8         min_rnr_nak[0x5];
+	u8         next_rcv_psn[0x18];
+
+	u8         reserved_26[0x8];
+	u8         xrcd[0x18];
+
+	u8         reserved_27[0x8];
+	u8         cqn_rcv[0x18];
+
+	u8         dbr_addr[0x40];
+
+	u8         q_key[0x20];
+
+	u8         reserved_28[0x5];
+	u8         rq_type[0x3];
+	u8         srqn_rmpn[0x18];
+
+	u8         reserved_29[0x8];
+	u8         rmsn[0x18];
+
+	u8         hw_sq_wqebb_counter[0x10];
+	u8         sw_sq_wqebb_counter[0x10];
+
+	u8         hw_rq_counter[0x20];
+
+	u8         sw_rq_counter[0x20];
+
+	u8         reserved_30[0x20];
+
+	u8         reserved_31[0xf];
+	u8         cgs[0x1];
+	u8         cs_req[0x8];
+	u8         cs_res[0x8];
+
+	u8         dc_access_key[0x40];
+
+	u8         reserved_32[0xc0];
+};
+
+struct mlx5_ifc_roce_addr_layout_bits {
+	u8         source_l3_address[16][0x8];
+
+	u8         reserved_0[0x3];
+	u8         vlan_valid[0x1];
+	u8         vlan_id[0xc];
+	u8         source_mac_47_32[0x10];
+
+	u8         source_mac_31_0[0x20];
+
+	u8         reserved_1[0x14];
+	u8         roce_l3_type[0x4];
+	u8         roce_version[0x8];
+
+	u8         reserved_2[0x20];
+};
+
+union mlx5_ifc_hca_cap_union_bits {
+	struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
+	struct mlx5_ifc_odp_cap_bits odp_cap;
+	struct mlx5_ifc_atomic_caps_bits atomic_caps;
+	struct mlx5_ifc_roce_cap_bits roce_cap;
+	struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps;
+	struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap;
+	u8         reserved_0[0x8000];
+};
+
+enum {
+	MLX5_FLOW_CONTEXT_ACTION_ALLOW     = 0x1,
+	MLX5_FLOW_CONTEXT_ACTION_DROP      = 0x2,
+	MLX5_FLOW_CONTEXT_ACTION_FWD_DEST  = 0x4,
+};
+
+struct mlx5_ifc_flow_context_bits {
+	u8         reserved_0[0x20];
+
+	u8         group_id[0x20];
+
+	u8         reserved_1[0x8];
+	u8         flow_tag[0x18];
+
+	u8         reserved_2[0x10];
+	u8         action[0x10];
+
+	u8         reserved_3[0x8];
+	u8         destination_list_size[0x18];
+
+	u8         reserved_4[0x160];
+
+	struct mlx5_ifc_fte_match_param_bits match_value;
+
+	u8         reserved_5[0x600];
+
+	struct mlx5_ifc_dest_format_struct_bits destination[0];
+};
+
+enum {
+	MLX5_XRC_SRQC_STATE_GOOD   = 0x0,
+	MLX5_XRC_SRQC_STATE_ERROR  = 0x1,
+};
+
+struct mlx5_ifc_xrc_srqc_bits {
+	u8         state[0x4];
+	u8         log_xrc_srq_size[0x4];
+	u8         reserved_0[0x18];
+
+	u8         wq_signature[0x1];
+	u8         cont_srq[0x1];
+	u8         reserved_1[0x1];
+	u8         rlky[0x1];
+	u8         basic_cyclic_rcv_wqe[0x1];
+	u8         log_rq_stride[0x3];
+	u8         xrcd[0x18];
+
+	u8         page_offset[0x6];
+	u8         reserved_2[0x2];
+	u8         cqn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         user_index_equal_xrc_srqn[0x1];
+	u8         reserved_4[0x1];
+	u8         log_page_size[0x6];
+	u8         user_index[0x18];
+
+	u8         reserved_5[0x20];
+
+	u8         reserved_6[0x8];
+	u8         pd[0x18];
+
+	u8         lwm[0x10];
+	u8         wqe_cnt[0x10];
+
+	u8         reserved_7[0x40];
+
+	u8         db_record_addr_h[0x20];
+
+	u8         db_record_addr_l[0x1e];
+	u8         reserved_8[0x2];
+
+	u8         reserved_9[0x80];
+};
+
+struct mlx5_ifc_traffic_counter_bits {
+	u8         packets[0x40];
+
+	u8         octets[0x40];
+};
+
+struct mlx5_ifc_tisc_bits {
+	u8         reserved_0[0xc];
+	u8         prio[0x4];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0x100];
+
+	u8         reserved_3[0x8];
+	u8         transport_domain[0x18];
+
+	u8         reserved_4[0x3c0];
+};
+
+enum {
+	MLX5_TIRC_DISP_TYPE_DIRECT    = 0x0,
+	MLX5_TIRC_DISP_TYPE_INDIRECT  = 0x1,
+};
+
+enum {
+	MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO  = 0x1,
+	MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO  = 0x2,
+};
+
+enum {
+	MLX5_TIRC_RX_HASH_FN_HASH_NONE           = 0x0,
+	MLX5_TIRC_RX_HASH_FN_HASH_INVERTED_XOR8  = 0x1,
+	MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ       = 0x2,
+};
+
+enum {
+	MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST_    = 0x1,
+	MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST_  = 0x2,
+};
+
+struct mlx5_ifc_tirc_bits {
+	u8         reserved_0[0x20];
+
+	u8         disp_type[0x4];
+	u8         reserved_1[0x1c];
+
+	u8         reserved_2[0x40];
+
+	u8         reserved_3[0x4];
+	u8         lro_timeout_period_usecs[0x10];
+	u8         lro_enable_mask[0x4];
+	u8         lro_max_ip_payload_size[0x8];
+
+	u8         reserved_4[0x40];
+
+	u8         reserved_5[0x8];
+	u8         inline_rqn[0x18];
+
+	u8         rx_hash_symmetric[0x1];
+	u8         reserved_6[0x1];
+	u8         tunneled_offload_en[0x1];
+	u8         reserved_7[0x5];
+	u8         indirect_table[0x18];
+
+	u8         rx_hash_fn[0x4];
+	u8         reserved_8[0x2];
+	u8         self_lb_block[0x2];
+	u8         transport_domain[0x18];
+
+	u8         rx_hash_toeplitz_key[10][0x20];
+
+	struct mlx5_ifc_rx_hash_field_select_bits rx_hash_field_selector_outer;
+
+	struct mlx5_ifc_rx_hash_field_select_bits rx_hash_field_selector_inner;
+
+	u8         reserved_9[0x4c0];
+};
+
+enum {
+	MLX5_SRQC_STATE_GOOD   = 0x0,
+	MLX5_SRQC_STATE_ERROR  = 0x1,
+};
+
+struct mlx5_ifc_srqc_bits {
+	u8         state[0x4];
+	u8         log_srq_size[0x4];
+	u8         reserved_0[0x18];
+
+	u8         wq_signature[0x1];
+	u8         cont_srq[0x1];
+	u8         reserved_1[0x1];
+	u8         rlky[0x1];
+	u8         reserved_2[0x1];
+	u8         log_rq_stride[0x3];
+	u8         xrcd[0x18];
+
+	u8         page_offset[0x6];
+	u8         reserved_3[0x2];
+	u8         cqn[0x18];
+
+	u8         reserved_4[0x20];
+
+	u8         reserved_5[0x2];
+	u8         log_page_size[0x6];
+	u8         reserved_6[0x18];
+
+	u8         reserved_7[0x20];
+
+	u8         reserved_8[0x8];
+	u8         pd[0x18];
+
+	u8         lwm[0x10];
+	u8         wqe_cnt[0x10];
+
+	u8         reserved_9[0x40];
+
+	u8         db_record_addr_h[0x20];
+
+	u8         db_record_addr_l[0x1e];
+	u8         reserved_10[0x2];
+
+	u8         reserved_11[0x80];
+};
+
+enum {
+	MLX5_SQC_STATE_RST  = 0x0,
+	MLX5_SQC_STATE_RDY  = 0x1,
+	MLX5_SQC_STATE_ERR  = 0x3,
+};
+
+struct mlx5_ifc_sqc_bits {
+	u8         rlky[0x1];
+	u8         cd_master[0x1];
+	u8         fre[0x1];
+	u8         flush_in_error_en[0x1];
+	u8         reserved_0[0x4];
+	u8         state[0x4];
+	u8         reserved_1[0x14];
+
+	u8         reserved_2[0x8];
+	u8         user_index[0x18];
+
+	u8         reserved_3[0x8];
+	u8         cqn[0x18];
+
+	u8         reserved_4[0xa0];
+
+	u8         tis_lst_sz[0x10];
+	u8         reserved_5[0x10];
+
+	u8         reserved_6[0x40];
+
+	u8         reserved_7[0x8];
+	u8         tis_num_0[0x18];
+
+	struct mlx5_ifc_wq_bits wq;
+};
+
+struct mlx5_ifc_rqtc_bits {
+	u8         reserved_0[0xa0];
+
+	u8         reserved_1[0x10];
+	u8         rqt_max_size[0x10];
+
+	u8         reserved_2[0x10];
+	u8         rqt_actual_size[0x10];
+
+	u8         reserved_3[0x6a0];
+
+	struct mlx5_ifc_rq_num_bits rq_num[0];
+};
+
+enum {
+	MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE  = 0x0,
+	MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_RMP     = 0x1,
+};
+
+enum {
+	MLX5_RQC_STATE_RST  = 0x0,
+	MLX5_RQC_STATE_RDY  = 0x1,
+	MLX5_RQC_STATE_ERR  = 0x3,
+};
+
+struct mlx5_ifc_rqc_bits {
+	u8         rlky[0x1];
+	u8         reserved_0[0x2];
+	u8         vsd[0x1];
+	u8         mem_rq_type[0x4];
+	u8         state[0x4];
+	u8         reserved_1[0x1];
+	u8         flush_in_error_en[0x1];
+	u8         reserved_2[0x12];
+
+	u8         reserved_3[0x8];
+	u8         user_index[0x18];
+
+	u8         reserved_4[0x8];
+	u8         cqn[0x18];
+
+	u8         counter_set_id[0x8];
+	u8         reserved_5[0x18];
+
+	u8         reserved_6[0x8];
+	u8         rmpn[0x18];
+
+	u8         reserved_7[0xe0];
+
+	struct mlx5_ifc_wq_bits wq;
+};
+
+enum {
+	MLX5_RMPC_STATE_RDY  = 0x1,
+	MLX5_RMPC_STATE_ERR  = 0x3,
+};
+
+struct mlx5_ifc_rmpc_bits {
+	u8         reserved_0[0x8];
+	u8         state[0x4];
+	u8         reserved_1[0x14];
+
+	u8         basic_cyclic_rcv_wqe[0x1];
+	u8         reserved_2[0x1f];
+
+	u8         reserved_3[0x140];
+
+	struct mlx5_ifc_wq_bits wq;
+};
+
+enum {
+	MLX5_NIC_VPORT_CONTEXT_ALLOWED_LIST_TYPE_CURRENT_UC_MAC_ADDRESS  = 0x0,
+};
+
+struct mlx5_ifc_nic_vport_context_bits {
+	u8         reserved_0[0x1f];
+	u8         roce_en[0x1];
+
+	u8         reserved_1[0x760];
+
+	u8         reserved_2[0x5];
+	u8         allowed_list_type[0x3];
+	u8         reserved_3[0xc];
+	u8         allowed_list_size[0xc];
+
+	struct mlx5_ifc_mac_address_layout_bits permanent_address;
+
+	u8         reserved_4[0x20];
+
+	u8         current_uc_mac_address[0][0x40];
+};
+
+enum {
+	MLX5_MKC_ACCESS_MODE_PA    = 0x0,
+	MLX5_MKC_ACCESS_MODE_MTT   = 0x1,
+	MLX5_MKC_ACCESS_MODE_KLMS  = 0x2,
+};
+
+struct mlx5_ifc_mkc_bits {
+	u8         reserved_0[0x1];
+	u8         free[0x1];
+	u8         reserved_1[0xd];
+	u8         small_fence_on_rdma_read_response[0x1];
+	u8         umr_en[0x1];
+	u8         a[0x1];
+	u8         rw[0x1];
+	u8         rr[0x1];
+	u8         lw[0x1];
+	u8         lr[0x1];
+	u8         access_mode[0x2];
+	u8         reserved_2[0x8];
+
+	u8         qpn[0x18];
+	u8         mkey_7_0[0x8];
+
+	u8         reserved_3[0x20];
+
+	u8         length64[0x1];
+	u8         bsf_en[0x1];
+	u8         sync_umr[0x1];
+	u8         reserved_4[0x2];
+	u8         expected_sigerr_count[0x1];
+	u8         reserved_5[0x1];
+	u8         en_rinval[0x1];
+	u8         pd[0x18];
+
+	u8         start_addr[0x40];
+
+	u8         len[0x40];
+
+	u8         bsf_octword_size[0x20];
+
+	u8         reserved_6[0x80];
+
+	u8         translations_octword_size[0x20];
+
+	u8         reserved_7[0x1b];
+	u8         log_page_size[0x5];
+
+	u8         reserved_8[0x20];
+};
+
+struct mlx5_ifc_pkey_bits {
+	u8         reserved_0[0x10];
+	u8         pkey[0x10];
+};
+
+struct mlx5_ifc_array128_auto_bits {
+	u8         array128_auto[16][0x8];
+};
+
+struct mlx5_ifc_hca_vport_context_bits {
+	u8         field_select[0x20];
+
+	u8         reserved_0[0xe0];
+
+	u8         sm_virt_aware[0x1];
+	u8         has_smi[0x1];
+	u8         has_raw[0x1];
+	u8         grh_required[0x1];
+	u8         reserved_1[0x10];
+	u8         port_state_policy[0x4];
+	u8         phy_port_state[0x4];
+	u8         vport_state[0x4];
+
+	u8         reserved_2[0x60];
+
+	u8         port_guid[0x40];
+
+	u8         node_guid[0x40];
+
+	u8         cap_mask1[0x20];
+
+	u8         cap_mask1_field_select[0x20];
+
+	u8         cap_mask2[0x20];
+
+	u8         cap_mask2_field_select[0x20];
+
+	u8         reserved_3[0x80];
+
+	u8         lid[0x10];
+	u8         reserved_4[0x4];
+	u8         init_type_reply[0x4];
+	u8         lmc[0x3];
+	u8         subnet_timeout[0x5];
+
+	u8         sm_lid[0x10];
+	u8         sm_sl[0x4];
+	u8         reserved_5[0xc];
+
+	u8         qkey_violation_counter[0x10];
+	u8         pkey_violation_counter[0x10];
+
+	u8         reserved_6[0xca0];
+};
+
+enum {
+	MLX5_EQC_STATUS_OK                = 0x0,
+	MLX5_EQC_STATUS_EQ_WRITE_FAILURE  = 0xa,
+};
+
+enum {
+	MLX5_EQC_ST_ARMED  = 0x9,
+	MLX5_EQC_ST_FIRED  = 0xa,
+};
+
+struct mlx5_ifc_eqc_bits {
+	u8         status[0x4];
+	u8         reserved_0[0x9];
+	u8         ec[0x1];
+	u8         oi[0x1];
+	u8         reserved_1[0x5];
+	u8         st[0x4];
+	u8         reserved_2[0x8];
+
+	u8         reserved_3[0x20];
+
+	u8         reserved_4[0x14];
+	u8         page_offset[0x6];
+	u8         reserved_5[0x6];
+
+	u8         reserved_6[0x3];
+	u8         log_eq_size[0x5];
+	u8         uar_page[0x18];
+
+	u8         reserved_7[0x20];
+
+	u8         reserved_8[0x18];
+	u8         intr[0x8];
+
+	u8         reserved_9[0x3];
+	u8         log_page_size[0x5];
+	u8         reserved_10[0x18];
+
+	u8         reserved_11[0x60];
+
+	u8         reserved_12[0x8];
+	u8         consumer_counter[0x18];
+
+	u8         reserved_13[0x8];
+	u8         producer_counter[0x18];
+
+	u8         reserved_14[0x80];
+};
+
+enum {
+	MLX5_DCTC_STATE_ACTIVE    = 0x0,
+	MLX5_DCTC_STATE_DRAINING  = 0x1,
+	MLX5_DCTC_STATE_DRAINED   = 0x2,
+};
+
+enum {
+	MLX5_DCTC_CS_RES_DISABLE    = 0x0,
+	MLX5_DCTC_CS_RES_NA         = 0x1,
+	MLX5_DCTC_CS_RES_UP_TO_64B  = 0x2,
+};
+
+enum {
+	MLX5_DCTC_MTU_256_BYTES  = 0x1,
+	MLX5_DCTC_MTU_512_BYTES  = 0x2,
+	MLX5_DCTC_MTU_1K_BYTES   = 0x3,
+	MLX5_DCTC_MTU_2K_BYTES   = 0x4,
+	MLX5_DCTC_MTU_4K_BYTES   = 0x5,
+};
+
+struct mlx5_ifc_dctc_bits {
+	u8         reserved_0[0x4];
+	u8         state[0x4];
+	u8         reserved_1[0x18];
+
+	u8         reserved_2[0x8];
+	u8         user_index[0x18];
+
+	u8         reserved_3[0x8];
+	u8         cqn[0x18];
+
+	u8         counter_set_id[0x8];
+	u8         atomic_mode[0x4];
+	u8         rre[0x1];
+	u8         rwe[0x1];
+	u8         rae[0x1];
+	u8         atomic_like_write_en[0x1];
+	u8         latency_sensitive[0x1];
+	u8         rlky[0x1];
+	u8         free_ar[0x1];
+	u8         reserved_4[0xd];
+
+	u8         reserved_5[0x8];
+	u8         cs_res[0x8];
+	u8         reserved_6[0x3];
+	u8         min_rnr_nak[0x5];
+	u8         reserved_7[0x8];
+
+	u8         reserved_8[0x8];
+	u8         srqn[0x18];
+
+	u8         reserved_9[0x8];
+	u8         pd[0x18];
+
+	u8         tclass[0x8];
+	u8         reserved_10[0x4];
+	u8         flow_label[0x14];
+
+	u8         dc_access_key[0x40];
+
+	u8         reserved_11[0x5];
+	u8         mtu[0x3];
+	u8         port[0x8];
+	u8         pkey_index[0x10];
+
+	u8         reserved_12[0x8];
+	u8         my_addr_index[0x8];
+	u8         reserved_13[0x8];
+	u8         hop_limit[0x8];
+
+	u8         dc_access_key_violation_count[0x20];
+
+	u8         reserved_14[0x14];
+	u8         dei_cfi[0x1];
+	u8         eth_prio[0x3];
+	u8         ecn[0x2];
+	u8         dscp[0x6];
+
+	u8         reserved_15[0x40];
+};
+
+enum {
+	MLX5_CQC_STATUS_OK             = 0x0,
+	MLX5_CQC_STATUS_CQ_OVERFLOW    = 0x9,
+	MLX5_CQC_STATUS_CQ_WRITE_FAIL  = 0xa,
+};
+
+enum {
+	MLX5_CQC_CQE_SZ_64_BYTES   = 0x0,
+	MLX5_CQC_CQE_SZ_128_BYTES  = 0x1,
+};
+
+enum {
+	MLX5_CQC_ST_SOLICITED_NOTIFICATION_REQUEST_ARMED  = 0x6,
+	MLX5_CQC_ST_NOTIFICATION_REQUEST_ARMED            = 0x9,
+	MLX5_CQC_ST_FIRED                                 = 0xa,
+};
+
+struct mlx5_ifc_cqc_bits {
+	u8         status[0x4];
+	u8         reserved_0[0x4];
+	u8         cqe_sz[0x3];
+	u8         cc[0x1];
+	u8         reserved_1[0x1];
+	u8         scqe_break_moderation_en[0x1];
+	u8         oi[0x1];
+	u8         reserved_2[0x2];
+	u8         cqe_zip_en[0x1];
+	u8         mini_cqe_res_format[0x2];
+	u8         st[0x4];
+	u8         reserved_3[0x8];
+
+	u8         reserved_4[0x20];
+
+	u8         reserved_5[0x14];
+	u8         page_offset[0x6];
+	u8         reserved_6[0x6];
+
+	u8         reserved_7[0x3];
+	u8         log_cq_size[0x5];
+	u8         uar_page[0x18];
+
+	u8         reserved_8[0x4];
+	u8         cq_period[0xc];
+	u8         cq_max_count[0x10];
+
+	u8         reserved_9[0x18];
+	u8         c_eqn[0x8];
+
+	u8         reserved_10[0x3];
+	u8         log_page_size[0x5];
+	u8         reserved_11[0x18];
+
+	u8         reserved_12[0x20];
+
+	u8         reserved_13[0x8];
+	u8         last_notified_index[0x18];
+
+	u8         reserved_14[0x8];
+	u8         last_solicit_index[0x18];
+
+	u8         reserved_15[0x8];
+	u8         consumer_counter[0x18];
+
+	u8         reserved_16[0x8];
+	u8         producer_counter[0x18];
+
+	u8         reserved_17[0x40];
+
+	u8         dbr_addr[0x40];
+};
+
+union mlx5_ifc_cong_control_roce_ecn_auto_bits {
+	struct mlx5_ifc_cong_control_802_1qau_rp_bits cong_control_802_1qau_rp;
+	struct mlx5_ifc_cong_control_r_roce_ecn_rp_bits cong_control_r_roce_ecn_rp;
+	struct mlx5_ifc_cong_control_r_roce_ecn_np_bits cong_control_r_roce_ecn_np;
+	u8         reserved_0[0x800];
+};
+
+struct mlx5_ifc_query_adapter_param_block_bits {
+	u8         reserved_0[0xe0];
+
+	u8         reserved_1[0x10];
+	u8         vsd_vendor_id[0x10];
+
+	u8         vsd[208][0x8];
+
+	u8         vsd_contd_psid[16][0x8];
+};
+
+union mlx5_ifc_modify_field_select_resize_field_select_auto_bits {
+	struct mlx5_ifc_modify_field_select_bits modify_field_select;
+	struct mlx5_ifc_resize_field_select_bits resize_field_select;
+	u8         reserved_0[0x20];
+};
+
+union mlx5_ifc_field_select_802_1_r_roce_auto_bits {
+	struct mlx5_ifc_field_select_802_1qau_rp_bits field_select_802_1qau_rp;
+	struct mlx5_ifc_field_select_r_roce_rp_bits field_select_r_roce_rp;
+	struct mlx5_ifc_field_select_r_roce_np_bits field_select_r_roce_np;
+	u8         reserved_0[0x20];
+};
+
+union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
+	struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits eth_3635_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout;
+	struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout;
+	struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs;
+	u8         reserved_0[0x7c0];
+};
+
+union mlx5_ifc_event_auto_bits {
+	struct mlx5_ifc_comp_event_bits comp_event;
+	struct mlx5_ifc_dct_events_bits dct_events;
+	struct mlx5_ifc_qp_events_bits qp_events;
+	struct mlx5_ifc_wqe_associated_page_fault_event_bits wqe_associated_page_fault_event;
+	struct mlx5_ifc_rdma_page_fault_event_bits rdma_page_fault_event;
+	struct mlx5_ifc_cq_error_bits cq_error;
+	struct mlx5_ifc_dropped_packet_logged_bits dropped_packet_logged;
+	struct mlx5_ifc_port_state_change_event_bits port_state_change_event;
+	struct mlx5_ifc_gpio_event_bits gpio_event;
+	struct mlx5_ifc_db_bf_congestion_event_bits db_bf_congestion_event;
+	struct mlx5_ifc_stall_vl_event_bits stall_vl_event;
+	struct mlx5_ifc_cmd_inter_comp_event_bits cmd_inter_comp_event;
+	u8         reserved_0[0xe0];
+};
+
+struct mlx5_ifc_health_buffer_bits {
+	u8         reserved_0[0x100];
+
+	u8         assert_existptr[0x20];
+
+	u8         assert_callra[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         fw_version[0x20];
+
+	u8         hw_id[0x20];
+
+	u8         reserved_2[0x20];
+
+	u8         irisc_index[0x8];
+	u8         synd[0x8];
+	u8         ext_synd[0x10];
+};
+
+struct mlx5_ifc_register_loopback_control_bits {
+	u8         no_lb[0x1];
+	u8         reserved_0[0x7];
+	u8         port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0x60];
+};
+
+struct mlx5_ifc_teardown_hca_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+enum {
+	MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE  = 0x0,
+	MLX5_TEARDOWN_HCA_IN_PROFILE_PANIC_CLOSE     = 0x1,
+};
+
+struct mlx5_ifc_teardown_hca_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x10];
+	u8         profile[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_sqerr2rts_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_sqerr2rts_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_4[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_5[0x80];
+};
+
+struct mlx5_ifc_sqd2rts_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_sqd2rts_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_4[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_5[0x80];
+};
+
+struct mlx5_ifc_set_roce_address_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_set_roce_address_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         roce_address_index[0x10];
+	u8         reserved_2[0x10];
+
+	u8         reserved_3[0x20];
+
+	struct mlx5_ifc_roce_addr_layout_bits roce_address;
+};
+
+struct mlx5_ifc_set_mad_demux_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+enum {
+	MLX5_SET_MAD_DEMUX_IN_DEMUX_MODE_PASS_ALL   = 0x0,
+	MLX5_SET_MAD_DEMUX_IN_DEMUX_MODE_SELECTIVE  = 0x2,
+};
+
+struct mlx5_ifc_set_mad_demux_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x20];
+
+	u8         reserved_3[0x6];
+	u8         demux_mode[0x2];
+	u8         reserved_4[0x18];
+};
+
+struct mlx5_ifc_set_l2_table_entry_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_set_l2_table_entry_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x60];
+
+	u8         reserved_3[0x8];
+	u8         table_index[0x18];
+
+	u8         reserved_4[0x20];
+
+	u8         reserved_5[0x13];
+	u8         vlan_valid[0x1];
+	u8         vlan[0xc];
+
+	struct mlx5_ifc_mac_address_layout_bits mac_address;
+
+	u8         reserved_6[0xc0];
+};
+
+struct mlx5_ifc_set_issi_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_set_issi_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x10];
+	u8         current_issi[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_set_hca_cap_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_set_hca_cap_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	union mlx5_ifc_hca_cap_union_bits capability;
+};
+
+struct mlx5_ifc_set_fte_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_set_fte_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x8];
+	u8         table_id[0x18];
+
+	u8         reserved_5[0x40];
+
+	u8         flow_index[0x20];
+
+	u8         reserved_6[0xe0];
+
+	struct mlx5_ifc_flow_context_bits flow_context;
+};
+
+struct mlx5_ifc_rts2rts_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_rts2rts_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_4[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_5[0x80];
+};
+
+struct mlx5_ifc_rtr2rts_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_rtr2rts_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_4[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_5[0x80];
+};
+
+struct mlx5_ifc_rst2init_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_rst2init_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_4[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_5[0x80];
+};
+
+struct mlx5_ifc_query_xrc_srq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry;
+
+	u8         reserved_2[0x600];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_query_xrc_srq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         xrc_srqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+enum {
+	MLX5_QUERY_VPORT_STATE_OUT_STATE_DOWN  = 0x0,
+	MLX5_QUERY_VPORT_STATE_OUT_STATE_UP    = 0x1,
+};
+
+struct mlx5_ifc_query_vport_state_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x20];
+
+	u8         reserved_2[0x18];
+	u8         admin_state[0x4];
+	u8         state[0x4];
+};
+
+enum {
+	MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT  = 0x0,
+};
+
+struct mlx5_ifc_query_vport_state_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_vport_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_traffic_counter_bits received_errors;
+
+	struct mlx5_ifc_traffic_counter_bits transmit_errors;
+
+	struct mlx5_ifc_traffic_counter_bits received_ib_unicast;
+
+	struct mlx5_ifc_traffic_counter_bits transmitted_ib_unicast;
+
+	struct mlx5_ifc_traffic_counter_bits received_ib_multicast;
+
+	struct mlx5_ifc_traffic_counter_bits transmitted_ib_multicast;
+
+	struct mlx5_ifc_traffic_counter_bits received_eth_broadcast;
+
+	struct mlx5_ifc_traffic_counter_bits transmitted_eth_broadcast;
+
+	struct mlx5_ifc_traffic_counter_bits received_eth_unicast;
+
+	struct mlx5_ifc_traffic_counter_bits transmitted_eth_unicast;
+
+	struct mlx5_ifc_traffic_counter_bits received_eth_multicast;
+
+	struct mlx5_ifc_traffic_counter_bits transmitted_eth_multicast;
+
+	u8         reserved_2[0xa00];
+};
+
+enum {
+	MLX5_QUERY_VPORT_COUNTER_IN_OP_MOD_VPORT_COUNTERS  = 0x0,
+};
+
+struct mlx5_ifc_query_vport_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_3[0x60];
+
+	u8         clear[0x1];
+	u8         reserved_4[0x1f];
+
+	u8         reserved_5[0x20];
+};
+
+struct mlx5_ifc_query_tis_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_tisc_bits tis_context;
+};
+
+struct mlx5_ifc_query_tis_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         tisn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_tir_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0xc0];
+
+	struct mlx5_ifc_tirc_bits tir_context;
+};
+
+struct mlx5_ifc_query_tir_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         tirn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_srq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_srqc_bits srq_context_entry;
+
+	u8         reserved_2[0x600];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_query_srq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         srqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_sq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0xc0];
+
+	struct mlx5_ifc_sqc_bits sq_context;
+};
+
+struct mlx5_ifc_query_sq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         sqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_special_contexts_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x20];
+
+	u8         resd_lkey[0x20];
+};
+
+struct mlx5_ifc_query_special_contexts_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_query_rqt_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0xc0];
+
+	struct mlx5_ifc_rqtc_bits rqt_context;
+};
+
+struct mlx5_ifc_query_rqt_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         rqtn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_rq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0xc0];
+
+	struct mlx5_ifc_rqc_bits rq_context;
+};
+
+struct mlx5_ifc_query_rq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         rqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_roce_address_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_roce_addr_layout_bits roce_address;
+};
+
+struct mlx5_ifc_query_roce_address_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         roce_address_index[0x10];
+	u8         reserved_2[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_rmp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0xc0];
+
+	struct mlx5_ifc_rmpc_bits rmp_context;
+};
+
+struct mlx5_ifc_query_rmp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         rmpn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_2[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_3[0x80];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_query_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_q_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         rx_write_requests[0x20];
+
+	u8         reserved_2[0x20];
+
+	u8         rx_read_requests[0x20];
+
+	u8         reserved_3[0x20];
+
+	u8         rx_atomic_requests[0x20];
+
+	u8         reserved_4[0x20];
+
+	u8         rx_dct_connect[0x20];
+
+	u8         reserved_5[0x20];
+
+	u8         out_of_buffer[0x20];
+
+	u8         reserved_6[0x20];
+
+	u8         out_of_sequence[0x20];
+
+	u8         reserved_7[0x620];
+};
+
+struct mlx5_ifc_query_q_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x80];
+
+	u8         clear[0x1];
+	u8         reserved_3[0x1f];
+
+	u8         reserved_4[0x18];
+	u8         counter_set_id[0x8];
+};
+
+struct mlx5_ifc_query_pages_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x10];
+	u8         function_id[0x10];
+
+	u8         num_pages[0x20];
+};
+
+enum {
+	MLX5_QUERY_PAGES_IN_OP_MOD_BOOT_PAGES     = 0x1,
+	MLX5_QUERY_PAGES_IN_OP_MOD_INIT_PAGES     = 0x2,
+	MLX5_QUERY_PAGES_IN_OP_MOD_REGULAR_PAGES  = 0x3,
+};
+
+struct mlx5_ifc_query_pages_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x10];
+	u8         function_id[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_nic_vport_context_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_nic_vport_context_bits nic_vport_context;
+};
+
+struct mlx5_ifc_query_nic_vport_context_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_3[0x5];
+	u8         allowed_list_type[0x3];
+	u8         reserved_4[0x18];
+};
+
+struct mlx5_ifc_query_mkey_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_mkc_bits memory_key_mkey_entry;
+
+	u8         reserved_2[0x600];
+
+	u8         bsf0_klm0_pas_mtt0_1[16][0x8];
+
+	u8         bsf1_klm1_pas_mtt2_3[16][0x8];
+};
+
+struct mlx5_ifc_query_mkey_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         mkey_index[0x18];
+
+	u8         pg_access[0x1];
+	u8         reserved_3[0x1f];
+};
+
+struct mlx5_ifc_query_mad_demux_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         mad_dumux_parameters_block[0x20];
+};
+
+struct mlx5_ifc_query_mad_demux_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_query_l2_table_entry_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0xa0];
+
+	u8         reserved_2[0x13];
+	u8         vlan_valid[0x1];
+	u8         vlan[0xc];
+
+	struct mlx5_ifc_mac_address_layout_bits mac_address;
+
+	u8         reserved_3[0xc0];
+};
+
+struct mlx5_ifc_query_l2_table_entry_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x60];
+
+	u8         reserved_3[0x8];
+	u8         table_index[0x18];
+
+	u8         reserved_4[0x140];
+};
+
+struct mlx5_ifc_query_issi_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x10];
+	u8         current_issi[0x10];
+
+	u8         reserved_2[0xa0];
+
+	u8         supported_issi_reserved[76][0x8];
+	u8         supported_issi_dw0[0x20];
+};
+
+struct mlx5_ifc_query_issi_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_query_hca_vport_pkey_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_pkey_bits pkey[0];
+};
+
+struct mlx5_ifc_query_hca_vport_pkey_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_3[0x10];
+	u8         pkey_index[0x10];
+};
+
+struct mlx5_ifc_query_hca_vport_gid_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x20];
+
+	u8         gids_num[0x10];
+	u8         reserved_2[0x10];
+
+	struct mlx5_ifc_array128_auto_bits gid[0];
+};
+
+struct mlx5_ifc_query_hca_vport_gid_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_3[0x10];
+	u8         gid_index[0x10];
+};
+
+struct mlx5_ifc_query_hca_vport_context_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_hca_vport_context_bits hca_vport_context;
+};
+
+struct mlx5_ifc_query_hca_vport_context_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_hca_cap_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	union mlx5_ifc_hca_cap_union_bits capability;
+};
+
+struct mlx5_ifc_query_hca_cap_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_query_flow_table_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x80];
+
+	u8         reserved_2[0x8];
+	u8         level[0x8];
+	u8         reserved_3[0x8];
+	u8         log_size[0x8];
+
+	u8         reserved_4[0x120];
+};
+
+struct mlx5_ifc_query_flow_table_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x8];
+	u8         table_id[0x18];
+
+	u8         reserved_5[0x140];
+};
+
+struct mlx5_ifc_query_fte_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x1c0];
+
+	struct mlx5_ifc_flow_context_bits flow_context;
+};
+
+struct mlx5_ifc_query_fte_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x8];
+	u8         table_id[0x18];
+
+	u8         reserved_5[0x40];
+
+	u8         flow_index[0x20];
+
+	u8         reserved_6[0xe0];
+};
+
+enum {
+	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS    = 0x0,
+	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS  = 0x1,
+	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_INNER_HEADERS    = 0x2,
+};
+
+struct mlx5_ifc_query_flow_group_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0xa0];
+
+	u8         start_flow_index[0x20];
+
+	u8         reserved_2[0x20];
+
+	u8         end_flow_index[0x20];
+
+	u8         reserved_3[0xa0];
+
+	u8         reserved_4[0x18];
+	u8         match_criteria_enable[0x8];
+
+	struct mlx5_ifc_fte_match_param_bits match_criteria;
+
+	u8         reserved_5[0xe00];
+};
+
+struct mlx5_ifc_query_flow_group_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x8];
+	u8         table_id[0x18];
+
+	u8         group_id[0x20];
+
+	u8         reserved_5[0x120];
+};
+
+struct mlx5_ifc_query_eq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_eqc_bits eq_context_entry;
+
+	u8         reserved_2[0x40];
+
+	u8         event_bitmask[0x40];
+
+	u8         reserved_3[0x580];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_query_eq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x18];
+	u8         eq_number[0x8];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_dct_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_dctc_bits dct_context_entry;
+
+	u8         reserved_2[0x180];
+};
+
+struct mlx5_ifc_query_dct_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         dctn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_cq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_cqc_bits cq_context;
+
+	u8         reserved_2[0x600];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_query_cq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         cqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_cong_status_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x20];
+
+	u8         enable[0x1];
+	u8         tag_enable[0x1];
+	u8         reserved_2[0x1e];
+};
+
+struct mlx5_ifc_query_cong_status_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x18];
+	u8         priority[0x4];
+	u8         cong_protocol[0x4];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_cong_statistics_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         cur_flows[0x20];
+
+	u8         sum_flows[0x20];
+
+	u8         cnp_ignored_high[0x20];
+
+	u8         cnp_ignored_low[0x20];
+
+	u8         cnp_handled_high[0x20];
+
+	u8         cnp_handled_low[0x20];
+
+	u8         reserved_2[0x100];
+
+	u8         time_stamp_high[0x20];
+
+	u8         time_stamp_low[0x20];
+
+	u8         accumulators_period[0x20];
+
+	u8         ecn_marked_roce_packets_high[0x20];
+
+	u8         ecn_marked_roce_packets_low[0x20];
+
+	u8         cnps_sent_high[0x20];
+
+	u8         cnps_sent_low[0x20];
+
+	u8         reserved_3[0x560];
+};
+
+struct mlx5_ifc_query_cong_statistics_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         clear[0x1];
+	u8         reserved_2[0x1f];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_cong_params_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	union mlx5_ifc_cong_control_roce_ecn_auto_bits congestion_parameters;
+};
+
+struct mlx5_ifc_query_cong_params_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x1c];
+	u8         cong_protocol[0x4];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_query_adapter_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	struct mlx5_ifc_query_adapter_param_block_bits query_adapter_struct;
+};
+
+struct mlx5_ifc_query_adapter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_qp_2rst_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_qp_2rst_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_qp_2err_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_qp_2err_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_page_fault_resume_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_page_fault_resume_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         error[0x1];
+	u8         reserved_2[0x4];
+	u8         rdma[0x1];
+	u8         read_write[0x1];
+	u8         req_res[0x1];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_nop_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_nop_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_modify_vport_state_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_vport_state_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_3[0x18];
+	u8         admin_state[0x4];
+	u8         reserved_4[0x4];
+};
+
+struct mlx5_ifc_modify_tis_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_tis_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         tisn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         modify_bitmask[0x40];
+
+	u8         reserved_4[0x40];
+
+	struct mlx5_ifc_tisc_bits ctx;
+};
+
+struct mlx5_ifc_modify_tir_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_tir_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         tirn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         modify_bitmask[0x40];
+
+	u8         reserved_4[0x40];
+
+	struct mlx5_ifc_tirc_bits ctx;
+};
+
+struct mlx5_ifc_modify_sq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_sq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         sq_state[0x4];
+	u8         reserved_2[0x4];
+	u8         sqn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         modify_bitmask[0x40];
+
+	u8         reserved_4[0x40];
+
+	struct mlx5_ifc_sqc_bits ctx;
+};
+
+struct mlx5_ifc_modify_rqt_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_rqt_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         rqtn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         modify_bitmask[0x40];
+
+	u8         reserved_4[0x40];
+
+	struct mlx5_ifc_rqtc_bits ctx;
+};
+
+struct mlx5_ifc_modify_rq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_rq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         rq_state[0x4];
+	u8         reserved_2[0x4];
+	u8         rqn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         modify_bitmask[0x40];
+
+	u8         reserved_4[0x40];
+
+	struct mlx5_ifc_rqc_bits ctx;
+};
+
+struct mlx5_ifc_modify_rmp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_rmp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         rmp_state[0x4];
+	u8         reserved_2[0x4];
+	u8         rmpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         modify_bitmask[0x40];
+
+	u8         reserved_4[0x40];
+
+	struct mlx5_ifc_rmpc_bits ctx;
+};
+
+struct mlx5_ifc_modify_nic_vport_context_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_nic_vport_field_select_bits {
+	u8         reserved_0[0x1c];
+	u8         permanent_address[0x1];
+	u8         addresses_list[0x1];
+	u8         roce_en[0x1];
+	u8         reserved_1[0x1];
+};
+
+struct mlx5_ifc_modify_nic_vport_context_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	struct mlx5_ifc_modify_nic_vport_field_select_bits field_select;
+
+	u8         reserved_3[0x780];
+
+	struct mlx5_ifc_nic_vport_context_bits nic_vport_context;
+};
+
+struct mlx5_ifc_modify_hca_vport_context_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_hca_vport_context_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         other_vport[0x1];
+	u8         reserved_2[0xf];
+	u8         vport_number[0x10];
+
+	u8         reserved_3[0x20];
+
+	struct mlx5_ifc_hca_vport_context_bits hca_vport_context;
+};
+
+struct mlx5_ifc_modify_cq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+enum {
+	MLX5_MODIFY_CQ_IN_OP_MOD_MODIFY_CQ  = 0x0,
+	MLX5_MODIFY_CQ_IN_OP_MOD_RESIZE_CQ  = 0x1,
+};
+
+struct mlx5_ifc_modify_cq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         cqn[0x18];
+
+	union mlx5_ifc_modify_field_select_resize_field_select_auto_bits modify_field_select_resize_field_select;
+
+	struct mlx5_ifc_cqc_bits cq_context;
+
+	u8         reserved_3[0x600];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_modify_cong_status_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_cong_status_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x18];
+	u8         priority[0x4];
+	u8         cong_protocol[0x4];
+
+	u8         enable[0x1];
+	u8         tag_enable[0x1];
+	u8         reserved_3[0x1e];
+};
+
+struct mlx5_ifc_modify_cong_params_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_modify_cong_params_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x1c];
+	u8         cong_protocol[0x4];
+
+	union mlx5_ifc_field_select_802_1_r_roce_auto_bits field_select;
+
+	u8         reserved_3[0x80];
+
+	union mlx5_ifc_cong_control_roce_ecn_auto_bits congestion_parameters;
+};
+
+struct mlx5_ifc_manage_pages_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         output_num_entries[0x20];
+
+	u8         reserved_1[0x20];
+
+	u8         pas[0][0x40];
+};
+
+enum {
+	MLX5_MANAGE_PAGES_IN_OP_MOD_ALLOCATION_FAIL     = 0x0,
+	MLX5_MANAGE_PAGES_IN_OP_MOD_ALLOCATION_SUCCESS  = 0x1,
+	MLX5_MANAGE_PAGES_IN_OP_MOD_HCA_RETURN_PAGES    = 0x2,
+};
+
+struct mlx5_ifc_manage_pages_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x10];
+	u8         function_id[0x10];
+
+	u8         input_num_entries[0x20];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_mad_ifc_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         response_mad_packet[256][0x8];
+};
+
+struct mlx5_ifc_mad_ifc_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         remote_lid[0x10];
+	u8         reserved_2[0x8];
+	u8         port[0x8];
+
+	u8         reserved_3[0x20];
+
+	u8         mad[256][0x8];
+};
+
+struct mlx5_ifc_init_hca_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_init_hca_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_init2rtr_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_init2rtr_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_4[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_5[0x80];
+};
+
+struct mlx5_ifc_init2init_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_init2init_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_4[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_5[0x80];
+};
+
+struct mlx5_ifc_get_dropped_packet_log_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         packet_headers_log[128][0x8];
+
+	u8         packet_syndrome[64][0x8];
+};
+
+struct mlx5_ifc_get_dropped_packet_log_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_gen_eqe_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x18];
+	u8         eq_number[0x8];
+
+	u8         reserved_3[0x20];
+
+	u8         eqe[64][0x8];
+};
+
+struct mlx5_ifc_gen_eq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_enable_hca_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x20];
+};
+
+struct mlx5_ifc_enable_hca_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x10];
+	u8         function_id[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_drain_dct_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_drain_dct_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         dctn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_disable_hca_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x20];
+};
+
+struct mlx5_ifc_disable_hca_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x10];
+	u8         function_id[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_detach_from_mcg_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_detach_from_mcg_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         multicast_gid[16][0x8];
+};
+
+struct mlx5_ifc_destroy_xrc_srq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_xrc_srq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         xrc_srqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_tis_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_tis_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         tisn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_tir_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_tir_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         tirn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_srq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_srq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         srqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_sq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_sq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         sqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_rqt_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_rqt_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         rqtn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_rq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_rq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         rqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_rmp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_rmp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         rmpn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_psv_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_psv_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         psvn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_mkey_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_mkey_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         mkey_index[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_flow_table_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_flow_table_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x8];
+	u8         table_id[0x18];
+
+	u8         reserved_5[0x140];
+};
+
+struct mlx5_ifc_destroy_flow_group_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_flow_group_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x8];
+	u8         table_id[0x18];
+
+	u8         group_id[0x20];
+
+	u8         reserved_5[0x120];
+};
+
+struct mlx5_ifc_destroy_eq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_eq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x18];
+	u8         eq_number[0x8];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_dct_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_dct_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         dctn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_destroy_cq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_destroy_cq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         cqn[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_delete_vxlan_udp_dport_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_delete_vxlan_udp_dport_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x20];
+
+	u8         reserved_3[0x10];
+	u8         vxlan_udp_port[0x10];
+};
+
+struct mlx5_ifc_delete_l2_table_entry_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_delete_l2_table_entry_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x60];
+
+	u8         reserved_3[0x8];
+	u8         table_index[0x18];
+
+	u8         reserved_4[0x140];
+};
+
+struct mlx5_ifc_delete_fte_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_delete_fte_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x8];
+	u8         table_id[0x18];
+
+	u8         reserved_5[0x40];
+
+	u8         flow_index[0x20];
+
+	u8         reserved_6[0xe0];
+};
+
+struct mlx5_ifc_dealloc_xrcd_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_dealloc_xrcd_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         xrcd[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_dealloc_uar_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_dealloc_uar_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         uar[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_dealloc_transport_domain_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_dealloc_transport_domain_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         transport_domain[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_dealloc_q_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_dealloc_q_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x18];
+	u8         counter_set_id[0x8];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_dealloc_pd_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_dealloc_pd_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_create_xrc_srq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         xrc_srqn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_xrc_srq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	struct mlx5_ifc_xrc_srqc_bits xrc_srq_context_entry;
+
+	u8         reserved_3[0x600];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_create_tis_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         tisn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_tis_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0xc0];
+
+	struct mlx5_ifc_tisc_bits ctx;
+};
+
+struct mlx5_ifc_create_tir_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         tirn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_tir_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0xc0];
+
+	struct mlx5_ifc_tirc_bits ctx;
+};
+
+struct mlx5_ifc_create_srq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         srqn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_srq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	struct mlx5_ifc_srqc_bits srq_context_entry;
+
+	u8         reserved_3[0x600];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_create_sq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         sqn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_sq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0xc0];
+
+	struct mlx5_ifc_sqc_bits ctx;
+};
+
+struct mlx5_ifc_create_rqt_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         rqtn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_rqt_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0xc0];
+
+	struct mlx5_ifc_rqtc_bits rqt_context;
+};
+
+struct mlx5_ifc_create_rq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         rqn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_rq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0xc0];
+
+	struct mlx5_ifc_rqc_bits ctx;
+};
+
+struct mlx5_ifc_create_rmp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         rmpn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_rmp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0xc0];
+
+	struct mlx5_ifc_rmpc_bits ctx;
+};
+
+struct mlx5_ifc_create_qp_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_qp_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         opt_param_mask[0x20];
+
+	u8         reserved_3[0x20];
+
+	struct mlx5_ifc_qpc_bits qpc;
+
+	u8         reserved_4[0x80];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_create_psv_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         reserved_2[0x8];
+	u8         psv0_index[0x18];
+
+	u8         reserved_3[0x8];
+	u8         psv1_index[0x18];
+
+	u8         reserved_4[0x8];
+	u8         psv2_index[0x18];
+
+	u8         reserved_5[0x8];
+	u8         psv3_index[0x18];
+};
+
+struct mlx5_ifc_create_psv_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         num_psv[0x4];
+	u8         reserved_2[0x4];
+	u8         pd[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_create_mkey_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         mkey_index[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_mkey_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x20];
+
+	u8         pg_access[0x1];
+	u8         reserved_3[0x1f];
+
+	struct mlx5_ifc_mkc_bits memory_key_mkey_entry;
+
+	u8         reserved_4[0x80];
+
+	u8         translations_octword_actual_size[0x20];
+
+	u8         reserved_5[0x560];
+
+	u8         klm_pas_mtt[0][0x20];
+};
+
+struct mlx5_ifc_create_flow_table_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         table_id[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_flow_table_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x20];
+
+	u8         reserved_5[0x8];
+	u8         level[0x8];
+	u8         reserved_6[0x8];
+	u8         log_size[0x8];
+
+	u8         reserved_7[0x120];
+};
+
+struct mlx5_ifc_create_flow_group_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         group_id[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+enum {
+	MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS    = 0x0,
+	MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS  = 0x1,
+	MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS    = 0x2,
+};
+
+struct mlx5_ifc_create_flow_group_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	u8         table_type[0x8];
+	u8         reserved_3[0x18];
+
+	u8         reserved_4[0x8];
+	u8         table_id[0x18];
+
+	u8         reserved_5[0x20];
+
+	u8         start_flow_index[0x20];
+
+	u8         reserved_6[0x20];
+
+	u8         end_flow_index[0x20];
+
+	u8         reserved_7[0xa0];
+
+	u8         reserved_8[0x18];
+	u8         match_criteria_enable[0x8];
+
+	struct mlx5_ifc_fte_match_param_bits match_criteria;
+
+	u8         reserved_9[0xe00];
+};
+
+struct mlx5_ifc_create_eq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x18];
+	u8         eq_number[0x8];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_eq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	struct mlx5_ifc_eqc_bits eq_context_entry;
+
+	u8         reserved_3[0x40];
+
+	u8         event_bitmask[0x40];
+
+	u8         reserved_4[0x580];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_create_dct_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         dctn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_dct_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	struct mlx5_ifc_dctc_bits dct_context_entry;
+
+	u8         reserved_3[0x180];
+};
+
+struct mlx5_ifc_create_cq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         cqn[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_create_cq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+
+	struct mlx5_ifc_cqc_bits cq_context;
+
+	u8         reserved_3[0x600];
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_config_int_moderation_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x4];
+	u8         min_delay[0xc];
+	u8         int_vector[0x10];
+
+	u8         reserved_2[0x20];
+};
+
+enum {
+	MLX5_CONFIG_INT_MODERATION_IN_OP_MOD_WRITE  = 0x0,
+	MLX5_CONFIG_INT_MODERATION_IN_OP_MOD_READ   = 0x1,
+};
+
+struct mlx5_ifc_config_int_moderation_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x4];
+	u8         min_delay[0xc];
+	u8         int_vector[0x10];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_attach_to_mcg_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_attach_to_mcg_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         qpn[0x18];
+
+	u8         reserved_3[0x20];
+
+	u8         multicast_gid[16][0x8];
+};
+
+struct mlx5_ifc_arm_xrc_srq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+enum {
+	MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ  = 0x1,
+};
+
+struct mlx5_ifc_arm_xrc_srq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         xrc_srqn[0x18];
+
+	u8         reserved_3[0x10];
+	u8         lwm[0x10];
+};
+
+struct mlx5_ifc_arm_rq_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+enum {
+	MLX5_ARM_RQ_IN_OP_MOD_SRQ_  = 0x1,
+};
+
+struct mlx5_ifc_arm_rq_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         srq_number[0x18];
+
+	u8         reserved_3[0x10];
+	u8         lwm[0x10];
+};
+
+struct mlx5_ifc_arm_dct_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_arm_dct_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x8];
+	u8         dct_number[0x18];
+
+	u8         reserved_3[0x20];
+};
+
+struct mlx5_ifc_alloc_xrcd_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         xrcd[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_alloc_xrcd_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_alloc_uar_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         uar[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_alloc_uar_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_alloc_transport_domain_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         transport_domain[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_alloc_transport_domain_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_alloc_q_counter_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x18];
+	u8         counter_set_id[0x8];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_alloc_q_counter_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_alloc_pd_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x8];
+	u8         pd[0x18];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_alloc_pd_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_add_vxlan_udp_dport_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_add_vxlan_udp_dport_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x20];
+
+	u8         reserved_3[0x10];
+	u8         vxlan_udp_port[0x10];
+};
+
+struct mlx5_ifc_access_register_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_1[0x40];
+
+	u8         register_data[0][0x20];
+};
+
+enum {
+	MLX5_ACCESS_REGISTER_IN_OP_MOD_WRITE  = 0x0,
+	MLX5_ACCESS_REGISTER_IN_OP_MOD_READ   = 0x1,
+};
+
+struct mlx5_ifc_access_register_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x10];
+	u8         register_id[0x10];
+
+	u8         argument[0x20];
+
+	u8         register_data[0][0x20];
+};
+
+struct mlx5_ifc_sltp_reg_bits {
+	u8         status[0x4];
+	u8         version[0x4];
+	u8         local_port[0x8];
+	u8         pnat[0x2];
+	u8         reserved_0[0x2];
+	u8         lane[0x4];
+	u8         reserved_1[0x8];
+
+	u8         reserved_2[0x20];
+
+	u8         reserved_3[0x7];
+	u8         polarity[0x1];
+	u8         ob_tap0[0x8];
+	u8         ob_tap1[0x8];
+	u8         ob_tap2[0x8];
+
+	u8         reserved_4[0xc];
+	u8         ob_preemp_mode[0x4];
+	u8         ob_reg[0x8];
+	u8         ob_bias[0x8];
+
+	u8         reserved_5[0x20];
+};
+
+struct mlx5_ifc_slrg_reg_bits {
+	u8         status[0x4];
+	u8         version[0x4];
+	u8         local_port[0x8];
+	u8         pnat[0x2];
+	u8         reserved_0[0x2];
+	u8         lane[0x4];
+	u8         reserved_1[0x8];
+
+	u8         time_to_link_up[0x10];
+	u8         reserved_2[0xc];
+	u8         grade_lane_speed[0x4];
+
+	u8         grade_version[0x8];
+	u8         grade[0x18];
+
+	u8         reserved_3[0x4];
+	u8         height_grade_type[0x4];
+	u8         height_grade[0x18];
+
+	u8         height_dz[0x10];
+	u8         height_dv[0x10];
+
+	u8         reserved_4[0x10];
+	u8         height_sigma[0x10];
+
+	u8         reserved_5[0x20];
+
+	u8         reserved_6[0x4];
+	u8         phase_grade_type[0x4];
+	u8         phase_grade[0x18];
+
+	u8         reserved_7[0x8];
+	u8         phase_eo_pos[0x8];
+	u8         reserved_8[0x8];
+	u8         phase_eo_neg[0x8];
+
+	u8         ffe_set_tested[0x10];
+	u8         test_errors_per_lane[0x10];
+};
+
+struct mlx5_ifc_pvlc_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0x1c];
+	u8         vl_hw_cap[0x4];
+
+	u8         reserved_3[0x1c];
+	u8         vl_admin[0x4];
+
+	u8         reserved_4[0x1c];
+	u8         vl_operational[0x4];
+};
+
+struct mlx5_ifc_pude_reg_bits {
+	u8         swid[0x8];
+	u8         local_port[0x8];
+	u8         reserved_0[0x4];
+	u8         admin_status[0x4];
+	u8         reserved_1[0x4];
+	u8         oper_status[0x4];
+
+	u8         reserved_2[0x60];
+};
+
+struct mlx5_ifc_ptys_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0xd];
+	u8         proto_mask[0x3];
+
+	u8         reserved_2[0x40];
+
+	u8         eth_proto_capability[0x20];
+
+	u8         ib_link_width_capability[0x10];
+	u8         ib_proto_capability[0x10];
+
+	u8         reserved_3[0x20];
+
+	u8         eth_proto_admin[0x20];
+
+	u8         ib_link_width_admin[0x10];
+	u8         ib_proto_admin[0x10];
+
+	u8         reserved_4[0x20];
+
+	u8         eth_proto_oper[0x20];
+
+	u8         ib_link_width_oper[0x10];
+	u8         ib_proto_oper[0x10];
+
+	u8         reserved_5[0x20];
+
+	u8         eth_proto_lp_advertise[0x20];
+
+	u8         reserved_6[0x60];
+};
+
+struct mlx5_ifc_ptas_reg_bits {
+	u8         reserved_0[0x20];
+
+	u8         algorithm_options[0x10];
+	u8         reserved_1[0x4];
+	u8         repetitions_mode[0x4];
+	u8         num_of_repetitions[0x8];
+
+	u8         grade_version[0x8];
+	u8         height_grade_type[0x4];
+	u8         phase_grade_type[0x4];
+	u8         height_grade_weight[0x8];
+	u8         phase_grade_weight[0x8];
+
+	u8         gisim_measure_bits[0x10];
+	u8         adaptive_tap_measure_bits[0x10];
+
+	u8         ber_bath_high_error_threshold[0x10];
+	u8         ber_bath_mid_error_threshold[0x10];
+
+	u8         ber_bath_low_error_threshold[0x10];
+	u8         one_ratio_high_threshold[0x10];
+
+	u8         one_ratio_high_mid_threshold[0x10];
+	u8         one_ratio_low_mid_threshold[0x10];
+
+	u8         one_ratio_low_threshold[0x10];
+	u8         ndeo_error_threshold[0x10];
+
+	u8         mixer_offset_step_size[0x10];
+	u8         reserved_2[0x8];
+	u8         mix90_phase_for_voltage_bath[0x8];
+
+	u8         mixer_offset_start[0x10];
+	u8         mixer_offset_end[0x10];
+
+	u8         reserved_3[0x15];
+	u8         ber_test_time[0xb];
+};
+
+struct mlx5_ifc_pspa_reg_bits {
+	u8         swid[0x8];
+	u8         local_port[0x8];
+	u8         sub_port[0x8];
+	u8         reserved_0[0x8];
+
+	u8         reserved_1[0x20];
+};
+
+struct mlx5_ifc_pqdr_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x5];
+	u8         prio[0x3];
+	u8         reserved_2[0x6];
+	u8         mode[0x2];
+
+	u8         reserved_3[0x20];
+
+	u8         reserved_4[0x10];
+	u8         min_threshold[0x10];
+
+	u8         reserved_5[0x10];
+	u8         max_threshold[0x10];
+
+	u8         reserved_6[0x10];
+	u8         mark_probability_denominator[0x10];
+
+	u8         reserved_7[0x60];
+};
+
+struct mlx5_ifc_ppsc_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0x60];
+
+	u8         reserved_3[0x1c];
+	u8         wrps_admin[0x4];
+
+	u8         reserved_4[0x1c];
+	u8         wrps_status[0x4];
+
+	u8         reserved_5[0x8];
+	u8         up_threshold[0x8];
+	u8         reserved_6[0x8];
+	u8         down_threshold[0x8];
+
+	u8         reserved_7[0x20];
+
+	u8         reserved_8[0x1c];
+	u8         srps_admin[0x4];
+
+	u8         reserved_9[0x1c];
+	u8         srps_status[0x4];
+
+	u8         reserved_10[0x40];
+};
+
+struct mlx5_ifc_pplr_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0x8];
+	u8         lb_cap[0x8];
+	u8         reserved_3[0x8];
+	u8         lb_en[0x8];
+};
+
+struct mlx5_ifc_pplm_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0x20];
+
+	u8         port_profile_mode[0x8];
+	u8         static_port_profile[0x8];
+	u8         active_port_profile[0x8];
+	u8         reserved_3[0x8];
+
+	u8         retransmission_active[0x8];
+	u8         fec_mode_active[0x18];
+
+	u8         reserved_4[0x20];
+};
+
+struct mlx5_ifc_ppcnt_reg_bits {
+	u8         swid[0x8];
+	u8         local_port[0x8];
+	u8         pnat[0x2];
+	u8         reserved_0[0x8];
+	u8         grp[0x6];
+
+	u8         clr[0x1];
+	u8         reserved_1[0x1c];
+	u8         prio_tc[0x3];
+
+	union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
+};
+
+struct mlx5_ifc_ppad_reg_bits {
+	u8         reserved_0[0x3];
+	u8         single_mac[0x1];
+	u8         reserved_1[0x4];
+	u8         local_port[0x8];
+	u8         mac_47_32[0x10];
+
+	u8         mac_31_0[0x20];
+
+	u8         reserved_2[0x40];
+};
+
+struct mlx5_ifc_pmtu_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         max_mtu[0x10];
+	u8         reserved_2[0x10];
+
+	u8         admin_mtu[0x10];
+	u8         reserved_3[0x10];
+
+	u8         oper_mtu[0x10];
+	u8         reserved_4[0x10];
+};
+
+struct mlx5_ifc_pmpr_reg_bits {
+	u8         reserved_0[0x8];
+	u8         module[0x8];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0x18];
+	u8         attenuation_5g[0x8];
+
+	u8         reserved_3[0x18];
+	u8         attenuation_7g[0x8];
+
+	u8         reserved_4[0x18];
+	u8         attenuation_12g[0x8];
+};
+
+struct mlx5_ifc_pmpe_reg_bits {
+	u8         reserved_0[0x8];
+	u8         module[0x8];
+	u8         reserved_1[0xc];
+	u8         module_status[0x4];
+
+	u8         reserved_2[0x60];
+};
+
+struct mlx5_ifc_pmpc_reg_bits {
+	u8         module_state_updated[32][0x8];
+};
+
+struct mlx5_ifc_pmlpn_reg_bits {
+	u8         reserved_0[0x4];
+	u8         mlpn_status[0x4];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         e[0x1];
+	u8         reserved_2[0x1f];
+};
+
+struct mlx5_ifc_pmlp_reg_bits {
+	u8         rxtx[0x1];
+	u8         reserved_0[0x7];
+	u8         local_port[0x8];
+	u8         reserved_1[0x8];
+	u8         width[0x8];
+
+	u8         lane0_module_mapping[0x20];
+
+	u8         lane1_module_mapping[0x20];
+
+	u8         lane2_module_mapping[0x20];
+
+	u8         lane3_module_mapping[0x20];
+
+	u8         reserved_2[0x160];
+};
+
+struct mlx5_ifc_pmaos_reg_bits {
+	u8         reserved_0[0x8];
+	u8         module[0x8];
+	u8         reserved_1[0x4];
+	u8         admin_status[0x4];
+	u8         reserved_2[0x4];
+	u8         oper_status[0x4];
+
+	u8         ase[0x1];
+	u8         ee[0x1];
+	u8         reserved_3[0x1c];
+	u8         e[0x2];
+
+	u8         reserved_4[0x40];
+};
+
+struct mlx5_ifc_plpc_reg_bits {
+	u8         reserved_0[0x4];
+	u8         profile_id[0xc];
+	u8         reserved_1[0x4];
+	u8         proto_mask[0x4];
+	u8         reserved_2[0x8];
+
+	u8         reserved_3[0x10];
+	u8         lane_speed[0x10];
+
+	u8         reserved_4[0x17];
+	u8         lpbf[0x1];
+	u8         fec_mode_policy[0x8];
+
+	u8         retransmission_capability[0x8];
+	u8         fec_mode_capability[0x18];
+
+	u8         retransmission_support_admin[0x8];
+	u8         fec_mode_support_admin[0x18];
+
+	u8         retransmission_request_admin[0x8];
+	u8         fec_mode_request_admin[0x18];
+
+	u8         reserved_5[0x80];
+};
+
+struct mlx5_ifc_plib_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x8];
+	u8         ib_port[0x8];
+
+	u8         reserved_2[0x60];
+};
+
+struct mlx5_ifc_plbf_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0xd];
+	u8         lbf_mode[0x3];
+
+	u8         reserved_2[0x20];
+};
+
+struct mlx5_ifc_pipg_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         dic[0x1];
+	u8         reserved_2[0x19];
+	u8         ipg[0x4];
+	u8         reserved_3[0x2];
+};
+
+struct mlx5_ifc_pifr_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0xe0];
+
+	u8         port_filter[8][0x20];
+
+	u8         port_filter_update_en[8][0x20];
+};
+
+struct mlx5_ifc_pfcc_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         ppan[0x4];
+	u8         reserved_2[0x4];
+	u8         prio_mask_tx[0x8];
+	u8         reserved_3[0x8];
+	u8         prio_mask_rx[0x8];
+
+	u8         pptx[0x1];
+	u8         aptx[0x1];
+	u8         reserved_4[0x6];
+	u8         pfctx[0x8];
+	u8         reserved_5[0x10];
+
+	u8         pprx[0x1];
+	u8         aprx[0x1];
+	u8         reserved_6[0x6];
+	u8         pfcrx[0x8];
+	u8         reserved_7[0x10];
+
+	u8         reserved_8[0x80];
+};
+
+struct mlx5_ifc_pelc_reg_bits {
+	u8         op[0x4];
+	u8         reserved_0[0x4];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         op_admin[0x8];
+	u8         op_capability[0x8];
+	u8         op_request[0x8];
+	u8         op_active[0x8];
+
+	u8         admin[0x40];
+
+	u8         capability[0x40];
+
+	u8         request[0x40];
+
+	u8         active[0x40];
+
+	u8         reserved_2[0x80];
+};
+
+struct mlx5_ifc_peir_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         reserved_2[0xc];
+	u8         error_count[0x4];
+	u8         reserved_3[0x10];
+
+	u8         reserved_4[0xc];
+	u8         lane[0x4];
+	u8         reserved_5[0x8];
+	u8         error_type[0x8];
+};
+
+struct mlx5_ifc_pcap_reg_bits {
+	u8         reserved_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_1[0x10];
+
+	u8         port_capability_mask[4][0x20];
+};
+
+struct mlx5_ifc_paos_reg_bits {
+	u8         swid[0x8];
+	u8         local_port[0x8];
+	u8         reserved_0[0x4];
+	u8         admin_status[0x4];
+	u8         reserved_1[0x4];
+	u8         oper_status[0x4];
+
+	u8         ase[0x1];
+	u8         ee[0x1];
+	u8         reserved_2[0x1c];
+	u8         e[0x2];
+
+	u8         reserved_3[0x40];
+};
+
+struct mlx5_ifc_pamp_reg_bits {
+	u8         reserved_0[0x8];
+	u8         opamp_group[0x8];
+	u8         reserved_1[0xc];
+	u8         opamp_group_type[0x4];
+
+	u8         start_index[0x10];
+	u8         reserved_2[0x4];
+	u8         num_of_indices[0xc];
+
+	u8         index_data[18][0x10];
+};
+
+struct mlx5_ifc_lane_2_module_mapping_bits {
+	u8         reserved_0[0x6];
+	u8         rx_lane[0x2];
+	u8         reserved_1[0x6];
+	u8         tx_lane[0x2];
+	u8         reserved_2[0x8];
+	u8         module[0x8];
+};
+
+struct mlx5_ifc_bufferx_reg_bits {
+	u8         reserved_0[0x6];
+	u8         lossy[0x1];
+	u8         epsb[0x1];
+	u8         reserved_1[0xc];
+	u8         size[0xc];
+
+	u8         xoff_threshold[0x10];
+	u8         xon_threshold[0x10];
+};
+
+struct mlx5_ifc_set_node_in_bits {
+	u8         node_description[64][0x8];
+};
+
+struct mlx5_ifc_register_power_settings_bits {
+	u8         reserved_0[0x18];
+	u8         power_settings_level[0x8];
+
+	u8         reserved_1[0x60];
+};
+
+struct mlx5_ifc_register_host_endianness_bits {
+	u8         he[0x1];
+	u8         reserved_0[0x1f];
+
+	u8         reserved_1[0x60];
+};
+
+struct mlx5_ifc_umr_pointer_desc_argument_bits {
+	u8         reserved_0[0x20];
+
+	u8         mkey[0x20];
+
+	u8         addressh_63_32[0x20];
+
+	u8         addressl_31_0[0x20];
+};
+
+struct mlx5_ifc_ud_adrs_vector_bits {
+	u8         dc_key[0x40];
+
+	u8         ext[0x1];
+	u8         reserved_0[0x7];
+	u8         destination_qp_dct[0x18];
+
+	u8         static_rate[0x4];
+	u8         sl_eth_prio[0x4];
+	u8         fl[0x1];
+	u8         mlid[0x7];
+	u8         rlid_udp_sport[0x10];
+
+	u8         reserved_1[0x20];
+
+	u8         rmac_47_16[0x20];
+
+	u8         rmac_15_0[0x10];
+	u8         tclass[0x8];
+	u8         hop_limit[0x8];
+
+	u8         reserved_2[0x1];
+	u8         grh[0x1];
+	u8         reserved_3[0x2];
+	u8         src_addr_index[0x8];
+	u8         flow_label[0x14];
+
+	u8         rgid_rip[16][0x8];
+};
+
+struct mlx5_ifc_pages_req_event_bits {
+	u8         reserved_0[0x10];
+	u8         function_id[0x10];
+
+	u8         num_pages[0x20];
+
+	u8         reserved_1[0xa0];
+};
+
+struct mlx5_ifc_eqe_bits {
+	u8         reserved_0[0x8];
+	u8         event_type[0x8];
+	u8         reserved_1[0x8];
+	u8         event_sub_type[0x8];
+
+	u8         reserved_2[0xe0];
+
+	union mlx5_ifc_event_auto_bits event_data;
+
+	u8         reserved_3[0x10];
+	u8         signature[0x8];
+	u8         reserved_4[0x7];
+	u8         owner[0x1];
+};
+
+enum {
+	MLX5_CMD_QUEUE_ENTRY_TYPE_PCIE_CMD_IF_TRANSPORT  = 0x7,
+};
+
+struct mlx5_ifc_cmd_queue_entry_bits {
+	u8         type[0x8];
+	u8         reserved_0[0x18];
+
+	u8         input_length[0x20];
+
+	u8         input_mailbox_pointer_63_32[0x20];
+
+	u8         input_mailbox_pointer_31_9[0x17];
+	u8         reserved_1[0x9];
+
+	u8         command_input_inline_data[16][0x8];
+
+	u8         command_output_inline_data[16][0x8];
+
+	u8         output_mailbox_pointer_63_32[0x20];
+
+	u8         output_mailbox_pointer_31_9[0x17];
+	u8         reserved_2[0x9];
+
+	u8         output_length[0x20];
+
+	u8         token[0x8];
+	u8         signature[0x8];
+	u8         reserved_3[0x8];
+	u8         status[0x7];
+	u8         ownership[0x1];
+};
+
+struct mlx5_ifc_cmd_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         command_output[0x20];
+};
+
+struct mlx5_ifc_cmd_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         command[0][0x20];
+};
+
+struct mlx5_ifc_cmd_if_box_bits {
+	u8         mailbox_data[512][0x8];
+
+	u8         reserved_0[0x180];
+
+	u8         next_pointer_63_32[0x20];
+
+	u8         next_pointer_31_10[0x16];
+	u8         reserved_1[0xa];
+
+	u8         block_number[0x20];
+
+	u8         reserved_2[0x8];
+	u8         token[0x8];
+	u8         ctrl_signature[0x8];
+	u8         signature[0x8];
+};
+
+struct mlx5_ifc_mtt_bits {
+	u8         ptag_63_32[0x20];
+
+	u8         ptag_31_8[0x18];
+	u8         reserved_0[0x6];
+	u8         wr_en[0x1];
+	u8         rd_en[0x1];
+};
+
+enum {
+	MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER  = 0x0,
+	MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED     = 0x1,
+	MLX5_INITIAL_SEG_NIC_INTERFACE_NO_DRAM_NIC  = 0x2,
+};
+
+enum {
+	MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_FULL_DRIVER  = 0x0,
+	MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_DISABLED     = 0x1,
+	MLX5_INITIAL_SEG_NIC_INTERFACE_SUPPORTED_NO_DRAM_NIC  = 0x2,
+};
+
+enum {
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_INTERNAL_ERR              = 0x1,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_DEAD_IRISC                   = 0x7,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_HW_FATAL_ERR                 = 0x8,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_FW_CRC_ERR                   = 0x9,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_FETCH_PCI_ERR            = 0xa,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_ICM_PAGE_ERR                 = 0xb,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_ASYNCHRONOUS_EQ_BUF_OVERRUN  = 0xc,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_IN_ERR                    = 0xd,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_EQ_INV                       = 0xe,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_FFSER_ERR                    = 0xf,
+	MLX5_INITIAL_SEG_HEALTH_SYNDROME_HIGH_TEMP_ERR                = 0x10,
+};
+
+struct mlx5_ifc_initial_seg_bits {
+	u8         fw_rev_minor[0x10];
+	u8         fw_rev_major[0x10];
+
+	u8         cmd_interface_rev[0x10];
+	u8         fw_rev_subminor[0x10];
+
+	u8         reserved_0[0x40];
+
+	u8         cmdq_phy_addr_63_32[0x20];
+
+	u8         cmdq_phy_addr_31_12[0x14];
+	u8         reserved_1[0x2];
+	u8         nic_interface[0x2];
+	u8         log_cmdq_size[0x4];
+	u8         log_cmdq_stride[0x4];
+
+	u8         command_doorbell_vector[0x20];
+
+	u8         reserved_2[0xf00];
+
+	u8         initializing[0x1];
+	u8         reserved_3[0x4];
+	u8         nic_interface_supported[0x3];
+	u8         reserved_4[0x18];
+
+	struct mlx5_ifc_health_buffer_bits health_buffer;
+
+	u8         no_dram_nic_offset[0x20];
+
+	u8         reserved_5[0x6e40];
+
+	u8         reserved_6[0x1f];
+	u8         clear_int[0x1];
+
+	u8         health_syndrome[0x8];
+	u8         health_counter[0x18];
+
+	u8         reserved_7[0x17fc0];
+};
+
+union mlx5_ifc_ports_control_registers_document_bits {
+	struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
+	struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits eth_3635_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits eth_extended_cntrs_grp_data_layout;
+	struct mlx5_ifc_eth_per_prio_grp_data_layout_bits eth_per_prio_grp_data_layout;
+	struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits eth_per_traffic_grp_data_layout;
+	struct mlx5_ifc_lane_2_module_mapping_bits lane_2_module_mapping;
+	struct mlx5_ifc_pamp_reg_bits pamp_reg;
+	struct mlx5_ifc_paos_reg_bits paos_reg;
+	struct mlx5_ifc_pcap_reg_bits pcap_reg;
+	struct mlx5_ifc_peir_reg_bits peir_reg;
+	struct mlx5_ifc_pelc_reg_bits pelc_reg;
+	struct mlx5_ifc_pfcc_reg_bits pfcc_reg;
+	struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs;
+	struct mlx5_ifc_pifr_reg_bits pifr_reg;
+	struct mlx5_ifc_pipg_reg_bits pipg_reg;
+	struct mlx5_ifc_plbf_reg_bits plbf_reg;
+	struct mlx5_ifc_plib_reg_bits plib_reg;
+	struct mlx5_ifc_plpc_reg_bits plpc_reg;
+	struct mlx5_ifc_pmaos_reg_bits pmaos_reg;
+	struct mlx5_ifc_pmlp_reg_bits pmlp_reg;
+	struct mlx5_ifc_pmlpn_reg_bits pmlpn_reg;
+	struct mlx5_ifc_pmpc_reg_bits pmpc_reg;
+	struct mlx5_ifc_pmpe_reg_bits pmpe_reg;
+	struct mlx5_ifc_pmpr_reg_bits pmpr_reg;
+	struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
+	struct mlx5_ifc_ppad_reg_bits ppad_reg;
+	struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
+	struct mlx5_ifc_pplm_reg_bits pplm_reg;
+	struct mlx5_ifc_pplr_reg_bits pplr_reg;
+	struct mlx5_ifc_ppsc_reg_bits ppsc_reg;
+	struct mlx5_ifc_pqdr_reg_bits pqdr_reg;
+	struct mlx5_ifc_pspa_reg_bits pspa_reg;
+	struct mlx5_ifc_ptas_reg_bits ptas_reg;
+	struct mlx5_ifc_ptys_reg_bits ptys_reg;
+	struct mlx5_ifc_pude_reg_bits pude_reg;
+	struct mlx5_ifc_pvlc_reg_bits pvlc_reg;
+	struct mlx5_ifc_slrg_reg_bits slrg_reg;
+	struct mlx5_ifc_sltp_reg_bits sltp_reg;
+	u8         reserved_0[0x60e0];
+};
+
+union mlx5_ifc_debug_enhancements_document_bits {
+	struct mlx5_ifc_health_buffer_bits health_buffer;
+	u8         reserved_0[0x200];
+};
+
+union mlx5_ifc_uplink_pci_interface_document_bits {
+	struct mlx5_ifc_initial_seg_bits initial_seg;
+	u8         reserved_0[0x20060];
 };
 
 #endif /* MLX5_IFC_H */
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 310b5f7fd6ae..f079fb1a31f7 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -134,13 +134,21 @@ enum {
 
 enum {
 	MLX5_WQE_CTRL_CQ_UPDATE		= 2 << 2,
+	MLX5_WQE_CTRL_CQ_UPDATE_AND_EQE	= 3 << 2,
 	MLX5_WQE_CTRL_SOLICITED		= 1 << 1,
 };
 
 enum {
+	MLX5_SEND_WQE_DS	= 16,
 	MLX5_SEND_WQE_BB	= 64,
 };
 
+#define MLX5_SEND_WQEBB_NUM_DS	(MLX5_SEND_WQE_BB / MLX5_SEND_WQE_DS)
+
+enum {
+	MLX5_SEND_WQE_MAX_WQEBBS	= 16,
+};
+
 enum {
 	MLX5_WQE_FMR_PERM_LOCAL_READ	= 1 << 27,
 	MLX5_WQE_FMR_PERM_LOCAL_WRITE	= 1 << 28,
@@ -200,6 +208,23 @@ struct mlx5_wqe_ctrl_seg {
 #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00
 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8
 
+enum {
+	MLX5_ETH_WQE_L3_INNER_CSUM      = 1 << 4,
+	MLX5_ETH_WQE_L4_INNER_CSUM      = 1 << 5,
+	MLX5_ETH_WQE_L3_CSUM            = 1 << 6,
+	MLX5_ETH_WQE_L4_CSUM            = 1 << 7,
+};
+
+struct mlx5_wqe_eth_seg {
+	u8              rsvd0[4];
+	u8              cs_flags;
+	u8              rsvd1;
+	__be16          mss;
+	__be32          rsvd2;
+	__be16          inline_hdr_sz;
+	u8              inline_hdr_start[2];
+};
+
 struct mlx5_wqe_xrc_seg {
 	__be32			xrc_srqn;
 	u8			rsvd[12];
-- 
cgit v1.2.3


From 938fe83c8dcbbf294d167e6163200a8540ae43c4 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 28 May 2015 22:28:41 +0300
Subject: net/mlx5_core: New device capabilities handling

- Query all supported types of dev caps on driver load.
- Store the Cap data outbox per cap type into driver private data.
- Introduce new Macros to access/dump stored caps (using the auto
  generated data types).
- Obsolete SW representation of dev caps (no need for SW copy for each
  cap).
- Modify IB driver to use new macros for checking caps.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/cq.c                    |   8 +-
 drivers/infiniband/hw/mlx5/mad.c                   |   2 +-
 drivers/infiniband/hw/mlx5/main.c                  | 113 ++++++++-------
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |   6 +-
 drivers/infiniband/hw/mlx5/mr.c                    |   3 +-
 drivers/infiniband/hw/mlx5/odp.c                   |  47 +++----
 drivers/infiniband/hw/mlx5/qp.c                    |  84 +++++------
 drivers/infiniband/hw/mlx5/srq.c                   |   7 +-
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/fw.c       |  90 +++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 154 +++++++--------------
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  10 +-
 drivers/net/ethernet/mellanox/mlx5/core/uar.c      |   7 +-
 include/linux/mlx5/device.h                        |  66 ++++++++-
 include/linux/mlx5/driver.h                        |  58 +-------
 15 files changed, 310 insertions(+), 349 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 4e88b18cf62e..e2bea9ab93b3 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -753,7 +753,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
 		return ERR_PTR(-EINVAL);
 
 	entries = roundup_pow_of_two(entries + 1);
-	if (entries > dev->mdev->caps.gen.max_cqes)
+	if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
 		return ERR_PTR(-EINVAL);
 
 	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
@@ -920,7 +920,7 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 	int err;
 	u32 fsel;
 
-	if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_CQ_MODER))
+	if (!MLX5_CAP_GEN(dev->mdev, cq_moderation))
 		return -ENOSYS;
 
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
@@ -1075,7 +1075,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 	int uninitialized_var(cqe_size);
 	unsigned long flags;
 
-	if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) {
+	if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) {
 		pr_info("Firmware does not support resize CQ\n");
 		return -ENOSYS;
 	}
@@ -1084,7 +1084,7 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
 		return -EINVAL;
 
 	entries = roundup_pow_of_two(entries + 1);
-	if (entries > dev->mdev->caps.gen.max_cqes + 1)
+	if (entries >  (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1)
 		return -EINVAL;
 
 	if (entries == ibcq->cqe + 1)
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 9cf9a37bb5ff..f2d9e70818d7 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -129,7 +129,7 @@ int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port)
 
 	packet_error = be16_to_cpu(out_mad->status);
 
-	dev->mdev->caps.gen.ext_port_cap[port - 1] = (!err && !packet_error) ?
+	dev->mdev->port_caps[port - 1].ext_port_cap = (!err && !packet_error) ?
 		MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0;
 
 out:
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 57c9809e8b87..9075649f30fc 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -66,15 +66,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 				struct ib_device_attr *props)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_core_dev *mdev = dev->mdev;
 	struct ib_smp *in_mad  = NULL;
 	struct ib_smp *out_mad = NULL;
-	struct mlx5_general_caps *gen;
 	int err = -ENOMEM;
 	int max_rq_sg;
 	int max_sq_sg;
-	u64 flags;
 
-	gen = &dev->mdev->caps.gen;
 	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
 	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
 	if (!in_mad || !out_mad)
@@ -96,18 +94,18 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		IB_DEVICE_PORT_ACTIVE_EVENT		|
 		IB_DEVICE_SYS_IMAGE_GUID		|
 		IB_DEVICE_RC_RNR_NAK_GEN;
-	flags = gen->flags;
-	if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+
+	if (MLX5_CAP_GEN(mdev, pkv))
 		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
-	if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+	if (MLX5_CAP_GEN(mdev, qkv))
 		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
-	if (flags & MLX5_DEV_CAP_FLAG_APM)
+	if (MLX5_CAP_GEN(mdev, apm))
 		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
 	props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
-	if (flags & MLX5_DEV_CAP_FLAG_XRC)
+	if (MLX5_CAP_GEN(mdev, xrc))
 		props->device_cap_flags |= IB_DEVICE_XRC;
 	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
-	if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) {
+	if (MLX5_CAP_GEN(mdev, sho)) {
 		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
 		/* At this stage no support for signature handover */
 		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
@@ -116,7 +114,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
 				       IB_GUARD_T10DIF_CSUM;
 	}
-	if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)
+	if (MLX5_CAP_GEN(mdev, block_lb_mc))
 		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
 
 	props->vendor_id	   = be32_to_cpup((__be32 *)(out_mad->data + 36)) &
@@ -126,37 +124,38 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 	memcpy(&props->sys_image_guid, out_mad->data +	4, 8);
 
 	props->max_mr_size	   = ~0ull;
-	props->page_size_cap	   = gen->min_page_sz;
-	props->max_qp		   = 1 << gen->log_max_qp;
-	props->max_qp_wr	   = gen->max_wqes;
-	max_rq_sg = gen->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg);
-	max_sq_sg = (gen->max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) /
-		sizeof(struct mlx5_wqe_data_seg);
+	props->page_size_cap	   = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
+	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
+	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
+	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
+		     sizeof(struct mlx5_wqe_data_seg);
+	max_sq_sg = (MLX5_CAP_GEN(mdev, max_wqe_sz_sq) -
+		     sizeof(struct mlx5_wqe_ctrl_seg)) /
+		     sizeof(struct mlx5_wqe_data_seg);
 	props->max_sge = min(max_rq_sg, max_sq_sg);
-	props->max_cq		   = 1 << gen->log_max_cq;
-	props->max_cqe		   = gen->max_cqes - 1;
-	props->max_mr		   = 1 << gen->log_max_mkey;
-	props->max_pd		   = 1 << gen->log_max_pd;
-	props->max_qp_rd_atom	   = 1 << gen->log_max_ra_req_qp;
-	props->max_qp_init_rd_atom = 1 << gen->log_max_ra_res_qp;
-	props->max_srq		   = 1 << gen->log_max_srq;
-	props->max_srq_wr	   = gen->max_srq_wqes - 1;
-	props->local_ca_ack_delay  = gen->local_ca_ack_delay;
+	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
+	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1;
+	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
+	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
+	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
+	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
+	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
+	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
+	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
 	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
 	props->max_srq_sge	   = max_rq_sg - 1;
 	props->max_fast_reg_page_list_len = (unsigned int)-1;
-	props->local_ca_ack_delay  = gen->local_ca_ack_delay;
 	props->atomic_cap	   = IB_ATOMIC_NONE;
 	props->masked_atomic_cap   = IB_ATOMIC_NONE;
 	props->max_pkeys	   = be16_to_cpup((__be16 *)(out_mad->data + 28));
-	props->max_mcast_grp	   = 1 << gen->log_max_mcg;
-	props->max_mcast_qp_attach = gen->max_qp_mcg;
+	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
+	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
 	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
 					   props->max_mcast_grp;
 	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
+	if (MLX5_CAP_GEN(mdev, pg))
 		props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
 	props->odp_caps = dev->odp_caps;
 #endif
@@ -172,14 +171,13 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
 		       struct ib_port_attr *props)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_core_dev *mdev = dev->mdev;
 	struct ib_smp *in_mad  = NULL;
 	struct ib_smp *out_mad = NULL;
-	struct mlx5_general_caps *gen;
 	int ext_active_speed;
 	int err = -ENOMEM;
 
-	gen = &dev->mdev->caps.gen;
-	if (port < 1 || port > gen->num_ports) {
+	if (port < 1 || port > MLX5_CAP_GEN(mdev, num_ports)) {
 		mlx5_ib_warn(dev, "invalid port number %d\n", port);
 		return -EINVAL;
 	}
@@ -210,8 +208,8 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
 	props->phys_state	= out_mad->data[33] >> 4;
 	props->port_cap_flags	= be32_to_cpup((__be32 *)(out_mad->data + 20));
 	props->gid_tbl_len	= out_mad->data[50];
-	props->max_msg_sz	= 1 << gen->log_max_msg;
-	props->pkey_tbl_len	= gen->port[port - 1].pkey_table_len;
+	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
+	props->pkey_tbl_len	= mdev->port_caps[port - 1].pkey_table_len;
 	props->bad_pkey_cntr	= be16_to_cpup((__be16 *)(out_mad->data + 46));
 	props->qkey_viol_cntr	= be16_to_cpup((__be16 *)(out_mad->data + 48));
 	props->active_width	= out_mad->data[31] & 0xf;
@@ -238,7 +236,7 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
 
 	/* If reported active speed is QDR, check if is FDR-10 */
 	if (props->active_speed == 4) {
-		if (gen->ext_port_cap[port - 1] &
+		if (mdev->port_caps[port - 1].ext_port_cap &
 		    MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) {
 			init_query_mad(in_mad);
 			in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO;
@@ -392,7 +390,6 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	struct mlx5_ib_alloc_ucontext_req_v2 req;
 	struct mlx5_ib_alloc_ucontext_resp resp;
 	struct mlx5_ib_ucontext *context;
-	struct mlx5_general_caps *gen;
 	struct mlx5_uuar_info *uuari;
 	struct mlx5_uar *uars;
 	int gross_uuars;
@@ -403,7 +400,6 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	int i;
 	size_t reqlen;
 
-	gen = &dev->mdev->caps.gen;
 	if (!dev->ib_active)
 		return ERR_PTR(-EAGAIN);
 
@@ -436,14 +432,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 
 	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
 	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
-	resp.qp_tab_size      = 1 << gen->log_max_qp;
-	resp.bf_reg_size      = gen->bf_reg_size;
-	resp.cache_line_size  = L1_CACHE_BYTES;
-	resp.max_sq_desc_sz = gen->max_sq_desc_sz;
-	resp.max_rq_desc_sz = gen->max_rq_desc_sz;
-	resp.max_send_wqebb = gen->max_wqes;
-	resp.max_recv_wr = gen->max_wqes;
-	resp.max_srq_recv_wr = gen->max_srq_wqes;
+	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
+	resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
+	resp.cache_line_size = L1_CACHE_BYTES;
+	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
+	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
+	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
+	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
+	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
 
 	context = kzalloc(sizeof(*context), GFP_KERNEL);
 	if (!context)
@@ -493,7 +489,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	mutex_init(&context->db_page_mutex);
 
 	resp.tot_uuars = req.total_num_uuars;
-	resp.num_ports = gen->num_ports;
+	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
 	err = ib_copy_to_udata(udata, &resp,
 			       sizeof(resp) - sizeof(resp.reserved));
 	if (err)
@@ -895,11 +891,9 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
 
 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_general_caps *gen;
 	int port;
 
-	gen = &dev->mdev->caps.gen;
-	for (port = 1; port <= gen->num_ports; port++)
+	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
 		mlx5_query_ext_port_caps(dev, port);
 }
 
@@ -907,11 +901,9 @@ static int get_port_caps(struct mlx5_ib_dev *dev)
 {
 	struct ib_device_attr *dprops = NULL;
 	struct ib_port_attr *pprops = NULL;
-	struct mlx5_general_caps *gen;
 	int err = -ENOMEM;
 	int port;
 
-	gen = &dev->mdev->caps.gen;
 	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
 	if (!pprops)
 		goto out;
@@ -926,14 +918,17 @@ static int get_port_caps(struct mlx5_ib_dev *dev)
 		goto out;
 	}
 
-	for (port = 1; port <= gen->num_ports; port++) {
+	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
 		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
 		if (err) {
-			mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err);
+			mlx5_ib_warn(dev, "query_port %d failed %d\n",
+				     port, err);
 			break;
 		}
-		gen->port[port - 1].pkey_table_len = dprops->max_pkeys;
-		gen->port[port - 1].gid_table_len = pprops->gid_tbl_len;
+		dev->mdev->port_caps[port - 1].pkey_table_len =
+						dprops->max_pkeys;
+		dev->mdev->port_caps[port - 1].gid_table_len =
+						pprops->gid_tbl_len;
 		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
 			    dprops->max_pkeys, pprops->gid_tbl_len);
 	}
@@ -1207,8 +1202,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
 	dev->ib_dev.owner		= THIS_MODULE;
 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
-	dev->ib_dev.local_dma_lkey	= mdev->caps.gen.reserved_lkey;
-	dev->num_ports		= mdev->caps.gen.num_ports;
+	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
+	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
 	dev->ib_dev.phys_port_cnt     = dev->num_ports;
 	dev->ib_dev.num_comp_vectors    =
 		dev->mdev->priv.eq_table.num_comp_vectors;
@@ -1286,9 +1281,9 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
 	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
 
-	mlx5_ib_internal_query_odp_caps(dev);
+	mlx5_ib_internal_fill_odp_caps(dev);
 
-	if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) {
+	if (MLX5_CAP_GEN(mdev, xrc)) {
 		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
 		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
 		dev->ib_dev.uverbs_cmd_mask |=
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index dff1cfcdf476..0c441add0464 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -617,7 +617,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 extern struct workqueue_struct *mlx5_ib_page_fault_wq;
 
-int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
+void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev);
 void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
 			       struct mlx5_ib_pfault *pfault);
 void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
@@ -631,9 +631,9 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end);
 
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
-static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
+static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
-	return 0;
+	return;
 }
 
 static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)		{}
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 71c593583864..bc9a0de897cb 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -975,8 +975,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
 	struct mlx5_ib_mr *mr;
 	int inlen;
 	int err;
-	bool pg_cap = !!(dev->mdev->caps.gen.flags &
-			 MLX5_DEV_CAP_FLAG_ON_DMND_PG);
+	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
 
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 5099db08afd2..aa8391e75385 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -109,40 +109,33 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 	ib_umem_odp_unmap_dma_pages(umem, start, end);
 }
 
-#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {	\
-	if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)	\
-		ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;	\
-} while (0)
-
-int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
+void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
-	int err;
-	struct mlx5_odp_caps hw_caps;
 	struct ib_odp_caps *caps = &dev->odp_caps;
 
 	memset(caps, 0, sizeof(*caps));
 
-	if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
-		return 0;
-
-	err = mlx5_query_odp_caps(dev->mdev, &hw_caps);
-	if (err)
-		goto out;
+	if (!MLX5_CAP_GEN(dev->mdev, pg))
+		return;
 
 	caps->general_caps = IB_ODP_SUPPORT;
-	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
-			       SEND);
-	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
-			       SEND);
-	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
-			       RECV);
-	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
-			       WRITE);
-	COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
-			       READ);
-
-out:
-	return err;
+
+	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
+		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
+
+	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
+		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
+
+	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
+		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
+
+	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
+		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
+
+	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
+		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
+
+	return;
 }
 
 static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 426eb88dfa49..15fd485d1ad9 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -220,13 +220,11 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
 static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
 		       int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd)
 {
-	struct mlx5_general_caps *gen;
 	int wqe_size;
 	int wq_size;
 
-	gen = &dev->mdev->caps.gen;
 	/* Sanity check RQ size before proceeding */
-	if (cap->max_recv_wr  > gen->max_wqes)
+	if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)))
 		return -EINVAL;
 
 	if (!has_rq) {
@@ -246,10 +244,11 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
 			wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size;
 			wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB);
 			qp->rq.wqe_cnt = wq_size / wqe_size;
-			if (wqe_size > gen->max_rq_desc_sz) {
+			if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) {
 				mlx5_ib_dbg(dev, "wqe_size %d, max %d\n",
 					    wqe_size,
-					    gen->max_rq_desc_sz);
+					    MLX5_CAP_GEN(dev->mdev,
+							 max_wqe_sz_rq));
 				return -EINVAL;
 			}
 			qp->rq.wqe_shift = ilog2(wqe_size);
@@ -330,11 +329,9 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr)
 static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
 			struct mlx5_ib_qp *qp)
 {
-	struct mlx5_general_caps *gen;
 	int wqe_size;
 	int wq_size;
 
-	gen = &dev->mdev->caps.gen;
 	if (!attr->cap.max_send_wr)
 		return 0;
 
@@ -343,9 +340,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
 	if (wqe_size < 0)
 		return wqe_size;
 
-	if (wqe_size > gen->max_sq_desc_sz) {
+	if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) {
 		mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n",
-			    wqe_size, gen->max_sq_desc_sz);
+			    wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq));
 		return -EINVAL;
 	}
 
@@ -358,9 +355,10 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
 
 	wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
 	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
-	if (qp->sq.wqe_cnt > gen->max_wqes) {
+	if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
 		mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n",
-			    qp->sq.wqe_cnt, gen->max_wqes);
+			    qp->sq.wqe_cnt,
+			    1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz));
 		return -ENOMEM;
 	}
 	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
@@ -375,13 +373,11 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev,
 			    struct mlx5_ib_qp *qp,
 			    struct mlx5_ib_create_qp *ucmd)
 {
-	struct mlx5_general_caps *gen;
 	int desc_sz = 1 << qp->sq.wqe_shift;
 
-	gen = &dev->mdev->caps.gen;
-	if (desc_sz > gen->max_sq_desc_sz) {
+	if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) {
 		mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n",
-			     desc_sz, gen->max_sq_desc_sz);
+			     desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq));
 		return -EINVAL;
 	}
 
@@ -393,9 +389,10 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev,
 
 	qp->sq.wqe_cnt = ucmd->sq_wqe_count;
 
-	if (qp->sq.wqe_cnt > gen->max_wqes) {
+	if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
 		mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n",
-			     qp->sq.wqe_cnt, gen->max_wqes);
+			     qp->sq.wqe_cnt,
+			     1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz));
 		return -EINVAL;
 	}
 
@@ -866,22 +863,21 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			    struct ib_udata *udata, struct mlx5_ib_qp *qp)
 {
 	struct mlx5_ib_resources *devr = &dev->devr;
+	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_create_qp_resp resp;
 	struct mlx5_create_qp_mbox_in *in;
-	struct mlx5_general_caps *gen;
 	struct mlx5_ib_create_qp ucmd;
 	int inlen = sizeof(*in);
 	int err;
 
 	mlx5_ib_odp_create_qp(qp);
 
-	gen = &dev->mdev->caps.gen;
 	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
 
 	if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
-		if (!(gen->flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) {
+		if (!MLX5_CAP_GEN(mdev, block_lb_mc)) {
 			mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n");
 			return -EINVAL;
 		} else {
@@ -914,15 +910,17 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 	if (pd) {
 		if (pd->uobject) {
+			__u32 max_wqes =
+				1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
 			mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
 			if (ucmd.rq_wqe_shift != qp->rq.wqe_shift ||
 			    ucmd.rq_wqe_count != qp->rq.wqe_cnt) {
 				mlx5_ib_dbg(dev, "invalid rq params\n");
 				return -EINVAL;
 			}
-			if (ucmd.sq_wqe_count > gen->max_wqes) {
+			if (ucmd.sq_wqe_count > max_wqes) {
 				mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n",
-					    ucmd.sq_wqe_count, gen->max_wqes);
+					    ucmd.sq_wqe_count, max_wqes);
 				return -EINVAL;
 			}
 			err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen);
@@ -1226,7 +1224,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 				struct ib_qp_init_attr *init_attr,
 				struct ib_udata *udata)
 {
-	struct mlx5_general_caps *gen;
 	struct mlx5_ib_dev *dev;
 	struct mlx5_ib_qp *qp;
 	u16 xrcdn = 0;
@@ -1244,12 +1241,11 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 		}
 		dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
 	}
-	gen = &dev->mdev->caps.gen;
 
 	switch (init_attr->qp_type) {
 	case IB_QPT_XRC_TGT:
 	case IB_QPT_XRC_INI:
-		if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) {
+		if (!MLX5_CAP_GEN(dev->mdev, xrc)) {
 			mlx5_ib_dbg(dev, "XRC not supported\n");
 			return ERR_PTR(-ENOSYS);
 		}
@@ -1356,9 +1352,6 @@ enum {
 
 static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
 {
-	struct mlx5_general_caps *gen;
-
-	gen = &dev->mdev->caps.gen;
 	if (rate == IB_RATE_PORT_CURRENT) {
 		return 0;
 	} else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) {
@@ -1366,7 +1359,7 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
 	} else {
 		while (rate != IB_RATE_2_5_GBPS &&
 		       !(1 << (rate + MLX5_STAT_RATE_OFFSET) &
-			 gen->stat_rate_support))
+			 MLX5_CAP_GEN(dev->mdev, stat_rate_support)))
 			--rate;
 	}
 
@@ -1377,10 +1370,8 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
 			 struct mlx5_qp_path *path, u8 port, int attr_mask,
 			 u32 path_flags, const struct ib_qp_attr *attr)
 {
-	struct mlx5_general_caps *gen;
 	int err;
 
-	gen = &dev->mdev->caps.gen;
 	path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
 	path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0;
 
@@ -1391,9 +1382,11 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
 	path->rlid	= cpu_to_be16(ah->dlid);
 
 	if (ah->ah_flags & IB_AH_GRH) {
-		if (ah->grh.sgid_index >= gen->port[port - 1].gid_table_len) {
+		if (ah->grh.sgid_index >=
+		    dev->mdev->port_caps[port - 1].gid_table_len) {
 			pr_err("sgid_index (%u) too large. max is %d\n",
-			       ah->grh.sgid_index, gen->port[port - 1].gid_table_len);
+			       ah->grh.sgid_index,
+			       dev->mdev->port_caps[port - 1].gid_table_len);
 			return -EINVAL;
 		}
 		path->grh_mlid |= 1 << 7;
@@ -1570,7 +1563,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
 	struct mlx5_ib_cq *send_cq, *recv_cq;
 	struct mlx5_qp_context *context;
-	struct mlx5_general_caps *gen;
 	struct mlx5_modify_qp_mbox_in *in;
 	struct mlx5_ib_pd *pd;
 	enum mlx5_qp_state mlx5_cur, mlx5_new;
@@ -1579,7 +1571,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	int mlx5_st;
 	int err;
 
-	gen = &dev->mdev->caps.gen;
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -1619,7 +1610,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			err = -EINVAL;
 			goto out;
 		}
-		context->mtu_msgmax = (attr->path_mtu << 5) | gen->log_max_msg;
+		context->mtu_msgmax = (attr->path_mtu << 5) |
+				      (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg);
 	}
 
 	if (attr_mask & IB_QP_DEST_QPN)
@@ -1777,11 +1769,9 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
 	enum ib_qp_state cur_state, new_state;
-	struct mlx5_general_caps *gen;
 	int err = -EINVAL;
 	int port;
 
-	gen = &dev->mdev->caps.gen;
 	mutex_lock(&qp->mutex);
 
 	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
@@ -1793,21 +1783,25 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		goto out;
 
 	if ((attr_mask & IB_QP_PORT) &&
-	    (attr->port_num == 0 || attr->port_num > gen->num_ports))
+	    (attr->port_num == 0 ||
+	     attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports)))
 		goto out;
 
 	if (attr_mask & IB_QP_PKEY_INDEX) {
 		port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
-		if (attr->pkey_index >= gen->port[port - 1].pkey_table_len)
+		if (attr->pkey_index >=
+		    dev->mdev->port_caps[port - 1].pkey_table_len)
 			goto out;
 	}
 
 	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
-	    attr->max_rd_atomic > (1 << gen->log_max_ra_res_qp))
+	    attr->max_rd_atomic >
+	    (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp)))
 		goto out;
 
 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
-	    attr->max_dest_rd_atomic > (1 << gen->log_max_ra_req_qp))
+	    attr->max_dest_rd_atomic >
+	    (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp)))
 		goto out;
 
 	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
@@ -3009,7 +3003,7 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at
 	ib_ah_attr->port_num	  = path->port;
 
 	if (ib_ah_attr->port_num == 0 ||
-	    ib_ah_attr->port_num > dev->caps.gen.num_ports)
+	    ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports))
 		return;
 
 	ib_ah_attr->sl = path->sl & 0xf;
@@ -3135,12 +3129,10 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
 					  struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_general_caps *gen;
 	struct mlx5_ib_xrcd *xrcd;
 	int err;
 
-	gen = &dev->mdev->caps.gen;
-	if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC))
+	if (!MLX5_CAP_GEN(dev->mdev, xrc))
 		return ERR_PTR(-ENOSYS);
 
 	xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL);
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 4242e1ded868..e8e8e942fa4a 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -236,7 +236,6 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 				  struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct mlx5_general_caps *gen;
 	struct mlx5_ib_srq *srq;
 	int desc_size;
 	int buf_size;
@@ -245,13 +244,13 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 	int uninitialized_var(inlen);
 	int is_xrc;
 	u32 flgs, xrcdn;
+	__u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
 
-	gen = &dev->mdev->caps.gen;
 	/* Sanity check SRQ size before proceeding */
-	if (init_attr->attr.max_wr >= gen->max_srq_wqes) {
+	if (init_attr->attr.max_wr >= max_srq_wqes) {
 		mlx5_ib_dbg(dev, "max_wr %d, cap %d\n",
 			    init_attr->attr.max_wr,
-			    gen->max_srq_wqes);
+			    max_srq_wqes);
 		return ERR_PTR(-EINVAL);
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 516efc25fc4f..a40b96d4c662 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -455,7 +455,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 	u32 async_event_mask = MLX5_ASYNC_EVENT_MASK;
 	int err;
 
-	if (dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
+	if (MLX5_CAP_GEN(dev, pg))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT);
 
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
@@ -478,7 +478,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 
 	err = mlx5_create_map_eq(dev, &table->pages_eq,
 				 MLX5_EQ_VEC_PAGES,
-				 dev->caps.gen.max_vf + 1,
+				 /* TODO: sriov max_vf + */ 1,
 				 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq",
 				 &dev->priv.uuari.uars[0]);
 	if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index ef9b7695decd..801ccadd709a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -64,50 +64,74 @@ out_out:
 	return err;
 }
 
-int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps)
+int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 {
-	return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR);
-}
-
-int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps)
-{
-	u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)];
-	int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
-	void *out;
 	int err;
 
-	if (!(dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
-		return -ENOTSUPP;
+	err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_CUR);
+	if (err)
+		return err;
 
-	memset(in, 0, sizeof(in));
-	out = kzalloc(out_sz, GFP_KERNEL);
-	if (!out)
-		return -ENOMEM;
-	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
-	MLX5_SET(query_hca_cap_in, in, op_mod, HCA_CAP_OPMOD_GET_ODP_CUR);
-	err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
+	err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_MAX);
 	if (err)
-		goto out;
+		return err;
 
-	err = mlx5_cmd_status_to_err_v2(out);
-	if (err) {
-		mlx5_core_warn(dev, "query cur hca ODP caps failed, %d\n", err);
-		goto out;
+	if (MLX5_CAP_GEN(dev, eth_net_offloads)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS,
+					 HCA_CAP_OPMOD_GET_CUR);
+		if (err)
+			return err;
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ETHERNET_OFFLOADS,
+					 HCA_CAP_OPMOD_GET_MAX);
+		if (err)
+			return err;
 	}
 
-	memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability),
-	       sizeof(*caps));
+	if (MLX5_CAP_GEN(dev, pg)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ODP,
+					 HCA_CAP_OPMOD_GET_CUR);
+		if (err)
+			return err;
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ODP,
+					 HCA_CAP_OPMOD_GET_MAX);
+		if (err)
+			return err;
+	}
 
-	mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n",
-		be32_to_cpu(caps->per_transport_caps.rc_odp_caps),
-		be32_to_cpu(caps->per_transport_caps.uc_odp_caps),
-		be32_to_cpu(caps->per_transport_caps.ud_odp_caps));
+	if (MLX5_CAP_GEN(dev, atomic)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC,
+					 HCA_CAP_OPMOD_GET_CUR);
+		if (err)
+			return err;
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC,
+					 HCA_CAP_OPMOD_GET_MAX);
+		if (err)
+			return err;
+	}
 
-out:
-	kfree(out);
-	return err;
+	if (MLX5_CAP_GEN(dev, roce)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE,
+					 HCA_CAP_OPMOD_GET_CUR);
+		if (err)
+			return err;
+		err = mlx5_core_get_caps(dev, MLX5_CAP_ROCE,
+					 HCA_CAP_OPMOD_GET_MAX);
+		if (err)
+			return err;
+	}
+
+	if (MLX5_CAP_GEN(dev, nic_flow_table)) {
+		err = mlx5_core_get_caps(dev, MLX5_CAP_FLOW_TABLE,
+					 HCA_CAP_OPMOD_GET_CUR);
+		if (err)
+			return err;
+		err = mlx5_core_get_caps(dev, MLX5_CAP_FLOW_TABLE,
+					 HCA_CAP_OPMOD_GET_MAX);
+		if (err)
+			return err;
+	}
+	return 0;
 }
-EXPORT_SYMBOL(mlx5_query_odp_caps);
 
 int mlx5_cmd_init_hca(struct mlx5_core_dev *dev)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index a652cb93ceaa..e7b7b123a128 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -211,11 +211,12 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev)
 {
 	struct mlx5_priv *priv = &dev->priv;
 	struct mlx5_eq_table *table = &priv->eq_table;
-	int num_eqs = 1 << dev->caps.gen.log_max_eq;
+	int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
 	int nvec;
 	int i;
 
-	nvec = dev->caps.gen.num_ports * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE;
+	nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
+	       MLX5_EQ_VEC_COMP_BASE;
 	nvec = min_t(int, nvec, num_eqs);
 	if (nvec <= MLX5_EQ_VEC_COMP_BASE)
 		return -ENOMEM;
@@ -287,97 +288,28 @@ static u16 to_fw_pkey_sz(u32 size)
 	}
 }
 
-/* selectively copy writable fields clearing any reserved area
- */
-static void copy_rw_fields(void *to, struct mlx5_caps *from)
-{
-	__be64 *flags_off = (__be64 *)MLX5_ADDR_OF(cmd_hca_cap, to, reserved_22);
-	u64 v64;
-
-	MLX5_SET(cmd_hca_cap, to, log_max_qp, from->gen.log_max_qp);
-	MLX5_SET(cmd_hca_cap, to, log_max_ra_req_qp, from->gen.log_max_ra_req_qp);
-	MLX5_SET(cmd_hca_cap, to, log_max_ra_res_qp, from->gen.log_max_ra_res_qp);
-	MLX5_SET(cmd_hca_cap, to, pkey_table_size, from->gen.pkey_table_size);
-	MLX5_SET(cmd_hca_cap, to, pkey_table_size, to_fw_pkey_sz(from->gen.pkey_table_size));
-	MLX5_SET(cmd_hca_cap, to, log_uar_page_sz, PAGE_SHIFT - 12);
-	v64 = from->gen.flags & MLX5_CAP_BITS_RW_MASK;
-	*flags_off = cpu_to_be64(v64);
-}
-
-static u16 get_pkey_table_size(int pkey)
+static u16 to_sw_pkey_sz(int pkey_sz)
 {
-	if (pkey > MLX5_MAX_LOG_PKEY_TABLE)
+	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
 		return 0;
 
-	return MLX5_MIN_PKEY_TABLE_SIZE << pkey;
-}
-
-static void fw2drv_caps(struct mlx5_caps *caps, void *out)
-{
-	struct mlx5_general_caps *gen = &caps->gen;
-
-	gen->max_srq_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_srq_sz);
-	gen->max_wqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_qp_sz);
-	gen->log_max_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_qp);
-	gen->log_max_srq = MLX5_GET_PR(cmd_hca_cap, out, log_max_srq);
-	gen->max_cqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_cq_sz);
-	gen->log_max_cq = MLX5_GET_PR(cmd_hca_cap, out, log_max_cq);
-	gen->max_eqes = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_max_eq_sz);
-	gen->log_max_mkey = MLX5_GET_PR(cmd_hca_cap, out, log_max_mkey);
-	gen->log_max_eq = MLX5_GET_PR(cmd_hca_cap, out, log_max_eq);
-	gen->max_indirection = MLX5_GET_PR(cmd_hca_cap, out, max_indirection);
-	gen->log_max_mrw_sz = MLX5_GET_PR(cmd_hca_cap, out, log_max_mrw_sz);
-	gen->log_max_bsf_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_bsf_list_size);
-	gen->log_max_klm_list_size = MLX5_GET_PR(cmd_hca_cap, out, log_max_klm_list_size);
-	gen->log_max_ra_req_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_dc);
-	gen->log_max_ra_res_dc = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_dc);
-	gen->log_max_ra_req_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_req_qp);
-	gen->log_max_ra_res_qp = MLX5_GET_PR(cmd_hca_cap, out, log_max_ra_res_qp);
-	gen->max_qp_counters = MLX5_GET_PR(cmd_hca_cap, out, max_qp_cnt);
-	gen->pkey_table_size = get_pkey_table_size(MLX5_GET_PR(cmd_hca_cap, out, pkey_table_size));
-	gen->local_ca_ack_delay = MLX5_GET_PR(cmd_hca_cap, out, local_ca_ack_delay);
-	gen->num_ports = MLX5_GET_PR(cmd_hca_cap, out, num_ports);
-	gen->log_max_msg = MLX5_GET_PR(cmd_hca_cap, out, log_max_msg);
-	gen->stat_rate_support = MLX5_GET_PR(cmd_hca_cap, out, stat_rate_support);
-	gen->flags = be64_to_cpu(*(__be64 *)MLX5_ADDR_OF(cmd_hca_cap, out, reserved_22));
-	pr_debug("flags = 0x%llx\n", gen->flags);
-	gen->uar_sz = MLX5_GET_PR(cmd_hca_cap, out, uar_sz);
-	gen->min_log_pg_sz = MLX5_GET_PR(cmd_hca_cap, out, log_pg_sz);
-	gen->bf_reg_size = MLX5_GET_PR(cmd_hca_cap, out, bf);
-	gen->bf_reg_size = 1 << MLX5_GET_PR(cmd_hca_cap, out, log_bf_reg_size);
-	gen->max_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq);
-	gen->max_rq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_rq);
-	gen->max_dc_sq_desc_sz = MLX5_GET_PR(cmd_hca_cap, out, max_wqe_sz_sq_dc);
-	gen->max_qp_mcg = MLX5_GET_PR(cmd_hca_cap, out, max_qp_mcg);
-	gen->log_max_pd = MLX5_GET_PR(cmd_hca_cap, out, log_max_pd);
-	gen->log_max_xrcd = MLX5_GET_PR(cmd_hca_cap, out, log_max_xrcd);
-	gen->log_uar_page_sz = MLX5_GET_PR(cmd_hca_cap, out, log_uar_page_sz);
-}
-
-static const char *caps_opmod_str(u16 opmod)
-{
-	switch (opmod) {
-	case HCA_CAP_OPMOD_GET_MAX:
-		return "GET_MAX";
-	case HCA_CAP_OPMOD_GET_CUR:
-		return "GET_CUR";
-	default:
-		return "Invalid";
-	}
+	return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz;
 }
 
-int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps,
-		       u16 opmod)
+int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type,
+		       enum mlx5_cap_mode cap_mode)
 {
 	u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)];
 	int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
-	void *out;
+	void *out, *hca_caps;
+	u16 opmod = (cap_type << 1) | (cap_mode & 0x01);
 	int err;
 
 	memset(in, 0, sizeof(in));
 	out = kzalloc(out_sz, GFP_KERNEL);
 	if (!out)
 		return -ENOMEM;
+
 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
 	MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
 	err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
@@ -386,12 +318,30 @@ int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps,
 
 	err = mlx5_cmd_status_to_err_v2(out);
 	if (err) {
-		mlx5_core_warn(dev, "query max hca cap failed, %d\n", err);
+		mlx5_core_warn(dev,
+			       "QUERY_HCA_CAP : type(%x) opmode(%x) Failed(%d)\n",
+			       cap_type, cap_mode, err);
 		goto query_ex;
 	}
-	mlx5_core_dbg(dev, "%s\n", caps_opmod_str(opmod));
-	fw2drv_caps(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability));
 
+	hca_caps =  MLX5_ADDR_OF(query_hca_cap_out, out, capability);
+
+	switch (cap_mode) {
+	case HCA_CAP_OPMOD_GET_MAX:
+		memcpy(dev->hca_caps_max[cap_type], hca_caps,
+		       MLX5_UN_SZ_BYTES(hca_cap_union));
+		break;
+	case HCA_CAP_OPMOD_GET_CUR:
+		memcpy(dev->hca_caps_cur[cap_type], hca_caps,
+		       MLX5_UN_SZ_BYTES(hca_cap_union));
+		break;
+	default:
+		mlx5_core_warn(dev,
+			       "Tried to query dev cap type(%x) with wrong opmode(%x)\n",
+			       cap_type, cap_mode);
+		err = -EINVAL;
+		break;
+	}
 query_ex:
 	kfree(out);
 	return err;
@@ -418,49 +368,45 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 {
 	void *set_ctx = NULL;
 	struct mlx5_profile *prof = dev->profile;
-	struct mlx5_caps *cur_caps = NULL;
-	struct mlx5_caps *max_caps = NULL;
 	int err = -ENOMEM;
 	int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
+	void *set_hca_cap;
 
 	set_ctx = kzalloc(set_sz, GFP_KERNEL);
 	if (!set_ctx)
 		goto query_ex;
 
-	max_caps = kzalloc(sizeof(*max_caps), GFP_KERNEL);
-	if (!max_caps)
-		goto query_ex;
-
-	cur_caps = kzalloc(sizeof(*cur_caps), GFP_KERNEL);
-	if (!cur_caps)
-		goto query_ex;
-
-	err = mlx5_core_get_caps(dev, max_caps, HCA_CAP_OPMOD_GET_MAX);
+	err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_MAX);
 	if (err)
 		goto query_ex;
 
-	err = mlx5_core_get_caps(dev, cur_caps, HCA_CAP_OPMOD_GET_CUR);
+	err = mlx5_core_get_caps(dev, MLX5_CAP_GENERAL, HCA_CAP_OPMOD_GET_CUR);
 	if (err)
 		goto query_ex;
 
+	set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx,
+				   capability);
+	memcpy(set_hca_cap, dev->hca_caps_cur[MLX5_CAP_GENERAL],
+	       MLX5_ST_SZ_BYTES(cmd_hca_cap));
+
+	mlx5_core_dbg(dev, "Current Pkey table size %d Setting new size %d\n",
+		      to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)),
+		      128);
 	/* we limit the size of the pkey table to 128 entries for now */
-	cur_caps->gen.pkey_table_size = 128;
+	MLX5_SET(cmd_hca_cap, set_hca_cap, pkey_table_size,
+		 to_fw_pkey_sz(128));
 
 	if (prof->mask & MLX5_PROF_MASK_QP_SIZE)
-		cur_caps->gen.log_max_qp = prof->log_max_qp;
+		MLX5_SET(cmd_hca_cap, set_hca_cap, log_max_qp,
+			 prof->log_max_qp);
 
-	/* disable checksum */
-	cur_caps->gen.flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM;
+	/* disable cmdif checksum */
+	MLX5_SET(cmd_hca_cap, set_hca_cap, cmdif_checksum, 0);
 
-	copy_rw_fields(MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability),
-		       cur_caps);
 	err = set_caps(dev, set_ctx, set_sz);
 
 query_ex:
-	kfree(cur_caps);
-	kfree(max_caps);
 	kfree(set_ctx);
-
 	return err;
 }
 
@@ -768,7 +714,7 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 
 	mlx5_start_health_poll(dev);
 
-	err = mlx5_cmd_query_hca_cap(dev, &dev->caps);
+	err = mlx5_query_hca_caps(dev);
 	if (err) {
 		dev_err(&pdev->dev, "query hca failed\n");
 		goto err_stop_poll;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index a051b906afdf..b986f1c258bc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -65,9 +65,15 @@ enum {
 	MLX5_CMD_TIME, /* print command execution time */
 };
 
+static inline int mlx5_cmd_exec_check_status(struct mlx5_core_dev *dev, u32 *in,
+					     int in_size, u32 *out,
+					     int out_size)
+{
+	mlx5_cmd_exec(dev, in, in_size, out, out_size);
+	return mlx5_cmd_status_to_err((struct mlx5_outbox_hdr *)out);
+}
 
-int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev,
-			   struct mlx5_caps *caps);
+int mlx5_query_hca_caps(struct mlx5_core_dev *dev);
 int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev);
 int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index ba58f6d7c761..9ef85873ceea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -175,12 +175,13 @@ int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
 	for (i = 0; i < tot_uuars; i++) {
 		bf = &uuari->bfs[i];
 
-		bf->buf_size = dev->caps.gen.bf_reg_size / 2;
+		bf->buf_size = (1 << MLX5_CAP_GEN(dev, log_bf_reg_size)) / 2;
 		bf->uar = &uuari->uars[i / MLX5_BF_REGS_PER_PAGE];
 		bf->regreg = uuari->uars[i / MLX5_BF_REGS_PER_PAGE].map;
 		bf->reg = NULL; /* Add WC support */
-		bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * dev->caps.gen.bf_reg_size +
-			MLX5_BF_OFFSET;
+		bf->offset = (i % MLX5_BF_REGS_PER_PAGE) *
+			     (1 << MLX5_CAP_GEN(dev, log_bf_reg_size)) +
+			     MLX5_BF_OFFSET;
 		bf->need_lock = need_uuar_lock(i);
 		spin_lock_init(&bf->lock);
 		spin_lock_init(&bf->lock32);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index feebed7b392b..4ee52bf1f959 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -59,6 +59,8 @@
 #define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8)
 #define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8)
 #define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32)
+#define MLX5_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8)
+#define MLX5_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32)
 #define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8)
 #define MLX5_ADDR_OF(typ, p, fld) ((char *)(p) + MLX5_BYTE_OFF(typ, fld))
 
@@ -322,13 +324,6 @@ enum {
 	MLX5_CAP_OFF_CMDIF_CSUM		= 46,
 };
 
-enum {
-	HCA_CAP_OPMOD_GET_MAX	= 0,
-	HCA_CAP_OPMOD_GET_CUR	= 1,
-	HCA_CAP_OPMOD_GET_ODP_MAX = 4,
-	HCA_CAP_OPMOD_GET_ODP_CUR = 5
-};
-
 struct mlx5_inbox_hdr {
 	__be16		opcode;
 	u8		rsvd[4];
@@ -1101,4 +1096,61 @@ enum {
 	MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM    = 0x1,
 };
 
+/* MLX5 DEV CAPs */
+
+/* TODO: EAT.ME */
+enum mlx5_cap_mode {
+	HCA_CAP_OPMOD_GET_MAX	= 0,
+	HCA_CAP_OPMOD_GET_CUR	= 1,
+};
+
+enum mlx5_cap_type {
+	MLX5_CAP_GENERAL = 0,
+	MLX5_CAP_ETHERNET_OFFLOADS,
+	MLX5_CAP_ODP,
+	MLX5_CAP_ATOMIC,
+	MLX5_CAP_ROCE,
+	MLX5_CAP_IPOIB_OFFLOADS,
+	MLX5_CAP_EOIB_OFFLOADS,
+	MLX5_CAP_FLOW_TABLE,
+	/* NUM OF CAP Types */
+	MLX5_CAP_NUM
+};
+
+/* GET Dev Caps macros */
+#define MLX5_CAP_GEN(mdev, cap) \
+	MLX5_GET(cmd_hca_cap, mdev->hca_caps_cur[MLX5_CAP_GENERAL], cap)
+
+#define MLX5_CAP_GEN_MAX(mdev, cap) \
+	MLX5_GET(cmd_hca_cap, mdev->hca_caps_max[MLX5_CAP_GENERAL], cap)
+
+#define MLX5_CAP_ETH(mdev, cap) \
+	MLX5_GET(per_protocol_networking_offload_caps,\
+		 mdev->hca_caps_cur[MLX5_CAP_ETHERNET_OFFLOADS], cap)
+
+#define MLX5_CAP_ETH_MAX(mdev, cap) \
+	MLX5_GET(per_protocol_networking_offload_caps,\
+		 mdev->hca_caps_max[MLX5_CAP_ETHERNET_OFFLOADS], cap)
+
+#define MLX5_CAP_ROCE(mdev, cap) \
+	MLX5_GET(roce_cap, mdev->hca_caps_cur[MLX5_CAP_ROCE], cap)
+
+#define MLX5_CAP_ROCE_MAX(mdev, cap) \
+	MLX5_GET(roce_cap, mdev->hca_caps_max[MLX5_CAP_ROCE], cap)
+
+#define MLX5_CAP_ATOMIC(mdev, cap) \
+	MLX5_GET(atomic_caps, mdev->hca_caps_cur[MLX5_CAP_ATOMIC], cap)
+
+#define MLX5_CAP_ATOMIC_MAX(mdev, cap) \
+	MLX5_GET(atomic_caps, mdev->hca_caps_max[MLX5_CAP_ATOMIC], cap)
+
+#define MLX5_CAP_FLOWTABLE(mdev, cap) \
+	MLX5_GET(flow_table_nic_cap, mdev->hca_caps_cur[MLX5_CAP_FLOW_TABLE], cap)
+
+#define MLX5_CAP_FLOWTABLE_MAX(mdev, cap) \
+	MLX5_GET(flow_table_nic_cap, mdev->hca_caps_max[MLX5_CAP_FLOW_TABLE], cap)
+
+#define MLX5_CAP_ODP(mdev, cap)\
+	MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap)
+
 #endif /* MLX5_DEVICE_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 3fd4fdc1ba16..6b9199163633 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -268,55 +268,7 @@ struct mlx5_cmd {
 struct mlx5_port_caps {
 	int	gid_table_len;
 	int	pkey_table_len;
-};
-
-struct mlx5_general_caps {
-	u8	log_max_eq;
-	u8	log_max_cq;
-	u8	log_max_qp;
-	u8	log_max_mkey;
-	u8	log_max_pd;
-	u8	log_max_srq;
-	u8	log_max_mrw_sz;
-	u8	log_max_bsf_list_size;
-	u8	log_max_klm_list_size;
-	u32	max_cqes;
-	int	max_wqes;
-	u32	max_eqes;
-	u32	max_indirection;
-	int	max_sq_desc_sz;
-	int	max_rq_desc_sz;
-	int	max_dc_sq_desc_sz;
-	u64	flags;
-	u16	stat_rate_support;
-	int	log_max_msg;
-	int	num_ports;
-	u8	log_max_ra_res_qp;
-	u8	log_max_ra_req_qp;
-	int	max_srq_wqes;
-	int	bf_reg_size;
-	int	bf_regs_per_page;
-	struct mlx5_port_caps	port[MLX5_MAX_PORTS];
-	u8			ext_port_cap[MLX5_MAX_PORTS];
-	int	max_vf;
-	u32	reserved_lkey;
-	u8	local_ca_ack_delay;
-	u8	log_max_mcg;
-	u32	max_qp_mcg;
-	int	min_page_sz;
-	int	pd_cap;
-	u32	max_qp_counters;
-	u32	pkey_table_size;
-	u8	log_max_ra_req_dc;
-	u8	log_max_ra_res_dc;
-	u32	uar_sz;
-	u8	min_log_pg_sz;
-	u8	log_max_xrcd;
-	u16	log_uar_page_sz;
-};
-
-struct mlx5_caps {
-	struct mlx5_general_caps gen;
+	u8	ext_port_cap;
 };
 
 struct mlx5_cmd_mailbox {
@@ -521,7 +473,9 @@ struct mlx5_core_dev {
 	u8			rev_id;
 	char			board_id[MLX5_BOARD_ID_LEN];
 	struct mlx5_cmd		cmd;
-	struct mlx5_caps	caps;
+	struct mlx5_port_caps	port_caps[MLX5_MAX_PORTS];
+	u32 hca_caps_cur[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
+	u32 hca_caps_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
 	void			(*event) (struct mlx5_core_dev *dev,
@@ -651,8 +605,8 @@ void mlx5_cmd_use_events(struct mlx5_core_dev *dev);
 void mlx5_cmd_use_polling(struct mlx5_core_dev *dev);
 int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr);
 int mlx5_cmd_status_to_err_v2(void *ptr);
-int mlx5_core_get_caps(struct mlx5_core_dev *dev, struct mlx5_caps *caps,
-		       u16 opmod);
+int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type,
+		       enum mlx5_cap_mode cap_mode);
 int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
 		  int out_size);
 int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
-- 
cgit v1.2.3


From adb0c9545bce6f1b1d563e988e6ee5531861d449 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 28 May 2015 22:28:42 +0300
Subject: net/mlx5_core: Implement access functions of ptys register fields

Those registers will be used by the ethtool to set/get settings.

Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 77 ++++++++++++++++++++++++++
 include/linux/mlx5/driver.h                    | 14 +++++
 2 files changed, 91 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 49e90f2612d8..6e2d99cc3b61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -102,3 +102,80 @@ int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps)
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_caps);
+
+int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
+			 int ptys_size, int proto_mask)
+{
+	u32 in[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(ptys_reg, in, local_port, 1);
+	MLX5_SET(ptys_reg, in, proto_mask, proto_mask);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), ptys,
+				   ptys_size, MLX5_REG_PTYS, 0, 0);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_ptys);
+
+int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
+			      u32 *proto_cap, int proto_mask)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask);
+	if (err)
+		return err;
+
+	if (proto_mask == MLX5_PTYS_EN)
+		*proto_cap = MLX5_GET(ptys_reg, out, eth_proto_capability);
+	else
+		*proto_cap = MLX5_GET(ptys_reg, out, ib_proto_capability);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_proto_cap);
+
+int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
+				u32 *proto_admin, int proto_mask)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask);
+	if (err)
+		return err;
+
+	if (proto_mask == MLX5_PTYS_EN)
+		*proto_admin = MLX5_GET(ptys_reg, out, eth_proto_admin);
+	else
+		*proto_admin = MLX5_GET(ptys_reg, out, ib_proto_admin);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_proto_admin);
+
+int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
+			int proto_mask)
+{
+	u32 in[MLX5_ST_SZ_DW(ptys_reg)];
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(ptys_reg, in, local_port, 1);
+	MLX5_SET(ptys_reg, in, proto_mask, proto_mask);
+	if (proto_mask == MLX5_PTYS_EN)
+		MLX5_SET(ptys_reg, in, eth_proto_admin, proto_admin);
+	else
+		MLX5_SET(ptys_reg, in, ib_proto_admin, proto_admin);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PTYS, 0, 1);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_proto);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6b9199163633..266d5498a270 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -504,6 +504,11 @@ enum {
 	MLX5_COMP_EQ_SIZE = 1024,
 };
 
+enum {
+	MLX5_PTYS_IB = 1 << 0,
+	MLX5_PTYS_EN = 1 << 2,
+};
+
 struct mlx5_db_pgdir {
 	struct list_head	list;
 	DECLARE_BITMAP(bitmap, MLX5_DB_PER_PAGE);
@@ -686,7 +691,16 @@ void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev);
 int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
 			 int size_in, void *data_out, int size_out,
 			 u16 reg_num, int arg, int write);
+
 int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
+int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
+			 int ptys_size, int proto_mask);
+int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
+			      u32 *proto_cap, int proto_mask);
+int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
+				u32 *proto_admin, int proto_mask);
+int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
+			int proto_mask);
 
 int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
-- 
cgit v1.2.3


From 4c916a798058c1acf5a980438416020932c24aca Mon Sep 17 00:00:00 2001
From: Rana Shahout <ranas@mellanox.com>
Date: Thu, 28 May 2015 22:28:43 +0300
Subject: net/mlx5_core: Implement get/set port status

Implemet get/set port status low level functions to be exposed by the
netdev.

Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 32 ++++++++++++++++++++++++++
 include/linux/mlx5/driver.h                    |  8 +++++++
 2 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 6e2d99cc3b61..742a6fb8debe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -179,3 +179,35 @@ int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 	return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_proto);
+
+int mlx5_set_port_status(struct mlx5_core_dev *dev,
+			 enum mlx5_port_status status)
+{
+	u32 in[MLX5_ST_SZ_DW(paos_reg)];
+	u32 out[MLX5_ST_SZ_DW(paos_reg)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(paos_reg, in, admin_status, status);
+	MLX5_SET(paos_reg, in, ase, 1);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				    sizeof(out), MLX5_REG_PAOS, 0, 1);
+}
+
+int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status)
+{
+	u32 in[MLX5_ST_SZ_DW(paos_reg)];
+	u32 out[MLX5_ST_SZ_DW(paos_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PAOS, 0, 0);
+	if (err)
+		return err;
+
+	*status = MLX5_GET(paos_reg, out, oper_status);
+	return err;
+}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 266d5498a270..6438444ab361 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -149,6 +149,11 @@ enum mlx5_dev_event {
 	MLX5_DEV_EVENT_CLIENT_REREG,
 };
 
+enum mlx5_port_status {
+	MLX5_PORT_UP        = 1 << 1,
+	MLX5_PORT_DOWN      = 1 << 2,
+};
+
 struct mlx5_uuar_info {
 	struct mlx5_uar	       *uars;
 	int			num_uars;
@@ -701,6 +706,9 @@ int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
 				u32 *proto_admin, int proto_mask);
 int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 			int proto_mask);
+int mlx5_set_port_status(struct mlx5_core_dev *dev,
+			 enum mlx5_port_status status);
+int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status);
 
 int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
-- 
cgit v1.2.3


From 90b3e38d048f09b22fb50bcd460cea65fd00b2d7 Mon Sep 17 00:00:00 2001
From: Rana Shahout <ranas@mellanox.com>
Date: Thu, 28 May 2015 22:28:44 +0300
Subject: net/mlx5_core: Modify CQ moderation parameters

Introduce mlx5_core_modify_cq_moderation() to be used by the netdev, to
set hardware coalescing.

Signed-off-by: Rana Shahout <ranas@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/cq.c | 18 ++++++++++++++++++
 include/linux/mlx5/cq.h                      |  3 +++
 2 files changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index eb0cf81f5f45..04ab7e445eae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -219,6 +219,24 @@ int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 }
 EXPORT_SYMBOL(mlx5_core_modify_cq);
 
+int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev,
+				   struct mlx5_core_cq *cq,
+				   u16 cq_period,
+				   u16 cq_max_count)
+{
+	struct mlx5_modify_cq_mbox_in in;
+
+	memset(&in, 0, sizeof(in));
+
+	in.cqn              = cpu_to_be32(cq->cqn);
+	in.ctx.cq_period    = cpu_to_be16(cq_period);
+	in.ctx.cq_max_count = cpu_to_be16(cq_max_count);
+	in.field_select     = cpu_to_be32(MLX5_CQ_MODIFY_PERIOD |
+					  MLX5_CQ_MODIFY_COUNT);
+
+	return mlx5_core_modify_cq(dev, cq, &in, sizeof(in));
+}
+
 int mlx5_init_cq_table(struct mlx5_core_dev *dev)
 {
 	struct mlx5_cq_table *table = &dev->priv.cq_table;
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 2695ced222df..abc4767695e4 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -169,6 +169,9 @@ int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 		       struct mlx5_query_cq_mbox_out *out);
 int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 			struct mlx5_modify_cq_mbox_in *in, int in_sz);
+int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev,
+				   struct mlx5_core_cq *cq, u16 cq_period,
+				   u16 cq_max_count);
 int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
 void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
 
-- 
cgit v1.2.3


From e725440e75da8c4d617a31c4e38216acc55c24e3 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 28 May 2015 22:28:45 +0300
Subject: net/mlx5_core: Set/Query port MTU commands

Introduce set/Query low level functions to access MTU in hardware. To be
used by the netdev.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 53 ++++++++++++++++++++++++++
 include/linux/mlx5/driver.h                    |  4 ++
 2 files changed, 57 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 742a6fb8debe..7d3d0f9f328d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -211,3 +211,56 @@ int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status)
 	*status = MLX5_GET(paos_reg, out, oper_status);
 	return err;
 }
+
+static int mlx5_query_port_mtu(struct mlx5_core_dev *dev,
+			       int *admin_mtu, int *max_mtu, int *oper_mtu)
+{
+	u32 in[MLX5_ST_SZ_DW(pmtu_reg)];
+	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(pmtu_reg, in, local_port, 1);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PMTU, 0, 0);
+	if (err)
+		return err;
+
+	if (max_mtu)
+		*max_mtu  = MLX5_GET(pmtu_reg, out, max_mtu);
+	if (oper_mtu)
+		*oper_mtu = MLX5_GET(pmtu_reg, out, oper_mtu);
+	if (admin_mtu)
+		*admin_mtu = MLX5_GET(pmtu_reg, out, admin_mtu);
+
+	return 0;
+}
+
+int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu)
+{
+	u32 in[MLX5_ST_SZ_DW(pmtu_reg)];
+	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(pmtu_reg, in, admin_mtu, mtu);
+	MLX5_SET(pmtu_reg, in, local_port, 1);
+
+	return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
+				    MLX5_REG_PMTU, 0, 1);
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_mtu);
+
+int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu)
+{
+	return mlx5_query_port_mtu(dev, NULL, max_mtu, NULL);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_max_mtu);
+
+int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu)
+{
+	return mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu);
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6438444ab361..51738472657e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -710,6 +710,10 @@ int mlx5_set_port_status(struct mlx5_core_dev *dev,
 			 enum mlx5_port_status status);
 int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status);
 
+int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu);
+int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu);
+int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu);
+
 int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
-- 
cgit v1.2.3


From afb736e9330ad6b2b6935d2f53ded784eb73f12d Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirv@mellanox.com>
Date: Thu, 28 May 2015 22:28:47 +0300
Subject: net/mlx5: Ethernet resource handling files

This patch contains the resource handling files:
- flow_table.c: This file contains the code to handle the low level API
		to configure hardware flow table. It is separated from
		the flow_table_en.c, because it will be used in the
		future by Raw Ethernet QP in mlx5_ib too.
- en_flow_table.[ch]: Ethernet flow steering handling. The flow table
		object contain a mapping between flow specs and TIRs.
		This mechanism will be used also to configure e-switch
		in the future, when SR-IOV support will be added.
- transobj.[ch] - Low level functions to create/modify/destroy the
                  transport objects: RQ/SQ/TIR/TIS
- vport.[ch] - Handle attributes of a virtual port (vPort) in the
  embedded switch. Currently this switch is a passthrough, until SR-IOV
  support will be added.

Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/mellanox/mlx5/core/en_flow_table.c    | 858 +++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/flow_table.c   | 422 ++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 169 ++++
 drivers/net/ethernet/mellanox/mlx5/core/transobj.h |  47 ++
 drivers/net/ethernet/mellanox/mlx5/core/vport.c    |  84 ++
 drivers/net/ethernet/mellanox/mlx5/core/vport.h    |  41 +
 include/linux/mlx5/flow_table.h                    |  54 ++
 7 files changed, 1675 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_flow_table.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/flow_table.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/transobj.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/transobj.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vport.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vport.h
 create mode 100644 include/linux/mlx5/flow_table.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_flow_table.c b/drivers/net/ethernet/mellanox/mlx5/core/en_flow_table.c
new file mode 100644
index 000000000000..6feebda4b3e4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_flow_table.c
@@ -0,0 +1,858 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/mlx5/flow_table.h>
+#include "en.h"
+
+enum {
+	MLX5E_FULLMATCH = 0,
+	MLX5E_ALLMULTI  = 1,
+	MLX5E_PROMISC   = 2,
+};
+
+enum {
+	MLX5E_UC        = 0,
+	MLX5E_MC_IPV4   = 1,
+	MLX5E_MC_IPV6   = 2,
+	MLX5E_MC_OTHER  = 3,
+};
+
+enum {
+	MLX5E_ACTION_NONE = 0,
+	MLX5E_ACTION_ADD  = 1,
+	MLX5E_ACTION_DEL  = 2,
+};
+
+struct mlx5e_eth_addr_hash_node {
+	struct hlist_node          hlist;
+	u8                         action;
+	struct mlx5e_eth_addr_info ai;
+};
+
+static inline int mlx5e_hash_eth_addr(u8 *addr)
+{
+	return addr[5];
+}
+
+static void mlx5e_add_eth_addr_to_hash(struct hlist_head *hash, u8 *addr)
+{
+	struct mlx5e_eth_addr_hash_node *hn;
+	int ix = mlx5e_hash_eth_addr(addr);
+	int found = 0;
+
+	hlist_for_each_entry(hn, &hash[ix], hlist)
+		if (ether_addr_equal_64bits(hn->ai.addr, addr)) {
+			found = 1;
+			break;
+		}
+
+	if (found) {
+		hn->action = MLX5E_ACTION_NONE;
+		return;
+	}
+
+	hn = kzalloc(sizeof(*hn), GFP_ATOMIC);
+	if (!hn)
+		return;
+
+	ether_addr_copy(hn->ai.addr, addr);
+	hn->action = MLX5E_ACTION_ADD;
+
+	hlist_add_head(&hn->hlist, &hash[ix]);
+}
+
+static void mlx5e_del_eth_addr_from_hash(struct mlx5e_eth_addr_hash_node *hn)
+{
+	hlist_del(&hn->hlist);
+	kfree(hn);
+}
+
+static void mlx5e_del_eth_addr_from_flow_table(struct mlx5e_priv *priv,
+					       struct mlx5e_eth_addr_info *ai)
+{
+	void *ft = priv->ft.main;
+
+	if (ai->tt_vec & (1 << MLX5E_TT_IPV6_TCP))
+		mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV6_TCP]);
+
+	if (ai->tt_vec & (1 << MLX5E_TT_IPV4_TCP))
+		mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV4_TCP]);
+
+	if (ai->tt_vec & (1 << MLX5E_TT_IPV6_UDP))
+		mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV6_UDP]);
+
+	if (ai->tt_vec & (1 << MLX5E_TT_IPV4_UDP))
+		mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV4_UDP]);
+
+	if (ai->tt_vec & (1 << MLX5E_TT_IPV6))
+		mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV6]);
+
+	if (ai->tt_vec & (1 << MLX5E_TT_IPV4))
+		mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_IPV4]);
+
+	if (ai->tt_vec & (1 << MLX5E_TT_ANY))
+		mlx5_del_flow_table_entry(ft, ai->ft_ix[MLX5E_TT_ANY]);
+}
+
+static int mlx5e_get_eth_addr_type(u8 *addr)
+{
+	if (is_unicast_ether_addr(addr))
+		return MLX5E_UC;
+
+	if ((addr[0] == 0x01) &&
+	    (addr[1] == 0x00) &&
+	    (addr[2] == 0x5e) &&
+	   !(addr[3] &  0x80))
+		return MLX5E_MC_IPV4;
+
+	if ((addr[0] == 0x33) &&
+	    (addr[1] == 0x33))
+		return MLX5E_MC_IPV6;
+
+	return MLX5E_MC_OTHER;
+}
+
+static u32 mlx5e_get_tt_vec(struct mlx5e_eth_addr_info *ai, int type)
+{
+	int eth_addr_type;
+	u32 ret;
+
+	switch (type) {
+	case MLX5E_FULLMATCH:
+		eth_addr_type = mlx5e_get_eth_addr_type(ai->addr);
+		switch (eth_addr_type) {
+		case MLX5E_UC:
+			ret =
+				(1 << MLX5E_TT_IPV4_TCP) |
+				(1 << MLX5E_TT_IPV6_TCP) |
+				(1 << MLX5E_TT_IPV4_UDP) |
+				(1 << MLX5E_TT_IPV6_UDP) |
+				(1 << MLX5E_TT_IPV4)     |
+				(1 << MLX5E_TT_IPV6)     |
+				(1 << MLX5E_TT_ANY)      |
+				0;
+			break;
+
+		case MLX5E_MC_IPV4:
+			ret =
+				(1 << MLX5E_TT_IPV4_UDP) |
+				(1 << MLX5E_TT_IPV4)     |
+				0;
+			break;
+
+		case MLX5E_MC_IPV6:
+			ret =
+				(1 << MLX5E_TT_IPV6_UDP) |
+				(1 << MLX5E_TT_IPV6)     |
+				0;
+			break;
+
+		case MLX5E_MC_OTHER:
+			ret =
+				(1 << MLX5E_TT_ANY)      |
+				0;
+			break;
+		}
+
+		break;
+
+	case MLX5E_ALLMULTI:
+		ret =
+			(1 << MLX5E_TT_IPV4_UDP) |
+			(1 << MLX5E_TT_IPV6_UDP) |
+			(1 << MLX5E_TT_IPV4)     |
+			(1 << MLX5E_TT_IPV6)     |
+			(1 << MLX5E_TT_ANY)      |
+			0;
+		break;
+
+	default: /* MLX5E_PROMISC */
+		ret =
+			(1 << MLX5E_TT_IPV4_TCP) |
+			(1 << MLX5E_TT_IPV6_TCP) |
+			(1 << MLX5E_TT_IPV4_UDP) |
+			(1 << MLX5E_TT_IPV6_UDP) |
+			(1 << MLX5E_TT_IPV4)     |
+			(1 << MLX5E_TT_IPV6)     |
+			(1 << MLX5E_TT_ANY)      |
+			0;
+		break;
+	}
+
+	return ret;
+}
+
+static int __mlx5e_add_eth_addr_rule(struct mlx5e_priv *priv,
+				     struct mlx5e_eth_addr_info *ai, int type,
+				     void *flow_context, void *match_criteria)
+{
+	u8 match_criteria_enable = 0;
+	void *match_value;
+	void *dest;
+	u8   *dmac;
+	u8   *match_criteria_dmac;
+	void *ft   = priv->ft.main;
+	u32  *tirn = priv->tirn;
+	u32  tt_vec;
+	int  err;
+
+	match_value = MLX5_ADDR_OF(flow_context, flow_context, match_value);
+	dmac = MLX5_ADDR_OF(fte_match_param, match_value,
+			    outer_headers.dmac_47_16);
+	match_criteria_dmac = MLX5_ADDR_OF(fte_match_param, match_criteria,
+					   outer_headers.dmac_47_16);
+	dest = MLX5_ADDR_OF(flow_context, flow_context, destination);
+
+	MLX5_SET(flow_context, flow_context, action,
+		 MLX5_FLOW_CONTEXT_ACTION_FWD_DEST);
+	MLX5_SET(flow_context, flow_context, destination_list_size, 1);
+	MLX5_SET(dest_format_struct, dest, destination_type,
+		 MLX5_FLOW_CONTEXT_DEST_TYPE_TIR);
+
+	switch (type) {
+	case MLX5E_FULLMATCH:
+		match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		memset(match_criteria_dmac, 0xff, ETH_ALEN);
+		ether_addr_copy(dmac, ai->addr);
+		break;
+
+	case MLX5E_ALLMULTI:
+		match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+		match_criteria_dmac[0] = 0x01;
+		dmac[0] = 0x01;
+		break;
+
+	case MLX5E_PROMISC:
+		break;
+	}
+
+	tt_vec = mlx5e_get_tt_vec(ai, type);
+
+	if (tt_vec & (1 << MLX5E_TT_ANY)) {
+		MLX5_SET(dest_format_struct, dest, destination_id,
+			 tirn[MLX5E_TT_ANY]);
+		err = mlx5_add_flow_table_entry(ft, match_criteria_enable,
+						match_criteria, flow_context,
+						&ai->ft_ix[MLX5E_TT_ANY]);
+		if (err) {
+			mlx5e_del_eth_addr_from_flow_table(priv, ai);
+			return err;
+		}
+		ai->tt_vec |= (1 << MLX5E_TT_ANY);
+	}
+
+	match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria,
+			 outer_headers.ethertype);
+
+	if (tt_vec & (1 << MLX5E_TT_IPV4)) {
+		MLX5_SET(fte_match_param, match_value, outer_headers.ethertype,
+			 ETH_P_IP);
+		MLX5_SET(dest_format_struct, dest, destination_id,
+			 tirn[MLX5E_TT_IPV4]);
+		err = mlx5_add_flow_table_entry(ft, match_criteria_enable,
+						match_criteria, flow_context,
+						&ai->ft_ix[MLX5E_TT_IPV4]);
+		if (err) {
+			mlx5e_del_eth_addr_from_flow_table(priv, ai);
+			return err;
+		}
+		ai->tt_vec |= (1 << MLX5E_TT_IPV4);
+	}
+
+	if (tt_vec & (1 << MLX5E_TT_IPV6)) {
+		MLX5_SET(fte_match_param, match_value, outer_headers.ethertype,
+			 ETH_P_IPV6);
+		MLX5_SET(dest_format_struct, dest, destination_id,
+			 tirn[MLX5E_TT_IPV6]);
+		err = mlx5_add_flow_table_entry(ft, match_criteria_enable,
+						match_criteria, flow_context,
+						&ai->ft_ix[MLX5E_TT_IPV6]);
+		if (err) {
+			mlx5e_del_eth_addr_from_flow_table(priv, ai);
+			return err;
+		}
+		ai->tt_vec |= (1 << MLX5E_TT_IPV6);
+	}
+
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria,
+			 outer_headers.ip_protocol);
+	MLX5_SET(fte_match_param, match_value, outer_headers.ip_protocol,
+		 IPPROTO_UDP);
+
+	if (tt_vec & (1 << MLX5E_TT_IPV4_UDP)) {
+		MLX5_SET(fte_match_param, match_value, outer_headers.ethertype,
+			 ETH_P_IP);
+		MLX5_SET(dest_format_struct, dest, destination_id,
+			 tirn[MLX5E_TT_IPV4_UDP]);
+		err = mlx5_add_flow_table_entry(ft, match_criteria_enable,
+						match_criteria, flow_context,
+						&ai->ft_ix[MLX5E_TT_IPV4_UDP]);
+		if (err) {
+			mlx5e_del_eth_addr_from_flow_table(priv, ai);
+			return err;
+		}
+		ai->tt_vec |= (1 << MLX5E_TT_IPV4_UDP);
+	}
+
+	if (tt_vec & (1 << MLX5E_TT_IPV6_UDP)) {
+		MLX5_SET(fte_match_param, match_value, outer_headers.ethertype,
+			 ETH_P_IPV6);
+		MLX5_SET(dest_format_struct, dest, destination_id,
+			 tirn[MLX5E_TT_IPV6_UDP]);
+		err = mlx5_add_flow_table_entry(ft, match_criteria_enable,
+						match_criteria, flow_context,
+						&ai->ft_ix[MLX5E_TT_IPV6_UDP]);
+		if (err) {
+			mlx5e_del_eth_addr_from_flow_table(priv, ai);
+			return err;
+		}
+		ai->tt_vec |= (1 << MLX5E_TT_IPV6_UDP);
+	}
+
+	MLX5_SET(fte_match_param, match_value, outer_headers.ip_protocol,
+		 IPPROTO_TCP);
+
+	if (tt_vec & (1 << MLX5E_TT_IPV4_TCP)) {
+		MLX5_SET(fte_match_param, match_value, outer_headers.ethertype,
+			 ETH_P_IP);
+		MLX5_SET(dest_format_struct, dest, destination_id,
+			 tirn[MLX5E_TT_IPV4_TCP]);
+		err = mlx5_add_flow_table_entry(ft, match_criteria_enable,
+						match_criteria, flow_context,
+						&ai->ft_ix[MLX5E_TT_IPV4_TCP]);
+		if (err) {
+			mlx5e_del_eth_addr_from_flow_table(priv, ai);
+			return err;
+		}
+		ai->tt_vec |= (1 << MLX5E_TT_IPV4_TCP);
+	}
+
+	if (tt_vec & (1 << MLX5E_TT_IPV6_TCP)) {
+		MLX5_SET(fte_match_param, match_value, outer_headers.ethertype,
+			 ETH_P_IPV6);
+		MLX5_SET(dest_format_struct, dest, destination_id,
+			 tirn[MLX5E_TT_IPV6_TCP]);
+		err = mlx5_add_flow_table_entry(ft, match_criteria_enable,
+						match_criteria, flow_context,
+						&ai->ft_ix[MLX5E_TT_IPV6_TCP]);
+		if (err) {
+			mlx5e_del_eth_addr_from_flow_table(priv, ai);
+			return err;
+		}
+		ai->tt_vec |= (1 << MLX5E_TT_IPV6_TCP);
+	}
+
+	return 0;
+}
+
+static int mlx5e_add_eth_addr_rule(struct mlx5e_priv *priv,
+				   struct mlx5e_eth_addr_info *ai, int type)
+{
+	u32 *flow_context;
+	u32 *match_criteria;
+	int err;
+
+	flow_context   = mlx5_vzalloc(MLX5_ST_SZ_BYTES(flow_context) +
+				      MLX5_ST_SZ_BYTES(dest_format_struct));
+	match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param));
+	if (!flow_context || !match_criteria) {
+		netdev_err(priv->netdev, "%s: alloc failed\n", __func__);
+		err = -ENOMEM;
+		goto add_eth_addr_rule_out;
+	}
+
+	err = __mlx5e_add_eth_addr_rule(priv, ai, type, flow_context,
+					match_criteria);
+	if (err)
+		netdev_err(priv->netdev, "%s: failed\n", __func__);
+
+add_eth_addr_rule_out:
+	kvfree(match_criteria);
+	kvfree(flow_context);
+	return err;
+}
+
+enum mlx5e_vlan_rule_type {
+	MLX5E_VLAN_RULE_TYPE_UNTAGGED,
+	MLX5E_VLAN_RULE_TYPE_ANY_VID,
+	MLX5E_VLAN_RULE_TYPE_MATCH_VID,
+};
+
+static int mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
+			       enum mlx5e_vlan_rule_type rule_type, u16 vid)
+{
+	u8 match_criteria_enable = 0;
+	u32 *flow_context;
+	void *match_value;
+	void *dest;
+	u32 *match_criteria;
+	u32 *ft_ix;
+	int err;
+
+	flow_context   = mlx5_vzalloc(MLX5_ST_SZ_BYTES(flow_context) +
+				      MLX5_ST_SZ_BYTES(dest_format_struct));
+	match_criteria = mlx5_vzalloc(MLX5_ST_SZ_BYTES(fte_match_param));
+	if (!flow_context || !match_criteria) {
+		netdev_err(priv->netdev, "%s: alloc failed\n", __func__);
+		err = -ENOMEM;
+		goto add_vlan_rule_out;
+	}
+	match_value = MLX5_ADDR_OF(flow_context, flow_context, match_value);
+	dest = MLX5_ADDR_OF(flow_context, flow_context, destination);
+
+	MLX5_SET(flow_context, flow_context, action,
+		 MLX5_FLOW_CONTEXT_ACTION_FWD_DEST);
+	MLX5_SET(flow_context, flow_context, destination_list_size, 1);
+	MLX5_SET(dest_format_struct, dest, destination_type,
+		 MLX5_FLOW_CONTEXT_DEST_TYPE_FLOW_TABLE);
+	MLX5_SET(dest_format_struct, dest, destination_id,
+		 mlx5_get_flow_table_id(priv->ft.main));
+
+	match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	MLX5_SET_TO_ONES(fte_match_param, match_criteria,
+			 outer_headers.vlan_tag);
+
+	switch (rule_type) {
+	case MLX5E_VLAN_RULE_TYPE_UNTAGGED:
+		ft_ix = &priv->vlan.untagged_rule_ft_ix;
+		break;
+	case MLX5E_VLAN_RULE_TYPE_ANY_VID:
+		ft_ix = &priv->vlan.any_vlan_rule_ft_ix;
+		MLX5_SET(fte_match_param, match_value, outer_headers.vlan_tag,
+			 1);
+		break;
+	default: /* MLX5E_VLAN_RULE_TYPE_MATCH_VID */
+		ft_ix = &priv->vlan.active_vlans_ft_ix[vid];
+		MLX5_SET(fte_match_param, match_value, outer_headers.vlan_tag,
+			 1);
+		MLX5_SET_TO_ONES(fte_match_param, match_criteria,
+				 outer_headers.first_vid);
+		MLX5_SET(fte_match_param, match_value, outer_headers.first_vid,
+			 vid);
+		break;
+	}
+
+	err = mlx5_add_flow_table_entry(priv->ft.vlan, match_criteria_enable,
+					match_criteria, flow_context, ft_ix);
+	if (err)
+		netdev_err(priv->netdev, "%s: failed\n", __func__);
+
+add_vlan_rule_out:
+	kvfree(match_criteria);
+	kvfree(flow_context);
+	return err;
+}
+
+static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv,
+				enum mlx5e_vlan_rule_type rule_type, u16 vid)
+{
+	switch (rule_type) {
+	case MLX5E_VLAN_RULE_TYPE_UNTAGGED:
+		mlx5_del_flow_table_entry(priv->ft.vlan,
+					  priv->vlan.untagged_rule_ft_ix);
+		break;
+	case MLX5E_VLAN_RULE_TYPE_ANY_VID:
+		mlx5_del_flow_table_entry(priv->ft.vlan,
+					  priv->vlan.any_vlan_rule_ft_ix);
+		break;
+	case MLX5E_VLAN_RULE_TYPE_MATCH_VID:
+		mlx5_del_flow_table_entry(priv->ft.vlan,
+					  priv->vlan.active_vlans_ft_ix[vid]);
+		break;
+	}
+}
+
+void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv)
+{
+	WARN_ON(!mutex_is_locked(&priv->state_lock));
+
+	if (priv->vlan.filter_disabled) {
+		priv->vlan.filter_disabled = false;
+		if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+			mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_VID,
+					    0);
+	}
+}
+
+void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv)
+{
+	WARN_ON(!mutex_is_locked(&priv->state_lock));
+
+	if (!priv->vlan.filter_disabled) {
+		priv->vlan.filter_disabled = true;
+		if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+			mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_VID,
+					    0);
+	}
+}
+
+int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto,
+			  u16 vid)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int err = 0;
+
+	mutex_lock(&priv->state_lock);
+
+	set_bit(vid, priv->vlan.active_vlans);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID,
+					  vid);
+
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto,
+			   u16 vid)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	mutex_lock(&priv->state_lock);
+
+	clear_bit(vid, priv->vlan.active_vlans);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, vid);
+
+	mutex_unlock(&priv->state_lock);
+
+	return 0;
+}
+
+int mlx5e_add_all_vlan_rules(struct mlx5e_priv *priv)
+{
+	u16 vid;
+	int err;
+
+	for_each_set_bit(vid, priv->vlan.active_vlans, VLAN_N_VID) {
+		err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID,
+					  vid);
+		if (err)
+			return err;
+	}
+
+	err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
+	if (err)
+		return err;
+
+	if (priv->vlan.filter_disabled) {
+		err = mlx5e_add_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_VID,
+					  0);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+void mlx5e_del_all_vlan_rules(struct mlx5e_priv *priv)
+{
+	u16 vid;
+
+	if (priv->vlan.filter_disabled)
+		mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_ANY_VID, 0);
+
+	mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_UNTAGGED, 0);
+
+	for_each_set_bit(vid, priv->vlan.active_vlans, VLAN_N_VID)
+		mlx5e_del_vlan_rule(priv, MLX5E_VLAN_RULE_TYPE_MATCH_VID, vid);
+}
+
+#define mlx5e_for_each_hash_node(hn, tmp, hash, i) \
+	for (i = 0; i < MLX5E_ETH_ADDR_HASH_SIZE; i++) \
+		hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist)
+
+static void mlx5e_execute_action(struct mlx5e_priv *priv,
+				 struct mlx5e_eth_addr_hash_node *hn)
+{
+	switch (hn->action) {
+	case MLX5E_ACTION_ADD:
+		mlx5e_add_eth_addr_rule(priv, &hn->ai, MLX5E_FULLMATCH);
+		hn->action = MLX5E_ACTION_NONE;
+		break;
+
+	case MLX5E_ACTION_DEL:
+		mlx5e_del_eth_addr_from_flow_table(priv, &hn->ai);
+		mlx5e_del_eth_addr_from_hash(hn);
+		break;
+	}
+}
+
+static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	struct netdev_hw_addr *ha;
+
+	netif_addr_lock_bh(netdev);
+
+	mlx5e_add_eth_addr_to_hash(priv->eth_addr.netdev_uc,
+				   priv->netdev->dev_addr);
+
+	netdev_for_each_uc_addr(ha, netdev)
+		mlx5e_add_eth_addr_to_hash(priv->eth_addr.netdev_uc, ha->addr);
+
+	netdev_for_each_mc_addr(ha, netdev)
+		mlx5e_add_eth_addr_to_hash(priv->eth_addr.netdev_mc, ha->addr);
+
+	netif_addr_unlock_bh(netdev);
+}
+
+static void mlx5e_apply_netdev_addr(struct mlx5e_priv *priv)
+{
+	struct mlx5e_eth_addr_hash_node *hn;
+	struct hlist_node *tmp;
+	int i;
+
+	mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.netdev_uc, i)
+		mlx5e_execute_action(priv, hn);
+
+	mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.netdev_mc, i)
+		mlx5e_execute_action(priv, hn);
+}
+
+static void mlx5e_handle_netdev_addr(struct mlx5e_priv *priv)
+{
+	struct mlx5e_eth_addr_hash_node *hn;
+	struct hlist_node *tmp;
+	int i;
+
+	mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.netdev_uc, i)
+		hn->action = MLX5E_ACTION_DEL;
+	mlx5e_for_each_hash_node(hn, tmp, priv->eth_addr.netdev_mc, i)
+		hn->action = MLX5E_ACTION_DEL;
+
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		mlx5e_sync_netdev_addr(priv);
+
+	mlx5e_apply_netdev_addr(priv);
+}
+
+void mlx5e_set_rx_mode_core(struct mlx5e_priv *priv)
+{
+	struct mlx5e_eth_addr_db *ea = &priv->eth_addr;
+	struct net_device *ndev = priv->netdev;
+
+	bool rx_mode_enable   = test_bit(MLX5E_STATE_OPENED, &priv->state);
+	bool promisc_enabled   = rx_mode_enable && (ndev->flags & IFF_PROMISC);
+	bool allmulti_enabled  = rx_mode_enable && (ndev->flags & IFF_ALLMULTI);
+	bool broadcast_enabled = rx_mode_enable;
+
+	bool enable_promisc    = !ea->promisc_enabled   &&  promisc_enabled;
+	bool disable_promisc   =  ea->promisc_enabled   && !promisc_enabled;
+	bool enable_allmulti   = !ea->allmulti_enabled  &&  allmulti_enabled;
+	bool disable_allmulti  =  ea->allmulti_enabled  && !allmulti_enabled;
+	bool enable_broadcast  = !ea->broadcast_enabled &&  broadcast_enabled;
+	bool disable_broadcast =  ea->broadcast_enabled && !broadcast_enabled;
+
+	if (enable_promisc)
+		mlx5e_add_eth_addr_rule(priv, &ea->promisc, MLX5E_PROMISC);
+	if (enable_allmulti)
+		mlx5e_add_eth_addr_rule(priv, &ea->allmulti, MLX5E_ALLMULTI);
+	if (enable_broadcast)
+		mlx5e_add_eth_addr_rule(priv, &ea->broadcast, MLX5E_FULLMATCH);
+
+	mlx5e_handle_netdev_addr(priv);
+
+	if (disable_broadcast)
+		mlx5e_del_eth_addr_from_flow_table(priv, &ea->broadcast);
+	if (disable_allmulti)
+		mlx5e_del_eth_addr_from_flow_table(priv, &ea->allmulti);
+	if (disable_promisc)
+		mlx5e_del_eth_addr_from_flow_table(priv, &ea->promisc);
+
+	ea->promisc_enabled   = promisc_enabled;
+	ea->allmulti_enabled  = allmulti_enabled;
+	ea->broadcast_enabled = broadcast_enabled;
+}
+
+void mlx5e_set_rx_mode_work(struct work_struct *work)
+{
+	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
+					       set_rx_mode_work);
+
+	mutex_lock(&priv->state_lock);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		mlx5e_set_rx_mode_core(priv);
+	mutex_unlock(&priv->state_lock);
+}
+
+void mlx5e_init_eth_addr(struct mlx5e_priv *priv)
+{
+	ether_addr_copy(priv->eth_addr.broadcast.addr, priv->netdev->broadcast);
+}
+
+static int mlx5e_create_main_flow_table(struct mlx5e_priv *priv)
+{
+	struct mlx5_flow_table_group *g;
+	u8 *dmac;
+
+	g = kcalloc(9, sizeof(*g), GFP_KERNEL);
+
+	g[0].log_sz = 2;
+	g[0].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	MLX5_SET_TO_ONES(fte_match_param, g[0].match_criteria,
+			 outer_headers.ethertype);
+	MLX5_SET_TO_ONES(fte_match_param, g[0].match_criteria,
+			 outer_headers.ip_protocol);
+
+	g[1].log_sz = 1;
+	g[1].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	MLX5_SET_TO_ONES(fte_match_param, g[1].match_criteria,
+			 outer_headers.ethertype);
+
+	g[2].log_sz = 0;
+
+	g[3].log_sz = 14;
+	g[3].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	dmac = MLX5_ADDR_OF(fte_match_param, g[3].match_criteria,
+			    outer_headers.dmac_47_16);
+	memset(dmac, 0xff, ETH_ALEN);
+	MLX5_SET_TO_ONES(fte_match_param, g[3].match_criteria,
+			 outer_headers.ethertype);
+	MLX5_SET_TO_ONES(fte_match_param, g[3].match_criteria,
+			 outer_headers.ip_protocol);
+
+	g[4].log_sz = 13;
+	g[4].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	dmac = MLX5_ADDR_OF(fte_match_param, g[4].match_criteria,
+			    outer_headers.dmac_47_16);
+	memset(dmac, 0xff, ETH_ALEN);
+	MLX5_SET_TO_ONES(fte_match_param, g[4].match_criteria,
+			 outer_headers.ethertype);
+
+	g[5].log_sz = 11;
+	g[5].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	dmac = MLX5_ADDR_OF(fte_match_param, g[5].match_criteria,
+			    outer_headers.dmac_47_16);
+	memset(dmac, 0xff, ETH_ALEN);
+
+	g[6].log_sz = 2;
+	g[6].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	dmac = MLX5_ADDR_OF(fte_match_param, g[6].match_criteria,
+			    outer_headers.dmac_47_16);
+	dmac[0] = 0x01;
+	MLX5_SET_TO_ONES(fte_match_param, g[6].match_criteria,
+			 outer_headers.ethertype);
+	MLX5_SET_TO_ONES(fte_match_param, g[6].match_criteria,
+			 outer_headers.ip_protocol);
+
+	g[7].log_sz = 1;
+	g[7].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	dmac = MLX5_ADDR_OF(fte_match_param, g[7].match_criteria,
+			    outer_headers.dmac_47_16);
+	dmac[0] = 0x01;
+	MLX5_SET_TO_ONES(fte_match_param, g[7].match_criteria,
+			 outer_headers.ethertype);
+
+	g[8].log_sz = 0;
+	g[8].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	dmac = MLX5_ADDR_OF(fte_match_param, g[8].match_criteria,
+			    outer_headers.dmac_47_16);
+	dmac[0] = 0x01;
+	priv->ft.main = mlx5_create_flow_table(priv->mdev, 1,
+					       MLX5_FLOW_TABLE_TYPE_NIC_RCV,
+					       9, g);
+	kfree(g);
+
+	return priv->ft.main ? 0 : -ENOMEM;
+}
+
+static void mlx5e_destroy_main_flow_table(struct mlx5e_priv *priv)
+{
+	mlx5_destroy_flow_table(priv->ft.main);
+}
+
+static int mlx5e_create_vlan_flow_table(struct mlx5e_priv *priv)
+{
+	struct mlx5_flow_table_group *g;
+
+	g = kcalloc(2, sizeof(*g), GFP_KERNEL);
+	if (!g)
+		return -ENOMEM;
+
+	g[0].log_sz = 12;
+	g[0].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	MLX5_SET_TO_ONES(fte_match_param, g[0].match_criteria,
+			 outer_headers.vlan_tag);
+	MLX5_SET_TO_ONES(fte_match_param, g[0].match_criteria,
+			 outer_headers.first_vid);
+
+	/* untagged + any vlan id */
+	g[1].log_sz = 1;
+	g[1].match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	MLX5_SET_TO_ONES(fte_match_param, g[1].match_criteria,
+			 outer_headers.vlan_tag);
+
+	priv->ft.vlan = mlx5_create_flow_table(priv->mdev, 0,
+					       MLX5_FLOW_TABLE_TYPE_NIC_RCV,
+					       2, g);
+
+	kfree(g);
+	return priv->ft.vlan ? 0 : -ENOMEM;
+}
+
+static void mlx5e_destroy_vlan_flow_table(struct mlx5e_priv *priv)
+{
+	mlx5_destroy_flow_table(priv->ft.vlan);
+}
+
+int mlx5e_open_flow_table(struct mlx5e_priv *priv)
+{
+	int err;
+
+	err = mlx5e_create_main_flow_table(priv);
+	if (err)
+		return err;
+
+	err = mlx5e_create_vlan_flow_table(priv);
+	if (err)
+		goto err_destroy_main_flow_table;
+
+	return 0;
+
+err_destroy_main_flow_table:
+	mlx5e_destroy_main_flow_table(priv);
+
+	return err;
+}
+
+void mlx5e_close_flow_table(struct mlx5e_priv *priv)
+{
+	mlx5e_destroy_vlan_flow_table(priv);
+	mlx5e_destroy_main_flow_table(priv);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/flow_table.c b/drivers/net/ethernet/mellanox/mlx5/core/flow_table.c
new file mode 100644
index 000000000000..ca90b9bc3b95
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/flow_table.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/flow_table.h>
+#include "mlx5_core.h"
+
+struct mlx5_ftg {
+	struct mlx5_flow_table_group    g;
+	u32				id;
+	u32				start_ix;
+};
+
+struct mlx5_flow_table {
+	struct mlx5_core_dev	*dev;
+	u8			level;
+	u8			type;
+	u32			id;
+	struct mutex		mutex; /* sync bitmap alloc */
+	u16			num_groups;
+	struct mlx5_ftg		*group;
+	unsigned long		*bitmap;
+	u32			size;
+};
+
+static int mlx5_set_flow_entry_cmd(struct mlx5_flow_table *ft, u32 group_ix,
+				   u32 flow_index, void *flow_context)
+{
+	u32 out[MLX5_ST_SZ_DW(set_fte_out)];
+	u32 *in;
+	void *in_flow_context;
+	int fcdls =
+		MLX5_GET(flow_context, flow_context, destination_list_size) *
+		MLX5_ST_SZ_BYTES(dest_format_struct);
+	int inlen = MLX5_ST_SZ_BYTES(set_fte_in) + fcdls;
+	int err;
+
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		mlx5_core_warn(ft->dev, "failed to allocate inbox\n");
+		return -ENOMEM;
+	}
+
+	MLX5_SET(set_fte_in, in, table_type, ft->type);
+	MLX5_SET(set_fte_in, in, table_id,   ft->id);
+	MLX5_SET(set_fte_in, in, flow_index, flow_index);
+	MLX5_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY);
+
+	in_flow_context = MLX5_ADDR_OF(set_fte_in, in, flow_context);
+	memcpy(in_flow_context, flow_context,
+	       MLX5_ST_SZ_BYTES(flow_context) + fcdls);
+
+	MLX5_SET(flow_context, in_flow_context, group_id,
+		 ft->group[group_ix].id);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(ft->dev, in, inlen, out,
+					 sizeof(out));
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5_del_flow_entry_cmd(struct mlx5_flow_table *ft, u32 flow_index)
+{
+	u32 in[MLX5_ST_SZ_DW(delete_fte_in)];
+	u32 out[MLX5_ST_SZ_DW(delete_fte_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+#define MLX5_SET_DFTEI(p, x, v) MLX5_SET(delete_fte_in, p, x, v)
+	MLX5_SET_DFTEI(in, table_type, ft->type);
+	MLX5_SET_DFTEI(in, table_id,   ft->id);
+	MLX5_SET_DFTEI(in, flow_index, flow_index);
+	MLX5_SET_DFTEI(in, opcode,     MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY);
+
+	mlx5_cmd_exec_check_status(ft->dev, in, sizeof(in), out, sizeof(out));
+}
+
+static void mlx5_destroy_flow_group_cmd(struct mlx5_flow_table *ft, int i)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_flow_group_in)];
+	u32 out[MLX5_ST_SZ_DW(destroy_flow_group_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+#define MLX5_SET_DFGI(p, x, v) MLX5_SET(destroy_flow_group_in, p, x, v)
+	MLX5_SET_DFGI(in, table_type, ft->type);
+	MLX5_SET_DFGI(in, table_id,   ft->id);
+	MLX5_SET_DFGI(in, opcode, MLX5_CMD_OP_DESTROY_FLOW_GROUP);
+	MLX5_SET_DFGI(in, group_id, ft->group[i].id);
+	mlx5_cmd_exec_check_status(ft->dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_create_flow_group_cmd(struct mlx5_flow_table *ft, int i)
+{
+	u32 out[MLX5_ST_SZ_DW(create_flow_group_out)];
+	u32 *in;
+	void *in_match_criteria;
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	struct mlx5_flow_table_group *g = &ft->group[i].g;
+	u32 start_ix = ft->group[i].start_ix;
+	u32 end_ix = start_ix + (1 << g->log_sz) - 1;
+	int err;
+
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		mlx5_core_warn(ft->dev, "failed to allocate inbox\n");
+		return -ENOMEM;
+	}
+	in_match_criteria = MLX5_ADDR_OF(create_flow_group_in, in,
+					 match_criteria);
+
+	memset(out, 0, sizeof(out));
+
+#define MLX5_SET_CFGI(p, x, v) MLX5_SET(create_flow_group_in, p, x, v)
+	MLX5_SET_CFGI(in, table_type,            ft->type);
+	MLX5_SET_CFGI(in, table_id,              ft->id);
+	MLX5_SET_CFGI(in, opcode,                MLX5_CMD_OP_CREATE_FLOW_GROUP);
+	MLX5_SET_CFGI(in, start_flow_index,      start_ix);
+	MLX5_SET_CFGI(in, end_flow_index,        end_ix);
+	MLX5_SET_CFGI(in, match_criteria_enable, g->match_criteria_enable);
+
+	memcpy(in_match_criteria, g->match_criteria,
+	       MLX5_ST_SZ_BYTES(fte_match_param));
+
+	err = mlx5_cmd_exec_check_status(ft->dev, in, inlen, out,
+					 sizeof(out));
+	if (!err)
+		ft->group[i].id = MLX5_GET(create_flow_group_out, out,
+					   group_id);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5_destroy_flow_table_groups(struct mlx5_flow_table *ft)
+{
+	int i;
+
+	for (i = 0; i < ft->num_groups; i++)
+		mlx5_destroy_flow_group_cmd(ft, i);
+}
+
+static int mlx5_create_flow_table_groups(struct mlx5_flow_table *ft)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < ft->num_groups; i++) {
+		err = mlx5_create_flow_group_cmd(ft, i);
+		if (err)
+			goto err_destroy_flow_table_groups;
+	}
+
+	return 0;
+
+err_destroy_flow_table_groups:
+	for (i--; i >= 0; i--)
+		mlx5_destroy_flow_group_cmd(ft, i);
+
+	return err;
+}
+
+static int mlx5_create_flow_table_cmd(struct mlx5_flow_table *ft)
+{
+	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)];
+	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(create_flow_table_in, in, table_type, ft->type);
+	MLX5_SET(create_flow_table_in, in, level,      ft->level);
+	MLX5_SET(create_flow_table_in, in, log_size,   order_base_2(ft->size));
+
+	MLX5_SET(create_flow_table_in, in, opcode,
+		 MLX5_CMD_OP_CREATE_FLOW_TABLE);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(ft->dev, in, sizeof(in), out,
+					 sizeof(out));
+	if (err)
+		return err;
+
+	ft->id = MLX5_GET(create_flow_table_out, out, table_id);
+
+	return 0;
+}
+
+static void mlx5_destroy_flow_table_cmd(struct mlx5_flow_table *ft)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_flow_table_in)];
+	u32 out[MLX5_ST_SZ_DW(destroy_flow_table_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+#define MLX5_SET_DFTI(p, x, v) MLX5_SET(destroy_flow_table_in, p, x, v)
+	MLX5_SET_DFTI(in, table_type, ft->type);
+	MLX5_SET_DFTI(in, table_id,   ft->id);
+	MLX5_SET_DFTI(in, opcode, MLX5_CMD_OP_DESTROY_FLOW_TABLE);
+
+	mlx5_cmd_exec_check_status(ft->dev, in, sizeof(in), out, sizeof(out));
+}
+
+static int mlx5_find_group(struct mlx5_flow_table *ft, u8 match_criteria_enable,
+			   u32 *match_criteria, int *group_ix)
+{
+	void *mc_outer = MLX5_ADDR_OF(fte_match_param, match_criteria,
+				      outer_headers);
+	void *mc_misc  = MLX5_ADDR_OF(fte_match_param, match_criteria,
+				      misc_parameters);
+	void *mc_inner = MLX5_ADDR_OF(fte_match_param, match_criteria,
+				      inner_headers);
+	int mc_outer_sz = MLX5_ST_SZ_BYTES(fte_match_set_lyr_2_4);
+	int mc_misc_sz  = MLX5_ST_SZ_BYTES(fte_match_set_misc);
+	int mc_inner_sz = MLX5_ST_SZ_BYTES(fte_match_set_lyr_2_4);
+	int i;
+
+	for (i = 0; i < ft->num_groups; i++) {
+		struct mlx5_flow_table_group *g = &ft->group[i].g;
+		void *gmc_outer = MLX5_ADDR_OF(fte_match_param,
+					       g->match_criteria,
+					       outer_headers);
+		void *gmc_misc  = MLX5_ADDR_OF(fte_match_param,
+					       g->match_criteria,
+					       misc_parameters);
+		void *gmc_inner = MLX5_ADDR_OF(fte_match_param,
+					       g->match_criteria,
+					       inner_headers);
+
+		if (g->match_criteria_enable != match_criteria_enable)
+			continue;
+
+		if (match_criteria_enable & MLX5_MATCH_OUTER_HEADERS)
+			if (memcmp(mc_outer, gmc_outer, mc_outer_sz))
+				continue;
+
+		if (match_criteria_enable & MLX5_MATCH_MISC_PARAMETERS)
+			if (memcmp(mc_misc, gmc_misc, mc_misc_sz))
+				continue;
+
+		if (match_criteria_enable & MLX5_MATCH_INNER_HEADERS)
+			if (memcmp(mc_inner, gmc_inner, mc_inner_sz))
+				continue;
+
+		*group_ix = i;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int alloc_flow_index(struct mlx5_flow_table *ft, int group_ix, u32 *ix)
+{
+	struct mlx5_ftg *g = &ft->group[group_ix];
+	int err = 0;
+
+	mutex_lock(&ft->mutex);
+
+	*ix = find_next_zero_bit(ft->bitmap, ft->size, g->start_ix);
+	if (*ix >= (g->start_ix + (1 << g->g.log_sz)))
+		err = -ENOSPC;
+	else
+		__set_bit(*ix, ft->bitmap);
+
+	mutex_unlock(&ft->mutex);
+
+	return err;
+}
+
+static void mlx5_free_flow_index(struct mlx5_flow_table *ft, u32 ix)
+{
+	__clear_bit(ix, ft->bitmap);
+}
+
+int mlx5_add_flow_table_entry(void *flow_table, u8 match_criteria_enable,
+			      void *match_criteria, void *flow_context,
+			      u32 *flow_index)
+{
+	struct mlx5_flow_table *ft = flow_table;
+	int group_ix;
+	int err;
+
+	err = mlx5_find_group(ft, match_criteria_enable, match_criteria,
+			      &group_ix);
+	if (err) {
+		mlx5_core_warn(ft->dev, "mlx5_find_group failed\n");
+		return err;
+	}
+
+	err = alloc_flow_index(ft, group_ix, flow_index);
+	if (err) {
+		mlx5_core_warn(ft->dev, "alloc_flow_index failed\n");
+		return err;
+	}
+
+	return mlx5_set_flow_entry_cmd(ft, group_ix, *flow_index, flow_context);
+}
+EXPORT_SYMBOL(mlx5_add_flow_table_entry);
+
+void mlx5_del_flow_table_entry(void *flow_table, u32 flow_index)
+{
+	struct mlx5_flow_table *ft = flow_table;
+
+	mlx5_del_flow_entry_cmd(ft, flow_index);
+	mlx5_free_flow_index(ft, flow_index);
+}
+EXPORT_SYMBOL(mlx5_del_flow_table_entry);
+
+void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type,
+			     u16 num_groups,
+			     struct mlx5_flow_table_group *group)
+{
+	struct mlx5_flow_table *ft;
+	u32 start_ix = 0;
+	u32 ft_size = 0;
+	void *gr;
+	void *bm;
+	int err;
+	int i;
+
+	for (i = 0; i < num_groups; i++)
+		ft_size += (1 << group[i].log_sz);
+
+	ft = kzalloc(sizeof(*ft), GFP_KERNEL);
+	gr = kcalloc(num_groups, sizeof(struct mlx5_ftg), GFP_KERNEL);
+	bm = kcalloc(BITS_TO_LONGS(ft_size), sizeof(uintptr_t), GFP_KERNEL);
+	if (!ft || !gr || !bm)
+		goto err_free_ft;
+
+	ft->group	= gr;
+	ft->bitmap	= bm;
+	ft->num_groups	= num_groups;
+	ft->level	= level;
+	ft->type	= table_type;
+	ft->size	= ft_size;
+	ft->dev		= dev;
+	mutex_init(&ft->mutex);
+
+	for (i = 0; i < ft->num_groups; i++) {
+		memcpy(&ft->group[i].g, &group[i], sizeof(*group));
+		ft->group[i].start_ix = start_ix;
+		start_ix += 1 << group[i].log_sz;
+	}
+
+	err = mlx5_create_flow_table_cmd(ft);
+	if (err)
+		goto err_free_ft;
+
+	err = mlx5_create_flow_table_groups(ft);
+	if (err)
+		goto err_destroy_flow_table_cmd;
+
+	return ft;
+
+err_destroy_flow_table_cmd:
+	mlx5_destroy_flow_table_cmd(ft);
+
+err_free_ft:
+	mlx5_core_warn(dev, "failed to alloc flow table\n");
+	kfree(bm);
+	kfree(gr);
+	kfree(ft);
+
+	return NULL;
+}
+EXPORT_SYMBOL(mlx5_create_flow_table);
+
+void mlx5_destroy_flow_table(void *flow_table)
+{
+	struct mlx5_flow_table *ft = flow_table;
+
+	mlx5_destroy_flow_table_groups(ft);
+	mlx5_destroy_flow_table_cmd(ft);
+	kfree(ft->bitmap);
+	kfree(ft->group);
+	kfree(ft);
+}
+EXPORT_SYMBOL(mlx5_destroy_flow_table);
+
+u32 mlx5_get_flow_table_id(void *flow_table)
+{
+	struct mlx5_flow_table *ft = flow_table;
+
+	return ft->id;
+}
+EXPORT_SYMBOL(mlx5_get_flow_table_id);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
new file mode 100644
index 000000000000..3c555d708af1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "transobj.h"
+
+int mlx5_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_rq_out)];
+	int err;
+
+	MLX5_SET(create_rq_in, in, opcode, MLX5_CMD_OP_CREATE_RQ);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*rqn = MLX5_GET(create_rq_out, out, rqn);
+
+	return err;
+}
+
+int mlx5_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_rq_out)];
+
+	MLX5_SET(modify_rq_in, in, rqn, rqn);
+	MLX5_SET(modify_rq_in, in, opcode, MLX5_CMD_OP_MODIFY_RQ);
+
+	memset(out, 0, sizeof(out));
+	return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+
+void mlx5_destroy_rq(struct mlx5_core_dev *dev, u32 rqn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rq_in)];
+	u32 out[MLX5_ST_SZ_DW(destroy_rq_out)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ);
+	MLX5_SET(destroy_rq_in, in, rqn, rqn);
+
+	mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_sq_out)];
+	int err;
+
+	MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*sqn = MLX5_GET(create_sq_out, out, sqn);
+
+	return err;
+}
+
+int mlx5_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_sq_out)];
+
+	MLX5_SET(modify_sq_in, in, sqn, sqn);
+	MLX5_SET(modify_sq_in, in, opcode, MLX5_CMD_OP_MODIFY_SQ);
+
+	memset(out, 0, sizeof(out));
+	return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+
+void mlx5_destroy_sq(struct mlx5_core_dev *dev, u32 sqn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_sq_in)];
+	u32 out[MLX5_ST_SZ_DW(destroy_sq_out)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ);
+	MLX5_SET(destroy_sq_in, in, sqn, sqn);
+
+	mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_tir_out)];
+	int err;
+
+	MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*tirn = MLX5_GET(create_tir_out, out, tirn);
+
+	return err;
+}
+
+void mlx5_destroy_tir(struct mlx5_core_dev *dev, u32 tirn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_tir_out)];
+	u32 out[MLX5_ST_SZ_DW(destroy_tir_out)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR);
+	MLX5_SET(destroy_tir_in, in, tirn, tirn);
+
+	mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+}
+
+int mlx5_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_tis_out)];
+	int err;
+
+	MLX5_SET(create_tis_in, in, opcode, MLX5_CMD_OP_CREATE_TIS);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*tisn = MLX5_GET(create_tis_out, out, tisn);
+
+	return err;
+}
+
+void mlx5_destroy_tis(struct mlx5_core_dev *dev, u32 tisn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_tis_out)];
+	u32 out[MLX5_ST_SZ_DW(destroy_tis_out)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS);
+	MLX5_SET(destroy_tis_in, in, tisn, tisn);
+
+	mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
new file mode 100644
index 000000000000..1bc898cc4933
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __TRANSOBJ_H__
+#define __TRANSOBJ_H__
+
+int mlx5_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn);
+int mlx5_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen);
+void mlx5_destroy_rq(struct mlx5_core_dev *dev, u32 rqn);
+int mlx5_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn);
+int mlx5_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen);
+void mlx5_destroy_sq(struct mlx5_core_dev *dev, u32 sqn);
+int mlx5_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn);
+void mlx5_destroy_tir(struct mlx5_core_dev *dev, u32 tirn);
+int mlx5_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn);
+void mlx5_destroy_tis(struct mlx5_core_dev *dev, u32 tisn);
+
+#endif /* __TRANSOBJ_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
new file mode 100644
index 000000000000..ba374b9a6c87
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/etherdevice.h>
+#include <linux/mlx5/driver.h>
+#include "vport.h"
+#include "mlx5_core.h"
+
+u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod)
+{
+	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)];
+	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(query_vport_state_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VPORT_STATE);
+	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
+
+	err = mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out,
+					 sizeof(out));
+	if (err)
+		mlx5_core_warn(mdev, "MLX5_CMD_OP_QUERY_VPORT_STATE failed\n");
+
+	return MLX5_GET(query_vport_state_out, out, state);
+}
+
+void mlx5_query_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr)
+{
+	u32  in[MLX5_ST_SZ_DW(query_nic_vport_context_in)];
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out);
+	u8 *out_addr;
+
+	out = mlx5_vzalloc(outlen);
+	if (!out)
+		return;
+
+	out_addr = MLX5_ADDR_OF(query_nic_vport_context_out, out,
+				nic_vport_context.permanent_address);
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(query_nic_vport_context_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT);
+
+	memset(out, 0, outlen);
+	mlx5_cmd_exec_check_status(mdev, in, sizeof(in), out, outlen);
+
+	ether_addr_copy(addr, &out_addr[2]);
+
+	kvfree(out);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.h b/drivers/net/ethernet/mellanox/mlx5/core/vport.h
new file mode 100644
index 000000000000..c05ca2c3419d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_VPORT_H__
+#define __MLX5_VPORT_H__
+
+#include <linux/mlx5/driver.h>
+
+u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod);
+void mlx5_query_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr);
+
+#endif /* __MLX5_VPORT_H__ */
diff --git a/include/linux/mlx5/flow_table.h b/include/linux/mlx5/flow_table.h
new file mode 100644
index 000000000000..5f922c6d4fc2
--- /dev/null
+++ b/include/linux/mlx5/flow_table.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_FLOW_TABLE_H
+#define MLX5_FLOW_TABLE_H
+
+#include <linux/mlx5/driver.h>
+
+struct mlx5_flow_table_group {
+	u8	log_sz;
+	u8	match_criteria_enable;
+	u32	match_criteria[MLX5_ST_SZ_DW(fte_match_param)];
+};
+
+void *mlx5_create_flow_table(struct mlx5_core_dev *dev, u8 level, u8 table_type,
+			     u16 num_groups,
+			     struct mlx5_flow_table_group *group);
+void mlx5_destroy_flow_table(void *flow_table);
+int mlx5_add_flow_table_entry(void *flow_table, u8 match_criteria_enable,
+			      void *match_criteria, void *flow_context,
+			      u32 *flow_index);
+void mlx5_del_flow_table_entry(void *flow_table, u32 flow_index);
+u32 mlx5_get_flow_table_id(void *flow_table);
+
+#endif /* MLX5_FLOW_TABLE_H */
-- 
cgit v1.2.3


From f62b8bb8f2d30582f30f51e85a8c0e1260125d7e Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirv@mellanox.com>
Date: Thu, 28 May 2015 22:28:48 +0300
Subject: net/mlx5: Extend mlx5_core to support ConnectX-4 Ethernet
 functionality

This is the Ethernet part of the driver for the Mellanox ConnectX(R)-4
Single/Dual-Port Adapter supporting 100Gb/s with VPI.  The driver
extends the existing mlx5 driver with Ethernet functionality.

This patch contains the driver entry points but does not include
transmit and receive (see the previous patch in the series) routines.

It also adds the option MLX5_CORE_EN to Kconfig to enable/disable the
Ethernet functionality. Currently, Kconfig is programmed to make
Ethernet and Infiniband functionality mutally exclusive.
Also changed MLX5_INFINIBAND to be depandant on MLX5_CORE instead of
selecting it, since MLX5_CORE could be selected without MLX5_INFINIBAND
being selected.

Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/Kconfig                 |    4 +-
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    |   14 +-
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |    3 +
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |   19 -
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  520 ++++++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  679 +++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 1899 ++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   74 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |    9 +-
 include/linux/mlx5/device.h                        |   19 +
 include/linux/mlx5/driver.h                        |    1 +
 11 files changed, 3213 insertions(+), 28 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_main.c

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig
index 10df386c6344..bce263b92821 100644
--- a/drivers/infiniband/hw/mlx5/Kconfig
+++ b/drivers/infiniband/hw/mlx5/Kconfig
@@ -1,8 +1,6 @@
 config MLX5_INFINIBAND
 	tristate "Mellanox Connect-IB HCA support"
-	depends on NETDEVICES && ETHERNET && PCI
-	select NET_VENDOR_MELLANOX
-	select MLX5_CORE
+	depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE
 	---help---
 	  This driver provides low-level InfiniBand support for
 	  Mellanox Connect-IB PCI Express host channel adapters (HCAs).
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 8ff57e8e3e91..0d7aef040fb0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -3,6 +3,18 @@
 #
 
 config MLX5_CORE
-	tristate
+	tristate "Mellanox Technologies ConnectX-4 and Connect-IB core driver"
 	depends on PCI
 	default n
+	---help---
+	  Core driver for low level functionality of the ConnectX-4 and
+	  Connect-IB cards by Mellanox Technologies.
+
+config MLX5_CORE_EN
+	bool "Mellanox Technologies ConnectX-4 Ethernet support"
+	depends on MLX5_INFINIBAND=n && NETDEVICES && ETHERNET && PCI && MLX5_CORE
+	default n
+	---help---
+	  Ethernet support in Mellanox Technologies ConnectX-4 NIC.
+	  Ethernet and Infiniband support in ConnectX-4 are currently mutually
+	  exclusive.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 105780bb980b..87e9e606596a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -3,3 +3,6 @@ obj-$(CONFIG_MLX5_CORE)		+= mlx5_core.o
 mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o   \
 		mad.o
+mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o vport.o transobj.o \
+		en_main.o en_flow_table.o en_ethtool.o en_tx.o en_rx.o \
+		en_txrx.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 2f22cd2de2b0..75ff58dc1ff5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -75,25 +75,6 @@ enum {
 	MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR		= 0x10,
 };
 
-enum {
-	MLX5_CMD_STAT_OK			= 0x0,
-	MLX5_CMD_STAT_INT_ERR			= 0x1,
-	MLX5_CMD_STAT_BAD_OP_ERR		= 0x2,
-	MLX5_CMD_STAT_BAD_PARAM_ERR		= 0x3,
-	MLX5_CMD_STAT_BAD_SYS_STATE_ERR		= 0x4,
-	MLX5_CMD_STAT_BAD_RES_ERR		= 0x5,
-	MLX5_CMD_STAT_RES_BUSY			= 0x6,
-	MLX5_CMD_STAT_LIM_ERR			= 0x8,
-	MLX5_CMD_STAT_BAD_RES_STATE_ERR		= 0x9,
-	MLX5_CMD_STAT_IX_ERR			= 0xa,
-	MLX5_CMD_STAT_NO_RES_ERR		= 0xf,
-	MLX5_CMD_STAT_BAD_INP_LEN_ERR		= 0x50,
-	MLX5_CMD_STAT_BAD_OUTP_LEN_ERR		= 0x51,
-	MLX5_CMD_STAT_BAD_QP_STATE_ERR		= 0x10,
-	MLX5_CMD_STAT_BAD_PKT_ERR		= 0x30,
-	MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR	= 0x40,
-};
-
 static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd,
 					   struct mlx5_cmd_msg *in,
 					   struct mlx5_cmd_msg *out,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
new file mode 100644
index 000000000000..cbb3c7cb53f7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/if_vlan.h>
+#include <linux/etherdevice.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/cq.h>
+#include "vport.h"
+#include "wq.h"
+#include "transobj.h"
+#include "mlx5_core.h"
+
+#define MLX5E_MAX_NUM_TC	8
+
+#define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE                0x7
+#define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE                0xa
+#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE                0xd
+
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE                0x7
+#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE                0xd
+
+#define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (16 * 1024)
+#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
+#define MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
+#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC      0x10
+#define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS      0x20
+#define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES                0x80
+#define MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ         0x7
+#define MLX5E_PARAMS_MIN_MTU                            46
+
+#define MLX5E_TX_CQ_POLL_BUDGET        128
+#define MLX5E_UPDATE_STATS_INTERVAL    200 /* msecs */
+
+static const char vport_strings[][ETH_GSTRING_LEN] = {
+	/* vport statistics */
+	"rx_packets",
+	"rx_bytes",
+	"tx_packets",
+	"tx_bytes",
+	"rx_error_packets",
+	"rx_error_bytes",
+	"tx_error_packets",
+	"tx_error_bytes",
+	"rx_unicast_packets",
+	"rx_unicast_bytes",
+	"tx_unicast_packets",
+	"tx_unicast_bytes",
+	"rx_multicast_packets",
+	"rx_multicast_bytes",
+	"tx_multicast_packets",
+	"tx_multicast_bytes",
+	"rx_broadcast_packets",
+	"rx_broadcast_bytes",
+	"tx_broadcast_packets",
+	"tx_broadcast_bytes",
+
+	/* SW counters */
+	"tso_packets",
+	"tso_bytes",
+	"lro_packets",
+	"lro_bytes",
+	"rx_csum_good",
+	"rx_csum_none",
+	"tx_csum_offload",
+	"tx_queue_stopped",
+	"tx_queue_wake",
+	"tx_queue_dropped",
+	"rx_wqe_err",
+};
+
+struct mlx5e_vport_stats {
+	/* HW counters */
+	u64 rx_packets;
+	u64 rx_bytes;
+	u64 tx_packets;
+	u64 tx_bytes;
+	u64 rx_error_packets;
+	u64 rx_error_bytes;
+	u64 tx_error_packets;
+	u64 tx_error_bytes;
+	u64 rx_unicast_packets;
+	u64 rx_unicast_bytes;
+	u64 tx_unicast_packets;
+	u64 tx_unicast_bytes;
+	u64 rx_multicast_packets;
+	u64 rx_multicast_bytes;
+	u64 tx_multicast_packets;
+	u64 tx_multicast_bytes;
+	u64 rx_broadcast_packets;
+	u64 rx_broadcast_bytes;
+	u64 tx_broadcast_packets;
+	u64 tx_broadcast_bytes;
+
+	/* SW counters */
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 lro_packets;
+	u64 lro_bytes;
+	u64 rx_csum_good;
+	u64 rx_csum_none;
+	u64 tx_csum_offload;
+	u64 tx_queue_stopped;
+	u64 tx_queue_wake;
+	u64 tx_queue_dropped;
+	u64 rx_wqe_err;
+
+#define NUM_VPORT_COUNTERS     31
+};
+
+static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
+	"packets",
+	"csum_none",
+	"lro_packets",
+	"lro_bytes",
+	"wqe_err"
+};
+
+struct mlx5e_rq_stats {
+	u64 packets;
+	u64 csum_none;
+	u64 lro_packets;
+	u64 lro_bytes;
+	u64 wqe_err;
+#define NUM_RQ_STATS 5
+};
+
+static const char sq_stats_strings[][ETH_GSTRING_LEN] = {
+	"packets",
+	"tso_packets",
+	"tso_bytes",
+	"csum_offload_none",
+	"stopped",
+	"wake",
+	"dropped",
+	"nop"
+};
+
+struct mlx5e_sq_stats {
+	u64 packets;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 csum_offload_none;
+	u64 stopped;
+	u64 wake;
+	u64 dropped;
+	u64 nop;
+#define NUM_SQ_STATS 8
+};
+
+struct mlx5e_stats {
+	struct mlx5e_vport_stats   vport;
+};
+
+struct mlx5e_params {
+	u8  log_sq_size;
+	u8  log_rq_size;
+	u16 num_channels;
+	u8  default_vlan_prio;
+	u8  num_tc;
+	u16 rx_cq_moderation_usec;
+	u16 rx_cq_moderation_pkts;
+	u16 tx_cq_moderation_usec;
+	u16 tx_cq_moderation_pkts;
+	u16 min_rx_wqes;
+	u16 rx_hash_log_tbl_sz;
+	bool lro_en;
+	u32 lro_wqe_sz;
+};
+
+enum {
+	MLX5E_RQ_STATE_POST_WQES_ENABLE,
+};
+
+enum cq_flags {
+	MLX5E_CQ_HAS_CQES = 1,
+};
+
+struct mlx5e_cq {
+	/* data path - accessed per cqe */
+	struct mlx5_cqwq           wq;
+	void                      *sqrq;
+	unsigned long              flags;
+
+	/* data path - accessed per napi poll */
+	struct napi_struct        *napi;
+	struct mlx5_core_cq        mcq;
+	struct mlx5e_channel      *channel;
+
+	/* control */
+	struct mlx5_wq_ctrl        wq_ctrl;
+} ____cacheline_aligned_in_smp;
+
+struct mlx5e_rq {
+	/* data path */
+	struct mlx5_wq_ll      wq;
+	u32                    wqe_sz;
+	struct sk_buff       **skb;
+
+	struct device         *pdev;
+	struct net_device     *netdev;
+	struct mlx5e_rq_stats  stats;
+	struct mlx5e_cq        cq;
+
+	unsigned long          state;
+	int                    ix;
+
+	/* control */
+	struct mlx5_wq_ctrl    wq_ctrl;
+	u32                    rqn;
+	struct mlx5e_channel  *channel;
+} ____cacheline_aligned_in_smp;
+
+struct mlx5e_tx_skb_cb {
+	u32 num_bytes;
+	u8  num_wqebbs;
+	u8  num_dma;
+};
+
+#define MLX5E_TX_SKB_CB(__skb) ((struct mlx5e_tx_skb_cb *)__skb->cb)
+
+struct mlx5e_sq_dma {
+	dma_addr_t addr;
+	u32        size;
+};
+
+enum {
+	MLX5E_SQ_STATE_WAKE_TXQ_ENABLE,
+};
+
+struct mlx5e_sq {
+	/* data path */
+
+	/* dirtied @completion */
+	u16                        cc;
+	u32                        dma_fifo_cc;
+
+	/* dirtied @xmit */
+	u16                        pc ____cacheline_aligned_in_smp;
+	u32                        dma_fifo_pc;
+	u32                        bf_offset;
+	struct mlx5e_sq_stats      stats;
+
+	struct mlx5e_cq            cq;
+
+	/* pointers to per packet info: write@xmit, read@completion */
+	struct sk_buff           **skb;
+	struct mlx5e_sq_dma       *dma_fifo;
+
+	/* read only */
+	struct mlx5_wq_cyc         wq;
+	u32                        dma_fifo_mask;
+	void __iomem              *uar_map;
+	struct netdev_queue       *txq;
+	u32                        sqn;
+	u32                        bf_buf_size;
+	struct device             *pdev;
+	__be32                     mkey_be;
+	unsigned long              state;
+
+	/* control path */
+	struct mlx5_wq_ctrl        wq_ctrl;
+	struct mlx5_uar            uar;
+	struct mlx5e_channel      *channel;
+	int                        tc;
+} ____cacheline_aligned_in_smp;
+
+static inline bool mlx5e_sq_has_room_for(struct mlx5e_sq *sq, u16 n)
+{
+	return (((sq->wq.sz_m1 & (sq->cc - sq->pc)) >= n) ||
+		(sq->cc  == sq->pc));
+}
+
+enum channel_flags {
+	MLX5E_CHANNEL_NAPI_SCHED = 1,
+};
+
+struct mlx5e_channel {
+	/* data path */
+	struct mlx5e_rq            rq;
+	struct mlx5e_sq            sq[MLX5E_MAX_NUM_TC];
+	struct napi_struct         napi;
+	struct device             *pdev;
+	struct net_device         *netdev;
+	__be32                     mkey_be;
+	u8                         num_tc;
+	unsigned long              flags;
+
+	/* control */
+	struct mlx5e_priv         *priv;
+	int                        ix;
+	int                        cpu;
+};
+
+enum mlx5e_traffic_types {
+	MLX5E_TT_IPV4_TCP = 0,
+	MLX5E_TT_IPV6_TCP = 1,
+	MLX5E_TT_IPV4_UDP = 2,
+	MLX5E_TT_IPV6_UDP = 3,
+	MLX5E_TT_IPV4     = 4,
+	MLX5E_TT_IPV6     = 5,
+	MLX5E_TT_ANY      = 6,
+	MLX5E_NUM_TT      = 7,
+};
+
+enum {
+	MLX5E_RQT_SPREADING  = 0,
+	MLX5E_RQT_DEFAULT_RQ = 1,
+	MLX5E_NUM_RQT        = 2,
+};
+
+struct mlx5e_eth_addr_info {
+	u8  addr[ETH_ALEN + 2];
+	u32 tt_vec;
+	u32 ft_ix[MLX5E_NUM_TT]; /* flow table index per traffic type */
+};
+
+#define MLX5E_ETH_ADDR_HASH_SIZE (1 << BITS_PER_BYTE)
+
+struct mlx5e_eth_addr_db {
+	struct hlist_head          netdev_uc[MLX5E_ETH_ADDR_HASH_SIZE];
+	struct hlist_head          netdev_mc[MLX5E_ETH_ADDR_HASH_SIZE];
+	struct mlx5e_eth_addr_info broadcast;
+	struct mlx5e_eth_addr_info allmulti;
+	struct mlx5e_eth_addr_info promisc;
+	bool                       broadcast_enabled;
+	bool                       allmulti_enabled;
+	bool                       promisc_enabled;
+};
+
+enum {
+	MLX5E_STATE_ASYNC_EVENTS_ENABLE,
+	MLX5E_STATE_OPENED,
+};
+
+struct mlx5e_vlan_db {
+	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
+	u32           active_vlans_ft_ix[VLAN_N_VID];
+	u32           untagged_rule_ft_ix;
+	u32           any_vlan_rule_ft_ix;
+	bool          filter_disabled;
+};
+
+struct mlx5e_flow_table {
+	void *vlan;
+	void *main;
+};
+
+struct mlx5e_priv {
+	/* priv data path fields - start */
+	int                        order_base_2_num_channels;
+	int                        queue_mapping_channel_mask;
+	int                        num_tc;
+	int                        default_vlan_prio;
+	/* priv data path fields - end */
+
+	unsigned long              state;
+	struct mutex               state_lock; /* Protects Interface state */
+	struct mlx5_uar            cq_uar;
+	u32                        pdn;
+	struct mlx5_core_mr        mr;
+
+	struct mlx5e_channel     **channel;
+	u32                        tisn[MLX5E_MAX_NUM_TC];
+	u32                        rqtn;
+	u32                        tirn[MLX5E_NUM_TT];
+
+	struct mlx5e_flow_table    ft;
+	struct mlx5e_eth_addr_db   eth_addr;
+	struct mlx5e_vlan_db       vlan;
+
+	struct mlx5e_params        params;
+	spinlock_t                 async_events_spinlock; /* sync hw events */
+	struct work_struct         update_carrier_work;
+	struct work_struct         set_rx_mode_work;
+	struct delayed_work        update_stats_work;
+
+	struct mlx5_core_dev      *mdev;
+	struct net_device         *netdev;
+	struct mlx5e_stats         stats;
+};
+
+#define MLX5E_NET_IP_ALIGN 2
+
+struct mlx5e_tx_wqe {
+	struct mlx5_wqe_ctrl_seg ctrl;
+	struct mlx5_wqe_eth_seg  eth;
+};
+
+struct mlx5e_rx_wqe {
+	struct mlx5_wqe_srq_next_seg  next;
+	struct mlx5_wqe_data_seg      data;
+};
+
+enum mlx5e_link_mode {
+	MLX5E_1000BASE_CX_SGMII	 = 0,
+	MLX5E_1000BASE_KX	 = 1,
+	MLX5E_10GBASE_CX4	 = 2,
+	MLX5E_10GBASE_KX4	 = 3,
+	MLX5E_10GBASE_KR	 = 4,
+	MLX5E_20GBASE_KR2	 = 5,
+	MLX5E_40GBASE_CR4	 = 6,
+	MLX5E_40GBASE_KR4	 = 7,
+	MLX5E_56GBASE_R4	 = 8,
+	MLX5E_10GBASE_CR	 = 12,
+	MLX5E_10GBASE_SR	 = 13,
+	MLX5E_10GBASE_ER	 = 14,
+	MLX5E_40GBASE_SR4	 = 15,
+	MLX5E_40GBASE_LR4	 = 16,
+	MLX5E_100GBASE_CR4	 = 20,
+	MLX5E_100GBASE_SR4	 = 21,
+	MLX5E_100GBASE_KR4	 = 22,
+	MLX5E_100GBASE_LR4	 = 23,
+	MLX5E_100BASE_TX	 = 24,
+	MLX5E_100BASE_T		 = 25,
+	MLX5E_10GBASE_T		 = 26,
+	MLX5E_25GBASE_CR	 = 27,
+	MLX5E_25GBASE_KR	 = 28,
+	MLX5E_25GBASE_SR	 = 29,
+	MLX5E_50GBASE_CR2	 = 30,
+	MLX5E_50GBASE_KR2	 = 31,
+	MLX5E_LINK_MODES_NUMBER,
+};
+
+#define MLX5E_PROT_MASK(link_mode) (1 << link_mode)
+
+u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
+		       void *accel_priv, select_queue_fallback_t fallback);
+netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t mlx5e_xmit_multi_tc(struct sk_buff *skb, struct net_device *dev);
+
+void mlx5e_completion_event(struct mlx5_core_cq *mcq);
+void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
+int mlx5e_napi_poll(struct napi_struct *napi, int budget);
+bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq);
+bool mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget);
+bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq);
+struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq);
+
+void mlx5e_update_stats(struct mlx5e_priv *priv);
+
+int mlx5e_open_flow_table(struct mlx5e_priv *priv);
+void mlx5e_close_flow_table(struct mlx5e_priv *priv);
+void mlx5e_init_eth_addr(struct mlx5e_priv *priv);
+void mlx5e_set_rx_mode_core(struct mlx5e_priv *priv);
+void mlx5e_set_rx_mode_work(struct work_struct *work);
+
+int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto,
+			  u16 vid);
+int mlx5e_vlan_rx_kill_vid(struct net_device *dev, __always_unused __be16 proto,
+			   u16 vid);
+void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv);
+void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv);
+int mlx5e_add_all_vlan_rules(struct mlx5e_priv *priv);
+void mlx5e_del_all_vlan_rules(struct mlx5e_priv *priv);
+
+int mlx5e_open_locked(struct net_device *netdev);
+int mlx5e_close_locked(struct net_device *netdev);
+int mlx5e_update_priv_params(struct mlx5e_priv *priv,
+			     struct mlx5e_params *new_params);
+
+static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
+				      struct mlx5e_tx_wqe *wqe)
+{
+	/* ensure wqe is visible to device before updating doorbell record */
+	dma_wmb();
+
+	*sq->wq.db = cpu_to_be32(sq->pc);
+
+	/* ensure doorbell record is visible to device before ringing the
+	 * doorbell
+	 */
+	wmb();
+
+	mlx5_write64((__be32 *)&wqe->ctrl,
+		     sq->uar_map + MLX5_BF_OFFSET + sq->bf_offset,
+		     NULL);
+
+	sq->bf_offset ^= sq->bf_buf_size;
+}
+
+static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
+{
+	struct mlx5_core_cq *mcq;
+
+	mcq = &cq->mcq;
+	mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, NULL, cq->wq.cc);
+}
+
+extern const struct ethtool_ops mlx5e_ethtool_ops;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
new file mode 100644
index 000000000000..de7aec8abca1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "en.h"
+
+static void mlx5e_get_drvinfo(struct net_device *dev,
+			      struct ethtool_drvinfo *drvinfo)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	strlcpy(drvinfo->driver, DRIVER_NAME, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, DRIVER_VERSION " (" DRIVER_RELDATE ")",
+		sizeof(drvinfo->version));
+	snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+		 "%d.%d.%d",
+		 fw_rev_maj(mdev), fw_rev_min(mdev), fw_rev_sub(mdev));
+	strlcpy(drvinfo->bus_info, pci_name(mdev->pdev),
+		sizeof(drvinfo->bus_info));
+}
+
+static const struct {
+	u32 supported;
+	u32 advertised;
+	u32 speed;
+} ptys2ethtool_table[MLX5E_LINK_MODES_NUMBER] = {
+	[MLX5E_1000BASE_CX_SGMII] = {
+		.supported  = SUPPORTED_1000baseKX_Full,
+		.advertised = ADVERTISED_1000baseKX_Full,
+		.speed      = 1000,
+	},
+	[MLX5E_1000BASE_KX] = {
+		.supported  = SUPPORTED_1000baseKX_Full,
+		.advertised = ADVERTISED_1000baseKX_Full,
+		.speed      = 1000,
+	},
+	[MLX5E_10GBASE_CX4] = {
+		.supported  = SUPPORTED_10000baseKX4_Full,
+		.advertised = ADVERTISED_10000baseKX4_Full,
+		.speed      = 10000,
+	},
+	[MLX5E_10GBASE_KX4] = {
+		.supported  = SUPPORTED_10000baseKX4_Full,
+		.advertised = ADVERTISED_10000baseKX4_Full,
+		.speed      = 10000,
+	},
+	[MLX5E_10GBASE_KR] = {
+		.supported  = SUPPORTED_10000baseKR_Full,
+		.advertised = ADVERTISED_10000baseKR_Full,
+		.speed      = 10000,
+	},
+	[MLX5E_20GBASE_KR2] = {
+		.supported  = SUPPORTED_20000baseKR2_Full,
+		.advertised = ADVERTISED_20000baseKR2_Full,
+		.speed      = 20000,
+	},
+	[MLX5E_40GBASE_CR4] = {
+		.supported  = SUPPORTED_40000baseCR4_Full,
+		.advertised = ADVERTISED_40000baseCR4_Full,
+		.speed      = 40000,
+	},
+	[MLX5E_40GBASE_KR4] = {
+		.supported  = SUPPORTED_40000baseKR4_Full,
+		.advertised = ADVERTISED_40000baseKR4_Full,
+		.speed      = 40000,
+	},
+	[MLX5E_56GBASE_R4] = {
+		.supported  = SUPPORTED_56000baseKR4_Full,
+		.advertised = ADVERTISED_56000baseKR4_Full,
+		.speed      = 56000,
+	},
+	[MLX5E_10GBASE_CR] = {
+		.supported  = SUPPORTED_10000baseKR_Full,
+		.advertised = ADVERTISED_10000baseKR_Full,
+		.speed      = 10000,
+	},
+	[MLX5E_10GBASE_SR] = {
+		.supported  = SUPPORTED_10000baseKR_Full,
+		.advertised = ADVERTISED_10000baseKR_Full,
+		.speed      = 10000,
+	},
+	[MLX5E_10GBASE_ER] = {
+		.supported  = SUPPORTED_10000baseKR_Full,
+		.advertised = ADVERTISED_10000baseKR_Full,
+		.speed      = 10000,
+	},
+	[MLX5E_40GBASE_SR4] = {
+		.supported  = SUPPORTED_40000baseSR4_Full,
+		.advertised = ADVERTISED_40000baseSR4_Full,
+		.speed      = 40000,
+	},
+	[MLX5E_40GBASE_LR4] = {
+		.supported  = SUPPORTED_40000baseLR4_Full,
+		.advertised = ADVERTISED_40000baseLR4_Full,
+		.speed      = 40000,
+	},
+	[MLX5E_100GBASE_CR4] = {
+		.speed      = 100000,
+	},
+	[MLX5E_100GBASE_SR4] = {
+		.speed      = 100000,
+	},
+	[MLX5E_100GBASE_KR4] = {
+		.speed      = 100000,
+	},
+	[MLX5E_100GBASE_LR4] = {
+		.speed      = 100000,
+	},
+	[MLX5E_100BASE_TX]   = {
+		.speed      = 100,
+	},
+	[MLX5E_100BASE_T]    = {
+		.supported  = SUPPORTED_100baseT_Full,
+		.advertised = ADVERTISED_100baseT_Full,
+		.speed      = 100,
+	},
+	[MLX5E_10GBASE_T]    = {
+		.supported  = SUPPORTED_10000baseT_Full,
+		.advertised = ADVERTISED_10000baseT_Full,
+		.speed      = 1000,
+	},
+	[MLX5E_25GBASE_CR]   = {
+		.speed      = 25000,
+	},
+	[MLX5E_25GBASE_KR]   = {
+		.speed      = 25000,
+	},
+	[MLX5E_25GBASE_SR]   = {
+		.speed      = 25000,
+	},
+	[MLX5E_50GBASE_CR2]  = {
+		.speed      = 50000,
+	},
+	[MLX5E_50GBASE_KR2]  = {
+		.speed      = 50000,
+	},
+};
+
+static int mlx5e_get_sset_count(struct net_device *dev, int sset)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	switch (sset) {
+	case ETH_SS_STATS:
+		return NUM_VPORT_COUNTERS +
+		       priv->params.num_channels * NUM_RQ_STATS +
+		       priv->params.num_channels * priv->num_tc *
+						   NUM_SQ_STATS;
+	/* fallthrough */
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void mlx5e_get_strings(struct net_device *dev,
+			      uint32_t stringset, uint8_t *data)
+{
+	int i, j, tc, idx = 0;
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	switch (stringset) {
+	case ETH_SS_PRIV_FLAGS:
+		break;
+
+	case ETH_SS_TEST:
+		break;
+
+	case ETH_SS_STATS:
+		/* VPORT counters */
+		for (i = 0; i < NUM_VPORT_COUNTERS; i++)
+			strcpy(data + (idx++) * ETH_GSTRING_LEN,
+			       vport_strings[i]);
+
+		/* per channel counters */
+		for (i = 0; i < priv->params.num_channels; i++)
+			for (j = 0; j < NUM_RQ_STATS; j++)
+				sprintf(data + (idx++) * ETH_GSTRING_LEN,
+					"rx%d_%s", i, rq_stats_strings[j]);
+
+		for (i = 0; i < priv->params.num_channels; i++)
+			for (tc = 0; tc < priv->num_tc; tc++)
+				for (j = 0; j < NUM_SQ_STATS; j++)
+					sprintf(data +
+						(idx++) * ETH_GSTRING_LEN,
+						"tx%d_%d_%s", i, tc,
+						sq_stats_strings[j]);
+		break;
+	}
+}
+
+static void mlx5e_get_ethtool_stats(struct net_device *dev,
+				    struct ethtool_stats *stats, u64 *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int i, j, tc, idx = 0;
+
+	if (!data)
+		return;
+
+	mutex_lock(&priv->state_lock);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		mlx5e_update_stats(priv);
+	mutex_unlock(&priv->state_lock);
+
+	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
+		data[idx++] = ((u64 *)&priv->stats.vport)[i];
+
+	/* per channel counters */
+	for (i = 0; i < priv->params.num_channels; i++)
+		for (j = 0; j < NUM_RQ_STATS; j++)
+			data[idx++] = !test_bit(MLX5E_STATE_OPENED,
+						&priv->state) ? 0 :
+				       ((u64 *)&priv->channel[i]->rq.stats)[j];
+
+	for (i = 0; i < priv->params.num_channels; i++)
+		for (tc = 0; tc < priv->num_tc; tc++)
+			for (j = 0; j < NUM_SQ_STATS; j++)
+				data[idx++] = !test_bit(MLX5E_STATE_OPENED,
+							&priv->state) ? 0 :
+				((u64 *)&priv->channel[i]->sq[tc].stats)[j];
+}
+
+static void mlx5e_get_ringparam(struct net_device *dev,
+				struct ethtool_ringparam *param)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	param->rx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
+	param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE;
+	param->rx_pending     = 1 << priv->params.log_rq_size;
+	param->tx_pending     = 1 << priv->params.log_sq_size;
+}
+
+static int mlx5e_set_ringparam(struct net_device *dev,
+			       struct ethtool_ringparam *param)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_params new_params;
+	u16 min_rx_wqes;
+	u8 log_rq_size;
+	u8 log_sq_size;
+	int err = 0;
+
+	if (param->rx_jumbo_pending) {
+		netdev_info(dev, "%s: rx_jumbo_pending not supported\n",
+			    __func__);
+		return -EINVAL;
+	}
+	if (param->rx_mini_pending) {
+		netdev_info(dev, "%s: rx_mini_pending not supported\n",
+			    __func__);
+		return -EINVAL;
+	}
+	if (param->rx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) {
+		netdev_info(dev, "%s: rx_pending (%d) < min (%d)\n",
+			    __func__, param->rx_pending,
+			    1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE);
+		return -EINVAL;
+	}
+	if (param->rx_pending > (1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE)) {
+		netdev_info(dev, "%s: rx_pending (%d) > max (%d)\n",
+			    __func__, param->rx_pending,
+			    1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE);
+		return -EINVAL;
+	}
+	if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) {
+		netdev_info(dev, "%s: tx_pending (%d) < min (%d)\n",
+			    __func__, param->tx_pending,
+			    1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
+		return -EINVAL;
+	}
+	if (param->tx_pending > (1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE)) {
+		netdev_info(dev, "%s: tx_pending (%d) > max (%d)\n",
+			    __func__, param->tx_pending,
+			    1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE);
+		return -EINVAL;
+	}
+
+	log_rq_size = order_base_2(param->rx_pending);
+	log_sq_size = order_base_2(param->tx_pending);
+	min_rx_wqes = min_t(u16, param->rx_pending - 1,
+			    MLX5E_PARAMS_DEFAULT_MIN_RX_WQES);
+
+	if (log_rq_size == priv->params.log_rq_size &&
+	    log_sq_size == priv->params.log_sq_size &&
+	    min_rx_wqes == priv->params.min_rx_wqes)
+		return 0;
+
+	mutex_lock(&priv->state_lock);
+	new_params = priv->params;
+	new_params.log_rq_size = log_rq_size;
+	new_params.log_sq_size = log_sq_size;
+	new_params.min_rx_wqes = min_rx_wqes;
+	err = mlx5e_update_priv_params(priv, &new_params);
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static void mlx5e_get_channels(struct net_device *dev,
+			       struct ethtool_channels *ch)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int ncv = priv->mdev->priv.eq_table.num_comp_vectors;
+
+	ch->max_combined   = ncv;
+	ch->combined_count = priv->params.num_channels;
+}
+
+static int mlx5e_set_channels(struct net_device *dev,
+			      struct ethtool_channels *ch)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int ncv = priv->mdev->priv.eq_table.num_comp_vectors;
+	unsigned int count = ch->combined_count;
+	struct mlx5e_params new_params;
+	int err = 0;
+
+	if (!count) {
+		netdev_info(dev, "%s: combined_count=0 not supported\n",
+			    __func__);
+		return -EINVAL;
+	}
+	if (ch->rx_count || ch->tx_count) {
+		netdev_info(dev, "%s: separate rx/tx count not supported\n",
+			    __func__);
+		return -EINVAL;
+	}
+	if (count > ncv) {
+		netdev_info(dev, "%s: count (%d) > max (%d)\n",
+			    __func__, count, ncv);
+		return -EINVAL;
+	}
+
+	if (priv->params.num_channels == count)
+		return 0;
+
+	mutex_lock(&priv->state_lock);
+	new_params = priv->params;
+	new_params.num_channels = count;
+	err = mlx5e_update_priv_params(priv, &new_params);
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static int mlx5e_get_coalesce(struct net_device *netdev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	coal->rx_coalesce_usecs       = priv->params.rx_cq_moderation_usec;
+	coal->rx_max_coalesced_frames = priv->params.rx_cq_moderation_pkts;
+	coal->tx_coalesce_usecs       = priv->params.tx_cq_moderation_usec;
+	coal->tx_max_coalesced_frames = priv->params.tx_cq_moderation_pkts;
+
+	return 0;
+}
+
+static int mlx5e_set_coalesce(struct net_device *netdev,
+			      struct ethtool_coalesce *coal)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_channel *c;
+	int tc;
+	int i;
+
+	priv->params.tx_cq_moderation_usec = coal->tx_coalesce_usecs;
+	priv->params.tx_cq_moderation_pkts = coal->tx_max_coalesced_frames;
+	priv->params.rx_cq_moderation_usec = coal->rx_coalesce_usecs;
+	priv->params.rx_cq_moderation_pkts = coal->rx_max_coalesced_frames;
+
+	for (i = 0; i < priv->params.num_channels; ++i) {
+		c = priv->channel[i];
+
+		for (tc = 0; tc < c->num_tc; tc++) {
+			mlx5_core_modify_cq_moderation(mdev,
+						&c->sq[tc].cq.mcq,
+						coal->tx_coalesce_usecs,
+						coal->tx_max_coalesced_frames);
+		}
+
+		mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq,
+					       coal->rx_coalesce_usecs,
+					       coal->rx_max_coalesced_frames);
+	}
+
+	return 0;
+}
+
+static u32 ptys2ethtool_supported_link(u32 eth_proto_cap)
+{
+	int i;
+	u32 supported_modes = 0;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (eth_proto_cap & MLX5E_PROT_MASK(i))
+			supported_modes |= ptys2ethtool_table[i].supported;
+	}
+	return supported_modes;
+}
+
+static u32 ptys2ethtool_adver_link(u32 eth_proto_cap)
+{
+	int i;
+	u32 advertising_modes = 0;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (eth_proto_cap & MLX5E_PROT_MASK(i))
+			advertising_modes |= ptys2ethtool_table[i].advertised;
+	}
+	return advertising_modes;
+}
+
+static u32 ptys2ethtool_supported_port(u32 eth_proto_cap)
+{
+	if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_10GBASE_CR)
+			   | MLX5E_PROT_MASK(MLX5E_10GBASE_SR)
+			   | MLX5E_PROT_MASK(MLX5E_40GBASE_CR4)
+			   | MLX5E_PROT_MASK(MLX5E_40GBASE_SR4)
+			   | MLX5E_PROT_MASK(MLX5E_100GBASE_SR4)
+			   | MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) {
+		return SUPPORTED_FIBRE;
+	}
+
+	if (eth_proto_cap & (MLX5E_PROT_MASK(MLX5E_100GBASE_KR4)
+			   | MLX5E_PROT_MASK(MLX5E_40GBASE_KR4)
+			   | MLX5E_PROT_MASK(MLX5E_10GBASE_KR)
+			   | MLX5E_PROT_MASK(MLX5E_10GBASE_KX4)
+			   | MLX5E_PROT_MASK(MLX5E_1000BASE_KX))) {
+		return SUPPORTED_Backplane;
+	}
+	return 0;
+}
+
+static void get_speed_duplex(struct net_device *netdev,
+			     u32 eth_proto_oper,
+			     struct ethtool_cmd *cmd)
+{
+	int i;
+	u32 speed = SPEED_UNKNOWN;
+	u8 duplex = DUPLEX_UNKNOWN;
+
+	if (!netif_carrier_ok(netdev))
+		goto out;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (eth_proto_oper & MLX5E_PROT_MASK(i)) {
+			speed = ptys2ethtool_table[i].speed;
+			duplex = DUPLEX_FULL;
+			break;
+		}
+	}
+out:
+	ethtool_cmd_speed_set(cmd, speed);
+	cmd->duplex = duplex;
+}
+
+static void get_supported(u32 eth_proto_cap, u32 *supported)
+{
+	*supported |= ptys2ethtool_supported_port(eth_proto_cap);
+	*supported |= ptys2ethtool_supported_link(eth_proto_cap);
+	*supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+}
+
+static void get_advertising(u32 eth_proto_cap, u8 tx_pause,
+			    u8 rx_pause, u32 *advertising)
+{
+	*advertising |= ptys2ethtool_adver_link(eth_proto_cap);
+	*advertising |= tx_pause ? ADVERTISED_Pause : 0;
+	*advertising |= (tx_pause ^ rx_pause) ? ADVERTISED_Asym_Pause : 0;
+}
+
+static u8 get_connector_port(u32 eth_proto)
+{
+	if (eth_proto & (MLX5E_PROT_MASK(MLX5E_10GBASE_SR)
+			 | MLX5E_PROT_MASK(MLX5E_40GBASE_SR4)
+			 | MLX5E_PROT_MASK(MLX5E_100GBASE_SR4)
+			 | MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII))) {
+			return PORT_FIBRE;
+	}
+
+	if (eth_proto & (MLX5E_PROT_MASK(MLX5E_40GBASE_CR4)
+			 | MLX5E_PROT_MASK(MLX5E_10GBASE_CR)
+			 | MLX5E_PROT_MASK(MLX5E_100GBASE_CR4))) {
+			return PORT_DA;
+	}
+
+	if (eth_proto & (MLX5E_PROT_MASK(MLX5E_10GBASE_KX4)
+			 | MLX5E_PROT_MASK(MLX5E_10GBASE_KR)
+			 | MLX5E_PROT_MASK(MLX5E_40GBASE_KR4)
+			 | MLX5E_PROT_MASK(MLX5E_100GBASE_KR4))) {
+			return PORT_NONE;
+	}
+
+	return PORT_OTHER;
+}
+
+static void get_lp_advertising(u32 eth_proto_lp, u32 *lp_advertising)
+{
+	*lp_advertising = ptys2ethtool_adver_link(eth_proto_lp);
+}
+
+static int mlx5e_get_settings(struct net_device *netdev,
+			      struct ethtool_cmd *cmd)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	u32 eth_proto_cap;
+	u32 eth_proto_admin;
+	u32 eth_proto_lp;
+	u32 eth_proto_oper;
+	int err;
+
+	err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN);
+
+	if (err) {
+		netdev_err(netdev, "%s: query port ptys failed: %d\n",
+			   __func__, err);
+		goto err_query_ptys;
+	}
+
+	eth_proto_cap   = MLX5_GET(ptys_reg, out, eth_proto_capability);
+	eth_proto_admin = MLX5_GET(ptys_reg, out, eth_proto_admin);
+	eth_proto_oper  = MLX5_GET(ptys_reg, out, eth_proto_oper);
+	eth_proto_lp    = MLX5_GET(ptys_reg, out, eth_proto_lp_advertise);
+
+	cmd->supported   = 0;
+	cmd->advertising = 0;
+
+	get_supported(eth_proto_cap, &cmd->supported);
+	get_advertising(eth_proto_admin, 0, 0, &cmd->advertising);
+	get_speed_duplex(netdev, eth_proto_oper, cmd);
+
+	eth_proto_oper = eth_proto_oper ? eth_proto_oper : eth_proto_cap;
+
+	cmd->port = get_connector_port(eth_proto_oper);
+	get_lp_advertising(eth_proto_lp, &cmd->lp_advertising);
+
+	cmd->transceiver = XCVR_INTERNAL;
+
+err_query_ptys:
+	return err;
+}
+
+static u32 mlx5e_ethtool2ptys_adver_link(u32 link_modes)
+{
+	u32 i, ptys_modes = 0;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (ptys2ethtool_table[i].advertised & link_modes)
+			ptys_modes |= MLX5E_PROT_MASK(i);
+	}
+
+	return ptys_modes;
+}
+
+static u32 mlx5e_ethtool2ptys_speed_link(u32 speed)
+{
+	u32 i, speed_links = 0;
+
+	for (i = 0; i < MLX5E_LINK_MODES_NUMBER; ++i) {
+		if (ptys2ethtool_table[i].speed == speed)
+			speed_links |= MLX5E_PROT_MASK(i);
+	}
+
+	return speed_links;
+}
+
+static int mlx5e_set_settings(struct net_device *netdev,
+			      struct ethtool_cmd *cmd)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 link_modes;
+	u32 speed;
+	u32 eth_proto_cap, eth_proto_admin;
+	u8 port_status;
+	int err;
+
+	speed = ethtool_cmd_speed(cmd);
+
+	link_modes = cmd->autoneg == AUTONEG_ENABLE ?
+		mlx5e_ethtool2ptys_adver_link(cmd->advertising) :
+		mlx5e_ethtool2ptys_speed_link(speed);
+
+	err = mlx5_query_port_proto_cap(mdev, &eth_proto_cap, MLX5_PTYS_EN);
+	if (err) {
+		netdev_err(netdev, "%s: query port eth proto cap failed: %d\n",
+			   __func__, err);
+		goto out;
+	}
+
+	link_modes = link_modes & eth_proto_cap;
+	if (!link_modes) {
+		netdev_err(netdev, "%s: Not supported link mode(s) requested",
+			   __func__);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = mlx5_query_port_proto_admin(mdev, &eth_proto_admin, MLX5_PTYS_EN);
+	if (err) {
+		netdev_err(netdev, "%s: query port eth proto admin failed: %d\n",
+			   __func__, err);
+		goto out;
+	}
+
+	if (link_modes == eth_proto_admin)
+		goto out;
+
+	err = mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN);
+	if (err) {
+		netdev_err(netdev, "%s: set port eth proto admin failed: %d\n",
+			   __func__, err);
+		goto out;
+	}
+
+	err = mlx5_query_port_status(mdev, &port_status);
+	if (err)
+		goto out;
+
+	if (port_status == MLX5_PORT_DOWN)
+		return 0;
+
+	err = mlx5_set_port_status(mdev, MLX5_PORT_DOWN);
+	if (err)
+		goto out;
+	err = mlx5_set_port_status(mdev, MLX5_PORT_UP);
+out:
+	return err;
+}
+
+const struct ethtool_ops mlx5e_ethtool_ops = {
+	.get_drvinfo       = mlx5e_get_drvinfo,
+	.get_link          = ethtool_op_get_link,
+	.get_strings       = mlx5e_get_strings,
+	.get_sset_count    = mlx5e_get_sset_count,
+	.get_ethtool_stats = mlx5e_get_ethtool_stats,
+	.get_ringparam     = mlx5e_get_ringparam,
+	.set_ringparam     = mlx5e_set_ringparam,
+	.get_channels      = mlx5e_get_channels,
+	.set_channels      = mlx5e_set_channels,
+	.get_coalesce      = mlx5e_get_coalesce,
+	.set_coalesce      = mlx5e_set_coalesce,
+	.get_settings      = mlx5e_get_settings,
+	.set_settings      = mlx5e_set_settings,
+};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
new file mode 100644
index 000000000000..eee829d119f9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -0,0 +1,1899 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/flow_table.h>
+#include "en.h"
+
+struct mlx5e_rq_param {
+	u32                        rqc[MLX5_ST_SZ_DW(rqc)];
+	struct mlx5_wq_param       wq;
+};
+
+struct mlx5e_sq_param {
+	u32                        sqc[MLX5_ST_SZ_DW(sqc)];
+	struct mlx5_wq_param       wq;
+};
+
+struct mlx5e_cq_param {
+	u32                        cqc[MLX5_ST_SZ_DW(cqc)];
+	struct mlx5_wq_param       wq;
+	u16                        eq_ix;
+};
+
+struct mlx5e_channel_param {
+	struct mlx5e_rq_param      rq;
+	struct mlx5e_sq_param      sq;
+	struct mlx5e_cq_param      rx_cq;
+	struct mlx5e_cq_param      tx_cq;
+};
+
+static void mlx5e_update_carrier(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 port_state;
+
+	port_state = mlx5_query_vport_state(mdev,
+		MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT);
+
+	if (port_state == VPORT_STATE_UP)
+		netif_carrier_on(priv->netdev);
+	else
+		netif_carrier_off(priv->netdev);
+}
+
+static void mlx5e_update_carrier_work(struct work_struct *work)
+{
+	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
+					       update_carrier_work);
+
+	mutex_lock(&priv->state_lock);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		mlx5e_update_carrier(priv);
+	mutex_unlock(&priv->state_lock);
+}
+
+void mlx5e_update_stats(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_vport_stats *s = &priv->stats.vport;
+	struct mlx5e_rq_stats *rq_stats;
+	struct mlx5e_sq_stats *sq_stats;
+	u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)];
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out);
+	u64 tx_offload_none;
+	int i, j;
+
+	out = mlx5_vzalloc(outlen);
+	if (!out)
+		return;
+
+	/* Collect firts the SW counters and then HW for consistency */
+	s->tso_packets		= 0;
+	s->tso_bytes		= 0;
+	s->tx_queue_stopped	= 0;
+	s->tx_queue_wake	= 0;
+	s->tx_queue_dropped	= 0;
+	tx_offload_none		= 0;
+	s->lro_packets		= 0;
+	s->lro_bytes		= 0;
+	s->rx_csum_none		= 0;
+	s->rx_wqe_err		= 0;
+	for (i = 0; i < priv->params.num_channels; i++) {
+		rq_stats = &priv->channel[i]->rq.stats;
+
+		s->lro_packets	+= rq_stats->lro_packets;
+		s->lro_bytes	+= rq_stats->lro_bytes;
+		s->rx_csum_none	+= rq_stats->csum_none;
+		s->rx_wqe_err   += rq_stats->wqe_err;
+
+		for (j = 0; j < priv->num_tc; j++) {
+			sq_stats = &priv->channel[i]->sq[j].stats;
+
+			s->tso_packets		+= sq_stats->tso_packets;
+			s->tso_bytes		+= sq_stats->tso_bytes;
+			s->tx_queue_stopped	+= sq_stats->stopped;
+			s->tx_queue_wake	+= sq_stats->wake;
+			s->tx_queue_dropped	+= sq_stats->dropped;
+			tx_offload_none		+= sq_stats->csum_offload_none;
+		}
+	}
+
+	/* HW counters */
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(query_vport_counter_in, in, opcode,
+		 MLX5_CMD_OP_QUERY_VPORT_COUNTER);
+	MLX5_SET(query_vport_counter_in, in, op_mod, 0);
+	MLX5_SET(query_vport_counter_in, in, other_vport, 0);
+
+	memset(out, 0, outlen);
+
+	if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen))
+		goto free_out;
+
+#define MLX5_GET_CTR(p, x) \
+	MLX5_GET64(query_vport_counter_out, p, x)
+
+	s->rx_error_packets     =
+		MLX5_GET_CTR(out, received_errors.packets);
+	s->rx_error_bytes       =
+		MLX5_GET_CTR(out, received_errors.octets);
+	s->tx_error_packets     =
+		MLX5_GET_CTR(out, transmit_errors.packets);
+	s->tx_error_bytes       =
+		MLX5_GET_CTR(out, transmit_errors.octets);
+
+	s->rx_unicast_packets   =
+		MLX5_GET_CTR(out, received_eth_unicast.packets);
+	s->rx_unicast_bytes     =
+		MLX5_GET_CTR(out, received_eth_unicast.octets);
+	s->tx_unicast_packets   =
+		MLX5_GET_CTR(out, transmitted_eth_unicast.packets);
+	s->tx_unicast_bytes     =
+		MLX5_GET_CTR(out, transmitted_eth_unicast.octets);
+
+	s->rx_multicast_packets =
+		MLX5_GET_CTR(out, received_eth_multicast.packets);
+	s->rx_multicast_bytes   =
+		MLX5_GET_CTR(out, received_eth_multicast.octets);
+	s->tx_multicast_packets =
+		MLX5_GET_CTR(out, transmitted_eth_multicast.packets);
+	s->tx_multicast_bytes   =
+		MLX5_GET_CTR(out, transmitted_eth_multicast.octets);
+
+	s->rx_broadcast_packets =
+		MLX5_GET_CTR(out, received_eth_broadcast.packets);
+	s->rx_broadcast_bytes   =
+		MLX5_GET_CTR(out, received_eth_broadcast.octets);
+	s->tx_broadcast_packets =
+		MLX5_GET_CTR(out, transmitted_eth_broadcast.packets);
+	s->tx_broadcast_bytes   =
+		MLX5_GET_CTR(out, transmitted_eth_broadcast.octets);
+
+	s->rx_packets =
+		s->rx_unicast_packets +
+		s->rx_multicast_packets +
+		s->rx_broadcast_packets;
+	s->rx_bytes =
+		s->rx_unicast_bytes +
+		s->rx_multicast_bytes +
+		s->rx_broadcast_bytes;
+	s->tx_packets =
+		s->tx_unicast_packets +
+		s->tx_multicast_packets +
+		s->tx_broadcast_packets;
+	s->tx_bytes =
+		s->tx_unicast_bytes +
+		s->tx_multicast_bytes +
+		s->tx_broadcast_bytes;
+
+	/* Update calculated offload counters */
+	s->tx_csum_offload = s->tx_packets - tx_offload_none;
+	s->rx_csum_good    = s->rx_packets - s->rx_csum_none;
+
+free_out:
+	kvfree(out);
+}
+
+static void mlx5e_update_stats_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct mlx5e_priv *priv = container_of(dwork, struct mlx5e_priv,
+					       update_stats_work);
+	mutex_lock(&priv->state_lock);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		mlx5e_update_stats(priv);
+		schedule_delayed_work(dwork,
+				      msecs_to_jiffies(
+					      MLX5E_UPDATE_STATS_INTERVAL));
+	}
+	mutex_unlock(&priv->state_lock);
+}
+
+static void __mlx5e_async_event(struct mlx5e_priv *priv,
+				enum mlx5_dev_event event)
+{
+	switch (event) {
+	case MLX5_DEV_EVENT_PORT_UP:
+	case MLX5_DEV_EVENT_PORT_DOWN:
+		schedule_work(&priv->update_carrier_work);
+		break;
+
+	default:
+		break;
+	}
+}
+
+static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv,
+			      enum mlx5_dev_event event, unsigned long param)
+{
+	struct mlx5e_priv *priv = vpriv;
+
+	spin_lock(&priv->async_events_spinlock);
+	if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state))
+		__mlx5e_async_event(priv, event);
+	spin_unlock(&priv->async_events_spinlock);
+}
+
+static void mlx5e_enable_async_events(struct mlx5e_priv *priv)
+{
+	set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
+}
+
+static void mlx5e_disable_async_events(struct mlx5e_priv *priv)
+{
+	spin_lock_irq(&priv->async_events_spinlock);
+	clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state);
+	spin_unlock_irq(&priv->async_events_spinlock);
+}
+
+static void mlx5e_send_nop(struct mlx5e_sq *sq)
+{
+	struct mlx5_wq_cyc                *wq  = &sq->wq;
+
+	u16 pi = sq->pc & wq->sz_m1;
+	struct mlx5e_tx_wqe              *wqe  = mlx5_wq_cyc_get_wqe(wq, pi);
+
+	struct mlx5_wqe_ctrl_seg         *cseg = &wqe->ctrl;
+
+	memset(cseg, 0, sizeof(*cseg));
+
+	cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
+	cseg->qpn_ds           = cpu_to_be32((sq->sqn << 8) | 0x01);
+	cseg->fm_ce_se         = MLX5_WQE_CTRL_CQ_UPDATE;
+
+	sq->skb[pi] = NULL;
+	sq->pc++;
+	mlx5e_tx_notify_hw(sq, wqe);
+}
+
+static int mlx5e_create_rq(struct mlx5e_channel *c,
+			   struct mlx5e_rq_param *param,
+			   struct mlx5e_rq *rq)
+{
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *rqc = param->rqc;
+	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	int wq_sz;
+	int err;
+	int i;
+
+	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
+				&rq->wq_ctrl);
+	if (err)
+		return err;
+
+	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
+
+	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+	rq->skb = kzalloc_node(wq_sz * sizeof(*rq->skb), GFP_KERNEL,
+			       cpu_to_node(c->cpu));
+	if (!rq->skb) {
+		err = -ENOMEM;
+		goto err_rq_wq_destroy;
+	}
+
+	rq->wqe_sz = (priv->params.lro_en) ? priv->params.lro_wqe_sz :
+				priv->netdev->mtu + ETH_HLEN + VLAN_HLEN;
+
+	for (i = 0; i < wq_sz; i++) {
+		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
+
+		wqe->data.lkey       = c->mkey_be;
+		wqe->data.byte_count = cpu_to_be32(rq->wqe_sz);
+	}
+
+	rq->pdev    = c->pdev;
+	rq->netdev  = c->netdev;
+	rq->channel = c;
+	rq->ix      = c->ix;
+
+	return 0;
+
+err_rq_wq_destroy:
+	mlx5_wq_destroy(&rq->wq_ctrl);
+
+	return err;
+}
+
+static void mlx5e_destroy_rq(struct mlx5e_rq *rq)
+{
+	kfree(rq->skb);
+	mlx5_wq_destroy(&rq->wq_ctrl);
+}
+
+static int mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
+{
+	struct mlx5e_channel *c = rq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *rqc;
+	void *wq;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_rq_in) +
+		sizeof(u64) * rq->wq_ctrl.buf.npages;
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	rqc = MLX5_ADDR_OF(create_rq_in, in, ctx);
+	wq  = MLX5_ADDR_OF(rqc, rqc, wq);
+
+	memcpy(rqc, param->rqc, sizeof(param->rqc));
+
+	MLX5_SET(rqc,  rqc, cqn,		c->rq.cq.mcq.cqn);
+	MLX5_SET(rqc,  rqc, state,		MLX5_RQC_STATE_RST);
+	MLX5_SET(rqc,  rqc, flush_in_error_en,	1);
+	MLX5_SET(wq,   wq,  wq_type,		MLX5_WQ_TYPE_LINKED_LIST);
+	MLX5_SET(wq,   wq,  log_wq_pg_sz,	rq->wq_ctrl.buf.page_shift -
+						PAGE_SHIFT);
+	MLX5_SET64(wq, wq,  dbr_addr,		rq->wq_ctrl.db.dma);
+
+	mlx5_fill_page_array(&rq->wq_ctrl.buf,
+			     (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
+
+	err = mlx5_create_rq(mdev, in, inlen, &rq->rqn);
+
+	kvfree(in);
+
+	return err;
+}
+
+static int mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state)
+{
+	struct mlx5e_channel *c = rq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *rqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rq_in);
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx);
+
+	MLX5_SET(modify_rq_in, in, rq_state, curr_state);
+	MLX5_SET(rqc, rqc, state, next_state);
+
+	err = mlx5_modify_rq(mdev, rq->rqn, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5e_disable_rq(struct mlx5e_rq *rq)
+{
+	struct mlx5e_channel *c = rq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	mlx5_destroy_rq(mdev, rq->rqn);
+}
+
+static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq)
+{
+	struct mlx5e_channel *c = rq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_wq_ll *wq = &rq->wq;
+	int i;
+
+	for (i = 0; i < 1000; i++) {
+		if (wq->cur_sz >= priv->params.min_rx_wqes)
+			return 0;
+
+		msleep(20);
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int mlx5e_open_rq(struct mlx5e_channel *c,
+			 struct mlx5e_rq_param *param,
+			 struct mlx5e_rq *rq)
+{
+	int err;
+
+	err = mlx5e_create_rq(c, param, rq);
+	if (err)
+		return err;
+
+	err = mlx5e_enable_rq(rq, param);
+	if (err)
+		goto err_destroy_rq;
+
+	err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	if (err)
+		goto err_disable_rq;
+
+	set_bit(MLX5E_RQ_STATE_POST_WQES_ENABLE, &rq->state);
+	mlx5e_send_nop(&c->sq[0]); /* trigger mlx5e_post_rx_wqes() */
+
+	return 0;
+
+err_disable_rq:
+	mlx5e_disable_rq(rq);
+err_destroy_rq:
+	mlx5e_destroy_rq(rq);
+
+	return err;
+}
+
+static void mlx5e_close_rq(struct mlx5e_rq *rq)
+{
+	clear_bit(MLX5E_RQ_STATE_POST_WQES_ENABLE, &rq->state);
+	napi_synchronize(&rq->channel->napi); /* prevent mlx5e_post_rx_wqes */
+
+	mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR);
+	while (!mlx5_wq_ll_is_empty(&rq->wq))
+		msleep(20);
+
+	/* avoid destroying rq before mlx5e_poll_rx_cq() is done with it */
+	napi_synchronize(&rq->channel->napi);
+
+	mlx5e_disable_rq(rq);
+	mlx5e_destroy_rq(rq);
+}
+
+static void mlx5e_free_sq_db(struct mlx5e_sq *sq)
+{
+	kfree(sq->dma_fifo);
+	kfree(sq->skb);
+}
+
+static int mlx5e_alloc_sq_db(struct mlx5e_sq *sq, int numa)
+{
+	int wq_sz = mlx5_wq_cyc_get_size(&sq->wq);
+	int df_sz = wq_sz * MLX5_SEND_WQEBB_NUM_DS;
+
+	sq->skb = kzalloc_node(wq_sz * sizeof(*sq->skb), GFP_KERNEL, numa);
+	sq->dma_fifo = kzalloc_node(df_sz * sizeof(*sq->dma_fifo), GFP_KERNEL,
+				    numa);
+
+	if (!sq->skb || !sq->dma_fifo) {
+		mlx5e_free_sq_db(sq);
+		return -ENOMEM;
+	}
+
+	sq->dma_fifo_mask = df_sz - 1;
+
+	return 0;
+}
+
+static int mlx5e_create_sq(struct mlx5e_channel *c,
+			   int tc,
+			   struct mlx5e_sq_param *param,
+			   struct mlx5e_sq *sq)
+{
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *sqc = param->sqc;
+	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
+	int err;
+
+	err = mlx5_alloc_map_uar(mdev, &sq->uar);
+	if (err)
+		return err;
+
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
+				 &sq->wq_ctrl);
+	if (err)
+		goto err_unmap_free_uar;
+
+	sq->wq.db       = &sq->wq.db[MLX5_SND_DBR];
+	sq->uar_map     = sq->uar.map;
+	sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
+
+	if (mlx5e_alloc_sq_db(sq, cpu_to_node(c->cpu)))
+		goto err_sq_wq_destroy;
+
+	sq->txq = netdev_get_tx_queue(priv->netdev,
+				      c->ix + tc * priv->params.num_channels);
+
+	sq->pdev    = c->pdev;
+	sq->mkey_be = c->mkey_be;
+	sq->channel = c;
+	sq->tc      = tc;
+
+	return 0;
+
+err_sq_wq_destroy:
+	mlx5_wq_destroy(&sq->wq_ctrl);
+
+err_unmap_free_uar:
+	mlx5_unmap_free_uar(mdev, &sq->uar);
+
+	return err;
+}
+
+static void mlx5e_destroy_sq(struct mlx5e_sq *sq)
+{
+	struct mlx5e_channel *c = sq->channel;
+	struct mlx5e_priv *priv = c->priv;
+
+	mlx5e_free_sq_db(sq);
+	mlx5_wq_destroy(&sq->wq_ctrl);
+	mlx5_unmap_free_uar(priv->mdev, &sq->uar);
+}
+
+static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param)
+{
+	struct mlx5e_channel *c = sq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *sqc;
+	void *wq;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_sq_in) +
+		sizeof(u64) * sq->wq_ctrl.buf.npages;
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	sqc = MLX5_ADDR_OF(create_sq_in, in, ctx);
+	wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	memcpy(sqc, param->sqc, sizeof(param->sqc));
+
+	MLX5_SET(sqc,  sqc, user_index,		sq->tc);
+	MLX5_SET(sqc,  sqc, tis_num_0,		priv->tisn[sq->tc]);
+	MLX5_SET(sqc,  sqc, cqn,		c->sq[sq->tc].cq.mcq.cqn);
+	MLX5_SET(sqc,  sqc, state,		MLX5_SQC_STATE_RST);
+	MLX5_SET(sqc,  sqc, tis_lst_sz,		1);
+	MLX5_SET(sqc,  sqc, flush_in_error_en,	1);
+
+	MLX5_SET(wq,   wq, wq_type,       MLX5_WQ_TYPE_CYCLIC);
+	MLX5_SET(wq,   wq, uar_page,      sq->uar.index);
+	MLX5_SET(wq,   wq, log_wq_pg_sz,  sq->wq_ctrl.buf.page_shift -
+					  PAGE_SHIFT);
+	MLX5_SET64(wq, wq, dbr_addr,      sq->wq_ctrl.db.dma);
+
+	mlx5_fill_page_array(&sq->wq_ctrl.buf,
+			     (__be64 *)MLX5_ADDR_OF(wq, wq, pas));
+
+	err = mlx5_create_sq(mdev, in, inlen, &sq->sqn);
+
+	kvfree(in);
+
+	return err;
+}
+
+static int mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state)
+{
+	struct mlx5e_channel *c = sq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *sqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
+
+	MLX5_SET(modify_sq_in, in, sq_state, curr_state);
+	MLX5_SET(sqc, sqc, state, next_state);
+
+	err = mlx5_modify_sq(mdev, sq->sqn, in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5e_disable_sq(struct mlx5e_sq *sq)
+{
+	struct mlx5e_channel *c = sq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	mlx5_destroy_sq(mdev, sq->sqn);
+}
+
+static int mlx5e_open_sq(struct mlx5e_channel *c,
+			 int tc,
+			 struct mlx5e_sq_param *param,
+			 struct mlx5e_sq *sq)
+{
+	int err;
+
+	err = mlx5e_create_sq(c, tc, param, sq);
+	if (err)
+		return err;
+
+	err = mlx5e_enable_sq(sq, param);
+	if (err)
+		goto err_destroy_sq;
+
+	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
+	if (err)
+		goto err_disable_sq;
+
+	set_bit(MLX5E_SQ_STATE_WAKE_TXQ_ENABLE, &sq->state);
+	netdev_tx_reset_queue(sq->txq);
+	netif_tx_start_queue(sq->txq);
+
+	return 0;
+
+err_disable_sq:
+	mlx5e_disable_sq(sq);
+err_destroy_sq:
+	mlx5e_destroy_sq(sq);
+
+	return err;
+}
+
+static inline void netif_tx_disable_queue(struct netdev_queue *txq)
+{
+	__netif_tx_lock_bh(txq);
+	netif_tx_stop_queue(txq);
+	__netif_tx_unlock_bh(txq);
+}
+
+static void mlx5e_close_sq(struct mlx5e_sq *sq)
+{
+	clear_bit(MLX5E_SQ_STATE_WAKE_TXQ_ENABLE, &sq->state);
+	napi_synchronize(&sq->channel->napi); /* prevent netif_tx_wake_queue */
+	netif_tx_disable_queue(sq->txq);
+
+	/* ensure hw is notified of all pending wqes */
+	if (mlx5e_sq_has_room_for(sq, 1))
+		mlx5e_send_nop(sq);
+
+	mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+	while (sq->cc != sq->pc) /* wait till sq is empty */
+		msleep(20);
+
+	/* avoid destroying sq before mlx5e_poll_tx_cq() is done with it */
+	napi_synchronize(&sq->channel->napi);
+
+	mlx5e_disable_sq(sq);
+	mlx5e_destroy_sq(sq);
+}
+
+static int mlx5e_create_cq(struct mlx5e_channel *c,
+			   struct mlx5e_cq_param *param,
+			   struct mlx5e_cq *cq)
+{
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_core_cq *mcq = &cq->mcq;
+	int eqn_not_used;
+	int irqn;
+	int err;
+	u32 i;
+
+	param->wq.numa = cpu_to_node(c->cpu);
+	param->eq_ix   = c->ix;
+
+	err = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
+			       &cq->wq_ctrl);
+	if (err)
+		return err;
+
+	mlx5_vector2eqn(mdev, param->eq_ix, &eqn_not_used, &irqn);
+
+	cq->napi        = &c->napi;
+
+	mcq->cqe_sz     = 64;
+	mcq->set_ci_db  = cq->wq_ctrl.db.db;
+	mcq->arm_db     = cq->wq_ctrl.db.db + 1;
+	*mcq->set_ci_db = 0;
+	*mcq->arm_db    = 0;
+	mcq->vector     = param->eq_ix;
+	mcq->comp       = mlx5e_completion_event;
+	mcq->event      = mlx5e_cq_error_event;
+	mcq->irqn       = irqn;
+	mcq->uar        = &priv->cq_uar;
+
+	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
+		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
+
+		cqe->op_own = 0xf1;
+	}
+
+	cq->channel = c;
+
+	return 0;
+}
+
+static void mlx5e_destroy_cq(struct mlx5e_cq *cq)
+{
+	mlx5_wq_destroy(&cq->wq_ctrl);
+}
+
+static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
+{
+	struct mlx5e_channel *c = cq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_core_cq *mcq = &cq->mcq;
+
+	void *in;
+	void *cqc;
+	int inlen;
+	int irqn_not_used;
+	int eqn;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
+		sizeof(u64) * cq->wq_ctrl.buf.npages;
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
+
+	memcpy(cqc, param->cqc, sizeof(param->cqc));
+
+	mlx5_fill_page_array(&cq->wq_ctrl.buf,
+			     (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
+
+	mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used);
+
+	MLX5_SET(cqc,   cqc, c_eqn,         eqn);
+	MLX5_SET(cqc,   cqc, uar_page,      mcq->uar->index);
+	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
+					    PAGE_SHIFT);
+	MLX5_SET64(cqc, cqc, dbr_addr,      cq->wq_ctrl.db.dma);
+
+	err = mlx5_core_create_cq(mdev, mcq, in, inlen);
+
+	kvfree(in);
+
+	if (err)
+		return err;
+
+	mlx5e_cq_arm(cq);
+
+	return 0;
+}
+
+static void mlx5e_disable_cq(struct mlx5e_cq *cq)
+{
+	struct mlx5e_channel *c = cq->channel;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	mlx5_core_destroy_cq(mdev, &cq->mcq);
+}
+
+static int mlx5e_open_cq(struct mlx5e_channel *c,
+			 struct mlx5e_cq_param *param,
+			 struct mlx5e_cq *cq,
+			 u16 moderation_usecs,
+			 u16 moderation_frames)
+{
+	int err;
+	struct mlx5e_priv *priv = c->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	err = mlx5e_create_cq(c, param, cq);
+	if (err)
+		return err;
+
+	err = mlx5e_enable_cq(cq, param);
+	if (err)
+		goto err_destroy_cq;
+
+	err = mlx5_core_modify_cq_moderation(mdev, &cq->mcq,
+					     moderation_usecs,
+					     moderation_frames);
+	if (err)
+		goto err_destroy_cq;
+
+	return 0;
+
+err_destroy_cq:
+	mlx5e_destroy_cq(cq);
+
+	return err;
+}
+
+static void mlx5e_close_cq(struct mlx5e_cq *cq)
+{
+	mlx5e_disable_cq(cq);
+	mlx5e_destroy_cq(cq);
+}
+
+static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
+{
+	return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
+}
+
+static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
+			     struct mlx5e_channel_param *cparam)
+{
+	struct mlx5e_priv *priv = c->priv;
+	int err;
+	int tc;
+
+	for (tc = 0; tc < c->num_tc; tc++) {
+		err = mlx5e_open_cq(c, &cparam->tx_cq, &c->sq[tc].cq,
+				    priv->params.tx_cq_moderation_usec,
+				    priv->params.tx_cq_moderation_pkts);
+		if (err)
+			goto err_close_tx_cqs;
+
+		c->sq[tc].cq.sqrq = &c->sq[tc];
+	}
+
+	return 0;
+
+err_close_tx_cqs:
+	for (tc--; tc >= 0; tc--)
+		mlx5e_close_cq(&c->sq[tc].cq);
+
+	return err;
+}
+
+static void mlx5e_close_tx_cqs(struct mlx5e_channel *c)
+{
+	int tc;
+
+	for (tc = 0; tc < c->num_tc; tc++)
+		mlx5e_close_cq(&c->sq[tc].cq);
+}
+
+static int mlx5e_open_sqs(struct mlx5e_channel *c,
+			  struct mlx5e_channel_param *cparam)
+{
+	int err;
+	int tc;
+
+	for (tc = 0; tc < c->num_tc; tc++) {
+		err = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]);
+		if (err)
+			goto err_close_sqs;
+	}
+
+	return 0;
+
+err_close_sqs:
+	for (tc--; tc >= 0; tc--)
+		mlx5e_close_sq(&c->sq[tc]);
+
+	return err;
+}
+
+static void mlx5e_close_sqs(struct mlx5e_channel *c)
+{
+	int tc;
+
+	for (tc = 0; tc < c->num_tc; tc++)
+		mlx5e_close_sq(&c->sq[tc]);
+}
+
+static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
+			      struct mlx5e_channel_param *cparam,
+			      struct mlx5e_channel **cp)
+{
+	struct net_device *netdev = priv->netdev;
+	int cpu = mlx5e_get_cpu(priv, ix);
+	struct mlx5e_channel *c;
+	int err;
+
+	c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
+	if (!c)
+		return -ENOMEM;
+
+	c->priv     = priv;
+	c->ix       = ix;
+	c->cpu      = cpu;
+	c->pdev     = &priv->mdev->pdev->dev;
+	c->netdev   = priv->netdev;
+	c->mkey_be  = cpu_to_be32(priv->mr.key);
+	c->num_tc   = priv->num_tc;
+
+	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
+
+	err = mlx5e_open_tx_cqs(c, cparam);
+	if (err)
+		goto err_napi_del;
+
+	err = mlx5e_open_cq(c, &cparam->rx_cq, &c->rq.cq,
+			    priv->params.rx_cq_moderation_usec,
+			    priv->params.rx_cq_moderation_pkts);
+	if (err)
+		goto err_close_tx_cqs;
+	c->rq.cq.sqrq = &c->rq;
+
+	napi_enable(&c->napi);
+
+	err = mlx5e_open_sqs(c, cparam);
+	if (err)
+		goto err_disable_napi;
+
+	err = mlx5e_open_rq(c, &cparam->rq, &c->rq);
+	if (err)
+		goto err_close_sqs;
+
+	netif_set_xps_queue(netdev, get_cpu_mask(c->cpu), ix);
+	*cp = c;
+
+	return 0;
+
+err_close_sqs:
+	mlx5e_close_sqs(c);
+
+err_disable_napi:
+	napi_disable(&c->napi);
+	mlx5e_close_cq(&c->rq.cq);
+
+err_close_tx_cqs:
+	mlx5e_close_tx_cqs(c);
+
+err_napi_del:
+	netif_napi_del(&c->napi);
+	kfree(c);
+
+	return err;
+}
+
+static void mlx5e_close_channel(struct mlx5e_channel *c)
+{
+	mlx5e_close_rq(&c->rq);
+	mlx5e_close_sqs(c);
+	napi_disable(&c->napi);
+	mlx5e_close_cq(&c->rq.cq);
+	mlx5e_close_tx_cqs(c);
+	netif_napi_del(&c->napi);
+	kfree(c);
+}
+
+static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
+				 struct mlx5e_rq_param *param)
+{
+	void *rqc = param->rqc;
+	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+
+	MLX5_SET(wq, wq, wq_type,          MLX5_WQ_TYPE_LINKED_LIST);
+	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
+	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
+	MLX5_SET(wq, wq, log_wq_sz,        priv->params.log_rq_size);
+	MLX5_SET(wq, wq, pd,               priv->pdn);
+
+	param->wq.numa   = dev_to_node(&priv->mdev->pdev->dev);
+	param->wq.linear = 1;
+}
+
+static void mlx5e_build_sq_param(struct mlx5e_priv *priv,
+				 struct mlx5e_sq_param *param)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+
+	MLX5_SET(wq, wq, log_wq_sz,     priv->params.log_sq_size);
+	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
+	MLX5_SET(wq, wq, pd,            priv->pdn);
+
+	param->wq.numa = dev_to_node(&priv->mdev->pdev->dev);
+}
+
+static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
+					struct mlx5e_cq_param *param)
+{
+	void *cqc = param->cqc;
+
+	MLX5_SET(cqc, cqc, uar_page, priv->cq_uar.index);
+}
+
+static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
+				    struct mlx5e_cq_param *param)
+{
+	void *cqc = param->cqc;
+
+	MLX5_SET(cqc, cqc, log_cq_size,  priv->params.log_rq_size);
+
+	mlx5e_build_common_cq_param(priv, param);
+}
+
+static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
+				    struct mlx5e_cq_param *param)
+{
+	void *cqc = param->cqc;
+
+	MLX5_SET(cqc, cqc, log_cq_size,  priv->params.log_sq_size);
+
+	mlx5e_build_common_cq_param(priv, param);
+}
+
+static void mlx5e_build_channel_param(struct mlx5e_priv *priv,
+				      struct mlx5e_channel_param *cparam)
+{
+	memset(cparam, 0, sizeof(*cparam));
+
+	mlx5e_build_rq_param(priv, &cparam->rq);
+	mlx5e_build_sq_param(priv, &cparam->sq);
+	mlx5e_build_rx_cq_param(priv, &cparam->rx_cq);
+	mlx5e_build_tx_cq_param(priv, &cparam->tx_cq);
+}
+
+static int mlx5e_open_channels(struct mlx5e_priv *priv)
+{
+	struct mlx5e_channel_param cparam;
+	int err;
+	int i;
+	int j;
+
+	priv->channel = kcalloc(priv->params.num_channels,
+				sizeof(struct mlx5e_channel *), GFP_KERNEL);
+	if (!priv->channel)
+		return -ENOMEM;
+
+	mlx5e_build_channel_param(priv, &cparam);
+	for (i = 0; i < priv->params.num_channels; i++) {
+		err = mlx5e_open_channel(priv, i, &cparam, &priv->channel[i]);
+		if (err)
+			goto err_close_channels;
+	}
+
+	for (j = 0; j < priv->params.num_channels; j++) {
+		err = mlx5e_wait_for_min_rx_wqes(&priv->channel[j]->rq);
+		if (err)
+			goto err_close_channels;
+	}
+
+	return 0;
+
+err_close_channels:
+	for (i--; i >= 0; i--)
+		mlx5e_close_channel(priv->channel[i]);
+
+	kfree(priv->channel);
+
+	return err;
+}
+
+static void mlx5e_close_channels(struct mlx5e_priv *priv)
+{
+	int i;
+
+	for (i = 0; i < priv->params.num_channels; i++)
+		mlx5e_close_channel(priv->channel[i]);
+
+	kfree(priv->channel);
+}
+
+static int mlx5e_open_tis(struct mlx5e_priv *priv, int tc)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
+	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(tisc, tisc, prio,  tc);
+
+	return mlx5_create_tis(mdev, in, sizeof(in), &priv->tisn[tc]);
+}
+
+static void mlx5e_close_tis(struct mlx5e_priv *priv, int tc)
+{
+	mlx5_destroy_tis(priv->mdev, priv->tisn[tc]);
+}
+
+static int mlx5e_open_tises(struct mlx5e_priv *priv)
+{
+	int num_tc = priv->num_tc;
+	int err;
+	int tc;
+
+	for (tc = 0; tc < num_tc; tc++) {
+		err = mlx5e_open_tis(priv, tc);
+		if (err)
+			goto err_close_tises;
+	}
+
+	return 0;
+
+err_close_tises:
+	for (tc--; tc >= 0; tc--)
+		mlx5e_close_tis(priv, tc);
+
+	return err;
+}
+
+static void mlx5e_close_tises(struct mlx5e_priv *priv)
+{
+	int num_tc = priv->num_tc;
+	int tc;
+
+	for (tc = 0; tc < num_tc; tc++)
+		mlx5e_close_tis(priv, tc);
+}
+
+static int mlx5e_open_rqt(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 *in;
+	u32 out[MLX5_ST_SZ_DW(create_rqt_out)];
+	void *rqtc;
+	int inlen;
+	int err;
+	int sz;
+	int i;
+
+	sz = 1 << priv->params.rx_hash_log_tbl_sz;
+
+	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
+
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
+
+	for (i = 0; i < sz; i++) {
+		int ix = i % priv->params.num_channels;
+
+		MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix]->rq.rqn);
+	}
+
+	MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(mdev, in, inlen, out, sizeof(out));
+	if (!err)
+		priv->rqtn = MLX5_GET(create_rqt_out, out, rqtn);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5e_close_rqt(struct mlx5e_priv *priv)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)];
+	u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT);
+	MLX5_SET(destroy_rqt_in, in, rqtn, priv->rqtn);
+
+	mlx5_cmd_exec_check_status(priv->mdev, in, sizeof(in), out,
+				   sizeof(out));
+}
+
+static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
+{
+	void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
+
+#define ROUGH_MAX_L2_L3_HDR_SZ 256
+
+#define MLX5_HASH_IP     (MLX5_HASH_FIELD_SEL_SRC_IP   |\
+			  MLX5_HASH_FIELD_SEL_DST_IP)
+
+#define MLX5_HASH_ALL    (MLX5_HASH_FIELD_SEL_SRC_IP   |\
+			  MLX5_HASH_FIELD_SEL_DST_IP   |\
+			  MLX5_HASH_FIELD_SEL_L4_SPORT |\
+			  MLX5_HASH_FIELD_SEL_L4_DPORT)
+
+	if (priv->params.lro_en) {
+		MLX5_SET(tirc, tirc, lro_enable_mask,
+			 MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
+			 MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
+		MLX5_SET(tirc, tirc, lro_max_ip_payload_size,
+			 (priv->params.lro_wqe_sz -
+			  ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
+		MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
+			 MLX5_CAP_ETH(priv->mdev,
+				      lro_timer_supported_periods[3]));
+	}
+
+	switch (tt) {
+	case MLX5E_TT_ANY:
+		MLX5_SET(tirc, tirc, disp_type,
+			 MLX5_TIRC_DISP_TYPE_DIRECT);
+		MLX5_SET(tirc, tirc, inline_rqn,
+			 priv->channel[0]->rq.rqn);
+		break;
+	default:
+		MLX5_SET(tirc, tirc, disp_type,
+			 MLX5_TIRC_DISP_TYPE_INDIRECT);
+		MLX5_SET(tirc, tirc, indirect_table,
+			 priv->rqtn);
+		MLX5_SET(tirc, tirc, rx_hash_fn,
+			 MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ);
+		MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+		netdev_rss_key_fill(MLX5_ADDR_OF(tirc, tirc,
+						 rx_hash_toeplitz_key),
+				    MLX5_FLD_SZ_BYTES(tirc,
+						      rx_hash_toeplitz_key));
+		break;
+	}
+
+	switch (tt) {
+	case MLX5E_TT_IPV4_TCP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV4);
+		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+			 MLX5_L4_PROT_TYPE_TCP);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_ALL);
+		break;
+
+	case MLX5E_TT_IPV6_TCP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV6);
+		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+			 MLX5_L4_PROT_TYPE_TCP);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_ALL);
+		break;
+
+	case MLX5E_TT_IPV4_UDP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV4);
+		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+			 MLX5_L4_PROT_TYPE_UDP);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_ALL);
+		break;
+
+	case MLX5E_TT_IPV6_UDP:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV6);
+		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
+			 MLX5_L4_PROT_TYPE_UDP);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_ALL);
+		break;
+
+	case MLX5E_TT_IPV4:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV4);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP);
+		break;
+
+	case MLX5E_TT_IPV6:
+		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
+			 MLX5_L3_PROT_TYPE_IPV6);
+		MLX5_SET(rx_hash_field_select, hfso, selected_fields,
+			 MLX5_HASH_IP);
+		break;
+	}
+}
+
+static int mlx5e_open_tir(struct mlx5e_priv *priv, int tt)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 *in;
+	void *tirc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
+
+	mlx5e_build_tir_ctx(priv, tirc, tt);
+
+	err = mlx5_create_tir(mdev, in, inlen, &priv->tirn[tt]);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void mlx5e_close_tir(struct mlx5e_priv *priv, int tt)
+{
+	mlx5_destroy_tir(priv->mdev, priv->tirn[tt]);
+}
+
+static int mlx5e_open_tirs(struct mlx5e_priv *priv)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < MLX5E_NUM_TT; i++) {
+		err = mlx5e_open_tir(priv, i);
+		if (err)
+			goto err_close_tirs;
+	}
+
+	return 0;
+
+err_close_tirs:
+	for (i--; i >= 0; i--)
+		mlx5e_close_tir(priv, i);
+
+	return err;
+}
+
+static void mlx5e_close_tirs(struct mlx5e_priv *priv)
+{
+	int i;
+
+	for (i = 0; i < MLX5E_NUM_TT; i++)
+		mlx5e_close_tir(priv, i);
+}
+
+int mlx5e_open_locked(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int actual_mtu;
+	int num_txqs;
+	int err;
+
+	num_txqs = roundup_pow_of_two(priv->params.num_channels) *
+		   priv->params.num_tc;
+	netif_set_real_num_tx_queues(netdev, num_txqs);
+	netif_set_real_num_rx_queues(netdev, priv->params.num_channels);
+
+	err = mlx5_set_port_mtu(mdev, netdev->mtu);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_set_port_mtu failed %d\n",
+			   __func__, err);
+		return err;
+	}
+
+	err = mlx5_query_port_oper_mtu(mdev, &actual_mtu);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_query_port_oper_mtu failed %d\n",
+			   __func__, err);
+		return err;
+	}
+
+	if (actual_mtu != netdev->mtu)
+		netdev_warn(netdev, "%s: Failed to set MTU to %d\n",
+			    __func__, netdev->mtu);
+
+	netdev->mtu = actual_mtu;
+
+	err = mlx5e_open_tises(priv);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5e_open_tises failed, %d\n",
+			   __func__, err);
+		return err;
+	}
+
+	err = mlx5e_open_channels(priv);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5e_open_channels failed, %d\n",
+			   __func__, err);
+		goto err_close_tises;
+	}
+
+	err = mlx5e_open_rqt(priv);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5e_open_rqt failed, %d\n",
+			   __func__, err);
+		goto err_close_channels;
+	}
+
+	err = mlx5e_open_tirs(priv);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5e_open_tir failed, %d\n",
+			   __func__, err);
+		goto err_close_rqls;
+	}
+
+	err = mlx5e_open_flow_table(priv);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5e_open_flow_table failed, %d\n",
+			   __func__, err);
+		goto err_close_tirs;
+	}
+
+	err = mlx5e_add_all_vlan_rules(priv);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5e_add_all_vlan_rules failed, %d\n",
+			   __func__, err);
+		goto err_close_flow_table;
+	}
+
+	mlx5e_init_eth_addr(priv);
+
+	set_bit(MLX5E_STATE_OPENED, &priv->state);
+
+	mlx5e_update_carrier(priv);
+	mlx5e_set_rx_mode_core(priv);
+
+	schedule_delayed_work(&priv->update_stats_work, 0);
+	return 0;
+
+err_close_flow_table:
+	mlx5e_close_flow_table(priv);
+
+err_close_tirs:
+	mlx5e_close_tirs(priv);
+
+err_close_rqls:
+	mlx5e_close_rqt(priv);
+
+err_close_channels:
+	mlx5e_close_channels(priv);
+
+err_close_tises:
+	mlx5e_close_tises(priv);
+
+	return err;
+}
+
+static int mlx5e_open(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	mutex_lock(&priv->state_lock);
+	err = mlx5e_open_locked(netdev);
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+int mlx5e_close_locked(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	clear_bit(MLX5E_STATE_OPENED, &priv->state);
+
+	mlx5e_set_rx_mode_core(priv);
+	mlx5e_del_all_vlan_rules(priv);
+	netif_carrier_off(priv->netdev);
+	mlx5e_close_flow_table(priv);
+	mlx5e_close_tirs(priv);
+	mlx5e_close_rqt(priv);
+	mlx5e_close_channels(priv);
+	mlx5e_close_tises(priv);
+
+	return 0;
+}
+
+static int mlx5e_close(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err;
+
+	mutex_lock(&priv->state_lock);
+	err = mlx5e_close_locked(netdev);
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+int mlx5e_update_priv_params(struct mlx5e_priv *priv,
+			     struct mlx5e_params *new_params)
+{
+	int err = 0;
+	int was_opened;
+
+	WARN_ON(!mutex_is_locked(&priv->state_lock));
+
+	was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
+	if (was_opened)
+		mlx5e_close_locked(priv->netdev);
+
+	priv->params = *new_params;
+
+	if (was_opened)
+		err = mlx5e_open_locked(priv->netdev);
+
+	return err;
+}
+
+static struct rtnl_link_stats64 *
+mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5e_vport_stats *vstats = &priv->stats.vport;
+
+	stats->rx_packets = vstats->rx_packets;
+	stats->rx_bytes   = vstats->rx_bytes;
+	stats->tx_packets = vstats->tx_packets;
+	stats->tx_bytes   = vstats->tx_bytes;
+	stats->multicast  = vstats->rx_multicast_packets +
+			    vstats->tx_multicast_packets;
+	stats->tx_errors  = vstats->tx_error_packets;
+	stats->rx_errors  = vstats->rx_error_packets;
+	stats->tx_dropped = vstats->tx_queue_dropped;
+	stats->rx_crc_errors = 0;
+	stats->rx_length_errors = 0;
+
+	return stats;
+}
+
+static void mlx5e_set_rx_mode(struct net_device *dev)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	schedule_work(&priv->set_rx_mode_work);
+}
+
+static int mlx5e_set_mac(struct net_device *netdev, void *addr)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct sockaddr *saddr = addr;
+
+	if (!is_valid_ether_addr(saddr->sa_data))
+		return -EADDRNOTAVAIL;
+
+	netif_addr_lock_bh(netdev);
+	ether_addr_copy(netdev->dev_addr, saddr->sa_data);
+	netif_addr_unlock_bh(netdev);
+
+	schedule_work(&priv->set_rx_mode_work);
+
+	return 0;
+}
+
+static int mlx5e_set_features(struct net_device *netdev,
+			      netdev_features_t features)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	netdev_features_t changes = features ^ netdev->features;
+	struct mlx5e_params new_params;
+	bool update_params = false;
+
+	mutex_lock(&priv->state_lock);
+	new_params = priv->params;
+
+	if (changes & NETIF_F_LRO) {
+		new_params.lro_en = !!(features & NETIF_F_LRO);
+		update_params = true;
+	}
+
+	if (update_params)
+		mlx5e_update_priv_params(priv, &new_params);
+
+	if (changes & NETIF_F_HW_VLAN_CTAG_FILTER) {
+		if (features & NETIF_F_HW_VLAN_CTAG_FILTER)
+			mlx5e_enable_vlan_filter(priv);
+		else
+			mlx5e_disable_vlan_filter(priv);
+	}
+
+	mutex_unlock(&priv->state_lock);
+
+	return 0;
+}
+
+static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int max_mtu;
+	int err = 0;
+
+	err = mlx5_query_port_max_mtu(mdev, &max_mtu);
+	if (err)
+		return err;
+
+	if (new_mtu > max_mtu || new_mtu < MLX5E_PARAMS_MIN_MTU) {
+		netdev_err(netdev, "%s: Bad MTU size, mtu must be [%d-%d]\n",
+			   __func__, MLX5E_PARAMS_MIN_MTU, max_mtu);
+		return -EINVAL;
+	}
+
+	mutex_lock(&priv->state_lock);
+	netdev->mtu = new_mtu;
+	err = mlx5e_update_priv_params(priv, &priv->params);
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
+static struct net_device_ops mlx5e_netdev_ops = {
+	.ndo_open                = mlx5e_open,
+	.ndo_stop                = mlx5e_close,
+	.ndo_start_xmit          = mlx5e_xmit,
+	.ndo_get_stats64         = mlx5e_get_stats,
+	.ndo_set_rx_mode         = mlx5e_set_rx_mode,
+	.ndo_set_mac_address     = mlx5e_set_mac,
+	.ndo_vlan_rx_add_vid	 = mlx5e_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	 = mlx5e_vlan_rx_kill_vid,
+	.ndo_set_features        = mlx5e_set_features,
+	.ndo_change_mtu		 = mlx5e_change_mtu,
+};
+
+static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
+{
+	if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
+		return -ENOTSUPP;
+	if (!MLX5_CAP_GEN(mdev, eth_net_offloads) ||
+	    !MLX5_CAP_GEN(mdev, nic_flow_table) ||
+	    !MLX5_CAP_ETH(mdev, csum_cap) ||
+	    !MLX5_CAP_ETH(mdev, max_lso_cap) ||
+	    !MLX5_CAP_ETH(mdev, vlan_cap) ||
+	    !MLX5_CAP_ETH(mdev, rss_ind_tbl_cap)) {
+		mlx5_core_warn(mdev,
+			       "Not creating net device, some required device capabilities are missing\n");
+		return -ENOTSUPP;
+	}
+	return 0;
+}
+
+static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
+				    struct net_device *netdev,
+				    int num_comp_vectors)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	priv->params.log_sq_size           =
+		MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
+	priv->params.log_rq_size           =
+		MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
+	priv->params.rx_cq_moderation_usec =
+		MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
+	priv->params.rx_cq_moderation_pkts =
+		MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
+	priv->params.tx_cq_moderation_usec =
+		MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
+	priv->params.tx_cq_moderation_pkts =
+		MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
+	priv->params.min_rx_wqes           =
+		MLX5E_PARAMS_DEFAULT_MIN_RX_WQES;
+	priv->params.rx_hash_log_tbl_sz    =
+		(order_base_2(num_comp_vectors) >
+		 MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) ?
+		order_base_2(num_comp_vectors)           :
+		MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ;
+	priv->params.num_tc                = 1;
+	priv->params.default_vlan_prio     = 0;
+
+	priv->params.lro_en = false && !!MLX5_CAP_ETH(priv->mdev, lro_cap);
+	priv->params.lro_wqe_sz            =
+		MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
+
+	priv->mdev                         = mdev;
+	priv->netdev                       = netdev;
+	priv->params.num_channels          = num_comp_vectors;
+	priv->order_base_2_num_channels    = order_base_2(num_comp_vectors);
+	priv->queue_mapping_channel_mask   =
+		roundup_pow_of_two(num_comp_vectors) - 1;
+	priv->num_tc                       = priv->params.num_tc;
+	priv->default_vlan_prio            = priv->params.default_vlan_prio;
+
+	spin_lock_init(&priv->async_events_spinlock);
+	mutex_init(&priv->state_lock);
+
+	INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work);
+	INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work);
+	INIT_DELAYED_WORK(&priv->update_stats_work, mlx5e_update_stats_work);
+}
+
+static void mlx5e_set_netdev_dev_addr(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	mlx5_query_vport_mac_address(priv->mdev, netdev->dev_addr);
+}
+
+static void mlx5e_build_netdev(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
+
+	if (priv->num_tc > 1) {
+		mlx5e_netdev_ops.ndo_select_queue = mlx5e_select_queue;
+		mlx5e_netdev_ops.ndo_start_xmit   = mlx5e_xmit_multi_tc;
+	}
+
+	netdev->netdev_ops        = &mlx5e_netdev_ops;
+	netdev->watchdog_timeo    = 15 * HZ;
+
+	netdev->ethtool_ops	  = &mlx5e_ethtool_ops;
+
+	netdev->vlan_features    |= NETIF_F_IP_CSUM;
+	netdev->vlan_features    |= NETIF_F_IPV6_CSUM;
+	netdev->vlan_features    |= NETIF_F_GRO;
+	netdev->vlan_features    |= NETIF_F_TSO;
+	netdev->vlan_features    |= NETIF_F_TSO6;
+	netdev->vlan_features    |= NETIF_F_RXCSUM;
+	netdev->vlan_features    |= NETIF_F_RXHASH;
+
+	if (!!MLX5_CAP_ETH(mdev, lro_cap))
+		netdev->vlan_features    |= NETIF_F_LRO;
+
+	netdev->hw_features       = netdev->vlan_features;
+	netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_TX;
+	netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_RX;
+	netdev->hw_features      |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
+	netdev->features          = netdev->hw_features;
+	if (!priv->params.lro_en)
+		netdev->features  &= ~NETIF_F_LRO;
+
+	netdev->features         |= NETIF_F_HIGHDMA;
+
+	netdev->priv_flags       |= IFF_UNICAST_FLT;
+
+	mlx5e_set_netdev_dev_addr(netdev);
+}
+
+static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn,
+			     struct mlx5_core_mr *mr)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_create_mkey_mbox_in *in;
+	int err;
+
+	in = mlx5_vzalloc(sizeof(*in));
+	if (!in)
+		return -ENOMEM;
+
+	in->seg.flags = MLX5_PERM_LOCAL_WRITE |
+			MLX5_PERM_LOCAL_READ  |
+			MLX5_ACCESS_MODE_PA;
+	in->seg.flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
+	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+
+	err = mlx5_core_create_mkey(mdev, mr, in, sizeof(*in), NULL, NULL,
+				    NULL);
+
+	kvfree(in);
+
+	return err;
+}
+
+static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
+{
+	struct net_device *netdev;
+	struct mlx5e_priv *priv;
+	int ncv = mdev->priv.eq_table.num_comp_vectors;
+	int err;
+
+	if (mlx5e_check_required_hca_cap(mdev))
+		return NULL;
+
+	netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv),
+				    roundup_pow_of_two(ncv) * MLX5E_MAX_NUM_TC,
+				    ncv);
+	if (!netdev) {
+		mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n");
+		return NULL;
+	}
+
+	mlx5e_build_netdev_priv(mdev, netdev, ncv);
+	mlx5e_build_netdev(netdev);
+
+	netif_carrier_off(netdev);
+
+	priv = netdev_priv(netdev);
+
+	err = mlx5_alloc_map_uar(mdev, &priv->cq_uar);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_alloc_map_uar failed, %d\n",
+			   __func__, err);
+		goto err_free_netdev;
+	}
+
+	err = mlx5_core_alloc_pd(mdev, &priv->pdn);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_core_alloc_pd failed, %d\n",
+			   __func__, err);
+		goto err_unmap_free_uar;
+	}
+
+	err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5e_create_mkey failed, %d\n",
+			   __func__, err);
+		goto err_dealloc_pd;
+	}
+
+	err = register_netdev(netdev);
+	if (err) {
+		netdev_err(netdev, "%s: register_netdev failed, %d\n",
+			   __func__, err);
+		goto err_destroy_mkey;
+	}
+
+	mlx5e_enable_async_events(priv);
+
+	return priv;
+
+err_destroy_mkey:
+	mlx5_core_destroy_mkey(mdev, &priv->mr);
+
+err_dealloc_pd:
+	mlx5_core_dealloc_pd(mdev, priv->pdn);
+
+err_unmap_free_uar:
+	mlx5_unmap_free_uar(mdev, &priv->cq_uar);
+
+err_free_netdev:
+	free_netdev(netdev);
+
+	return NULL;
+}
+
+static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
+{
+	struct mlx5e_priv *priv = vpriv;
+	struct net_device *netdev = priv->netdev;
+
+	unregister_netdev(netdev);
+	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
+	mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
+	mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar);
+	mlx5e_disable_async_events(priv);
+	flush_scheduled_work();
+	free_netdev(netdev);
+}
+
+static void *mlx5e_get_netdev(void *vpriv)
+{
+	struct mlx5e_priv *priv = vpriv;
+
+	return priv->netdev;
+}
+
+static struct mlx5_interface mlx5e_interface = {
+	.add       = mlx5e_create_netdev,
+	.remove    = mlx5e_destroy_netdev,
+	.event     = mlx5e_async_event,
+	.protocol  = MLX5_INTERFACE_PROTOCOL_ETH,
+	.get_dev   = mlx5e_get_netdev,
+};
+
+void mlx5e_init(void)
+{
+	mlx5_register_interface(&mlx5e_interface);
+}
+
+void mlx5e_cleanup(void)
+{
+	mlx5_unregister_interface(&mlx5e_interface);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index e7b7b123a128..1c37f587426d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -48,10 +48,6 @@
 #include <linux/mlx5/mlx5_ifc.h>
 #include "mlx5_core.h"
 
-#define DRIVER_NAME "mlx5_core"
-#define DRIVER_VERSION "3.0"
-#define DRIVER_RELDATE  "January 2015"
-
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
 MODULE_DESCRIPTION("Mellanox Connect-IB, ConnectX-4 core driver");
 MODULE_LICENSE("Dual BSD/GPL");
@@ -614,6 +610,61 @@ clean:
 	return err;
 }
 
+#ifdef CONFIG_MLX5_CORE_EN
+static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
+{
+	u32 query_in[MLX5_ST_SZ_DW(query_issi_in)];
+	u32 query_out[MLX5_ST_SZ_DW(query_issi_out)];
+	u32 set_in[MLX5_ST_SZ_DW(set_issi_in)];
+	u32 set_out[MLX5_ST_SZ_DW(set_issi_out)];
+	int err;
+	u32 sup_issi;
+
+	memset(query_in, 0, sizeof(query_in));
+	memset(query_out, 0, sizeof(query_out));
+
+	MLX5_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI);
+
+	err = mlx5_cmd_exec_check_status(dev, query_in, sizeof(query_in),
+					 query_out, sizeof(query_out));
+	if (err) {
+		if (((struct mlx5_outbox_hdr *)query_out)->status ==
+		    MLX5_CMD_STAT_BAD_OP_ERR) {
+			pr_debug("Only ISSI 0 is supported\n");
+			return 0;
+		}
+
+		pr_err("failed to query ISSI\n");
+		return err;
+	}
+
+	sup_issi = MLX5_GET(query_issi_out, query_out, supported_issi_dw0);
+
+	if (sup_issi & (1 << 1)) {
+		memset(set_in, 0, sizeof(set_in));
+		memset(set_out, 0, sizeof(set_out));
+
+		MLX5_SET(set_issi_in, set_in, opcode, MLX5_CMD_OP_SET_ISSI);
+		MLX5_SET(set_issi_in, set_in, current_issi, 1);
+
+		err = mlx5_cmd_exec_check_status(dev, set_in, sizeof(set_in),
+						 set_out, sizeof(set_out));
+		if (err) {
+			pr_err("failed to set ISSI=1\n");
+			return err;
+		}
+
+		dev->issi = 1;
+
+		return 0;
+	} else if (sup_issi & (1 << 0)) {
+		return 0;
+	}
+
+	return -ENOTSUPP;
+}
+#endif
+
 static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 {
 	struct mlx5_priv *priv = &dev->priv;
@@ -676,6 +727,14 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 		goto err_pagealloc_cleanup;
 	}
 
+#ifdef CONFIG_MLX5_CORE_EN
+	err = mlx5_core_set_issi(dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to set issi\n");
+		goto err_disable_hca;
+	}
+#endif
+
 	err = mlx5_satisfy_startup_pages(dev, 1);
 	if (err) {
 		dev_err(&pdev->dev, "failed to allocate boot pages\n");
@@ -1084,6 +1143,10 @@ static int __init init(void)
 	if (err)
 		goto err_health;
 
+#ifdef CONFIG_MLX5_CORE_EN
+	mlx5e_init();
+#endif
+
 	return 0;
 
 err_health:
@@ -1096,6 +1159,9 @@ err_debug:
 
 static void __exit cleanup(void)
 {
+#ifdef CONFIG_MLX5_CORE_EN
+	mlx5e_cleanup();
+#endif
 	pci_unregister_driver(&mlx5_core_driver);
 	mlx5_health_cleanup();
 	destroy_workqueue(mlx5_core_wq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index b986f1c258bc..6983c1047255 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -37,6 +37,10 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 
+#define DRIVER_NAME "mlx5_core"
+#define DRIVER_VERSION "3.0-1"
+#define DRIVER_RELDATE  "January 2015"
+
 extern int mlx5_core_debug_mask;
 
 #define mlx5_core_dbg(dev, format, ...)					\
@@ -78,4 +82,7 @@ int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev);
 int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 
+void mlx5e_init(void);
+void mlx5e_cleanup(void);
+
 #endif /* __MLX5_CORE_H__ */
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 4ee52bf1f959..b288c538347a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1153,4 +1153,23 @@ enum mlx5_cap_type {
 #define MLX5_CAP_ODP(mdev, cap)\
 	MLX5_GET(odp_cap, mdev->hca_caps_cur[MLX5_CAP_ODP], cap)
 
+enum {
+	MLX5_CMD_STAT_OK			= 0x0,
+	MLX5_CMD_STAT_INT_ERR			= 0x1,
+	MLX5_CMD_STAT_BAD_OP_ERR		= 0x2,
+	MLX5_CMD_STAT_BAD_PARAM_ERR		= 0x3,
+	MLX5_CMD_STAT_BAD_SYS_STATE_ERR		= 0x4,
+	MLX5_CMD_STAT_BAD_RES_ERR		= 0x5,
+	MLX5_CMD_STAT_RES_BUSY			= 0x6,
+	MLX5_CMD_STAT_LIM_ERR			= 0x8,
+	MLX5_CMD_STAT_BAD_RES_STATE_ERR		= 0x9,
+	MLX5_CMD_STAT_IX_ERR			= 0xa,
+	MLX5_CMD_STAT_NO_RES_ERR		= 0xf,
+	MLX5_CMD_STAT_BAD_INP_LEN_ERR		= 0x50,
+	MLX5_CMD_STAT_BAD_OUTP_LEN_ERR		= 0x51,
+	MLX5_CMD_STAT_BAD_QP_STATE_ERR		= 0x10,
+	MLX5_CMD_STAT_BAD_PKT_ERR		= 0x30,
+	MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR	= 0x40,
+};
+
 #endif /* MLX5_DEVICE_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 51738472657e..7fa26f03acc1 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -489,6 +489,7 @@ struct mlx5_core_dev {
 	struct mlx5_priv	priv;
 	struct mlx5_profile	*profile;
 	atomic_t		num_qps;
+	u32			issi;
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From 07d783fd830a49008f3b2764ae7b6033ee1bf329 Mon Sep 17 00:00:00 2001
From: Peter Senna Tschudin <peter.senna@gmail.com>
Date: Tue, 19 May 2015 11:44:46 +0200
Subject: staging: goldfish: Fix pointer cast for 32 bits

As the first argument of gf_write64() was of type unsigned long, and as
some calls to gf_write64() were casting the first argument from void *
to u64 the compiler and/or sparse were printing warnings for casts of
wrong sizes when compiling for i386.

This patch changes the type of the first argument of gf_write64() to
const void *, and update calls to the function. This change fixed the
warnings and allowed to remove casts from 3 calls to gf_write64().

In addition gf_write64() was renamed to gf_write_ptr() as the name was
misleading because it only writes 32 bits on 32 bit systems.

gf_write_dma_addr() was added to handle dma_addr_t values which is
used at drivers/staging/goldfish/goldfish_audio.c.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Peter Senna Tschudin <peter.senna@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/platform/goldfish/goldfish_pipe.c | 18 +++++++++---------
 drivers/staging/goldfish/goldfish_audio.c |  2 +-
 drivers/staging/goldfish/goldfish_nand.c  |  2 +-
 drivers/tty/goldfish.c                    |  4 ++--
 include/linux/goldfish.h                  | 19 +++++++++++++++----
 5 files changed, 28 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index d9a09d9637d9..aad16bc9e630 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -158,8 +158,8 @@ static u32 goldfish_cmd_status(struct goldfish_pipe *pipe, u32 cmd)
 	struct goldfish_pipe_dev *dev = pipe->dev;
 
 	spin_lock_irqsave(&dev->lock, flags);
-	gf_write64((u64)(unsigned long)pipe, dev->base + PIPE_REG_CHANNEL,
-				dev->base + PIPE_REG_CHANNEL_HIGH);
+	gf_write_ptr(pipe, dev->base + PIPE_REG_CHANNEL,
+		     dev->base + PIPE_REG_CHANNEL_HIGH);
 	writel(cmd, dev->base + PIPE_REG_COMMAND);
 	status = readl(dev->base + PIPE_REG_STATUS);
 	spin_unlock_irqrestore(&dev->lock, flags);
@@ -172,8 +172,8 @@ static void goldfish_cmd(struct goldfish_pipe *pipe, u32 cmd)
 	struct goldfish_pipe_dev *dev = pipe->dev;
 
 	spin_lock_irqsave(&dev->lock, flags);
-	gf_write64((u64)(unsigned long)pipe, dev->base + PIPE_REG_CHANNEL,
-				dev->base + PIPE_REG_CHANNEL_HIGH);
+	gf_write_ptr(pipe, dev->base + PIPE_REG_CHANNEL,
+		     dev->base + PIPE_REG_CHANNEL_HIGH);
 	writel(cmd, dev->base + PIPE_REG_COMMAND);
 	spin_unlock_irqrestore(&dev->lock, flags);
 }
@@ -327,12 +327,12 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
 		spin_lock_irqsave(&dev->lock, irq_flags);
 		if (access_with_param(dev, CMD_WRITE_BUFFER + cmd_offset,
 				address, avail, pipe, &status)) {
-			gf_write64((u64)(unsigned long)pipe,
-				   dev->base + PIPE_REG_CHANNEL,
-				   dev->base + PIPE_REG_CHANNEL_HIGH);
+			gf_write_ptr(pipe, dev->base + PIPE_REG_CHANNEL,
+				     dev->base + PIPE_REG_CHANNEL_HIGH);
 			writel(avail, dev->base + PIPE_REG_SIZE);
-			gf_write64(address, dev->base + PIPE_REG_ADDRESS,
-				dev->base + PIPE_REG_ADDRESS_HIGH);
+			gf_write_ptr((void *)address,
+				     dev->base + PIPE_REG_ADDRESS,
+				     dev->base + PIPE_REG_ADDRESS_HIGH);
 			writel(CMD_WRITE_BUFFER + cmd_offset,
 					dev->base + PIPE_REG_COMMAND);
 			status = readl(dev->base + PIPE_REG_STATUS);
diff --git a/drivers/staging/goldfish/goldfish_audio.c b/drivers/staging/goldfish/goldfish_audio.c
index 702ae04df912..b0927e49d0a8 100644
--- a/drivers/staging/goldfish/goldfish_audio.c
+++ b/drivers/staging/goldfish/goldfish_audio.c
@@ -63,7 +63,7 @@ struct goldfish_audio {
 #define AUDIO_READ(data, addr)		(readl(data->reg_base + addr))
 #define AUDIO_WRITE(data, addr, x)	(writel(x, data->reg_base + addr))
 #define AUDIO_WRITE64(data, addr, addr2, x)	\
-	(gf_write64((u64)(x), data->reg_base + addr, data->reg_base+addr2))
+	(gf_write_dma_addr((x), data->reg_base + addr, data->reg_base+addr2))
 
 /*
  *  temporary variable used between goldfish_audio_probe() and
diff --git a/drivers/staging/goldfish/goldfish_nand.c b/drivers/staging/goldfish/goldfish_nand.c
index 213877a2c430..66ae48fcc2b2 100644
--- a/drivers/staging/goldfish/goldfish_nand.c
+++ b/drivers/staging/goldfish/goldfish_nand.c
@@ -87,7 +87,7 @@ static u32 goldfish_nand_cmd(struct mtd_info *mtd, enum nand_cmd cmd,
 		writel((u32)(addr >> 32), base + NAND_ADDR_HIGH);
 		writel((u32)addr, base + NAND_ADDR_LOW);
 		writel(len, base + NAND_TRANSFER_SIZE);
-		gf_write64((u64)ptr, base + NAND_DATA, base + NAND_DATA_HIGH);
+		gf_write_ptr(ptr, base + NAND_DATA, base + NAND_DATA_HIGH);
 		writel(cmd, base + NAND_COMMAND);
 		rv = readl(base + NAND_RESULT);
 	}
diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c
index 0655fecf8240..0f82c0b146f6 100644
--- a/drivers/tty/goldfish.c
+++ b/drivers/tty/goldfish.c
@@ -59,7 +59,7 @@ static void goldfish_tty_do_write(int line, const char *buf, unsigned count)
 	struct goldfish_tty *qtty = &goldfish_ttys[line];
 	void __iomem *base = qtty->base;
 	spin_lock_irqsave(&qtty->lock, irq_flags);
-	gf_write64((u64)buf, base + GOLDFISH_TTY_DATA_PTR,
+	gf_write_ptr(buf, base + GOLDFISH_TTY_DATA_PTR,
 				base + GOLDFISH_TTY_DATA_PTR_HIGH);
 	writel(count, base + GOLDFISH_TTY_DATA_LEN);
 	writel(GOLDFISH_TTY_CMD_WRITE_BUFFER, base + GOLDFISH_TTY_CMD);
@@ -81,7 +81,7 @@ static irqreturn_t goldfish_tty_interrupt(int irq, void *dev_id)
 
 	count = tty_prepare_flip_string(&qtty->port, &buf, count);
 	spin_lock_irqsave(&qtty->lock, irq_flags);
-	gf_write64((u64)buf, base + GOLDFISH_TTY_DATA_PTR,
+	gf_write_ptr(buf, base + GOLDFISH_TTY_DATA_PTR,
 				base + GOLDFISH_TTY_DATA_PTR_HIGH);
 	writel(count, base + GOLDFISH_TTY_DATA_LEN);
 	writel(GOLDFISH_TTY_CMD_READ_BUFFER, base + GOLDFISH_TTY_CMD);
diff --git a/include/linux/goldfish.h b/include/linux/goldfish.h
index 569236e6b2bc..93e080b39cf6 100644
--- a/include/linux/goldfish.h
+++ b/include/linux/goldfish.h
@@ -3,13 +3,24 @@
 
 /* Helpers for Goldfish virtual platform */
 
-static inline void gf_write64(unsigned long data,
-		void __iomem *portl, void __iomem *porth)
+static inline void gf_write_ptr(const void *ptr, void __iomem *portl,
+				void __iomem *porth)
 {
-	writel((u32)data, portl);
+	writel((u32)(unsigned long)ptr, portl);
 #ifdef CONFIG_64BIT
-	writel(data>>32, porth);
+	writel((unsigned long)ptr >> 32, porth);
 #endif
 }
 
+static inline void gf_write_dma_addr(const dma_addr_t addr,
+				     void __iomem *portl,
+				     void __iomem *porth)
+{
+	writel((u32)addr, portl);
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+	writel(addr >> 32, porth);
+#endif
+}
+
+
 #endif /* __LINUX_GOLDFISH_H */
-- 
cgit v1.2.3


From b144ce2d37619e05afdb0a15676500d76a64b1be Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 27 May 2015 17:17:27 -0700
Subject: mei: fix up uuid matching

A previous commit, c93b76b34b4d ("mei: bus: report also uuid in module
alias") caused a build error as I missed applying a needed patch to add
some macros to uapi/linux/uuid.h.  Instead of those additional macros,
change the mei code to use the existing uuid structure directly.

Fixes: c93b76b34b4d
Cc: Tomas Winkler <tomas.winkler@intel.com>
Cc: Samuel Ortiz <sameo@linux.intel.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/mei/bus.c          | 9 ++-------
 drivers/nfc/mei_phy.h           | 2 +-
 include/linux/mod_devicetable.h | 2 +-
 scripts/mod/file2alias.c        | 7 +++++--
 4 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c
index de8fd089a8a4..357b6ae4d207 100644
--- a/drivers/misc/mei/bus.c
+++ b/drivers/misc/mei/bus.c
@@ -30,11 +30,6 @@
 #define to_mei_cl_driver(d) container_of(d, struct mei_cl_driver, driver)
 #define to_mei_cl_device(d) container_of(d, struct mei_cl_device, dev)
 
-static inline uuid_le uuid_le_cast(const __u8 uuid[16])
-{
-	return *(uuid_le *)uuid;
-}
-
 static int mei_cl_device_match(struct device *dev, struct device_driver *drv)
 {
 	struct mei_cl_device *device = to_mei_cl_device(dev);
@@ -54,9 +49,9 @@ static int mei_cl_device_match(struct device *dev, struct device_driver *drv)
 
 	id = driver->id_table;
 
-	while (uuid_le_cmp(NULL_UUID_LE, uuid_le_cast(id->uuid))) {
+	while (uuid_le_cmp(NULL_UUID_LE, id->uuid)) {
 
-		if (!uuid_le_cmp(*uuid, uuid_le_cast(id->uuid))) {
+		if (!uuid_le_cmp(*uuid, id->uuid)) {
 			if (id->name[0]) {
 				if (!strncmp(name, id->name, sizeof(id->name)))
 					return 1;
diff --git a/drivers/nfc/mei_phy.h b/drivers/nfc/mei_phy.h
index a51f8f2685cc..fbfa3e61738f 100644
--- a/drivers/nfc/mei_phy.h
+++ b/drivers/nfc/mei_phy.h
@@ -5,7 +5,7 @@
 #include <net/nfc/hci.h>
 #include <linux/uuid.h>
 
-#define MEI_NFC_UUID __UUID_LE(0x0bb17a78, 0x2a8e, 0x4c50, \
+#define MEI_NFC_UUID UUID_LE(0x0bb17a78, 0x2a8e, 0x4c50, \
 		0x94, 0xd4, 0x50, 0x26, 0x67, 0x23, 0x77, 0x5c)
 #define MEI_NFC_HEADER_SIZE 10
 #define MEI_NFC_MAX_HCI_PAYLOAD 300
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 2d2b2b571d61..048c270822f9 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -614,7 +614,7 @@ struct ipack_device_id {
  */
 struct mei_cl_device_id {
 	char name[MEI_CL_NAME_SIZE];
-	__u8 uuid[16];
+	uuid_le uuid;
 	kernel_ulong_t driver_info;
 };
 
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 62c517f4b592..718b2a29bd43 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -34,6 +34,9 @@ typedef Elf64_Addr	kernel_ulong_t;
 typedef uint32_t	__u32;
 typedef uint16_t	__u16;
 typedef unsigned char	__u8;
+typedef struct {
+	__u8 b[16];
+} uuid_le;
 
 /* Big exception to the "don't include kernel headers into userspace, which
  * even potentially has different endianness and word sizes, since
@@ -131,13 +134,13 @@ static inline void add_wildcard(char *str)
 		strcat(str + len, "*");
 }
 
-static inline void add_uuid(char *str, __u8 uuid[16])
+static inline void add_uuid(char *str, uuid_le uuid)
 {
 	int len = strlen(str);
 	int i;
 
 	for (i = 0; i < 16; i++)
-		sprintf(str + len + (i << 1), "%02x", uuid[i]);
+		sprintf(str + len + (i << 1), "%02x", uuid.b[i]);
 }
 
 /**
-- 
cgit v1.2.3


From 10081fb532a2a2216b7d8e4ad585c985075b6f60 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Fri, 1 May 2015 15:23:50 +0900
Subject: lib: introduce crc_t10dif_update()

This introduces crc_t10dif_update() which enables to calculate CRC
for a block which straddles multiple SG elements by calling multiple
times.  This also converts crc_t10dif() to use crc_t10dif_update() as
they are almost same.

(remove extra function call in crc_t10dif() and crc_t10dif_update -
 Tim + Herbert)

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: linux-crypto@vger.kernel.org
Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: target-devel@vger.kernel.org
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 include/linux/crc-t10dif.h |  1 +
 lib/crc-t10dif.c           | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h
index cf53d0773ce3..d81961e9e37d 100644
--- a/include/linux/crc-t10dif.h
+++ b/include/linux/crc-t10dif.h
@@ -9,5 +9,6 @@
 extern __u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer,
 				size_t len);
 extern __u16 crc_t10dif(unsigned char const *, size_t);
+extern __u16 crc_t10dif_update(__u16 crc, unsigned char const *, size_t);
 
 #endif
diff --git a/lib/crc-t10dif.c b/lib/crc-t10dif.c
index dfe6ec17c0a5..1ad33e555805 100644
--- a/lib/crc-t10dif.c
+++ b/lib/crc-t10dif.c
@@ -19,7 +19,7 @@
 static struct crypto_shash *crct10dif_tfm;
 static struct static_key crct10dif_fallback __read_mostly;
 
-__u16 crc_t10dif(const unsigned char *buffer, size_t len)
+__u16 crc_t10dif_update(__u16 crc, const unsigned char *buffer, size_t len)
 {
 	struct {
 		struct shash_desc shash;
@@ -28,17 +28,23 @@ __u16 crc_t10dif(const unsigned char *buffer, size_t len)
 	int err;
 
 	if (static_key_false(&crct10dif_fallback))
-		return crc_t10dif_generic(0, buffer, len);
+		return crc_t10dif_generic(crc, buffer, len);
 
 	desc.shash.tfm = crct10dif_tfm;
 	desc.shash.flags = 0;
-	*(__u16 *)desc.ctx = 0;
+	*(__u16 *)desc.ctx = crc;
 
 	err = crypto_shash_update(&desc.shash, buffer, len);
 	BUG_ON(err);
 
 	return *(__u16 *)desc.ctx;
 }
+EXPORT_SYMBOL(crc_t10dif_update);
+
+__u16 crc_t10dif(const unsigned char *buffer, size_t len)
+{
+	return crc_t10dif_update(0, buffer, len);
+}
 EXPORT_SYMBOL(crc_t10dif);
 
 static int __init crc_t10dif_mod_init(void)
-- 
cgit v1.2.3


From c66fa19c405a36673d4aab13658c8246413d5c0f Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Sun, 31 May 2015 09:30:16 +0300
Subject: net/mlx4: Add EQ pool

Previously, mlx4_en allocated EQs and used them exclusively.
This affected RoCE performance, as applications which are
events sensitive were limited to use only the legacy EQs.

Change that by introducing an EQ pool. This pool is managed
by mlx4_core. EQs are assigned to ports (when there are limited
number of EQs, multiple ports could be assigned to the same EQs).

An exception to this rule is the ASYNC EQ which handles various events.

Legacy EQs are completely removed as all EQs could be shared.

When a consumer (mlx4_ib/mlx4_en) requests an EQ, it asks for
EQ serving on a specific port. The core driver calculates which
EQ should be assigned to that request.

Because IRQs are shared between IB and Ethernet modules, their
names only include the PCI device BDF address.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Ido Shamay <idos@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/main.c              |  71 ++---
 drivers/infiniband/hw/mlx4/mlx4_ib.h           |   1 -
 drivers/net/ethernet/mellanox/mlx4/cq.c        |  10 +-
 drivers/net/ethernet/mellanox/mlx4/en_cq.c     |  48 ++--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |   7 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     |  13 +-
 drivers/net/ethernet/mellanox/mlx4/eq.c        | 353 +++++++++++++++----------
 drivers/net/ethernet/mellanox/mlx4/main.c      |  74 ++++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |  11 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |   2 +-
 include/linux/mlx4/device.h                    |  11 +-
 11 files changed, 342 insertions(+), 259 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 8c96c71e7bab..024b0f745035 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2041,77 +2041,52 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev)
 
 static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
 {
-	char name[80];
-	int eq_per_port = 0;
-	int added_eqs = 0;
-	int total_eqs = 0;
-	int i, j, eq;
-
-	/* Legacy mode or comp_pool is not large enough */
-	if (dev->caps.comp_pool == 0 ||
-	    dev->caps.num_ports > dev->caps.comp_pool)
-		return;
-
-	eq_per_port = dev->caps.comp_pool / dev->caps.num_ports;
-
-	/* Init eq table */
-	added_eqs = 0;
-	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
-		added_eqs += eq_per_port;
-
-	total_eqs = dev->caps.num_comp_vectors + added_eqs;
+	int i, j, eq = 0, total_eqs = 0;
 
-	ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL);
+	ibdev->eq_table = kcalloc(dev->caps.num_comp_vectors,
+				  sizeof(ibdev->eq_table[0]), GFP_KERNEL);
 	if (!ibdev->eq_table)
 		return;
 
-	ibdev->eq_added = added_eqs;
-
-	eq = 0;
-	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) {
-		for (j = 0; j < eq_per_port; j++) {
-			snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s",
-				 i, j, dev->persist->pdev->bus->name);
-			/* Set IRQ for specific name (per ring) */
-			if (mlx4_assign_eq(dev, name, NULL,
-					   &ibdev->eq_table[eq])) {
-				/* Use legacy (same as mlx4_en driver) */
-				pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq);
-				ibdev->eq_table[eq] =
-					(eq % dev->caps.num_comp_vectors);
-			}
-			eq++;
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		for (j = 0; j < mlx4_get_eqs_per_port(dev, i);
+		     j++, total_eqs++) {
+			if (i > 1 &&  mlx4_is_eq_shared(dev, total_eqs))
+				continue;
+			ibdev->eq_table[eq] = total_eqs;
+			if (!mlx4_assign_eq(dev, i,
+					    &ibdev->eq_table[eq]))
+				eq++;
+			else
+				ibdev->eq_table[eq] = -1;
 		}
 	}
 
-	/* Fill the reset of the vector with legacy EQ */
-	for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++)
-		ibdev->eq_table[eq++] = i;
+	for (i = eq; i < dev->caps.num_comp_vectors;
+	     ibdev->eq_table[i++] = -1)
+		;
 
 	/* Advertise the new number of EQs to clients */
-	ibdev->ib_dev.num_comp_vectors = total_eqs;
+	ibdev->ib_dev.num_comp_vectors = eq;
 }
 
 static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
 {
 	int i;
+	int total_eqs = ibdev->ib_dev.num_comp_vectors;
 
-	/* no additional eqs were added */
+	/* no eqs were allocated */
 	if (!ibdev->eq_table)
 		return;
 
 	/* Reset the advertised EQ number */
-	ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
+	ibdev->ib_dev.num_comp_vectors = 0;
 
-	/* Free only the added eqs */
-	for (i = 0; i < ibdev->eq_added; i++) {
-		/* Don't free legacy eqs if used */
-		if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors)
-			continue;
+	for (i = 0; i < total_eqs; i++)
 		mlx4_release_eq(dev, ibdev->eq_table[i]);
-	}
 
 	kfree(ibdev->eq_table);
+	ibdev->eq_table = NULL;
 }
 
 static void *mlx4_ib_add(struct mlx4_dev *dev)
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index fce3934372a1..ef80e6c99a68 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -523,7 +523,6 @@ struct mlx4_ib_dev {
 	struct mlx4_ib_iboe	iboe;
 	int			counters[MLX4_MAX_PORTS];
 	int		       *eq_table;
-	int			eq_added;
 	struct kobject	       *iov_parent;
 	struct kobject	       *ports_parent;
 	struct kobject	       *dev_ports_parent[MLX4_MFUNC_MAX];
diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c
index e71f31387ac6..7431cd4d7390 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cq.c
@@ -292,7 +292,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
 	u64 mtt_addr;
 	int err;
 
-	if (vector > dev->caps.num_comp_vectors + dev->caps.comp_pool)
+	if (vector >= dev->caps.num_comp_vectors)
 		return -EINVAL;
 
 	cq->vector = vector;
@@ -319,7 +319,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
 		cq_context->flags  |= cpu_to_be32(1 << 19);
 
 	cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index);
-	cq_context->comp_eqn	    = priv->eq_table.eq[vector].eqn;
+	cq_context->comp_eqn	    = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].eqn;
 	cq_context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
 
 	mtt_addr = mlx4_mtt_addr(dev, mtt);
@@ -339,11 +339,11 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
 	init_completion(&cq->free);
 	cq->comp = mlx4_add_cq_to_tasklet;
 	cq->tasklet_ctx.priv =
-		&priv->eq_table.eq[cq->vector].tasklet_ctx;
+		&priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].tasklet_ctx;
 	INIT_LIST_HEAD(&cq->tasklet_ctx.list);
 
 
-	cq->irq = priv->eq_table.eq[cq->vector].irq;
+	cq->irq = priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(vector)].irq;
 	return 0;
 
 err_radix:
@@ -368,7 +368,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
 	if (err)
 		mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn);
 
-	synchronize_irq(priv->eq_table.eq[cq->vector].irq);
+	synchronize_irq(priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq->vector)].irq);
 
 	spin_lock_irq(&cq_table->lock);
 	radix_tree_delete(&cq_table->tree, cq->cqn);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
index 22da4d0d0f05..d71c567eb076 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -66,6 +66,7 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv,
 
 	cq->ring = ring;
 	cq->is_tx = mode;
+	cq->vector = mdev->dev->caps.num_comp_vectors;
 
 	/* Allocate HW buffers on provided NUMA node.
 	 * dev->numa_node is used in mtt range allocation flow.
@@ -101,12 +102,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
 	int err = 0;
 	char name[25];
 	int timestamp_en = 0;
-	struct cpu_rmap *rmap =
-#ifdef CONFIG_RFS_ACCEL
-		priv->dev->rx_cpu_rmap;
-#else
-		NULL;
-#endif
+	bool assigned_eq = false;
 
 	cq->dev = mdev->pndev[priv->port];
 	cq->mcq.set_ci_db  = cq->wqres.db.db;
@@ -116,23 +112,19 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
 	memset(cq->buf, 0, cq->buf_size);
 
 	if (cq->is_tx == RX) {
-		if (mdev->dev->caps.comp_pool) {
-			if (!cq->vector) {
-				sprintf(name, "%s-%d", priv->dev->name,
-					cq->ring);
-				/* Set IRQ for specific name (per ring) */
-				if (mlx4_assign_eq(mdev->dev, name, rmap,
-						   &cq->vector)) {
-					cq->vector = (cq->ring + 1 + priv->port)
-					    % mdev->dev->caps.num_comp_vectors;
-					mlx4_warn(mdev, "Failed assigning an EQ to %s, falling back to legacy EQ's\n",
-						  name);
-				}
-
+		if (!mlx4_is_eq_vector_valid(mdev->dev, priv->port,
+					     cq->vector)) {
+			cq->vector = cq_idx;
+
+			err = mlx4_assign_eq(mdev->dev, priv->port,
+					     &cq->vector);
+			if (err) {
+				mlx4_err(mdev, "Failed assigning an EQ to %s\n",
+					 name);
+				goto free_eq;
 			}
-		} else {
-			cq->vector = (cq->ring + 1 + priv->port) %
-				mdev->dev->caps.num_comp_vectors;
+
+			assigned_eq = true;
 		}
 
 		cq->irq_desc =
@@ -159,7 +151,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
 			    &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq,
 			    cq->vector, 0, timestamp_en);
 	if (err)
-		return err;
+		goto free_eq;
 
 	cq->mcq.comp  = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq;
 	cq->mcq.event = mlx4_en_cq_event;
@@ -182,6 +174,12 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
 	napi_enable(&cq->napi);
 
 	return 0;
+
+free_eq:
+	if (assigned_eq)
+		mlx4_release_eq(mdev->dev, cq->vector);
+	cq->vector = mdev->dev->caps.num_comp_vectors;
+	return err;
 }
 
 void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
@@ -191,9 +189,9 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq)
 
 	mlx4_en_unmap_buffer(&cq->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size);
-	if (priv->mdev->dev->caps.comp_pool && cq->vector) {
+	if (mlx4_is_eq_vector_valid(mdev->dev, priv->port, cq->vector) &&
+	    cq->is_tx == RX)
 		mlx4_release_eq(priv->mdev->dev, cq->vector);
-	}
 	cq->vector = 0;
 	cq->buf_size = 0;
 	cq->buf = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 32f5ec737472..455cecae5aa4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1958,7 +1958,6 @@ void mlx4_en_free_resources(struct mlx4_en_priv *priv)
 	int i;
 
 #ifdef CONFIG_RFS_ACCEL
-	free_irq_cpu_rmap(priv->dev->rx_cpu_rmap);
 	priv->dev->rx_cpu_rmap = NULL;
 #endif
 
@@ -2016,11 +2015,7 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
 	}
 
 #ifdef CONFIG_RFS_ACCEL
-	if (priv->mdev->dev->caps.comp_pool) {
-		priv->dev->rx_cpu_rmap = alloc_irq_cpu_rmap(priv->mdev->dev->caps.comp_pool);
-		if (!priv->dev->rx_cpu_rmap)
-			goto err;
-	}
+	priv->dev->rx_cpu_rmap = mlx4_get_cpu_rmap(priv->mdev->dev, priv->port);
 #endif
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 2a77a6b19121..35f726c17e48 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -337,15 +337,10 @@ void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
 	struct mlx4_dev *dev = mdev->dev;
 
 	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
-		if (!dev->caps.comp_pool)
-			num_of_eqs = max_t(int, MIN_RX_RINGS,
-					   min_t(int,
-						 dev->caps.num_comp_vectors,
-						 DEF_RX_RINGS));
-		else
-			num_of_eqs = min_t(int, MAX_MSIX_P_PORT,
-					   dev->caps.comp_pool/
-					   dev->caps.num_ports) - 1;
+		num_of_eqs = max_t(int, MIN_RX_RINGS,
+				   min_t(int,
+					 mlx4_get_eqs_per_port(mdev->dev, i),
+					 DEF_RX_RINGS));
 
 		num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS :
 			min_t(int, num_of_eqs,
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
index 80bcd648c5e0..2e6fc6a860a7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -895,8 +895,8 @@ static int mlx4_num_eq_uar(struct mlx4_dev *dev)
 	 * we need to map, take the difference of highest index and
 	 * the lowest index we'll use and add 1.
 	 */
-	return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs +
-		 dev->caps.comp_pool)/4 - dev->caps.reserved_eqs/4 + 1;
+	return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 -
+		dev->caps.reserved_eqs / 4 + 1;
 }
 
 static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq)
@@ -1085,8 +1085,7 @@ static void mlx4_free_eq(struct mlx4_dev *dev,
 static void mlx4_free_irqs(struct mlx4_dev *dev)
 {
 	struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table;
-	struct mlx4_priv *priv = mlx4_priv(dev);
-	int	i, vec;
+	int	i;
 
 	if (eq_table->have_irq)
 		free_irq(dev->persist->pdev->irq, dev);
@@ -1097,20 +1096,6 @@ static void mlx4_free_irqs(struct mlx4_dev *dev)
 			eq_table->eq[i].have_irq = 0;
 		}
 
-	for (i = 0; i < dev->caps.comp_pool; i++) {
-		/*
-		 * Freeing the assigned irq's
-		 * all bits should be 0, but we need to validate
-		 */
-		if (priv->msix_ctl.pool_bm & 1ULL << i) {
-			/* NO need protecting*/
-			vec = dev->caps.num_comp_vectors + 1 + i;
-			free_irq(priv->eq_table.eq[vec].irq,
-				 &priv->eq_table.eq[vec]);
-		}
-	}
-
-
 	kfree(eq_table->irq_names);
 }
 
@@ -1191,76 +1176,73 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
 	}
 
 	priv->eq_table.irq_names =
-		kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1 +
-					     dev->caps.comp_pool),
+		kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1),
 			GFP_KERNEL);
 	if (!priv->eq_table.irq_names) {
 		err = -ENOMEM;
-		goto err_out_bitmap;
+		goto err_out_clr_int;
 	}
 
-	for (i = 0; i < dev->caps.num_comp_vectors; ++i) {
-		err = mlx4_create_eq(dev, dev->caps.num_cqs -
-					  dev->caps.reserved_cqs +
-					  MLX4_NUM_SPARE_EQE,
-				     (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
-				     &priv->eq_table.eq[i]);
-		if (err) {
-			--i;
-			goto err_out_unmap;
-		}
-	}
-
-	err = mlx4_create_eq(dev, MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
-			     (dev->flags & MLX4_FLAG_MSI_X) ? dev->caps.num_comp_vectors : 0,
-			     &priv->eq_table.eq[dev->caps.num_comp_vectors]);
-	if (err)
-		goto err_out_comp;
-
-	/*if additional completion vectors poolsize is 0 this loop will not run*/
-	for (i = dev->caps.num_comp_vectors + 1;
-	      i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) {
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) {
+		if (i == MLX4_EQ_ASYNC) {
+			err = mlx4_create_eq(dev,
+					     MLX4_NUM_ASYNC_EQE + MLX4_NUM_SPARE_EQE,
+					     0, &priv->eq_table.eq[MLX4_EQ_ASYNC]);
+		} else {
+#ifdef CONFIG_RFS_ACCEL
+			struct mlx4_eq	*eq = &priv->eq_table.eq[i];
+			int port = find_first_bit(eq->actv_ports.ports,
+						  dev->caps.num_ports) + 1;
+
+			if (port <= dev->caps.num_ports) {
+				struct mlx4_port_info *info =
+					&mlx4_priv(dev)->port[port];
+
+				if (!info->rmap) {
+					info->rmap = alloc_irq_cpu_rmap(
+						mlx4_get_eqs_per_port(dev, port));
+					if (!info->rmap) {
+						mlx4_warn(dev, "Failed to allocate cpu rmap\n");
+						err = -ENOMEM;
+						goto err_out_unmap;
+					}
+				}
 
-		err = mlx4_create_eq(dev, dev->caps.num_cqs -
-					  dev->caps.reserved_cqs +
-					  MLX4_NUM_SPARE_EQE,
-				     (dev->flags & MLX4_FLAG_MSI_X) ? i : 0,
-				     &priv->eq_table.eq[i]);
-		if (err) {
-			--i;
-			goto err_out_unmap;
+				err = irq_cpu_rmap_add(
+					info->rmap, eq->irq);
+				if (err)
+					mlx4_warn(dev, "Failed adding irq rmap\n");
+			}
+#endif
+			err = mlx4_create_eq(dev, dev->caps.num_cqs -
+						  dev->caps.reserved_cqs +
+						  MLX4_NUM_SPARE_EQE,
+					     (dev->flags & MLX4_FLAG_MSI_X) ?
+					     i + 1 - !!(i > MLX4_EQ_ASYNC) : 0,
+					     eq);
 		}
+		if (err)
+			goto err_out_unmap;
 	}
 
-
 	if (dev->flags & MLX4_FLAG_MSI_X) {
 		const char *eq_name;
 
-		for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) {
-			if (i < dev->caps.num_comp_vectors) {
-				snprintf(priv->eq_table.irq_names +
-					 i * MLX4_IRQNAME_SIZE,
-					 MLX4_IRQNAME_SIZE,
-					 "mlx4-comp-%d@pci:%s", i,
-					 pci_name(dev->persist->pdev));
-			} else {
-				snprintf(priv->eq_table.irq_names +
-					 i * MLX4_IRQNAME_SIZE,
-					 MLX4_IRQNAME_SIZE,
-					 "mlx4-async@pci:%s",
-					 pci_name(dev->persist->pdev));
-			}
+		snprintf(priv->eq_table.irq_names +
+			 MLX4_EQ_ASYNC * MLX4_IRQNAME_SIZE,
+			 MLX4_IRQNAME_SIZE,
+			 "mlx4-async@pci:%s",
+			 pci_name(dev->persist->pdev));
+		eq_name = priv->eq_table.irq_names +
+			MLX4_EQ_ASYNC * MLX4_IRQNAME_SIZE;
 
-			eq_name = priv->eq_table.irq_names +
-				  i * MLX4_IRQNAME_SIZE;
-			err = request_irq(priv->eq_table.eq[i].irq,
-					  mlx4_msi_x_interrupt, 0, eq_name,
-					  priv->eq_table.eq + i);
-			if (err)
-				goto err_out_async;
+		err = request_irq(priv->eq_table.eq[MLX4_EQ_ASYNC].irq,
+				  mlx4_msi_x_interrupt, 0, eq_name,
+				  priv->eq_table.eq + MLX4_EQ_ASYNC);
+		if (err)
+			goto err_out_unmap;
 
-			priv->eq_table.eq[i].have_irq = 1;
-		}
+		priv->eq_table.eq[MLX4_EQ_ASYNC].have_irq = 1;
 	} else {
 		snprintf(priv->eq_table.irq_names,
 			 MLX4_IRQNAME_SIZE,
@@ -1269,36 +1251,38 @@ int mlx4_init_eq_table(struct mlx4_dev *dev)
 		err = request_irq(dev->persist->pdev->irq, mlx4_interrupt,
 				  IRQF_SHARED, priv->eq_table.irq_names, dev);
 		if (err)
-			goto err_out_async;
+			goto err_out_unmap;
 
 		priv->eq_table.have_irq = 1;
 	}
 
 	err = mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
-			  priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+			  priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
 	if (err)
 		mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
-			   priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err);
+			   priv->eq_table.eq[MLX4_EQ_ASYNC].eqn, err);
 
-	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
-		eq_set_ci(&priv->eq_table.eq[i], 1);
+	/* arm ASYNC eq */
+	eq_set_ci(&priv->eq_table.eq[MLX4_EQ_ASYNC], 1);
 
 	return 0;
 
-err_out_async:
-	mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]);
-
-err_out_comp:
-	i = dev->caps.num_comp_vectors - 1;
-
 err_out_unmap:
-	while (i >= 0) {
-		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
-		--i;
+	while (i >= 0)
+		mlx4_free_eq(dev, &priv->eq_table.eq[i--]);
+#ifdef CONFIG_RFS_ACCEL
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (mlx4_priv(dev)->port[i].rmap) {
+			free_irq_cpu_rmap(mlx4_priv(dev)->port[i].rmap);
+			mlx4_priv(dev)->port[i].rmap = NULL;
+		}
 	}
+#endif
+	mlx4_free_irqs(dev);
+
+err_out_clr_int:
 	if (!mlx4_is_slave(dev))
 		mlx4_unmap_clr_int(dev);
-	mlx4_free_irqs(dev);
 
 err_out_bitmap:
 	mlx4_unmap_uar(dev);
@@ -1316,11 +1300,19 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev)
 	int i;
 
 	mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 1,
-		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+		    priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
 
+#ifdef CONFIG_RFS_ACCEL
+	for (i = 1; i <= dev->caps.num_ports; i++) {
+		if (mlx4_priv(dev)->port[i].rmap) {
+			free_irq_cpu_rmap(mlx4_priv(dev)->port[i].rmap);
+			mlx4_priv(dev)->port[i].rmap = NULL;
+		}
+	}
+#endif
 	mlx4_free_irqs(dev);
 
-	for (i = 0; i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i)
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i)
 		mlx4_free_eq(dev, &priv->eq_table.eq[i]);
 
 	if (!mlx4_is_slave(dev))
@@ -1371,87 +1363,166 @@ int mlx4_test_interrupts(struct mlx4_dev *dev)
 
 	/* Return to default */
 	mlx4_MAP_EQ(dev, get_async_ev_mask(dev), 0,
-		    priv->eq_table.eq[dev->caps.num_comp_vectors].eqn);
+		    priv->eq_table.eq[MLX4_EQ_ASYNC].eqn);
 	return err;
 }
 EXPORT_SYMBOL(mlx4_test_interrupts);
 
-int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
-		   int *vector)
+bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector)
 {
+	struct mlx4_priv *priv = mlx4_priv(dev);
 
+	vector = MLX4_CQ_TO_EQ_VECTOR(vector);
+	if (vector < 0 || (vector >= dev->caps.num_comp_vectors + 1) ||
+	    (vector == MLX4_EQ_ASYNC))
+		return false;
+
+	return test_bit(port - 1, priv->eq_table.eq[vector].actv_ports.ports);
+}
+EXPORT_SYMBOL(mlx4_is_eq_vector_valid);
+
+u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	unsigned int i;
+	unsigned int sum = 0;
+
+	for (i = 0; i < dev->caps.num_comp_vectors + 1; i++)
+		sum += !!test_bit(port - 1,
+				  priv->eq_table.eq[i].actv_ports.ports);
+
+	return sum;
+}
+EXPORT_SYMBOL(mlx4_get_eqs_per_port);
+
+int mlx4_is_eq_shared(struct mlx4_dev *dev, int vector)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	vector = MLX4_CQ_TO_EQ_VECTOR(vector);
+	if (vector <= 0 || (vector >= dev->caps.num_comp_vectors + 1))
+		return -EINVAL;
+
+	return !!(bitmap_weight(priv->eq_table.eq[vector].actv_ports.ports,
+				dev->caps.num_ports) > 1);
+}
+EXPORT_SYMBOL(mlx4_is_eq_shared);
+
+struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port)
+{
+	return mlx4_priv(dev)->port[port].rmap;
+}
+EXPORT_SYMBOL(mlx4_get_cpu_rmap);
+
+int mlx4_assign_eq(struct mlx4_dev *dev, u8 port, int *vector)
+{
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	int vec = 0, err = 0, i;
+	int err = 0, i = 0;
+	u32 min_ref_count_val = (u32)-1;
+	int requested_vector = MLX4_CQ_TO_EQ_VECTOR(*vector);
+	int *prequested_vector = NULL;
+
 
 	mutex_lock(&priv->msix_ctl.pool_lock);
-	for (i = 0; !vec && i < dev->caps.comp_pool; i++) {
-		if (~priv->msix_ctl.pool_bm & 1ULL << i) {
-			priv->msix_ctl.pool_bm |= 1ULL << i;
-			vec = dev->caps.num_comp_vectors + 1 + i;
-			snprintf(priv->eq_table.irq_names +
-					vec * MLX4_IRQNAME_SIZE,
-					MLX4_IRQNAME_SIZE, "%s", name);
-#ifdef CONFIG_RFS_ACCEL
-			if (rmap) {
-				err = irq_cpu_rmap_add(rmap,
-						       priv->eq_table.eq[vec].irq);
-				if (err)
-					mlx4_warn(dev, "Failed adding irq rmap\n");
+	if (requested_vector < (dev->caps.num_comp_vectors + 1) &&
+	    (requested_vector >= 0) &&
+	    (requested_vector != MLX4_EQ_ASYNC)) {
+		if (test_bit(port - 1,
+			     priv->eq_table.eq[requested_vector].actv_ports.ports)) {
+			prequested_vector = &requested_vector;
+		} else {
+			struct mlx4_eq *eq;
+
+			for (i = 1; i < port;
+			     requested_vector += mlx4_get_eqs_per_port(dev, i++))
+				;
+
+			eq = &priv->eq_table.eq[requested_vector];
+			if (requested_vector < dev->caps.num_comp_vectors + 1 &&
+			    test_bit(port - 1, eq->actv_ports.ports)) {
+				prequested_vector = &requested_vector;
 			}
-#endif
-			err = request_irq(priv->eq_table.eq[vec].irq,
-					  mlx4_msi_x_interrupt, 0,
-					  &priv->eq_table.irq_names[vec<<5],
-					  priv->eq_table.eq + vec);
-			if (err) {
-				/*zero out bit by fliping it*/
-				priv->msix_ctl.pool_bm ^= 1 << i;
-				vec = 0;
-				continue;
-				/*we dont want to break here*/
+		}
+	}
+
+	if  (!prequested_vector) {
+		requested_vector = -1;
+		for (i = 0; min_ref_count_val && i < dev->caps.num_comp_vectors + 1;
+		     i++) {
+			struct mlx4_eq *eq = &priv->eq_table.eq[i];
+
+			if (min_ref_count_val > eq->ref_count &&
+			    test_bit(port - 1, eq->actv_ports.ports)) {
+				min_ref_count_val = eq->ref_count;
+				requested_vector = i;
 			}
+		}
 
-			eq_set_ci(&priv->eq_table.eq[vec], 1);
+		if (requested_vector < 0) {
+			err = -ENOSPC;
+			goto err_unlock;
 		}
+
+		prequested_vector = &requested_vector;
 	}
+
+	if (!test_bit(*prequested_vector, priv->msix_ctl.pool_bm) &&
+	    dev->flags & MLX4_FLAG_MSI_X) {
+		set_bit(*prequested_vector, priv->msix_ctl.pool_bm);
+		snprintf(priv->eq_table.irq_names +
+			 *prequested_vector * MLX4_IRQNAME_SIZE,
+			 MLX4_IRQNAME_SIZE, "mlx4-%d@%s",
+			 *prequested_vector, dev_name(&dev->persist->pdev->dev));
+
+		err = request_irq(priv->eq_table.eq[*prequested_vector].irq,
+				  mlx4_msi_x_interrupt, 0,
+				  &priv->eq_table.irq_names[*prequested_vector << 5],
+				  priv->eq_table.eq + *prequested_vector);
+
+		if (err) {
+			clear_bit(*prequested_vector, priv->msix_ctl.pool_bm);
+			*prequested_vector = -1;
+		} else {
+			eq_set_ci(&priv->eq_table.eq[*prequested_vector], 1);
+			priv->eq_table.eq[*prequested_vector].have_irq = 1;
+		}
+	}
+
+	if (!err && *prequested_vector >= 0)
+		priv->eq_table.eq[*prequested_vector].ref_count++;
+
+err_unlock:
 	mutex_unlock(&priv->msix_ctl.pool_lock);
 
-	if (vec) {
-		*vector = vec;
-	} else {
+	if (!err && *prequested_vector >= 0)
+		*vector = MLX4_EQ_TO_CQ_VECTOR(*prequested_vector);
+	else
 		*vector = 0;
-		err = (i == dev->caps.comp_pool) ? -ENOSPC : err;
-	}
+
 	return err;
 }
 EXPORT_SYMBOL(mlx4_assign_eq);
 
-int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec)
+int mlx4_eq_get_irq(struct mlx4_dev *dev, int cq_vec)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
-	return priv->eq_table.eq[vec].irq;
+	return priv->eq_table.eq[MLX4_CQ_TO_EQ_VECTOR(cq_vec)].irq;
 }
 EXPORT_SYMBOL(mlx4_eq_get_irq);
 
 void mlx4_release_eq(struct mlx4_dev *dev, int vec)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	/*bm index*/
-	int i = vec - dev->caps.num_comp_vectors - 1;
-
-	if (likely(i >= 0)) {
-		/*sanity check , making sure were not trying to free irq's
-		  Belonging to a legacy EQ*/
-		mutex_lock(&priv->msix_ctl.pool_lock);
-		if (priv->msix_ctl.pool_bm & 1ULL << i) {
-			free_irq(priv->eq_table.eq[vec].irq,
-				 &priv->eq_table.eq[vec]);
-			priv->msix_ctl.pool_bm &= ~(1ULL << i);
-		}
-		mutex_unlock(&priv->msix_ctl.pool_lock);
-	}
+	int eq_vec = MLX4_CQ_TO_EQ_VECTOR(vec);
+
+	mutex_lock(&priv->msix_ctl.pool_lock);
+	priv->eq_table.eq[eq_vec].ref_count--;
 
+	/* once we allocated EQ, we don't release it because it might be binded
+	 * to cpu_rmap.
+	 */
+	mutex_unlock(&priv->msix_ctl.pool_lock);
 }
 EXPORT_SYMBOL(mlx4_release_eq);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 70d33f6e2a41..3ec5113c5a33 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -2364,11 +2364,11 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 	if (err) {
 		if (dev->flags & MLX4_FLAG_MSI_X) {
 			mlx4_warn(dev, "NOP command failed to generate MSI-X interrupt IRQ %d)\n",
-				  priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
+				  priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
 			mlx4_warn(dev, "Trying again without MSI-X\n");
 		} else {
 			mlx4_err(dev, "NOP command failed to generate interrupt (IRQ %d), aborting\n",
-				 priv->eq_table.eq[dev->caps.num_comp_vectors].irq);
+				 priv->eq_table.eq[MLX4_EQ_ASYNC].irq);
 			mlx4_err(dev, "BIOS or ACPI interrupt routing problem?\n");
 		}
 
@@ -2486,9 +2486,10 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct msix_entry *entries;
 	int i;
+	int port = 0;
 
 	if (msi_x) {
-		int nreq = dev->caps.num_ports * num_online_cpus() + MSIX_LEGACY_SZ;
+		int nreq = dev->caps.num_ports * num_online_cpus() + 1;
 
 		nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs,
 			     nreq);
@@ -2503,20 +2504,49 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 		nreq = pci_enable_msix_range(dev->persist->pdev, entries, 2,
 					     nreq);
 
-		if (nreq < 0) {
+		if (nreq < 0 || nreq < MLX4_EQ_ASYNC) {
 			kfree(entries);
 			goto no_msi;
-		} else if (nreq < MSIX_LEGACY_SZ +
-			   dev->caps.num_ports * MIN_MSIX_P_PORT) {
-			/*Working in legacy mode , all EQ's shared*/
-			dev->caps.comp_pool           = 0;
-			dev->caps.num_comp_vectors = nreq - 1;
-		} else {
-			dev->caps.comp_pool           = nreq - MSIX_LEGACY_SZ;
-			dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1;
 		}
-		for (i = 0; i < nreq; ++i)
-			priv->eq_table.eq[i].irq = entries[i].vector;
+		/* 1 is reserved for events (asyncrounous EQ) */
+		dev->caps.num_comp_vectors = nreq - 1;
+
+		priv->eq_table.eq[MLX4_EQ_ASYNC].irq = entries[0].vector;
+		bitmap_zero(priv->eq_table.eq[MLX4_EQ_ASYNC].actv_ports.ports,
+			    dev->caps.num_ports);
+
+		for (i = 0; i < dev->caps.num_comp_vectors + 1; i++) {
+			if (i == MLX4_EQ_ASYNC)
+				continue;
+
+			priv->eq_table.eq[i].irq =
+				entries[i + 1 - !!(i > MLX4_EQ_ASYNC)].vector;
+
+			if (MLX4_IS_LEGACY_EQ_MODE(dev->caps)) {
+				bitmap_fill(priv->eq_table.eq[i].actv_ports.ports,
+					    dev->caps.num_ports);
+			} else {
+				set_bit(port,
+					priv->eq_table.eq[i].actv_ports.ports);
+			}
+			/* We divide the Eqs evenly between the two ports.
+			 * (dev->caps.num_comp_vectors / dev->caps.num_ports)
+			 * refers to the number of Eqs per port
+			 * (i.e eqs_per_port). Theoretically, we would like to
+			 * write something like (i + 1) % eqs_per_port == 0.
+			 * However, since there's an asynchronous Eq, we have
+			 * to skip over it by comparing this condition to
+			 * !!((i + 1) > MLX4_EQ_ASYNC).
+			 */
+			if ((dev->caps.num_comp_vectors > dev->caps.num_ports) &&
+			    ((i + 1) %
+			     (dev->caps.num_comp_vectors / dev->caps.num_ports)) ==
+			    !!((i + 1) > MLX4_EQ_ASYNC))
+				/* If dev->caps.num_comp_vectors < dev->caps.num_ports,
+				 * everything is shared anyway.
+				 */
+				port++;
+		}
 
 		dev->flags |= MLX4_FLAG_MSI_X;
 
@@ -2526,10 +2556,15 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 
 no_msi:
 	dev->caps.num_comp_vectors = 1;
-	dev->caps.comp_pool	   = 0;
 
-	for (i = 0; i < 2; ++i)
+	BUG_ON(MLX4_EQ_ASYNC >= 2);
+	for (i = 0; i < 2; ++i) {
 		priv->eq_table.eq[i].irq = dev->persist->pdev->irq;
+		if (i != MLX4_EQ_ASYNC) {
+			bitmap_fill(priv->eq_table.eq[i].actv_ports.ports,
+				    dev->caps.num_ports);
+		}
+	}
 }
 
 static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
@@ -2594,6 +2629,10 @@ static void mlx4_cleanup_port_info(struct mlx4_port_info *info)
 	device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr);
 	device_remove_file(&info->dev->persist->pdev->dev,
 			   &info->port_mtu_attr);
+#ifdef CONFIG_RFS_ACCEL
+	free_irq_cpu_rmap(info->rmap);
+	info->rmap = NULL;
+#endif
 }
 
 static int mlx4_init_steering(struct mlx4_dev *dev)
@@ -3024,7 +3063,7 @@ slave_start:
 	if (err)
 		goto err_master_mfunc;
 
-	priv->msix_ctl.pool_bm = 0;
+	bitmap_zero(priv->msix_ctl.pool_bm, MAX_MSIX);
 	mutex_init(&priv->msix_ctl.pool_lock);
 
 	mlx4_enable_msi_x(dev);
@@ -3046,7 +3085,6 @@ slave_start:
 	    !mlx4_is_mfunc(dev)) {
 		dev->flags &= ~MLX4_FLAG_MSI_X;
 		dev->caps.num_comp_vectors = 1;
-		dev->caps.comp_pool	   = 0;
 		pci_disable_msix(pdev);
 		err = mlx4_setup_hca(dev);
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 502d3dd2c888..ff40098eaf4c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -287,6 +287,12 @@ struct mlx4_icm_table {
 #define MLX4_CQE_SIZE_MASK_STRIDE	0x3
 #define MLX4_EQE_SIZE_MASK_STRIDE	0x30
 
+#define MLX4_EQ_ASYNC			0
+#define MLX4_EQ_TO_CQ_VECTOR(vector)	((vector) - \
+					 !!((int)(vector) >= MLX4_EQ_ASYNC))
+#define MLX4_CQ_TO_EQ_VECTOR(vector)	((vector) + \
+					 !!((int)(vector) >= MLX4_EQ_ASYNC))
+
 /*
  * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
  */
@@ -391,6 +397,8 @@ struct mlx4_eq {
 	struct mlx4_buf_list   *page_list;
 	struct mlx4_mtt		mtt;
 	struct mlx4_eq_tasklet	tasklet_ctx;
+	struct mlx4_active_ports actv_ports;
+	u32			ref_count;
 };
 
 struct mlx4_slave_eqe {
@@ -808,6 +816,7 @@ struct mlx4_port_info {
 	struct mlx4_vlan_table	vlan_table;
 	struct mlx4_roce_gid_table gid_table;
 	int			base_qpn;
+	struct cpu_rmap		*rmap;
 };
 
 struct mlx4_sense {
@@ -818,7 +827,7 @@ struct mlx4_sense {
 };
 
 struct mlx4_msix_ctl {
-	u64		pool_bm;
+	DECLARE_BITMAP(pool_bm, MAX_MSIX);
 	struct mutex	pool_lock;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index d021f079f181..edd8fd69ec9a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -338,7 +338,7 @@ struct mlx4_en_cq {
 	struct napi_struct	napi;
 	int size;
 	int buf_size;
-	unsigned vector;
+	int vector;
 	enum cq_type is_tx;
 	u16 moder_time;
 	u16 moder_cnt;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 83e80ab94500..ad31e476873f 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -46,8 +46,9 @@
 
 #define MAX_MSIX_P_PORT		17
 #define MAX_MSIX		64
-#define MSIX_LEGACY_SZ		4
 #define MIN_MSIX_P_PORT		5
+#define MLX4_IS_LEGACY_EQ_MODE(dev_cap) ((dev_cap).num_comp_vectors < \
+					 (dev_cap).num_ports * MIN_MSIX_P_PORT)
 
 #define MLX4_MAX_100M_UNITS_VAL		255	/*
 						 * work around: can't set values
@@ -528,7 +529,6 @@ struct mlx4_caps {
 	int			num_eqs;
 	int			reserved_eqs;
 	int			num_comp_vectors;
-	int			comp_pool;
 	int			num_mpts;
 	int			max_fmr_maps;
 	int			num_mtts;
@@ -1332,10 +1332,13 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
 int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
 int mlx4_SYNC_TPT(struct mlx4_dev *dev);
 int mlx4_test_interrupts(struct mlx4_dev *dev);
-int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
-		   int *vector);
+u32 mlx4_get_eqs_per_port(struct mlx4_dev *dev, u8 port);
+bool mlx4_is_eq_vector_valid(struct mlx4_dev *dev, u8 port, int vector);
+struct cpu_rmap *mlx4_get_cpu_rmap(struct mlx4_dev *dev, int port);
+int mlx4_assign_eq(struct mlx4_dev *dev, u8 port, int *vector);
 void mlx4_release_eq(struct mlx4_dev *dev, int vec);
 
+int mlx4_is_eq_shared(struct mlx4_dev *dev, int vector);
 int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec);
 
 int mlx4_get_phys_port_id(struct mlx4_dev *dev);
-- 
cgit v1.2.3


From f0e6a326deec8b51ee12f82a34057efd3d0979b8 Mon Sep 17 00:00:00 2001
From: Abhishek Bist <ishubist@gmail.com>
Date: Wed, 27 May 2015 23:54:19 +0530
Subject: USB: hcd.h : Removed an unnecessary function prototype
 usb_find_interface_driver()

This function is used to call in early version of linux kernel in order
to find out the interface used by a usb device. But now it's use is
completely abolished. So,it would be relevant to remove this obselete
function from kernel mainline.

Signed-off-by: Abhishek Bist <ishubist@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/hcd.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
index 68b1e836dff1..c9aa7792de10 100644
--- a/include/linux/usb/hcd.h
+++ b/include/linux/usb/hcd.h
@@ -622,8 +622,6 @@ extern struct list_head usb_bus_list;
 extern struct mutex usb_bus_list_lock;
 extern wait_queue_head_t usb_kill_urb_queue;
 
-extern int usb_find_interface_driver(struct usb_device *dev,
-	struct usb_interface *interface);
 
 #define usb_endpoint_out(ep_dir)	(!((ep_dir) & USB_DIR_IN))
 
-- 
cgit v1.2.3


From 138c3f03b017e261316a4f1ec793e1ff74516def Mon Sep 17 00:00:00 2001
From: Nikhil Badola <nikhil.badola@freescale.com>
Date: Tue, 26 May 2015 17:15:29 +0530
Subject: drivers:usb:fsl: Add support for USB controller version-2.5

Add support for USB controller version-2.5 used in
T4240 rev2.0, T1024, T1040, T2080, LS1021A

Signed-off-by: Nikhil Badola <nikhil.badola@freescale.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/host/fsl-mph-dr-of.c | 5 +++++
 include/linux/fsl_devices.h      | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/usb/host/fsl-mph-dr-of.c b/drivers/usb/host/fsl-mph-dr-of.c
index e588ccd468fc..5e0d60035216 100644
--- a/drivers/usb/host/fsl-mph-dr-of.c
+++ b/drivers/usb/host/fsl-mph-dr-of.c
@@ -127,6 +127,7 @@ static int usb_get_ver_info(struct device_node *np)
 	 * returns 1 for usb controller version 1.6
 	 * returns 2 for usb controller version 2.2
 	 * returns 3 for usb controller version 2.4
+	 * returns 4 for usb controller version 2.5
 	 * returns 0 otherwise
 	 */
 	if (of_device_is_compatible(np, "fsl-usb2-dr")) {
@@ -136,6 +137,8 @@ static int usb_get_ver_info(struct device_node *np)
 			ver = FSL_USB_VER_2_2;
 		else if (of_device_is_compatible(np, "fsl-usb2-dr-v2.4"))
 			ver = FSL_USB_VER_2_4;
+		else if (of_device_is_compatible(np, "fsl-usb2-dr-v2.5"))
+			ver = FSL_USB_VER_2_5;
 		else /* for previous controller versions */
 			ver = FSL_USB_VER_OLD;
 
@@ -153,6 +156,8 @@ static int usb_get_ver_info(struct device_node *np)
 			ver = FSL_USB_VER_2_2;
 		else if (of_device_is_compatible(np, "fsl-usb2-mph-v2.4"))
 			ver = FSL_USB_VER_2_4;
+		else if (of_device_is_compatible(np, "fsl-usb2-mph-v2.5"))
+			ver = FSL_USB_VER_2_5;
 		else /* for previous controller versions */
 			ver = FSL_USB_VER_OLD;
 	}
diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index a82296af413f..2a2f56b292c1 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -24,6 +24,7 @@
 #define FSL_USB_VER_1_6		1
 #define FSL_USB_VER_2_2		2
 #define FSL_USB_VER_2_4		3
+#define FSL_USB_VER_2_5		4
 
 #include <linux/types.h>
 
-- 
cgit v1.2.3


From 34d2e4584ae594eff29d1595d47d7d044e57f834 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 26 May 2015 13:28:48 +0900
Subject: serial: 8250: include <linux/serial_reg.h> from serial_8250.h

The header file, include/linux/serial_8250.h, contains references to
UART_LSR_BRK_ERROR_BITS and UART_MSR_ANY_DELTA that are defined in
<linux/serial_reg.h>.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/serial_8250.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index f0c68d88b6f4..ba82c07feb95 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -12,6 +12,7 @@
 #define _LINUX_SERIAL_8250_H
 
 #include <linux/serial_core.h>
+#include <linux/serial_reg.h>
 #include <linux/platform_device.h>
 
 /*
-- 
cgit v1.2.3


From abf2e7d6e2e315b32ee00067a69aaad2cf4e1b3f Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 28 May 2015 19:26:02 -0700
Subject: bpf: add missing rcu protection when releasing programs from
 prog_array

Normally the program attachment place (like sockets, qdiscs) takes
care of rcu protection and calls bpf_prog_put() after a grace period.
The programs stored inside prog_array may not be attached anywhere,
so prog_array needs to take care of preserving rcu protection.
Otherwise bpf_tail_call() will race with bpf_prog_put().
To solve that introduce bpf_prog_put_rcu() helper function and use
it in 3 places where unattached program can decrement refcnt:
closing program fd, deleting/replacing program in prog_array.

Fixes: 04fd61ab36ec ("bpf: allow bpf programs to tail-call other bpf programs")
Reported-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  6 +++++-
 kernel/bpf/arraymap.c |  4 ++--
 kernel/bpf/syscall.c  | 19 ++++++++++++++++++-
 3 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8821b9a8689e..5f520f5f087e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -123,7 +123,10 @@ struct bpf_prog_aux {
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
-	struct work_struct work;
+	union {
+		struct work_struct work;
+		struct rcu_head	rcu;
+	};
 };
 
 struct bpf_array {
@@ -153,6 +156,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
 void bpf_prog_put(struct bpf_prog *prog);
+void bpf_prog_put_rcu(struct bpf_prog *prog);
 
 struct bpf_map *bpf_map_get(struct fd f);
 void bpf_map_put(struct bpf_map *map);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 614bcd4c1d74..cb31229a6fa4 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -202,7 +202,7 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
 
 	old_prog = xchg(array->prog + index, prog);
 	if (old_prog)
-		bpf_prog_put(old_prog);
+		bpf_prog_put_rcu(old_prog);
 
 	return 0;
 }
@@ -218,7 +218,7 @@ static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
 
 	old_prog = xchg(array->prog + index, NULL);
 	if (old_prog) {
-		bpf_prog_put(old_prog);
+		bpf_prog_put_rcu(old_prog);
 		return 0;
 	} else {
 		return -ENOENT;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 98a69bd83069..a1b14d197a4f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -432,6 +432,23 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 	kfree(aux->used_maps);
 }
 
+static void __prog_put_rcu(struct rcu_head *rcu)
+{
+	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+	free_used_maps(aux);
+	bpf_prog_free(aux->prog);
+}
+
+/* version of bpf_prog_put() that is called after a grace period */
+void bpf_prog_put_rcu(struct bpf_prog *prog)
+{
+	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		prog->aux->prog = prog;
+		call_rcu(&prog->aux->rcu, __prog_put_rcu);
+	}
+}
+
 void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
@@ -445,7 +462,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_prog *prog = filp->private_data;
 
-	bpf_prog_put(prog);
+	bpf_prog_put_rcu(prog);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3b369bd212d5cabb46cff0e863298971b382bbd6 Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Thu, 28 May 2015 15:38:32 +0300
Subject: ieee802154: Fix generation of random EUI-64 addresses.

Currently, ieee802154_random_extended_addr() has a 50% chance of
generating a group (multicast) address, while this function is used
for generating station addresses (which can't be group addresses)
for interfaces that don't have a hardware-provided address.

Also, in case get_random_bytes() generates the EUI-64 address
00:00:00:00:00:00:00:00 (extremely unlikely), which is an invalid
address, ieee802154_random_extended_addr() reacts by changing it
to 01:00:00:00:00:00:00:00, which is an invalid station address as
well, as it is a group address.

This patch changes the address generation procedure to grab eight
random bytes, treat that as an EUI-64, and then clear the Group
address bit and set the Locally Administered bit, which is in
line with how eth_random_addr() generates random EUI-48s.

Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Acked-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 8872ca103d06..552210d0a46f 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -244,9 +244,9 @@ static inline void ieee802154_random_extended_addr(__le64 *addr)
 {
 	get_random_bytes(addr, IEEE802154_EXTENDED_ADDR_LEN);
 
-	/* toggle some bit if we hit an invalid extended addr */
-	if (!ieee802154_is_valid_extended_addr(*addr))
-		((u8 *)addr)[IEEE802154_EXTENDED_ADDR_LEN - 1] ^= 0x01;
+	/* clear the group bit, and set the locally administered bit */
+	((u8 *)addr)[IEEE802154_EXTENDED_ADDR_LEN - 1] &= ~0x01;
+	((u8 *)addr)[IEEE802154_EXTENDED_ADDR_LEN - 1] |= 0x02;
 }
 
 #endif /* LINUX_IEEE802154_H */
-- 
cgit v1.2.3


From daf4e2c89254ed6eb8cf7ef60f614edebfdb9f3a Mon Sep 17 00:00:00 2001
From: Lennert Buytenhek <buytenh@wantstofly.org>
Date: Thu, 28 May 2015 15:38:43 +0300
Subject: ieee802154: Fix EUI-64 station address validation.

Refuse to allow setting an EUI-64 group address as an interface
address, as those are not valid station addresses.

Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Acked-by: Alexander Aring <alex.aring@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/ieee802154.h | 10 ++++------
 net/mac802154/iface.c      |  4 ++--
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h
index 552210d0a46f..1dc1f4ed4001 100644
--- a/include/linux/ieee802154.h
+++ b/include/linux/ieee802154.h
@@ -225,15 +225,13 @@ static inline bool ieee802154_is_valid_psdu_len(const u8 len)
  * ieee802154_is_valid_psdu_len - check if extended addr is valid
  * @addr: extended addr to check
  */
-static inline bool ieee802154_is_valid_extended_addr(const __le64 addr)
+static inline bool ieee802154_is_valid_extended_unicast_addr(const __le64 addr)
 {
-	/* These EUI-64 addresses are reserved by IEEE. 0xffffffffffffffff
-	 * is used internally as extended to short address broadcast mapping.
-	 * This is currently a workaround because neighbor discovery can't
-	 * deal with short addresses types right now.
+	/* Bail out if the address is all zero, or if the group
+	 * address bit is set.
 	 */
 	return ((addr != cpu_to_le64(0x0000000000000000ULL)) &&
-		(addr != cpu_to_le64(0xffffffffffffffffULL)));
+		!(addr & cpu_to_le64(0x0100000000000000ULL)));
 }
 
 /**
diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c
index b544b5dc4bfb..6ac023932ce0 100644
--- a/net/mac802154/iface.c
+++ b/net/mac802154/iface.c
@@ -126,7 +126,7 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p)
 		return -EBUSY;
 
 	ieee802154_be64_to_le64(&extended_addr, addr->sa_data);
-	if (!ieee802154_is_valid_extended_addr(extended_addr))
+	if (!ieee802154_is_valid_extended_unicast_addr(extended_addr))
 		return -EINVAL;
 
 	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
@@ -539,7 +539,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name,
 	switch (type) {
 	case NL802154_IFTYPE_NODE:
 		ndev->type = ARPHRD_IEEE802154;
-		if (ieee802154_is_valid_extended_addr(extended_addr))
+		if (ieee802154_is_valid_extended_unicast_addr(extended_addr))
 			ieee802154_le64_to_be64(ndev->dev_addr, &extended_addr);
 		else
 			memcpy(ndev->dev_addr, ndev->perm_addr,
-- 
cgit v1.2.3


From 1a1bc59c5f7657387d1a4b45d63248fed55ab88c Mon Sep 17 00:00:00 2001
From: Varka Bhadram <varkabhadram@gmail.com>
Date: Fri, 29 May 2015 10:56:55 +0530
Subject: cc2520: fix CC2591 handling

This patch changes tha way of handling of cc2591-cc2520 combination
by moving amplified variable from platform data to private data.
This will be useful in other sections like tx power support.

Signed-off-by: Varka Bhadram <varkab@cdac.in>
Cc: Brad Campbell <bradjc5@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/net/ieee802154/cc2520.c | 10 ++++++++--
 include/linux/spi/cc2520.h      |  1 -
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ieee802154/cc2520.c b/drivers/net/ieee802154/cc2520.c
index ea280d437ca1..0d9353756598 100644
--- a/drivers/net/ieee802154/cc2520.c
+++ b/drivers/net/ieee802154/cc2520.c
@@ -196,6 +196,7 @@ struct cc2520_private {
 	u8 *buf;			/* SPI TX/Rx data buffer */
 	struct mutex buffer_mutex;	/* SPI buffer mutex */
 	bool is_tx;			/* Flag for sync b/w Tx and Rx */
+	bool amplified;			/* Flag for CC2591 */
 	int fifo_pin;			/* FIFO GPIO pin number */
 	struct work_struct fifop_irqwork;/* Workqueue for FIFOP */
 	spinlock_t lock;		/* Lock for is_tx*/
@@ -738,7 +739,9 @@ static int cc2520_get_platform_data(struct spi_device *spi,
 	pdata->vreg = of_get_named_gpio(np, "vreg-gpio", 0);
 	pdata->reset = of_get_named_gpio(np, "reset-gpio", 0);
 
-	pdata->amplified = of_property_read_bool(np, "amplified");
+	/* CC2591 front end for CC2520 */
+	if (of_property_read_bool(np, "amplified"))
+		priv->amplified = true;
 
 	return 0;
 }
@@ -781,7 +784,7 @@ static int cc2520_hw_init(struct cc2520_private *priv)
 	 * amplifier. See section 8 page 17 of TI application note AN065.
 	 * http://www.ti.com/lit/an/swra229a/swra229a.pdf
 	 */
-	if (pdata.amplified) {
+	if (priv->amplified) {
 		ret = cc2520_write_register(priv, CC2520_AGCCTRL1, 0x16);
 		if (ret)
 			goto err_ret;
@@ -896,6 +899,9 @@ static int cc2520_probe(struct spi_device *spi)
 	spin_lock_init(&priv->lock);
 	init_completion(&priv->tx_complete);
 
+	/* Assumption that CC2591 is not connected */
+	priv->amplified = false;
+
 	/* Request all the gpio's */
 	if (!gpio_is_valid(pdata.fifo)) {
 		dev_err(&spi->dev, "fifo gpio is not valid\n");
diff --git a/include/linux/spi/cc2520.h b/include/linux/spi/cc2520.h
index e741e8baad92..85b8ee67e937 100644
--- a/include/linux/spi/cc2520.h
+++ b/include/linux/spi/cc2520.h
@@ -21,7 +21,6 @@ struct cc2520_platform_data {
 	int sfd;
 	int reset;
 	int vreg;
-	bool amplified;
 };
 
 #endif
-- 
cgit v1.2.3


From 6c4e5f9c9ff41ea997fd0f345b3b2b88c113eb68 Mon Sep 17 00:00:00 2001
From: Keith Mange <keith.mange@microsoft.com>
Date: Tue, 26 May 2015 14:23:01 -0700
Subject: Drivers: hv: vmbus:Update preferred vmbus protocol version to windows
 10.

Add support for Windows 10.

Signed-off-by: Keith Mange <keith.mange@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/connection.c | 8 +++++---
 include/linux/hyperv.h  | 4 +++-
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index acd50e9d666a..4fc2e8836e60 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -58,6 +58,9 @@ static __u32 vmbus_get_next_version(__u32 current_version)
 	case (VERSION_WIN8_1):
 		return VERSION_WIN8;
 
+	case (VERSION_WIN10):
+		return VERSION_WIN8_1;
+
 	case (VERSION_WS2008):
 	default:
 		return VERSION_INVAL;
@@ -80,7 +83,7 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo,
 	msg->interrupt_page = virt_to_phys(vmbus_connection.int_page);
 	msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]);
 	msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]);
-	if (version == VERSION_WIN8_1) {
+	if (version >= VERSION_WIN8_1) {
 		msg->target_vcpu = hv_context.vp_index[get_cpu()];
 		put_cpu();
 	}
@@ -376,8 +379,7 @@ void vmbus_on_event(unsigned long data)
 	int cpu = smp_processor_id();
 	union hv_synic_event_flags *event;
 
-	if ((vmbus_proto_version == VERSION_WS2008) ||
-		(vmbus_proto_version == VERSION_WIN7)) {
+	if (vmbus_proto_version < VERSION_WIN8) {
 		maxdword = MAX_NUM_CHANNELS_SUPPORTED >> 5;
 		recv_int_page = vmbus_connection.recv_int_page;
 	} else {
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 3932a993ff5a..4317cd1b69ed 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -160,16 +160,18 @@ hv_get_ringbuffer_availbytes(struct hv_ring_buffer_info *rbi,
  * 1 . 1  (Windows 7)
  * 2 . 4  (Windows 8)
  * 3 . 0  (Windows 8 R2)
+ * 4 . 0  (Windows 10)
  */
 
 #define VERSION_WS2008  ((0 << 16) | (13))
 #define VERSION_WIN7    ((1 << 16) | (1))
 #define VERSION_WIN8    ((2 << 16) | (4))
 #define VERSION_WIN8_1    ((3 << 16) | (0))
+#define VERSION_WIN10	((4 << 16) | (0))
 
 #define VERSION_INVAL -1
 
-#define VERSION_CURRENT VERSION_WIN8_1
+#define VERSION_CURRENT VERSION_WIN10
 
 /* Make maximum size of pipe payload of 16K */
 #define MAX_PIPE_DATA_PAYLOAD		(sizeof(u8) * 16384)
-- 
cgit v1.2.3


From 6fa45a22689722dac9f0e90c0931d4b34b334ede Mon Sep 17 00:00:00 2001
From: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Date: Wed, 20 May 2015 20:56:57 +0530
Subject: parport: add device-model to parport subsystem

parport subsystem starts using the device-model. Drivers using the
device-model has to define devmodel as true and should register the
device with parport using parport_register_dev_model().

Tested-by: Jean Delvare <jdelvare@suse.de>
Tested-by: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Signed-off-by: Sudip Mukherjee <sudip@vectorindia.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/parport/parport_pc.c |   4 +-
 drivers/parport/procfs.c     |  15 +-
 drivers/parport/share.c      | 345 ++++++++++++++++++++++++++++++++++++++++---
 include/linux/parport.h      |  43 +++++-
 4 files changed, 379 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 53d15b30636a..78530d1714dc 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -2255,7 +2255,7 @@ out5:
 		release_region(base+0x3, 5);
 	release_region(base, 3);
 out4:
-	parport_put_port(p);
+	parport_del_port(p);
 out3:
 	kfree(priv);
 out2:
@@ -2294,7 +2294,7 @@ void parport_pc_unregister_port(struct parport *p)
 				    priv->dma_handle);
 #endif
 	kfree(p->private_data);
-	parport_put_port(p);
+	parport_del_port(p);
 	kfree(ops); /* hope no-one cached it */
 }
 EXPORT_SYMBOL(parport_pc_unregister_port);
diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c
index 3b470801a04f..c776333a68bc 100644
--- a/drivers/parport/procfs.c
+++ b/drivers/parport/procfs.c
@@ -21,6 +21,7 @@
 #include <linux/parport.h>
 #include <linux/ctype.h>
 #include <linux/sysctl.h>
+#include <linux/device.h>
 
 #include <asm/uaccess.h>
 
@@ -558,8 +559,18 @@ int parport_device_proc_unregister(struct pardevice *device)
 
 static int __init parport_default_proc_register(void)
 {
+	int ret;
+
 	parport_default_sysctl_table.sysctl_header =
 		register_sysctl_table(parport_default_sysctl_table.dev_dir);
+	if (!parport_default_sysctl_table.sysctl_header)
+		return -ENOMEM;
+	ret = parport_bus_init();
+	if (ret) {
+		unregister_sysctl_table(parport_default_sysctl_table.
+					sysctl_header);
+		return ret;
+	}
 	return 0;
 }
 
@@ -570,6 +581,7 @@ static void __exit parport_default_proc_unregister(void)
 					sysctl_header);
 		parport_default_sysctl_table.sysctl_header = NULL;
 	}
+	parport_bus_exit();
 }
 
 #else /* no sysctl or no procfs*/
@@ -596,11 +608,12 @@ int parport_device_proc_unregister(struct pardevice *device)
 
 static int __init parport_default_proc_register (void)
 {
-	return 0;
+	return parport_bus_init();
 }
 
 static void __exit parport_default_proc_unregister (void)
 {
+	parport_bus_exit();
 }
 #endif
 
diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index 3fa66244ce32..697c6d7bf0fe 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/kmod.h>
+#include <linux/device.h>
 
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
@@ -100,13 +101,91 @@ static struct parport_operations dead_ops = {
 	.owner		= NULL,
 };
 
+static struct device_type parport_device_type = {
+	.name = "parport",
+};
+
+static int is_parport(struct device *dev)
+{
+	return dev->type == &parport_device_type;
+}
+
+static int parport_probe(struct device *dev)
+{
+	struct parport_driver *drv;
+
+	if (is_parport(dev))
+		return -ENODEV;
+
+	drv = to_parport_driver(dev->driver);
+	if (!drv->probe) {
+		/* if driver has not defined a custom probe */
+		struct pardevice *par_dev = to_pardevice(dev);
+
+		if (strcmp(par_dev->name, drv->name))
+			return -ENODEV;
+		return 0;
+	}
+	/* if driver defined its own probe */
+	return drv->probe(to_pardevice(dev));
+}
+
+static struct bus_type parport_bus_type = {
+	.name = "parport",
+	.probe = parport_probe,
+};
+
+int parport_bus_init(void)
+{
+	return bus_register(&parport_bus_type);
+}
+
+void parport_bus_exit(void)
+{
+	bus_unregister(&parport_bus_type);
+}
+
+/*
+ * iterates through all the drivers registered with the bus and sends the port
+ * details to the match_port callback of the driver, so that the driver can
+ * know about the new port that just regsitered with the bus and decide if it
+ * wants to use this new port.
+ */
+static int driver_check(struct device_driver *dev_drv, void *_port)
+{
+	struct parport *port = _port;
+	struct parport_driver *drv = to_parport_driver(dev_drv);
+
+	if (drv->match_port)
+		drv->match_port(port);
+	return 0;
+}
+
 /* Call attach(port) for each registered driver. */
 static void attach_driver_chain(struct parport *port)
 {
 	/* caller has exclusive registration_lock */
 	struct parport_driver *drv;
+
 	list_for_each_entry(drv, &drivers, list)
 		drv->attach(port);
+
+	/*
+	 * call the driver_check function of the drivers registered in
+	 * new device model
+	 */
+
+	bus_for_each_drv(&parport_bus_type, NULL, port, driver_check);
+}
+
+static int driver_detach(struct device_driver *_drv, void *_port)
+{
+	struct parport *port = _port;
+	struct parport_driver *drv = to_parport_driver(_drv);
+
+	if (drv->detach)
+		drv->detach(port);
+	return 0;
 }
 
 /* Call detach(port) for each registered driver. */
@@ -116,6 +195,13 @@ static void detach_driver_chain(struct parport *port)
 	/* caller has exclusive registration_lock */
 	list_for_each_entry(drv, &drivers, list)
 		drv->detach (port);
+
+	/*
+	 * call the detach function of the drivers registered in
+	 * new device model
+	 */
+
+	bus_for_each_drv(&parport_bus_type, NULL, port, driver_detach);
 }
 
 /* Ask kmod for some lowlevel drivers. */
@@ -126,17 +212,39 @@ static void get_lowlevel_driver (void)
 	request_module ("parport_lowlevel");
 }
 
+/*
+ * iterates through all the devices connected to the bus and sends the device
+ * details to the match_port callback of the driver, so that the driver can
+ * know what are all the ports that are connected to the bus and choose the
+ * port to which it wants to register its device.
+ */
+static int port_check(struct device *dev, void *dev_drv)
+{
+	struct parport_driver *drv = dev_drv;
+
+	/* only send ports, do not send other devices connected to bus */
+	if (is_parport(dev))
+		drv->match_port(to_parport_dev(dev));
+	return 0;
+}
+
 /**
  *	parport_register_driver - register a parallel port device driver
  *	@drv: structure describing the driver
+ *	@owner: owner module of drv
+ *	@mod_name: module name string
  *
  *	This can be called by a parallel port device driver in order
  *	to receive notifications about ports being found in the
  *	system, as well as ports no longer available.
  *
+ *	If devmodel is true then the new device model is used
+ *	for registration.
+ *
  *	The @drv structure is allocated by the caller and must not be
  *	deallocated until after calling parport_unregister_driver().
  *
+ *	If using the non device model:
  *	The driver's attach() function may block.  The port that
  *	attach() is given will be valid for the duration of the
  *	callback, but if the driver wants to take a copy of the
@@ -148,21 +256,57 @@ static void get_lowlevel_driver (void)
  *	callback, but if the driver wants to take a copy of the
  *	pointer it must call parport_get_port() to do so.
  *
- *	Returns 0 on success.  Currently it always succeeds.
+ *
+ *	Returns 0 on success. The non device model will always succeeds.
+ *	but the new device model can fail and will return the error code.
  **/
 
-int parport_register_driver (struct parport_driver *drv)
+int __parport_register_driver(struct parport_driver *drv, struct module *owner,
+			      const char *mod_name)
 {
-	struct parport *port;
-
 	if (list_empty(&portlist))
 		get_lowlevel_driver ();
 
-	mutex_lock(&registration_lock);
-	list_for_each_entry(port, &portlist, list)
-		drv->attach(port);
-	list_add(&drv->list, &drivers);
-	mutex_unlock(&registration_lock);
+	if (drv->devmodel) {
+		/* using device model */
+		int ret;
+
+		/* initialize common driver fields */
+		drv->driver.name = drv->name;
+		drv->driver.bus = &parport_bus_type;
+		drv->driver.owner = owner;
+		drv->driver.mod_name = mod_name;
+		ret = driver_register(&drv->driver);
+		if (ret)
+			return ret;
+
+		mutex_lock(&registration_lock);
+		if (drv->match_port)
+			bus_for_each_dev(&parport_bus_type, NULL, drv,
+					 port_check);
+		mutex_unlock(&registration_lock);
+	} else {
+		struct parport *port;
+
+		drv->devmodel = false;
+
+		mutex_lock(&registration_lock);
+		list_for_each_entry(port, &portlist, list)
+			drv->attach(port);
+		list_add(&drv->list, &drivers);
+		mutex_unlock(&registration_lock);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(__parport_register_driver);
+
+static int port_detach(struct device *dev, void *_drv)
+{
+	struct parport_driver *drv = _drv;
+
+	if (is_parport(dev) && drv->detach)
+		drv->detach(to_parport_dev(dev));
 
 	return 0;
 }
@@ -189,15 +333,22 @@ void parport_unregister_driver (struct parport_driver *drv)
 	struct parport *port;
 
 	mutex_lock(&registration_lock);
-	list_del_init(&drv->list);
-	list_for_each_entry(port, &portlist, list)
-		drv->detach(port);
+	if (drv->devmodel) {
+		bus_for_each_dev(&parport_bus_type, NULL, drv, port_detach);
+		driver_unregister(&drv->driver);
+	} else {
+		list_del_init(&drv->list);
+		list_for_each_entry(port, &portlist, list)
+			drv->detach(port);
+	}
 	mutex_unlock(&registration_lock);
 }
 
-static void free_port (struct parport *port)
+static void free_port(struct device *dev)
 {
 	int d;
+	struct parport *port = to_parport_dev(dev);
+
 	spin_lock(&full_list_lock);
 	list_del(&port->full_list);
 	spin_unlock(&full_list_lock);
@@ -223,25 +374,29 @@ static void free_port (struct parport *port)
 
 struct parport *parport_get_port (struct parport *port)
 {
-	atomic_inc (&port->ref_count);
-	return port;
+	struct device *dev = get_device(&port->bus_dev);
+
+	return to_parport_dev(dev);
+}
+
+void parport_del_port(struct parport *port)
+{
+	device_unregister(&port->bus_dev);
 }
+EXPORT_SYMBOL(parport_del_port);
 
 /**
  *	parport_put_port - decrement a port's reference count
  *	@port: the port
  *
  *	This should be called once for each call to parport_get_port(),
- *	once the port is no longer needed.
+ *	once the port is no longer needed. When the reference count reaches
+ *	zero (port is no longer used), free_port is called.
  **/
 
 void parport_put_port (struct parport *port)
 {
-	if (atomic_dec_and_test (&port->ref_count))
-		/* Can destroy it now. */
-		free_port (port);
-
-	return;
+	put_device(&port->bus_dev);
 }
 
 /**
@@ -281,6 +436,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
 	int num;
 	int device;
 	char *name;
+	int ret;
 
 	tmp = kzalloc(sizeof(struct parport), GFP_KERNEL);
 	if (!tmp) {
@@ -333,6 +489,10 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
 	 */
 	sprintf(name, "parport%d", tmp->portnum = tmp->number);
 	tmp->name = name;
+	tmp->bus_dev.bus = &parport_bus_type;
+	tmp->bus_dev.release = free_port;
+	dev_set_name(&tmp->bus_dev, name);
+	tmp->bus_dev.type = &parport_device_type;
 
 	for (device = 0; device < 5; device++)
 		/* assume the worst */
@@ -340,6 +500,12 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
 
 	tmp->waithead = tmp->waittail = NULL;
 
+	ret = device_register(&tmp->bus_dev);
+	if (ret) {
+		put_device(&tmp->bus_dev);
+		return NULL;
+	}
+
 	return tmp;
 }
 
@@ -575,6 +741,7 @@ parport_register_device(struct parport *port, const char *name,
 	tmp->irq_func = irq_func;
 	tmp->waiting = 0;
 	tmp->timeout = 5 * HZ;
+	tmp->devmodel = false;
 
 	/* Chain this onto the list */
 	tmp->prev = NULL;
@@ -630,6 +797,136 @@ parport_register_device(struct parport *port, const char *name,
 	return NULL;
 }
 
+static void free_pardevice(struct device *dev)
+{
+	struct pardevice *par_dev = to_pardevice(dev);
+
+	kfree(par_dev->name);
+	kfree(par_dev);
+}
+
+struct pardevice *
+parport_register_dev_model(struct parport *port, const char *name,
+			   const struct pardev_cb *par_dev_cb, int id)
+{
+	struct pardevice *par_dev;
+	int ret;
+	char *devname;
+
+	if (port->physport->flags & PARPORT_FLAG_EXCL) {
+		/* An exclusive device is registered. */
+		pr_err("%s: no more devices allowed\n", port->name);
+		return NULL;
+	}
+
+	if (par_dev_cb->flags & PARPORT_DEV_LURK) {
+		if (!par_dev_cb->preempt || !par_dev_cb->wakeup) {
+			pr_info("%s: refused to register lurking device (%s) without callbacks\n",
+				port->name, name);
+			return NULL;
+		}
+	}
+
+	if (!try_module_get(port->ops->owner))
+		return NULL;
+
+	parport_get_port(port);
+
+	par_dev = kzalloc(sizeof(*par_dev), GFP_KERNEL);
+	if (!par_dev)
+		goto err_put_port;
+
+	par_dev->state = kzalloc(sizeof(*par_dev->state), GFP_KERNEL);
+	if (!par_dev->state)
+		goto err_put_par_dev;
+
+	devname = kstrdup(name, GFP_KERNEL);
+	if (!devname)
+		goto err_free_par_dev;
+
+	par_dev->name = devname;
+	par_dev->port = port;
+	par_dev->daisy = -1;
+	par_dev->preempt = par_dev_cb->preempt;
+	par_dev->wakeup = par_dev_cb->wakeup;
+	par_dev->private = par_dev_cb->private;
+	par_dev->flags = par_dev_cb->flags;
+	par_dev->irq_func = par_dev_cb->irq_func;
+	par_dev->waiting = 0;
+	par_dev->timeout = 5 * HZ;
+
+	par_dev->dev.parent = &port->bus_dev;
+	par_dev->dev.bus = &parport_bus_type;
+	ret = dev_set_name(&par_dev->dev, "%s.%d", devname, id);
+	if (ret)
+		goto err_free_devname;
+	par_dev->dev.release = free_pardevice;
+	par_dev->devmodel = true;
+	ret = device_register(&par_dev->dev);
+	if (ret)
+		goto err_put_dev;
+
+	/* Chain this onto the list */
+	par_dev->prev = NULL;
+	/*
+	 * This function must not run from an irq handler so we don' t need
+	 * to clear irq on the local CPU. -arca
+	 */
+	spin_lock(&port->physport->pardevice_lock);
+
+	if (par_dev_cb->flags & PARPORT_DEV_EXCL) {
+		if (port->physport->devices) {
+			spin_unlock(&port->physport->pardevice_lock);
+			pr_debug("%s: cannot grant exclusive access for device %s\n",
+				 port->name, name);
+			goto err_put_dev;
+		}
+		port->flags |= PARPORT_FLAG_EXCL;
+	}
+
+	par_dev->next = port->physport->devices;
+	wmb();	/*
+		 * Make sure that tmp->next is written before it's
+		 * added to the list; see comments marked 'no locking
+		 * required'
+		 */
+	if (port->physport->devices)
+		port->physport->devices->prev = par_dev;
+	port->physport->devices = par_dev;
+	spin_unlock(&port->physport->pardevice_lock);
+
+	init_waitqueue_head(&par_dev->wait_q);
+	par_dev->timeslice = parport_default_timeslice;
+	par_dev->waitnext = NULL;
+	par_dev->waitprev = NULL;
+
+	/*
+	 * This has to be run as last thing since init_state may need other
+	 * pardevice fields. -arca
+	 */
+	port->ops->init_state(par_dev, par_dev->state);
+	port->proc_device = par_dev;
+	parport_device_proc_register(par_dev);
+
+	return par_dev;
+
+err_put_dev:
+	put_device(&par_dev->dev);
+err_free_devname:
+	kfree(devname);
+err_free_par_dev:
+	kfree(par_dev->state);
+err_put_par_dev:
+	if (!par_dev->devmodel)
+		kfree(par_dev);
+err_put_port:
+	parport_put_port(port);
+	module_put(port->ops->owner);
+
+	return NULL;
+}
+EXPORT_SYMBOL(parport_register_dev_model);
+
 /**
  *	parport_unregister_device - deregister a device on a parallel port
  *	@dev: pointer to structure representing device
@@ -691,7 +988,10 @@ void parport_unregister_device(struct pardevice *dev)
 	spin_unlock_irq(&port->waitlist_lock);
 
 	kfree(dev->state);
-	kfree(dev);
+	if (dev->devmodel)
+		device_unregister(&dev->dev);
+	else
+		kfree(dev);
 
 	module_put(port->ops->owner);
 	parport_put_port (port);
@@ -1019,7 +1319,6 @@ EXPORT_SYMBOL(parport_release);
 EXPORT_SYMBOL(parport_register_port);
 EXPORT_SYMBOL(parport_announce_port);
 EXPORT_SYMBOL(parport_remove_port);
-EXPORT_SYMBOL(parport_register_driver);
 EXPORT_SYMBOL(parport_unregister_driver);
 EXPORT_SYMBOL(parport_register_device);
 EXPORT_SYMBOL(parport_unregister_device);
diff --git a/include/linux/parport.h b/include/linux/parport.h
index c22f12547324..58e3c64c6b49 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -13,6 +13,7 @@
 #include <linux/wait.h>
 #include <linux/irqreturn.h>
 #include <linux/semaphore.h>
+#include <linux/device.h>
 #include <asm/ptrace.h>
 #include <uapi/linux/parport.h>
 
@@ -145,6 +146,8 @@ struct pardevice {
 	unsigned int flags;
 	struct pardevice *next;
 	struct pardevice *prev;
+	struct device dev;
+	bool devmodel;
 	struct parport_state *state;     /* saved status over preemption */
 	wait_queue_head_t wait_q;
 	unsigned long int time;
@@ -156,6 +159,8 @@ struct pardevice {
 	void * sysctl_table;
 };
 
+#define to_pardevice(n) container_of(n, struct pardevice, dev)
+
 /* IEEE1284 information */
 
 /* IEEE1284 phases. These are exposed to userland through ppdev IOCTL
@@ -195,7 +200,7 @@ struct parport {
 				 * This may unfortulately be null if the
 				 * port has a legacy driver.
 				 */
-
+	struct device bus_dev;	/* to link with the bus */
 	struct parport *physport;
 				/* If this is a non-default mux
 				   parport, i.e. we're a clone of a real
@@ -245,15 +250,26 @@ struct parport {
 	struct parport *slaves[3];
 };
 
+#define to_parport_dev(n) container_of(n, struct parport, bus_dev)
+
 #define DEFAULT_SPIN_TIME 500 /* us */
 
 struct parport_driver {
 	const char *name;
 	void (*attach) (struct parport *);
 	void (*detach) (struct parport *);
+	void (*match_port)(struct parport *);
+	int (*probe)(struct pardevice *);
+	struct device_driver driver;
+	bool devmodel;
 	struct list_head list;
 };
 
+#define to_parport_driver(n) container_of(n, struct parport_driver, driver)
+
+int parport_bus_init(void);
+void parport_bus_exit(void);
+
 /* parport_register_port registers a new parallel port at the given
    address (if one does not already exist) and returns a pointer to it.
    This entails claiming the I/O region, IRQ and DMA.  NULL is returned
@@ -272,10 +288,20 @@ void parport_announce_port (struct parport *port);
 extern void parport_remove_port(struct parport *port);
 
 /* Register a new high-level driver. */
-extern int parport_register_driver (struct parport_driver *);
+
+int __must_check __parport_register_driver(struct parport_driver *,
+					   struct module *,
+					   const char *mod_name);
+/*
+ * parport_register_driver must be a macro so that KBUILD_MODNAME can
+ * be expanded
+ */
+#define parport_register_driver(driver)             \
+	__parport_register_driver(driver, THIS_MODULE, KBUILD_MODNAME)
 
 /* Unregister a high-level driver. */
 extern void parport_unregister_driver (struct parport_driver *);
+void parport_unregister_driver(struct parport_driver *);
 
 /* If parport_register_driver doesn't fit your needs, perhaps
  * parport_find_xxx does. */
@@ -288,6 +314,15 @@ extern irqreturn_t parport_irq_handler(int irq, void *dev_id);
 /* Reference counting for ports. */
 extern struct parport *parport_get_port (struct parport *);
 extern void parport_put_port (struct parport *);
+void parport_del_port(struct parport *);
+
+struct pardev_cb {
+	int (*preempt)(void *);
+	void (*wakeup)(void *);
+	void *private;
+	void (*irq_func)(void *);
+	unsigned int flags;
+};
 
 /* parport_register_device declares that a device is connected to a
    port, and tells the kernel all it needs to know.
@@ -301,6 +336,10 @@ struct pardevice *parport_register_device(struct parport *port,
 			  void (*irq_func)(void *), 
 			  int flags, void *handle);
 
+struct pardevice *
+parport_register_dev_model(struct parport *port, const char *name,
+			   const struct pardev_cb *par_dev_cb, int cnt);
+
 /* parport_unregister unlinks a device from the chain. */
 extern void parport_unregister_device(struct pardevice *dev);
 
-- 
cgit v1.2.3


From b84b1d522f979fb53ad347605e24b2940fa2ad99 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Wed, 29 Apr 2015 08:57:34 +0200
Subject: scsi: Do not set cmd_per_lun to 1 in the host template

'0' is now used as the default cmd_per_lun value,
so there's no need to explicitly set it to '1' in the
host template.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: James Bottomley <JBottomley@Odin.com>
---
 drivers/block/cciss_scsi.c          | 1 -
 drivers/firewire/sbp2.c             | 1 -
 drivers/s390/scsi/zfcp_scsi.c       | 1 -
 drivers/scsi/NCR53c406a.c           | 1 -
 drivers/scsi/a100u2w.c              | 1 -
 drivers/scsi/aha152x.c              | 1 -
 drivers/scsi/aha1542.c              | 1 -
 drivers/scsi/aha1740.c              | 1 -
 drivers/scsi/aha1740.h              | 1 -
 drivers/scsi/aic94xx/aic94xx_init.c | 1 -
 drivers/scsi/arm/arxescsi.c         | 1 -
 drivers/scsi/arm/cumana_2.c         | 1 -
 drivers/scsi/arm/eesox.c            | 1 -
 drivers/scsi/atp870u.c              | 1 -
 drivers/scsi/atp870u.h              | 1 -
 drivers/scsi/dpt_i2o.c              | 1 -
 drivers/scsi/fdomain.c              | 1 -
 drivers/scsi/imm.c                  | 1 -
 drivers/scsi/initio.c               | 1 -
 drivers/scsi/isci/init.c            | 1 -
 drivers/scsi/mac53c94.c             | 1 -
 drivers/scsi/mvsas/mv_init.c        | 1 -
 drivers/scsi/nsp32.c                | 1 -
 drivers/scsi/pcmcia/nsp_cs.c        | 1 -
 drivers/scsi/pcmcia/qlogic_stub.c   | 1 -
 drivers/scsi/pcmcia/sym53c500_cs.c  | 1 -
 drivers/scsi/pm8001/pm8001_init.c   | 1 -
 drivers/scsi/ppa.c                  | 1 -
 drivers/scsi/ps3rom.c               | 1 -
 drivers/scsi/qla1280.c              | 1 -
 drivers/scsi/qlogicfas.c            | 1 -
 drivers/scsi/qlogicpti.c            | 1 -
 drivers/scsi/sym53c416.c            | 1 -
 drivers/scsi/wd719x.c               | 1 -
 drivers/scsi/wd719x.h               | 2 --
 drivers/staging/rts5208/rtsx.c      | 1 -
 drivers/usb/image/microtek.c        | 1 -
 drivers/usb/storage/scsiglue.c      | 1 -
 drivers/usb/storage/uas.c           | 1 -
 include/linux/libata.h              | 2 --
 40 files changed, 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index ecd845cd28d8..1537302e56e3 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = {
 	.show_info		= cciss_scsi_show_info,
 	.queuecommand		= cciss_scsi_queue_command,
 	.this_id		= 7,
-	.cmd_per_lun		= 1,
 	.use_clustering		= DISABLE_CLUSTERING,
 	/* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */
 	.eh_device_reset_handler= cciss_eh_device_reset_handler,
diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c
index c22606fe3d44..6bac03999fd4 100644
--- a/drivers/firewire/sbp2.c
+++ b/drivers/firewire/sbp2.c
@@ -1611,7 +1611,6 @@ static struct scsi_host_template scsi_driver_template = {
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
 	.use_clustering		= ENABLE_CLUSTERING,
-	.cmd_per_lun		= 1,
 	.can_queue		= 1,
 	.sdev_attrs		= sbp2_scsi_sysfs_attrs,
 };
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index 75f4bfc2b98a..b3c6ff49103b 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -297,7 +297,6 @@ static struct scsi_host_template zfcp_scsi_host_template = {
 				     * ZFCP_QDIO_MAX_SBALS_PER_REQ) - 2) * 8,
 				   /* GCD, adjusted later */
 	.dma_boundary		 = ZFCP_QDIO_SBALE_LEN - 1,
-	.cmd_per_lun		 = 1,
 	.use_clustering		 = 1,
 	.shost_attrs		 = zfcp_sysfs_shost_attrs,
 	.sdev_attrs		 = zfcp_sysfs_sdev_attrs,
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c
index 42c7161474f7..6e110c630d2c 100644
--- a/drivers/scsi/NCR53c406a.c
+++ b/drivers/scsi/NCR53c406a.c
@@ -1064,7 +1064,6 @@ static struct scsi_host_template driver_template =
      .can_queue         	= 1			/* can_queue */,        
      .this_id           	= 7			/* SCSI ID of the chip */,
      .sg_tablesize      	= 32			/*SG_ALL*/ /*SG_NONE*/, 
-     .cmd_per_lun       	= 1			/* commands per lun */, 
      .unchecked_isa_dma 	= 1			/* unchecked_isa_dma */,
      .use_clustering    	= ENABLE_CLUSTERING,
 };
diff --git a/drivers/scsi/a100u2w.c b/drivers/scsi/a100u2w.c
index 7e33a61c1ba4..cac6b37d7b1b 100644
--- a/drivers/scsi/a100u2w.c
+++ b/drivers/scsi/a100u2w.c
@@ -1078,7 +1078,6 @@ static struct scsi_host_template inia100_template = {
 	.can_queue		= 1,
 	.this_id		= 1,
 	.sg_tablesize		= SG_ALL,
-	.cmd_per_lun 		= 1,
 	.use_clustering		= ENABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index e31c460a1335..f44d0487236e 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -2922,7 +2922,6 @@ static struct scsi_host_template aha152x_driver_template = {
 	.can_queue			= 1,
 	.this_id			= 7,
 	.sg_tablesize			= SG_ALL,
-	.cmd_per_lun			= 1,
 	.use_clustering			= DISABLE_CLUSTERING,
 	.slave_alloc			= aha152x_adjust_queue,
 };
diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c
index b95d2779f467..5b8b2937a3fe 100644
--- a/drivers/scsi/aha1542.c
+++ b/drivers/scsi/aha1542.c
@@ -950,7 +950,6 @@ static struct scsi_host_template driver_template = {
 	.can_queue		= AHA1542_MAILBOXES, 
 	.this_id		= 7,
 	.sg_tablesize		= 16,
-	.cmd_per_lun		= 1,
 	.unchecked_isa_dma	= 1, 
 	.use_clustering		= ENABLE_CLUSTERING,
 };
diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c
index 31ace4bef8fe..bad35ffc015d 100644
--- a/drivers/scsi/aha1740.c
+++ b/drivers/scsi/aha1740.c
@@ -544,7 +544,6 @@ static struct scsi_host_template aha1740_template = {
 	.can_queue        = AHA1740_ECBS,
 	.this_id          = 7,
 	.sg_tablesize     = AHA1740_SCATTER,
-	.cmd_per_lun      = AHA1740_CMDLUN,
 	.use_clustering   = ENABLE_CLUSTERING,
 	.eh_abort_handler = aha1740_eh_abort_handler,
 };
diff --git a/drivers/scsi/aha1740.h b/drivers/scsi/aha1740.h
index af23fd6bd795..b0c5603461ca 100644
--- a/drivers/scsi/aha1740.h
+++ b/drivers/scsi/aha1740.h
@@ -149,6 +149,5 @@ struct ecb {			/* Enhanced Control Block 6.1 */
 
 #define AHA1740_ECBS 32
 #define AHA1740_SCATTER 16
-#define AHA1740_CMDLUN 1
 
 #endif
diff --git a/drivers/scsi/aic94xx/aic94xx_init.c b/drivers/scsi/aic94xx/aic94xx_init.c
index 02a2512b76a8..4b135cca42a1 100644
--- a/drivers/scsi/aic94xx/aic94xx_init.c
+++ b/drivers/scsi/aic94xx/aic94xx_init.c
@@ -65,7 +65,6 @@ static struct scsi_host_template aic94xx_sht = {
 	.change_queue_depth	= sas_change_queue_depth,
 	.bios_param		= sas_bios_param,
 	.can_queue		= 1,
-	.cmd_per_lun		= 1,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
 	.max_sectors		= SCSI_DEFAULT_MAX_SECTORS,
diff --git a/drivers/scsi/arm/arxescsi.c b/drivers/scsi/arm/arxescsi.c
index 32d23212de48..3110736fd337 100644
--- a/drivers/scsi/arm/arxescsi.c
+++ b/drivers/scsi/arm/arxescsi.c
@@ -245,7 +245,6 @@ static struct scsi_host_template arxescsi_template = {
 	.can_queue			= 0,
 	.this_id			= 7,
 	.sg_tablesize			= SG_ALL,
-	.cmd_per_lun			= 1,
 	.use_clustering			= DISABLE_CLUSTERING,
 	.proc_name			= "arxescsi",
 };
diff --git a/drivers/scsi/arm/cumana_2.c b/drivers/scsi/arm/cumana_2.c
index abc66f5263ec..faa1bee07c8a 100644
--- a/drivers/scsi/arm/cumana_2.c
+++ b/drivers/scsi/arm/cumana_2.c
@@ -367,7 +367,6 @@ static struct scsi_host_template cumanascsi2_template = {
 	.this_id			= 7,
 	.sg_tablesize			= SCSI_MAX_SG_CHAIN_SEGMENTS,
 	.dma_boundary			= IOMD_DMA_BOUNDARY,
-	.cmd_per_lun			= 1,
 	.use_clustering			= DISABLE_CLUSTERING,
 	.proc_name			= "cumanascsi2",
 };
diff --git a/drivers/scsi/arm/eesox.c b/drivers/scsi/arm/eesox.c
index 5bf3c0d134b4..a8ad6880dd91 100644
--- a/drivers/scsi/arm/eesox.c
+++ b/drivers/scsi/arm/eesox.c
@@ -486,7 +486,6 @@ static struct scsi_host_template eesox_template = {
 	.this_id			= 7,
 	.sg_tablesize			= SCSI_MAX_SG_CHAIN_SEGMENTS,
 	.dma_boundary			= IOMD_DMA_BOUNDARY,
-	.cmd_per_lun			= 1,
 	.use_clustering			= DISABLE_CLUSTERING,
 	.proc_name			= "eesox",
 };
diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c
index 0836433e3a2d..05301bc752ee 100644
--- a/drivers/scsi/atp870u.c
+++ b/drivers/scsi/atp870u.c
@@ -3158,7 +3158,6 @@ static struct scsi_host_template atp870u_template = {
      .can_queue         	= qcnt			/* can_queue */,
      .this_id           	= 7			/* SCSI ID */,
      .sg_tablesize      	= ATP870U_SCATTER	/*SG_ALL*/ /*SG_NONE*/,
-     .cmd_per_lun       	= ATP870U_CMDLUN		/* commands per lun */,
      .use_clustering    	= ENABLE_CLUSTERING,
      .max_sectors		= ATP870U_MAX_SECTORS,
 };
diff --git a/drivers/scsi/atp870u.h b/drivers/scsi/atp870u.h
index 62bae64a01c1..5cf62566ad42 100644
--- a/drivers/scsi/atp870u.h
+++ b/drivers/scsi/atp870u.h
@@ -10,7 +10,6 @@
 #define MAX_SENSE 	14
 #define qcnt	       	32
 #define ATP870U_SCATTER 	128
-#define ATP870U_CMDLUN  	1
 
 #define MAX_ADAPTER	8
 #define MAX_SCSI_ID	16
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index 2806cfbec2b9..f35ed53adaac 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -3562,7 +3562,6 @@ static struct scsi_host_template driver_template = {
 	.slave_configure	= adpt_slave_configure,
 	.can_queue		= MAX_TO_IOP_MESSAGES,
 	.this_id		= 7,
-	.cmd_per_lun		= 1,
 	.use_clustering		= ENABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c
index fff682976c56..eefe14d453db 100644
--- a/drivers/scsi/fdomain.c
+++ b/drivers/scsi/fdomain.c
@@ -1764,7 +1764,6 @@ struct scsi_host_template fdomain_driver_template = {
 	.can_queue		= 1,
 	.this_id		= 6,
 	.sg_tablesize		= 64,
-	.cmd_per_lun		= 1,
 	.use_clustering		= DISABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c
index 89a8266560d0..4e1a632ccf16 100644
--- a/drivers/scsi/imm.c
+++ b/drivers/scsi/imm.c
@@ -1109,7 +1109,6 @@ static struct scsi_host_template imm_template = {
 	.bios_param		= imm_biosparam,
 	.this_id		= 7,
 	.sg_tablesize		= SG_ALL,
-	.cmd_per_lun		= 1,
 	.use_clustering		= ENABLE_CLUSTERING,
 	.can_queue		= 1,
 	.slave_alloc		= imm_adjust_queue,
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index e5dae7b54d9a..6a926bae76b2 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -2833,7 +2833,6 @@ static struct scsi_host_template initio_template = {
 	.can_queue		= MAX_TARGETS * i91u_MAXQUEUE,
 	.this_id		= 1,
 	.sg_tablesize		= SG_ALL,
-	.cmd_per_lun		= 1,
 	.use_clustering		= ENABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/isci/init.c b/drivers/scsi/isci/init.c
index cd41b63a2f10..0dfcabe3ca7c 100644
--- a/drivers/scsi/isci/init.c
+++ b/drivers/scsi/isci/init.c
@@ -160,7 +160,6 @@ static struct scsi_host_template isci_sht = {
 	.change_queue_depth		= sas_change_queue_depth,
 	.bios_param			= sas_bios_param,
 	.can_queue			= ISCI_CAN_QUEUE_VAL,
-	.cmd_per_lun			= 1,
 	.this_id			= -1,
 	.sg_tablesize			= SG_ALL,
 	.max_sectors			= SCSI_DEFAULT_MAX_SECTORS,
diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c
index 0adb2e015597..141226631429 100644
--- a/drivers/scsi/mac53c94.c
+++ b/drivers/scsi/mac53c94.c
@@ -403,7 +403,6 @@ static struct scsi_host_template mac53c94_template = {
 	.can_queue	= 1,
 	.this_id	= 7,
 	.sg_tablesize	= SG_ALL,
-	.cmd_per_lun	= 1,
 	.use_clustering	= DISABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/mvsas/mv_init.c b/drivers/scsi/mvsas/mv_init.c
index 53030b0e8015..d40d734aa53a 100644
--- a/drivers/scsi/mvsas/mv_init.c
+++ b/drivers/scsi/mvsas/mv_init.c
@@ -56,7 +56,6 @@ static struct scsi_host_template mvs_sht = {
 	.change_queue_depth	= sas_change_queue_depth,
 	.bios_param		= sas_bios_param,
 	.can_queue		= 1,
-	.cmd_per_lun		= 1,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
 	.max_sectors		= SCSI_DEFAULT_MAX_SECTORS,
diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c
index c6077cefbeca..53c84771f0e8 100644
--- a/drivers/scsi/nsp32.c
+++ b/drivers/scsi/nsp32.c
@@ -274,7 +274,6 @@ static struct scsi_host_template nsp32_template = {
 	.can_queue			= 1,
 	.sg_tablesize			= NSP32_SG_SIZE,
 	.max_sectors			= 128,
-	.cmd_per_lun			= 1,
 	.this_id			= NSP32_HOST_SCSIID,
 	.use_clustering			= DISABLE_CLUSTERING,
 	.eh_abort_handler       	= nsp32_eh_abort,
diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c
index 1b6c8833a304..5fb6eefc6541 100644
--- a/drivers/scsi/pcmcia/nsp_cs.c
+++ b/drivers/scsi/pcmcia/nsp_cs.c
@@ -86,7 +86,6 @@ static struct scsi_host_template nsp_driver_template = {
 	.can_queue		 = 1,
 	.this_id		 = NSP_INITIATOR_ID,
 	.sg_tablesize		 = SG_ALL,
-	.cmd_per_lun		 = 1,
 	.use_clustering		 = DISABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/pcmcia/qlogic_stub.c b/drivers/scsi/pcmcia/qlogic_stub.c
index bcaf89fe0c9e..c670dc704c74 100644
--- a/drivers/scsi/pcmcia/qlogic_stub.c
+++ b/drivers/scsi/pcmcia/qlogic_stub.c
@@ -72,7 +72,6 @@ static struct scsi_host_template qlogicfas_driver_template = {
 	.can_queue		= 1,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
-	.cmd_per_lun		= 1,
 	.use_clustering		= DISABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c
index 155f9573021f..20011c8afbb5 100644
--- a/drivers/scsi/pcmcia/sym53c500_cs.c
+++ b/drivers/scsi/pcmcia/sym53c500_cs.c
@@ -680,7 +680,6 @@ static struct scsi_host_template sym53c500_driver_template = {
      .can_queue			= 1,
      .this_id			= 7,
      .sg_tablesize		= 32,
-     .cmd_per_lun		= 1,
      .use_clustering		= ENABLE_CLUSTERING,
      .shost_attrs		= SYM53C500_shost_attrs
 };
diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c
index 65555916d3b8..a132f2664d2f 100644
--- a/drivers/scsi/pm8001/pm8001_init.c
+++ b/drivers/scsi/pm8001/pm8001_init.c
@@ -78,7 +78,6 @@ static struct scsi_host_template pm8001_sht = {
 	.change_queue_depth	= sas_change_queue_depth,
 	.bios_param		= sas_bios_param,
 	.can_queue		= 1,
-	.cmd_per_lun		= 1,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
 	.max_sectors		= SCSI_DEFAULT_MAX_SECTORS,
diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c
index 1db8b26063b4..ee00e27ba396 100644
--- a/drivers/scsi/ppa.c
+++ b/drivers/scsi/ppa.c
@@ -974,7 +974,6 @@ static struct scsi_host_template ppa_template = {
 	.bios_param		= ppa_biosparam,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
-	.cmd_per_lun		= 1,
 	.use_clustering		= ENABLE_CLUSTERING,
 	.can_queue		= 1,
 	.slave_alloc		= ppa_adjust_queue,
diff --git a/drivers/scsi/ps3rom.c b/drivers/scsi/ps3rom.c
index 5298def33733..4924424d20fe 100644
--- a/drivers/scsi/ps3rom.c
+++ b/drivers/scsi/ps3rom.c
@@ -347,7 +347,6 @@ static struct scsi_host_template ps3rom_host_template = {
 	.can_queue =		1,
 	.this_id =		7,
 	.sg_tablesize =		SG_ALL,
-	.cmd_per_lun =		1,
 	.emulated =             1,		/* only sg driver uses this */
 	.max_sectors =		PS3ROM_MAX_SECTORS,
 	.use_clustering =	ENABLE_CLUSTERING,
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index c68a66e8cfc1..5d0ec42a9317 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -4217,7 +4217,6 @@ static struct scsi_host_template qla1280_driver_template = {
 	.can_queue		= 0xfffff,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
-	.cmd_per_lun		= 1,
 	.use_clustering		= ENABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/qlogicfas.c b/drivers/scsi/qlogicfas.c
index a22bb1b40ce2..61cac87fb86f 100644
--- a/drivers/scsi/qlogicfas.c
+++ b/drivers/scsi/qlogicfas.c
@@ -193,7 +193,6 @@ static struct scsi_host_template qlogicfas_driver_template = {
 	.can_queue		= 1,
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
-	.cmd_per_lun		= 1,
 	.use_clustering		= DISABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/qlogicpti.c b/drivers/scsi/qlogicpti.c
index fe122700cad8..676385ff28ef 100644
--- a/drivers/scsi/qlogicpti.c
+++ b/drivers/scsi/qlogicpti.c
@@ -1287,7 +1287,6 @@ static struct scsi_host_template qpti_template = {
 	.can_queue		= QLOGICPTI_REQ_QUEUE_LEN,
 	.this_id		= 7,
 	.sg_tablesize		= QLOGICPTI_MAX_SG(QLOGICPTI_REQ_QUEUE_LEN),
-	.cmd_per_lun		= 1,
 	.use_clustering		= ENABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c
index 0b7819f3e09b..5bdcbe8fa958 100644
--- a/drivers/scsi/sym53c416.c
+++ b/drivers/scsi/sym53c416.c
@@ -838,7 +838,6 @@ static struct scsi_host_template driver_template = {
 	.can_queue =		1,
 	.this_id =		SYM53C416_SCSI_ID,
 	.sg_tablesize =		32,
-	.cmd_per_lun =		1,
 	.unchecked_isa_dma =	1,
 	.use_clustering =	ENABLE_CLUSTERING,
 };
diff --git a/drivers/scsi/wd719x.c b/drivers/scsi/wd719x.c
index 289ad016d925..61346aa73178 100644
--- a/drivers/scsi/wd719x.c
+++ b/drivers/scsi/wd719x.c
@@ -882,7 +882,6 @@ static struct scsi_host_template wd719x_template = {
 	.can_queue			= 255,
 	.this_id			= 7,
 	.sg_tablesize			= WD719X_SG,
-	.cmd_per_lun			= WD719X_CMD_PER_LUN,
 	.use_clustering			= ENABLE_CLUSTERING,
 };
 
diff --git a/drivers/scsi/wd719x.h b/drivers/scsi/wd719x.h
index 185e30e4eb93..9c6dd45f95f5 100644
--- a/drivers/scsi/wd719x.h
+++ b/drivers/scsi/wd719x.h
@@ -2,8 +2,6 @@
 #define _WD719X_H_
 
 #define WD719X_SG 255		/* Scatter/gather size */
-#define WD719X_CMD_PER_LUN 1	/* We should be able to do linked commands, but
-				 * this is 1 for now to be safe. */
 
 struct wd719x_sglist {
 	__le32 ptr;
diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c
index d64b6ed9c0c9..aed49bf762b4 100644
--- a/drivers/staging/rts5208/rtsx.c
+++ b/drivers/staging/rts5208/rtsx.c
@@ -230,7 +230,6 @@ static struct scsi_host_template rtsx_host_template = {
 
 	/* queue commands only, only one command per LUN */
 	.can_queue =			1,
-	.cmd_per_lun =			1,
 
 	/* unknown initiator id */
 	.this_id =			-1,
diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c
index 6431d08c8d9d..a4dbb0cd80da 100644
--- a/drivers/usb/image/microtek.c
+++ b/drivers/usb/image/microtek.c
@@ -635,7 +635,6 @@ static struct scsi_host_template mts_scsi_host_template = {
 	.sg_tablesize =		SG_ALL,
 	.can_queue =		1,
 	.this_id =		-1,
-	.cmd_per_lun =		1,
 	.use_clustering =	1,
 	.emulated =		1,
 	.slave_alloc =		mts_slave_alloc,
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index 0e400f382f3a..996ef1e882d3 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -558,7 +558,6 @@ struct scsi_host_template usb_stor_host_template = {
 
 	/* queue commands only, only one command per LUN */
 	.can_queue =			1,
-	.cmd_per_lun =			1,
 
 	/* unknown initiator id */
 	.this_id =			-1,
diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index 6cdabdc119a7..56b97fe1d18e 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c
@@ -803,7 +803,6 @@ static struct scsi_host_template uas_host_template = {
 	.can_queue = 65536,	/* Is there a limit on the _host_ ? */
 	.this_id = -1,
 	.sg_tablesize = SG_NONE,
-	.cmd_per_lun = 1,	/* until we override it */
 	.skip_settle_delay = 1,
 	.use_blk_tags = 1,
 };
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 8dad4a307bb8..1402291aab5e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -134,7 +134,6 @@ enum {
 	ATA_ALL_DEVICES		= (1 << ATA_MAX_DEVICES) - 1,
 
 	ATA_SHT_EMULATED	= 1,
-	ATA_SHT_CMD_PER_LUN	= 1,
 	ATA_SHT_THIS_ID		= -1,
 	ATA_SHT_USE_CLUSTERING	= 1,
 
@@ -1354,7 +1353,6 @@ extern struct device_attribute *ata_common_sdev_attrs[];
 	.can_queue		= ATA_DEF_QUEUE,		\
 	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,		\
 	.this_id		= ATA_SHT_THIS_ID,		\
-	.cmd_per_lun		= ATA_SHT_CMD_PER_LUN,		\
 	.emulated		= ATA_SHT_EMULATED,		\
 	.use_clustering		= ATA_SHT_USE_CLUSTERING,	\
 	.proc_name		= drv_name,			\
-- 
cgit v1.2.3


From 1c4b1d73bacc546ba4e42f7eb4cb88c54139820b Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Wed, 27 May 2015 13:57:46 +0200
Subject: tty: move linux/gsmmux.h to uapi

linux/gsmmux.h defines a user interface and therefore should be
installed with other headers.

Make the file include:
* linux/if.h for IFNAMSIZ
* linux/ioctl.h for _IO* macros

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/gsmmux.h      | 36 ------------------------------------
 include/uapi/linux/Kbuild   |  1 +
 include/uapi/linux/gsmmux.h | 39 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 36 deletions(-)
 delete mode 100644 include/linux/gsmmux.h
 create mode 100644 include/uapi/linux/gsmmux.h

(limited to 'include/linux')

diff --git a/include/linux/gsmmux.h b/include/linux/gsmmux.h
deleted file mode 100644
index c25e9477f7c3..000000000000
--- a/include/linux/gsmmux.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _LINUX_GSMMUX_H
-#define _LINUX_GSMMUX_H
-
-struct gsm_config
-{
-	unsigned int adaption;
-	unsigned int encapsulation;
-	unsigned int initiator;
-	unsigned int t1;
-	unsigned int t2;
-	unsigned int t3;
-	unsigned int n2;
-	unsigned int mru;
-	unsigned int mtu;
-	unsigned int k;
-	unsigned int i;
-	unsigned int unused[8];		/* Padding for expansion without
-					   breaking stuff */
-};
-
-#define GSMIOC_GETCONF		_IOR('G', 0, struct gsm_config)
-#define GSMIOC_SETCONF		_IOW('G', 1, struct gsm_config)
-
-struct gsm_netconfig {
-	unsigned int adaption;  /* Adaption to use in network mode */
-	unsigned short protocol;/* Protocol to use - only ETH_P_IP supported */
-	unsigned short unused2;
-	char if_name[IFNAMSIZ];	/* interface name format string */
-	__u8 unused[28];        /* For future use */
-};
-
-#define GSMIOC_ENABLE_NET      _IOW('G', 2, struct gsm_netconfig)
-#define GSMIOC_DISABLE_NET     _IO('G', 3)
-
-
-#endif
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1a0006a76b00..1d3db6a74d1f 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -138,6 +138,7 @@ header-y += genetlink.h
 header-y += gen_stats.h
 header-y += gfs2_ondisk.h
 header-y += gigaset_dev.h
+header-y += gsmmux.h
 header-y += hdlcdrv.h
 header-y += hdlc.h
 header-y += hdreg.h
diff --git a/include/uapi/linux/gsmmux.h b/include/uapi/linux/gsmmux.h
new file mode 100644
index 000000000000..c06742d52856
--- /dev/null
+++ b/include/uapi/linux/gsmmux.h
@@ -0,0 +1,39 @@
+#ifndef _LINUX_GSMMUX_H
+#define _LINUX_GSMMUX_H
+
+#include <linux/if.h>
+#include <linux/ioctl.h>
+
+struct gsm_config
+{
+	unsigned int adaption;
+	unsigned int encapsulation;
+	unsigned int initiator;
+	unsigned int t1;
+	unsigned int t2;
+	unsigned int t3;
+	unsigned int n2;
+	unsigned int mru;
+	unsigned int mtu;
+	unsigned int k;
+	unsigned int i;
+	unsigned int unused[8];		/* Padding for expansion without
+					   breaking stuff */
+};
+
+#define GSMIOC_GETCONF		_IOR('G', 0, struct gsm_config)
+#define GSMIOC_SETCONF		_IOW('G', 1, struct gsm_config)
+
+struct gsm_netconfig {
+	unsigned int adaption;  /* Adaption to use in network mode */
+	unsigned short protocol;/* Protocol to use - only ETH_P_IP supported */
+	unsigned short unused2;
+	char if_name[IFNAMSIZ];	/* interface name format string */
+	__u8 unused[28];        /* For future use */
+};
+
+#define GSMIOC_ENABLE_NET      _IOW('G', 2, struct gsm_netconfig)
+#define GSMIOC_DISABLE_NET     _IO('G', 3)
+
+
+#endif
-- 
cgit v1.2.3


From 1f656ff3fdddc2f59649cc84b633b799908f1f7b Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Sat, 30 May 2015 23:37:48 -0700
Subject: Drivers: hv: vmbus: Implement NUMA aware CPU affinity for channels

Channels/sub-channels can be affinitized to VCPUs in the guest. Implement
this affinity in a way that is NUMA aware. The current protocol distributed
the primary channels uniformly across all available CPUs. The new protocol
is NUMA aware: primary channels are distributed across the available NUMA
nodes while the sub-channels within a primary channel are distributed amongst
CPUs within the NUMA node assigned to the primary channel.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c | 72 ++++++++++++++++++++++++++++-------------------
 include/linux/hyperv.h    |  5 ++++
 2 files changed, 48 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index c3eba37db9b7..4506a6623618 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -370,25 +370,27 @@ static const struct hv_vmbus_device_id hp_devs[] = {
 /*
  * We use this state to statically distribute the channel interrupt load.
  */
-static u32  next_vp;
+static int next_numa_node_id;
 
 /*
  * Starting with Win8, we can statically distribute the incoming
- * channel interrupt load by binding a channel to VCPU. We
- * implement here a simple round robin scheme for distributing
- * the interrupt load.
- * We will bind channels that are not performance critical to cpu 0 and
- * performance critical channels (IDE, SCSI and Network) will be uniformly
- * distributed across all available CPUs.
+ * channel interrupt load by binding a channel to VCPU.
+ * We do this in a hierarchical fashion:
+ * First distribute the primary channels across available NUMA nodes
+ * and then distribute the subchannels amongst the CPUs in the NUMA
+ * node assigned to the primary channel.
+ *
+ * For pre-win8 hosts or non-performance critical channels we assign the
+ * first CPU in the first NUMA node.
  */
 static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_guid)
 {
 	u32 cur_cpu;
 	int i;
 	bool perf_chn = false;
-	u32 max_cpus = num_online_cpus();
-	struct vmbus_channel *primary = channel->primary_channel, *prev;
-	unsigned long flags;
+	struct vmbus_channel *primary = channel->primary_channel;
+	int next_node;
+	struct cpumask available_mask;
 
 	for (i = IDE; i < MAX_PERF_CHN; i++) {
 		if (!memcmp(type_guid->b, hp_devs[i].guid,
@@ -405,36 +407,48 @@ static void init_vp_index(struct vmbus_channel *channel, const uuid_le *type_gui
 		 * Also if the channel is not a performance critical
 		 * channel, bind it to cpu 0.
 		 */
+		channel->numa_node = 0;
+		cpumask_set_cpu(0, &channel->alloced_cpus_in_node);
 		channel->target_cpu = 0;
 		channel->target_vp = hv_context.vp_index[0];
 		return;
 	}
 
 	/*
-	 * Primary channels are distributed evenly across all vcpus we have.
-	 * When the host asks us to create subchannels it usually makes us
-	 * num_cpus-1 offers and we are supposed to distribute the work evenly
-	 * among the channel itself and all its subchannels. Make sure they are
-	 * all assigned to different vcpus.
+	 * We distribute primary channels evenly across all the available
+	 * NUMA nodes and within the assigned NUMA node we will assign the
+	 * first available CPU to the primary channel.
+	 * The sub-channels will be assigned to the CPUs available in the
+	 * NUMA node evenly.
 	 */
-	if (!primary)
-		cur_cpu = (++next_vp % max_cpus);
-	else {
+	if (!primary) {
+		while (true) {
+			next_node = next_numa_node_id++;
+			if (next_node == nr_node_ids)
+				next_node = next_numa_node_id = 0;
+			if (cpumask_empty(cpumask_of_node(next_node)))
+				continue;
+			break;
+		}
+		channel->numa_node = next_node;
+		primary = channel;
+	}
+
+	if (cpumask_weight(&primary->alloced_cpus_in_node) ==
+	    cpumask_weight(cpumask_of_node(primary->numa_node))) {
 		/*
-		 * Let's assign the first subchannel of a channel to the
-		 * primary->target_cpu+1 and all the subsequent channels to
-		 * the prev->target_cpu+1.
+		 * We have cycled through all the CPUs in the node;
+		 * reset the alloced map.
 		 */
-		spin_lock_irqsave(&primary->lock, flags);
-		if (primary->num_sc == 1)
-			cur_cpu = (primary->target_cpu + 1) % max_cpus;
-		else {
-			prev = list_prev_entry(channel, sc_list);
-			cur_cpu = (prev->target_cpu + 1) % max_cpus;
-		}
-		spin_unlock_irqrestore(&primary->lock, flags);
+		cpumask_clear(&primary->alloced_cpus_in_node);
 	}
 
+	cpumask_xor(&available_mask, &primary->alloced_cpus_in_node,
+		    cpumask_of_node(primary->numa_node));
+
+	cur_cpu = cpumask_next(-1, &available_mask);
+	cpumask_set_cpu(cur_cpu, &primary->alloced_cpus_in_node);
+
 	channel->target_cpu = cur_cpu;
 	channel->target_vp = hv_context.vp_index[cur_cpu];
 }
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 4317cd1b69ed..30d3a1f79450 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -696,6 +696,11 @@ struct vmbus_channel {
 	u32 target_vp;
 	/* The corresponding CPUID in the guest */
 	u32 target_cpu;
+	/*
+	 * State to manage the CPU affiliation of channels.
+	 */
+	struct cpumask alloced_cpus_in_node;
+	int numa_node;
 	/*
 	 * Support for sub-channels. For high performance devices,
 	 * it will be useful to have multiple sub-channels to support
-- 
cgit v1.2.3


From 17ca8cbf49be3aa94bb1c2b7ee6545fd70094eb4 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 29 May 2015 23:23:06 +0200
Subject: ebpf: allow bpf_ktime_get_ns_proto also for networking

As this is already exported from tracing side via commit d9847d310ab4
("tracing: Allow BPF programs to call bpf_ktime_get_ns()"), we might
as well want to move it to the core, so also networking users can make
use of it, e.g. to measure diffs for certain flows from ingress/egress.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/core.c        |  1 +
 kernel/bpf/helpers.c     | 13 +++++++++++++
 kernel/trace/bpf_trace.c | 12 ------------
 net/core/filter.c        |  2 ++
 5 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5f520f5f087e..ca854e5bb2f7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -186,5 +186,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
+extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d44b25cbe460..4548422d5f6c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -734,6 +734,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
 
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bd7f5988ed9c..b3aaabdf9a50 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -13,6 +13,7 @@
 #include <linux/rcupdate.h>
 #include <linux/random.h>
 #include <linux/smp.h>
+#include <linux/ktime.h>
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
  * inside its own verifier_ops->get_func_proto() callback it should return
@@ -111,3 +112,15 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 };
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* NMI safe access to clock monotonic */
+	return ktime_get_mono_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+	.func		= bpf_ktime_get_ns,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 646445e41bd4..50c4015a8ad3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -79,18 +79,6 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-	/* NMI safe access to clock monotonic */
-	return ktime_get_mono_fast_ns();
-}
-
-static const struct bpf_func_proto bpf_ktime_get_ns_proto = {
-	.func		= bpf_ktime_get_ns,
-	.gpl_only	= true,
-	.ret_type	= RET_INTEGER,
-};
-
 /*
  * limited trace_printk()
  * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
diff --git a/net/core/filter.c b/net/core/filter.c
index 2c30d6632d66..b78a010a957f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1423,6 +1423,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_tail_call:
 		return &bpf_tail_call_proto;
+	case BPF_FUNC_ktime_get_ns:
+		return &bpf_ktime_get_ns_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 887ee43477e4e327dbcd2aabc2d78a5116ed8a33 Mon Sep 17 00:00:00 2001
From: Beomho Seo <beomho.seo@samsung.com>
Date: Thu, 30 Apr 2015 13:07:43 +0900
Subject: hwmon: (ntc_thermistor) Add support for ncpXXwf104

This patch adds support for the ntc thermistor NCPXXWF104 series.

Cc: Jean Delvare <jdelvare@suse.de>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Beomho Seo <beomho.seo@samsung.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 .../devicetree/bindings/hwmon/ntc_thermistor.txt   |  1 +
 Documentation/hwmon/ntc_thermistor                 |  6 ++-
 drivers/hwmon/Kconfig                              |  4 +-
 drivers/hwmon/ntc_thermistor.c                     | 44 ++++++++++++++++++++++
 include/linux/platform_data/ntc_thermistor.h       |  1 +
 5 files changed, 52 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/hwmon/ntc_thermistor.txt b/Documentation/devicetree/bindings/hwmon/ntc_thermistor.txt
index fcca8e744f41..a04a80f9cc70 100644
--- a/Documentation/devicetree/bindings/hwmon/ntc_thermistor.txt
+++ b/Documentation/devicetree/bindings/hwmon/ntc_thermistor.txt
@@ -9,6 +9,7 @@ Requires node properties:
 	"murata,ncp21wb473"
 	"murata,ncp03wb473"
 	"murata,ncp15wl333"
+	"murata,ncp03wf104"
 
 /* Usage of vendor name "ntc" is deprecated */
 <DEPRECATED>	"ntc,ncp15wb473"
diff --git a/Documentation/hwmon/ntc_thermistor b/Documentation/hwmon/ntc_thermistor
index c5e05e2900a3..1d4cc847c6fe 100644
--- a/Documentation/hwmon/ntc_thermistor
+++ b/Documentation/hwmon/ntc_thermistor
@@ -2,8 +2,10 @@ Kernel driver ntc_thermistor
 =================
 
 Supported thermistors from Murata:
-* Murata NTC Thermistors NCP15WB473, NCP18WB473, NCP21WB473, NCP03WB473, NCP15WL333
-  Prefixes: 'ncp15wb473', 'ncp18wb473', 'ncp21wb473', 'ncp03wb473', 'ncp15wl333'
+* Murata NTC Thermistors NCP15WB473, NCP18WB473, NCP21WB473, NCP03WB473,
+  NCP15WL333, NCP03WF104
+  Prefixes: 'ncp15wb473', 'ncp18wb473', 'ncp21wb473', 'ncp03wb473',
+  'ncp15wl333', 'ncp03wf104'
   Datasheet: Publicly available at Murata
 
 Supported thermistors from EPCOS:
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 25d9e72627e9..4542ffc5468a 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -1106,8 +1106,8 @@ config SENSORS_NTC_THERMISTOR
 	  send notifications about the temperature.
 
 	  Currently, this driver supports
-	  NCP15WB473, NCP18WB473, NCP21WB473, NCP03WB473, and NCP15WL333
-	  from Murata and B57330V2103 from EPCOS.
+	  NCP15WB473, NCP18WB473, NCP21WB473, NCP03WB473, NCP15WL333,
+	  and NCP03WF104 from Murata and B57330V2103 from EPCOS.
 
 	  This driver can also be built as a module.  If so, the module
 	  will be called ntc-thermistor.
diff --git a/drivers/hwmon/ntc_thermistor.c b/drivers/hwmon/ntc_thermistor.c
index 68800115876b..fca92912269e 100644
--- a/drivers/hwmon/ntc_thermistor.c
+++ b/drivers/hwmon/ntc_thermistor.c
@@ -53,6 +53,7 @@ static const struct platform_device_id ntc_thermistor_id[] = {
 	{ "ncp03wb473", TYPE_NCPXXWB473 },
 	{ "ncp15wl333", TYPE_NCPXXWL333 },
 	{ "b57330v2103", TYPE_B57330V2103},
+	{ "ncp03wf104", TYPE_NCPXXWF104 },
 	{ },
 };
 
@@ -135,6 +136,43 @@ static const struct ntc_compensation ncpXXwl333[] = {
 	{ .temp_c	= 125, .ohm	= 707 },
 };
 
+static const struct ntc_compensation ncpXXwf104[] = {
+	{ .temp_c	= -40, .ohm	= 4397119 },
+	{ .temp_c	= -35, .ohm	= 3088599 },
+	{ .temp_c	= -30, .ohm	= 2197225 },
+	{ .temp_c	= -25, .ohm	= 1581881 },
+	{ .temp_c	= -20, .ohm	= 1151037 },
+	{ .temp_c	= -15, .ohm	= 846579 },
+	{ .temp_c	= -10, .ohm	= 628988 },
+	{ .temp_c	= -5, .ohm	= 471632 },
+	{ .temp_c	= 0, .ohm	= 357012 },
+	{ .temp_c	= 5, .ohm	= 272500 },
+	{ .temp_c	= 10, .ohm	= 209710 },
+	{ .temp_c	= 15, .ohm	= 162651 },
+	{ .temp_c	= 20, .ohm	= 127080 },
+	{ .temp_c	= 25, .ohm	= 100000 },
+	{ .temp_c	= 30, .ohm	= 79222 },
+	{ .temp_c	= 35, .ohm	= 63167 },
+	{ .temp_c	= 40, .ohm	= 50677 },
+	{ .temp_c	= 45, .ohm	= 40904 },
+	{ .temp_c	= 50, .ohm	= 33195 },
+	{ .temp_c	= 55, .ohm	= 27091 },
+	{ .temp_c	= 60, .ohm	= 22224 },
+	{ .temp_c	= 65, .ohm	= 18323 },
+	{ .temp_c	= 70, .ohm	= 15184 },
+	{ .temp_c	= 75, .ohm	= 12635 },
+	{ .temp_c	= 80, .ohm	= 10566 },
+	{ .temp_c	= 85, .ohm	= 8873 },
+	{ .temp_c	= 90, .ohm	= 7481 },
+	{ .temp_c	= 95, .ohm	= 6337 },
+	{ .temp_c	= 100, .ohm	= 5384 },
+	{ .temp_c	= 105, .ohm	= 4594 },
+	{ .temp_c	= 110, .ohm	= 3934 },
+	{ .temp_c	= 115, .ohm	= 3380 },
+	{ .temp_c	= 120, .ohm	= 2916 },
+	{ .temp_c	= 125, .ohm	= 2522 },
+};
+
 /*
  * The following compensation table is from the specification of EPCOS NTC
  * Thermistors Datasheet
@@ -219,6 +257,8 @@ static const struct of_device_id ntc_match[] = {
 		.data = &ntc_thermistor_id[4] },
 	{ .compatible = "epcos,b57330v2103",
 		.data = &ntc_thermistor_id[5]},
+	{ .compatible = "murata,ncp03wf104",
+		.data = &ntc_thermistor_id[6] },
 
 	/* Usage of vendor name "ntc" is deprecated */
 	{ .compatible = "ntc,ncp15wb473",
@@ -567,6 +607,10 @@ static int ntc_thermistor_probe(struct platform_device *pdev)
 		data->comp = b57330v2103;
 		data->n_comp = ARRAY_SIZE(b57330v2103);
 		break;
+	case TYPE_NCPXXWF104:
+		data->comp = ncpXXwf104;
+		data->n_comp = ARRAY_SIZE(ncpXXwf104);
+		break;
 	default:
 		dev_err(&pdev->dev, "Unknown device type: %lu(%s)\n",
 				pdev_id->driver_data, pdev_id->name);
diff --git a/include/linux/platform_data/ntc_thermistor.h b/include/linux/platform_data/ntc_thermistor.h
index 0a6de4ca4930..aed170588b74 100644
--- a/include/linux/platform_data/ntc_thermistor.h
+++ b/include/linux/platform_data/ntc_thermistor.h
@@ -27,6 +27,7 @@ enum ntc_thermistor_type {
 	TYPE_NCPXXWB473,
 	TYPE_NCPXXWL333,
 	TYPE_B57330V2103,
+	TYPE_NCPXXWF104,
 };
 
 struct ntc_thermistor_platform_data {
-- 
cgit v1.2.3


From dfa13ebbe3340e538b988f5608efd9ff2ca7fc35 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Thu, 7 May 2015 13:10:12 +0300
Subject: mmc: host: Add facility to support re-tuning

Currently, there is core support for tuning during
initialization. There can also be a need to re-tune
periodically (e.g. sdhci) or to re-tune after the
host controller is powered off (e.g. after PM
runtime suspend / resume) or to re-tune in response
to CRC errors.

The main requirements for re-tuning are:
  - ability to enable / disable re-tuning
  - ability to flag that re-tuning is needed
  - ability to re-tune before any request
  - ability to hold off re-tuning if the card is busy
  - ability to hold off re-tuning if re-tuning is in
  progress
  - ability to run a re-tuning timer

To support those requirements 7 members are added to struct
mmc_host:

  unsigned int		can_retune:1;	/* re-tuning can be used */
  unsigned int		doing_retune:1;	/* re-tuning in progress */
  unsigned int		retune_now:1;   /* do re-tuning at next req */
  int			need_retune;	/* re-tuning is needed */
  int			hold_retune;	/* hold off re-tuning */
  unsigned int		retune_period;  /* re-tuning period in secs */
  struct timer_list	retune_timer;	/* for periodic re-tuning */

need_retune is an integer so it can be set without needing
synchronization. hold_retune is a integer to allow nesting.

Various simple functions are provided to set / clear those
variables.

Subsequent patches take those functions into use.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/host.c  | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/mmc/core/host.h  |  6 +++++
 include/linux/mmc/host.h | 23 ++++++++++++++++
 3 files changed, 97 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 8be0df758e68..e90e02fc596a 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -301,6 +301,73 @@ static inline void mmc_host_clk_sysfs_init(struct mmc_host *host)
 
 #endif
 
+void mmc_retune_enable(struct mmc_host *host)
+{
+	host->can_retune = 1;
+	if (host->retune_period)
+		mod_timer(&host->retune_timer,
+			  jiffies + host->retune_period * HZ);
+}
+
+void mmc_retune_disable(struct mmc_host *host)
+{
+	host->can_retune = 0;
+	del_timer_sync(&host->retune_timer);
+	host->retune_now = 0;
+	host->need_retune = 0;
+}
+
+void mmc_retune_timer_stop(struct mmc_host *host)
+{
+	del_timer_sync(&host->retune_timer);
+}
+EXPORT_SYMBOL(mmc_retune_timer_stop);
+
+void mmc_retune_hold(struct mmc_host *host)
+{
+	if (!host->hold_retune)
+		host->retune_now = 1;
+	host->hold_retune += 1;
+}
+
+void mmc_retune_release(struct mmc_host *host)
+{
+	if (host->hold_retune)
+		host->hold_retune -= 1;
+	else
+		WARN_ON(1);
+}
+
+int mmc_retune(struct mmc_host *host)
+{
+	int err;
+
+	if (host->retune_now)
+		host->retune_now = 0;
+	else
+		return 0;
+
+	if (!host->need_retune || host->doing_retune || !host->card)
+		return 0;
+
+	host->need_retune = 0;
+
+	host->doing_retune = 1;
+
+	err = mmc_execute_tuning(host->card);
+
+	host->doing_retune = 0;
+
+	return err;
+}
+
+static void mmc_retune_timer(unsigned long data)
+{
+	struct mmc_host *host = (struct mmc_host *)data;
+
+	mmc_retune_needed(host);
+}
+
 /**
  *	mmc_of_parse() - parse host's device-tree node
  *	@host: host whose node should be parsed.
@@ -504,6 +571,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
 #ifdef CONFIG_PM
 	host->pm_notify.notifier_call = mmc_pm_notify;
 #endif
+	setup_timer(&host->retune_timer, mmc_retune_timer, (unsigned long)host);
 
 	/*
 	 * By default, hosts do not support SGIO or large requests.
diff --git a/drivers/mmc/core/host.h b/drivers/mmc/core/host.h
index f2ab9e578126..992bf5397633 100644
--- a/drivers/mmc/core/host.h
+++ b/drivers/mmc/core/host.h
@@ -15,5 +15,11 @@
 int mmc_register_host_class(void);
 void mmc_unregister_host_class(void);
 
+void mmc_retune_enable(struct mmc_host *host);
+void mmc_retune_disable(struct mmc_host *host);
+void mmc_retune_hold(struct mmc_host *host);
+void mmc_retune_release(struct mmc_host *host);
+int mmc_retune(struct mmc_host *host);
+
 #endif
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index b5bedaec6223..f471193ef6d6 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -12,6 +12,7 @@
 
 #include <linux/leds.h>
 #include <linux/mutex.h>
+#include <linux/timer.h>
 #include <linux/sched.h>
 #include <linux/device.h>
 #include <linux/fault-inject.h>
@@ -321,10 +322,18 @@ struct mmc_host {
 #ifdef CONFIG_MMC_DEBUG
 	unsigned int		removed:1;	/* host is being removed */
 #endif
+	unsigned int		can_retune:1;	/* re-tuning can be used */
+	unsigned int		doing_retune:1;	/* re-tuning in progress */
+	unsigned int		retune_now:1;	/* do re-tuning at next req */
 
 	int			rescan_disable;	/* disable card detection */
 	int			rescan_entered;	/* used with nonremovable devices */
 
+	int			need_retune;	/* re-tuning is needed */
+	int			hold_retune;	/* hold off re-tuning */
+	unsigned int		retune_period;	/* re-tuning period in secs */
+	struct timer_list	retune_timer;	/* for periodic re-tuning */
+
 	bool			trigger_card_event; /* card_event necessary */
 
 	struct mmc_card		*card;		/* device attached to this host */
@@ -513,4 +522,18 @@ static inline bool mmc_card_hs400(struct mmc_card *card)
 	return card->host->ios.timing == MMC_TIMING_MMC_HS400;
 }
 
+void mmc_retune_timer_stop(struct mmc_host *host);
+
+static inline void mmc_retune_needed(struct mmc_host *host)
+{
+	if (host->can_retune)
+		host->need_retune = 1;
+}
+
+static inline void mmc_retune_recheck(struct mmc_host *host)
+{
+	if (host->hold_retune <= 1)
+		host->retune_now = 1;
+}
+
 #endif /* LINUX_MMC_HOST_H */
-- 
cgit v1.2.3


From 9f6e0bff2afb52a4c29f5ca8a4db01810357974e Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Wed, 6 May 2015 20:31:19 +0200
Subject: mmc: Add support for disabling write-protect detection

It is not uncommon to see systems where there is no physical write-protect
signal (e.g. when using eMMC or microSD card slots). For some controllers,
which have a dedicated write-protection detection logic (like SDHCI
controllers), the get_ro() callback can return bogus data in such a case.

Instead of handling this on a per controller basis this patch adds a new
capability flag to the MMC core that can be set to specify that the result
of get_ro() is invalid. When the flag is set the core will not call
get_ro() and assume that the card is always read-write.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sd.c    | 30 +++++++++++++++++++++++-------
 include/linux/mmc/host.h |  1 +
 2 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 31a9ef256d06..8f6864a2a055 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -804,6 +804,28 @@ int mmc_sd_get_csd(struct mmc_host *host, struct mmc_card *card)
 	return 0;
 }
 
+static int mmc_sd_get_ro(struct mmc_host *host)
+{
+	int ro;
+
+	/*
+	 * Some systems don't feature a write-protect pin and don't need one.
+	 * E.g. because they only have micro-SD card slot. For those systems
+	 * assume that the SD card is always read-write.
+	 */
+	if (host->caps2 & MMC_CAP2_NO_WRITE_PROTECT)
+		return 0;
+
+	if (!host->ops->get_ro)
+		return -1;
+
+	mmc_host_clk_hold(host);
+	ro = host->ops->get_ro(host);
+	mmc_host_clk_release(host);
+
+	return ro;
+}
+
 int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
 	bool reinit)
 {
@@ -855,13 +877,7 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
 	 * Check if read-only switch is active.
 	 */
 	if (!reinit) {
-		int ro = -1;
-
-		if (host->ops->get_ro) {
-			mmc_host_clk_hold(card->host);
-			ro = host->ops->get_ro(host);
-			mmc_host_clk_release(card->host);
-		}
+		int ro = mmc_sd_get_ro(host);
 
 		if (ro < 0) {
 			pr_warn("%s: host does not support reading read-only switch, assuming write-enable\n",
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index f471193ef6d6..433eccb50838 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -286,6 +286,7 @@ struct mmc_host {
 				 MMC_CAP2_HS400_1_2V)
 #define MMC_CAP2_HSX00_1_2V	(MMC_CAP2_HS200_1_2V_SDR | MMC_CAP2_HS400_1_2V)
 #define MMC_CAP2_SDIO_IRQ_NOTHREAD (1 << 17)
+#define MMC_CAP2_NO_WRITE_PROTECT (1 << 18)	/* No physical write protect pin, assume that card is always read-write */
 
 	mmc_pm_flag_t		pm_caps;	/* supported pm features */
 
-- 
cgit v1.2.3


From eff8f2f5df1c509c873cdc70c84eb2ee75b41e65 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Wed, 6 May 2015 20:31:22 +0200
Subject: mmc: dw_mmc: Use core to handle absent write protect line

Use the new MMC_CAP2_NO_WRITE_PROTECT to let the core handle the case where
no write protect line is present instead of having custom driver code to
handle it.

dw_mci_of_get_slot_quirks() is slightly refactored to directly modify the
mmc_host capabilities instead of returning a quirk mask.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jaehoon Chung <jh80.chung@samsung.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/dw_mmc.c  | 53 +++++++++++++++-------------------------------
 drivers/mmc/host/dw_mmc.h  |  3 ---
 include/linux/mmc/dw_mmc.h |  6 ------
 3 files changed, 17 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index ce66565d50ef..55179f1001fb 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -1282,10 +1282,7 @@ static int dw_mci_get_ro(struct mmc_host *mmc)
 	int gpio_ro = mmc_gpio_get_ro(mmc);
 
 	/* Use platform get_ro function, else try on board write protect */
-	if ((slot->quirks & DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT) ||
-			(slot->host->quirks & DW_MCI_QUIRK_NO_WRITE_PROTECT))
-		read_only = 0;
-	else if (!IS_ERR_VALUE(gpio_ro))
+	if (!IS_ERR_VALUE(gpio_ro))
 		read_only = gpio_ro;
 	else
 		read_only =
@@ -2284,9 +2281,10 @@ static irqreturn_t dw_mci_interrupt(int irq, void *dev_id)
 }
 
 #ifdef CONFIG_OF
-/* given a slot id, find out the device node representing that slot */
-static struct device_node *dw_mci_of_find_slot_node(struct device *dev, u8 slot)
+/* given a slot, find out the device node representing that slot */
+static struct device_node *dw_mci_of_find_slot_node(struct dw_mci_slot *slot)
 {
+	struct device *dev = slot->mmc->parent;
 	struct device_node *np;
 	const __be32 *addr;
 	int len;
@@ -2298,42 +2296,28 @@ static struct device_node *dw_mci_of_find_slot_node(struct device *dev, u8 slot)
 		addr = of_get_property(np, "reg", &len);
 		if (!addr || (len < sizeof(int)))
 			continue;
-		if (be32_to_cpup(addr) == slot)
+		if (be32_to_cpup(addr) == slot->id)
 			return np;
 	}
 	return NULL;
 }
 
-static struct dw_mci_of_slot_quirks {
-	char *quirk;
-	int id;
-} of_slot_quirks[] = {
-	{
-		.quirk	= "disable-wp",
-		.id	= DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT,
-	},
-};
-
-static int dw_mci_of_get_slot_quirks(struct device *dev, u8 slot)
+static void dw_mci_slot_of_parse(struct dw_mci_slot *slot)
 {
-	struct device_node *np = dw_mci_of_find_slot_node(dev, slot);
-	int quirks = 0;
-	int idx;
+	struct device_node *np = dw_mci_of_find_slot_node(slot);
 
-	/* get quirks */
-	for (idx = 0; idx < ARRAY_SIZE(of_slot_quirks); idx++)
-		if (of_get_property(np, of_slot_quirks[idx].quirk, NULL)) {
-			dev_warn(dev, "Slot quirk %s is deprecated\n",
-					of_slot_quirks[idx].quirk);
-			quirks |= of_slot_quirks[idx].id;
-		}
+	if (!np)
+		return;
 
-	return quirks;
+	if (of_property_read_bool(np, "disable-wp")) {
+		slot->mmc->caps2 |= MMC_CAP2_NO_WRITE_PROTECT;
+		dev_warn(slot->mmc->parent,
+			"Slot quirk 'disable-wp' is deprecated\n");
+	}
 }
 #else /* CONFIG_OF */
-static int dw_mci_of_get_slot_quirks(struct device *dev, u8 slot)
+static void dw_mci_slot_of_parse(struct dw_mci_slot *slot)
 {
-	return 0;
 }
 #endif /* CONFIG_OF */
 
@@ -2356,8 +2340,6 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
 	slot->host = host;
 	host->slot[id] = slot;
 
-	slot->quirks = dw_mci_of_get_slot_quirks(host->dev, slot->id);
-
 	mmc->ops = &dw_mci_ops;
 	if (of_property_read_u32_array(host->dev->of_node,
 				       "clock-freq-min-max", freq, 2)) {
@@ -2395,6 +2377,8 @@ static int dw_mci_init_slot(struct dw_mci *host, unsigned int id)
 	if (host->pdata->caps2)
 		mmc->caps2 = host->pdata->caps2;
 
+	dw_mci_slot_of_parse(slot);
+
 	ret = mmc_of_parse(mmc);
 	if (ret)
 		goto err_host_allocated;
@@ -2622,9 +2606,6 @@ static struct dw_mci_of_quirks {
 	{
 		.quirk	= "broken-cd",
 		.id	= DW_MCI_QUIRK_BROKEN_CARD_DETECTION,
-	}, {
-		.quirk	= "disable-wp",
-		.id	= DW_MCI_QUIRK_NO_WRITE_PROTECT,
 	},
 };
 
diff --git a/drivers/mmc/host/dw_mmc.h b/drivers/mmc/host/dw_mmc.h
index c7236170bf98..8ce4674730a6 100644
--- a/drivers/mmc/host/dw_mmc.h
+++ b/drivers/mmc/host/dw_mmc.h
@@ -227,7 +227,6 @@ extern int dw_mci_resume(struct dw_mci *host);
  * struct dw_mci_slot - MMC slot state
  * @mmc: The mmc_host representing this slot.
  * @host: The MMC controller this slot is using.
- * @quirks: Slot-level quirks (DW_MCI_SLOT_QUIRK_XXX)
  * @ctype: Card type for this slot.
  * @mrq: mmc_request currently being processed or waiting to be
  *	processed, or NULL when the slot is idle.
@@ -245,8 +244,6 @@ struct dw_mci_slot {
 	struct mmc_host		*mmc;
 	struct dw_mci		*host;
 
-	int			quirks;
-
 	u32			ctype;
 
 	struct mmc_request	*mrq;
diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h
index 12111993a317..5be97676f1fa 100644
--- a/include/linux/mmc/dw_mmc.h
+++ b/include/linux/mmc/dw_mmc.h
@@ -226,12 +226,6 @@ struct dw_mci_dma_ops {
 #define DW_MCI_QUIRK_HIGHSPEED			BIT(2)
 /* Unreliable card detection */
 #define DW_MCI_QUIRK_BROKEN_CARD_DETECTION	BIT(3)
-/* No write protect */
-#define DW_MCI_QUIRK_NO_WRITE_PROTECT		BIT(4)
-
-/* Slot level quirks */
-/* This slot has no write protect */
-#define DW_MCI_SLOT_QUIRK_NO_WRITE_PROTECT	BIT(0)
 
 struct dma_pdata;
 
-- 
cgit v1.2.3


From b4f30a174e1fda8118eda038b5d8d5260db36ad5 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 6 Feb 2015 14:12:52 +0200
Subject: mmc: core: Allow card drive strength to be different to host

Initialization of UHS-I modes for SD and SDIO cards
employs a callback to allow the host driver to
choose a drive strength value. Currently that
assumes the card drive strength and host driver
type must be the same value. Change to let the
callback make that decision and return both the
card drive strength and host driver type.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sd.c    | 33 +++++++++++++--------------------
 drivers/mmc/core/sdio.c  | 43 ++++++++++++++++++-------------------------
 include/linux/mmc/host.h |  3 ++-
 3 files changed, 33 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 8f6864a2a055..5edd7d8b033e 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -388,18 +388,9 @@ static int sd_select_driver_type(struct mmc_card *card, u8 *status)
 {
 	int host_drv_type = SD_DRIVER_TYPE_B;
 	int card_drv_type = SD_DRIVER_TYPE_B;
-	int drive_strength;
+	int drive_strength, drv_type;
 	int err;
 
-	/*
-	 * If the host doesn't support any of the Driver Types A,C or D,
-	 * or there is no board specific handler then default Driver
-	 * Type B is used.
-	 */
-	if (!(card->host->caps & (MMC_CAP_DRIVER_TYPE_A | MMC_CAP_DRIVER_TYPE_C
-	    | MMC_CAP_DRIVER_TYPE_D)))
-		return 0;
-
 	if (!card->host->ops->select_drive_strength)
 		return 0;
 
@@ -430,20 +421,22 @@ static int sd_select_driver_type(struct mmc_card *card, u8 *status)
 	mmc_host_clk_hold(card->host);
 	drive_strength = card->host->ops->select_drive_strength(
 		card->sw_caps.uhs_max_dtr,
-		host_drv_type, card_drv_type);
+		host_drv_type, card_drv_type, &drv_type);
 	mmc_host_clk_release(card->host);
 
-	err = mmc_sd_switch(card, 1, 2, drive_strength, status);
-	if (err)
-		return err;
-
-	if ((status[15] & 0xF) != drive_strength) {
-		pr_warn("%s: Problem setting drive strength!\n",
-			mmc_hostname(card->host));
-		return 0;
+	if (drive_strength) {
+		err = mmc_sd_switch(card, 1, 2, drive_strength, status);
+		if (err)
+			return err;
+		if ((status[15] & 0xF) != drive_strength) {
+			pr_warn("%s: Problem setting drive strength!\n",
+				mmc_hostname(card->host));
+			return 0;
+		}
 	}
 
-	mmc_set_driver_type(card->host, drive_strength);
+	if (drv_type)
+		mmc_set_driver_type(card->host, drv_type);
 
 	return 0;
 }
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 5c1423a3e7d7..9d87aeb7c752 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -404,21 +404,10 @@ static void sdio_select_driver_type(struct mmc_card *card)
 {
 	int host_drv_type = SD_DRIVER_TYPE_B;
 	int card_drv_type = SD_DRIVER_TYPE_B;
-	int drive_strength;
+	int drive_strength, drv_type;
 	unsigned char card_strength;
 	int err;
 
-	/*
-	 * If the host doesn't support any of the Driver Types A,C or D,
-	 * or there is no board specific handler then default Driver
-	 * Type B is used.
-	 */
-	if (!(card->host->caps &
-		(MMC_CAP_DRIVER_TYPE_A |
-		 MMC_CAP_DRIVER_TYPE_C |
-		 MMC_CAP_DRIVER_TYPE_D)))
-		return;
-
 	if (!card->host->ops->select_drive_strength)
 		return;
 
@@ -448,23 +437,27 @@ static void sdio_select_driver_type(struct mmc_card *card)
 	 */
 	drive_strength = card->host->ops->select_drive_strength(
 		card->sw_caps.uhs_max_dtr,
-		host_drv_type, card_drv_type);
+		host_drv_type, card_drv_type, &drv_type);
 
-	/* if error just use default for drive strength B */
-	err = mmc_io_rw_direct(card, 0, 0, SDIO_CCCR_DRIVE_STRENGTH, 0,
-		&card_strength);
-	if (err)
-		return;
+	if (drive_strength) {
+		/* if error just use default for drive strength B */
+		err = mmc_io_rw_direct(card, 0, 0, SDIO_CCCR_DRIVE_STRENGTH, 0,
+				       &card_strength);
+		if (err)
+			return;
 
-	card_strength &= ~(SDIO_DRIVE_DTSx_MASK<<SDIO_DRIVE_DTSx_SHIFT);
-	card_strength |= host_drive_to_sdio_drive(drive_strength);
+		card_strength &= ~(SDIO_DRIVE_DTSx_MASK<<SDIO_DRIVE_DTSx_SHIFT);
+		card_strength |= host_drive_to_sdio_drive(drive_strength);
 
-	err = mmc_io_rw_direct(card, 1, 0, SDIO_CCCR_DRIVE_STRENGTH,
-		card_strength, NULL);
+		/* if error default to drive strength B */
+		err = mmc_io_rw_direct(card, 1, 0, SDIO_CCCR_DRIVE_STRENGTH,
+				       card_strength, NULL);
+		if (err)
+			return;
+	}
 
-	/* if error default to drive strength B */
-	if (!err)
-		mmc_set_driver_type(card->host, drive_strength);
+	if (drv_type)
+		mmc_set_driver_type(card->host, drv_type);
 }
 
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 433eccb50838..da33d18c66c8 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -132,7 +132,8 @@ struct mmc_host_ops {
 
 	/* Prepare HS400 target operating frequency depending host driver */
 	int	(*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios);
-	int	(*select_drive_strength)(unsigned int max_dtr, int host_drv, int card_drv);
+	int	(*select_drive_strength)(unsigned int max_dtr, int host_drv,
+					 int card_drv, int *drv_type);
 	void	(*hw_reset)(struct mmc_host *host);
 	void	(*card_event)(struct mmc_host *host);
 
-- 
cgit v1.2.3


From f168359efbb99d6f8591bb666d6510bb78df2d07 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 6 Feb 2015 14:12:54 +0200
Subject: mmc: core: Add 'card' to drive strength selection callback

In preparation for supporting also eMMC drive strength,
add the 'card' as a parameter so that the callback can
distinguish different types of cards if necessary.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sd.c    | 2 +-
 drivers/mmc/core/sdio.c  | 2 +-
 include/linux/mmc/host.h | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 9b72ea6b3177..63f9163b8dda 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -411,7 +411,7 @@ static int sd_select_driver_type(struct mmc_card *card, u8 *status)
 	 * return what is possible given the options
 	 */
 	mmc_host_clk_hold(card->host);
-	drive_strength = card->host->ops->select_drive_strength(
+	drive_strength = card->host->ops->select_drive_strength(card,
 		card->sw_caps.uhs_max_dtr,
 		host_drv_type, card_drv_type, &drv_type);
 	mmc_host_clk_release(card->host);
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index ef82f3d029e8..d3d13047f316 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -427,7 +427,7 @@ static void sdio_select_driver_type(struct mmc_card *card)
 	 * information and let the hardware specific code
 	 * return what is possible given the options
 	 */
-	drive_strength = card->host->ops->select_drive_strength(
+	drive_strength = card->host->ops->select_drive_strength(card,
 		card->sw_caps.uhs_max_dtr,
 		host_drv_type, card_drv_type, &drv_type);
 
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index da33d18c66c8..1369e54faeb7 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -132,7 +132,8 @@ struct mmc_host_ops {
 
 	/* Prepare HS400 target operating frequency depending host driver */
 	int	(*prepare_hs400_tuning)(struct mmc_host *host, struct mmc_ios *ios);
-	int	(*select_drive_strength)(unsigned int max_dtr, int host_drv,
+	int	(*select_drive_strength)(struct mmc_card *card,
+					 unsigned int max_dtr, int host_drv,
 					 int card_drv, int *drv_type);
 	void	(*hw_reset)(struct mmc_host *host);
 	void	(*card_event)(struct mmc_host *host);
-- 
cgit v1.2.3


From 3853a042325e8f497c199020979c4fc824528c6e Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 6 Feb 2015 14:12:56 +0200
Subject: mmc: core: Record card drive strength

In preparation for adding drive strength support
for eMMC, add drive_strength to struct mmc_card
to record the card drive strength for UHS-I modes
and HS200 / HS400. For eMMC this will be needed
when switching between HS200 and HS400.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sd.c    | 3 +++
 drivers/mmc/core/sdio.c  | 3 +++
 include/linux/mmc/card.h | 1 +
 3 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c
index 9771b84db4b3..b99e25b9bcdc 100644
--- a/drivers/mmc/core/sd.c
+++ b/drivers/mmc/core/sd.c
@@ -389,6 +389,8 @@ static int sd_select_driver_type(struct mmc_card *card, u8 *status)
 	int card_drv_type, drive_strength, drv_type;
 	int err;
 
+	card->drive_strength = 0;
+
 	card_drv_type = card->sw_caps.sd3_drv_type | SD_DRIVER_TYPE_B;
 
 	drive_strength = mmc_select_drive_strength(card,
@@ -404,6 +406,7 @@ static int sd_select_driver_type(struct mmc_card *card, u8 *status)
 				mmc_hostname(card->host));
 			return 0;
 		}
+		card->drive_strength = drive_strength;
 	}
 
 	if (drv_type)
diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c
index 73b091331c96..b91abedcfdca 100644
--- a/drivers/mmc/core/sdio.c
+++ b/drivers/mmc/core/sdio.c
@@ -406,6 +406,8 @@ static void sdio_select_driver_type(struct mmc_card *card)
 	unsigned char card_strength;
 	int err;
 
+	card->drive_strength = 0;
+
 	card_drv_type = card->sw_caps.sd3_drv_type | SD_DRIVER_TYPE_B;
 
 	drive_strength = mmc_select_drive_strength(card,
@@ -427,6 +429,7 @@ static void sdio_select_driver_type(struct mmc_card *card)
 				       card_strength, NULL);
 		if (err)
 			return;
+		card->drive_strength = drive_strength;
 	}
 
 	if (drv_type)
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 19f0175c0afa..2f073d555793 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -305,6 +305,7 @@ struct mmc_card {
 
 	unsigned int		sd_bus_speed;	/* Bus Speed Mode set for the card */
 	unsigned int		mmc_avail_type;	/* supported device type by both host and card */
+	unsigned int		drive_strength;	/* for UHS-I, HS200 or HS400 */
 
 	struct dentry		*debugfs_root;
 	struct mmc_part	part[MMC_NUM_PHY_PARTITION]; /* physical partitions */
-- 
cgit v1.2.3


From b097e07f57930eda774c83aa46e8e401686d01dc Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 6 Feb 2015 14:12:57 +0200
Subject: mmc: mmc: Read card's valid driver strength mask

In preparation for supporing drive strength selection
for eMMC, read the card's valid driver strengths.

Note that though the SD spec uses the term "drive strength",
the JEDEC eMMC spec uses the term "driver strength".

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/mmc.c   | 1 +
 include/linux/mmc/card.h | 1 +
 include/linux/mmc/mmc.h  | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 03c94c2f23e1..9b808d1d6ca7 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -437,6 +437,7 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd)
 	card->ext_csd.raw_trim_mult =
 		ext_csd[EXT_CSD_TRIM_MULT];
 	card->ext_csd.raw_partition_support = ext_csd[EXT_CSD_PARTITION_SUPPORT];
+	card->ext_csd.raw_driver_strength = ext_csd[EXT_CSD_DRIVER_STRENGTH];
 	if (card->ext_csd.rev >= 4) {
 		if (ext_csd[EXT_CSD_PARTITION_SETTING_COMPLETED] &
 		    EXT_CSD_PART_SETTING_COMPLETED)
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 2f073d555793..4d3776d25925 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -97,6 +97,7 @@ struct mmc_ext_csd {
 	u8			raw_erased_mem_count;	/* 181 */
 	u8			raw_ext_csd_structure;	/* 194 */
 	u8			raw_card_type;		/* 196 */
+	u8			raw_driver_strength;	/* 197 */
 	u8			out_of_int_time;	/* 198 */
 	u8			raw_pwr_cl_52_195;	/* 200 */
 	u8			raw_pwr_cl_26_195;	/* 201 */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 124f562118b8..4819cfbc3795 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -302,6 +302,7 @@ struct _mmc_csd {
 #define EXT_CSD_REV			192	/* RO */
 #define EXT_CSD_STRUCTURE		194	/* RO */
 #define EXT_CSD_CARD_TYPE		196	/* RO */
+#define EXT_CSD_DRIVER_STRENGTH		197	/* RO */
 #define EXT_CSD_OUT_OF_INTERRUPT_TIME	198	/* RO */
 #define EXT_CSD_PART_SWITCH_TIME        199     /* RO */
 #define EXT_CSD_PWR_CL_52_195		200	/* RO */
-- 
cgit v1.2.3


From cc4f414c885cd04f7227ad9bcd6b18fd78d718d9 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 6 Feb 2015 14:12:58 +0200
Subject: mmc: mmc: Add driver strength selection

Add the ability to set eMMC driver strength
for HS200 and HS400.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/mmc.c  | 45 ++++++++++++++++++++++++++++++++++++++-------
 include/linux/mmc/mmc.h |  3 +++
 2 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 9b808d1d6ca7..e519e3110a20 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -1044,6 +1044,7 @@ static int mmc_select_hs400(struct mmc_card *card)
 {
 	struct mmc_host *host = card->host;
 	int err = 0;
+	u8 val;
 
 	/*
 	 * HS400 mode requires 8-bit bus width
@@ -1059,8 +1060,10 @@ static int mmc_select_hs400(struct mmc_card *card)
 	mmc_set_timing(card->host, MMC_TIMING_MMC_HS);
 	mmc_set_bus_speed(card);
 
+	val = EXT_CSD_TIMING_HS |
+	      card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
 	err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
-			   EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS,
+			   EXT_CSD_HS_TIMING, val,
 			   card->ext_csd.generic_cmd6_time,
 			   true, true, true);
 	if (err) {
@@ -1079,8 +1082,10 @@ static int mmc_select_hs400(struct mmc_card *card)
 		return err;
 	}
 
+	val = EXT_CSD_TIMING_HS400 |
+	      card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
 	err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
-			   EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS400,
+			   EXT_CSD_HS_TIMING, val,
 			   card->ext_csd.generic_cmd6_time,
 			   true, true, true);
 	if (err) {
@@ -1119,6 +1124,7 @@ int mmc_hs400_to_hs200(struct mmc_card *card)
 	bool send_status = true;
 	unsigned int max_dtr;
 	int err;
+	u8 val;
 
 	if (host->caps & MMC_CAP_WAIT_WHILE_BUSY)
 		send_status = false;
@@ -1128,8 +1134,10 @@ int mmc_hs400_to_hs200(struct mmc_card *card)
 	mmc_set_clock(host, max_dtr);
 
 	/* Switch HS400 to HS DDR */
+	val = EXT_CSD_TIMING_HS |
+	      card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
 	err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, EXT_CSD_HS_TIMING,
-			   EXT_CSD_TIMING_HS, card->ext_csd.generic_cmd6_time,
+			   val, card->ext_csd.generic_cmd6_time,
 			   true, send_status, true);
 	if (err)
 		goto out_err;
@@ -1158,10 +1166,11 @@ int mmc_hs400_to_hs200(struct mmc_card *card)
 	}
 
 	/* Switch HS to HS200 */
+	val = EXT_CSD_TIMING_HS200 |
+	      card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
 	err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, EXT_CSD_HS_TIMING,
-			   EXT_CSD_TIMING_HS200,
-			   card->ext_csd.generic_cmd6_time, true, send_status,
-			   true);
+			   val, card->ext_csd.generic_cmd6_time, true,
+			   send_status, true);
 	if (err)
 		goto out_err;
 
@@ -1183,6 +1192,23 @@ out_err:
 	return err;
 }
 
+static void mmc_select_driver_type(struct mmc_card *card)
+{
+	int card_drv_type, drive_strength, drv_type;
+
+	card_drv_type = card->ext_csd.raw_driver_strength |
+			mmc_driver_type_mask(0);
+
+	drive_strength = mmc_select_drive_strength(card,
+						   card->ext_csd.hs200_max_dtr,
+						   card_drv_type, &drv_type);
+
+	card->drive_strength = drive_strength;
+
+	if (drv_type)
+		mmc_set_driver_type(card->host, drv_type);
+}
+
 /*
  * For device supporting HS200 mode, the following sequence
  * should be done before executing the tuning process.
@@ -1194,6 +1220,7 @@ static int mmc_select_hs200(struct mmc_card *card)
 {
 	struct mmc_host *host = card->host;
 	int err = -EINVAL;
+	u8 val;
 
 	if (card->mmc_avail_type & EXT_CSD_CARD_TYPE_HS200_1_2V)
 		err = __mmc_set_signal_voltage(host, MMC_SIGNAL_VOLTAGE_120);
@@ -1205,14 +1232,18 @@ static int mmc_select_hs200(struct mmc_card *card)
 	if (err)
 		goto err;
 
+	mmc_select_driver_type(card);
+
 	/*
 	 * Set the bus width(4 or 8) with host's support and
 	 * switch to HS200 mode if bus width is set successfully.
 	 */
 	err = mmc_select_bus_width(card);
 	if (!IS_ERR_VALUE(err)) {
+		val = EXT_CSD_TIMING_HS200 |
+		      card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
 		err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
-				   EXT_CSD_HS_TIMING, EXT_CSD_TIMING_HS200,
+				   EXT_CSD_HS_TIMING, val,
 				   card->ext_csd.generic_cmd6_time,
 				   true, true, true);
 		if (!err)
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index 4819cfbc3795..15f2c4a0a62c 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -391,6 +391,7 @@ struct _mmc_csd {
 #define EXT_CSD_TIMING_HS	1	/* High speed */
 #define EXT_CSD_TIMING_HS200	2	/* HS200 */
 #define EXT_CSD_TIMING_HS400	3	/* HS400 */
+#define EXT_CSD_DRV_STR_SHIFT	4	/* Driver Strength shift */
 
 #define EXT_CSD_SEC_ER_EN	BIT(0)
 #define EXT_CSD_SEC_BD_BLK_EN	BIT(2)
@@ -442,4 +443,6 @@ struct _mmc_csd {
 #define MMC_SWITCH_MODE_CLEAR_BITS	0x02	/* Clear bits which are 1 in value */
 #define MMC_SWITCH_MODE_WRITE_BYTE	0x03	/* Set target to value */
 
+#define mmc_driver_type_mask(n)		(1 << (n))
+
 #endif /* LINUX_MMC_MMC_H */
-- 
cgit v1.2.3


From e1bfad6d936d7149a83423e2a7244dd5771f27e7 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Fri, 6 Feb 2015 14:13:00 +0200
Subject: mmc: sdhci-pci: Add support for drive strength selection for SPT

Implement the select_drive_strength callback to provide
drive strength selection for Intel SPT.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/sdhci-pci-data.c  |  3 ++
 drivers/mmc/host/sdhci-pci.c       | 83 ++++++++++++++++++++++++++++++++++++++
 drivers/mmc/host/sdhci-pci.h       |  4 ++
 include/linux/mmc/sdhci-pci-data.h |  2 +
 4 files changed, 92 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/host/sdhci-pci-data.c b/drivers/mmc/host/sdhci-pci-data.c
index a611217769f5..56fddc622a54 100644
--- a/drivers/mmc/host/sdhci-pci-data.c
+++ b/drivers/mmc/host/sdhci-pci-data.c
@@ -3,3 +3,6 @@
 
 struct sdhci_pci_data *(*sdhci_pci_get_data)(struct pci_dev *pdev, int slotno);
 EXPORT_SYMBOL_GPL(sdhci_pci_get_data);
+
+int sdhci_pci_spt_drive_strength;
+EXPORT_SYMBOL_GPL(sdhci_pci_spt_drive_strength);
diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c
index 7a3fc16d0a6c..6ecd158d8084 100644
--- a/drivers/mmc/host/sdhci-pci.c
+++ b/drivers/mmc/host/sdhci-pci.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/device.h>
 #include <linux/mmc/host.h>
+#include <linux/mmc/mmc.h>
 #include <linux/scatterlist.h>
 #include <linux/io.h>
 #include <linux/gpio.h>
@@ -266,6 +267,69 @@ static void sdhci_pci_int_hw_reset(struct sdhci_host *host)
 	usleep_range(300, 1000);
 }
 
+static int spt_select_drive_strength(struct sdhci_host *host,
+				     struct mmc_card *card,
+				     unsigned int max_dtr,
+				     int host_drv, int card_drv, int *drv_type)
+{
+	int drive_strength;
+
+	if (sdhci_pci_spt_drive_strength > 0)
+		drive_strength = sdhci_pci_spt_drive_strength & 0xf;
+	else
+		drive_strength = 1; /* 33-ohm */
+
+	if ((mmc_driver_type_mask(drive_strength) & card_drv) == 0)
+		drive_strength = 0; /* Default 50-ohm */
+
+	return drive_strength;
+}
+
+/* Try to read the drive strength from the card */
+static void spt_read_drive_strength(struct sdhci_host *host)
+{
+	u32 val, i, t;
+	u16 m;
+
+	if (sdhci_pci_spt_drive_strength)
+		return;
+
+	sdhci_pci_spt_drive_strength = -1;
+
+	m = sdhci_readw(host, SDHCI_HOST_CONTROL2) & 0x7;
+	if (m != 3 && m != 5)
+		return;
+	val = sdhci_readl(host, SDHCI_PRESENT_STATE);
+	if (val & 0x3)
+		return;
+	sdhci_writel(host, 0x007f0023, SDHCI_INT_ENABLE);
+	sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE);
+	sdhci_writew(host, 0x10, SDHCI_TRANSFER_MODE);
+	sdhci_writeb(host, 0xe, SDHCI_TIMEOUT_CONTROL);
+	sdhci_writew(host, 512, SDHCI_BLOCK_SIZE);
+	sdhci_writew(host, 1, SDHCI_BLOCK_COUNT);
+	sdhci_writel(host, 0, SDHCI_ARGUMENT);
+	sdhci_writew(host, 0x83b, SDHCI_COMMAND);
+	for (i = 0; i < 1000; i++) {
+		val = sdhci_readl(host, SDHCI_INT_STATUS);
+		if (val & 0xffff8000)
+			return;
+		if (val & 0x20)
+			break;
+		udelay(1);
+	}
+	val = sdhci_readl(host, SDHCI_PRESENT_STATE);
+	if (!(val & 0x800))
+		return;
+	for (i = 0; i < 47; i++)
+		val = sdhci_readl(host, SDHCI_BUFFER);
+	t = val & 0xf00;
+	if (t != 0x200 && t != 0x300)
+		return;
+
+	sdhci_pci_spt_drive_strength = 0x10 | ((val >> 12) & 0xf);
+}
+
 static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 {
 	slot->host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_NONREMOVABLE |
@@ -276,6 +340,10 @@ static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot)
 	slot->hw_reset = sdhci_pci_int_hw_reset;
 	if (slot->chip->pdev->device == PCI_DEVICE_ID_INTEL_BSW_EMMC)
 		slot->host->timeout_clk = 1000; /* 1000 kHz i.e. 1 MHz */
+	if (slot->chip->pdev->device == PCI_DEVICE_ID_INTEL_SPT_EMMC) {
+		spt_read_drive_strength(slot->host);
+		slot->select_drive_strength = spt_select_drive_strength;
+	}
 	return 0;
 }
 
@@ -1203,6 +1271,20 @@ static void sdhci_pci_hw_reset(struct sdhci_host *host)
 		slot->hw_reset(host);
 }
 
+static int sdhci_pci_select_drive_strength(struct sdhci_host *host,
+					   struct mmc_card *card,
+					   unsigned int max_dtr, int host_drv,
+					   int card_drv, int *drv_type)
+{
+	struct sdhci_pci_slot *slot = sdhci_priv(host);
+
+	if (!slot->select_drive_strength)
+		return 0;
+
+	return slot->select_drive_strength(host, card, max_dtr, host_drv,
+					   card_drv, drv_type);
+}
+
 static const struct sdhci_ops sdhci_pci_ops = {
 	.set_clock	= sdhci_set_clock,
 	.enable_dma	= sdhci_pci_enable_dma,
@@ -1210,6 +1292,7 @@ static const struct sdhci_ops sdhci_pci_ops = {
 	.reset		= sdhci_reset,
 	.set_uhs_signaling = sdhci_set_uhs_signaling,
 	.hw_reset		= sdhci_pci_hw_reset,
+	.select_drive_strength	= sdhci_pci_select_drive_strength,
 };
 
 /*****************************************************************************\
diff --git a/drivers/mmc/host/sdhci-pci.h b/drivers/mmc/host/sdhci-pci.h
index 1ec684d06d54..541f1cad5247 100644
--- a/drivers/mmc/host/sdhci-pci.h
+++ b/drivers/mmc/host/sdhci-pci.h
@@ -72,6 +72,10 @@ struct sdhci_pci_slot {
 	bool			cd_override_level;
 
 	void (*hw_reset)(struct sdhci_host *host);
+	int (*select_drive_strength)(struct sdhci_host *host,
+				     struct mmc_card *card,
+				     unsigned int max_dtr, int host_drv,
+				     int card_drv, int *drv_type);
 };
 
 struct sdhci_pci_chip {
diff --git a/include/linux/mmc/sdhci-pci-data.h b/include/linux/mmc/sdhci-pci-data.h
index 8959604a13d3..fda15b6d4135 100644
--- a/include/linux/mmc/sdhci-pci-data.h
+++ b/include/linux/mmc/sdhci-pci-data.h
@@ -15,4 +15,6 @@ struct sdhci_pci_data {
 extern struct sdhci_pci_data *(*sdhci_pci_get_data)(struct pci_dev *pdev,
 				int slotno);
 
+extern int sdhci_pci_spt_drive_strength;
+
 #endif
-- 
cgit v1.2.3


From 225d59adf1c899176cce0fc80e42b1d1c12f109f Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Fri, 29 May 2015 18:14:21 +0200
Subject: iio: Specify supported modes for buffers

For each buffer type specify the supported device modes for this buffer.
This allows us for devices which support multiple different operating modes
to pick the correct operating mode based on the modes supported by the
attached buffers.

It also prevents that buffers with conflicting modes are attached
to a device at the same time or that a buffer with a non-supported mode is
attached to a device (e.g. in-kernel callback buffer to a device only
supporting hardware mode).

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/buffer_cb.c                  |  2 ++
 drivers/iio/industrialio-buffer.c        | 18 +++++++++++++++---
 drivers/iio/kfifo_buf.c                  |  2 ++
 drivers/staging/iio/accel/sca3000_ring.c |  2 ++
 include/linux/iio/buffer.h               |  3 +++
 5 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/iio/buffer_cb.c b/drivers/iio/buffer_cb.c
index eb46e728aa2e..1648e6e5a848 100644
--- a/drivers/iio/buffer_cb.c
+++ b/drivers/iio/buffer_cb.c
@@ -33,6 +33,8 @@ static void iio_buffer_cb_release(struct iio_buffer *buffer)
 static const struct iio_buffer_access_funcs iio_cb_access = {
 	.store_to = &iio_buffer_cb_store_to,
 	.release = &iio_buffer_cb_release,
+
+	.modes = INDIO_BUFFER_SOFTWARE | INDIO_BUFFER_TRIGGERED,
 };
 
 struct iio_cb_buffer *iio_channel_get_all_cb(struct device *dev,
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 209c7ad793c5..0a14bd825fe0 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -604,6 +604,7 @@ static int iio_verify_update(struct iio_dev *indio_dev,
 	const unsigned long *scan_mask;
 	struct iio_buffer *buffer;
 	bool scan_timestamp;
+	unsigned int modes;
 
 	memset(config, 0, sizeof(*config));
 
@@ -615,12 +616,23 @@ static int iio_verify_update(struct iio_dev *indio_dev,
 		list_is_singular(&indio_dev->buffer_list))
 			return 0;
 
+	modes = indio_dev->modes;
+
+	list_for_each_entry(buffer, &indio_dev->buffer_list, buffer_list) {
+		if (buffer == remove_buffer)
+			continue;
+		modes &= buffer->access->modes;
+	}
+
+	if (insert_buffer)
+		modes &= insert_buffer->access->modes;
+
 	/* Definitely possible for devices to support both of these. */
-	if ((indio_dev->modes & INDIO_BUFFER_TRIGGERED) && indio_dev->trig) {
+	if ((modes & INDIO_BUFFER_TRIGGERED) && indio_dev->trig) {
 		config->mode = INDIO_BUFFER_TRIGGERED;
-	} else if (indio_dev->modes & INDIO_BUFFER_HARDWARE) {
+	} else if (modes & INDIO_BUFFER_HARDWARE) {
 		config->mode = INDIO_BUFFER_HARDWARE;
-	} else if (indio_dev->modes & INDIO_BUFFER_SOFTWARE) {
+	} else if (modes & INDIO_BUFFER_SOFTWARE) {
 		config->mode = INDIO_BUFFER_SOFTWARE;
 	} else {
 		/* Can only occur on first buffer */
diff --git a/drivers/iio/kfifo_buf.c b/drivers/iio/kfifo_buf.c
index 847ca561afe0..db15684a4731 100644
--- a/drivers/iio/kfifo_buf.c
+++ b/drivers/iio/kfifo_buf.c
@@ -135,6 +135,8 @@ static const struct iio_buffer_access_funcs kfifo_access_funcs = {
 	.set_bytes_per_datum = &iio_set_bytes_per_datum_kfifo,
 	.set_length = &iio_set_length_kfifo,
 	.release = &iio_kfifo_buffer_release,
+
+	.modes = INDIO_BUFFER_SOFTWARE | INDIO_BUFFER_TRIGGERED,
 };
 
 struct iio_buffer *iio_kfifo_allocate(void)
diff --git a/drivers/staging/iio/accel/sca3000_ring.c b/drivers/staging/iio/accel/sca3000_ring.c
index 8589eade1057..23685e74917e 100644
--- a/drivers/staging/iio/accel/sca3000_ring.c
+++ b/drivers/staging/iio/accel/sca3000_ring.c
@@ -258,6 +258,8 @@ static const struct iio_buffer_access_funcs sca3000_ring_access_funcs = {
 	.read_first_n = &sca3000_read_first_n_hw_rb,
 	.data_available = sca3000_ring_buf_data_available,
 	.release = sca3000_ring_release,
+
+	.modes = INDIO_BUFFER_HARDWARE,
 };
 
 int sca3000_configure_ring(struct iio_dev *indio_dev)
diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h
index eb8622b78ec9..1600c55828e0 100644
--- a/include/linux/iio/buffer.h
+++ b/include/linux/iio/buffer.h
@@ -29,6 +29,7 @@ struct iio_buffer;
  * @set_length:		set number of datums in buffer
  * @release:		called when the last reference to the buffer is dropped,
  *			should free all resources allocated by the buffer.
+ * @modes:		Supported operating modes by this buffer type
  *
  * The purpose of this structure is to make the buffer element
  * modular as event for a given driver, different usecases may require
@@ -51,6 +52,8 @@ struct iio_buffer_access_funcs {
 	int (*set_length)(struct iio_buffer *buffer, int length);
 
 	void (*release)(struct iio_buffer *buffer);
+
+	unsigned int modes;
 };
 
 /**
-- 
cgit v1.2.3


From 04fba7864ffcceae8a5f78d88ae1fd8d682a5123 Mon Sep 17 00:00:00 2001
From: Goffredo Baroncelli <kreijack@inwind.it>
Date: Sat, 30 May 2015 11:00:26 +0200
Subject: HID: Export hid_field_extract()

Rename the function extract() to hid_field_extract(), make it external linkage
to allow the use from other modules.

Suggested-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it>
Reviewed-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c | 11 ++++++-----
 include/linux/hid.h    |  2 ++
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 722a925795a2..fe33f9e9aebc 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1061,13 +1061,13 @@ static u32 s32ton(__s32 value, unsigned n)
  * Search linux-kernel and linux-usb-devel archives for "hid-core extract".
  */
 
-static __u32 extract(const struct hid_device *hid, __u8 *report,
+__u32 hid_field_extract(const struct hid_device *hid, __u8 *report,
 		     unsigned offset, unsigned n)
 {
 	u64 x;
 
 	if (n > 32)
-		hid_warn(hid, "extract() called with n (%d) > 32! (%s)\n",
+		hid_warn(hid, "hid_field_extract() called with n (%d) > 32! (%s)\n",
 			 n, current->comm);
 
 	report += offset >> 3;  /* adjust byte index */
@@ -1076,6 +1076,7 @@ static __u32 extract(const struct hid_device *hid, __u8 *report,
 	x = (x >> offset) & ((1ULL << n) - 1);  /* extract bit field */
 	return (u32) x;
 }
+EXPORT_SYMBOL_GPL(hid_field_extract);
 
 /*
  * "implement" : set bits in a little endian bit stream.
@@ -1221,9 +1222,9 @@ static void hid_input_field(struct hid_device *hid, struct hid_field *field,
 	for (n = 0; n < count; n++) {
 
 		value[n] = min < 0 ?
-			snto32(extract(hid, data, offset + n * size, size),
-			       size) :
-			extract(hid, data, offset + n * size, size);
+			snto32(hid_field_extract(hid, data, offset + n * size,
+			       size), size) :
+			hid_field_extract(hid, data, offset + n * size, size);
 
 		/* Ignore report if ErrorRollOver */
 		if (!(field->flags & HID_MAIN_ITEM_VARIABLE) &&
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 176b43670e5d..f17980de2662 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -815,6 +815,8 @@ void hid_disconnect(struct hid_device *hid);
 const struct hid_device_id *hid_match_id(struct hid_device *hdev,
 					 const struct hid_device_id *id);
 s32 hid_snto32(__u32 value, unsigned n);
+__u32 hid_field_extract(const struct hid_device *hid, __u8 *report,
+		     unsigned offset, unsigned n);
 
 /**
  * hid_device_io_start - enable HID input during probe, remove
-- 
cgit v1.2.3


From 3fff99bc4e926d9602a7d6e8c008a0175a099ce4 Mon Sep 17 00:00:00 2001
From: Rojhalat Ibrahim <imr@rtschenk.de>
Date: Wed, 13 May 2015 11:04:56 +0200
Subject: gpiolib: rename gpiod_set_array to gpiod_set_array_value

There have been concerns that the function names gpiod_set_array() and
gpiod_get_array() might be confusing to users. One might expect
gpiod_get_array() to return array values, while it is actually the array
counterpart of gpiod_get(). To be consistent with the single descriptor API
we could rename gpiod_set_array() to gpiod_set_array_value(). This makes
some function names a bit lengthy: gpiod_set_raw_array_value_cansleep().

Signed-off-by: Rojhalat Ibrahim <imr@rtschenk.de>
Acked-by: Alexandre Courbot <acourbot@nvidia.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c                 | 54 ++++++++++++++++++----------------
 drivers/net/phy/mdio-mux-gpio.c        |  3 +-
 drivers/tty/serial/serial_mctrl_gpio.c |  2 +-
 include/linux/gpio/consumer.h          | 37 +++++++++++------------
 4 files changed, 51 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index e01f6b2bfb3a..8dfb54f92594 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1316,10 +1316,10 @@ static void gpio_chip_set_multiple(struct gpio_chip *chip,
 	}
 }
 
-static void gpiod_set_array_priv(bool raw, bool can_sleep,
-				 unsigned int array_size,
-				 struct gpio_desc **desc_array,
-				 int *value_array)
+static void gpiod_set_array_value_priv(bool raw, bool can_sleep,
+				       unsigned int array_size,
+				       struct gpio_desc **desc_array,
+				       int *value_array)
 {
 	int i = 0;
 
@@ -1412,7 +1412,7 @@ void gpiod_set_value(struct gpio_desc *desc, int value)
 EXPORT_SYMBOL_GPL(gpiod_set_value);
 
 /**
- * gpiod_set_raw_array() - assign values to an array of GPIOs
+ * gpiod_set_raw_array_value() - assign values to an array of GPIOs
  * @array_size: number of elements in the descriptor / value arrays
  * @desc_array: array of GPIO descriptors whose values will be assigned
  * @value_array: array of values to assign
@@ -1423,17 +1423,18 @@ EXPORT_SYMBOL_GPL(gpiod_set_value);
  * This function should be called from contexts where we cannot sleep, and will
  * complain if the GPIO chip functions potentially sleep.
  */
-void gpiod_set_raw_array(unsigned int array_size,
+void gpiod_set_raw_array_value(unsigned int array_size,
 			 struct gpio_desc **desc_array, int *value_array)
 {
 	if (!desc_array)
 		return;
-	gpiod_set_array_priv(true, false, array_size, desc_array, value_array);
+	gpiod_set_array_value_priv(true, false, array_size, desc_array,
+				   value_array);
 }
-EXPORT_SYMBOL_GPL(gpiod_set_raw_array);
+EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value);
 
 /**
- * gpiod_set_array() - assign values to an array of GPIOs
+ * gpiod_set_array_value() - assign values to an array of GPIOs
  * @array_size: number of elements in the descriptor / value arrays
  * @desc_array: array of GPIO descriptors whose values will be assigned
  * @value_array: array of values to assign
@@ -1444,14 +1445,15 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_array);
  * This function should be called from contexts where we cannot sleep, and will
  * complain if the GPIO chip functions potentially sleep.
  */
-void gpiod_set_array(unsigned int array_size,
-		     struct gpio_desc **desc_array, int *value_array)
+void gpiod_set_array_value(unsigned int array_size,
+			   struct gpio_desc **desc_array, int *value_array)
 {
 	if (!desc_array)
 		return;
-	gpiod_set_array_priv(false, false, array_size, desc_array, value_array);
+	gpiod_set_array_value_priv(false, false, array_size, desc_array,
+				   value_array);
 }
-EXPORT_SYMBOL_GPL(gpiod_set_array);
+EXPORT_SYMBOL_GPL(gpiod_set_array_value);
 
 /**
  * gpiod_cansleep() - report whether gpio value access may sleep
@@ -1613,7 +1615,7 @@ void gpiod_set_value_cansleep(struct gpio_desc *desc, int value)
 EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep);
 
 /**
- * gpiod_set_raw_array_cansleep() - assign values to an array of GPIOs
+ * gpiod_set_raw_array_value_cansleep() - assign values to an array of GPIOs
  * @array_size: number of elements in the descriptor / value arrays
  * @desc_array: array of GPIO descriptors whose values will be assigned
  * @value_array: array of values to assign
@@ -1623,19 +1625,20 @@ EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep);
  *
  * This function is to be called from contexts that can sleep.
  */
-void gpiod_set_raw_array_cansleep(unsigned int array_size,
-				  struct gpio_desc **desc_array,
-				  int *value_array)
+void gpiod_set_raw_array_value_cansleep(unsigned int array_size,
+					struct gpio_desc **desc_array,
+					int *value_array)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return;
-	gpiod_set_array_priv(true, true, array_size, desc_array, value_array);
+	gpiod_set_array_value_priv(true, true, array_size, desc_array,
+				   value_array);
 }
-EXPORT_SYMBOL_GPL(gpiod_set_raw_array_cansleep);
+EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value_cansleep);
 
 /**
- * gpiod_set_array_cansleep() - assign values to an array of GPIOs
+ * gpiod_set_array_value_cansleep() - assign values to an array of GPIOs
  * @array_size: number of elements in the descriptor / value arrays
  * @desc_array: array of GPIO descriptors whose values will be assigned
  * @value_array: array of values to assign
@@ -1645,16 +1648,17 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_array_cansleep);
  *
  * This function is to be called from contexts that can sleep.
  */
-void gpiod_set_array_cansleep(unsigned int array_size,
-			      struct gpio_desc **desc_array,
-			      int *value_array)
+void gpiod_set_array_value_cansleep(unsigned int array_size,
+				    struct gpio_desc **desc_array,
+				    int *value_array)
 {
 	might_sleep_if(extra_checks);
 	if (!desc_array)
 		return;
-	gpiod_set_array_priv(false, true, array_size, desc_array, value_array);
+	gpiod_set_array_value_priv(false, true, array_size, desc_array,
+				   value_array);
 }
-EXPORT_SYMBOL_GPL(gpiod_set_array_cansleep);
+EXPORT_SYMBOL_GPL(gpiod_set_array_value_cansleep);
 
 /**
  * gpiod_add_lookup_table() - register GPIO device consumers
diff --git a/drivers/net/phy/mdio-mux-gpio.c b/drivers/net/phy/mdio-mux-gpio.c
index 66edd99bc302..7ddb1ab70891 100644
--- a/drivers/net/phy/mdio-mux-gpio.c
+++ b/drivers/net/phy/mdio-mux-gpio.c
@@ -35,7 +35,8 @@ static int mdio_mux_gpio_switch_fn(int current_child, int desired_child,
 	for (n = 0; n < s->gpios->ndescs; n++)
 		values[n] = (desired_child >> n) & 1;
 
-	gpiod_set_array_cansleep(s->gpios->ndescs, s->gpios->desc, values);
+	gpiod_set_array_value_cansleep(s->gpios->ndescs, s->gpios->desc,
+				       values);
 
 	return 0;
 }
diff --git a/drivers/tty/serial/serial_mctrl_gpio.c b/drivers/tty/serial/serial_mctrl_gpio.c
index 0ec756c62bcf..d7b846d41630 100644
--- a/drivers/tty/serial/serial_mctrl_gpio.c
+++ b/drivers/tty/serial/serial_mctrl_gpio.c
@@ -55,7 +55,7 @@ void mctrl_gpio_set(struct mctrl_gpios *gpios, unsigned int mctrl)
 			value_array[count] = !!(mctrl & mctrl_gpios_desc[i].mctrl);
 			count++;
 		}
-	gpiod_set_array(count, desc_array, value_array);
+	gpiod_set_array_value(count, desc_array, value_array);
 }
 EXPORT_SYMBOL_GPL(mctrl_gpio_set);
 
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 09a7fb0062a6..fd098169fe87 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -100,24 +100,25 @@ int gpiod_direction_output_raw(struct gpio_desc *desc, int value);
 /* Value get/set from non-sleeping context */
 int gpiod_get_value(const struct gpio_desc *desc);
 void gpiod_set_value(struct gpio_desc *desc, int value);
-void gpiod_set_array(unsigned int array_size,
-		     struct gpio_desc **desc_array, int *value_array);
+void gpiod_set_array_value(unsigned int array_size,
+			   struct gpio_desc **desc_array, int *value_array);
 int gpiod_get_raw_value(const struct gpio_desc *desc);
 void gpiod_set_raw_value(struct gpio_desc *desc, int value);
-void gpiod_set_raw_array(unsigned int array_size,
-			 struct gpio_desc **desc_array, int *value_array);
+void gpiod_set_raw_array_value(unsigned int array_size,
+			       struct gpio_desc **desc_array,
+			       int *value_array);
 
 /* Value get/set from sleeping context */
 int gpiod_get_value_cansleep(const struct gpio_desc *desc);
 void gpiod_set_value_cansleep(struct gpio_desc *desc, int value);
-void gpiod_set_array_cansleep(unsigned int array_size,
-			      struct gpio_desc **desc_array,
-			      int *value_array);
+void gpiod_set_array_value_cansleep(unsigned int array_size,
+				    struct gpio_desc **desc_array,
+				    int *value_array);
 int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc);
 void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value);
-void gpiod_set_raw_array_cansleep(unsigned int array_size,
-				  struct gpio_desc **desc_array,
-				  int *value_array);
+void gpiod_set_raw_array_value_cansleep(unsigned int array_size,
+					struct gpio_desc **desc_array,
+					int *value_array);
 
 int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce);
 
@@ -304,9 +305,9 @@ static inline void gpiod_set_value(struct gpio_desc *desc, int value)
 	/* GPIO can never have been requested */
 	WARN_ON(1);
 }
-static inline void gpiod_set_array(unsigned int array_size,
-				   struct gpio_desc **desc_array,
-				   int *value_array)
+static inline void gpiod_set_array_value(unsigned int array_size,
+					 struct gpio_desc **desc_array,
+					 int *value_array)
 {
 	/* GPIO can never have been requested */
 	WARN_ON(1);
@@ -322,9 +323,9 @@ static inline void gpiod_set_raw_value(struct gpio_desc *desc, int value)
 	/* GPIO can never have been requested */
 	WARN_ON(1);
 }
-static inline void gpiod_set_raw_array(unsigned int array_size,
-				       struct gpio_desc **desc_array,
-				       int *value_array)
+static inline void gpiod_set_raw_array_value(unsigned int array_size,
+					     struct gpio_desc **desc_array,
+					     int *value_array)
 {
 	/* GPIO can never have been requested */
 	WARN_ON(1);
@@ -341,7 +342,7 @@ static inline void gpiod_set_value_cansleep(struct gpio_desc *desc, int value)
 	/* GPIO can never have been requested */
 	WARN_ON(1);
 }
-static inline void gpiod_set_array_cansleep(unsigned int array_size,
+static inline void gpiod_set_array_value_cansleep(unsigned int array_size,
 					    struct gpio_desc **desc_array,
 					    int *value_array)
 {
@@ -360,7 +361,7 @@ static inline void gpiod_set_raw_value_cansleep(struct gpio_desc *desc,
 	/* GPIO can never have been requested */
 	WARN_ON(1);
 }
-static inline void gpiod_set_raw_array_cansleep(unsigned int array_size,
+static inline void gpiod_set_raw_array_value_cansleep(unsigned int array_size,
 						struct gpio_desc **desc_array,
 						int *value_array)
 {
-- 
cgit v1.2.3


From 5fbf65d5c9c0fd2e5c6c48d69ce34b1c5415f2fd Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 22 May 2015 15:19:37 +0900
Subject: pinctrl: remove useless const qualifier

This "const" claims the get_function_groups callback never
changes the given num_groups pointer.  It is always true
in C language, so not worth mentioning.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinmux.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h
index d3740fa7073f..ace60d775b20 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -69,7 +69,7 @@ struct pinmux_ops {
 	int (*get_function_groups) (struct pinctrl_dev *pctldev,
 				  unsigned selector,
 				  const char * const **groups,
-				  unsigned * const num_groups);
+				  unsigned *num_groups);
 	int (*set_mux) (struct pinctrl_dev *pctldev, unsigned func_selector,
 			unsigned group_selector);
 	int (*gpio_request_enable) (struct pinctrl_dev *pctldev,
-- 
cgit v1.2.3


From b3da97ee581387cd42dafd76eb2ac23f2335cd92 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 22 May 2015 15:25:50 +0900
Subject: pinctrl: use "const struct ..." rather than "struct ... const"

Only this member, pins, is defined as "struct ... const *", but the
others in this struct, pinlops, pmxops, confops, etc. are defined as
"const struct ... *".

Swap the "struct pinctrl_pin_desc" and "const" for consistency.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinctrl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 66e4697516de..9ba59fcba549 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -127,7 +127,7 @@ struct pinctrl_ops {
  */
 struct pinctrl_desc {
 	const char *name;
-	struct pinctrl_pin_desc const *pins;
+	const struct pinctrl_pin_desc *pins;
 	unsigned int npins;
 	const struct pinctrl_ops *pctlops;
 	const struct pinmux_ops *pmxops;
-- 
cgit v1.2.3


From cf561f0d2eb74574ad9985a2feab134267a9d298 Mon Sep 17 00:00:00 2001
From: Greg Kurz <gkurz@linux.vnet.ibm.com>
Date: Fri, 24 Apr 2015 14:24:27 +0200
Subject: virtio: introduce virtio_is_little_endian() helper

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 include/linux/virtio_config.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 1e306f727edc..170947eb11b5 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -205,35 +205,40 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
 	return 0;
 }
 
+static inline bool virtio_is_little_endian(struct virtio_device *vdev)
+{
+	return virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
+}
+
 /* Memory accessors */
 static inline u16 virtio16_to_cpu(struct virtio_device *vdev, __virtio16 val)
 {
-	return __virtio16_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __virtio16_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio16 cpu_to_virtio16(struct virtio_device *vdev, u16 val)
 {
-	return __cpu_to_virtio16(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __cpu_to_virtio16(virtio_is_little_endian(vdev), val);
 }
 
 static inline u32 virtio32_to_cpu(struct virtio_device *vdev, __virtio32 val)
 {
-	return __virtio32_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __virtio32_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio32 cpu_to_virtio32(struct virtio_device *vdev, u32 val)
 {
-	return __cpu_to_virtio32(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __cpu_to_virtio32(virtio_is_little_endian(vdev), val);
 }
 
 static inline u64 virtio64_to_cpu(struct virtio_device *vdev, __virtio64 val)
 {
-	return __virtio64_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __virtio64_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val)
 {
-	return __cpu_to_virtio64(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __cpu_to_virtio64(virtio_is_little_endian(vdev), val);
 }
 
 /* Config space accessors. */
-- 
cgit v1.2.3


From 5da7b160b36a42f161980c5e20d3960d6d076fb8 Mon Sep 17 00:00:00 2001
From: Greg Kurz <gkurz@linux.vnet.ibm.com>
Date: Fri, 24 Apr 2015 14:24:58 +0200
Subject: vringh: introduce vringh_is_little_endian() helper

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 include/linux/vringh.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index a3fa537e717a..3ed62efc2bc5 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -226,33 +226,38 @@ static inline void vringh_notify(struct vringh *vrh)
 		vrh->notify(vrh);
 }
 
+static inline bool vringh_is_little_endian(const struct vringh *vrh)
+{
+	return vrh->little_endian;
+}
+
 static inline u16 vringh16_to_cpu(const struct vringh *vrh, __virtio16 val)
 {
-	return __virtio16_to_cpu(vrh->little_endian, val);
+	return __virtio16_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio16 cpu_to_vringh16(const struct vringh *vrh, u16 val)
 {
-	return __cpu_to_virtio16(vrh->little_endian, val);
+	return __cpu_to_virtio16(vringh_is_little_endian(vrh), val);
 }
 
 static inline u32 vringh32_to_cpu(const struct vringh *vrh, __virtio32 val)
 {
-	return __virtio32_to_cpu(vrh->little_endian, val);
+	return __virtio32_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio32 cpu_to_vringh32(const struct vringh *vrh, u32 val)
 {
-	return __cpu_to_virtio32(vrh->little_endian, val);
+	return __cpu_to_virtio32(vringh_is_little_endian(vrh), val);
 }
 
 static inline u64 vringh64_to_cpu(const struct vringh *vrh, __virtio64 val)
 {
-	return __virtio64_to_cpu(vrh->little_endian, val);
+	return __virtio64_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val)
 {
-	return __cpu_to_virtio64(vrh->little_endian, val);
+	return __cpu_to_virtio64(vringh_is_little_endian(vrh), val);
 }
 #endif /* _LINUX_VRINGH_H */
-- 
cgit v1.2.3


From 7d82410950aa74adccf035c332e409af2bb93e92 Mon Sep 17 00:00:00 2001
From: Greg Kurz <gkurz@linux.vnet.ibm.com>
Date: Fri, 24 Apr 2015 14:26:24 +0200
Subject: virtio: add explicit big-endian support to memory accessors

The current memory accessors logic is:
- little endian if little_endian
- native endian (i.e. no byteswap) if !little_endian

If we want to fully support cross-endian vhost, we also need to be
able to convert to big endian.

Instead of changing the little_endian argument to some 3-value enum, this
patch changes the logic to:
- little endian if little_endian
- big endian if !little_endian

The native endian case is handled by all users with a trivial helper. This
patch doesn't change any functionality, nor it does add overhead.

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 drivers/net/macvtap.c            |  3 ++-
 drivers/net/tun.c                |  3 ++-
 drivers/vhost/vhost.h            |  3 ++-
 include/linux/virtio_byteorder.h | 24 ++++++++++++++----------
 include/linux/virtio_config.h    |  3 ++-
 include/linux/vringh.h           |  3 ++-
 6 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 4fdb88102082..072ccbaf7c45 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -51,7 +51,8 @@ struct macvtap_queue {
 
 static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
 {
-	return q->flags & MACVTAP_VNET_LE;
+	return q->flags & MACVTAP_VNET_LE ||
+		virtio_legacy_is_little_endian();
 }
 
 static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 301e87c89a9f..b210139bef8f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -208,7 +208,8 @@ struct tun_struct {
 
 static inline bool tun_is_little_endian(struct tun_struct *tun)
 {
-	return tun->flags & TUN_VNET_LE;
+	return tun->flags & TUN_VNET_LE ||
+		virtio_legacy_is_little_endian();
 }
 
 static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6a499603d856..a4fa33a79bf2 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -175,7 +175,8 @@ static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit)
 
 static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq)
 {
-	return vhost_has_feature(vq, VIRTIO_F_VERSION_1);
+	return vhost_has_feature(vq, VIRTIO_F_VERSION_1) ||
+		virtio_legacy_is_little_endian();
 }
 
 /* Memory accessors */
diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h
index 51865d05b267..ce63a2c3a612 100644
--- a/include/linux/virtio_byteorder.h
+++ b/include/linux/virtio_byteorder.h
@@ -3,17 +3,21 @@
 #include <linux/types.h>
 #include <uapi/linux/virtio_types.h>
 
-/*
- * Low-level memory accessors for handling virtio in modern little endian and in
- * compatibility native endian format.
- */
+static inline bool virtio_legacy_is_little_endian(void)
+{
+#ifdef __LITTLE_ENDIAN
+	return true;
+#else
+	return false;
+#endif
+}
 
 static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val)
 {
 	if (little_endian)
 		return le16_to_cpu((__force __le16)val);
 	else
-		return (__force u16)val;
+		return be16_to_cpu((__force __be16)val);
 }
 
 static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val)
@@ -21,7 +25,7 @@ static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val)
 	if (little_endian)
 		return (__force __virtio16)cpu_to_le16(val);
 	else
-		return (__force __virtio16)val;
+		return (__force __virtio16)cpu_to_be16(val);
 }
 
 static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val)
@@ -29,7 +33,7 @@ static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val)
 	if (little_endian)
 		return le32_to_cpu((__force __le32)val);
 	else
-		return (__force u32)val;
+		return be32_to_cpu((__force __be32)val);
 }
 
 static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val)
@@ -37,7 +41,7 @@ static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val)
 	if (little_endian)
 		return (__force __virtio32)cpu_to_le32(val);
 	else
-		return (__force __virtio32)val;
+		return (__force __virtio32)cpu_to_be32(val);
 }
 
 static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val)
@@ -45,7 +49,7 @@ static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val)
 	if (little_endian)
 		return le64_to_cpu((__force __le64)val);
 	else
-		return (__force u64)val;
+		return be64_to_cpu((__force __be64)val);
 }
 
 static inline __virtio64 __cpu_to_virtio64(bool little_endian, u64 val)
@@ -53,7 +57,7 @@ static inline __virtio64 __cpu_to_virtio64(bool little_endian, u64 val)
 	if (little_endian)
 		return (__force __virtio64)cpu_to_le64(val);
 	else
-		return (__force __virtio64)val;
+		return (__force __virtio64)cpu_to_be64(val);
 }
 
 #endif /* _LINUX_VIRTIO_BYTEORDER */
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 170947eb11b5..e5ce8ab0b8b0 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -207,7 +207,8 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
 
 static inline bool virtio_is_little_endian(struct virtio_device *vdev)
 {
-	return virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
+	return virtio_has_feature(vdev, VIRTIO_F_VERSION_1) ||
+		virtio_legacy_is_little_endian();
 }
 
 /* Memory accessors */
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index 3ed62efc2bc5..bc6c28d04263 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -228,7 +228,8 @@ static inline void vringh_notify(struct vringh *vrh)
 
 static inline bool vringh_is_little_endian(const struct vringh *vrh)
 {
-	return vrh->little_endian;
+	return vrh->little_endian ||
+		virtio_legacy_is_little_endian();
 }
 
 static inline u16 vringh16_to_cpu(const struct vringh *vrh, __virtio16 val)
-- 
cgit v1.2.3


From 4af34b572a85c44c55491a10693535a79627c478 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Mon, 1 Jun 2015 11:04:26 +0200
Subject: drivers: soc: sunxi: Introduce SoC driver to map SRAMs

The Allwinner SoCs have a handful of SRAM that can be either mapped to be
accessible by devices or the CPU.

That mapping is controlled by an SRAM controller, and that mapping might
not be set by the bootloader, for example if the device wasn't used at all,
or if we're using solutions like the U-Boot's Falcon Boot.

We could also imagine changing this at runtime for example to change the
mapping of these SRAMs to use them for suspend/resume or runtime memory
rate change, if that ever happens.

These use cases require some API in the kernel to control that mapping,
exported through a drivers/soc driver.

This driver also implement a debugfs file that shows the SRAM found in the
system, the current mapping and the SRAM that have been claimed by some
drivers in the kernel.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Hans de Goede <hdegoede@redhat.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 .../devicetree/bindings/soc/sunxi/sram.txt         |  72 ++++++
 drivers/soc/Kconfig                                |   1 +
 drivers/soc/Makefile                               |   1 +
 drivers/soc/sunxi/Kconfig                          |  10 +
 drivers/soc/sunxi/Makefile                         |   1 +
 drivers/soc/sunxi/sunxi_sram.c                     | 284 +++++++++++++++++++++
 include/linux/soc/sunxi/sunxi_sram.h               |  19 ++
 7 files changed, 388 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/soc/sunxi/sram.txt
 create mode 100644 drivers/soc/sunxi/Kconfig
 create mode 100644 drivers/soc/sunxi/Makefile
 create mode 100644 drivers/soc/sunxi/sunxi_sram.c
 create mode 100644 include/linux/soc/sunxi/sunxi_sram.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/soc/sunxi/sram.txt b/Documentation/devicetree/bindings/soc/sunxi/sram.txt
new file mode 100644
index 000000000000..067698112f5f
--- /dev/null
+++ b/Documentation/devicetree/bindings/soc/sunxi/sram.txt
@@ -0,0 +1,72 @@
+Allwinnner SoC SRAM controllers
+-----------------------------------------------------
+
+The SRAM controller found on most Allwinner devices is represented by
+a regular node for the SRAM controller itself, with sub-nodes
+reprensenting the SRAM handled by the SRAM controller.
+
+Controller Node
+---------------
+
+Required properties:
+- compatible : "allwinner,sun4i-a10-sram-controller"
+- reg : sram controller register offset + length
+
+SRAM nodes
+----------
+
+Each SRAM is described using the mmio-sram bindings documented in
+Documentation/devicetree/bindings/misc/sram.txt
+
+Each SRAM will have SRAM sections that are going to be handled by the
+SRAM controller as subnodes. These sections are represented following
+once again the representation described in the mmio-sram binding.
+
+The valid sections compatible are:
+    - allwinner,sun4i-a10-sram-a3-a4
+    - allwinner,sun4i-a10-sram-d
+
+Devices using SRAM sections
+---------------------------
+
+Some devices need to request to the SRAM controller to map an SRAM for
+their exclusive use.
+
+The relationship between such a device and an SRAM section is
+expressed through the allwinner,sram property, that will take a
+phandle and an argument.
+
+This valid values for this argument are:
+  - 0: CPU
+  - 1: Device
+
+Example
+-------
+sram-controller@01c00000 {
+	compatible = "allwinner,sun4i-a10-sram-controller";
+	reg = <0x01c00000 0x30>;
+	#address-cells = <1>;
+	#size-cells = <1>;
+	ranges;
+
+	sram_a: sram@00000000 {
+		compatible = "mmio-sram";
+		reg = <0x00000000 0xc000>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+		ranges = <0 0x00000000 0xc000>;
+
+		emac_sram: sram-section@8000 {
+			compatible = "allwinner,sun4i-a10-sram-a3-a4";
+			reg = <0x8000 0x4000>;
+			status = "disabled";
+		};
+	};
+};
+
+emac: ethernet@01c0b000 {
+	compatible = "allwinner,sun4i-a10-emac";
+	...
+
+	allwinner,sram = <&emac_sram 1>;
+};
diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig
index d8bde82f0370..96ddecb92254 100644
--- a/drivers/soc/Kconfig
+++ b/drivers/soc/Kconfig
@@ -2,6 +2,7 @@ menu "SOC (System On Chip) specific Drivers"
 
 source "drivers/soc/mediatek/Kconfig"
 source "drivers/soc/qcom/Kconfig"
+source "drivers/soc/sunxi/Kconfig"
 source "drivers/soc/ti/Kconfig"
 source "drivers/soc/versatile/Kconfig"
 
diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile
index 70042b259744..7dc7c0d8a2c1 100644
--- a/drivers/soc/Makefile
+++ b/drivers/soc/Makefile
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_ARCH_MEDIATEK)	+= mediatek/
 obj-$(CONFIG_ARCH_QCOM)		+= qcom/
+obj-$(CONFIG_ARCH_SUNXI)	+= sunxi/
 obj-$(CONFIG_ARCH_TEGRA)	+= tegra/
 obj-$(CONFIG_SOC_TI)		+= ti/
 obj-$(CONFIG_PLAT_VERSATILE)	+= versatile/
diff --git a/drivers/soc/sunxi/Kconfig b/drivers/soc/sunxi/Kconfig
new file mode 100644
index 000000000000..353b07e40176
--- /dev/null
+++ b/drivers/soc/sunxi/Kconfig
@@ -0,0 +1,10 @@
+#
+# Allwinner sunXi SoC drivers
+#
+config SUNXI_SRAM
+	bool
+	default ARCH_SUNXI
+	help
+	  Say y here to enable the SRAM controller support. This
+	  device is responsible on mapping the SRAM in the sunXi SoCs
+	  whether to the CPU/DMA, or to the devices.
diff --git a/drivers/soc/sunxi/Makefile b/drivers/soc/sunxi/Makefile
new file mode 100644
index 000000000000..4cf9dbdf346e
--- /dev/null
+++ b/drivers/soc/sunxi/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SUNXI_SRAM) +=	sunxi_sram.o
diff --git a/drivers/soc/sunxi/sunxi_sram.c b/drivers/soc/sunxi/sunxi_sram.c
new file mode 100644
index 000000000000..bc52670c8f4b
--- /dev/null
+++ b/drivers/soc/sunxi/sunxi_sram.c
@@ -0,0 +1,284 @@
+/*
+ * Allwinner SoCs SRAM Controller Driver
+ *
+ * Copyright (C) 2015 Maxime Ripard
+ *
+ * Author: Maxime Ripard <maxime.ripard@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+
+#include <linux/soc/sunxi/sunxi_sram.h>
+
+struct sunxi_sram_func {
+	char	*func;
+	u8	val;
+};
+
+struct sunxi_sram_data {
+	char			*name;
+	u8			reg;
+	u8			offset;
+	u8			width;
+	struct sunxi_sram_func	*func;
+	struct list_head	list;
+};
+
+struct sunxi_sram_desc {
+	struct sunxi_sram_data	data;
+	bool			claimed;
+};
+
+#define SUNXI_SRAM_MAP(_val, _func)				\
+	{							\
+		.func = _func,					\
+		.val = _val,					\
+	}
+
+#define SUNXI_SRAM_DATA(_name, _reg, _off, _width, ...)		\
+	{							\
+		.name = _name,					\
+		.reg = _reg,					\
+		.offset = _off,					\
+		.width = _width,				\
+		.func = (struct sunxi_sram_func[]){		\
+			__VA_ARGS__, { } },			\
+	}
+
+static struct sunxi_sram_desc sun4i_a10_sram_a3_a4 = {
+	.data	= SUNXI_SRAM_DATA("A3-A4", 0x4, 0x4, 2,
+				  SUNXI_SRAM_MAP(0, "cpu"),
+				  SUNXI_SRAM_MAP(1, "emac")),
+};
+
+static struct sunxi_sram_desc sun4i_a10_sram_d = {
+	.data	= SUNXI_SRAM_DATA("D", 0x4, 0x0, 1,
+				  SUNXI_SRAM_MAP(0, "cpu"),
+				  SUNXI_SRAM_MAP(1, "usb-otg")),
+};
+
+static const struct of_device_id sunxi_sram_dt_ids[] = {
+	{
+		.compatible	= "allwinner,sun4i-a10-sram-a3-a4",
+		.data		= &sun4i_a10_sram_a3_a4.data,
+	},
+	{
+		.compatible	= "allwinner,sun4i-a10-sram-d",
+		.data		= &sun4i_a10_sram_d.data,
+	},
+	{}
+};
+
+static struct device *sram_dev;
+static LIST_HEAD(claimed_sram);
+static DEFINE_SPINLOCK(sram_lock);
+static void __iomem *base;
+
+static int sunxi_sram_show(struct seq_file *s, void *data)
+{
+	struct device_node *sram_node, *section_node;
+	const struct sunxi_sram_data *sram_data;
+	const struct of_device_id *match;
+	struct sunxi_sram_func *func;
+	const __be32 *sram_addr_p, *section_addr_p;
+	u32 val;
+
+	seq_puts(s, "Allwinner sunXi SRAM\n");
+	seq_puts(s, "--------------------\n\n");
+
+	for_each_child_of_node(sram_dev->of_node, sram_node) {
+		sram_addr_p = of_get_address(sram_node, 0, NULL, NULL);
+
+		seq_printf(s, "sram@%08x\n",
+			   be32_to_cpu(*sram_addr_p));
+
+		for_each_child_of_node(sram_node, section_node) {
+			match = of_match_node(sunxi_sram_dt_ids, section_node);
+			if (!match)
+				continue;
+			sram_data = match->data;
+
+			section_addr_p = of_get_address(section_node, 0,
+							NULL, NULL);
+
+			seq_printf(s, "\tsection@%04x\t(%s)\n",
+				   be32_to_cpu(*section_addr_p),
+				   sram_data->name);
+
+			val = readl(base + sram_data->reg);
+			val >>= sram_data->offset;
+			val &= sram_data->width;
+
+			for (func = sram_data->func; func->func; func++) {
+				seq_printf(s, "\t\t%s%c\n", func->func,
+					   func->val == val ? '*' : ' ');
+			}
+		}
+
+		seq_puts(s, "\n");
+	}
+
+	return 0;
+}
+
+static int sunxi_sram_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sunxi_sram_show, inode->i_private);
+}
+
+static const struct file_operations sunxi_sram_fops = {
+	.open = sunxi_sram_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static inline struct sunxi_sram_desc *to_sram_desc(const struct sunxi_sram_data *data)
+{
+	return container_of(data, struct sunxi_sram_desc, data);
+}
+
+static const struct sunxi_sram_data *sunxi_sram_of_parse(struct device_node *node,
+							 unsigned int *value)
+{
+	const struct of_device_id *match;
+	struct of_phandle_args args;
+	int ret;
+
+	ret = of_parse_phandle_with_fixed_args(node, "allwinner,sram", 1, 0,
+					       &args);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!of_device_is_available(args.np)) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	if (value)
+		*value = args.args[0];
+
+	match = of_match_node(sunxi_sram_dt_ids, args.np);
+	if (!match) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	of_node_put(args.np);
+	return match->data;
+
+err:
+	of_node_put(args.np);
+	return ERR_PTR(ret);
+}
+
+int sunxi_sram_claim(struct device *dev)
+{
+	const struct sunxi_sram_data *sram_data;
+	struct sunxi_sram_desc *sram_desc;
+	unsigned int device;
+	u32 val, mask;
+
+	if (IS_ERR(base))
+		return -EPROBE_DEFER;
+
+	if (!dev || !dev->of_node)
+		return -EINVAL;
+
+	sram_data = sunxi_sram_of_parse(dev->of_node, &device);
+	if (IS_ERR(sram_data))
+		return PTR_ERR(sram_data);
+
+	sram_desc = to_sram_desc(sram_data);
+
+	spin_lock(&sram_lock);
+
+	if (sram_desc->claimed) {
+		spin_unlock(&sram_lock);
+		return -EBUSY;
+	}
+
+	mask = GENMASK(sram_data->offset + sram_data->width, sram_data->offset);
+	val = readl(base + sram_data->reg);
+	val &= ~mask;
+	writel(val | ((device << sram_data->offset) & mask),
+	       base + sram_data->reg);
+
+	spin_unlock(&sram_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(sunxi_sram_claim);
+
+int sunxi_sram_release(struct device *dev)
+{
+	const struct sunxi_sram_data *sram_data;
+	struct sunxi_sram_desc *sram_desc;
+
+	if (!dev || !dev->of_node)
+		return -EINVAL;
+
+	sram_data = sunxi_sram_of_parse(dev->of_node, NULL);
+	if (IS_ERR(sram_data))
+		return -EINVAL;
+
+	sram_desc = to_sram_desc(sram_data);
+
+	spin_lock(&sram_lock);
+	sram_desc->claimed = false;
+	spin_unlock(&sram_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(sunxi_sram_release);
+
+static int sunxi_sram_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	struct dentry *d;
+
+	sram_dev = &pdev->dev;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(base))
+		return PTR_ERR(base);
+
+	of_platform_populate(pdev->dev.of_node, NULL, NULL, &pdev->dev);
+
+	d = debugfs_create_file("sram", S_IRUGO, NULL, NULL,
+				&sunxi_sram_fops);
+	if (!d)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static const struct of_device_id sunxi_sram_dt_match[] = {
+	{ .compatible = "allwinner,sun4i-a10-sram-controller" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, sunxi_sram_dt_match);
+
+static struct platform_driver sunxi_sram_driver = {
+	.driver = {
+		.name		= "sunxi-sram",
+		.of_match_table	= sunxi_sram_dt_match,
+	},
+	.probe	= sunxi_sram_probe,
+};
+module_platform_driver(sunxi_sram_driver);
+
+MODULE_AUTHOR("Maxime Ripard <maxime.ripard@free-electrons.com>");
+MODULE_DESCRIPTION("Allwinner sunXi SRAM Controller Driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/soc/sunxi/sunxi_sram.h b/include/linux/soc/sunxi/sunxi_sram.h
new file mode 100644
index 000000000000..c5f663bba9c2
--- /dev/null
+++ b/include/linux/soc/sunxi/sunxi_sram.h
@@ -0,0 +1,19 @@
+/*
+ * Allwinner SoCs SRAM Controller Driver
+ *
+ * Copyright (C) 2015 Maxime Ripard
+ *
+ * Author: Maxime Ripard <maxime.ripard@free-electrons.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#ifndef _SUNXI_SRAM_H_
+#define _SUNXI_SRAM_H_
+
+int sunxi_sram_claim(struct device *dev);
+int sunxi_sram_release(struct device *dev);
+
+#endif /* _SUNXI_SRAM_H_ */
-- 
cgit v1.2.3


From f26cdc8536ad50fb802a0445f836b4f94ca09ae7 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 1 Jun 2015 09:29:53 -0600
Subject: blk-mq: Shared tag enhancements

Storage controllers may expose multiple block devices that share hardware
resources managed by blk-mq. This patch enhances the shared tags so a
low-level driver can access the shared resources not tied to the unshared
h/w contexts. This way the LLD can dynamically add and delete disks and
request queues without having to track all the request_queue hctx's to
iterate outstanding tags.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c     | 38 ++++++++++++++++++++++++++++++++++++++
 block/blk-mq-tag.h     |  1 +
 block/blk-mq.c         | 12 ++++++++++--
 include/linux/blk-mq.h |  4 ++++
 4 files changed, 53 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290cc0644..9b6e28830b82 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void bt_tags_for_each(struct blk_mq_tags *tags,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+	struct request *rq;
+	int bit, i;
+
+	if (!tags->rqs)
+		return;
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(tags, off + bit);
+			fn(rq, data, reserved);
+		}
+
+		off += (1 << bt->bits_per_word);
+	}
+}
+
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv)
+{
+	if (tags->nr_reserved_tags)
+		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
+}
+EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
+
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv)
 {
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;
 
+	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
+		kfree(tags);
+		return NULL;
+	}
+
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b370308..75893a34237d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
 	struct list_head page_list;
 
 	int alloc_policy;
+	cpumask_var_t cpumask;
 };
 
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c382a34fe5ac..ef100fd2cb86 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1525,7 +1525,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;
 
 fail:
@@ -1821,6 +1820,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -2187,6 +2187,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2248,8 +2254,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}
 
 	kfree(set->tags);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2056a99b92f8..37d1602c4f7a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -96,6 +96,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 
 typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
+typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 
 struct blk_mq_ops {
 	/*
@@ -182,6 +183,7 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
 
 enum {
 	BLK_MQ_UNIQUE_TAG_BITS = 16,
@@ -224,6 +226,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
-- 
cgit v1.2.3


From bdef7de4b8d9be4cf7bf5aea977f827310ab3ff0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 1 Jun 2015 14:56:09 -0700
Subject: net: Add priority to packet_offload objects.

When we scan a packet for GRO processing, we want to see the most
common packet types in the front of the offload_base list.

So add a priority field so we can handle this properly.

IPv4/IPv6 get the highest priority with the implicit zero priority
field.

Next comes ethernet with a priority of 10, and then we have the MPLS
types with a priority of 15.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Suggested-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c            | 8 ++++++--
 net/ethernet/eth.c        | 1 +
 net/mpls/mpls_gso.c       | 2 ++
 4 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 51f8d2f5dc3f..6f5f71ff5169 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1997,6 +1997,7 @@ struct offload_callbacks {
 
 struct packet_offload {
 	__be16			 type;	/* This is really htons(ether_type). */
+	u16			 priority;
 	struct offload_callbacks callbacks;
 	struct list_head	 list;
 };
diff --git a/net/core/dev.c b/net/core/dev.c
index 594163d0c6eb..0602e917a305 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -469,10 +469,14 @@ EXPORT_SYMBOL(dev_remove_pack);
  */
 void dev_add_offload(struct packet_offload *po)
 {
-	struct list_head *head = &offload_base;
+	struct packet_offload *elem;
 
 	spin_lock(&offload_lock);
-	list_add_rcu(&po->list, head);
+	list_for_each_entry(elem, &offload_base, list) {
+		if (po->priority < elem->priority)
+			break;
+	}
+	list_add_rcu(&po->list, elem->list.prev);
 	spin_unlock(&offload_lock);
 }
 EXPORT_SYMBOL(dev_add_offload);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index c3325bd2f3fb..7d0e239a6755 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -470,6 +470,7 @@ EXPORT_SYMBOL(eth_gro_complete);
 
 static struct packet_offload eth_packet_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_TEB),
+	.priority = 10,
 	.callbacks = {
 		.gro_receive = eth_gro_receive,
 		.gro_complete = eth_gro_complete,
diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 809df534a720..0183b32da942 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c
@@ -62,6 +62,7 @@ out:
 
 static struct packet_offload mpls_mc_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_MPLS_MC),
+	.priority = 15,
 	.callbacks = {
 		.gso_segment    =	mpls_gso_segment,
 	},
@@ -69,6 +70,7 @@ static struct packet_offload mpls_mc_offload __read_mostly = {
 
 static struct packet_offload mpls_uc_offload __read_mostly = {
 	.type = cpu_to_be16(ETH_P_MPLS_UC),
+	.priority = 15,
 	.callbacks = {
 		.gso_segment    =	mpls_gso_segment,
 	},
-- 
cgit v1.2.3


From 66e5133f19e901a044fa5eaeeb6ecff4545839e5 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Mon, 1 Jun 2015 21:55:06 +0900
Subject: vlan: Add GRO support for non hardware accelerated vlan

Currently packets with non-hardware-accelerated vlan cannot be handled
by GRO. This causes low performance for 802.1ad and stacked vlan, as their
vlan tags are currently not stripped by hardware.

This patch adds GRO support for non-hardware-accelerated vlan and
improves receive performance of them.

Test Environment:
 vlan device (.1Q) on vlan device (.1ad) on ixgbe (82599)

Result:

- Before

$ netperf -t TCP_STREAM -H 192.168.20.2 -l 60
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    60.00    5233.17

Rx side CPU usage:
  %usr      %sys      %irq     %soft     %idle
  0.27     58.03      0.00     41.70      0.00

- After

$ netperf -t TCP_STREAM -H 192.168.20.2 -l 60
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    60.00    7586.85

Rx side CPU usage:
  %usr      %sys      %irq     %soft     %idle
  0.50     25.83      0.00     59.53     14.14

[ Register VLAN offloads with priority 10 -DaveM ]

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 20 +++++++++++
 net/8021q/vlan.c        | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index a40d29846ac2..67ce5bd3b56a 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -628,4 +628,24 @@ static inline netdev_features_t vlan_features_check(const struct sk_buff *skb,
 	return features;
 }
 
+/**
+ * compare_vlan_header - Compare two vlan headers
+ * @h1: Pointer to vlan header
+ * @h2: Pointer to vlan header
+ *
+ * Compare two vlan headers, returns 0 if equal.
+ *
+ * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits.
+ */
+static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1,
+						const struct vlan_hdr *h2)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+	return *(u32 *)h1 ^ *(u32 *)h2;
+#else
+	return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) |
+	       ((__force u32)h1->h_vlan_encapsulated_proto ^
+		(__force u32)h2->h_vlan_encapsulated_proto);
+#endif
+}
 #endif /* !(_LINUX_IF_VLAN_H_) */
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 59555f0f8fc8..d2cd9de4b724 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -618,6 +618,92 @@ out:
 	return err;
 }
 
+static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	struct sk_buff *p, **pp = NULL;
+	struct vlan_hdr *vhdr;
+	unsigned int hlen, off_vlan;
+	const struct packet_offload *ptype;
+	__be16 type;
+	int flush = 1;
+
+	off_vlan = skb_gro_offset(skb);
+	hlen = off_vlan + sizeof(*vhdr);
+	vhdr = skb_gro_header_fast(skb, off_vlan);
+	if (skb_gro_header_hard(skb, hlen)) {
+		vhdr = skb_gro_header_slow(skb, hlen, off_vlan);
+		if (unlikely(!vhdr))
+			goto out;
+	}
+
+	type = vhdr->h_vlan_encapsulated_proto;
+
+	rcu_read_lock();
+	ptype = gro_find_receive_by_type(type);
+	if (!ptype)
+		goto out_unlock;
+
+	flush = 0;
+
+	for (p = *head; p; p = p->next) {
+		struct vlan_hdr *vhdr2;
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
+		if (compare_vlan_header(vhdr, vhdr2))
+			NAPI_GRO_CB(p)->same_flow = 0;
+	}
+
+	skb_gro_pull(skb, sizeof(*vhdr));
+	skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
+	pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
+	__be16 type = vhdr->h_vlan_encapsulated_proto;
+	struct packet_offload *ptype;
+	int err = -ENOENT;
+
+	rcu_read_lock();
+	ptype = gro_find_complete_by_type(type);
+	if (ptype)
+		err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr));
+
+	rcu_read_unlock();
+	return err;
+}
+
+static struct packet_offload vlan_packet_offloads[] __read_mostly = {
+	{
+		.type = cpu_to_be16(ETH_P_8021Q),
+		.priority = 10,
+		.callbacks = {
+			.gro_receive = vlan_gro_receive,
+			.gro_complete = vlan_gro_complete,
+		},
+	},
+	{
+		.type = cpu_to_be16(ETH_P_8021AD),
+		.priority = 10,
+		.callbacks = {
+			.gro_receive = vlan_gro_receive,
+			.gro_complete = vlan_gro_complete,
+		},
+	},
+};
+
 static int __net_init vlan_init_net(struct net *net)
 {
 	struct vlan_net *vn = net_generic(net, vlan_net_id);
@@ -645,6 +731,7 @@ static struct pernet_operations vlan_net_ops = {
 static int __init vlan_proto_init(void)
 {
 	int err;
+	unsigned int i;
 
 	pr_info("%s v%s\n", vlan_fullname, vlan_version);
 
@@ -668,6 +755,9 @@ static int __init vlan_proto_init(void)
 	if (err < 0)
 		goto err5;
 
+	for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+		dev_add_offload(&vlan_packet_offloads[i]);
+
 	vlan_ioctl_set(vlan_ioctl_handler);
 	return 0;
 
@@ -685,7 +775,13 @@ err0:
 
 static void __exit vlan_cleanup_module(void)
 {
+	unsigned int i;
+
 	vlan_ioctl_set(NULL);
+
+	for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+		dev_remove_offload(&vlan_packet_offloads[i]);
+
 	vlan_netlink_fini();
 
 	unregister_netdevice_notifier(&vlan_notifier_block);
-- 
cgit v1.2.3


From 3434d23b694e5cb6e44e966914563406c31c4053 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 21 May 2015 13:33:45 +0530
Subject: clockevents: Add helpers to check the state of a clockevent device

Some clockevent drivers, once migrated to use per-state callbacks,
need to check the state of the clockevent device in their callbacks or
interrupt handler.

Add accessor functions clockevent_state_*() to get this information.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linaro-kernel@lists.linaro.org
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/04a717d490335c688dd7af899fbcede97e1bb8ee.1432192527.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clockchips.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 271fa4c8eb29..64214ad85af9 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -149,6 +149,32 @@ struct clock_event_device {
 	struct module		*owner;
 } ____cacheline_aligned;
 
+/* Helpers to verify state of a clockevent device */
+static inline bool clockevent_state_detached(struct clock_event_device *dev)
+{
+	return dev->state == CLOCK_EVT_STATE_DETACHED;
+}
+
+static inline bool clockevent_state_shutdown(struct clock_event_device *dev)
+{
+	return dev->state == CLOCK_EVT_STATE_SHUTDOWN;
+}
+
+static inline bool clockevent_state_periodic(struct clock_event_device *dev)
+{
+	return dev->state == CLOCK_EVT_STATE_PERIODIC;
+}
+
+static inline bool clockevent_state_oneshot(struct clock_event_device *dev)
+{
+	return dev->state == CLOCK_EVT_STATE_ONESHOT;
+}
+
+static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev)
+{
+	return dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED;
+}
+
 /*
  * Calculate a multiplication factor for scaled math, which is used to convert
  * nanoseconds based values to clock ticks:
-- 
cgit v1.2.3


From a97e2d86a9b88ea9e9a280b594b80f0eec2c955b Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Sun, 31 May 2015 17:15:30 -0400
Subject: IB/core cleanup: Add const on args - device->process_mad

The process_mad device function declares some parameters as "in".  Make those
parameters const and adjust the call tree under process_mad in the various
drivers accordingly.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Hal Rosenstock <hal@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/amso1100/c2_provider.c  |  6 +++---
 drivers/infiniband/hw/cxgb3/iwch_provider.c   |  6 +++---
 drivers/infiniband/hw/cxgb4/provider.c        |  5 +++--
 drivers/infiniband/hw/ehca/ehca_iverbs.h      |  4 ++--
 drivers/infiniband/hw/ehca/ehca_sqp.c         | 14 +++++++-------
 drivers/infiniband/hw/ipath/ipath_mad.c       |  8 ++++----
 drivers/infiniband/hw/ipath/ipath_verbs.h     |  6 +++---
 drivers/infiniband/hw/mlx4/mad.c              | 21 +++++++++++----------
 drivers/infiniband/hw/mlx4/mlx4_ib.h          |  8 ++++----
 drivers/infiniband/hw/mlx5/mad.c              |  8 ++++----
 drivers/infiniband/hw/mlx5/mlx5_ib.h          |  8 ++++----
 drivers/infiniband/hw/mthca/mthca_cmd.c       |  4 ++--
 drivers/infiniband/hw/mthca/mthca_cmd.h       |  4 ++--
 drivers/infiniband/hw/mthca/mthca_dev.h       |  6 +++---
 drivers/infiniband/hw/mthca/mthca_mad.c       | 10 +++++-----
 drivers/infiniband/hw/nes/nes_verbs.c         |  4 ++--
 drivers/infiniband/hw/ocrdma/ocrdma_ah.c      |  6 +++---
 drivers/infiniband/hw/ocrdma/ocrdma_ah.h      |  6 +++---
 drivers/infiniband/hw/qib/qib_mad.c           | 10 +++++-----
 drivers/infiniband/hw/qib/qib_verbs.h         |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/mad.c |  2 +-
 include/linux/mlx5/driver.h                   |  2 +-
 include/rdma/ib_verbs.h                       |  6 +++---
 23 files changed, 80 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
index d396c39918de..0f007a6b188b 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -582,9 +582,9 @@ static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 static int c2_process_mad(struct ib_device *ibdev,
 			  int mad_flags,
 			  u8 port_num,
-			  struct ib_wc *in_wc,
-			  struct ib_grh *in_grh,
-			  struct ib_mad *in_mad, struct ib_mad *out_mad)
+			  const struct ib_wc *in_wc,
+			  const struct ib_grh *in_grh,
+			  const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	pr_debug("%s:%u\n", __func__, __LINE__);
 	return -ENOSYS;
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 061ef08c92e2..19c830ecbb67 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -85,9 +85,9 @@ static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 static int iwch_process_mad(struct ib_device *ibdev,
 			    int mad_flags,
 			    u8 port_num,
-			    struct ib_wc *in_wc,
-			    struct ib_grh *in_grh,
-			    struct ib_mad *in_mad, struct ib_mad *out_mad)
+			    const struct ib_wc *in_wc,
+			    const struct ib_grh *in_grh,
+			    const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	return -ENOSYS;
 }
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index ef08a9f29451..75ea26a32076 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -80,8 +80,9 @@ static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 }
 
 static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags,
-			    u8 port_num, struct ib_wc *in_wc,
-			    struct ib_grh *in_grh, struct ib_mad *in_mad,
+			    u8 port_num, const struct ib_wc *in_wc,
+			    const struct ib_grh *in_grh,
+			    const struct ib_mad *in_mad,
 			    struct ib_mad *out_mad)
 {
 	return -ENOSYS;
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
index 077185b3fbd6..582fc71a8488 100644
--- a/drivers/infiniband/hw/ehca/ehca_iverbs.h
+++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h
@@ -191,8 +191,8 @@ int ehca_dealloc_ucontext(struct ib_ucontext *context);
 int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
 int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-		     struct ib_wc *in_wc, struct ib_grh *in_grh,
-		     struct ib_mad *in_mad,
+		     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		     const struct ib_mad *in_mad,
 		     struct ib_mad *out_mad);
 
 void ehca_poll_eqs(unsigned long data);
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
index dba8f9f8b996..889ccfda6401 100644
--- a/drivers/infiniband/hw/ehca/ehca_sqp.c
+++ b/drivers/infiniband/hw/ehca/ehca_sqp.c
@@ -140,10 +140,10 @@ struct vertcfl {
 } __attribute__ ((packed));
 
 static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
-			     struct ib_wc *in_wc, struct ib_grh *in_grh,
-			     struct ib_mad *in_mad, struct ib_mad *out_mad)
+			     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+			     const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
-	struct ib_perf *in_perf = (struct ib_perf *)in_mad;
+	const struct ib_perf *in_perf = (const struct ib_perf *)in_mad;
 	struct ib_perf *out_perf = (struct ib_perf *)out_mad;
 	struct ib_class_port_info *poi =
 		(struct ib_class_port_info *)out_perf->data;
@@ -187,8 +187,8 @@ static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
 
 		/* if request was globally routed, copy route info */
 		if (in_grh) {
-			struct vertcfl *vertcfl =
-				(struct vertcfl *)&in_grh->version_tclass_flow;
+			const struct vertcfl *vertcfl =
+				(const struct vertcfl *)&in_grh->version_tclass_flow;
 			memcpy(poi->redirect_gid, in_grh->dgid.raw,
 			       sizeof(poi->redirect_gid));
 			tcslfl->tc        = vertcfl->tc;
@@ -217,8 +217,8 @@ perf_reply:
 }
 
 int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-		     struct ib_wc *in_wc, struct ib_grh *in_grh,
-		     struct ib_mad *in_mad, struct ib_mad *out_mad)
+		     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		     const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	int ret;
 
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
index e890e5ba0e01..9e8929e23740 100644
--- a/drivers/infiniband/hw/ipath/ipath_mad.c
+++ b/drivers/infiniband/hw/ipath/ipath_mad.c
@@ -1257,7 +1257,7 @@ static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
 }
 
 static int process_subn(struct ib_device *ibdev, int mad_flags,
-			u8 port_num, struct ib_mad *in_mad,
+			u8 port_num, const struct ib_mad *in_mad,
 			struct ib_mad *out_mad)
 {
 	struct ib_smp *smp = (struct ib_smp *)out_mad;
@@ -1389,7 +1389,7 @@ bail:
 }
 
 static int process_perf(struct ib_device *ibdev, u8 port_num,
-			struct ib_mad *in_mad,
+			const struct ib_mad *in_mad,
 			struct ib_mad *out_mad)
 {
 	struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
@@ -1490,8 +1490,8 @@ bail:
  * This is called by the ib_mad module.
  */
 int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-		      struct ib_wc *in_wc, struct ib_grh *in_grh,
-		      struct ib_mad *in_mad, struct ib_mad *out_mad)
+		      const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		      const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	int ret;
 
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
index ae6cff4abffc..7a2b6a17f844 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -701,9 +701,9 @@ static inline void ipath_schedule_send(struct ipath_qp *qp)
 int ipath_process_mad(struct ib_device *ibdev,
 		      int mad_flags,
 		      u8 port_num,
-		      struct ib_wc *in_wc,
-		      struct ib_grh *in_grh,
-		      struct ib_mad *in_mad, struct ib_mad *out_mad);
+		      const struct ib_wc *in_wc,
+		      const struct ib_grh *in_grh,
+		      const struct ib_mad *in_mad, struct ib_mad *out_mad);
 
 /*
  * Compare the lower 24 bits of the two values.
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 9cd2b002d7ae..614ac6f07ae1 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -111,8 +111,9 @@ __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
 }
 
 int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
-		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
-		 void *in_mad, void *response_mad)
+		 int port, const struct ib_wc *in_wc,
+		 const struct ib_grh *in_grh,
+		 const void *in_mad, void *response_mad)
 {
 	struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
 	void *inbox;
@@ -220,7 +221,7 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
  * Snoop SM MADs for port info, GUID info, and  P_Key table sets, so we can
  * synthesize LID change, Client-Rereg, GID change, and P_Key change events.
  */
-static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, const struct ib_mad *mad,
 		      u16 prev_lid)
 {
 	struct ib_port_info *pinfo;
@@ -356,7 +357,7 @@ static void node_desc_override(struct ib_device *dev,
 	}
 }
 
-static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad)
+static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, const struct ib_mad *mad)
 {
 	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
 	struct ib_mad_send_buf *send_buf;
@@ -722,8 +723,8 @@ static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,
 }
 
 static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-			struct ib_wc *in_wc, struct ib_grh *in_grh,
-			struct ib_mad *in_mad, struct ib_mad *out_mad)
+			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+			const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	u16 slid, prev_lid = 0;
 	int err;
@@ -825,8 +826,8 @@ static void edit_counter(struct mlx4_counter *cnt,
 }
 
 static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-			struct ib_wc *in_wc, struct ib_grh *in_grh,
-			struct ib_mad *in_mad, struct ib_mad *out_mad)
+			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+			const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	struct mlx4_cmd_mailbox *mailbox;
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
@@ -866,8 +867,8 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 }
 
 int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-			struct ib_wc *in_wc, struct ib_grh *in_grh,
-			struct ib_mad *in_mad, struct ib_mad *out_mad)
+			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+			const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	switch (rdma_port_get_link_layer(ibdev, port_num)) {
 	case IB_LINK_LAYER_INFINIBAND:
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index fce3934372a1..645d55ef0604 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -706,11 +706,11 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 		      struct ib_recv_wr **bad_wr);
 
 int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
-		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
-		 void *in_mad, void *response_mad);
+		 int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		 const void *in_mad, void *response_mad);
 int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
-			struct ib_wc *in_wc, struct ib_grh *in_grh,
-			struct ib_mad *in_mad, struct ib_mad *out_mad);
+			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+			const struct ib_mad *in_mad, struct ib_mad *out_mad);
 int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
 void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
 
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 9cf9a37bb5ff..34e519cd4c64 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -41,8 +41,8 @@ enum {
 };
 
 int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
-		 u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh,
-		 void *in_mad, void *response_mad)
+		 u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		 const void *in_mad, void *response_mad)
 {
 	u8 op_modifier = 0;
 
@@ -58,8 +58,8 @@ int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
 }
 
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-			struct ib_wc *in_wc, struct ib_grh *in_grh,
-			struct ib_mad *in_mad, struct ib_mad *out_mad)
+			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+			const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	u16 slid;
 	int err;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index dff1cfcdf476..c6219032d00c 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -525,8 +525,8 @@ void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq)
 void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
 void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
 int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
-		 u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh,
-		 void *in_mad, void *response_mad);
+		 u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		 const void *in_mad, void *response_mad);
 struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
 			   struct mlx5_ib_ah *ah);
 struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
@@ -586,8 +586,8 @@ int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
 int mlx5_ib_unmap_fmr(struct list_head *fmr_list);
 int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr);
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-			struct ib_wc *in_wc, struct ib_grh *in_grh,
-			struct ib_mad *in_mad, struct ib_mad *out_mad);
+			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+			const struct ib_mad *in_mad, struct ib_mad *out_mad);
 struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
 					  struct ib_ucontext *context,
 					  struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index 9d3e5c1ac60e..c7f49bbb0c72 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -1858,8 +1858,8 @@ int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn)
 }
 
 int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
-		  int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
-		  void *in_mad, void *response_mad)
+		  int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		  const void *in_mad, void *response_mad)
 {
 	struct mthca_mailbox *inmailbox, *outmailbox;
 	void *inbox;
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h
index f952244c54de..d2e5b194b938 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.h
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -312,8 +312,8 @@ int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
 		   struct mthca_mailbox *mailbox);
 int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn);
 int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
-		  int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
-		  void *in_mad, void *response_mad);
+		  int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		  const void *in_mad, void *response_mad);
 int mthca_READ_MGM(struct mthca_dev *dev, int index,
 		   struct mthca_mailbox *mailbox);
 int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 7e6a6d64ad4e..b70f9ff23171 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -576,9 +576,9 @@ int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
 int mthca_process_mad(struct ib_device *ibdev,
 		      int mad_flags,
 		      u8 port_num,
-		      struct ib_wc *in_wc,
-		      struct ib_grh *in_grh,
-		      struct ib_mad *in_mad,
+		      const struct ib_wc *in_wc,
+		      const struct ib_grh *in_grh,
+		      const struct ib_mad *in_mad,
 		      struct ib_mad *out_mad);
 int mthca_create_agents(struct mthca_dev *dev);
 void mthca_free_agents(struct mthca_dev *dev);
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
index 8881fa376e06..d54608ca0820 100644
--- a/drivers/infiniband/hw/mthca/mthca_mad.c
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -104,7 +104,7 @@ static void update_sm_ah(struct mthca_dev *dev,
  */
 static void smp_snoop(struct ib_device *ibdev,
 		      u8 port_num,
-		      struct ib_mad *mad,
+		      const struct ib_mad *mad,
 		      u16 prev_lid)
 {
 	struct ib_event event;
@@ -160,7 +160,7 @@ static void node_desc_override(struct ib_device *dev,
 
 static void forward_trap(struct mthca_dev *dev,
 			 u8 port_num,
-			 struct ib_mad *mad)
+			 const struct ib_mad *mad)
 {
 	int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
 	struct ib_mad_send_buf *send_buf;
@@ -195,9 +195,9 @@ static void forward_trap(struct mthca_dev *dev,
 int mthca_process_mad(struct ib_device *ibdev,
 		      int mad_flags,
 		      u8 port_num,
-		      struct ib_wc *in_wc,
-		      struct ib_grh *in_grh,
-		      struct ib_mad *in_mad,
+		      const struct ib_wc *in_wc,
+		      const struct ib_grh *in_grh,
+		      const struct ib_mad *in_mad,
 		      struct ib_mad *out_mad)
 {
 	int err;
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 05530e3f6ff0..0099e419e24f 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -3221,8 +3221,8 @@ static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
  * nes_process_mad
  */
 static int nes_process_mad(struct ib_device *ibdev, int mad_flags,
-		u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh,
-		struct ib_mad *in_mad, struct ib_mad *out_mad)
+		u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	nes_debug(NES_DBG_INIT, "\n");
 	return -ENOSYS;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index d812904f3984..3216bce08a10 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -196,9 +196,9 @@ int ocrdma_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
 int ocrdma_process_mad(struct ib_device *ibdev,
 		       int process_mad_flags,
 		       u8 port_num,
-		       struct ib_wc *in_wc,
-		       struct ib_grh *in_grh,
-		       struct ib_mad *in_mad, struct ib_mad *out_mad)
+		       const struct ib_wc *in_wc,
+		       const struct ib_grh *in_grh,
+		       const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	int status;
 	struct ocrdma_dev *dev;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
index 726a87cf22dc..5c4ae3eba47c 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
@@ -42,7 +42,7 @@ int ocrdma_modify_ah(struct ib_ah *, struct ib_ah_attr *);
 int ocrdma_process_mad(struct ib_device *,
 		       int process_mad_flags,
 		       u8 port_num,
-		       struct ib_wc *in_wc,
-		       struct ib_grh *in_grh,
-		       struct ib_mad *in_mad, struct ib_mad *out_mad);
+		       const struct ib_wc *in_wc,
+		       const struct ib_grh *in_grh,
+		       const struct ib_mad *in_mad, struct ib_mad *out_mad);
 #endif				/* __OCRDMA_AH_H__ */
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
index 395f4046dba2..6ab8ab89d058 100644
--- a/drivers/infiniband/hw/qib/qib_mad.c
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -1854,7 +1854,7 @@ static int pma_set_portcounters_ext(struct ib_pma_mad *pmp,
 }
 
 static int process_subn(struct ib_device *ibdev, int mad_flags,
-			u8 port, struct ib_mad *in_mad,
+			u8 port, const struct ib_mad *in_mad,
 			struct ib_mad *out_mad)
 {
 	struct ib_smp *smp = (struct ib_smp *)out_mad;
@@ -2006,7 +2006,7 @@ bail:
 }
 
 static int process_perf(struct ib_device *ibdev, u8 port,
-			struct ib_mad *in_mad,
+			const struct ib_mad *in_mad,
 			struct ib_mad *out_mad)
 {
 	struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
@@ -2299,7 +2299,7 @@ static int check_cc_key(struct qib_ibport *ibp,
 }
 
 static int process_cc(struct ib_device *ibdev, int mad_flags,
-			u8 port, struct ib_mad *in_mad,
+			u8 port, const struct ib_mad *in_mad,
 			struct ib_mad *out_mad)
 {
 	struct ib_cc_mad *ccp = (struct ib_cc_mad *)out_mad;
@@ -2400,8 +2400,8 @@ bail:
  * This is called by the ib_mad module.
  */
 int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
-		    struct ib_wc *in_wc, struct ib_grh *in_grh,
-		    struct ib_mad *in_mad, struct ib_mad *out_mad)
+		    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		    const struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
 	int ret;
 	struct qib_ibport *ibp = to_iport(ibdev, port);
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index bfc8948fdd35..f2f57749c07d 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -872,8 +872,8 @@ void qib_cap_mask_chg(struct qib_ibport *ibp);
 void qib_sys_guid_chg(struct qib_ibport *ibp);
 void qib_node_desc_chg(struct qib_ibport *ibp);
 int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-		    struct ib_wc *in_wc, struct ib_grh *in_grh,
-		    struct ib_mad *in_mad, struct ib_mad *out_mad);
+		    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		    const struct ib_mad *in_mad, struct ib_mad *out_mad);
 int qib_create_agents(struct qib_ibdev *dev);
 void qib_free_agents(struct qib_ibdev *dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mad.c b/drivers/net/ethernet/mellanox/mlx5/core/mad.c
index ee1b0b965f34..1368dac00da0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mad.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mad.c
@@ -36,7 +36,7 @@
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
 
-int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb,
+int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
 		      u16 opmod, u8 port)
 {
 	struct mlx5_mad_ifc_mbox_in *in = NULL;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9a90e7523dc2..9ec7c93d6fa3 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -696,7 +696,7 @@ int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
 			     u32 *mkey);
 int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn);
 int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn);
-int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb,
+int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
 		      u16 opmod, u8 port);
 void mlx5_pagealloc_init(struct mlx5_core_dev *dev);
 void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index f19739adf80d..48c9acbea53e 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1690,9 +1690,9 @@ struct ib_device {
 	int                        (*process_mad)(struct ib_device *device,
 						  int process_mad_flags,
 						  u8 port_num,
-						  struct ib_wc *in_wc,
-						  struct ib_grh *in_grh,
-						  struct ib_mad *in_mad,
+						  const struct ib_wc *in_wc,
+						  const struct ib_grh *in_grh,
+						  const struct ib_mad *in_mad,
 						  struct ib_mad *out_mad);
 	struct ib_xrcd *	   (*alloc_xrcd)(struct ib_device *device,
 						 struct ib_ucontext *ucontext,
-- 
cgit v1.2.3


From 11f81becca04bb7d2826a9b65bb8d27b0a1bb543 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:15 -0400
Subject: page_writeback: revive cancel_dirty_page() in a restricted form

cancel_dirty_page() had some issues and b9ea25152e56 ("page_writeback:
clean up mess around cancel_dirty_page()") replaced it with
account_page_cleaned() which makes the caller responsible for clearing
the dirty bit; unfortunately, the planned changes for cgroup writeback
support requires synchronization between dirty bit manipulation and
stat updates.  While we can open-code such synchronization in each
account_page_cleaned() callsite, that's gonna be unnecessarily awkward
and verbose.

This patch revives cancel_dirty_page() but in a more restricted form.
All it does is TestClearPageDirty() followed by account_page_cleaned()
invocation if the page was dirty.  This helper covers all
account_page_cleaned() usages except for __delete_from_page_cache()
which is a special case anyway and left alone.  As this leaves no
module user for account_page_cleaned(), EXPORT_SYMBOL() is dropped
from it.

This patch just revives cancel_dirty_page() as a trivial wrapper to
replace equivalent usages and doesn't introduce any functional
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 .../lustre/include/linux/lustre_patchless_compat.h |  4 +---
 fs/buffer.c                                        |  4 ++--
 include/linux/mm.h                                 |  1 +
 mm/page-writeback.c                                | 27 ++++++++++++++++------
 mm/truncate.c                                      |  4 +---
 5 files changed, 25 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
index d72605864b0a..14562788e4e0 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -55,9 +55,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	if (PagePrivate(page))
 		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 
-	if (TestClearPageDirty(page))
-		account_page_cleaned(page, mapping);
-
+	cancel_dirty_page(page);
 	ClearPageMappedToDisk(page);
 	ll_delete_from_page_cache(page);
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index f96173ad62d9..f21327d1f673 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3232,8 +3232,8 @@ int try_to_free_buffers(struct page *page)
 	 * to synchronise against __set_page_dirty_buffers and prevent the
 	 * dirty bit from being lost.
 	 */
-	if (ret && TestClearPageDirty(page))
-		account_page_cleaned(page, mapping);
+	if (ret)
+		cancel_dirty_page(page);
 	spin_unlock(&mapping->private_lock);
 out:
 	if (buffers_to_free) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0755b9fd03a7..a83cf3a6f78e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1215,6 +1215,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping);
 void account_page_cleaned(struct page *page, struct address_space *mapping);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
+void cancel_dirty_page(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5daf5568b9e1..227b867598e1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2112,12 +2112,6 @@ EXPORT_SYMBOL(account_page_dirtied);
 
 /*
  * Helper function for deaccounting dirty page without writeback.
- *
- * Doing this should *normally* only ever be done when a page
- * is truncated, and is not actually mapped anywhere at all. However,
- * fs/buffer.c does this when it notices that somebody has cleaned
- * out all the buffers on a page without actually doing it through
- * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
  */
 void account_page_cleaned(struct page *page, struct address_space *mapping)
 {
@@ -2127,7 +2121,6 @@ void account_page_cleaned(struct page *page, struct address_space *mapping)
 		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
 	}
 }
-EXPORT_SYMBOL(account_page_cleaned);
 
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
@@ -2265,6 +2258,26 @@ int set_page_dirty_lock(struct page *page)
 }
 EXPORT_SYMBOL(set_page_dirty_lock);
 
+/*
+ * This cancels just the dirty bit on the kernel page itself, it does NOT
+ * actually remove dirty bits on any mmap's that may be around. It also
+ * leaves the page tagged dirty, so any sync activity will still find it on
+ * the dirty lists, and in particular, clear_page_dirty_for_io() will still
+ * look at the dirty bits in the VM.
+ *
+ * Doing this should *normally* only ever be done when a page is truncated,
+ * and is not actually mapped anywhere at all. However, fs/buffer.c does
+ * this when it notices that somebody has cleaned out all the buffers on a
+ * page without actually doing it through the VM. Can you say "ext3 is
+ * horribly ugly"? Thought you could.
+ */
+void cancel_dirty_page(struct page *page)
+{
+	if (TestClearPageDirty(page))
+		account_page_cleaned(page, page_mapping(page));
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
 /*
  * Clear a page's dirty flag, while caring for dirty memory accounting.
  * Returns true if the page was previously dirty.
diff --git a/mm/truncate.c b/mm/truncate.c
index 66af9031fae8..0c360259c085 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -116,9 +116,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	 * the VM has canceled the dirty bit (eg ext3 journaling).
 	 * Hence dirty accounting check is placed after invalidation.
 	 */
-	if (TestClearPageDirty(page))
-		account_page_cleaned(page, mapping);
-
+	cancel_dirty_page(page);
 	ClearPageMappedToDisk(page);
 	delete_from_page_cache(page);
 	return 0;
-- 
cgit v1.2.3


From c4843a7593a9df3ff5b1806084cefdfa81dd7c79 Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Fri, 22 May 2015 17:13:16 -0400
Subject: memcg: add per cgroup dirty page accounting

When modifying PG_Dirty on cached file pages, update the new
MEM_CGROUP_STAT_DIRTY counter.  This is done in the same places where
global NR_FILE_DIRTY is managed.  The new memcg stat is visible in the
per memcg memory.stat cgroupfs file.  The most recent past attempt at
this was http://thread.gmane.org/gmane.linux.kernel.cgroups/8632

The new accounting supports future efforts to add per cgroup dirty
page throttling and writeback.  It also helps an administrator break
down a container's memory usage and provides evidence to understand
memcg oom kills (the new dirty count is included in memcg oom kill
messages).

The ability to move page accounting between memcg
(memory.move_charge_at_immigrate) makes this accounting more
complicated than the global counter.  The existing
mem_cgroup_{begin,end}_page_stat() lock is used to serialize move
accounting with stat updates.
Typical update operation:
	memcg = mem_cgroup_begin_page_stat(page)
	if (TestSetPageDirty()) {
		[...]
		mem_cgroup_update_page_stat(memcg)
	}
	mem_cgroup_end_page_stat(memcg)

Summary of mem_cgroup_end_page_stat() overhead:
- Without CONFIG_MEMCG it's a no-op
- With CONFIG_MEMCG and no inter memcg task movement, it's just
  rcu_read_lock()
- With CONFIG_MEMCG and inter memcg  task movement, it's
  rcu_read_lock() + spin_lock_irqsave()

A memcg parameter is added to several routines because their callers
now grab mem_cgroup_begin_page_stat() which returns the memcg later
needed by for mem_cgroup_update_page_stat().

Because mem_cgroup_begin_page_stat() may disable interrupts, some
adjustments are needed:
- move __mark_inode_dirty() from __set_page_dirty() to its caller.
  __mark_inode_dirty() locking does not want interrupts disabled.
- use spin_lock_irqsave(tree_lock) rather than spin_lock_irq() in
  __delete_from_page_cache(), replace_page_cache_page(),
  invalidate_complete_page2(), and __remove_mapping().

   text    data     bss      dec    hex filename
8925147 1774832 1785856 12485835 be84cb vmlinux-!CONFIG_MEMCG-before
8925339 1774832 1785856 12486027 be858b vmlinux-!CONFIG_MEMCG-after
                            +192 text bytes
8965977 1784992 1785856 12536825 bf4bf9 vmlinux-CONFIG_MEMCG-before
8966750 1784992 1785856 12537598 bf4efe vmlinux-CONFIG_MEMCG-after
                            +773 text bytes

Performance tests run on v4.0-rc1-36-g4f671fe2f952.  Lower is better for
all metrics, they're all wall clock or cycle counts.  The read and write
fault benchmarks just measure fault time, they do not include I/O time.

* CONFIG_MEMCG not set:
                            baseline                              patched
  kbuild                 1m25.030000(+-0.088% 3 samples)       1m25.426667(+-0.120% 3 samples)
  dd write 100 MiB          0.859211561 +-15.10%                  0.874162885 +-15.03%
  dd write 200 MiB          1.670653105 +-17.87%                  1.669384764 +-11.99%
  dd write 1000 MiB         8.434691190 +-14.15%                  8.474733215 +-14.77%
  read fault cycles       254.0(+-0.000% 10 samples)            253.0(+-0.000% 10 samples)
  write fault cycles     2021.2(+-3.070% 10 samples)           1984.5(+-1.036% 10 samples)

* CONFIG_MEMCG=y root_memcg:
                            baseline                              patched
  kbuild                 1m25.716667(+-0.105% 3 samples)       1m25.686667(+-0.153% 3 samples)
  dd write 100 MiB          0.855650830 +-14.90%                  0.887557919 +-14.90%
  dd write 200 MiB          1.688322953 +-12.72%                  1.667682724 +-13.33%
  dd write 1000 MiB         8.418601605 +-14.30%                  8.673532299 +-15.00%
  read fault cycles       266.0(+-0.000% 10 samples)            266.0(+-0.000% 10 samples)
  write fault cycles     2051.7(+-1.349% 10 samples)           2049.6(+-1.686% 10 samples)

* CONFIG_MEMCG=y non-root_memcg:
                            baseline                              patched
  kbuild                 1m26.120000(+-0.273% 3 samples)       1m25.763333(+-0.127% 3 samples)
  dd write 100 MiB          0.861723964 +-15.25%                  0.818129350 +-14.82%
  dd write 200 MiB          1.669887569 +-13.30%                  1.698645885 +-13.27%
  dd write 1000 MiB         8.383191730 +-14.65%                  8.351742280 +-14.52%
  read fault cycles       265.7(+-0.172% 10 samples)            267.0(+-0.000% 10 samples)
  write fault cycles     2070.6(+-1.512% 10 samples)           2084.4(+-2.148% 10 samples)

As expected anon page faults are not affected by this patch.

tj: Updated to apply on top of the recent cancel_dirty_page() changes.

Signed-off-by: Sha Zhengju <handai.szj@gmail.com>
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/cgroups/memory.txt |  1 +
 fs/buffer.c                      | 34 +++++++++++++++++++++------
 fs/xfs/xfs_aops.c                | 12 ++++++++--
 include/linux/memcontrol.h       |  1 +
 include/linux/mm.h               |  6 +++--
 include/linux/pagemap.h          |  3 ++-
 mm/filemap.c                     | 31 +++++++++++++++++--------
 mm/memcontrol.c                  | 24 ++++++++++++++++++-
 mm/page-writeback.c              | 50 +++++++++++++++++++++++++++++++++-------
 mm/rmap.c                        |  2 ++
 mm/truncate.c                    | 14 +++++++----
 mm/vmscan.c                      | 17 ++++++++++----
 12 files changed, 156 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index f456b4315e86..ff71e16cc752 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -493,6 +493,7 @@ pgpgin		- # of charging events to the memory cgroup. The charging
 pgpgout		- # of uncharging events to the memory cgroup. The uncharging
 		event happens each time a page is unaccounted from the cgroup.
 swap		- # of bytes of swap usage
+dirty		- # of bytes that are waiting to get written back to the disk.
 writeback	- # of bytes of file/anon cache that are queued for syncing to
 		disk.
 inactive_anon	- # of bytes of anonymous and swap cache memory on inactive
diff --git a/fs/buffer.c b/fs/buffer.c
index f21327d1f673..23b640d5d6e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -623,21 +623,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  *
  * If warn is true, then emit a warning if the page is not uptodate and has
  * not been truncated.
+ *
+ * The caller must hold mem_cgroup_begin_page_stat() lock.
  */
-static void __set_page_dirty(struct page *page,
-		struct address_space *mapping, int warn)
+static void __set_page_dirty(struct page *page, struct address_space *mapping,
+			     struct mem_cgroup *memcg, int warn)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
-		account_page_dirtied(page, mapping);
+		account_page_dirtied(page, mapping, memcg);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 }
 
 /*
@@ -668,6 +669,7 @@ static void __set_page_dirty(struct page *page,
 int __set_page_dirty_buffers(struct page *page)
 {
 	int newly_dirty;
+	struct mem_cgroup *memcg;
 	struct address_space *mapping = page_mapping(page);
 
 	if (unlikely(!mapping))
@@ -683,11 +685,22 @@ int __set_page_dirty_buffers(struct page *page)
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
+	/*
+	 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
+	 * per-memcg dirty page counters.
+	 */
+	memcg = mem_cgroup_begin_page_stat(page);
 	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
 	if (newly_dirty)
-		__set_page_dirty(page, mapping, 1);
+		__set_page_dirty(page, mapping, memcg, 1);
+
+	mem_cgroup_end_page_stat(memcg);
+
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
 	return newly_dirty;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -1158,11 +1171,18 @@ void mark_buffer_dirty(struct buffer_head *bh)
 
 	if (!test_set_buffer_dirty(bh)) {
 		struct page *page = bh->b_page;
+		struct address_space *mapping = NULL;
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_begin_page_stat(page);
 		if (!TestSetPageDirty(page)) {
-			struct address_space *mapping = page_mapping(page);
+			mapping = page_mapping(page);
 			if (mapping)
-				__set_page_dirty(page, mapping, 0);
+				__set_page_dirty(page, mapping, memcg, 0);
 		}
+		mem_cgroup_end_page_stat(memcg);
+		if (mapping)
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 }
 EXPORT_SYMBOL(mark_buffer_dirty);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 095f94c2d8b5..e5099f268032 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1873,6 +1873,7 @@ xfs_vm_set_page_dirty(
 	loff_t			end_offset;
 	loff_t			offset;
 	int			newly_dirty;
+	struct mem_cgroup	*memcg;
 
 	if (unlikely(!mapping))
 		return !TestSetPageDirty(page);
@@ -1892,6 +1893,11 @@ xfs_vm_set_page_dirty(
 			offset += 1 << inode->i_blkbits;
 		} while (bh != head);
 	}
+	/*
+	 * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
+	 * per-memcg dirty page counters.
+	 */
+	memcg = mem_cgroup_begin_page_stat(page);
 	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
@@ -1902,13 +1908,15 @@ xfs_vm_set_page_dirty(
 		spin_lock_irqsave(&mapping->tree_lock, flags);
 		if (page->mapping) {	/* Race with truncate? */
 			WARN_ON_ONCE(!PageUptodate(page));
-			account_page_dirtied(page, mapping);
+			account_page_dirtied(page, mapping, memcg);
 			radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
+	mem_cgroup_end_page_stat(memcg);
+	if (newly_dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	return newly_dirty;
 }
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 72dff5fb0d0c..5fe6411b5e54 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -41,6 +41,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
 	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_DIRTY,          /* # of dirty pages in page cache */
 	MEM_CGROUP_STAT_WRITEBACK,	/* # of pages under writeback */
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a83cf3a6f78e..f48d979ced4b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1211,8 +1211,10 @@ int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
-void account_page_cleaned(struct page *page, struct address_space *mapping);
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg);
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 void cancel_dirty_page(struct page *page);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b3736f7065c..fb0814ca65c7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+extern void __delete_from_page_cache(struct page *page, void *shadow,
+				     struct mem_cgroup *memcg);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 6bf5e42d560a..7b1443dc3ad0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -100,6 +100,7 @@
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
+ *    ->memcg->move_lock	(page_remove_rmap->mem_cgroup_begin_page_stat)
  *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
@@ -174,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
  */
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+			      struct mem_cgroup *memcg)
 {
 	struct address_space *mapping = page->mapping;
 
@@ -210,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 	 * anyway will be cleared before returning page into buddy allocator.
 	 */
 	if (WARN_ON_ONCE(PageDirty(page)))
-		account_page_cleaned(page, mapping);
+		account_page_cleaned(page, mapping, memcg);
 }
 
 /**
@@ -224,14 +227,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 void delete_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
 	void (*freepage)(struct page *);
 
 	BUG_ON(!PageLocked(page));
 
 	freepage = mapping->a_ops->freepage;
-	spin_lock_irq(&mapping->tree_lock);
-	__delete_from_page_cache(page, NULL);
-	spin_unlock_irq(&mapping->tree_lock);
+
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	__delete_from_page_cache(page, NULL, memcg);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 
 	if (freepage)
 		freepage(page);
@@ -470,6 +479,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 	if (!error) {
 		struct address_space *mapping = old->mapping;
 		void (*freepage)(struct page *);
+		struct mem_cgroup *memcg;
+		unsigned long flags;
 
 		pgoff_t offset = old->index;
 		freepage = mapping->a_ops->freepage;
@@ -478,15 +489,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		new->mapping = mapping;
 		new->index = offset;
 
-		spin_lock_irq(&mapping->tree_lock);
-		__delete_from_page_cache(old, NULL);
+		memcg = mem_cgroup_begin_page_stat(old);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		__delete_from_page_cache(old, NULL, memcg);
 		error = radix_tree_insert(&mapping->page_tree, offset, new);
 		BUG_ON(error);
 		mapping->nrpages++;
 		__inc_zone_page_state(new, NR_FILE_PAGES);
 		if (PageSwapBacked(new))
 			__inc_zone_page_state(new, NR_SHMEM);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		mem_cgroup_migrate(old, new, true);
 		radix_tree_preload_end();
 		if (freepage)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 14c2f2017e37..c23c1a3e8e16 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -90,6 +90,7 @@ static const char * const mem_cgroup_stat_names[] = {
 	"rss",
 	"rss_huge",
 	"mapped_file",
+	"dirty",
 	"writeback",
 	"swap",
 };
@@ -2011,6 +2012,7 @@ again:
 
 	return memcg;
 }
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
 
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2029,6 +2031,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
 /**
  * mem_cgroup_update_page_stat - update page state statistics
@@ -4746,6 +4749,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
 	unsigned long flags;
 	int ret;
+	bool anon;
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4771,15 +4775,33 @@ static int mem_cgroup_move_account(struct page *page,
 	if (page->mem_cgroup != from)
 		goto out_unlock;
 
+	anon = PageAnon(page);
+
 	spin_lock_irqsave(&from->move_lock, flags);
 
-	if (!PageAnon(page) && page_mapped(page)) {
+	if (!anon && page_mapped(page)) {
 		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
 			       nr_pages);
 		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
 			       nr_pages);
 	}
 
+	/*
+	 * move_lock grabbed above and caller set from->moving_account, so
+	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+	 * So mapping should be stable for dirty pages.
+	 */
+	if (!anon && PageDirty(page)) {
+		struct address_space *mapping = page_mapping(page);
+
+		if (mapping_cap_account_dirty(mapping)) {
+			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+				       nr_pages);
+			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+				       nr_pages);
+		}
+	}
+
 	if (PageWriteback(page)) {
 		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
 			       nr_pages);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 227b867598e1..bdeecad00489 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2090,15 +2090,20 @@ int __set_page_dirty_no_writeback(struct page *page)
 
 /*
  * Helper function for set_page_dirty family.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
+ *
  * NOTE: This relies on being atomic wrt interrupts.
  */
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg)
 {
 	trace_writeback_dirty_page(page, mapping);
 
 	if (mapping_cap_account_dirty(mapping)) {
 		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
 
+		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);
 		__inc_bdi_stat(bdi, BDI_RECLAIMABLE);
@@ -2112,10 +2117,14 @@ EXPORT_SYMBOL(account_page_dirtied);
 
 /*
  * Helper function for deaccounting dirty page without writeback.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
  */
-void account_page_cleaned(struct page *page, struct address_space *mapping)
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+			  struct mem_cgroup *memcg)
 {
 	if (mapping_cap_account_dirty(mapping)) {
+		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
 		dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
 		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
@@ -2136,26 +2145,34 @@ void account_page_cleaned(struct page *page, struct address_space *mapping)
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_begin_page_stat(page);
 	if (!TestSetPageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 		unsigned long flags;
 
-		if (!mapping)
+		if (!mapping) {
+			mem_cgroup_end_page_stat(memcg);
 			return 1;
+		}
 
 		spin_lock_irqsave(&mapping->tree_lock, flags);
 		BUG_ON(page_mapping(page) != mapping);
 		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-		account_page_dirtied(page, mapping);
+		account_page_dirtied(page, mapping, memcg);
 		radix_tree_tag_set(&mapping->page_tree, page_index(page),
 				   PAGECACHE_TAG_DIRTY);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
+
 		if (mapping->host) {
 			/* !PageAnon && !swapper_space */
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		}
 		return 1;
 	}
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2273,8 +2290,20 @@ EXPORT_SYMBOL(set_page_dirty_lock);
  */
 void cancel_dirty_page(struct page *page)
 {
-	if (TestClearPageDirty(page))
-		account_page_cleaned(page, page_mapping(page));
+	struct address_space *mapping = page_mapping(page);
+
+	if (mapping_cap_account_dirty(mapping)) {
+		struct mem_cgroup *memcg;
+
+		memcg = mem_cgroup_begin_page_stat(page);
+
+		if (TestClearPageDirty(page))
+			account_page_cleaned(page, mapping, memcg);
+
+		mem_cgroup_end_page_stat(memcg);
+	} else {
+		ClearPageDirty(page);
+	}
 }
 EXPORT_SYMBOL(cancel_dirty_page);
 
@@ -2295,6 +2324,8 @@ EXPORT_SYMBOL(cancel_dirty_page);
 int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
+	struct mem_cgroup *memcg;
+	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
 
@@ -2334,13 +2365,16 @@ int clear_page_dirty_for_io(struct page *page)
 		 * always locked coming in here, so we get the desired
 		 * exclusion.
 		 */
+		memcg = mem_cgroup_begin_page_stat(page);
 		if (TestClearPageDirty(page)) {
+			mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(inode_to_bdi(mapping->host),
 					BDI_RECLAIMABLE);
-			return 1;
+			ret = 1;
 		}
-		return 0;
+		mem_cgroup_end_page_stat(memcg);
+		return ret;
 	}
 	return TestClearPageDirty(page);
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 24dd3f9fee27..8fc556ce2dcb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -30,6 +30,8 @@
  *             swap_lock (in swap_duplicate, swap_info_get)
  *               mmlist_lock (in mmput, drain_mmlist and others)
  *               mapping->private_lock (in __set_page_dirty_buffers)
+ *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                   mapping->tree_lock (widely used)
  *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  *                 sb_lock (within inode_lock in fs/fs-writeback.c)
diff --git a/mm/truncate.c b/mm/truncate.c
index 0c360259c085..76e35ad97102 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -510,19 +510,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
 static int
 invalidate_complete_page2(struct address_space *mapping, struct page *page)
 {
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
 	if (page->mapping != mapping)
 		return 0;
 
 	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (PageDirty(page))
 		goto failed;
 
 	BUG_ON(page_has_private(page));
-	__delete_from_page_cache(page, NULL);
-	spin_unlock_irq(&mapping->tree_lock);
+	__delete_from_page_cache(page, NULL, memcg);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 
 	if (mapping->a_ops->freepage)
 		mapping->a_ops->freepage(page);
@@ -530,7 +535,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
 failed:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e8eadd71bac..7582f9fcda92 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -579,10 +579,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 static int __remove_mapping(struct address_space *mapping, struct page *page,
 			    bool reclaimed)
 {
+	unsigned long flags;
+	struct mem_cgroup *memcg;
+
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
 
-	spin_lock_irq(&mapping->tree_lock);
+	memcg = mem_cgroup_begin_page_stat(page);
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	/*
 	 * The non racy check for a busy page.
 	 *
@@ -620,7 +624,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		swp_entry_t swap = { .val = page_private(page) };
 		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 		swapcache_free(swap);
 	} else {
 		void (*freepage)(struct page *);
@@ -640,8 +645,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		if (reclaimed && page_is_file_cache(page) &&
 		    !mapping_exiting(mapping))
 			shadow = workingset_eviction(mapping, page);
-		__delete_from_page_cache(page, shadow);
-		spin_unlock_irq(&mapping->tree_lock);
+		__delete_from_page_cache(page, shadow, memcg);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		mem_cgroup_end_page_stat(memcg);
 
 		if (freepage != NULL)
 			freepage(page);
@@ -650,7 +656,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	return 1;
 
 cannot_free:
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	mem_cgroup_end_page_stat(memcg);
 	return 0;
 }
 
-- 
cgit v1.2.3


From eea8f41cc58849e354ecf8b95bd7f806e1d1f703 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:17 -0400
Subject: blkcg: move block/blk-cgroup.h to include/linux/blk-cgroup.h

cgroup aware writeback support will require exposing some of blkcg
details.  In preprataion, move block/blk-cgroup.h to
include/linux/blk-cgroup.h.  This patch is pure file move.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         |   2 +-
 block/blk-cgroup.h         | 603 ---------------------------------------------
 block/blk-core.c           |   2 +-
 block/blk-sysfs.c          |   2 +-
 block/blk-throttle.c       |   2 +-
 block/cfq-iosched.c        |   2 +-
 block/elevator.c           |   2 +-
 include/linux/blk-cgroup.h | 603 +++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 609 insertions(+), 609 deletions(-)
 delete mode 100644 block/blk-cgroup.h
 create mode 100644 include/linux/blk-cgroup.h

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ac817b750db..c3226ce8c5f6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -19,7 +19,7 @@
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
deleted file mode 100644
index c567865b5f1d..000000000000
--- a/block/blk-cgroup.h
+++ /dev/null
@@ -1,603 +0,0 @@
-#ifndef _BLK_CGROUP_H
-#define _BLK_CGROUP_H
-/*
- * Common Block IO controller cgroup interface
- *
- * Based on ideas and code from CFQ, CFS and BFQ:
- * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- *
- * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
- *		      Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
- * 	              Nauman Rafique <nauman@google.com>
- */
-
-#include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
-#include <linux/seq_file.h>
-#include <linux/radix-tree.h>
-#include <linux/blkdev.h>
-#include <linux/atomic.h>
-
-/* Max limits for throttle policy */
-#define THROTL_IOPS_MAX		UINT_MAX
-
-/* CFQ specific, out here for blkcg->cfq_weight */
-#define CFQ_WEIGHT_MIN		10
-#define CFQ_WEIGHT_MAX		1000
-#define CFQ_WEIGHT_DEFAULT	500
-
-#ifdef CONFIG_BLK_CGROUP
-
-enum blkg_rwstat_type {
-	BLKG_RWSTAT_READ,
-	BLKG_RWSTAT_WRITE,
-	BLKG_RWSTAT_SYNC,
-	BLKG_RWSTAT_ASYNC,
-
-	BLKG_RWSTAT_NR,
-	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
-};
-
-struct blkcg_gq;
-
-struct blkcg {
-	struct cgroup_subsys_state	css;
-	spinlock_t			lock;
-
-	struct radix_tree_root		blkg_tree;
-	struct blkcg_gq			*blkg_hint;
-	struct hlist_head		blkg_list;
-
-	/* TODO: per-policy storage in blkcg */
-	unsigned int			cfq_weight;	/* belongs to cfq */
-	unsigned int			cfq_leaf_weight;
-};
-
-struct blkg_stat {
-	struct u64_stats_sync		syncp;
-	uint64_t			cnt;
-};
-
-struct blkg_rwstat {
-	struct u64_stats_sync		syncp;
-	uint64_t			cnt[BLKG_RWSTAT_NR];
-};
-
-/*
- * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
- * request_queue (q).  This is used by blkcg policies which need to track
- * information per blkcg - q pair.
- *
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
- */
-struct blkg_policy_data {
-	/* the blkg and policy id this per-policy data belongs to */
-	struct blkcg_gq			*blkg;
-	int				plid;
-
-	/* used during policy activation */
-	struct list_head		alloc_node;
-};
-
-/* association between a blk cgroup and a request queue */
-struct blkcg_gq {
-	/* Pointer to the associated request_queue */
-	struct request_queue		*q;
-	struct list_head		q_node;
-	struct hlist_node		blkcg_node;
-	struct blkcg			*blkcg;
-
-	/* all non-root blkcg_gq's are guaranteed to have access to parent */
-	struct blkcg_gq			*parent;
-
-	/* request allocation list for this blkcg-q pair */
-	struct request_list		rl;
-
-	/* reference count */
-	atomic_t			refcnt;
-
-	/* is this blkg online? protected by both blkcg and q locks */
-	bool				online;
-
-	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
-
-	struct rcu_head			rcu_head;
-};
-
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
-
-struct blkcg_policy {
-	int				plid;
-	/* policy specific private data size */
-	size_t				pd_size;
-	/* cgroup files for the policy */
-	struct cftype			*cftypes;
-
-	/* operations */
-	blkcg_pol_init_pd_fn		*pd_init_fn;
-	blkcg_pol_online_pd_fn		*pd_online_fn;
-	blkcg_pol_offline_pd_fn		*pd_offline_fn;
-	blkcg_pol_exit_pd_fn		*pd_exit_fn;
-	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
-};
-
-extern struct blkcg blkcg_root;
-
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-				    struct request_queue *q);
-int blkcg_init_queue(struct request_queue *q);
-void blkcg_drain_queue(struct request_queue *q);
-void blkcg_exit_queue(struct request_queue *q);
-
-/* Blkio controller policy registration */
-int blkcg_policy_register(struct blkcg_policy *pol);
-void blkcg_policy_unregister(struct blkcg_policy *pol);
-int blkcg_activate_policy(struct request_queue *q,
-			  const struct blkcg_policy *pol);
-void blkcg_deactivate_policy(struct request_queue *q,
-			     const struct blkcg_policy *pol);
-
-void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
-		       u64 (*prfill)(struct seq_file *,
-				     struct blkg_policy_data *, int),
-		       const struct blkcg_policy *pol, int data,
-		       bool show_total);
-u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-			 const struct blkg_rwstat *rwstat);
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-		       int off);
-
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-					     int off);
-
-struct blkg_conf_ctx {
-	struct gendisk			*disk;
-	struct blkcg_gq			*blkg;
-	u64				v;
-};
-
-int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-		   const char *input, struct blkg_conf_ctx *ctx);
-void blkg_conf_finish(struct blkg_conf_ctx *ctx);
-
-
-static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
-{
-	return css ? container_of(css, struct blkcg, css) : NULL;
-}
-
-static inline struct blkcg *task_blkcg(struct task_struct *tsk)
-{
-	return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
-}
-
-static inline struct blkcg *bio_blkcg(struct bio *bio)
-{
-	if (bio && bio->bi_css)
-		return css_to_blkcg(bio->bi_css);
-	return task_blkcg(current);
-}
-
-/**
- * blkcg_parent - get the parent of a blkcg
- * @blkcg: blkcg of interest
- *
- * Return the parent blkcg of @blkcg.  Can be called anytime.
- */
-static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
-{
-	return css_to_blkcg(blkcg->css.parent);
-}
-
-/**
- * blkg_to_pdata - get policy private data
- * @blkg: blkg of interest
- * @pol: policy of interest
- *
- * Return pointer to private data associated with the @blkg-@pol pair.
- */
-static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
-						  struct blkcg_policy *pol)
-{
-	return blkg ? blkg->pd[pol->plid] : NULL;
-}
-
-/**
- * pdata_to_blkg - get blkg associated with policy private data
- * @pd: policy private data of interest
- *
- * @pd is policy private data.  Determine the blkg it's associated with.
- */
-static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
-{
-	return pd ? pd->blkg : NULL;
-}
-
-/**
- * blkg_path - format cgroup path of blkg
- * @blkg: blkg of interest
- * @buf: target buffer
- * @buflen: target buffer length
- *
- * Format the path of the cgroup of @blkg into @buf.
- */
-static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
-{
-	char *p;
-
-	p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-	if (!p) {
-		strncpy(buf, "<unavailable>", buflen);
-		return -ENAMETOOLONG;
-	}
-
-	memmove(buf, p, buf + buflen - p);
-	return 0;
-}
-
-/**
- * blkg_get - get a blkg reference
- * @blkg: blkg to get
- *
- * The caller should be holding an existing reference.
- */
-static inline void blkg_get(struct blkcg_gq *blkg)
-{
-	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-	atomic_inc(&blkg->refcnt);
-}
-
-void __blkg_release_rcu(struct rcu_head *rcu);
-
-/**
- * blkg_put - put a blkg reference
- * @blkg: blkg to put
- */
-static inline void blkg_put(struct blkcg_gq *blkg)
-{
-	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-	if (atomic_dec_and_test(&blkg->refcnt))
-		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
-}
-
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-			       bool update_hint);
-
-/**
- * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
- * read locked.  If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs.  The caller may
- * update @pos_css by calling css_rightmost_descendant() to skip subtree.
- * @p_blkg is included in the iteration and the first node to be visited.
- */
-#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)		\
-	css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)	\
-		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
-					      (p_blkg)->q, false)))
-
-/**
- * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Similar to blkg_for_each_descendant_pre() but performs post-order
- * traversal instead.  Synchronization rules are the same.  @p_blkg is
- * included in the iteration and the last node to be visited.
- */
-#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)		\
-	css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)	\
-		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
-					      (p_blkg)->q, false)))
-
-/**
- * blk_get_rl - get request_list to use
- * @q: request_queue of interest
- * @bio: bio which will be attached to the allocated request (may be %NULL)
- *
- * The caller wants to allocate a request from @q to use for @bio.  Find
- * the request_list to use and obtain a reference on it.  Should be called
- * under queue_lock.  This function is guaranteed to return non-%NULL
- * request_list.
- */
-static inline struct request_list *blk_get_rl(struct request_queue *q,
-					      struct bio *bio)
-{
-	struct blkcg *blkcg;
-	struct blkcg_gq *blkg;
-
-	rcu_read_lock();
-
-	blkcg = bio_blkcg(bio);
-
-	/* bypass blkg lookup and use @q->root_rl directly for root */
-	if (blkcg == &blkcg_root)
-		goto root_rl;
-
-	/*
-	 * Try to use blkg->rl.  blkg lookup may fail under memory pressure
-	 * or if either the blkcg or queue is going away.  Fall back to
-	 * root_rl in such cases.
-	 */
-	blkg = blkg_lookup_create(blkcg, q);
-	if (unlikely(IS_ERR(blkg)))
-		goto root_rl;
-
-	blkg_get(blkg);
-	rcu_read_unlock();
-	return &blkg->rl;
-root_rl:
-	rcu_read_unlock();
-	return &q->root_rl;
-}
-
-/**
- * blk_put_rl - put request_list
- * @rl: request_list to put
- *
- * Put the reference acquired by blk_get_rl().  Should be called under
- * queue_lock.
- */
-static inline void blk_put_rl(struct request_list *rl)
-{
-	/* root_rl may not have blkg set */
-	if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
-		blkg_put(rl->blkg);
-}
-
-/**
- * blk_rq_set_rl - associate a request with a request_list
- * @rq: request of interest
- * @rl: target request_list
- *
- * Associate @rq with @rl so that accounting and freeing can know the
- * request_list @rq came from.
- */
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
-{
-	rq->rl = rl;
-}
-
-/**
- * blk_rq_rl - return the request_list a request came from
- * @rq: request of interest
- *
- * Return the request_list @rq is allocated from.
- */
-static inline struct request_list *blk_rq_rl(struct request *rq)
-{
-	return rq->rl;
-}
-
-struct request_list *__blk_queue_next_rl(struct request_list *rl,
-					 struct request_queue *q);
-/**
- * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
- *
- * Should be used under queue_lock.
- */
-#define blk_queue_for_each_rl(rl, q)	\
-	for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
-
-static inline void blkg_stat_init(struct blkg_stat *stat)
-{
-	u64_stats_init(&stat->syncp);
-}
-
-/**
- * blkg_stat_add - add a value to a blkg_stat
- * @stat: target blkg_stat
- * @val: value to add
- *
- * Add @val to @stat.  The caller is responsible for synchronizing calls to
- * this function.
- */
-static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
-{
-	u64_stats_update_begin(&stat->syncp);
-	stat->cnt += val;
-	u64_stats_update_end(&stat->syncp);
-}
-
-/**
- * blkg_stat_read - read the current value of a blkg_stat
- * @stat: blkg_stat to read
- *
- * Read the current value of @stat.  This function can be called without
- * synchroniztion and takes care of u64 atomicity.
- */
-static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
-{
-	unsigned int start;
-	uint64_t v;
-
-	do {
-		start = u64_stats_fetch_begin_irq(&stat->syncp);
-		v = stat->cnt;
-	} while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
-	return v;
-}
-
-/**
- * blkg_stat_reset - reset a blkg_stat
- * @stat: blkg_stat to reset
- */
-static inline void blkg_stat_reset(struct blkg_stat *stat)
-{
-	stat->cnt = 0;
-}
-
-/**
- * blkg_stat_merge - merge a blkg_stat into another
- * @to: the destination blkg_stat
- * @from: the source
- *
- * Add @from's count to @to.
- */
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
-{
-	blkg_stat_add(to, blkg_stat_read(from));
-}
-
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
-{
-	u64_stats_init(&rwstat->syncp);
-}
-
-/**
- * blkg_rwstat_add - add a value to a blkg_rwstat
- * @rwstat: target blkg_rwstat
- * @rw: mask of REQ_{WRITE|SYNC}
- * @val: value to add
- *
- * Add @val to @rwstat.  The counters are chosen according to @rw.  The
- * caller is responsible for synchronizing calls to this function.
- */
-static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
-				   int rw, uint64_t val)
-{
-	u64_stats_update_begin(&rwstat->syncp);
-
-	if (rw & REQ_WRITE)
-		rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
-	else
-		rwstat->cnt[BLKG_RWSTAT_READ] += val;
-	if (rw & REQ_SYNC)
-		rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
-	else
-		rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
-
-	u64_stats_update_end(&rwstat->syncp);
-}
-
-/**
- * blkg_rwstat_read - read the current values of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
- */
-static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
-{
-	unsigned int start;
-	struct blkg_rwstat tmp;
-
-	do {
-		start = u64_stats_fetch_begin_irq(&rwstat->syncp);
-		tmp = *rwstat;
-	} while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
-
-	return tmp;
-}
-
-/**
- * blkg_rwstat_total - read the total count of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Return the total count of @rwstat regardless of the IO direction.  This
- * function can be called without synchronization and takes care of u64
- * atomicity.
- */
-static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
-{
-	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
-
-	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
-}
-
-/**
- * blkg_rwstat_reset - reset a blkg_rwstat
- * @rwstat: blkg_rwstat to reset
- */
-static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
-{
-	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
-}
-
-/**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
- * @to: the destination blkg_rwstat
- * @from: the source
- *
- * Add @from's counts to @to.
- */
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
-				     struct blkg_rwstat *from)
-{
-	struct blkg_rwstat v = blkg_rwstat_read(from);
-	int i;
-
-	u64_stats_update_begin(&to->syncp);
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		to->cnt[i] += v.cnt[i];
-	u64_stats_update_end(&to->syncp);
-}
-
-#else	/* CONFIG_BLK_CGROUP */
-
-struct cgroup;
-struct blkcg;
-
-struct blkg_policy_data {
-};
-
-struct blkcg_gq {
-};
-
-struct blkcg_policy {
-};
-
-static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
-static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
-static inline void blkcg_drain_queue(struct request_queue *q) { }
-static inline void blkcg_exit_queue(struct request_queue *q) { }
-static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
-static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
-static inline int blkcg_activate_policy(struct request_queue *q,
-					const struct blkcg_policy *pol) { return 0; }
-static inline void blkcg_deactivate_policy(struct request_queue *q,
-					   const struct blkcg_policy *pol) { }
-
-static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
-
-static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
-						  struct blkcg_policy *pol) { return NULL; }
-static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
-static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
-static inline void blkg_get(struct blkcg_gq *blkg) { }
-static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline struct request_list *blk_get_rl(struct request_queue *q,
-					      struct bio *bio) { return &q->root_rl; }
-static inline void blk_put_rl(struct request_list *rl) { }
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
-static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
-
-#define blk_queue_for_each_rl(rl, q)	\
-	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
-
-#endif	/* CONFIG_BLK_CGROUP */
-#endif	/* _BLK_CGROUP_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index aa819a58ea24..4afac14d4499 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -32,12 +32,12 @@
 #include <linux/delay.h>
 #include <linux/ratelimit.h>
 #include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 #include "blk-mq.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index faaf36ade7eb..5677eb78557d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -8,9 +8,9 @@
 #include <linux/blkdev.h>
 #include <linux/blktrace_api.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 #include "blk-mq.h"
 
 struct queue_sysfs_entry {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5b9c6d5c3636..b23193518ac7 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -9,7 +9,7 @@
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/blktrace_api.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
 #include "blk.h"
 
 /* Max dispatch from a group in 1 round */
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5da8e6e9ab4b..bc8f42930773 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,8 +14,8 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
 #include "blk.h"
-#include "blk-cgroup.h"
 
 /*
  * tunables
diff --git a/block/elevator.c b/block/elevator.c
index 59794d0d38e3..3bbb48f430e4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -35,11 +35,11 @@
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
new file mode 100644
index 000000000000..c567865b5f1d
--- /dev/null
+++ b/include/linux/blk-cgroup.h
@@ -0,0 +1,603 @@
+#ifndef _BLK_CGROUP_H
+#define _BLK_CGROUP_H
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * 	              Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/cgroup.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/seq_file.h>
+#include <linux/radix-tree.h>
+#include <linux/blkdev.h>
+#include <linux/atomic.h>
+
+/* Max limits for throttle policy */
+#define THROTL_IOPS_MAX		UINT_MAX
+
+/* CFQ specific, out here for blkcg->cfq_weight */
+#define CFQ_WEIGHT_MIN		10
+#define CFQ_WEIGHT_MAX		1000
+#define CFQ_WEIGHT_DEFAULT	500
+
+#ifdef CONFIG_BLK_CGROUP
+
+enum blkg_rwstat_type {
+	BLKG_RWSTAT_READ,
+	BLKG_RWSTAT_WRITE,
+	BLKG_RWSTAT_SYNC,
+	BLKG_RWSTAT_ASYNC,
+
+	BLKG_RWSTAT_NR,
+	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+};
+
+struct blkcg_gq;
+
+struct blkcg {
+	struct cgroup_subsys_state	css;
+	spinlock_t			lock;
+
+	struct radix_tree_root		blkg_tree;
+	struct blkcg_gq			*blkg_hint;
+	struct hlist_head		blkg_list;
+
+	/* TODO: per-policy storage in blkcg */
+	unsigned int			cfq_weight;	/* belongs to cfq */
+	unsigned int			cfq_leaf_weight;
+};
+
+struct blkg_stat {
+	struct u64_stats_sync		syncp;
+	uint64_t			cnt;
+};
+
+struct blkg_rwstat {
+	struct u64_stats_sync		syncp;
+	uint64_t			cnt[BLKG_RWSTAT_NR];
+};
+
+/*
+ * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
+ * request_queue (q).  This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each has its private
+ * data on each blkg, the size of which is determined by
+ * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
+ * together with blkg and invokes pd_init/exit_fn() methods.
+ *
+ * Such private data must embed struct blkg_policy_data (pd) at the
+ * beginning and pd_size can't be smaller than pd.
+ */
+struct blkg_policy_data {
+	/* the blkg and policy id this per-policy data belongs to */
+	struct blkcg_gq			*blkg;
+	int				plid;
+
+	/* used during policy activation */
+	struct list_head		alloc_node;
+};
+
+/* association between a blk cgroup and a request queue */
+struct blkcg_gq {
+	/* Pointer to the associated request_queue */
+	struct request_queue		*q;
+	struct list_head		q_node;
+	struct hlist_node		blkcg_node;
+	struct blkcg			*blkcg;
+
+	/* all non-root blkcg_gq's are guaranteed to have access to parent */
+	struct blkcg_gq			*parent;
+
+	/* request allocation list for this blkcg-q pair */
+	struct request_list		rl;
+
+	/* reference count */
+	atomic_t			refcnt;
+
+	/* is this blkg online? protected by both blkcg and q locks */
+	bool				online;
+
+	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
+
+	struct rcu_head			rcu_head;
+};
+
+typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+
+struct blkcg_policy {
+	int				plid;
+	/* policy specific private data size */
+	size_t				pd_size;
+	/* cgroup files for the policy */
+	struct cftype			*cftypes;
+
+	/* operations */
+	blkcg_pol_init_pd_fn		*pd_init_fn;
+	blkcg_pol_online_pd_fn		*pd_online_fn;
+	blkcg_pol_offline_pd_fn		*pd_offline_fn;
+	blkcg_pol_exit_pd_fn		*pd_exit_fn;
+	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
+};
+
+extern struct blkcg blkcg_root;
+
+struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q);
+int blkcg_init_queue(struct request_queue *q);
+void blkcg_drain_queue(struct request_queue *q);
+void blkcg_exit_queue(struct request_queue *q);
+
+/* Blkio controller policy registration */
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
+int blkcg_activate_policy(struct request_queue *q,
+			  const struct blkcg_policy *pol);
+void blkcg_deactivate_policy(struct request_queue *q,
+			     const struct blkcg_policy *pol);
+
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+		       u64 (*prfill)(struct seq_file *,
+				     struct blkg_policy_data *, int),
+		       const struct blkcg_policy *pol, int data,
+		       bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+			 const struct blkg_rwstat *rwstat);
+u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+		       int off);
+
+u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+					     int off);
+
+struct blkg_conf_ctx {
+	struct gendisk			*disk;
+	struct blkcg_gq			*blkg;
+	u64				v;
+};
+
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+		   const char *input, struct blkg_conf_ctx *ctx);
+void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+
+
+static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct blkcg, css) : NULL;
+}
+
+static inline struct blkcg *task_blkcg(struct task_struct *tsk)
+{
+	return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+}
+
+static inline struct blkcg *bio_blkcg(struct bio *bio)
+{
+	if (bio && bio->bi_css)
+		return css_to_blkcg(bio->bi_css);
+	return task_blkcg(current);
+}
+
+/**
+ * blkcg_parent - get the parent of a blkcg
+ * @blkcg: blkcg of interest
+ *
+ * Return the parent blkcg of @blkcg.  Can be called anytime.
+ */
+static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+{
+	return css_to_blkcg(blkcg->css.parent);
+}
+
+/**
+ * blkg_to_pdata - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+						  struct blkcg_policy *pol)
+{
+	return blkg ? blkg->pd[pol->plid] : NULL;
+}
+
+/**
+ * pdata_to_blkg - get blkg associated with policy private data
+ * @pd: policy private data of interest
+ *
+ * @pd is policy private data.  Determine the blkg it's associated with.
+ */
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+{
+	return pd ? pd->blkg : NULL;
+}
+
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+{
+	char *p;
+
+	p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+	if (!p) {
+		strncpy(buf, "<unavailable>", buflen);
+		return -ENAMETOOLONG;
+	}
+
+	memmove(buf, p, buf + buflen - p);
+	return 0;
+}
+
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding an existing reference.
+ */
+static inline void blkg_get(struct blkcg_gq *blkg)
+{
+	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+	atomic_inc(&blkg->refcnt);
+}
+
+void __blkg_release_rcu(struct rcu_head *rcu);
+
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ */
+static inline void blkg_put(struct blkcg_gq *blkg)
+{
+	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+	if (atomic_dec_and_test(&blkg->refcnt))
+		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
+}
+
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+			       bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
+ * read locked.  If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs.  The caller may
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)		\
+	css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)	\
+		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
+					      (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead.  Synchronization rules are the same.  @p_blkg is
+ * included in the iteration and the last node to be visited.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)		\
+	css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)	\
+		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
+					      (p_blkg)->q, false)))
+
+/**
+ * blk_get_rl - get request_list to use
+ * @q: request_queue of interest
+ * @bio: bio which will be attached to the allocated request (may be %NULL)
+ *
+ * The caller wants to allocate a request from @q to use for @bio.  Find
+ * the request_list to use and obtain a reference on it.  Should be called
+ * under queue_lock.  This function is guaranteed to return non-%NULL
+ * request_list.
+ */
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+					      struct bio *bio)
+{
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	blkcg = bio_blkcg(bio);
+
+	/* bypass blkg lookup and use @q->root_rl directly for root */
+	if (blkcg == &blkcg_root)
+		goto root_rl;
+
+	/*
+	 * Try to use blkg->rl.  blkg lookup may fail under memory pressure
+	 * or if either the blkcg or queue is going away.  Fall back to
+	 * root_rl in such cases.
+	 */
+	blkg = blkg_lookup_create(blkcg, q);
+	if (unlikely(IS_ERR(blkg)))
+		goto root_rl;
+
+	blkg_get(blkg);
+	rcu_read_unlock();
+	return &blkg->rl;
+root_rl:
+	rcu_read_unlock();
+	return &q->root_rl;
+}
+
+/**
+ * blk_put_rl - put request_list
+ * @rl: request_list to put
+ *
+ * Put the reference acquired by blk_get_rl().  Should be called under
+ * queue_lock.
+ */
+static inline void blk_put_rl(struct request_list *rl)
+{
+	/* root_rl may not have blkg set */
+	if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+		blkg_put(rl->blkg);
+}
+
+/**
+ * blk_rq_set_rl - associate a request with a request_list
+ * @rq: request of interest
+ * @rl: target request_list
+ *
+ * Associate @rq with @rl so that accounting and freeing can know the
+ * request_list @rq came from.
+ */
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
+{
+	rq->rl = rl;
+}
+
+/**
+ * blk_rq_rl - return the request_list a request came from
+ * @rq: request of interest
+ *
+ * Return the request_list @rq is allocated from.
+ */
+static inline struct request_list *blk_rq_rl(struct request *rq)
+{
+	return rq->rl;
+}
+
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+					 struct request_queue *q);
+/**
+ * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
+ *
+ * Should be used under queue_lock.
+ */
+#define blk_queue_for_each_rl(rl, q)	\
+	for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
+
+static inline void blkg_stat_init(struct blkg_stat *stat)
+{
+	u64_stats_init(&stat->syncp);
+}
+
+/**
+ * blkg_stat_add - add a value to a blkg_stat
+ * @stat: target blkg_stat
+ * @val: value to add
+ *
+ * Add @val to @stat.  The caller is responsible for synchronizing calls to
+ * this function.
+ */
+static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
+{
+	u64_stats_update_begin(&stat->syncp);
+	stat->cnt += val;
+	u64_stats_update_end(&stat->syncp);
+}
+
+/**
+ * blkg_stat_read - read the current value of a blkg_stat
+ * @stat: blkg_stat to read
+ *
+ * Read the current value of @stat.  This function can be called without
+ * synchroniztion and takes care of u64 atomicity.
+ */
+static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
+{
+	unsigned int start;
+	uint64_t v;
+
+	do {
+		start = u64_stats_fetch_begin_irq(&stat->syncp);
+		v = stat->cnt;
+	} while (u64_stats_fetch_retry_irq(&stat->syncp, start));
+
+	return v;
+}
+
+/**
+ * blkg_stat_reset - reset a blkg_stat
+ * @stat: blkg_stat to reset
+ */
+static inline void blkg_stat_reset(struct blkg_stat *stat)
+{
+	stat->cnt = 0;
+}
+
+/**
+ * blkg_stat_merge - merge a blkg_stat into another
+ * @to: the destination blkg_stat
+ * @from: the source
+ *
+ * Add @from's count to @to.
+ */
+static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+{
+	blkg_stat_add(to, blkg_stat_read(from));
+}
+
+static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+{
+	u64_stats_init(&rwstat->syncp);
+}
+
+/**
+ * blkg_rwstat_add - add a value to a blkg_rwstat
+ * @rwstat: target blkg_rwstat
+ * @rw: mask of REQ_{WRITE|SYNC}
+ * @val: value to add
+ *
+ * Add @val to @rwstat.  The counters are chosen according to @rw.  The
+ * caller is responsible for synchronizing calls to this function.
+ */
+static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+				   int rw, uint64_t val)
+{
+	u64_stats_update_begin(&rwstat->syncp);
+
+	if (rw & REQ_WRITE)
+		rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+	else
+		rwstat->cnt[BLKG_RWSTAT_READ] += val;
+	if (rw & REQ_SYNC)
+		rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+	else
+		rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+
+	u64_stats_update_end(&rwstat->syncp);
+}
+
+/**
+ * blkg_rwstat_read - read the current values of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Read the current snapshot of @rwstat and return it as the return value.
+ * This function can be called without synchronization and takes care of
+ * u64 atomicity.
+ */
+static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+{
+	unsigned int start;
+	struct blkg_rwstat tmp;
+
+	do {
+		start = u64_stats_fetch_begin_irq(&rwstat->syncp);
+		tmp = *rwstat;
+	} while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+
+	return tmp;
+}
+
+/**
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Return the total count of @rwstat regardless of the IO direction.  This
+ * function can be called without synchronization and takes care of u64
+ * atomicity.
+ */
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
+{
+	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
+
+	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+}
+
+/**
+ * blkg_rwstat_reset - reset a blkg_rwstat
+ * @rwstat: blkg_rwstat to reset
+ */
+static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+{
+	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+}
+
+/**
+ * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's counts to @to.
+ */
+static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
+				     struct blkg_rwstat *from)
+{
+	struct blkg_rwstat v = blkg_rwstat_read(from);
+	int i;
+
+	u64_stats_update_begin(&to->syncp);
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		to->cnt[i] += v.cnt[i];
+	u64_stats_update_end(&to->syncp);
+}
+
+#else	/* CONFIG_BLK_CGROUP */
+
+struct cgroup;
+struct blkcg;
+
+struct blkg_policy_data {
+};
+
+struct blkcg_gq {
+};
+
+struct blkcg_policy {
+};
+
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+static inline void blkcg_drain_queue(struct request_queue *q) { }
+static inline void blkcg_exit_queue(struct request_queue *q) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+static inline int blkcg_activate_policy(struct request_queue *q,
+					const struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct request_queue *q,
+					   const struct blkcg_policy *pol) { }
+
+static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+						  struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
+
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+					      struct bio *bio) { return &q->root_rl; }
+static inline void blk_put_rl(struct request_list *rl) { }
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
+static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+
+#define blk_queue_for_each_rl(rl, q)	\
+	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
+#endif	/* CONFIG_BLK_CGROUP */
+#endif	/* _BLK_CGROUP_H */
-- 
cgit v1.2.3


From efa7d1c733d1d2c1a468b85126d70bad9fdf6ba8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:18 -0400
Subject: update !CONFIG_BLK_CGROUP dummies in include/linux/blk-cgroup.h

The header file will be used more widely with the pending cgroup
writeback support and the current set of dummy declarations aren't
enough to handle different config combinations.  Update as follows.

* Drop the struct cgroup declaration.  None of the dummy defs need it.

* Define blkcg as an empty struct instead of just declaring it.

* Wrap dummy function defs in CONFIG_BLOCK.  Some functions use block
  data types and none of them are to be used w/o block enabled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-cgroup.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index c567865b5f1d..51f95b34d3f0 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -558,8 +558,8 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
 
 #else	/* CONFIG_BLK_CGROUP */
 
-struct cgroup;
-struct blkcg;
+struct blkcg {
+};
 
 struct blkg_policy_data {
 };
@@ -570,6 +570,8 @@ struct blkcg_gq {
 struct blkcg_policy {
 };
 
+#ifdef CONFIG_BLOCK
+
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 static inline void blkcg_drain_queue(struct request_queue *q) { }
@@ -599,5 +601,6 @@ static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q
 #define blk_queue_for_each_rl(rl, q)	\
 	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
 
+#endif	/* CONFIG_BLOCK */
 #endif	/* CONFIG_BLK_CGROUP */
 #endif	/* _BLK_CGROUP_H */
-- 
cgit v1.2.3


From 56161634e4824380a67243a4cf3fa52eb1e5d836 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:20 -0400
Subject: memcg: add mem_cgroup_root_css

Add global mem_cgroup_root_css which points to the root memcg css.
This will be used by cgroup writeback support.  If memcg is disabled,
it's defined as ERR_PTR(-EINVAL).

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
aCc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/memcontrol.h | 4 ++++
 mm/memcontrol.c            | 2 ++
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5fe6411b5e54..294498f4f6fc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -68,6 +68,8 @@ enum mem_cgroup_events_index {
 };
 
 #ifdef CONFIG_MEMCG
+extern struct cgroup_subsys_state *mem_cgroup_root_css;
+
 void mem_cgroup_events(struct mem_cgroup *memcg,
 		       enum mem_cgroup_events_index idx,
 		       unsigned int nr);
@@ -196,6 +198,8 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
+#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 				     enum mem_cgroup_events_index idx,
 				     unsigned int nr)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c23c1a3e8e16..b22a92b37480 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -77,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
@@ -4441,6 +4442,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	/* root ? */
 	if (parent_css == NULL) {
 		root_mem_cgroup = memcg;
+		mem_cgroup_root_css = &memcg->css;
 		page_counter_init(&memcg->memory, NULL);
 		memcg->high = PAGE_COUNTER_MAX;
 		memcg->soft_limit = PAGE_COUNTER_MAX;
-- 
cgit v1.2.3


From 496d5e7560dbb84399dbd92316fc33857aa83900 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:21 -0400
Subject: blkcg: add blkcg_root_css

Add global constant blkcg_root_css which points to &blkcg_root.css.
This will be used by cgroup writeback support.  If blkcg is disabled,
it's defined as ERR_PTR(-EINVAL).

v2: The declarations moved to include/linux/blk-cgroup.h as suggested
    by Vivek.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 2 ++
 include/linux/blk-cgroup.h | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2a4f77f2b229..54ec1721cd3e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,6 +30,8 @@ struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
 			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 EXPORT_SYMBOL_GPL(blkcg_root);
 
+struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
 static bool blkcg_policy_enabled(struct request_queue *q,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 51f95b34d3f0..65f0c178fd04 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -134,6 +134,7 @@ struct blkcg_policy {
 };
 
 extern struct blkcg blkcg_root;
+extern struct cgroup_subsys_state * const blkcg_root_css;
 
 struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
@@ -570,6 +571,8 @@ struct blkcg_gq {
 struct blkcg_policy {
 };
 
+#define blkcg_root_css	((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
 #ifdef CONFIG_BLOCK
 
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
-- 
cgit v1.2.3


From ec438699a9ae0856c2ce20a50dd39cdc7e92a732 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:22 -0400
Subject: cgroup, block: implement task_get_css() and use it in
 bio_associate_current()

bio_associate_current() currently open codes task_css() and
css_tryget_online() to find and pin $current's blkcg css.  Abstract it
into task_get_css() which is implemented from cgroup side.  As a task
is always associated with an online css for every subsystem except
while the css_set update is propagating, task_get_css() retries till
css_tryget_online() succeeds.

This is a cleanup and shouldn't lead to noticeable behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c            | 11 +----------
 include/linux/cgroup.h | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 259197d97de1..c4f87018fb79 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2004,7 +2004,6 @@ EXPORT_SYMBOL(bioset_create_nobvec);
 int bio_associate_current(struct bio *bio)
 {
 	struct io_context *ioc;
-	struct cgroup_subsys_state *css;
 
 	if (bio->bi_ioc)
 		return -EBUSY;
@@ -2013,17 +2012,9 @@ int bio_associate_current(struct bio *bio)
 	if (!ioc)
 		return -ENOENT;
 
-	/* acquire active ref on @ioc and associate */
 	get_io_context_active(ioc);
 	bio->bi_ioc = ioc;
-
-	/* associate blkcg if exists */
-	rcu_read_lock();
-	css = task_css(current, blkio_cgrp_id);
-	if (css && css_tryget_online(css))
-		bio->bi_css = css;
-	rcu_read_unlock();
-
+	bio->bi_css = task_get_css(current, blkio_cgrp_id);
 	return 0;
 }
 
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b9cb94c3102a..e7da0aa65b2d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -773,6 +773,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
 	return task_css_check(task, subsys_id, false);
 }
 
+/**
+ * task_get_css - find and get the css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Find the css for the (@task, @subsys_id) combination, increment a
+ * reference on and return it.  This function is guaranteed to return a
+ * valid css.
+ */
+static inline struct cgroup_subsys_state *
+task_get_css(struct task_struct *task, int subsys_id)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	while (true) {
+		css = task_css(task, subsys_id);
+		if (likely(css_tryget_online(css)))
+			break;
+		cpu_relax();
+	}
+	rcu_read_unlock();
+	return css;
+}
+
 /**
  * task_css_is_root - test whether a task belongs to the root css
  * @task: the target task
-- 
cgit v1.2.3


From fd383c2d3cae146337cea809de0d622b8b887e6c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:23 -0400
Subject: blkcg: implement task_get_blkcg_css()

Implement a wrapper around task_get_css() to acquire the blkcg css for
a given task.  The wrapper is necessary for cgroup writeback support
as there will be places outside blkcg proper trying to acquire
blkcg_css and blkio_cgrp_id will be undefined when !CONFIG_BLK_CGROUP.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-cgroup.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 65f0c178fd04..4dc643f2046e 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -195,6 +195,12 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 	return task_blkcg(current);
 }
 
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+	return task_get_css(task, blkio_cgrp_id);
+}
+
 /**
  * blkcg_parent - get the parent of a blkcg
  * @blkcg: blkcg of interest
@@ -573,6 +579,12 @@ struct blkcg_policy {
 
 #define blkcg_root_css	((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+	return NULL;
+}
+
 #ifdef CONFIG_BLOCK
 
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
-- 
cgit v1.2.3


From 1d933cf096e3aea15f1aec8297657b7a846fab63 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:24 -0400
Subject: blkcg: implement bio_associate_blkcg()

Currently, a bio can only be associated with the io_context and blkcg
of %current using bio_associate_current().  This is too restrictive
for cgroup writeback support.  Implement bio_associate_blkcg() which
associates a bio with the specified blkcg.

bio_associate_blkcg() leaves the io_context unassociated.
bio_associate_current() is updated so that it considers a bio as
already associated if it has a blkcg_css, instead of an io_context,
associated with it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c         | 24 +++++++++++++++++++++++-
 include/linux/bio.h |  3 +++
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index c4f87018fb79..2a00d349cd68 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1988,6 +1988,28 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_
 EXPORT_SYMBOL(bioset_create_nobvec);
 
 #ifdef CONFIG_BLK_CGROUP
+
+/**
+ * bio_associate_blkcg - associate a bio with the specified blkcg
+ * @bio: target bio
+ * @blkcg_css: css of the blkcg to associate
+ *
+ * Associate @bio with the blkcg specified by @blkcg_css.  Block layer will
+ * treat @bio as if it were issued by a task which belongs to the blkcg.
+ *
+ * This function takes an extra reference of @blkcg_css which will be put
+ * when @bio is released.  The caller must own @bio and is responsible for
+ * synchronizing calls to this function.
+ */
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+{
+	if (unlikely(bio->bi_css))
+		return -EBUSY;
+	css_get(blkcg_css);
+	bio->bi_css = blkcg_css;
+	return 0;
+}
+
 /**
  * bio_associate_current - associate a bio with %current
  * @bio: target bio
@@ -2005,7 +2027,7 @@ int bio_associate_current(struct bio *bio)
 {
 	struct io_context *ioc;
 
-	if (bio->bi_ioc)
+	if (bio->bi_css)
 		return -EBUSY;
 
 	ioc = current->io_context;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f0291cf64cc5..5e963a6d7c14 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -482,9 +482,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 #ifdef CONFIG_BLK_CGROUP
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 int bio_associate_current(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
 #else	/* CONFIG_BLK_CGROUP */
+static inline int bio_associate_blkcg(struct bio *bio,
+			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
 #endif	/* CONFIG_BLK_CGROUP */
-- 
cgit v1.2.3


From ad7fa852d3d2816d68a138ebc5bc8967aeb7fd86 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 27 May 2015 20:00:02 -0400
Subject: memcg: implement mem_cgroup_css_from_page()

Implement mem_cgroup_css_from_page() which returns the
cgroup_subsys_state of the memcg associated with a given page on the
default hierarchy.  This will be used by cgroup writeback support.

This function assumes that page->mem_cgroup association doesn't change
until the page is released, which is true on the default hierarchy as
long as replace_page_cache_page() is not used.  As the only user of
replace_page_cache_page() is FUSE which won't support cgroup writeback
for the time being, this works for now, and replace_page_cache_page()
will soon be updated so that the invariant actually holds.

Note that the RCU protected page->mem_cgroup access is consistent with
other usages across memcg but ultimately incorrect.  These unlocked
accesses are missing required barriers.  page->mem_cgroup should be
made an RCU pointer and updated and accessed using RCU operations.

v4: Instead of triggering WARN, return the root css on the traditional
    hierarchies.  This makes the function a lot easier to deal with
    especially as there's no light way to synchronize against
    hierarchy rebinding.

v3: s/mem_cgroup_migrate()/mem_cgroup_css_from_page()/

v2: Trigger WARN if the function is used on the traditional
    hierarchies and add comment about the assumed invariant.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/memcontrol.h |  1 +
 mm/memcontrol.c            | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 294498f4f6fc..637ef626008e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -115,6 +115,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b22a92b37480..5c270a03729c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -598,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 	return &memcg->css;
 }
 
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned.  The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+
+	memcg = page->mem_cgroup;
+
+	if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+		memcg = root_mem_cgroup;
+
+	rcu_read_unlock();
+	return &memcg->css;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
-- 
cgit v1.2.3


From 4452226ea276e74fc3e252c88d9bb7e8f8e44bf0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:26 -0400
Subject: writeback: move backing_dev_info->state into bdi_writeback

Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback)
and the role of the separation is unclear.  For cgroup support for
writeback IOs, a bdi will be updated to host multiple wb's where each
wb serves writeback IOs of a different cgroup on the bdi.  To achieve
that, a wb should carry all states necessary for servicing writeback
IOs for a cgroup independently.

This patch moves bdi->state into wb.

* enum bdi_state is renamed to wb_state and the prefix of all enums is
  changed from BDI_ to WB_.

* Explicit zeroing of bdi->state is removed without adding zeoring of
  wb->state as the whole data structure is zeroed on init anyway.

* As there's still only one bdi_writeback per backing_dev_info, all
  uses of bdi->state are mechanically replaced with bdi->wb.state
  introducing no behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: drbd-dev@lists.linbit.com
Cc: Neil Brown <neilb@suse.de>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c               |  1 -
 drivers/block/drbd/drbd_main.c | 10 +++++-----
 drivers/md/dm.c                |  2 +-
 drivers/md/raid1.c             |  4 ++--
 drivers/md/raid10.c            |  2 +-
 fs/fs-writeback.c              | 14 +++++++-------
 include/linux/backing-dev.h    | 24 ++++++++++++------------
 mm/backing-dev.c               | 20 ++++++++++----------
 8 files changed, 38 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 4afac14d4499..c3ba9c3b08d4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -621,7 +621,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = 0;
 	q->backing_dev_info.name = "block";
 	q->node = node_id;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 81fde9ef7f8e..a1518539b858 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2359,7 +2359,7 @@ static void drbd_cleanup(void)
  * @congested_data:	User data
  * @bdi_bits:		Bits the BDI flusher thread is currently interested in
  *
- * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ * Returns 1<<WB_async_congested and/or 1<<WB_sync_congested if we are congested.
  */
 static int drbd_congested(void *congested_data, int bdi_bits)
 {
@@ -2376,14 +2376,14 @@ static int drbd_congested(void *congested_data, int bdi_bits)
 	}
 
 	if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
-		r |= (1 << BDI_async_congested);
+		r |= (1 << WB_async_congested);
 		/* Without good local data, we would need to read from remote,
 		 * and that would need the worker thread as well, which is
 		 * currently blocked waiting for that usermode helper to
 		 * finish.
 		 */
 		if (!get_ldev_if_state(device, D_UP_TO_DATE))
-			r |= (1 << BDI_sync_congested);
+			r |= (1 << WB_sync_congested);
 		else
 			put_ldev(device);
 		r &= bdi_bits;
@@ -2399,9 +2399,9 @@ static int drbd_congested(void *congested_data, int bdi_bits)
 			reason = 'b';
 	}
 
-	if (bdi_bits & (1 << BDI_async_congested) &&
+	if (bdi_bits & (1 << WB_async_congested) &&
 	    test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
-		r |= (1 << BDI_async_congested);
+		r |= (1 << WB_async_congested);
 		reason = reason == 'b' ? 'a' : 'n';
 	}
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 38837f8ea327..2161ed9329c4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2074,7 +2074,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 			 * the query about congestion status of request_queue
 			 */
 			if (dm_request_based(md))
-				r = md->queue->backing_dev_info.state &
+				r = md->queue->backing_dev_info.wb.state &
 				    bdi_bits;
 			else
 				r = dm_table_any_congested(map, bdi_bits);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9157a29c8dbf..f80f1af61ce7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -745,7 +745,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
 	struct r1conf *conf = mddev->private;
 	int i, ret = 0;
 
-	if ((bits & (1 << BDI_async_congested)) &&
+	if ((bits & (1 << WB_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
@@ -760,7 +760,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
 			/* Note the '|| 1' - when read_balance prefers
 			 * non-congested targets, it can be removed
 			 */
-			if ((bits & (1<<BDI_async_congested)) || 1)
+			if ((bits & (1 << WB_async_congested)) || 1)
 				ret |= bdi_congested(&q->backing_dev_info, bits);
 			else
 				ret &= bdi_congested(&q->backing_dev_info, bits);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e793ab6b3570..fca825718f29 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -914,7 +914,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
 	struct r10conf *conf = mddev->private;
 	int i, ret = 0;
 
-	if ((bits & (1 << BDI_async_congested)) &&
+	if ((bits & (1 << WB_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 32a8bbd7a9ad..983312cea245 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -74,7 +74,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60;
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_writeback_running, &bdi->state);
+	return test_bit(WB_writeback_running, &bdi->wb.state);
 }
 EXPORT_SYMBOL(writeback_in_progress);
 
@@ -112,7 +112,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
 static void bdi_wakeup_thread(struct backing_dev_info *bdi)
 {
 	spin_lock_bh(&bdi->wb_lock);
-	if (test_bit(BDI_registered, &bdi->state))
+	if (test_bit(WB_registered, &bdi->wb.state))
 		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 	spin_unlock_bh(&bdi->wb_lock);
 }
@@ -123,7 +123,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 	trace_writeback_queue(bdi, work);
 
 	spin_lock_bh(&bdi->wb_lock);
-	if (!test_bit(BDI_registered, &bdi->state)) {
+	if (!test_bit(WB_registered, &bdi->wb.state)) {
 		if (work->done)
 			complete(work->done);
 		goto out_unlock;
@@ -1057,7 +1057,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 	struct wb_writeback_work *work;
 	long wrote = 0;
 
-	set_bit(BDI_writeback_running, &wb->bdi->state);
+	set_bit(WB_writeback_running, &wb->state);
 	while ((work = get_next_work_item(bdi)) != NULL) {
 
 		trace_writeback_exec(bdi, work);
@@ -1079,7 +1079,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 	 */
 	wrote += wb_check_old_data_flush(wb);
 	wrote += wb_check_background_flush(wb);
-	clear_bit(BDI_writeback_running, &wb->bdi->state);
+	clear_bit(WB_writeback_running, &wb->state);
 
 	return wrote;
 }
@@ -1099,7 +1099,7 @@ void bdi_writeback_workfn(struct work_struct *work)
 	current->flags |= PF_SWAPWRITE;
 
 	if (likely(!current_is_workqueue_rescuer() ||
-		   !test_bit(BDI_registered, &bdi->state))) {
+		   !test_bit(WB_registered, &wb->state))) {
 		/*
 		 * The normal path.  Keep writing back @bdi until its
 		 * work_list is empty.  Note that this path is also taken
@@ -1323,7 +1323,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			spin_unlock(&inode->i_lock);
 			spin_lock(&bdi->wb.list_lock);
 			if (bdi_cap_writeback_dirty(bdi)) {
-				WARN(!test_bit(BDI_registered, &bdi->state),
+				WARN(!test_bit(WB_registered, &bdi->wb.state),
 				     "bdi-%s not registered\n", bdi->name);
 
 				/*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index aff923ae8c4b..eb14f988a63e 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -25,13 +25,13 @@ struct device;
 struct dentry;
 
 /*
- * Bits in backing_dev_info.state
+ * Bits in bdi_writeback.state
  */
-enum bdi_state {
-	BDI_async_congested,	/* The async (write) queue is getting full */
-	BDI_sync_congested,	/* The sync queue is getting full */
-	BDI_registered,		/* bdi_register() was done */
-	BDI_writeback_running,	/* Writeback is in progress */
+enum wb_state {
+	WB_async_congested,	/* The async (write) queue is getting full */
+	WB_sync_congested,	/* The sync queue is getting full */
+	WB_registered,		/* bdi_register() was done */
+	WB_writeback_running,	/* Writeback is in progress */
 };
 
 typedef int (congested_fn)(void *, int);
@@ -49,6 +49,7 @@ enum bdi_stat_item {
 struct bdi_writeback {
 	struct backing_dev_info *bdi;	/* our parent bdi */
 
+	unsigned long state;		/* Always use atomic bitops on this */
 	unsigned long last_old_flush;	/* last old data flush */
 
 	struct delayed_work dwork;	/* work item used for writeback */
@@ -62,7 +63,6 @@ struct bdi_writeback {
 struct backing_dev_info {
 	struct list_head bdi_list;
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
-	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
@@ -250,23 +250,23 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
 {
 	if (bdi->congested_fn)
 		return bdi->congested_fn(bdi->congested_data, bdi_bits);
-	return (bdi->state & bdi_bits);
+	return (bdi->wb.state & bdi_bits);
 }
 
 static inline int bdi_read_congested(struct backing_dev_info *bdi)
 {
-	return bdi_congested(bdi, 1 << BDI_sync_congested);
+	return bdi_congested(bdi, 1 << WB_sync_congested);
 }
 
 static inline int bdi_write_congested(struct backing_dev_info *bdi)
 {
-	return bdi_congested(bdi, 1 << BDI_async_congested);
+	return bdi_congested(bdi, 1 << WB_async_congested);
 }
 
 static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 {
-	return bdi_congested(bdi, (1 << BDI_sync_congested) |
-				  (1 << BDI_async_congested));
+	return bdi_congested(bdi, (1 << WB_sync_congested) |
+				  (1 << WB_async_congested));
 }
 
 enum {
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6dc4580df2af..b23cf0ea5912 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -96,7 +96,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   nr_io,
 		   nr_more_io,
 		   nr_dirty_time,
-		   !list_empty(&bdi->bdi_list), bdi->state);
+		   !list_empty(&bdi->bdi_list), bdi->wb.state);
 #undef K
 
 	return 0;
@@ -280,7 +280,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
 
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
 	spin_lock_bh(&bdi->wb_lock);
-	if (test_bit(BDI_registered, &bdi->state))
+	if (test_bit(WB_registered, &bdi->wb.state))
 		queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
 	spin_unlock_bh(&bdi->wb_lock);
 }
@@ -315,7 +315,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 	bdi->dev = dev;
 
 	bdi_debug_register(bdi, dev_name(dev));
-	set_bit(BDI_registered, &bdi->state);
+	set_bit(WB_registered, &bdi->wb.state);
 
 	spin_lock_bh(&bdi_lock);
 	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
@@ -339,7 +339,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
 	/* Make sure nobody queues further work */
 	spin_lock_bh(&bdi->wb_lock);
-	if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
+	if (!test_and_clear_bit(WB_registered, &bdi->wb.state)) {
 		spin_unlock_bh(&bdi->wb_lock);
 		return;
 	}
@@ -492,11 +492,11 @@ static atomic_t nr_bdi_congested[2];
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
-	enum bdi_state bit;
+	enum wb_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
-	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	if (test_and_clear_bit(bit, &bdi->state))
+	bit = sync ? WB_sync_congested : WB_async_congested;
+	if (test_and_clear_bit(bit, &bdi->wb.state))
 		atomic_dec(&nr_bdi_congested[sync]);
 	smp_mb__after_atomic();
 	if (waitqueue_active(wqh))
@@ -506,10 +506,10 @@ EXPORT_SYMBOL(clear_bdi_congested);
 
 void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
-	enum bdi_state bit;
+	enum wb_state bit;
 
-	bit = sync ? BDI_sync_congested : BDI_async_congested;
-	if (!test_and_set_bit(bit, &bdi->state))
+	bit = sync ? WB_sync_congested : WB_async_congested;
+	if (!test_and_set_bit(bit, &bdi->wb.state))
 		atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
-- 
cgit v1.2.3


From 93f78d882865cb90020d0f80a9523c99cf46924c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:27 -0400
Subject: writeback: move backing_dev_info->bdi_stat[] into bdi_writeback

Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback)
and the role of the separation is unclear.  For cgroup support for
writeback IOs, a bdi will be updated to host multiple wb's where each
wb serves writeback IOs of a different cgroup on the bdi.  To achieve
that, a wb should carry all states necessary for servicing writeback
IOs for a cgroup independently.

This patch moves bdi->bdi_stat[] into wb.

* enum bdi_stat_item is renamed to wb_stat_item and the prefix of all
  enums is changed from BDI_ to WB_.

* BDI_STAT_BATCH() -> WB_STAT_BATCH()

* [__]{add|inc|dec|sum}_wb_stat(bdi, ...) -> [__]{add|inc}_wb_stat(wb, ...)

* bdi_stat[_error]() -> wb_stat[_error]()

* bdi_writeout_inc() -> wb_writeout_inc()

* stat init is moved to bdi_wb_init() and bdi_wb_exit() is added and
  frees stat.

* As there's still only one bdi_writeback per backing_dev_info, all
  uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[]
  introducing no behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           |  2 +-
 fs/fuse/file.c              | 12 ++++----
 fs/nfs/internal.h           |  2 +-
 fs/nfs/write.c              |  3 +-
 include/linux/backing-dev.h | 68 +++++++++++++++++++++------------------------
 mm/backing-dev.c            | 60 ++++++++++++++++++++++++---------------
 mm/page-writeback.c         | 55 ++++++++++++++++++------------------
 7 files changed, 106 insertions(+), 96 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 983312cea245..8873ecd1578c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -840,7 +840,7 @@ static bool over_bground_thresh(struct backing_dev_info *bdi)
 	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
 		return true;
 
-	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
+	if (wb_stat(&bdi->wb, WB_RECLAIMABLE) >
 				bdi_dirty_limit(bdi, background_thresh))
 		return true;
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ef05b5c4cff..8c5e2fa68835 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 	list_del(&req->writepages_entry);
 	for (i = 0; i < req->num_pages; i++) {
-		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
 		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
-		bdi_writeout_inc(bdi);
+		wb_writeout_inc(&bdi->wb);
 	}
 	wake_up(&fi->page_waitq);
 }
@@ -1634,7 +1634,7 @@ static int fuse_writepage_locked(struct page *page)
 	req->end = fuse_writepage_end;
 	req->inode = inode;
 
-	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fc->lock);
@@ -1749,9 +1749,9 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 		copy_highpage(old_req->pages[0], page);
 		spin_unlock(&fc->lock);
 
-		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
 		dec_zone_page_state(page, NR_WRITEBACK_TEMP);
-		bdi_writeout_inc(bdi);
+		wb_writeout_inc(&bdi->wb);
 		fuse_writepage_free(fc, new_req);
 		fuse_request_free(new_req);
 		goto out;
@@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page,
 	req->page_descs[req->num_pages].offset = 0;
 	req->page_descs[req->num_pages].length = PAGE_SIZE;
 
-	inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9e6475bc5ba2..7e3c4604bea8 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -607,7 +607,7 @@ void nfs_mark_page_unstable(struct page *page)
 	struct inode *inode = page_file_mapping(page)->host;
 
 	inc_zone_page_state(page, NR_UNSTABLE_NFS);
-	inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
 	 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d12a4be613a5..94c7ce01dfb1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -853,7 +853,8 @@ static void
 nfs_clear_page_commit(struct page *page)
 {
 	dec_zone_page_state(page, NR_UNSTABLE_NFS);
-	dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+	dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+		    WB_RECLAIMABLE);
 }
 
 /* Called holding inode (/cinfo) lock */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index eb14f988a63e..fe7a907a4e16 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -36,15 +36,15 @@ enum wb_state {
 
 typedef int (congested_fn)(void *, int);
 
-enum bdi_stat_item {
-	BDI_RECLAIMABLE,
-	BDI_WRITEBACK,
-	BDI_DIRTIED,
-	BDI_WRITTEN,
-	NR_BDI_STAT_ITEMS
+enum wb_stat_item {
+	WB_RECLAIMABLE,
+	WB_WRITEBACK,
+	WB_DIRTIED,
+	WB_WRITTEN,
+	NR_WB_STAT_ITEMS
 };
 
-#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
 struct bdi_writeback {
 	struct backing_dev_info *bdi;	/* our parent bdi */
@@ -58,6 +58,8 @@ struct bdi_writeback {
 	struct list_head b_more_io;	/* parked for more writeback */
 	struct list_head b_dirty_time;	/* time stamps are dirty */
 	spinlock_t list_lock;		/* protects the b_* lists */
+
+	struct percpu_counter stat[NR_WB_STAT_ITEMS];
 };
 
 struct backing_dev_info {
@@ -69,8 +71,6 @@ struct backing_dev_info {
 
 	char *name;
 
-	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
-
 	unsigned long bw_time_stamp;	/* last time write bw is updated */
 	unsigned long dirtied_stamp;
 	unsigned long written_stamp;	/* pages written at bw_time_stamp */
@@ -137,78 +137,74 @@ static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 	       !list_empty(&wb->b_more_io);
 }
 
-static inline void __add_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item, s64 amount)
+static inline void __add_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item, s64 amount)
 {
-	__percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+	__percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
-static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void __inc_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item)
 {
-	__add_bdi_stat(bdi, item, 1);
+	__add_wb_stat(wb, item, 1);
 }
 
-static inline void inc_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__inc_bdi_stat(bdi, item);
+	__inc_wb_stat(wb, item);
 	local_irq_restore(flags);
 }
 
-static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void __dec_wb_stat(struct bdi_writeback *wb,
+				 enum wb_stat_item item)
 {
-	__add_bdi_stat(bdi, item, -1);
+	__add_wb_stat(wb, item, -1);
 }
 
-static inline void dec_bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__dec_bdi_stat(bdi, item);
+	__dec_wb_stat(wb, item);
 	local_irq_restore(flags);
 }
 
-static inline s64 bdi_stat(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-	return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+	return percpu_counter_read_positive(&wb->stat[item]);
 }
 
-static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 __wb_stat_sum(struct bdi_writeback *wb,
+				enum wb_stat_item item)
 {
-	return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+	return percpu_counter_sum_positive(&wb->stat[item]);
 }
 
-static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
-		enum bdi_stat_item item)
+static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
 {
 	s64 sum;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	sum = __bdi_stat_sum(bdi, item);
+	sum = __wb_stat_sum(wb, item);
 	local_irq_restore(flags);
 
 	return sum;
 }
 
-extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+extern void wb_writeout_inc(struct bdi_writeback *wb);
 
 /*
  * maximal error of a stat counter.
  */
-static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+static inline unsigned long wb_stat_error(struct bdi_writeback *wb)
 {
 #ifdef CONFIG_SMP
-	return nr_cpu_ids * BDI_STAT_BATCH;
+	return nr_cpu_ids * WB_STAT_BATCH;
 #else
 	return 1;
 #endif
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b23cf0ea5912..7b1d1917b658 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -84,13 +84,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "b_dirty_time:       %10lu\n"
 		   "bdi_list:           %10u\n"
 		   "state:              %10lx\n",
-		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
+		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
 		   K(bdi_thresh),
 		   K(dirty_thresh),
 		   K(background_thresh),
-		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
-		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
+		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
 		   (unsigned long) K(bdi->write_bandwidth),
 		   nr_dirty,
 		   nr_io,
@@ -376,8 +376,10 @@ void bdi_unregister(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_unregister);
 
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 {
+	int i, err;
+
 	memset(wb, 0, sizeof(*wb));
 
 	wb->bdi = bdi;
@@ -388,6 +390,27 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_dirty_time);
 	spin_lock_init(&wb->list_lock);
 	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+
+	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
+		err = percpu_counter_init(&wb->stat[i], 0, GFP_KERNEL);
+		if (err) {
+			while (--i)
+				percpu_counter_destroy(&wb->stat[i]);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+static void bdi_wb_exit(struct bdi_writeback *wb)
+{
+	int i;
+
+	WARN_ON(delayed_work_pending(&wb->dwork));
+
+	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
+		percpu_counter_destroy(&wb->stat[i]);
 }
 
 /*
@@ -397,7 +420,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-	int i, err;
+	int err;
 
 	bdi->dev = NULL;
 
@@ -408,13 +431,9 @@ int bdi_init(struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
 
-	bdi_wb_init(&bdi->wb, bdi);
-
-	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
-		if (err)
-			goto err;
-	}
+	err = bdi_wb_init(&bdi->wb, bdi);
+	if (err)
+		return err;
 
 	bdi->dirty_exceeded = 0;
 
@@ -427,25 +446,20 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->avg_write_bandwidth = INIT_BW;
 
 	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
-
 	if (err) {
-err:
-		while (i--)
-			percpu_counter_destroy(&bdi->bdi_stat[i]);
+		bdi_wb_exit(&bdi->wb);
+		return err;
 	}
 
-	return err;
+	return 0;
 }
 EXPORT_SYMBOL(bdi_init);
 
 void bdi_destroy(struct backing_dev_info *bdi)
 {
-	int i;
-
 	bdi_wb_shutdown(bdi);
 
 	WARN_ON(!list_empty(&bdi->work_list));
-	WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 
 	if (bdi->dev) {
 		bdi_debug_unregister(bdi);
@@ -453,8 +467,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 		bdi->dev = NULL;
 	}
 
-	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
-		percpu_counter_destroy(&bdi->bdi_stat[i]);
+	bdi_wb_exit(&bdi->wb);
+
 	fprop_local_destroy_percpu(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bdeecad00489..dc673a035413 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -396,11 +396,11 @@ static unsigned long wp_next_time(unsigned long cur_time)
  * Increment the BDI's writeout completion count and the global writeout
  * completion count. Called from test_clear_page_writeback().
  */
-static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+static inline void __wb_writeout_inc(struct bdi_writeback *wb)
 {
-	__inc_bdi_stat(bdi, BDI_WRITTEN);
-	__fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
-			       bdi->max_prop_frac);
+	__inc_wb_stat(wb, WB_WRITTEN);
+	__fprop_inc_percpu_max(&writeout_completions, &wb->bdi->completions,
+			       wb->bdi->max_prop_frac);
 	/* First event after period switching was turned off? */
 	if (!unlikely(writeout_period_time)) {
 		/*
@@ -414,15 +414,15 @@ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 	}
 }
 
-void bdi_writeout_inc(struct backing_dev_info *bdi)
+void wb_writeout_inc(struct bdi_writeback *wb)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	__bdi_writeout_inc(bdi);
+	__wb_writeout_inc(wb);
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL_GPL(bdi_writeout_inc);
+EXPORT_SYMBOL_GPL(wb_writeout_inc);
 
 /*
  * Obtain an accurate fraction of the BDI's portion.
@@ -1130,8 +1130,8 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 	if (elapsed < BANDWIDTH_INTERVAL)
 		return;
 
-	dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
-	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+	dirtied = percpu_counter_read(&bdi->wb.stat[WB_DIRTIED]);
+	written = percpu_counter_read(&bdi->wb.stat[WB_WRITTEN]);
 
 	/*
 	 * Skip quiet periods when disk bandwidth is under-utilized.
@@ -1288,7 +1288,8 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 				    unsigned long *bdi_thresh,
 				    unsigned long *bdi_bg_thresh)
 {
-	unsigned long bdi_reclaimable;
+	struct bdi_writeback *wb = &bdi->wb;
+	unsigned long wb_reclaimable;
 
 	/*
 	 * bdi_thresh is not treated as some limiting factor as
@@ -1320,14 +1321,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 	 * actually dirty; with m+n sitting in the percpu
 	 * deltas.
 	 */
-	if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
-		bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-		*bdi_dirty = bdi_reclaimable +
-			bdi_stat_sum(bdi, BDI_WRITEBACK);
+	if (*bdi_thresh < 2 * wb_stat_error(wb)) {
+		wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+		*bdi_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
 	} else {
-		bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-		*bdi_dirty = bdi_reclaimable +
-			bdi_stat(bdi, BDI_WRITEBACK);
+		wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+		*bdi_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
 	}
 }
 
@@ -1514,9 +1513,9 @@ pause:
 		 * In theory 1 page is enough to keep the comsumer-producer
 		 * pipe going: the flusher cleans 1 page => the task dirties 1
 		 * more page. However bdi_dirty has accounting errors.  So use
-		 * the larger and more IO friendly bdi_stat_error.
+		 * the larger and more IO friendly wb_stat_error.
 		 */
-		if (bdi_dirty <= bdi_stat_error(bdi))
+		if (bdi_dirty <= wb_stat_error(&bdi->wb))
 			break;
 
 		if (fatal_signal_pending(current))
@@ -2106,8 +2105,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping,
 		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);
-		__inc_bdi_stat(bdi, BDI_RECLAIMABLE);
-		__inc_bdi_stat(bdi, BDI_DIRTIED);
+		__inc_wb_stat(&bdi->wb, WB_RECLAIMABLE);
+		__inc_wb_stat(&bdi->wb, WB_DIRTIED);
 		task_io_account_write(PAGE_CACHE_SIZE);
 		current->nr_dirtied++;
 		this_cpu_inc(bdp_ratelimits);
@@ -2126,7 +2125,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 	if (mapping_cap_account_dirty(mapping)) {
 		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
-		dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+		dec_wb_stat(&inode_to_bdi(mapping->host)->wb, WB_RECLAIMABLE);
 		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
 	}
 }
@@ -2190,7 +2189,7 @@ void account_page_redirty(struct page *page)
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		current->nr_dirtied--;
 		dec_zone_page_state(page, NR_DIRTIED);
-		dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
+		dec_wb_stat(&inode_to_bdi(mapping->host)->wb, WB_DIRTIED);
 	}
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2369,8 +2368,8 @@ int clear_page_dirty_for_io(struct page *page)
 		if (TestClearPageDirty(page)) {
 			mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
-			dec_bdi_stat(inode_to_bdi(mapping->host),
-					BDI_RECLAIMABLE);
+			dec_wb_stat(&inode_to_bdi(mapping->host)->wb,
+				    WB_RECLAIMABLE);
 			ret = 1;
 		}
 		mem_cgroup_end_page_stat(memcg);
@@ -2398,8 +2397,8 @@ int test_clear_page_writeback(struct page *page)
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
 			if (bdi_cap_account_writeback(bdi)) {
-				__dec_bdi_stat(bdi, BDI_WRITEBACK);
-				__bdi_writeout_inc(bdi);
+				__dec_wb_stat(&bdi->wb, WB_WRITEBACK);
+				__wb_writeout_inc(&bdi->wb);
 			}
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -2433,7 +2432,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
 			if (bdi_cap_account_writeback(bdi))
-				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+				__inc_wb_stat(&bdi->wb, WB_WRITEBACK);
 		}
 		if (!PageDirty(page))
 			radix_tree_tag_clear(&mapping->page_tree,
-- 
cgit v1.2.3


From a88a341a73be4ef035ca26170c849f002797da27 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:28 -0400
Subject: writeback: move bandwidth related fields from backing_dev_info into
 bdi_writeback

Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback)
and the role of the separation is unclear.  For cgroup support for
writeback IOs, a bdi will be updated to host multiple wb's where each
wb serves writeback IOs of a different cgroup on the bdi.  To achieve
that, a wb should carry all states necessary for servicing writeback
IOs for a cgroup independently.

This patch moves bandwidth related fields from backing_dev_info into
bdi_writeback.

* The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp,
  write_bandwidth, avg_write_bandwidth, dirty_ratelimit,
  balanced_dirty_ratelimit, completions and dirty_exceeded.

* writeback_chunk_size() and over_bground_thresh() now take @wb
  instead of @bdi.

* bdi_writeout_fraction(bdi, ...)	-> wb_writeout_fraction(wb, ...)
  bdi_dirty_limit(bdi, ...)		-> wb_dirty_limit(wb, ...)
  bdi_position_ration(bdi, ...)		-> wb_position_ratio(wb, ...)
  bdi_update_writebandwidth(bdi, ...)	-> wb_update_write_bandwidth(wb, ...)
  [__]bdi_update_bandwidth(bdi, ...)	-> [__]wb_update_bandwidth(wb, ...)
  bdi_{max|min}_pause(bdi, ...)		-> wb_{max|min}_pause(wb, ...)
  bdi_dirty_limits(bdi, ...)		-> wb_dirty_limits(wb, ...)

* Init/exits of the relocated fields are moved to bdi_wb_init/exit()
  respectively.  Note that explicit zeroing is dropped in the process
  as wb's are cleared in entirety anyway.

* As there's still only one bdi_writeback per backing_dev_info, all
  uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[]
  introducing no behavior changes.

v2: Typo in description fixed as suggested by Jan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/f2fs/node.c                   |   4 +-
 fs/f2fs/segment.h                |   2 +-
 fs/fs-writeback.c                |  17 ++-
 fs/gfs2/super.c                  |   2 +-
 include/linux/backing-dev.h      |  20 +--
 include/linux/writeback.h        |  19 ++-
 include/trace/events/writeback.h |   8 +-
 mm/backing-dev.c                 |  45 +++----
 mm/page-writeback.c              | 262 ++++++++++++++++++++-------------------
 9 files changed, 187 insertions(+), 192 deletions(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8ab0cf1930bd..d211602e0f86 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 							PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
 	} else if (type == DIRTY_DENTS) {
-		if (sbi->sb->s_bdi->dirty_exceeded)
+		if (sbi->sb->s_bdi->wb.dirty_exceeded)
 			return false;
 		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 				sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else {
-		if (sbi->sb->s_bdi->dirty_exceeded)
+		if (sbi->sb->s_bdi->wb.dirty_exceeded)
 			return false;
 	}
 	return res;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 85d7fa7514b2..6408989857e0 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -713,7 +713,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
  */
 static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 {
-	if (sbi->sb->s_bdi->dirty_exceeded)
+	if (sbi->sb->s_bdi->wb.dirty_exceeded)
 		return 0;
 
 	if (type == DATA)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8873ecd1578c..1945cb9f6b97 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -624,7 +624,7 @@ out:
 	return ret;
 }
 
-static long writeback_chunk_size(struct backing_dev_info *bdi,
+static long writeback_chunk_size(struct bdi_writeback *wb,
 				 struct wb_writeback_work *work)
 {
 	long pages;
@@ -645,7 +645,7 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
 		pages = LONG_MAX;
 	else {
-		pages = min(bdi->avg_write_bandwidth / 2,
+		pages = min(wb->avg_write_bandwidth / 2,
 			    global_dirty_limit / DIRTY_SCOPE);
 		pages = min(pages, work->nr_pages);
 		pages = round_down(pages + MIN_WRITEBACK_PAGES,
@@ -743,7 +743,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		inode->i_state |= I_SYNC;
 		spin_unlock(&inode->i_lock);
 
-		write_chunk = writeback_chunk_size(wb->bdi, work);
+		write_chunk = writeback_chunk_size(wb, work);
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
@@ -830,7 +830,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 	return nr_pages - work.nr_pages;
 }
 
-static bool over_bground_thresh(struct backing_dev_info *bdi)
+static bool over_bground_thresh(struct bdi_writeback *wb)
 {
 	unsigned long background_thresh, dirty_thresh;
 
@@ -840,8 +840,7 @@ static bool over_bground_thresh(struct backing_dev_info *bdi)
 	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
 		return true;
 
-	if (wb_stat(&bdi->wb, WB_RECLAIMABLE) >
-				bdi_dirty_limit(bdi, background_thresh))
+	if (wb_stat(wb, WB_RECLAIMABLE) > wb_dirty_limit(wb, background_thresh))
 		return true;
 
 	return false;
@@ -854,7 +853,7 @@ static bool over_bground_thresh(struct backing_dev_info *bdi)
 static void wb_update_bandwidth(struct bdi_writeback *wb,
 				unsigned long start_time)
 {
-	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
+	__wb_update_bandwidth(wb, 0, 0, 0, 0, 0, start_time);
 }
 
 /*
@@ -906,7 +905,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (work->for_background && !over_bground_thresh(wb->bdi))
+		if (work->for_background && !over_bground_thresh(wb))
 			break;
 
 		/*
@@ -998,7 +997,7 @@ static unsigned long get_nr_dirty_pages(void)
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-	if (over_bground_thresh(wb->bdi)) {
+	if (over_bground_thresh(wb)) {
 
 		struct wb_writeback_work work = {
 			.nr_pages	= LONG_MAX,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 859c6edbf81a..2982445947e1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
-	if (bdi->dirty_exceeded)
+	if (bdi->wb.dirty_exceeded)
 		gfs2_ail1_flush(sdp, wbc);
 	else
 		filemap_fdatawrite(metamapping);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fe7a907a4e16..2ab06049d812 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -60,16 +60,6 @@ struct bdi_writeback {
 	spinlock_t list_lock;		/* protects the b_* lists */
 
 	struct percpu_counter stat[NR_WB_STAT_ITEMS];
-};
-
-struct backing_dev_info {
-	struct list_head bdi_list;
-	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
-	unsigned int capabilities; /* Device capabilities */
-	congested_fn *congested_fn; /* Function pointer if device is md/dm */
-	void *congested_data;	/* Pointer to aux data for congested func */
-
-	char *name;
 
 	unsigned long bw_time_stamp;	/* last time write bw is updated */
 	unsigned long dirtied_stamp;
@@ -88,6 +78,16 @@ struct backing_dev_info {
 
 	struct fprop_local_percpu completions;
 	int dirty_exceeded;
+};
+
+struct backing_dev_info {
+	struct list_head bdi_list;
+	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
+	unsigned int capabilities; /* Device capabilities */
+	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	void *congested_data;	/* Pointer to aux data for congested func */
+
+	char *name;
 
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b2dd371ec0ca..a6b9db7fcee8 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -155,16 +155,15 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
-			       unsigned long dirty);
-
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-			    unsigned long thresh,
-			    unsigned long bg_thresh,
-			    unsigned long dirty,
-			    unsigned long bdi_thresh,
-			    unsigned long bdi_dirty,
-			    unsigned long start_time);
+unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty);
+
+void __wb_update_bandwidth(struct bdi_writeback *wb,
+			   unsigned long thresh,
+			   unsigned long bg_thresh,
+			   unsigned long dirty,
+			   unsigned long bdi_thresh,
+			   unsigned long bdi_dirty,
+			   unsigned long start_time);
 
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 880dd7437172..9b876f6cc81a 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -400,13 +400,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 
 	TP_fast_assign(
 		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-		__entry->write_bw	= KBps(bdi->write_bandwidth);
-		__entry->avg_write_bw	= KBps(bdi->avg_write_bandwidth);
+		__entry->write_bw	= KBps(bdi->wb.write_bandwidth);
+		__entry->avg_write_bw	= KBps(bdi->wb.avg_write_bandwidth);
 		__entry->dirty_rate	= KBps(dirty_rate);
-		__entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit);
+		__entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
 		__entry->task_ratelimit	= KBps(task_ratelimit);
 		__entry->balanced_dirty_ratelimit =
-					  KBps(bdi->balanced_dirty_ratelimit);
+					KBps(bdi->wb.balanced_dirty_ratelimit);
 	),
 
 	TP_printk("bdi %s: "
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7b1d1917b658..9a6c472ebd0b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -66,7 +66,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
-	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+	bdi_thresh = wb_dirty_limit(wb, dirty_thresh);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	seq_printf(m,
@@ -91,7 +91,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   K(background_thresh),
 		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
 		   (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
-		   (unsigned long) K(bdi->write_bandwidth),
+		   (unsigned long) K(wb->write_bandwidth),
 		   nr_dirty,
 		   nr_io,
 		   nr_more_io,
@@ -376,6 +376,11 @@ void bdi_unregister(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_unregister);
 
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW		(100 << (20 - PAGE_SHIFT))
+
 static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 {
 	int i, err;
@@ -391,11 +396,22 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	spin_lock_init(&wb->list_lock);
 	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 
+	wb->bw_time_stamp = jiffies;
+	wb->balanced_dirty_ratelimit = INIT_BW;
+	wb->dirty_ratelimit = INIT_BW;
+	wb->write_bandwidth = INIT_BW;
+	wb->avg_write_bandwidth = INIT_BW;
+
+	err = fprop_local_init_percpu(&wb->completions, GFP_KERNEL);
+	if (err)
+		return err;
+
 	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&wb->stat[i], 0, GFP_KERNEL);
 		if (err) {
 			while (--i)
 				percpu_counter_destroy(&wb->stat[i]);
+			fprop_local_destroy_percpu(&wb->completions);
 			return err;
 		}
 	}
@@ -411,12 +427,9 @@ static void bdi_wb_exit(struct bdi_writeback *wb)
 
 	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
 		percpu_counter_destroy(&wb->stat[i]);
-}
 
-/*
- * Initial write bandwidth: 100 MB/s
- */
-#define INIT_BW		(100 << (20 - PAGE_SHIFT))
+	fprop_local_destroy_percpu(&wb->completions);
+}
 
 int bdi_init(struct backing_dev_info *bdi)
 {
@@ -435,22 +448,6 @@ int bdi_init(struct backing_dev_info *bdi)
 	if (err)
 		return err;
 
-	bdi->dirty_exceeded = 0;
-
-	bdi->bw_time_stamp = jiffies;
-	bdi->written_stamp = 0;
-
-	bdi->balanced_dirty_ratelimit = INIT_BW;
-	bdi->dirty_ratelimit = INIT_BW;
-	bdi->write_bandwidth = INIT_BW;
-	bdi->avg_write_bandwidth = INIT_BW;
-
-	err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
-	if (err) {
-		bdi_wb_exit(&bdi->wb);
-		return err;
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL(bdi_init);
@@ -468,8 +465,6 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	}
 
 	bdi_wb_exit(&bdi->wb);
-
-	fprop_local_destroy_percpu(&bdi->completions);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index dc673a035413..cd39ee91b7bb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -399,7 +399,7 @@ static unsigned long wp_next_time(unsigned long cur_time)
 static inline void __wb_writeout_inc(struct bdi_writeback *wb)
 {
 	__inc_wb_stat(wb, WB_WRITTEN);
-	__fprop_inc_percpu_max(&writeout_completions, &wb->bdi->completions,
+	__fprop_inc_percpu_max(&writeout_completions, &wb->completions,
 			       wb->bdi->max_prop_frac);
 	/* First event after period switching was turned off? */
 	if (!unlikely(writeout_period_time)) {
@@ -427,10 +427,10 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
 /*
  * Obtain an accurate fraction of the BDI's portion.
  */
-static void bdi_writeout_fraction(struct backing_dev_info *bdi,
-		long *numerator, long *denominator)
+static void wb_writeout_fraction(struct bdi_writeback *wb,
+				 long *numerator, long *denominator)
 {
-	fprop_fraction_percpu(&writeout_completions, &bdi->completions,
+	fprop_fraction_percpu(&writeout_completions, &wb->completions,
 				numerator, denominator);
 }
 
@@ -516,11 +516,11 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
 }
 
 /**
- * bdi_dirty_limit - @bdi's share of dirty throttling threshold
- * @bdi: the backing_dev_info to query
+ * wb_dirty_limit - @wb's share of dirty throttling threshold
+ * @wb: bdi_writeback to query
  * @dirty: global dirty limit in pages
  *
- * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * Returns @wb's dirty limit in pages. The term "dirty" in the context of
  * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
  *
  * Note that balance_dirty_pages() will only seriously take it as a hard limit
@@ -528,34 +528,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
  * control. For example, when the device is completely stalled due to some error
  * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
  * In the other normal situations, it acts more gently by throttling the tasks
- * more (rather than completely block them) when the bdi dirty pages go high.
+ * more (rather than completely block them) when the wb dirty pages go high.
  *
  * It allocates high/low dirty limits to fast/slow devices, in order to prevent
  * - starving fast devices
  * - piling up dirty pages (that will take long time to sync) on slow devices
  *
- * The bdi's share of dirty limit will be adapting to its throughput and
+ * The wb's share of dirty limit will be adapting to its throughput and
  * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
  */
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty)
 {
-	u64 bdi_dirty;
+	struct backing_dev_info *bdi = wb->bdi;
+	u64 wb_dirty;
 	long numerator, denominator;
 
 	/*
 	 * Calculate this BDI's share of the dirty ratio.
 	 */
-	bdi_writeout_fraction(bdi, &numerator, &denominator);
+	wb_writeout_fraction(wb, &numerator, &denominator);
 
-	bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-	bdi_dirty *= numerator;
-	do_div(bdi_dirty, denominator);
+	wb_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+	wb_dirty *= numerator;
+	do_div(wb_dirty, denominator);
 
-	bdi_dirty += (dirty * bdi->min_ratio) / 100;
-	if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-		bdi_dirty = dirty * bdi->max_ratio / 100;
+	wb_dirty += (dirty * bdi->min_ratio) / 100;
+	if (wb_dirty > (dirty * bdi->max_ratio) / 100)
+		wb_dirty = dirty * bdi->max_ratio / 100;
 
-	return bdi_dirty;
+	return wb_dirty;
 }
 
 /*
@@ -664,14 +665,14 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *   card's bdi_dirty may rush to many times higher than bdi_setpoint.
  * - the bdi dirty thresh drops quickly due to change of JBOD workload
  */
-static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
-					unsigned long thresh,
-					unsigned long bg_thresh,
-					unsigned long dirty,
-					unsigned long bdi_thresh,
-					unsigned long bdi_dirty)
+static unsigned long wb_position_ratio(struct bdi_writeback *wb,
+				       unsigned long thresh,
+				       unsigned long bg_thresh,
+				       unsigned long dirty,
+				       unsigned long bdi_thresh,
+				       unsigned long bdi_dirty)
 {
-	unsigned long write_bw = bdi->avg_write_bandwidth;
+	unsigned long write_bw = wb->avg_write_bandwidth;
 	unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
 	unsigned long limit = hard_dirty_limit(thresh);
 	unsigned long x_intercept;
@@ -702,12 +703,12 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 	 * consume arbitrary amount of RAM because it is accounted in
 	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
 	 *
-	 * Here, in bdi_position_ratio(), we calculate pos_ratio based on
+	 * Here, in wb_position_ratio(), we calculate pos_ratio based on
 	 * two values: bdi_dirty and bdi_thresh. Let's consider an example:
 	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
 	 * limits are set by default to 10% and 20% (background and throttle).
 	 * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
-	 * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
+	 * wb_dirty_limit(wb, bg_thresh) is about ~4K pages. bdi_setpoint is
 	 * about ~6K pages (as the average of background and throttle bdi
 	 * limits). The 3rd order polynomial will provide positive feedback if
 	 * bdi_dirty is under bdi_setpoint and vice versa.
@@ -717,7 +718,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 	 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
 	 * in the example above).
 	 */
-	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
 		long long bdi_pos_ratio;
 		unsigned long bdi_bg_thresh;
 
@@ -842,13 +843,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 	return pos_ratio;
 }
 
-static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
-				       unsigned long elapsed,
-				       unsigned long written)
+static void wb_update_write_bandwidth(struct bdi_writeback *wb,
+				      unsigned long elapsed,
+				      unsigned long written)
 {
 	const unsigned long period = roundup_pow_of_two(3 * HZ);
-	unsigned long avg = bdi->avg_write_bandwidth;
-	unsigned long old = bdi->write_bandwidth;
+	unsigned long avg = wb->avg_write_bandwidth;
+	unsigned long old = wb->write_bandwidth;
 	u64 bw;
 
 	/*
@@ -861,14 +862,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
 	 * @written may have decreased due to account_page_redirty().
 	 * Avoid underflowing @bw calculation.
 	 */
-	bw = written - min(written, bdi->written_stamp);
+	bw = written - min(written, wb->written_stamp);
 	bw *= HZ;
 	if (unlikely(elapsed > period)) {
 		do_div(bw, elapsed);
 		avg = bw;
 		goto out;
 	}
-	bw += (u64)bdi->write_bandwidth * (period - elapsed);
+	bw += (u64)wb->write_bandwidth * (period - elapsed);
 	bw >>= ilog2(period);
 
 	/*
@@ -881,8 +882,8 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
 		avg += (old - avg) >> 3;
 
 out:
-	bdi->write_bandwidth = bw;
-	bdi->avg_write_bandwidth = avg;
+	wb->write_bandwidth = bw;
+	wb->avg_write_bandwidth = avg;
 }
 
 /*
@@ -947,20 +948,20 @@ static void global_update_bandwidth(unsigned long thresh,
  * Normal bdi tasks will be curbed at or below it in long term.
  * Obviously it should be around (write_bw / N) when there are N dd tasks.
  */
-static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
-				       unsigned long thresh,
-				       unsigned long bg_thresh,
-				       unsigned long dirty,
-				       unsigned long bdi_thresh,
-				       unsigned long bdi_dirty,
-				       unsigned long dirtied,
-				       unsigned long elapsed)
+static void wb_update_dirty_ratelimit(struct bdi_writeback *wb,
+				      unsigned long thresh,
+				      unsigned long bg_thresh,
+				      unsigned long dirty,
+				      unsigned long bdi_thresh,
+				      unsigned long bdi_dirty,
+				      unsigned long dirtied,
+				      unsigned long elapsed)
 {
 	unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
 	unsigned long limit = hard_dirty_limit(thresh);
 	unsigned long setpoint = (freerun + limit) / 2;
-	unsigned long write_bw = bdi->avg_write_bandwidth;
-	unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+	unsigned long write_bw = wb->avg_write_bandwidth;
+	unsigned long dirty_ratelimit = wb->dirty_ratelimit;
 	unsigned long dirty_rate;
 	unsigned long task_ratelimit;
 	unsigned long balanced_dirty_ratelimit;
@@ -972,10 +973,10 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	 * The dirty rate will match the writeout rate in long term, except
 	 * when dirty pages are truncated by userspace or re-dirtied by FS.
 	 */
-	dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+	dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
 
-	pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
-				       bdi_thresh, bdi_dirty);
+	pos_ratio = wb_position_ratio(wb, thresh, bg_thresh, dirty,
+				      bdi_thresh, bdi_dirty);
 	/*
 	 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
 	 */
@@ -1059,31 +1060,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 
 	/*
 	 * For strictlimit case, calculations above were based on bdi counters
-	 * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+	 * and limits (starting from pos_ratio = wb_position_ratio() and up to
 	 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
 	 * Hence, to calculate "step" properly, we have to use bdi_dirty as
 	 * "dirty" and bdi_setpoint as "setpoint".
 	 *
 	 * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
 	 * it's possible that bdi_thresh is close to zero due to inactivity
-	 * of backing device (see the implementation of bdi_dirty_limit()).
+	 * of backing device (see the implementation of wb_dirty_limit()).
 	 */
-	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
 		dirty = bdi_dirty;
 		if (bdi_dirty < 8)
 			setpoint = bdi_dirty + 1;
 		else
 			setpoint = (bdi_thresh +
-				    bdi_dirty_limit(bdi, bg_thresh)) / 2;
+				    wb_dirty_limit(wb, bg_thresh)) / 2;
 	}
 
 	if (dirty < setpoint) {
-		x = min3(bdi->balanced_dirty_ratelimit,
+		x = min3(wb->balanced_dirty_ratelimit,
 			 balanced_dirty_ratelimit, task_ratelimit);
 		if (dirty_ratelimit < x)
 			step = x - dirty_ratelimit;
 	} else {
-		x = max3(bdi->balanced_dirty_ratelimit,
+		x = max3(wb->balanced_dirty_ratelimit,
 			 balanced_dirty_ratelimit, task_ratelimit);
 		if (dirty_ratelimit > x)
 			step = dirty_ratelimit - x;
@@ -1105,22 +1106,22 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	else
 		dirty_ratelimit -= step;
 
-	bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
-	bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+	wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+	wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 
-	trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+	trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
 }
 
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-			    unsigned long thresh,
-			    unsigned long bg_thresh,
-			    unsigned long dirty,
-			    unsigned long bdi_thresh,
-			    unsigned long bdi_dirty,
-			    unsigned long start_time)
+void __wb_update_bandwidth(struct bdi_writeback *wb,
+			   unsigned long thresh,
+			   unsigned long bg_thresh,
+			   unsigned long dirty,
+			   unsigned long bdi_thresh,
+			   unsigned long bdi_dirty,
+			   unsigned long start_time)
 {
 	unsigned long now = jiffies;
-	unsigned long elapsed = now - bdi->bw_time_stamp;
+	unsigned long elapsed = now - wb->bw_time_stamp;
 	unsigned long dirtied;
 	unsigned long written;
 
@@ -1130,44 +1131,44 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 	if (elapsed < BANDWIDTH_INTERVAL)
 		return;
 
-	dirtied = percpu_counter_read(&bdi->wb.stat[WB_DIRTIED]);
-	written = percpu_counter_read(&bdi->wb.stat[WB_WRITTEN]);
+	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
 
 	/*
 	 * Skip quiet periods when disk bandwidth is under-utilized.
 	 * (at least 1s idle time between two flusher runs)
 	 */
-	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+	if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
 		goto snapshot;
 
 	if (thresh) {
 		global_update_bandwidth(thresh, dirty, now);
-		bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
-					   bdi_thresh, bdi_dirty,
-					   dirtied, elapsed);
+		wb_update_dirty_ratelimit(wb, thresh, bg_thresh, dirty,
+					  bdi_thresh, bdi_dirty,
+					  dirtied, elapsed);
 	}
-	bdi_update_write_bandwidth(bdi, elapsed, written);
+	wb_update_write_bandwidth(wb, elapsed, written);
 
 snapshot:
-	bdi->dirtied_stamp = dirtied;
-	bdi->written_stamp = written;
-	bdi->bw_time_stamp = now;
+	wb->dirtied_stamp = dirtied;
+	wb->written_stamp = written;
+	wb->bw_time_stamp = now;
 }
 
-static void bdi_update_bandwidth(struct backing_dev_info *bdi,
-				 unsigned long thresh,
-				 unsigned long bg_thresh,
-				 unsigned long dirty,
-				 unsigned long bdi_thresh,
-				 unsigned long bdi_dirty,
-				 unsigned long start_time)
+static void wb_update_bandwidth(struct bdi_writeback *wb,
+				unsigned long thresh,
+				unsigned long bg_thresh,
+				unsigned long dirty,
+				unsigned long bdi_thresh,
+				unsigned long bdi_dirty,
+				unsigned long start_time)
 {
-	if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+	if (time_is_after_eq_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
 		return;
-	spin_lock(&bdi->wb.list_lock);
-	__bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
-			       bdi_thresh, bdi_dirty, start_time);
-	spin_unlock(&bdi->wb.list_lock);
+	spin_lock(&wb->list_lock);
+	__wb_update_bandwidth(wb, thresh, bg_thresh, dirty,
+			      bdi_thresh, bdi_dirty, start_time);
+	spin_unlock(&wb->list_lock);
 }
 
 /*
@@ -1187,10 +1188,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 	return 1;
 }
 
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-				   unsigned long bdi_dirty)
+static unsigned long wb_max_pause(struct bdi_writeback *wb,
+				      unsigned long bdi_dirty)
 {
-	unsigned long bw = bdi->avg_write_bandwidth;
+	unsigned long bw = wb->avg_write_bandwidth;
 	unsigned long t;
 
 	/*
@@ -1206,14 +1207,14 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
 	return min_t(unsigned long, t, MAX_PAUSE);
 }
 
-static long bdi_min_pause(struct backing_dev_info *bdi,
-			  long max_pause,
-			  unsigned long task_ratelimit,
-			  unsigned long dirty_ratelimit,
-			  int *nr_dirtied_pause)
+static long wb_min_pause(struct bdi_writeback *wb,
+			 long max_pause,
+			 unsigned long task_ratelimit,
+			 unsigned long dirty_ratelimit,
+			 int *nr_dirtied_pause)
 {
-	long hi = ilog2(bdi->avg_write_bandwidth);
-	long lo = ilog2(bdi->dirty_ratelimit);
+	long hi = ilog2(wb->avg_write_bandwidth);
+	long lo = ilog2(wb->dirty_ratelimit);
 	long t;		/* target pause */
 	long pause;	/* estimated next pause */
 	int pages;	/* target nr_dirtied_pause */
@@ -1281,14 +1282,13 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
 	return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
 }
 
-static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
-				    unsigned long dirty_thresh,
-				    unsigned long background_thresh,
-				    unsigned long *bdi_dirty,
-				    unsigned long *bdi_thresh,
-				    unsigned long *bdi_bg_thresh)
+static inline void wb_dirty_limits(struct bdi_writeback *wb,
+				   unsigned long dirty_thresh,
+				   unsigned long background_thresh,
+				   unsigned long *bdi_dirty,
+				   unsigned long *bdi_thresh,
+				   unsigned long *bdi_bg_thresh)
 {
-	struct bdi_writeback *wb = &bdi->wb;
 	unsigned long wb_reclaimable;
 
 	/*
@@ -1301,10 +1301,10 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 	 *   In this case we don't want to hard throttle the USB key
 	 *   dirtiers for 100 seconds until bdi_dirty drops under
 	 *   bdi_thresh. Instead the auxiliary bdi control line in
-	 *   bdi_position_ratio() will let the dirtier task progress
+	 *   wb_position_ratio() will let the dirtier task progress
 	 *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
 	 */
-	*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+	*bdi_thresh = wb_dirty_limit(wb, dirty_thresh);
 
 	if (bdi_bg_thresh)
 		*bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
@@ -1354,6 +1354,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long dirty_ratelimit;
 	unsigned long pos_ratio;
 	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+	struct bdi_writeback *wb = &bdi->wb;
 	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
 	unsigned long start_time = jiffies;
 
@@ -1378,8 +1379,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		global_dirty_limits(&background_thresh, &dirty_thresh);
 
 		if (unlikely(strictlimit)) {
-			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-					 &bdi_dirty, &bdi_thresh, &bg_thresh);
+			wb_dirty_limits(wb, dirty_thresh, background_thresh,
+					&bdi_dirty, &bdi_thresh, &bg_thresh);
 
 			dirty = bdi_dirty;
 			thresh = bdi_thresh;
@@ -1410,28 +1411,28 @@ static void balance_dirty_pages(struct address_space *mapping,
 			bdi_start_background_writeback(bdi);
 
 		if (!strictlimit)
-			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-					 &bdi_dirty, &bdi_thresh, NULL);
+			wb_dirty_limits(wb, dirty_thresh, background_thresh,
+					&bdi_dirty, &bdi_thresh, NULL);
 
 		dirty_exceeded = (bdi_dirty > bdi_thresh) &&
 				 ((nr_dirty > dirty_thresh) || strictlimit);
-		if (dirty_exceeded && !bdi->dirty_exceeded)
-			bdi->dirty_exceeded = 1;
+		if (dirty_exceeded && !wb->dirty_exceeded)
+			wb->dirty_exceeded = 1;
 
-		bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
-				     nr_dirty, bdi_thresh, bdi_dirty,
-				     start_time);
+		wb_update_bandwidth(wb, dirty_thresh, background_thresh,
+				    nr_dirty, bdi_thresh, bdi_dirty,
+				    start_time);
 
-		dirty_ratelimit = bdi->dirty_ratelimit;
-		pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
-					       background_thresh, nr_dirty,
-					       bdi_thresh, bdi_dirty);
+		dirty_ratelimit = wb->dirty_ratelimit;
+		pos_ratio = wb_position_ratio(wb, dirty_thresh,
+					      background_thresh, nr_dirty,
+					      bdi_thresh, bdi_dirty);
 		task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
-		max_pause = bdi_max_pause(bdi, bdi_dirty);
-		min_pause = bdi_min_pause(bdi, max_pause,
-					  task_ratelimit, dirty_ratelimit,
-					  &nr_dirtied_pause);
+		max_pause = wb_max_pause(wb, bdi_dirty);
+		min_pause = wb_min_pause(wb, max_pause,
+					 task_ratelimit, dirty_ratelimit,
+					 &nr_dirtied_pause);
 
 		if (unlikely(task_ratelimit == 0)) {
 			period = max_pause;
@@ -1515,15 +1516,15 @@ pause:
 		 * more page. However bdi_dirty has accounting errors.  So use
 		 * the larger and more IO friendly wb_stat_error.
 		 */
-		if (bdi_dirty <= wb_stat_error(&bdi->wb))
+		if (bdi_dirty <= wb_stat_error(wb))
 			break;
 
 		if (fatal_signal_pending(current))
 			break;
 	}
 
-	if (!dirty_exceeded && bdi->dirty_exceeded)
-		bdi->dirty_exceeded = 0;
+	if (!dirty_exceeded && wb->dirty_exceeded)
+		wb->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;
@@ -1577,6 +1578,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
 void balance_dirty_pages_ratelimited(struct address_space *mapping)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+	struct bdi_writeback *wb = &bdi->wb;
 	int ratelimit;
 	int *p;
 
@@ -1584,7 +1586,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 		return;
 
 	ratelimit = current->nr_dirtied_pause;
-	if (bdi->dirty_exceeded)
+	if (wb->dirty_exceeded)
 		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
 
 	preempt_disable();
-- 
cgit v1.2.3


From f0054bb1e1f3be03cc33369df640db97f10f6172 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:30 -0400
Subject: writeback: move backing_dev_info->wb_lock and ->worklist into
 bdi_writeback

Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback)
and the role of the separation is unclear.  For cgroup support for
writeback IOs, a bdi will be updated to host multiple wb's where each
wb serves writeback IOs of a different cgroup on the bdi.  To achieve
that, a wb should carry all states necessary for servicing writeback
IOs for a cgroup independently.

This patch moves bdi->wb_lock and ->worklist into wb.

* The lock protects bdi->worklist and bdi->wb.dwork scheduling.  While
  moving, rename it to wb->work_lock as wb->wb_lock is confusing.
  Also, move wb->dwork downwards so that it's colocated with the new
  ->work_lock and ->work_list fields.

* bdi_writeback_workfn()		-> wb_workfn()
  bdi_wakeup_thread_delayed(bdi)	-> wb_wakeup_delayed(wb)
  bdi_wakeup_thread(bdi)		-> wb_wakeup(wb)
  bdi_queue_work(bdi, ...)		-> wb_queue_work(wb, ...)
  __bdi_start_writeback(bdi, ...)	-> __wb_start_writeback(wb, ...)
  get_next_work_item(bdi)		-> get_next_work_item(wb)

* bdi_wb_shutdown() is renamed to wb_shutdown() and now takes @wb.
  The function contained parts which belong to the containing bdi
  rather than the wb itself - testing cap_writeback_dirty and
  bdi_remove_from_list() invocation.  Those are moved to
  bdi_unregister().

* bdi_wb_{init|exit}() are renamed to wb_{init|exit}().
  Initializations of the moved bdi->wb_lock and ->work_list are
  relocated from bdi_init() to wb_init().

* As there's still only one bdi_writeback per backing_dev_info, all
  uses of bdi->state are mechanically replaced with bdi->wb.state
  introducing no behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           | 86 +++++++++++++++++++++------------------------
 include/linux/backing-dev.h | 12 +++----
 mm/backing-dev.c            | 59 +++++++++++++++----------------
 3 files changed, 75 insertions(+), 82 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1945cb9f6b97..a69d2e10d514 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -109,34 +109,33 @@ static inline struct inode *wb_inode(struct list_head *head)
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
 
-static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+static void wb_wakeup(struct bdi_writeback *wb)
 {
-	spin_lock_bh(&bdi->wb_lock);
-	if (test_bit(WB_registered, &bdi->wb.state))
-		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_lock_bh(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state))
+		mod_delayed_work(bdi_wq, &wb->dwork, 0);
+	spin_unlock_bh(&wb->work_lock);
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-			   struct wb_writeback_work *work)
+static void wb_queue_work(struct bdi_writeback *wb,
+			  struct wb_writeback_work *work)
 {
-	trace_writeback_queue(bdi, work);
+	trace_writeback_queue(wb->bdi, work);
 
-	spin_lock_bh(&bdi->wb_lock);
-	if (!test_bit(WB_registered, &bdi->wb.state)) {
+	spin_lock_bh(&wb->work_lock);
+	if (!test_bit(WB_registered, &wb->state)) {
 		if (work->done)
 			complete(work->done);
 		goto out_unlock;
 	}
-	list_add_tail(&work->list, &bdi->work_list);
-	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+	list_add_tail(&work->list, &wb->work_list);
+	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 out_unlock:
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_unlock_bh(&wb->work_lock);
 }
 
-static void
-__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		      bool range_cyclic, enum wb_reason reason)
+static void __wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+				 bool range_cyclic, enum wb_reason reason)
 {
 	struct wb_writeback_work *work;
 
@@ -146,8 +145,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		trace_writeback_nowork(bdi);
-		bdi_wakeup_thread(bdi);
+		trace_writeback_nowork(wb->bdi);
+		wb_wakeup(wb);
 		return;
 	}
 
@@ -156,7 +155,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->range_cyclic = range_cyclic;
 	work->reason	= reason;
 
-	bdi_queue_work(bdi, work);
+	wb_queue_work(wb, work);
 }
 
 /**
@@ -174,7 +173,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, reason);
+	__wb_start_writeback(&bdi->wb, nr_pages, true, reason);
 }
 
 /**
@@ -194,7 +193,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 	 * writeback as soon as there is no other work to do.
 	 */
 	trace_writeback_wake_background(bdi);
-	bdi_wakeup_thread(bdi);
+	wb_wakeup(&bdi->wb);
 }
 
 /*
@@ -898,7 +897,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * after the other works are all done.
 		 */
 		if ((work->for_background || work->for_kupdate) &&
-		    !list_empty(&wb->bdi->work_list))
+		    !list_empty(&wb->work_list))
 			break;
 
 		/*
@@ -969,18 +968,17 @@ static long wb_writeback(struct bdi_writeback *wb,
 /*
  * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
-static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi)
+static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 {
 	struct wb_writeback_work *work = NULL;
 
-	spin_lock_bh(&bdi->wb_lock);
-	if (!list_empty(&bdi->work_list)) {
-		work = list_entry(bdi->work_list.next,
+	spin_lock_bh(&wb->work_lock);
+	if (!list_empty(&wb->work_list)) {
+		work = list_entry(wb->work_list.next,
 				  struct wb_writeback_work, list);
 		list_del_init(&work->list);
 	}
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_unlock_bh(&wb->work_lock);
 	return work;
 }
 
@@ -1052,14 +1050,13 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
  */
 static long wb_do_writeback(struct bdi_writeback *wb)
 {
-	struct backing_dev_info *bdi = wb->bdi;
 	struct wb_writeback_work *work;
 	long wrote = 0;
 
 	set_bit(WB_writeback_running, &wb->state);
-	while ((work = get_next_work_item(bdi)) != NULL) {
+	while ((work = get_next_work_item(wb)) != NULL) {
 
-		trace_writeback_exec(bdi, work);
+		trace_writeback_exec(wb->bdi, work);
 
 		wrote += wb_writeback(wb, work);
 
@@ -1087,43 +1084,42 @@ static long wb_do_writeback(struct bdi_writeback *wb)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * reschedules periodically and does kupdated style flushing.
  */
-void bdi_writeback_workfn(struct work_struct *work)
+void wb_workfn(struct work_struct *work)
 {
 	struct bdi_writeback *wb = container_of(to_delayed_work(work),
 						struct bdi_writeback, dwork);
-	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
-	set_worker_desc("flush-%s", dev_name(bdi->dev));
+	set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
 	current->flags |= PF_SWAPWRITE;
 
 	if (likely(!current_is_workqueue_rescuer() ||
 		   !test_bit(WB_registered, &wb->state))) {
 		/*
-		 * The normal path.  Keep writing back @bdi until its
+		 * The normal path.  Keep writing back @wb until its
 		 * work_list is empty.  Note that this path is also taken
-		 * if @bdi is shutting down even when we're running off the
+		 * if @wb is shutting down even when we're running off the
 		 * rescuer as work_list needs to be drained.
 		 */
 		do {
 			pages_written = wb_do_writeback(wb);
 			trace_writeback_pages_written(pages_written);
-		} while (!list_empty(&bdi->work_list));
+		} while (!list_empty(&wb->work_list));
 	} else {
 		/*
 		 * bdi_wq can't get enough workers and we're running off
 		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
 		 * enough for efficient IO.
 		 */
-		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+		pages_written = writeback_inodes_wb(wb, 1024,
 						    WB_REASON_FORKER_THREAD);
 		trace_writeback_pages_written(pages_written);
 	}
 
-	if (!list_empty(&bdi->work_list))
+	if (!list_empty(&wb->work_list))
 		mod_delayed_work(bdi_wq, &wb->dwork, 0);
 	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-		bdi_wakeup_thread_delayed(bdi);
+		wb_wakeup_delayed(wb);
 
 	current->flags &= ~PF_SWAPWRITE;
 }
@@ -1143,7 +1139,7 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, reason);
+		__wb_start_writeback(&bdi->wb, nr_pages, false, reason);
 	}
 	rcu_read_unlock();
 }
@@ -1174,7 +1170,7 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (list_empty(&bdi->wb.b_dirty_time))
 			continue;
-		bdi_wakeup_thread(bdi);
+		wb_wakeup(&bdi->wb);
 	}
 	rcu_read_unlock();
 	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1347,7 +1343,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			trace_writeback_dirty_inode_enqueue(inode);
 
 			if (wakeup_bdi)
-				bdi_wakeup_thread_delayed(bdi);
+				wb_wakeup_delayed(&bdi->wb);
 			return;
 		}
 	}
@@ -1437,7 +1433,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 	if (sb->s_bdi == &noop_backing_dev_info)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
-	bdi_queue_work(sb->s_bdi, &work);
+	wb_queue_work(&sb->s_bdi->wb, &work);
 	wait_for_completion(&done);
 }
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
@@ -1521,7 +1517,7 @@ void sync_inodes_sb(struct super_block *sb)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	bdi_queue_work(sb->s_bdi, &work);
+	wb_queue_work(&sb->s_bdi->wb, &work);
 	wait_for_completion(&done);
 
 	wait_sb_inodes(sb);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 2ab06049d812..d796f49ce87a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -52,7 +52,6 @@ struct bdi_writeback {
 	unsigned long state;		/* Always use atomic bitops on this */
 	unsigned long last_old_flush;	/* last old data flush */
 
-	struct delayed_work dwork;	/* work item used for writeback */
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
@@ -78,6 +77,10 @@ struct bdi_writeback {
 
 	struct fprop_local_percpu completions;
 	int dirty_exceeded;
+
+	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
+	struct list_head work_list;
+	struct delayed_work dwork;	/* work item used for writeback */
 };
 
 struct backing_dev_info {
@@ -93,9 +96,6 @@ struct backing_dev_info {
 	unsigned int max_ratio, max_prop_frac;
 
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
-	spinlock_t wb_lock;	  /* protects work_list & wb.dwork scheduling */
-
-	struct list_head work_list;
 
 	struct device *dev;
 
@@ -121,9 +121,9 @@ int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
-void bdi_writeback_workfn(struct work_struct *work);
+void wb_workfn(struct work_struct *work);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void wb_wakeup_delayed(struct bdi_writeback *wb);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 9a6c472ebd0b..597f0cec8405 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -261,7 +261,7 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
 }
 
 /*
- * This function is used when the first inode for this bdi is marked dirty. It
+ * This function is used when the first inode for this wb is marked dirty. It
  * wakes-up the corresponding bdi thread which should then take care of the
  * periodic background write-out of dirty inodes. Since the write-out would
  * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
@@ -274,15 +274,15 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
  * We have to be careful not to postpone flush work if it is scheduled for
  * earlier. Thus we use queue_delayed_work().
  */
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+void wb_wakeup_delayed(struct bdi_writeback *wb)
 {
 	unsigned long timeout;
 
 	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-	spin_lock_bh(&bdi->wb_lock);
-	if (test_bit(WB_registered, &bdi->wb.state))
-		queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_lock_bh(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state))
+		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+	spin_unlock_bh(&wb->work_lock);
 }
 
 /*
@@ -335,28 +335,24 @@ EXPORT_SYMBOL(bdi_register_dev);
 /*
  * Remove bdi from the global list and shutdown any threads we have running
  */
-static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+static void wb_shutdown(struct bdi_writeback *wb)
 {
 	/* Make sure nobody queues further work */
-	spin_lock_bh(&bdi->wb_lock);
-	if (!test_and_clear_bit(WB_registered, &bdi->wb.state)) {
-		spin_unlock_bh(&bdi->wb_lock);
+	spin_lock_bh(&wb->work_lock);
+	if (!test_and_clear_bit(WB_registered, &wb->state)) {
+		spin_unlock_bh(&wb->work_lock);
 		return;
 	}
-	spin_unlock_bh(&bdi->wb_lock);
+	spin_unlock_bh(&wb->work_lock);
 
 	/*
-	 * Make sure nobody finds us on the bdi_list anymore
+	 * Drain work list and shutdown the delayed_work.  !WB_registered
+	 * tells wb_workfn() that @wb is dying and its work_list needs to
+	 * be drained no matter what.
 	 */
-	bdi_remove_from_list(bdi);
-
-	/*
-	 * Drain work list and shutdown the delayed_work.  At this point,
-	 * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
-	 * is dying and its work_list needs to be drained no matter what.
-	 */
-	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-	flush_delayed_work(&bdi->wb.dwork);
+	mod_delayed_work(bdi_wq, &wb->dwork, 0);
+	flush_delayed_work(&wb->dwork);
+	WARN_ON(!list_empty(&wb->work_list));
 }
 
 /*
@@ -381,7 +377,7 @@ EXPORT_SYMBOL(bdi_unregister);
  */
 #define INIT_BW		(100 << (20 - PAGE_SHIFT))
 
-static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 {
 	int i, err;
 
@@ -394,7 +390,6 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_more_io);
 	INIT_LIST_HEAD(&wb->b_dirty_time);
 	spin_lock_init(&wb->list_lock);
-	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 
 	wb->bw_time_stamp = jiffies;
 	wb->balanced_dirty_ratelimit = INIT_BW;
@@ -402,6 +397,10 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	wb->write_bandwidth = INIT_BW;
 	wb->avg_write_bandwidth = INIT_BW;
 
+	spin_lock_init(&wb->work_lock);
+	INIT_LIST_HEAD(&wb->work_list);
+	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+
 	err = fprop_local_init_percpu(&wb->completions, GFP_KERNEL);
 	if (err)
 		return err;
@@ -419,7 +418,7 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	return 0;
 }
 
-static void bdi_wb_exit(struct bdi_writeback *wb)
+static void wb_exit(struct bdi_writeback *wb)
 {
 	int i;
 
@@ -440,11 +439,9 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
-	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
-	INIT_LIST_HEAD(&bdi->work_list);
 
-	err = bdi_wb_init(&bdi->wb, bdi);
+	err = wb_init(&bdi->wb, bdi);
 	if (err)
 		return err;
 
@@ -454,9 +451,9 @@ EXPORT_SYMBOL(bdi_init);
 
 void bdi_destroy(struct backing_dev_info *bdi)
 {
-	bdi_wb_shutdown(bdi);
-
-	WARN_ON(!list_empty(&bdi->work_list));
+	/* make sure nobody finds us on the bdi_list anymore */
+	bdi_remove_from_list(bdi);
+	wb_shutdown(&bdi->wb);
 
 	if (bdi->dev) {
 		bdi_debug_unregister(bdi);
@@ -464,7 +461,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 		bdi->dev = NULL;
 	}
 
-	bdi_wb_exit(&bdi->wb);
+	wb_exit(&bdi->wb);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
-- 
cgit v1.2.3


From 66114cad64bf76a155fec1f0fff0de771cf909d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:32 -0400
Subject: writeback: separate out include/linux/backing-dev-defs.h

With the planned cgroup writeback support, backing-dev related
declarations will be more widely used across block and cgroup;
unfortunately, including backing-dev.h from include/linux/blkdev.h
makes cyclic include dependency quite likely.

This patch separates out backing-dev-defs.h which only has the
essential definitions and updates blkdev.h to include it.  c files
which need access to more backing-dev details now include
backing-dev.h directly.  This takes backing-dev.h off the common
include dependency chain making it a lot easier to use it across block
and cgroup.

v2: fs/fat build failure fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-integrity.c            |   1 +
 block/blk-sysfs.c                |   1 +
 block/bounce.c                   |   1 +
 block/genhd.c                    |   1 +
 drivers/block/drbd/drbd_int.h    |   1 +
 drivers/block/pktcdvd.c          |   1 +
 drivers/char/raw.c               |   1 +
 drivers/md/bcache/request.c      |   1 +
 drivers/md/dm.h                  |   1 +
 drivers/md/md.h                  |   1 +
 drivers/mtd/devices/block2mtd.c  |   1 +
 fs/block_dev.c                   |   1 +
 fs/ext4/extents.c                |   1 +
 fs/ext4/mballoc.c                |   1 +
 fs/ext4/super.c                  |   1 +
 fs/f2fs/segment.h                |   1 +
 fs/fat/file.c                    |   1 +
 fs/fat/inode.c                   |   1 +
 fs/hfs/super.c                   |   1 +
 fs/hfsplus/super.c               |   1 +
 fs/nfs/filelayout/filelayout.c   |   1 +
 fs/ocfs2/file.c                  |   1 +
 fs/reiserfs/super.c              |   1 +
 fs/ufs/super.c                   |   1 +
 fs/xfs/xfs_file.c                |   1 +
 include/linux/backing-dev-defs.h | 106 +++++++++++++++++++++++++++++++++++++++
 include/linux/backing-dev.h      | 102 +------------------------------------
 include/linux/blkdev.h           |   2 +-
 mm/madvise.c                     |   1 +
 29 files changed, 134 insertions(+), 102 deletions(-)
 create mode 100644 include/linux/backing-dev-defs.h

(limited to 'include/linux')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 79ffb4855af0..f548b64be092 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/scatterlist.h>
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 5677eb78557d..1b60941dc4c6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/blktrace_api.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-cgroup.h>
diff --git a/block/bounce.c b/block/bounce.c
index 4bac72579c1f..072280b3dd13 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
diff --git a/block/genhd.c b/block/genhd.c
index 0a536dc05f3b..d46ba566d62f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -8,6 +8,7 @@
 #include <linux/kdev_t.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/proc_fs.h>
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b905e9888b88..efd19c2da9c2 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -38,6 +38,7 @@
 #include <linux/mutex.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
 #include <net/tcp.h>
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 09e628dafd9d..4c20c228184c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -61,6 +61,7 @@
 #include <linux/freezer.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/scsi.h>
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 5fc291c6157e..60316fbaf295 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/raw.h>
 #include <linux/capability.h>
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 1616f668a4cb..4afb2d26b148 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/hash.h>
 #include <linux/random.h>
+#include <linux/backing-dev.h>
 
 #include <trace/events/bcache.h>
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e6e66d087b26..7fff744f0865 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
 #include <linux/device-mapper.h>
 #include <linux/list.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/hdreg.h>
 #include <linux/completion.h>
 #include <linux/kobject.h>
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4046a6c6f223..7da6e9c3cb53 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -16,6 +16,7 @@
 #define _MD_MD_H
 
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mm.h>
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index b16f3cda97ff..e2c0057737e6 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/list.h>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c7e4163ede87..e545cbfbe5b2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -14,6 +14,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d74e08029643..e8b5866ffa07 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
+#include <linux/backing-dev.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "xattr.h"
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8d1e60214ef0..440987c8ba9e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -26,6 +26,7 @@
 #include <linux/log2.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <trace/events/ext4.h>
 
 #ifdef CONFIG_EXT4_DEBUG
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f06d0589ddba..56b8bb75c3fc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 6408989857e0..aba72f7a8ac4 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -9,6 +9,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 /* constant macro */
 #define NULL_SEGNO			((unsigned int)(~0))
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 442d50a0e33e..a08f1039909a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,6 +11,7 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include "fat.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c06774658345..509411dd3698 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -18,6 +18,7 @@
 #include <linux/parser.h>
 #include <linux/uio.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <asm/unaligned.h>
 #include "fat.h"
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index eee7206c38d1..55c03b9e9070 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -14,6 +14,7 @@
 
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/nls.h>
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 593af2fdcc2d..7302d96ae8bf 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index a46bf6de9ce4..b34f2e228601 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -32,6 +32,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/module.h>
+#include <linux/backing-dev.h>
 
 #include <linux/sunrpc/metrics.h>
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d8b670cbd909..8f1feca89fb0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -37,6 +37,7 @@
 #include <linux/falloc.h>
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 #include <cluster/masklog.h>
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0111ad0466ed..3e0af317fcc3 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
 #include "xattr.h"
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/quotaops.h>
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b3bc3e7ae79d..098508a93c7b 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -80,6 +80,7 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8121e75352ee..4e00b38efbe0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -41,6 +41,7 @@
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
+#include <linux/backing-dev.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
new file mode 100644
index 000000000000..aa18c4bd43c1
--- /dev/null
+++ b/include/linux/backing-dev-defs.h
@@ -0,0 +1,106 @@
+#ifndef __LINUX_BACKING_DEV_DEFS_H
+#define __LINUX_BACKING_DEV_DEFS_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/percpu_counter.h>
+#include <linux/flex_proportions.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+struct page;
+struct device;
+struct dentry;
+
+/*
+ * Bits in bdi_writeback.state
+ */
+enum wb_state {
+	WB_async_congested,	/* The async (write) queue is getting full */
+	WB_sync_congested,	/* The sync queue is getting full */
+	WB_registered,		/* bdi_register() was done */
+	WB_writeback_running,	/* Writeback is in progress */
+};
+
+typedef int (congested_fn)(void *, int);
+
+enum wb_stat_item {
+	WB_RECLAIMABLE,
+	WB_WRITEBACK,
+	WB_DIRTIED,
+	WB_WRITTEN,
+	NR_WB_STAT_ITEMS
+};
+
+#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+struct bdi_writeback {
+	struct backing_dev_info *bdi;	/* our parent bdi */
+
+	unsigned long state;		/* Always use atomic bitops on this */
+	unsigned long last_old_flush;	/* last old data flush */
+
+	struct list_head b_dirty;	/* dirty inodes */
+	struct list_head b_io;		/* parked for writeback */
+	struct list_head b_more_io;	/* parked for more writeback */
+	struct list_head b_dirty_time;	/* time stamps are dirty */
+	spinlock_t list_lock;		/* protects the b_* lists */
+
+	struct percpu_counter stat[NR_WB_STAT_ITEMS];
+
+	unsigned long bw_time_stamp;	/* last time write bw is updated */
+	unsigned long dirtied_stamp;
+	unsigned long written_stamp;	/* pages written at bw_time_stamp */
+	unsigned long write_bandwidth;	/* the estimated write bandwidth */
+	unsigned long avg_write_bandwidth; /* further smoothed write bw */
+
+	/*
+	 * The base dirty throttle rate, re-calculated on every 200ms.
+	 * All the bdi tasks' dirty rate will be curbed under it.
+	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
+	 * in small steps and is much more smooth/stable than the latter.
+	 */
+	unsigned long dirty_ratelimit;
+	unsigned long balanced_dirty_ratelimit;
+
+	struct fprop_local_percpu completions;
+	int dirty_exceeded;
+
+	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
+	struct list_head work_list;
+	struct delayed_work dwork;	/* work item used for writeback */
+};
+
+struct backing_dev_info {
+	struct list_head bdi_list;
+	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
+	unsigned int capabilities; /* Device capabilities */
+	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	void *congested_data;	/* Pointer to aux data for congested func */
+
+	char *name;
+
+	unsigned int min_ratio;
+	unsigned int max_ratio, max_prop_frac;
+
+	struct bdi_writeback wb;  /* default writeback info for this bdi */
+
+	struct device *dev;
+
+	struct timer_list laptop_mode_wb_timer;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debug_dir;
+	struct dentry *debug_stats;
+#endif
+};
+
+enum {
+	BLK_RW_ASYNC	= 0,
+	BLK_RW_SYNC	= 1,
+};
+
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
+void set_bdi_congested(struct backing_dev_info *bdi, int sync);
+
+#endif	/* __LINUX_BACKING_DEV_DEFS_H */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index d796f49ce87a..5e39f7a8efed 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -8,104 +8,11 @@
 #ifndef _LINUX_BACKING_DEV_H
 #define _LINUX_BACKING_DEV_H
 
-#include <linux/percpu_counter.h>
-#include <linux/log2.h>
-#include <linux/flex_proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/timer.h>
 #include <linux/writeback.h>
-#include <linux/atomic.h>
-#include <linux/sysctl.h>
-#include <linux/workqueue.h>
-
-struct page;
-struct device;
-struct dentry;
-
-/*
- * Bits in bdi_writeback.state
- */
-enum wb_state {
-	WB_async_congested,	/* The async (write) queue is getting full */
-	WB_sync_congested,	/* The sync queue is getting full */
-	WB_registered,		/* bdi_register() was done */
-	WB_writeback_running,	/* Writeback is in progress */
-};
-
-typedef int (congested_fn)(void *, int);
-
-enum wb_stat_item {
-	WB_RECLAIMABLE,
-	WB_WRITEBACK,
-	WB_DIRTIED,
-	WB_WRITTEN,
-	NR_WB_STAT_ITEMS
-};
-
-#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
-
-struct bdi_writeback {
-	struct backing_dev_info *bdi;	/* our parent bdi */
-
-	unsigned long state;		/* Always use atomic bitops on this */
-	unsigned long last_old_flush;	/* last old data flush */
-
-	struct list_head b_dirty;	/* dirty inodes */
-	struct list_head b_io;		/* parked for writeback */
-	struct list_head b_more_io;	/* parked for more writeback */
-	struct list_head b_dirty_time;	/* time stamps are dirty */
-	spinlock_t list_lock;		/* protects the b_* lists */
-
-	struct percpu_counter stat[NR_WB_STAT_ITEMS];
-
-	unsigned long bw_time_stamp;	/* last time write bw is updated */
-	unsigned long dirtied_stamp;
-	unsigned long written_stamp;	/* pages written at bw_time_stamp */
-	unsigned long write_bandwidth;	/* the estimated write bandwidth */
-	unsigned long avg_write_bandwidth; /* further smoothed write bw */
-
-	/*
-	 * The base dirty throttle rate, re-calculated on every 200ms.
-	 * All the bdi tasks' dirty rate will be curbed under it.
-	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
-	 * in small steps and is much more smooth/stable than the latter.
-	 */
-	unsigned long dirty_ratelimit;
-	unsigned long balanced_dirty_ratelimit;
-
-	struct fprop_local_percpu completions;
-	int dirty_exceeded;
-
-	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
-	struct list_head work_list;
-	struct delayed_work dwork;	/* work item used for writeback */
-};
-
-struct backing_dev_info {
-	struct list_head bdi_list;
-	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
-	unsigned int capabilities; /* Device capabilities */
-	congested_fn *congested_fn; /* Function pointer if device is md/dm */
-	void *congested_data;	/* Pointer to aux data for congested func */
-
-	char *name;
-
-	unsigned int min_ratio;
-	unsigned int max_ratio, max_prop_frac;
-
-	struct bdi_writeback wb;  /* default writeback info for this bdi */
-
-	struct device *dev;
-
-	struct timer_list laptop_mode_wb_timer;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *debug_dir;
-	struct dentry *debug_stats;
-#endif
-};
+#include <linux/backing-dev-defs.h>
 
 struct backing_dev_info *inode_to_bdi(struct inode *inode);
 
@@ -265,13 +172,6 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 				  (1 << WB_async_congested));
 }
 
-enum {
-	BLK_RW_ASYNC	= 0,
-	BLK_RW_SYNC	= 1,
-};
-
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
 long wait_iff_congested(struct zone *zone, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ccaa9aecd593..60d2726a6b62 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -12,7 +12,7 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/pagemap.h>
-#include <linux/backing-dev.h>
+#include <linux/backing-dev-defs.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
diff --git a/mm/madvise.c b/mm/madvise.c
index d551475517bf..64bb8a22110c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
-- 
cgit v1.2.3


From a212b105b07d75b48b1a166378282e8a77fbf53d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:33 -0400
Subject: bdi: make inode_to_bdi() inline

Now that bdi definitions are moved to backing-dev-defs.h,
backing-dev.h can include blkdev.h and inline inode_to_bdi() without
worrying about introducing circular include dependency.  The function
gets called from hot paths and fairly trivial.

This patch makes inode_to_bdi() and sb_is_blkdev_sb() that the
function calls inline.  blockdev_superblock and noop_backing_dev_info
are EXPORT_GPL'd to allow the inline functions to be used from
modules.

While at it, make sb_is_blkdev_sb() return bool instead of int.

v2: Fixed typo in description as suggested by Jan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c              |  8 ++------
 fs/fs-writeback.c           | 16 ----------------
 include/linux/backing-dev.h | 18 ++++++++++++++++--
 include/linux/fs.h          |  8 +++++++-
 mm/backing-dev.c            |  1 +
 5 files changed, 26 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index e545cbfbe5b2..f04c873a7365 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -547,7 +547,8 @@ static struct file_system_type bd_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
 
 void __init bdev_cache_init(void)
 {
@@ -688,11 +689,6 @@ static struct block_device *bd_acquire(struct inode *inode)
 	return bdev;
 }
 
-int sb_is_blkdev_sb(struct super_block *sb)
-{
-	return sb == blockdev_superblock;
-}
-
 /* Call when you free inode */
 
 void bd_forget(struct inode *inode)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a69d2e10d514..34d1cb854fc1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -78,22 +78,6 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(writeback_in_progress);
 
-struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
-	struct super_block *sb;
-
-	if (!inode)
-		return &noop_backing_dev_info;
-
-	sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
-	if (sb_is_blkdev_sb(sb))
-		return blk_get_backing_dev_info(I_BDEV(inode));
-#endif
-	return sb->s_bdi;
-}
-EXPORT_SYMBOL_GPL(inode_to_bdi);
-
 static inline struct inode *wb_inode(struct list_head *head)
 {
 	return list_entry(head, struct inode, i_wb_list);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5e39f7a8efed..785782034e86 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -11,11 +11,10 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/blkdev.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev-defs.h>
 
-struct backing_dev_info *inode_to_bdi(struct inode *inode);
-
 int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
 
@@ -149,6 +148,21 @@ extern struct backing_dev_info noop_backing_dev_info;
 
 int writeback_in_progress(struct backing_dev_info *bdi);
 
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+	struct super_block *sb;
+
+	if (!inode)
+		return &noop_backing_dev_info;
+
+	sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+	if (sb_is_blkdev_sb(sb))
+		return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
+	return sb->s_bdi;
+}
+
 static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
 {
 	if (bdi->congested_fn)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1ef63900243c..ce100b87fba3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2240,7 +2240,13 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-extern int sb_is_blkdev_sb(struct super_block *sb);
+
+extern struct super_block *blockdev_superblock;
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+	return sb == blockdev_superblock;
+}
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ff85ecb7d6a1..b0707d1b1d38 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = {
 	.name		= "noop",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 
 static struct class *bdi_class;
 
-- 
cgit v1.2.3


From 4aa9c692e052cf6db99db62a8fe0543e5c455da7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:35 -0400
Subject: bdi: separate out congested state into a separate struct

Currently, a wb's (bdi_writeback) congestion state is carried in its
->state field; however, cgroup writeback support will require multiple
wb's sharing the same congestion state.  This patch separates out
congestion state into its own struct - struct bdi_writeback_congested.
A new field wb field, wb_congested, points to its associated congested
struct.  The default wb, bdi->wb, always points to bdi->wb_congested.

While this patch adds a layer of indirection, it doesn't introduce any
behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h | 14 ++++++++++++--
 include/linux/backing-dev.h      |  2 +-
 mm/backing-dev.c                 |  7 +++++--
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index aa18c4bd43c1..9e9eafa5f5aa 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -16,12 +16,15 @@ struct dentry;
  * Bits in bdi_writeback.state
  */
 enum wb_state {
-	WB_async_congested,	/* The async (write) queue is getting full */
-	WB_sync_congested,	/* The sync queue is getting full */
 	WB_registered,		/* bdi_register() was done */
 	WB_writeback_running,	/* Writeback is in progress */
 };
 
+enum wb_congested_state {
+	WB_async_congested,	/* The async (write) queue is getting full */
+	WB_sync_congested,	/* The sync queue is getting full */
+};
+
 typedef int (congested_fn)(void *, int);
 
 enum wb_stat_item {
@@ -34,6 +37,10 @@ enum wb_stat_item {
 
 #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
+struct bdi_writeback_congested {
+	unsigned long state;		/* WB_[a]sync_congested flags */
+};
+
 struct bdi_writeback {
 	struct backing_dev_info *bdi;	/* our parent bdi */
 
@@ -48,6 +55,8 @@ struct bdi_writeback {
 
 	struct percpu_counter stat[NR_WB_STAT_ITEMS];
 
+	struct bdi_writeback_congested *congested;
+
 	unsigned long bw_time_stamp;	/* last time write bw is updated */
 	unsigned long dirtied_stamp;
 	unsigned long written_stamp;	/* pages written at bw_time_stamp */
@@ -84,6 +93,7 @@ struct backing_dev_info {
 	unsigned int max_ratio, max_prop_frac;
 
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
+	struct bdi_writeback_congested wb_congested;
 
 	struct device *dev;
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 785782034e86..bfdaa18ba0a1 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -167,7 +167,7 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
 {
 	if (bdi->congested_fn)
 		return bdi->congested_fn(bdi->congested_data, bdi_bits);
-	return (bdi->wb.state & bdi_bits);
+	return (bdi->wb.congested->state & bdi_bits);
 }
 
 static inline int bdi_read_congested(struct backing_dev_info *bdi)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 805b2870ca4e..5ec7658cb984 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -383,6 +383,9 @@ int bdi_init(struct backing_dev_info *bdi)
 	if (err)
 		return err;
 
+	bdi->wb_congested.state = 0;
+	bdi->wb.congested = &bdi->wb_congested;
+
 	return 0;
 }
 EXPORT_SYMBOL(bdi_init);
@@ -504,7 +507,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	bit = sync ? WB_sync_congested : WB_async_congested;
-	if (test_and_clear_bit(bit, &bdi->wb.state))
+	if (test_and_clear_bit(bit, &bdi->wb.congested->state))
 		atomic_dec(&nr_bdi_congested[sync]);
 	smp_mb__after_atomic();
 	if (waitqueue_active(wqh))
@@ -517,7 +520,7 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	enum wb_state bit;
 
 	bit = sync ? WB_sync_congested : WB_async_congested;
-	if (!test_and_set_bit(bit, &bdi->wb.state))
+	if (!test_and_set_bit(bit, &bdi->wb.congested->state))
 		atomic_inc(&nr_bdi_congested[sync]);
 }
 EXPORT_SYMBOL(set_bdi_congested);
-- 
cgit v1.2.3


From 89e9b9e07a390c50980d10aa37a04631db5a23ab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:36 -0400
Subject: writeback: add {CONFIG|BDI_CAP|FS}_CGROUP_WRITEBACK

cgroup writeback requires support from both bdi and filesystem sides.
Add BDI_CAP_CGROUP_WRITEBACK and FS_CGROUP_WRITEBACK to indicate
support and enable BDI_CAP_CGROUP_WRITEBACK on block based bdi's by
default.  Also, define CONFIG_CGROUP_WRITEBACK which is enabled if
both MEMCG and BLK_CGROUP are enabled.

inode_cgwb_enabled() which determines whether a given inode's both bdi
and fs support cgroup writeback is added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c            |  2 +-
 include/linux/backing-dev.h | 32 +++++++++++++++++++++++++++++++-
 include/linux/fs.h          |  1 +
 init/Kconfig                |  5 +++++
 4 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index c3ba9c3b08d4..c114a61590bc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -621,7 +621,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	q->backing_dev_info.ra_pages =
 			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-	q->backing_dev_info.capabilities = 0;
+	q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
 	q->backing_dev_info.name = "block";
 	q->node = node_id;
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index bfdaa18ba0a1..6bb31234e6a9 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -134,12 +134,15 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_NO_WRITEBACK:   Don't write pages back
  * BDI_CAP_NO_ACCT_WB:     Don't automatically account writeback pages
  * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
+ *
+ * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
 #define BDI_CAP_NO_WRITEBACK	0x00000002
 #define BDI_CAP_NO_ACCT_WB	0x00000004
 #define BDI_CAP_STABLE_WRITES	0x00000008
 #define BDI_CAP_STRICTLIMIT	0x00000010
+#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
 
 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
 	(BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
@@ -229,4 +232,31 @@ static inline int bdi_sched_wait(void *word)
 	return 0;
 }
 
-#endif		/* _LINUX_BACKING_DEV_H */
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
+ * @inode: inode of interest
+ *
+ * cgroup writeback requires support from both the bdi and filesystem.
+ * Test whether @inode has both.
+ */
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+	return bdi_cap_account_dirty(bdi) &&
+		(bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+		(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+	return false;
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
+#endif	/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ce100b87fba3..74e0ae0626a8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1897,6 +1897,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
+#define FS_CGROUP_WRITEBACK	32	/* Supports cgroup-aware writeback */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec60232..d4f763332f9f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1141,6 +1141,11 @@ config DEBUG_BLK_CGROUP
 	Enable some debugging help. Currently it exports additional stat
 	files in a cgroup which can be useful for debugging.
 
+config CGROUP_WRITEBACK
+	bool
+	depends on MEMCG && BLK_CGROUP
+	default y
+
 endif # CGROUPS
 
 config CHECKPOINT_RESTORE
-- 
cgit v1.2.3


From 52ebea749aaed195245701a8f90a23d672c7a933 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:37 -0400
Subject: writeback: make backing_dev_info host cgroup-specific bdi_writebacks

For the planned cgroup writeback support, on each bdi
(backing_dev_info), each memcg will be served by a separate wb
(bdi_writeback).  This patch updates bdi so that a bdi can host
multiple wbs (bdi_writebacks).

On the default hierarchy, blkcg implicitly enables memcg.  This allows
using memcg's page ownership for attributing writeback IOs, and every
memcg - blkcg combination can be served by its own wb by assigning a
dedicated wb to each memcg.  This means that there may be multiple
wb's of a bdi mapped to the same blkcg.  As congested state is per
blkcg - bdi combination, those wb's should share the same congested
state.  This is achieved by tracking congested state via
bdi_writeback_congested structs which are keyed by blkcg.

bdi->wb remains unchanged and will keep serving the root cgroup.
cgwb's (cgroup wb's) for non-root cgroups are created on-demand or
looked up while dirtying an inode according to the memcg of the page
being dirtied or current task.  Each cgwb is indexed on bdi->cgwb_tree
by its memcg id.  Once an inode is associated with its wb, it can be
retrieved using inode_to_wb().

Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all
pages will keep being associated with bdi->wb.

v3: inode_attach_wb() in account_page_dirtied() moved inside
    mapping_cap_account_dirty() block where it's known to be !NULL.
    Also, an unnecessary NULL check before kfree() removed.  Both
    detected by the kbuild bot.

v2: Updated so that wb association is per inode and wb is per memcg
    rather than blkcg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: kbuild test robot <fengguang.wu@intel.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c               |   7 +-
 fs/fs-writeback.c                |   8 +-
 fs/inode.c                       |   1 +
 include/linux/backing-dev-defs.h |  59 +++++-
 include/linux/backing-dev.h      | 195 +++++++++++++++++++
 include/linux/blk-cgroup.h       |   4 +
 include/linux/fs.h               |   4 +
 include/linux/memcontrol.h       |   4 +
 mm/backing-dev.c                 | 397 +++++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c                  |  19 +-
 mm/page-writeback.c              |  11 +-
 11 files changed, 698 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 54ec1721cd3e..979cfdbb94e0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/delay.h>
@@ -797,6 +798,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
 	}
 
 	spin_unlock_irq(&blkcg->lock);
+
+	wb_blkcg_offline(blkcg);
 }
 
 static void blkcg_css_free(struct cgroup_subsys_state *css)
@@ -827,7 +830,9 @@ done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+	INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
 	return &blkcg->css;
 }
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34d1cb854fc1..99a2440c5588 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -185,11 +185,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = inode_to_wb(inode);
 
-	spin_lock(&bdi->wb.list_lock);
+	spin_lock(&wb->list_lock);
 	list_del_init(&inode->i_wb_list);
-	spin_unlock(&bdi->wb.list_lock);
+	spin_unlock(&wb->list_lock);
 }
 
 /*
@@ -1268,6 +1268,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
+		inode_attach_wb(inode, NULL);
+
 		if (flags & I_DIRTY_INODE)
 			inode->i_state &= ~I_DIRTY_TIME;
 		inode->i_state |= flags;
diff --git a/fs/inode.c b/fs/inode.c
index ea37cd17b53f..efc9edacfb9b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	inode_detach_wb(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 	locks_free_lock_context(inode->i_flctx);
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 9e9eafa5f5aa..a1e9c407a59a 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -2,8 +2,11 @@
 #define __LINUX_BACKING_DEV_DEFS_H
 
 #include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 #include <linux/spinlock.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu-refcount.h>
 #include <linux/flex_proportions.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
@@ -37,10 +40,43 @@ enum wb_stat_item {
 
 #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
 
+/*
+ * For cgroup writeback, multiple wb's may map to the same blkcg.  Those
+ * wb's can operate mostly independently but should share the congested
+ * state.  To facilitate such sharing, the congested state is tracked using
+ * the following struct which is created on demand, indexed by blkcg ID on
+ * its bdi, and refcounted.
+ */
 struct bdi_writeback_congested {
 	unsigned long state;		/* WB_[a]sync_congested flags */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct backing_dev_info *bdi;	/* the associated bdi */
+	atomic_t refcnt;		/* nr of attached wb's and blkg */
+	int blkcg_id;			/* ID of the associated blkcg */
+	struct rb_node rb_node;		/* on bdi->cgwb_congestion_tree */
+#endif
 };
 
+/*
+ * Each wb (bdi_writeback) can perform writeback operations, is measured
+ * and throttled, independently.  Without cgroup writeback, each bdi
+ * (bdi_writeback) is served by its embedded bdi->wb.
+ *
+ * On the default hierarchy, blkcg implicitly enables memcg.  This allows
+ * using memcg's page ownership for attributing writeback IOs, and every
+ * memcg - blkcg combination can be served by its own wb by assigning a
+ * dedicated wb to each memcg, which enables isolation across different
+ * cgroups and propagation of IO back pressure down from the IO layer upto
+ * the tasks which are generating the dirty pages to be written back.
+ *
+ * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
+ * refcounted with the number of inodes attached to it, and pins the memcg
+ * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
+ * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
+ * is tested for blkcg after lookup and removed from index on mismatch so
+ * that a new wb for the combination can be created.
+ */
 struct bdi_writeback {
 	struct backing_dev_info *bdi;	/* our parent bdi */
 
@@ -78,6 +114,19 @@ struct bdi_writeback {
 	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct percpu_ref refcnt;	/* used only for !root wb's */
+	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
+	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
+	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
+	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
+
+	union {
+		struct work_struct release_work;
+		struct rcu_head rcu;
+	};
+#endif
 };
 
 struct backing_dev_info {
@@ -92,9 +141,13 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
-	struct bdi_writeback wb;  /* default writeback info for this bdi */
-	struct bdi_writeback_congested wb_congested;
-
+	struct bdi_writeback wb;  /* the root writeback info for this bdi */
+	struct bdi_writeback_congested wb_congested; /* its congested state */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
+	struct rb_root cgwb_congested_tree; /* their congested states */
+	atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
+#endif
 	struct device *dev;
 
 	struct timer_list laptop_mode_wb_timer;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 6bb31234e6a9..8ae59df2e3d1 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
+#include <linux/blk-cgroup.h>
 #include <linux/backing-dev-defs.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
@@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word)
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
+void wb_congested_put(struct bdi_writeback_congested *congested);
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+				    struct cgroup_subsys_state *memcg_css,
+				    gfp_t gfp);
+void __inode_attach_wb(struct inode *inode, struct page *page);
+void wb_memcg_offline(struct mem_cgroup *memcg);
+void wb_blkcg_offline(struct blkcg *blkcg);
+
 /**
  * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
  * @inode: inode of interest
@@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 		(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
 }
 
+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		return percpu_ref_tryget(&wb->refcnt);
+	return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_put(&wb->refcnt);
+}
+
+/**
+ * wb_find_current - find wb for %current on a bdi
+ * @bdi: bdi of interest
+ *
+ * Find the wb of @bdi which matches both the memcg and blkcg of %current.
+ * Must be called under rcu_read_lock() which protects the returend wb.
+ * NULL if not found.
+ */
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+	struct cgroup_subsys_state *memcg_css;
+	struct bdi_writeback *wb;
+
+	memcg_css = task_css(current, memory_cgrp_id);
+	if (!memcg_css->parent)
+		return &bdi->wb;
+
+	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+
+	/*
+	 * %current's blkcg equals the effective blkcg of its memcg.  No
+	 * need to use the relatively expensive cgroup_get_e_css().
+	 */
+	if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+		return wb;
+	return NULL;
+}
+
+/**
+ * wb_get_create_current - get or create wb for %current on a bdi
+ * @bdi: bdi of interest
+ * @gfp: allocation mask
+ *
+ * Equivalent to wb_get_create() on %current's memcg.  This function is
+ * called from a relatively hot path and optimizes the common cases using
+ * wb_find_current().
+ */
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+	struct bdi_writeback *wb;
+
+	rcu_read_lock();
+	wb = wb_find_current(bdi);
+	if (wb && unlikely(!wb_tryget(wb)))
+		wb = NULL;
+	rcu_read_unlock();
+
+	if (unlikely(!wb)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		memcg_css = task_get_css(current, memory_cgrp_id);
+		wb = wb_get_create(bdi, memcg_css, gfp);
+		css_put(memcg_css);
+	}
+	return wb;
+}
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+	if (!inode->i_wb)
+		__inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed.  Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+	if (inode->i_wb) {
+		wb_put(inode->i_wb);
+		inode->i_wb = NULL;
+	}
+}
+
+/**
+ * inode_to_wb - determine the wb of an inode
+ * @inode: inode of interest
+ *
+ * Returns the wb @inode is currently associated with.
+ */
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+	return inode->i_wb;
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool inode_cgwb_enabled(struct inode *inode)
@@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 	return false;
 }
 
+static inline struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+	return bdi->wb.congested;
+}
+
+static inline void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+}
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+	return &bdi->wb;
+}
+
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+	return &bdi->wb;
+}
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+	return &inode_to_bdi(inode)->wb;
+}
+
+static inline void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+}
+
+static inline void wb_blkcg_offline(struct blkcg *blkcg)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 #endif	/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 4dc643f2046e..3033eb173eb4 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -53,6 +53,10 @@ struct blkcg {
 	/* TODO: per-policy storage in blkcg */
 	unsigned int			cfq_weight;	/* belongs to cfq */
 	unsigned int			cfq_leaf_weight;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head		cgwb_list;
+#endif
 };
 
 struct blkg_stat {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 74e0ae0626a8..67a42ec95065 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -35,6 +35,7 @@
 #include <uapi/linux/fs.h>
 
 struct backing_dev_info;
+struct bdi_writeback;
 struct export_operations;
 struct hd_geometry;
 struct iovec;
@@ -635,6 +636,9 @@ struct inode {
 
 	struct hlist_node	i_hash;
 	struct list_head	i_wb_list;	/* backing dev IO list */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
+#endif
 	struct list_head	i_lru;		/* inode LRU list */
 	struct list_head	i_sb_list;
 	union {
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 637ef626008e..662a953ea8ad 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -388,6 +388,10 @@ enum {
 	OVER_LIMIT,
 };
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
+#endif
+
 struct sock;
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 void sock_update_memcg(struct sock *sk);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5ec7658cb984..4c9386c98ec1 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -368,6 +368,401 @@ static void wb_exit(struct bdi_writeback *wb)
 	fprop_local_destroy_percpu(&wb->completions);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/memcontrol.h>
+
+/*
+ * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
+ * blkcg->cgwb_list, and memcg->cgwb_list.  bdi->cgwb_tree is also RCU
+ * protected.  cgwb_release_wait is used to wait for the completion of cgwb
+ * releases from bdi destruction path.
+ */
+static DEFINE_SPINLOCK(cgwb_lock);
+static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
+
+/**
+ * wb_congested_get_create - get or create a wb_congested
+ * @bdi: associated bdi
+ * @blkcg_id: ID of the associated blkcg
+ * @gfp: allocation mask
+ *
+ * Look up the wb_congested for @blkcg_id on @bdi.  If missing, create one.
+ * The returned wb_congested has its reference count incremented.  Returns
+ * NULL on failure.
+ */
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+	struct bdi_writeback_congested *new_congested = NULL, *congested;
+	struct rb_node **node, *parent;
+	unsigned long flags;
+
+	if (blkcg_id == 1)
+		return &bdi->wb_congested;
+retry:
+	spin_lock_irqsave(&cgwb_lock, flags);
+
+	node = &bdi->cgwb_congested_tree.rb_node;
+	parent = NULL;
+
+	while (*node != NULL) {
+		parent = *node;
+		congested = container_of(parent, struct bdi_writeback_congested,
+					 rb_node);
+		if (congested->blkcg_id < blkcg_id)
+			node = &parent->rb_left;
+		else if (congested->blkcg_id > blkcg_id)
+			node = &parent->rb_right;
+		else
+			goto found;
+	}
+
+	if (new_congested) {
+		/* !found and storage for new one already allocated, insert */
+		congested = new_congested;
+		new_congested = NULL;
+		rb_link_node(&congested->rb_node, parent, node);
+		rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
+		atomic_inc(&bdi->usage_cnt);
+		goto found;
+	}
+
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+
+	/* allocate storage for new one and retry */
+	new_congested = kzalloc(sizeof(*new_congested), gfp);
+	if (!new_congested)
+		return NULL;
+
+	atomic_set(&new_congested->refcnt, 0);
+	new_congested->bdi = bdi;
+	new_congested->blkcg_id = blkcg_id;
+	goto retry;
+
+found:
+	atomic_inc(&congested->refcnt);
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	kfree(new_congested);
+	return congested;
+}
+
+/**
+ * wb_congested_put - put a wb_congested
+ * @congested: wb_congested to put
+ *
+ * Put @congested and destroy it if the refcnt reaches zero.
+ */
+void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+	struct backing_dev_info *bdi = congested->bdi;
+	unsigned long flags;
+
+	if (congested->blkcg_id == 1)
+		return;
+
+	local_irq_save(flags);
+	if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
+		local_irq_restore(flags);
+		return;
+	}
+
+	rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree);
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	kfree(congested);
+
+	if (atomic_dec_and_test(&bdi->usage_cnt))
+		wake_up_all(&cgwb_release_wait);
+}
+
+static void cgwb_release_workfn(struct work_struct *work)
+{
+	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+						release_work);
+	struct backing_dev_info *bdi = wb->bdi;
+
+	wb_shutdown(wb);
+
+	css_put(wb->memcg_css);
+	css_put(wb->blkcg_css);
+	wb_congested_put(wb->congested);
+
+	percpu_ref_exit(&wb->refcnt);
+	wb_exit(wb);
+	kfree_rcu(wb, rcu);
+
+	if (atomic_dec_and_test(&bdi->usage_cnt))
+		wake_up_all(&cgwb_release_wait);
+}
+
+static void cgwb_release(struct percpu_ref *refcnt)
+{
+	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
+						refcnt);
+	schedule_work(&wb->release_work);
+}
+
+static void cgwb_kill(struct bdi_writeback *wb)
+{
+	lockdep_assert_held(&cgwb_lock);
+
+	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
+	list_del(&wb->memcg_node);
+	list_del(&wb->blkcg_node);
+	percpu_ref_kill(&wb->refcnt);
+}
+
+static int cgwb_create(struct backing_dev_info *bdi,
+		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
+{
+	struct mem_cgroup *memcg;
+	struct cgroup_subsys_state *blkcg_css;
+	struct blkcg *blkcg;
+	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
+	struct bdi_writeback *wb;
+	unsigned long flags;
+	int ret = 0;
+
+	memcg = mem_cgroup_from_css(memcg_css);
+	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+	blkcg = css_to_blkcg(blkcg_css);
+	memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+	blkcg_cgwb_list = &blkcg->cgwb_list;
+
+	/* look up again under lock and discard on blkcg mismatch */
+	spin_lock_irqsave(&cgwb_lock, flags);
+	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+	if (wb && wb->blkcg_css != blkcg_css) {
+		cgwb_kill(wb);
+		wb = NULL;
+	}
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	if (wb)
+		goto out_put;
+
+	/* need to create a new one */
+	wb = kmalloc(sizeof(*wb), gfp);
+	if (!wb)
+		return -ENOMEM;
+
+	ret = wb_init(wb, bdi, gfp);
+	if (ret)
+		goto err_free;
+
+	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
+	if (ret)
+		goto err_wb_exit;
+
+	wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
+	if (!wb->congested)
+		goto err_ref_exit;
+
+	wb->memcg_css = memcg_css;
+	wb->blkcg_css = blkcg_css;
+	INIT_WORK(&wb->release_work, cgwb_release_workfn);
+	set_bit(WB_registered, &wb->state);
+
+	/*
+	 * The root wb determines the registered state of the whole bdi and
+	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
+	 * whether they're still online.  Don't link @wb if any is dead.
+	 * See wb_memcg_offline() and wb_blkcg_offline().
+	 */
+	ret = -ENODEV;
+	spin_lock_irqsave(&cgwb_lock, flags);
+	if (test_bit(WB_registered, &bdi->wb.state) &&
+	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
+		/* we might have raced another instance of this function */
+		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
+		if (!ret) {
+			atomic_inc(&bdi->usage_cnt);
+			list_add(&wb->memcg_node, memcg_cgwb_list);
+			list_add(&wb->blkcg_node, blkcg_cgwb_list);
+			css_get(memcg_css);
+			css_get(blkcg_css);
+		}
+	}
+	spin_unlock_irqrestore(&cgwb_lock, flags);
+	if (ret) {
+		if (ret == -EEXIST)
+			ret = 0;
+		goto err_put_congested;
+	}
+	goto out_put;
+
+err_put_congested:
+	wb_congested_put(wb->congested);
+err_ref_exit:
+	percpu_ref_exit(&wb->refcnt);
+err_wb_exit:
+	wb_exit(wb);
+err_free:
+	kfree(wb);
+out_put:
+	css_put(blkcg_css);
+	return ret;
+}
+
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
+ * create one.  The returned wb has its refcount incremented.
+ *
+ * This function uses css_get() on @memcg_css and thus expects its refcnt
+ * to be positive on invocation.  IOW, rcu_read_lock() protection on
+ * @memcg_css isn't enough.  try_get it before calling this function.
+ *
+ * A wb is keyed by its associated memcg.  As blkcg implicitly enables
+ * memcg on the default hierarchy, memcg association is guaranteed to be
+ * more specific (equal or descendant to the associated blkcg) and thus can
+ * identify both the memcg and blkcg associations.
+ *
+ * Because the blkcg associated with a memcg may change as blkcg is enabled
+ * and disabled closer to root in the hierarchy, each wb keeps track of
+ * both the memcg and blkcg associated with it and verifies the blkcg on
+ * each lookup.  On mismatch, the existing wb is discarded and a new one is
+ * created.
+ */
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+				    struct cgroup_subsys_state *memcg_css,
+				    gfp_t gfp)
+{
+	struct bdi_writeback *wb;
+
+	might_sleep_if(gfp & __GFP_WAIT);
+
+	if (!memcg_css->parent)
+		return &bdi->wb;
+
+	do {
+		rcu_read_lock();
+		wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+		if (wb) {
+			struct cgroup_subsys_state *blkcg_css;
+
+			/* see whether the blkcg association has changed */
+			blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
+						     &blkio_cgrp_subsys);
+			if (unlikely(wb->blkcg_css != blkcg_css ||
+				     !wb_tryget(wb)))
+				wb = NULL;
+			css_put(blkcg_css);
+		}
+		rcu_read_unlock();
+	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
+
+	return wb;
+}
+
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = NULL;
+
+	if (inode_cgwb_enabled(inode)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		if (page) {
+			memcg_css = mem_cgroup_css_from_page(page);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+		} else {
+			/* must pin memcg_css, see wb_get_create() */
+			memcg_css = task_get_css(current, memory_cgrp_id);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+			css_put(memcg_css);
+		}
+	}
+
+	if (!wb)
+		wb = &bdi->wb;
+
+	/*
+	 * There may be multiple instances of this function racing to
+	 * update the same inode.  Use cmpxchg() to tell the winner.
+	 */
+	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+		wb_put(wb);
+}
+
+static void cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+	bdi->wb.memcg_css = mem_cgroup_root_css;
+	bdi->wb.blkcg_css = blkcg_root_css;
+	bdi->wb_congested.blkcg_id = 1;
+	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+	bdi->cgwb_congested_tree = RB_ROOT;
+	atomic_set(&bdi->usage_cnt, 1);
+}
+
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+
+	WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+
+	spin_lock_irq(&cgwb_lock);
+	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+		cgwb_kill(*slot);
+	spin_unlock_irq(&cgwb_lock);
+
+	/*
+	 * All cgwb's and their congested states must be shutdown and
+	 * released before returning.  Drain the usage counter to wait for
+	 * all cgwb's and cgwb_congested's ever created on @bdi.
+	 */
+	atomic_dec(&bdi->usage_cnt);
+	wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+}
+
+/**
+ * wb_memcg_offline - kill all wb's associated with a memcg being offlined
+ * @memcg: memcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @memcg.
+ */
+void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+	LIST_HEAD(to_destroy);
+	struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+	struct bdi_writeback *wb, *next;
+
+	spin_lock_irq(&cgwb_lock);
+	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
+		cgwb_kill(wb);
+	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
+	spin_unlock_irq(&cgwb_lock);
+}
+
+/**
+ * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
+ * @blkcg: blkcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @blkcg.
+ */
+void wb_blkcg_offline(struct blkcg *blkcg)
+{
+	LIST_HEAD(to_destroy);
+	struct bdi_writeback *wb, *next;
+
+	spin_lock_irq(&cgwb_lock);
+	list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+		cgwb_kill(wb);
+	blkcg->cgwb_list.next = NULL;	/* prevent new wb's */
+	spin_unlock_irq(&cgwb_lock);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 int bdi_init(struct backing_dev_info *bdi)
 {
 	int err;
@@ -386,6 +781,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->wb_congested.state = 0;
 	bdi->wb.congested = &bdi->wb_congested;
 
+	cgwb_bdi_init(bdi);
 	return 0;
 }
 EXPORT_SYMBOL(bdi_init);
@@ -459,6 +855,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	/* make sure nobody finds us on the bdi_list anymore */
 	bdi_remove_from_list(bdi);
 	wb_shutdown(&bdi->wb);
+	cgwb_bdi_destroy(bdi);
 
 	if (bdi->dev) {
 		bdi_debug_unregister(bdi);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5c270a03729c..49e5aa6abca6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -348,6 +348,10 @@ struct mem_cgroup {
 	atomic_t	numainfo_updating;
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct list_head cgwb_list;
+#endif
+
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
 	spinlock_t event_list_lock;
@@ -4030,6 +4034,15 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 }
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+	return &memcg->cgwb_list;
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * DO NOT USE IN NEW FILES.
  *
@@ -4494,7 +4507,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_MEMCG_KMEM
 	memcg->kmemcg_id = -1;
 #endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+	INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
 	return &memcg->css;
 
 free_out:
@@ -4582,6 +4597,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	vmpressure_cleanup(&memcg->vmpressure);
 
 	memcg_deactivate_kmem(memcg);
+
+	wb_memcg_offline(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 78ef5511a198..9b95cf80b407 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2097,16 +2097,21 @@ int __set_page_dirty_no_writeback(struct page *page)
 void account_page_dirtied(struct page *page, struct address_space *mapping,
 			  struct mem_cgroup *memcg)
 {
+	struct inode *inode = mapping->host;
+
 	trace_writeback_dirty_page(page, mapping);
 
 	if (mapping_cap_account_dirty(mapping)) {
-		struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+		struct bdi_writeback *wb;
+
+		inode_attach_wb(inode, page);
+		wb = inode_to_wb(inode);
 
 		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_DIRTIED);
-		__inc_wb_stat(&bdi->wb, WB_RECLAIMABLE);
-		__inc_wb_stat(&bdi->wb, WB_DIRTIED);
+		__inc_wb_stat(wb, WB_RECLAIMABLE);
+		__inc_wb_stat(wb, WB_DIRTIED);
 		task_io_account_write(PAGE_CACHE_SIZE);
 		current->nr_dirtied++;
 		this_cpu_inc(bdp_ratelimits);
-- 
cgit v1.2.3


From ce7acfeaf0363c8b75810908448f61af04d38f91 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:38 -0400
Subject: writeback, blkcg: associate each blkcg_gq with the corresponding
 bdi_writeback_congested

A blkg (blkcg_gq) can be congested and decongested independently from
other blkgs on the same request_queue.  Accordingly, for cgroup
writeback support, the congestion status at bdi (backing_dev_info)
should be split and updated separately from matching blkg's.

This patch prepares by adding blkg->wb_congested and associating a
blkg with its matching per-blkcg bdi_writeback_congested on creation.

v2: Updated to associate bdi_writeback_congested instead of
    bdi_writeback.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 17 +++++++++++++++--
 include/linux/blk-cgroup.h |  6 ++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 979cfdbb94e0..31610ae0ebff 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -182,6 +182,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 				    struct blkcg_gq *new_blkg)
 {
 	struct blkcg_gq *blkg;
+	struct bdi_writeback_congested *wb_congested;
 	int i, ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
@@ -193,22 +194,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		goto err_free_blkg;
 	}
 
+	wb_congested = wb_congested_get_create(&q->backing_dev_info,
+					       blkcg->css.id, GFP_ATOMIC);
+	if (!wb_congested) {
+		ret = -ENOMEM;
+		goto err_put_css;
+	}
+
 	/* allocate */
 	if (!new_blkg) {
 		new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
 		if (unlikely(!new_blkg)) {
 			ret = -ENOMEM;
-			goto err_put_css;
+			goto err_put_congested;
 		}
 	}
 	blkg = new_blkg;
+	blkg->wb_congested = wb_congested;
 
 	/* link parent */
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 		if (WARN_ON_ONCE(!blkg->parent)) {
 			ret = -EINVAL;
-			goto err_put_css;
+			goto err_put_congested;
 		}
 		blkg_get(blkg->parent);
 	}
@@ -245,6 +254,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	blkg_put(blkg);
 	return ERR_PTR(ret);
 
+err_put_congested:
+	wb_congested_put(wb_congested);
 err_put_css:
 	css_put(&blkcg->css);
 err_free_blkg:
@@ -391,6 +402,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
 	if (blkg->parent)
 		blkg_put(blkg->parent);
 
+	wb_congested_put(blkg->wb_congested);
+
 	blkg_free(blkg);
 }
 EXPORT_SYMBOL_GPL(__blkg_release_rcu);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 3033eb173eb4..07a32b813ed8 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -99,6 +99,12 @@ struct blkcg_gq {
 	struct hlist_node		blkcg_node;
 	struct blkcg			*blkcg;
 
+	/*
+	 * Each blkg gets congested separately and the congestion state is
+	 * propagated to the matching bdi_writeback_congested.
+	 */
+	struct bdi_writeback_congested	*wb_congested;
+
 	/* all non-root blkcg_gq's are guaranteed to have access to parent */
 	struct blkcg_gq			*parent;
 
-- 
cgit v1.2.3


From ec8a6f2643923ee5b74d24fa8d134240379f436b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:41 -0400
Subject: writeback: make congestion functions per bdi_writeback

Currently, all congestion functions take bdi (backing_dev_info) and
always operate on the root wb (bdi->wb) and the congestion state from
the block layer is propagated only for the root blkcg.  This patch
introduces {set|clear}_wb_congested() and wb_congested() which take a
bdi_writeback_congested and bdi_writeback respectively.  The bdi
counteparts are now wrappers invoking the wb based functions on
@bdi->wb.

While converting clear_bdi_congested() to clear_wb_congested(), the
local variable declaration order between @wqh and @bit is swapped for
cosmetic reason.

This patch just adds the new wb based functions.  The following
patches will apply them.

v2: Updated for bdi_writeback_congested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h | 14 +++++++++++--
 include/linux/backing-dev.h      | 45 +++++++++++++++++++++++-----------------
 mm/backing-dev.c                 | 22 ++++++++++----------
 3 files changed, 49 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a1e9c407a59a..eb386766b5f3 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -163,7 +163,17 @@ enum {
 	BLK_RW_SYNC	= 1,
 };
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync);
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync);
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync);
+
+static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+	clear_wb_congested(bdi->wb.congested, sync);
+}
+
+static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+	set_wb_congested(bdi->wb.congested, sync);
+}
 
 #endif	/* __LINUX_BACKING_DEV_DEFS_H */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 8ae59df2e3d1..2c498a2a8268 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -167,27 +167,13 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 	return sb->s_bdi;
 }
 
-static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 {
-	if (bdi->congested_fn)
-		return bdi->congested_fn(bdi->congested_data, bdi_bits);
-	return (bdi->wb.congested->state & bdi_bits);
-}
-
-static inline int bdi_read_congested(struct backing_dev_info *bdi)
-{
-	return bdi_congested(bdi, 1 << WB_sync_congested);
-}
-
-static inline int bdi_write_congested(struct backing_dev_info *bdi)
-{
-	return bdi_congested(bdi, 1 << WB_async_congested);
-}
+	struct backing_dev_info *bdi = wb->bdi;
 
-static inline int bdi_rw_congested(struct backing_dev_info *bdi)
-{
-	return bdi_congested(bdi, (1 << WB_sync_congested) |
-				  (1 << WB_async_congested));
+	if (bdi->congested_fn)
+		return bdi->congested_fn(bdi->congested_data, cong_bits);
+	return wb->congested->state & cong_bits;
 }
 
 long congestion_wait(int sync, long timeout);
@@ -454,4 +440,25 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
+static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
+{
+	return wb_congested(&bdi->wb, cong_bits);
+}
+
+static inline int bdi_read_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, 1 << WB_sync_congested);
+}
+
+static inline int bdi_write_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, 1 << WB_async_congested);
+}
+
+static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, (1 << WB_sync_congested) |
+				  (1 << WB_async_congested));
+}
+
 #endif	/* _LINUX_BACKING_DEV_H */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 4c9386c98ec1..5029c4ad2f36 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -896,31 +896,31 @@ static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
 	};
-static atomic_t nr_bdi_congested[2];
+static atomic_t nr_wb_congested[2];
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
 {
-	enum wb_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
+	enum wb_state bit;
 
 	bit = sync ? WB_sync_congested : WB_async_congested;
-	if (test_and_clear_bit(bit, &bdi->wb.congested->state))
-		atomic_dec(&nr_bdi_congested[sync]);
+	if (test_and_clear_bit(bit, &congested->state))
+		atomic_dec(&nr_wb_congested[sync]);
 	smp_mb__after_atomic();
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
 }
-EXPORT_SYMBOL(clear_bdi_congested);
+EXPORT_SYMBOL(clear_wb_congested);
 
-void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
 {
 	enum wb_state bit;
 
 	bit = sync ? WB_sync_congested : WB_async_congested;
-	if (!test_and_set_bit(bit, &bdi->wb.congested->state))
-		atomic_inc(&nr_bdi_congested[sync]);
+	if (!test_and_set_bit(bit, &congested->state))
+		atomic_inc(&nr_wb_congested[sync]);
 }
-EXPORT_SYMBOL(set_bdi_congested);
+EXPORT_SYMBOL(set_wb_congested);
 
 /**
  * congestion_wait - wait for a backing_dev to become uncongested
@@ -979,7 +979,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 	 * encountered in the current zone, yield if necessary instead
 	 * of sleeping on the congestion queue
 	 */
-	if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+	if (atomic_read(&nr_wb_congested[sync]) == 0 ||
 	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
 		cond_resched();
 
-- 
cgit v1.2.3


From d40f75a06dd675808eed385d490ba9468200b23f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:42 -0400
Subject: writeback, blkcg: restructure blk_{set|clear}_queue_congested()

blk_{set|clear}_queue_congested() take @q and set or clear,
respectively, the congestion state of its bdi's root wb.  Because bdi
used to be able to handle congestion state only on the root wb, the
callers of those functions tested whether the congestion is on the
root blkcg and skipped if not.

This is cumbersome and makes implementation of per cgroup
bdi_writeback congestion state propagation difficult.  This patch
renames blk_{set|clear}_queue_congested() to
blk_{set|clear}_congested(), and makes them take request_list instead
of request_queue and test whether the specified request_list is the
root one before updating bdi_writeback congestion state.  This makes
the tests in the callers unnecessary and simplifies them.

As there are no external users of these functions, the definitions are
moved from include/linux/blkdev.h to block/blk-core.c.

This patch doesn't introduce any noticeable behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 62 ++++++++++++++++++++++++++++++--------------------
 include/linux/blkdev.h | 19 ----------------
 2 files changed, 37 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index c114a61590bc..f46a43ef1a94 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -63,6 +63,28 @@ struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
+static void blk_clear_congested(struct request_list *rl, int sync)
+{
+	if (rl != &rl->q->root_rl)
+		return;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	clear_wb_congested(rl->blkg->wb_congested, sync);
+#else
+	clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
+static void blk_set_congested(struct request_list *rl, int sync)
+{
+	if (rl != &rl->q->root_rl)
+		return;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	set_wb_congested(rl->blkg->wb_congested, sync);
+#else
+	set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
 void blk_queue_congestion_threshold(struct request_queue *q)
 {
 	int nr;
@@ -842,13 +864,8 @@ static void __freed_request(struct request_list *rl, int sync)
 {
 	struct request_queue *q = rl->q;
 
-	/*
-	 * bdi isn't aware of blkcg yet.  As all async IOs end up root
-	 * blkcg anyway, just use root blkcg state.
-	 */
-	if (rl == &q->root_rl &&
-	    rl->count[sync] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, sync);
+	if (rl->count[sync] < queue_congestion_off_threshold(q))
+		blk_clear_congested(rl, sync);
 
 	if (rl->count[sync] + 1 <= q->nr_requests) {
 		if (waitqueue_active(&rl->wait[sync]))
@@ -881,25 +898,25 @@ static void freed_request(struct request_list *rl, unsigned int flags)
 int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
 {
 	struct request_list *rl;
+	int on_thresh, off_thresh;
 
 	spin_lock_irq(q->queue_lock);
 	q->nr_requests = nr;
 	blk_queue_congestion_threshold(q);
+	on_thresh = queue_congestion_on_threshold(q);
+	off_thresh = queue_congestion_off_threshold(q);
 
-	/* congestion isn't cgroup aware and follows root blkcg for now */
-	rl = &q->root_rl;
-
-	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_SYNC);
-	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_SYNC);
+	blk_queue_for_each_rl(rl, q) {
+		if (rl->count[BLK_RW_SYNC] >= on_thresh)
+			blk_set_congested(rl, BLK_RW_SYNC);
+		else if (rl->count[BLK_RW_SYNC] < off_thresh)
+			blk_clear_congested(rl, BLK_RW_SYNC);
 
-	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, BLK_RW_ASYNC);
-	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, BLK_RW_ASYNC);
+		if (rl->count[BLK_RW_ASYNC] >= on_thresh)
+			blk_set_congested(rl, BLK_RW_ASYNC);
+		else if (rl->count[BLK_RW_ASYNC] < off_thresh)
+			blk_clear_congested(rl, BLK_RW_ASYNC);
 
-	blk_queue_for_each_rl(rl, q) {
 		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
 			blk_set_rl_full(rl, BLK_RW_SYNC);
 		} else {
@@ -1009,12 +1026,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
 				}
 			}
 		}
-		/*
-		 * bdi isn't aware of blkcg yet.  As all async IOs end up
-		 * root blkcg anyway, just use root blkcg state.
-		 */
-		if (rl == &q->root_rl)
-			blk_set_queue_congested(q, is_sync);
+		blk_set_congested(rl, is_sync);
 	}
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 60d2726a6b62..ab4a27852f1b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -790,25 +790,6 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 
 extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
 
-/*
- * A queue has just exitted congestion.  Note this in the global counter of
- * congested queues, and wake up anyone who was waiting for requests to be
- * put back.
- */
-static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
-{
-	clear_bdi_congested(&q->backing_dev_info, sync);
-}
-
-/*
- * A queue has just entered congestion.  Flag that in the queue's VM-visible
- * state flags and increment the global gounter of congested queues.
- */
-static inline void blk_set_queue_congested(struct request_queue *q, int sync)
-{
-	set_bdi_congested(&q->backing_dev_info, sync);
-}
-
 extern void blk_start_queue(struct request_queue *q);
 extern void blk_stop_queue(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
-- 
cgit v1.2.3


From 703c270887bb5106c4c46a00cc7477d30d5e04f5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:44 -0400
Subject: writeback: implement and use inode_congested()

In several places, bdi_congested() and its wrappers are used to
determine whether more IOs should be issued.  With cgroup writeback
support, this question can't be answered solely based on the bdi
(backing_dev_info).  It's dependent on whether the filesystem and bdi
support cgroup writeback and the blkcg the inode is associated with.

This patch implements inode_congested() and its wrappers which take
@inode and determines the congestion state considering cgroup
writeback.  The new functions replace bdi_*congested() calls in places
where the query is about specific inode and task.

There are several filesystem users which also fit this criteria but
they should be updated when each filesystem implements cgroup
writeback support.

v2: Now that a given inode is associated with only one wb, congestion
    state can be determined independent from the asking task.  Drop
    @task.  Spotted by Vivek.  Also, converted to take @inode instead
    of @mapping and renamed to inode_congested().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           | 29 +++++++++++++++++++++++++++++
 include/linux/backing-dev.h | 22 ++++++++++++++++++++++
 mm/fadvise.c                |  2 +-
 mm/readahead.c              |  2 +-
 mm/vmscan.c                 | 11 +++++------
 5 files changed, 58 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 99a2440c5588..7ec491b1be04 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -142,6 +142,35 @@ static void __wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 	wb_queue_work(wb, work);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * inode_congested - test whether an inode is congested
+ * @inode: inode to test for congestion
+ * @cong_bits: mask of WB_[a]sync_congested bits to test
+ *
+ * Tests whether @inode is congested.  @cong_bits is the mask of congestion
+ * bits to test and the return value is the mask of set bits.
+ *
+ * If cgroup writeback is enabled for @inode, the congestion state is
+ * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
+ * associated with @inode is congested; otherwise, the root wb's congestion
+ * state is used.
+ */
+int inode_congested(struct inode *inode, int cong_bits)
+{
+	if (inode) {
+		struct bdi_writeback *wb = inode_to_wb(inode);
+		if (wb)
+			return wb_congested(wb, cong_bits);
+	}
+
+	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+EXPORT_SYMBOL_GPL(inode_congested);
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /**
  * bdi_start_writeback - start writeback
  * @bdi: the backing device to write from
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 2c498a2a8268..6f0882105f95 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -230,6 +230,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 void __inode_attach_wb(struct inode *inode, struct page *page);
 void wb_memcg_offline(struct mem_cgroup *memcg);
 void wb_blkcg_offline(struct blkcg *blkcg);
+int inode_congested(struct inode *inode, int cong_bits);
 
 /**
  * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
@@ -438,8 +439,29 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 {
 }
 
+static inline int inode_congested(struct inode *inode, int cong_bits)
+{
+	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
+static inline int inode_read_congested(struct inode *inode)
+{
+	return inode_congested(inode, 1 << WB_sync_congested);
+}
+
+static inline int inode_write_congested(struct inode *inode)
+{
+	return inode_congested(inode, 1 << WB_async_congested);
+}
+
+static inline int inode_rw_congested(struct inode *inode)
+{
+	return inode_congested(inode, (1 << WB_sync_congested) |
+				      (1 << WB_async_congested));
+}
+
 static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
 {
 	return wb_congested(&bdi->wb, cong_bits);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 4a3907cf79f8..b8a5bc66b0c0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -115,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 	case POSIX_FADV_NOREUSE:
 		break;
 	case POSIX_FADV_DONTNEED:
-		if (!bdi_write_congested(bdi))
+		if (!inode_write_congested(mapping->host))
 			__filemap_fdatawrite_range(mapping, offset, endbyte,
 						   WB_SYNC_NONE);
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 935675844b2e..60cd846a9a44 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
 	/*
 	 * Defer asynchronous read-ahead on IO congestion.
 	 */
-	if (bdi_read_congested(inode_to_bdi(mapping->host)))
+	if (inode_read_congested(mapping->host))
 		return;
 
 	/* do read-ahead */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7582f9fcda92..f46339870147 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -452,14 +452,13 @@ static inline int is_page_cache_freeable(struct page *page)
 	return page_count(page) - page_has_private(page) == 2;
 }
 
-static int may_write_to_queue(struct backing_dev_info *bdi,
-			      struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
 {
 	if (current->flags & PF_SWAPWRITE)
 		return 1;
-	if (!bdi_write_congested(bdi))
+	if (!inode_write_congested(inode))
 		return 1;
-	if (bdi == current->backing_dev_info)
+	if (inode_to_bdi(inode) == current->backing_dev_info)
 		return 1;
 	return 0;
 }
@@ -538,7 +537,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	}
 	if (mapping->a_ops->writepage == NULL)
 		return PAGE_ACTIVATE;
-	if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+	if (!may_write_to_inode(mapping->host, sc))
 		return PAGE_KEEP;
 
 	if (clear_page_dirty_for_io(page)) {
@@ -924,7 +923,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		mapping = page_mapping(page);
 		if (((dirty || writeback) && mapping &&
-		     bdi_write_congested(inode_to_bdi(mapping->host))) ||
+		     inode_write_congested(mapping->host)) ||
 		    (writeback && PageReclaim(page)))
 			nr_congested++;
 
-- 
cgit v1.2.3


From d6c10f1fc8626dc55946f4768ae322b4c57b07dd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:45 -0400
Subject: writeback: implement WB_has_dirty_io wb_state flag

Currently, wb_has_dirty_io() determines whether a wb (bdi_writeback)
has any dirty inode by testing all three IO lists on each invocation
without actively keeping track.  For cgroup writeback support, a
single bdi will host multiple wb's each of which will host dirty
inodes separately and we'll need to make bdi_has_dirty_io(), which
currently only represents the root wb, aggregate has_dirty_io from all
member wb's, which requires tracking transitions in has_dirty_io state
on each wb.

This patch introduces inode_wb_list_{move|del}_locked() to consolidate
IO list operations leaving queue_io() the only other function which
directly manipulates IO lists (via move_expired_inodes()).  All three
functions are updated to call wb_io_lists_[de]populated() which keep
track of whether the wb has dirty inodes or not and record it using
the new WB_has_dirty_io flag.  inode_wb_list_moved_locked()'s return
value indicates whether the wb had no dirty inodes before.

mark_inode_dirty() is restructured so that the return value of
inode_wb_list_move_locked() can be used for deciding whether to wake
up the wb.

While at it, change {bdi|wb}_has_dirty_io()'s return values to bool.
These functions were returning 0 and 1 before.  Also, add a comment
explaining the synchronization of wb_state flags.

v2: Updated to accommodate b_dirty_time.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c                | 110 ++++++++++++++++++++++++++++++---------
 include/linux/backing-dev-defs.h |   1 +
 include/linux/backing-dev.h      |   8 ++-
 mm/backing-dev.c                 |   2 +-
 4 files changed, 91 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7ec491b1be04..0a90dc557748 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -93,6 +93,66 @@ static inline struct inode *wb_inode(struct list_head *head)
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
 
+static bool wb_io_lists_populated(struct bdi_writeback *wb)
+{
+	if (wb_has_dirty_io(wb)) {
+		return false;
+	} else {
+		set_bit(WB_has_dirty_io, &wb->state);
+		return true;
+	}
+}
+
+static void wb_io_lists_depopulated(struct bdi_writeback *wb)
+{
+	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
+	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io))
+		clear_bit(WB_has_dirty_io, &wb->state);
+}
+
+/**
+ * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * @inode: inode to be moved
+ * @wb: target bdi_writeback
+ * @head: one of @wb->b_{dirty|io|more_io}
+ *
+ * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Returns %true if @inode is the first occupant of the !dirty_time IO
+ * lists; otherwise, %false.
+ */
+static bool inode_wb_list_move_locked(struct inode *inode,
+				      struct bdi_writeback *wb,
+				      struct list_head *head)
+{
+	assert_spin_locked(&wb->list_lock);
+
+	list_move(&inode->i_wb_list, head);
+
+	/* dirty_time doesn't count as dirty_io until expiration */
+	if (head != &wb->b_dirty_time)
+		return wb_io_lists_populated(wb);
+
+	wb_io_lists_depopulated(wb);
+	return false;
+}
+
+/**
+ * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * @inode: inode to be removed
+ * @wb: bdi_writeback @inode is being removed from
+ *
+ * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
+ * clear %WB_has_dirty_io if all are empty afterwards.
+ */
+static void inode_wb_list_del_locked(struct inode *inode,
+				     struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+
+	list_del_init(&inode->i_wb_list);
+	wb_io_lists_depopulated(wb);
+}
+
 static void wb_wakeup(struct bdi_writeback *wb)
 {
 	spin_lock_bh(&wb->work_lock);
@@ -217,7 +277,7 @@ void inode_wb_list_del(struct inode *inode)
 	struct bdi_writeback *wb = inode_to_wb(inode);
 
 	spin_lock(&wb->list_lock);
-	list_del_init(&inode->i_wb_list);
+	inode_wb_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 }
 
@@ -232,7 +292,6 @@ void inode_wb_list_del(struct inode *inode)
  */
 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 {
-	assert_spin_locked(&wb->list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
@@ -240,7 +299,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	list_move(&inode->i_wb_list, &wb->b_dirty);
+	inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
 }
 
 /*
@@ -248,8 +307,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  */
 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 {
-	assert_spin_locked(&wb->list_lock);
-	list_move(&inode->i_wb_list, &wb->b_more_io);
+	inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -358,6 +416,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
 	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
 				     EXPIRE_DIRTY_ATIME, work);
+	if (moved)
+		wb_io_lists_populated(wb);
 	trace_writeback_queue_io(wb, work, moved);
 }
 
@@ -483,10 +543,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		redirty_tail(inode, wb);
 	} else if (inode->i_state & I_DIRTY_TIME) {
 		inode->dirtied_when = jiffies;
-		list_move(&inode->i_wb_list, &wb->b_dirty_time);
+		inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
-		list_del_init(&inode->i_wb_list);
+		inode_wb_list_del_locked(inode, wb);
 	}
 }
 
@@ -628,7 +688,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * touch it. See comment above for explanation.
 	 */
 	if (!(inode->i_state & I_DIRTY_ALL))
-		list_del_init(&inode->i_wb_list);
+		inode_wb_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
 out:
@@ -1327,37 +1387,39 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
+			struct list_head *dirty_list;
 			bool wakeup_bdi = false;
 			bdi = inode_to_bdi(inode);
 
 			spin_unlock(&inode->i_lock);
 			spin_lock(&bdi->wb.list_lock);
-			if (bdi_cap_writeback_dirty(bdi)) {
-				WARN(!test_bit(WB_registered, &bdi->wb.state),
-				     "bdi-%s not registered\n", bdi->name);
 
-				/*
-				 * If this is the first dirty inode for this
-				 * bdi, we have to wake-up the corresponding
-				 * bdi thread to make sure background
-				 * write-back happens later.
-				 */
-				if (!wb_has_dirty_io(&bdi->wb))
-					wakeup_bdi = true;
-			}
+			WARN(bdi_cap_writeback_dirty(bdi) &&
+			     !test_bit(WB_registered, &bdi->wb.state),
+			     "bdi-%s not registered\n", bdi->name);
 
 			inode->dirtied_when = jiffies;
 			if (dirtytime)
 				inode->dirtied_time_when = jiffies;
+
 			if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
-				list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+				dirty_list = &bdi->wb.b_dirty;
 			else
-				list_move(&inode->i_wb_list,
-					  &bdi->wb.b_dirty_time);
+				dirty_list = &bdi->wb.b_dirty_time;
+
+			wakeup_bdi = inode_wb_list_move_locked(inode, &bdi->wb,
+							       dirty_list);
+
 			spin_unlock(&bdi->wb.list_lock);
 			trace_writeback_dirty_inode_enqueue(inode);
 
-			if (wakeup_bdi)
+			/*
+			 * If this is the first dirty inode for this bdi,
+			 * we have to wake-up the corresponding bdi thread
+			 * to make sure background write-back happens
+			 * later.
+			 */
+			if (bdi_cap_writeback_dirty(bdi) && wakeup_bdi)
 				wb_wakeup_delayed(&bdi->wb);
 			return;
 		}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index eb386766b5f3..7a94b7850b7c 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -21,6 +21,7 @@ struct dentry;
 enum wb_state {
 	WB_registered,		/* bdi_register() was done */
 	WB_writeback_running,	/* Writeback is in progress */
+	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
 };
 
 enum wb_congested_state {
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 6f0882105f95..3c8403c012ce 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -29,7 +29,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
 void wb_workfn(struct work_struct *work);
-int bdi_has_dirty_io(struct backing_dev_info *bdi);
+bool bdi_has_dirty_io(struct backing_dev_info *bdi);
 void wb_wakeup_delayed(struct bdi_writeback *wb);
 
 extern spinlock_t bdi_lock;
@@ -37,11 +37,9 @@ extern struct list_head bdi_list;
 
 extern struct workqueue_struct *bdi_wq;
 
-static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return !list_empty(&wb->b_dirty) ||
-	       !list_empty(&wb->b_io) ||
-	       !list_empty(&wb->b_more_io);
+	return test_bit(WB_has_dirty_io, &wb->state);
 }
 
 static inline void __add_wb_stat(struct bdi_writeback *wb,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5029c4ad2f36..161ddf12b862 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -256,7 +256,7 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
-int bdi_has_dirty_io(struct backing_dev_info *bdi)
+bool bdi_has_dirty_io(struct backing_dev_info *bdi)
 {
 	return wb_has_dirty_io(&bdi->wb);
 }
-- 
cgit v1.2.3


From 766a9d6e60578f1ef6de71f89f022084f8bffc82 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:46 -0400
Subject: writeback: implement backing_dev_info->tot_write_bandwidth

cgroup writeback support needs to keep track of the sum of
avg_write_bandwidth of all wb's (bdi_writeback's) with dirty inodes to
distribute write workload.  This patch adds bdi->tot_write_bandwidth
and updates inode_wb_list_move_locked(), inode_wb_list_del_locked()
and wb_update_write_bandwidth() to adjust it as wb's gain and lose
dirty inodes and its avg_write_bandwidth gets updated.

As the update events are not synchronized with each other,
bdi->tot_write_bandwidth is an atomic_long_t.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c                | 7 ++++++-
 include/linux/backing-dev-defs.h | 2 ++
 mm/page-writeback.c              | 3 +++
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0a90dc557748..bbccf68b38db 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -99,6 +99,8 @@ static bool wb_io_lists_populated(struct bdi_writeback *wb)
 		return false;
 	} else {
 		set_bit(WB_has_dirty_io, &wb->state);
+		atomic_long_add(wb->avg_write_bandwidth,
+				&wb->bdi->tot_write_bandwidth);
 		return true;
 	}
 }
@@ -106,8 +108,11 @@ static bool wb_io_lists_populated(struct bdi_writeback *wb)
 static void wb_io_lists_depopulated(struct bdi_writeback *wb)
 {
 	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
-	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io))
+	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
 		clear_bit(WB_has_dirty_io, &wb->state);
+		atomic_long_sub(wb->avg_write_bandwidth,
+				&wb->bdi->tot_write_bandwidth);
+	}
 }
 
 /**
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 7a94b7850b7c..d631a61f4023 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -142,6 +142,8 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
+	atomic_long_t tot_write_bandwidth; /* sum of active avg_write_bw */
+
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
 	struct bdi_writeback_congested wb_congested; /* its congested state */
 #ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e31dea94d116..c95eb24dd652 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -881,6 +881,9 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
 		avg += (old - avg) >> 3;
 
 out:
+	if (wb_has_dirty_io(wb))
+		atomic_long_add(avg - wb->avg_write_bandwidth,
+				&wb->bdi->tot_write_bandwidth);
 	wb->write_bandwidth = bw;
 	wb->avg_write_bandwidth = avg;
 }
-- 
cgit v1.2.3


From 95a46c65e3c09edb9f17dabf2dc16670cd328739 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:47 -0400
Subject: writeback: make bdi_has_dirty_io() take multiple bdi_writeback's into
 account

bdi_has_dirty_io() used to only reflect whether the root wb
(bdi_writeback) has dirty inodes.  For cgroup writeback support, it
needs to take all active wb's into account.  If any wb on the bdi has
dirty inodes, bdi_has_dirty_io() should return true.

To achieve that, as inode_wb_list_{move|del}_locked() now keep track
of the dirty state transition of each wb, the number of dirty wbs can
be counted in the bdi; however, bdi is already aggregating
wb->avg_write_bandwidth which can easily be guaranteed to be > 0 when
there are any dirty inodes by ensuring wb->avg_write_bandwidth can't
dip below 1.  bdi_has_dirty_io() can simply test whether
bdi->tot_write_bandwidth is zero or not.

While this bumps the value of wb->avg_write_bandwidth to one when it
used to be zero, this shouldn't cause any meaningful behavior
difference.

bdi_has_dirty_io() is made an inline function which tests whether
->tot_write_bandwidth is non-zero.  Also, WARN_ON_ONCE()'s on its
value are added to inode_wb_list_{move|del}_locked().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c                |  5 +++--
 include/linux/backing-dev-defs.h |  8 ++++++--
 include/linux/backing-dev.h      | 10 +++++++++-
 mm/backing-dev.c                 |  5 -----
 mm/page-writeback.c              | 10 +++++++---
 5 files changed, 25 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index bbccf68b38db..c98d3923d9a9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -99,6 +99,7 @@ static bool wb_io_lists_populated(struct bdi_writeback *wb)
 		return false;
 	} else {
 		set_bit(WB_has_dirty_io, &wb->state);
+		WARN_ON_ONCE(!wb->avg_write_bandwidth);
 		atomic_long_add(wb->avg_write_bandwidth,
 				&wb->bdi->tot_write_bandwidth);
 		return true;
@@ -110,8 +111,8 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb)
 	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
 	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
 		clear_bit(WB_has_dirty_io, &wb->state);
-		atomic_long_sub(wb->avg_write_bandwidth,
-				&wb->bdi->tot_write_bandwidth);
+		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
+					&wb->bdi->tot_write_bandwidth) < 0);
 	}
 }
 
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index d631a61f4023..8c857d723023 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -98,7 +98,7 @@ struct bdi_writeback {
 	unsigned long dirtied_stamp;
 	unsigned long written_stamp;	/* pages written at bw_time_stamp */
 	unsigned long write_bandwidth;	/* the estimated write bandwidth */
-	unsigned long avg_write_bandwidth; /* further smoothed write bw */
+	unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
 
 	/*
 	 * The base dirty throttle rate, re-calculated on every 200ms.
@@ -142,7 +142,11 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
-	atomic_long_t tot_write_bandwidth; /* sum of active avg_write_bw */
+	/*
+	 * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
+	 * any dirty wbs, which is depended upon by bdi_has_dirty().
+	 */
+	atomic_long_t tot_write_bandwidth;
 
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
 	struct bdi_writeback_congested wb_congested; /* its congested state */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 3c8403c012ce..0839e44105bd 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -29,7 +29,6 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
 void wb_workfn(struct work_struct *work);
-bool bdi_has_dirty_io(struct backing_dev_info *bdi);
 void wb_wakeup_delayed(struct bdi_writeback *wb);
 
 extern spinlock_t bdi_lock;
@@ -42,6 +41,15 @@ static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 	return test_bit(WB_has_dirty_io, &wb->state);
 }
 
+static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+	/*
+	 * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
+	 * any dirty wbs.  See wb_update_write_bandwidth().
+	 */
+	return atomic_long_read(&bdi->tot_write_bandwidth);
+}
+
 static inline void __add_wb_stat(struct bdi_writeback *wb,
 				 enum wb_stat_item item, s64 amount)
 {
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 161ddf12b862..d2f16fc9069a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -256,11 +256,6 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
-bool bdi_has_dirty_io(struct backing_dev_info *bdi)
-{
-	return wb_has_dirty_io(&bdi->wb);
-}
-
 /*
  * This function is used when the first inode for this wb is marked dirty. It
  * wakes-up the corresponding bdi thread which should then take care of the
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c95eb24dd652..99b88465096e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -881,9 +881,13 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
 		avg += (old - avg) >> 3;
 
 out:
-	if (wb_has_dirty_io(wb))
-		atomic_long_add(avg - wb->avg_write_bandwidth,
-				&wb->bdi->tot_write_bandwidth);
+	/* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
+	avg = max(avg, 1LU);
+	if (wb_has_dirty_io(wb)) {
+		long delta = avg - wb->avg_write_bandwidth;
+		WARN_ON_ONCE(atomic_long_add_return(delta,
+					&wb->bdi->tot_write_bandwidth) <= 0);
+	}
 	wb->write_bandwidth = bw;
 	wb->avg_write_bandwidth = avg;
 }
-- 
cgit v1.2.3


From ebe41ab0c79d5633123f6faa3265a1a63c5f22d8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:50 -0400
Subject: writeback: implement bdi_for_each_wb()

This will be used to implement bdi-wide operations which should be
distributed across all its cgroup bdi_writebacks.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev.h | 63 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0839e44105bd..c7979806baee 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -383,6 +383,61 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 	return inode->i_wb;
 }
 
+struct wb_iter {
+	int			start_blkcg_id;
+	struct radix_tree_iter	tree_iter;
+	void			**slot;
+};
+
+static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
+						   struct backing_dev_info *bdi)
+{
+	struct radix_tree_iter *titer = &iter->tree_iter;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (iter->start_blkcg_id >= 0) {
+		iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
+		iter->start_blkcg_id = -1;
+	} else {
+		iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
+	}
+
+	if (!iter->slot)
+		iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
+	if (iter->slot)
+		return *iter->slot;
+	return NULL;
+}
+
+static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
+						   struct backing_dev_info *bdi,
+						   int start_blkcg_id)
+{
+	iter->start_blkcg_id = start_blkcg_id;
+
+	if (start_blkcg_id)
+		return __wb_iter_next(iter, bdi);
+	else
+		return &bdi->wb;
+}
+
+/**
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * @wb_cur: cursor struct bdi_writeback pointer
+ * @bdi: bdi to walk wb's of
+ * @iter: pointer to struct wb_iter to be used as iteration buffer
+ * @start_blkcg_id: blkcg ID to start iteration from
+ *
+ * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
+ * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * to be used as temp storage during iteration.  rcu_read_lock() must be
+ * held throughout iteration.
+ */
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
+	for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);	\
+	     (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool inode_cgwb_enabled(struct inode *inode)
@@ -445,6 +500,14 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 {
 }
 
+struct wb_iter {
+	int		next_id;
+};
+
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
+	for ((iter)->next_id = (start_blkcg_id);			\
+	     ({	(wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
+
 static inline int inode_congested(struct inode *inode, int cong_bits)
 {
 	return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
-- 
cgit v1.2.3


From c00ddad39f512b1a81e25b7892217ce10efab0f1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:51 -0400
Subject: writeback: remove bdi_start_writeback()

bdi_start_writeback() is a thin wrapper on top of
__wb_start_writeback() which is used only by laptop_mode_timer_fn().
This patches removes bdi_start_writeback(), renames
__wb_start_writeback() to wb_start_writeback() and makes
laptop_mode_timer_fn() use it instead.

This doesn't cause any functional difference and will ease making
laptop_mode_timer_fn() cgroup writeback aware.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           | 68 +++++++++++++++++----------------------------
 include/linux/backing-dev.h |  4 +--
 mm/page-writeback.c         |  4 +--
 3 files changed, 29 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 921a9e43b1db..79f11af67357 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -184,33 +184,6 @@ out_unlock:
 	spin_unlock_bh(&wb->work_lock);
 }
 
-static void __wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
-				 bool range_cyclic, enum wb_reason reason)
-{
-	struct wb_writeback_work *work;
-
-	if (!wb_has_dirty_io(wb))
-		return;
-
-	/*
-	 * This is WB_SYNC_NONE writeback, so if allocation fails just
-	 * wakeup the thread for old dirty data writeback
-	 */
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
-	if (!work) {
-		trace_writeback_nowork(wb->bdi);
-		wb_wakeup(wb);
-		return;
-	}
-
-	work->sync_mode	= WB_SYNC_NONE;
-	work->nr_pages	= nr_pages;
-	work->range_cyclic = range_cyclic;
-	work->reason	= reason;
-
-	wb_queue_work(wb, work);
-}
-
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 /**
@@ -240,22 +213,31 @@ EXPORT_SYMBOL_GPL(inode_congested);
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
-/**
- * bdi_start_writeback - start writeback
- * @bdi: the backing device to write from
- * @nr_pages: the number of pages to write
- * @reason: reason why some writeback work was initiated
- *
- * Description:
- *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
- *   started when this function returns, we make no guarantees on
- *   completion. Caller need not hold sb s_umount semaphore.
- *
- */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-			enum wb_reason reason)
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+			bool range_cyclic, enum wb_reason reason)
 {
-	__wb_start_writeback(&bdi->wb, nr_pages, true, reason);
+	struct wb_writeback_work *work;
+
+	if (!wb_has_dirty_io(wb))
+		return;
+
+	/*
+	 * This is WB_SYNC_NONE writeback, so if allocation fails just
+	 * wakeup the thread for old dirty data writeback
+	 */
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work) {
+		trace_writeback_nowork(wb->bdi);
+		wb_wakeup(wb);
+		return;
+	}
+
+	work->sync_mode	= WB_SYNC_NONE;
+	work->nr_pages	= nr_pages;
+	work->range_cyclic = range_cyclic;
+	work->reason	= reason;
+
+	wb_queue_work(wb, work);
 }
 
 /**
@@ -1219,7 +1201,7 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
-		__wb_start_writeback(&bdi->wb, nr_pages, false, reason);
+		wb_start_writeback(&bdi->wb, nr_pages, false, reason);
 	rcu_read_unlock();
 }
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c7979806baee..0ff40c228bee 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -25,8 +25,8 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 void bdi_unregister(struct backing_dev_info *bdi);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-			enum wb_reason reason);
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+			bool range_cyclic, enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
 void wb_workfn(struct work_struct *work);
 void wb_wakeup_delayed(struct bdi_writeback *wb);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9b55f12040e6..6301af28da21 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1729,8 +1729,8 @@ void laptop_mode_timer_fn(unsigned long data)
 	 * threshold
 	 */
 	if (bdi_has_dirty_io(&q->backing_dev_info))
-		bdi_start_writeback(&q->backing_dev_info, nr_pages,
-					WB_REASON_LAPTOP_TIMER);
+		wb_start_writeback(&q->backing_dev_info.wb, nr_pages, true,
+				   WB_REASON_LAPTOP_TIMER);
 }
 
 /*
-- 
cgit v1.2.3


From bc05873dccd27d75d6acdf812c3edfb181f1ba17 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:53 -0400
Subject: writeback: make writeback_in_progress() take bdi_writeback instead of
 backing_dev_info

writeback_in_progress() currently takes @bdi and returns whether
writeback is in progress on its root wb (bdi_writeback).  In
preparation for cgroup writeback support, make it take wb instead.
While at it, make it an inline function.

This patch doesn't make any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           | 15 +--------------
 include/linux/backing-dev.h | 12 +++++++++++-
 mm/page-writeback.c         |  4 ++--
 3 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 79f11af67357..45baf6c89b99 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -65,19 +65,6 @@ struct wb_writeback_work {
  */
 unsigned int dirtytime_expire_interval = 12 * 60 * 60;
 
-/**
- * writeback_in_progress - determine whether there is writeback in progress
- * @bdi: the device's backing_dev_info structure.
- *
- * Determine whether there is writeback waiting to be handled against a
- * backing device.
- */
-int writeback_in_progress(struct backing_dev_info *bdi)
-{
-	return test_bit(WB_writeback_running, &bdi->wb.state);
-}
-EXPORT_SYMBOL(writeback_in_progress);
-
 static inline struct inode *wb_inode(struct list_head *head)
 {
 	return list_entry(head, struct inode, i_wb_list);
@@ -1532,7 +1519,7 @@ int try_to_writeback_inodes_sb_nr(struct super_block *sb,
 				  unsigned long nr,
 				  enum wb_reason reason)
 {
-	if (writeback_in_progress(sb->s_bdi))
+	if (writeback_in_progress(&sb->s_bdi->wb))
 		return 1;
 
 	if (!down_read_trylock(&sb->s_umount))
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0ff40c228bee..f04956c900ec 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -156,7 +156,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 
 extern struct backing_dev_info noop_backing_dev_info;
 
-int writeback_in_progress(struct backing_dev_info *bdi);
+/**
+ * writeback_in_progress - determine whether there is writeback in progress
+ * @wb: bdi_writeback of interest
+ *
+ * Determine whether there is writeback waiting to be handled against a
+ * bdi_writeback.
+ */
+static inline bool writeback_in_progress(struct bdi_writeback *wb)
+{
+	return test_bit(WB_writeback_running, &wb->state);
+}
 
 static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 682e3a6a8b2e..e3b5c1dddf1d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1455,7 +1455,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 			break;
 		}
 
-		if (unlikely(!writeback_in_progress(bdi)))
+		if (unlikely(!writeback_in_progress(wb)))
 			bdi_start_background_writeback(bdi);
 
 		if (!strictlimit)
@@ -1573,7 +1573,7 @@ pause:
 	if (!dirty_exceeded && wb->dirty_exceeded)
 		wb->dirty_exceeded = 0;
 
-	if (writeback_in_progress(bdi))
+	if (writeback_in_progress(wb))
 		return;
 
 	/*
-- 
cgit v1.2.3


From 9ecf4866c018aeb304a7b49216c4d183665becb7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:54 -0400
Subject: writeback: make bdi_start_background_writeback() take bdi_writeback
 instead of backing_dev_info

bdi_start_background_writeback() currently takes @bdi and kicks the
root wb (bdi_writeback).  In preparation for cgroup writeback support,
make it take wb instead.

This patch doesn't make any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           | 12 ++++++------
 include/linux/backing-dev.h |  2 +-
 mm/page-writeback.c         |  4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 45baf6c89b99..92aaf641ee22 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -228,23 +228,23 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 }
 
 /**
- * bdi_start_background_writeback - start background writeback
- * @bdi: the backing device to write from
+ * wb_start_background_writeback - start background writeback
+ * @wb: bdi_writback to write from
  *
  * Description:
  *   This makes sure WB_SYNC_NONE background writeback happens. When
- *   this function returns, it is only guaranteed that for given BDI
+ *   this function returns, it is only guaranteed that for given wb
  *   some IO is happening if we are over background dirty threshold.
  *   Caller need not hold sb s_umount semaphore.
  */
-void bdi_start_background_writeback(struct backing_dev_info *bdi)
+void wb_start_background_writeback(struct bdi_writeback *wb)
 {
 	/*
 	 * We just wake up the flusher thread. It will perform background
 	 * writeback as soon as there is no other work to do.
 	 */
-	trace_writeback_wake_background(bdi);
-	wb_wakeup(&bdi->wb);
+	trace_writeback_wake_background(wb->bdi);
+	wb_wakeup(wb);
 }
 
 /*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f04956c900ec..9cc11e5b97ca 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -27,7 +27,7 @@ void bdi_unregister(struct backing_dev_info *bdi);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 			bool range_cyclic, enum wb_reason reason);
-void bdi_start_background_writeback(struct backing_dev_info *bdi);
+void wb_start_background_writeback(struct bdi_writeback *wb);
 void wb_workfn(struct work_struct *work);
 void wb_wakeup_delayed(struct bdi_writeback *wb);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3b5c1dddf1d..70cf98dc3423 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1456,7 +1456,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		}
 
 		if (unlikely(!writeback_in_progress(wb)))
-			bdi_start_background_writeback(bdi);
+			wb_start_background_writeback(wb);
 
 		if (!strictlimit)
 			wb_dirty_limits(wb, dirty_thresh, background_thresh,
@@ -1588,7 +1588,7 @@ pause:
 		return;
 
 	if (nr_reclaimable > background_thresh)
-		bdi_start_background_writeback(bdi);
+		wb_start_background_writeback(wb);
 }
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
-- 
cgit v1.2.3


From cc395d7f1f7b9c740ab6d367ef1f6eb248595dff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:58 -0400
Subject: writeback: implement bdi_wait_for_completion()

If the completion of a wb_writeback_work can be waited upon by setting
its ->done to a struct completion and waiting on it; however, for
cgroup writeback support, it's necessary to issue multiple work items
to multiple bdi_writebacks and wait for the completion of all.

This patch implements wb_completion which can wait for multiple work
items and replaces the struct completion with it.  It can be defined
using DEFINE_WB_COMPLETION_ONSTACK(), used for multiple work items and
waited for by wb_wait_for_completion().

Nobody currently issues multiple work items and this patch doesn't
introduce any behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c                | 58 +++++++++++++++++++++++++++++++---------
 include/linux/backing-dev-defs.h |  2 ++
 mm/backing-dev.c                 |  1 +
 3 files changed, 49 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 22f1def4e63a..d7d4a1bcdd2f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -34,6 +34,10 @@
  */
 #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
 
+struct wb_completion {
+	atomic_t		cnt;
+};
+
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -51,9 +55,22 @@ struct wb_writeback_work {
 	enum wb_reason reason;		/* why was writeback initiated? */
 
 	struct list_head list;		/* pending work list */
-	struct completion *done;	/* set if the caller waits */
+	struct wb_completion *done;	/* set if the caller waits */
 };
 
+/*
+ * If one wants to wait for one or more wb_writeback_works, each work's
+ * ->done should be set to a wb_completion defined using the following
+ * macro.  Once all work items are issued with wb_queue_work(), the caller
+ * can wait for the completion of all using wb_wait_for_completion().  Work
+ * items which are waited upon aren't freed automatically on completion.
+ */
+#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)				\
+	struct wb_completion cmpl = {					\
+		.cnt		= ATOMIC_INIT(1),			\
+	}
+
+
 /*
  * If an inode is constantly having its pages dirtied, but then the
  * updates stop dirtytime_expire_interval seconds in the past, it's
@@ -161,17 +178,34 @@ static void wb_queue_work(struct bdi_writeback *wb,
 	trace_writeback_queue(wb->bdi, work);
 
 	spin_lock_bh(&wb->work_lock);
-	if (!test_bit(WB_registered, &wb->state)) {
-		if (work->done)
-			complete(work->done);
+	if (!test_bit(WB_registered, &wb->state))
 		goto out_unlock;
-	}
+	if (work->done)
+		atomic_inc(&work->done->cnt);
 	list_add_tail(&work->list, &wb->work_list);
 	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 out_unlock:
 	spin_unlock_bh(&wb->work_lock);
 }
 
+/**
+ * wb_wait_for_completion - wait for completion of bdi_writeback_works
+ * @bdi: bdi work items were issued to
+ * @done: target wb_completion
+ *
+ * Wait for one or more work items issued to @bdi with their ->done field
+ * set to @done, which should have been defined with
+ * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
+ * work items are completed.  Work items which are waited upon aren't freed
+ * automatically on completion.
+ */
+static void wb_wait_for_completion(struct backing_dev_info *bdi,
+				   struct wb_completion *done)
+{
+	atomic_dec(&done->cnt);		/* put down the initial count */
+	wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+}
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 /**
@@ -1143,7 +1177,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 
 	set_bit(WB_writeback_running, &wb->state);
 	while ((work = get_next_work_item(wb)) != NULL) {
-		struct completion *done = work->done;
+		struct wb_completion *done = work->done;
 
 		trace_writeback_exec(wb->bdi, work);
 
@@ -1151,8 +1185,8 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 
 		if (work->auto_free)
 			kfree(work);
-		if (done)
-			complete(done);
+		if (done && atomic_dec_and_test(&done->cnt))
+			wake_up_all(&wb->bdi->wb_waitq);
 	}
 
 	/*
@@ -1518,7 +1552,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 			    unsigned long nr,
 			    enum wb_reason reason)
 {
-	DECLARE_COMPLETION_ONSTACK(done);
+	DEFINE_WB_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb			= sb,
 		.sync_mode		= WB_SYNC_NONE,
@@ -1533,7 +1567,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 	wb_queue_work(&bdi->wb, &work);
-	wait_for_completion(&done);
+	wb_wait_for_completion(bdi, &done);
 }
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
@@ -1600,7 +1634,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	DECLARE_COMPLETION_ONSTACK(done);
+	DEFINE_WB_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
@@ -1618,7 +1652,7 @@ void sync_inodes_sb(struct super_block *sb)
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	wb_queue_work(&bdi->wb, &work);
-	wait_for_completion(&done);
+	wb_wait_for_completion(bdi, &done);
 
 	wait_sb_inodes(sb);
 }
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 8c857d723023..97a92fa0cdb5 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -155,6 +155,8 @@ struct backing_dev_info {
 	struct rb_root cgwb_congested_tree; /* their congested states */
 	atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
 #endif
+	wait_queue_head_t wb_waitq;
+
 	struct device *dev;
 
 	struct timer_list laptop_mode_wb_timer;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d2f16fc9069a..ad5608d01e8c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -768,6 +768,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
 	INIT_LIST_HEAD(&bdi->bdi_list);
+	init_waitqueue_head(&bdi->wb_waitq);
 
 	err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
 	if (err)
-- 
cgit v1.2.3


From f30a7d0cc8d9096d6728fadd0ab024e648010ec0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:14:00 -0400
Subject: writeback: restructure try_writeback_inodes_sb[_nr]()

try_writeback_inodes_sb_nr() wraps writeback_inodes_sb_nr() so that it
handles s_umount locking and skips if writeback is already in
progress.  The in progress test is performed on the root wb
(bdi_writeback) which isn't sufficient for cgroup writeback support.
The test must be done per-wb.

To prepare for the change, this patch factors out
__writeback_inodes_sb_nr() from writeback_inodes_sb_nr() and adds
@skip_if_busy and moves the in progress test right before queueing the
wb_writeback_work.  try_writeback_inodes_sb_nr() now just grabs
s_umount and invokes __writeback_inodes_sb_nr() with asserted
@skip_if_busy.  This way, later addition of multiple wb handling can
skip only the wb's which already have writeback in progress.

This swaps the order between in progress test and s_umount test which
can flip the return value when writeback is in progress and s_umount
is being held by someone else but this shouldn't cause any meaningful
difference.  It's a fringe condition and the return value is an
unsynchronized hint anyway.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c         | 52 ++++++++++++++++++++++++++---------------------
 include/linux/writeback.h |  6 +++---
 2 files changed, 32 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 093b9594e846..0039c5839cdd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1581,19 +1581,8 @@ static void wait_sb_inodes(struct super_block *sb)
 	iput(old_inode);
 }
 
-/**
- * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
- * @sb: the superblock
- * @nr: the number of pages to write
- * @reason: reason why some writeback work initiated
- *
- * Start writeback on some inodes on this super_block. No guarantees are made
- * on how many (if any) will be written, and this function does not wait
- * for IO completion of submitted IO.
- */
-void writeback_inodes_sb_nr(struct super_block *sb,
-			    unsigned long nr,
-			    enum wb_reason reason)
+static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+				     enum wb_reason reason, bool skip_if_busy)
 {
 	DEFINE_WB_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
@@ -1609,9 +1598,30 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 	if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	if (skip_if_busy && writeback_in_progress(&bdi->wb))
+		return;
+
 	wb_queue_work(&bdi->wb, &work);
 	wb_wait_for_completion(bdi, &done);
 }
+
+/**
+ * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
+ * @sb: the superblock
+ * @nr: the number of pages to write
+ * @reason: reason why some writeback work initiated
+ *
+ * Start writeback on some inodes on this super_block. No guarantees are made
+ * on how many (if any) will be written, and this function does not wait
+ * for IO completion of submitted IO.
+ */
+void writeback_inodes_sb_nr(struct super_block *sb,
+			    unsigned long nr,
+			    enum wb_reason reason)
+{
+	__writeback_inodes_sb_nr(sb, nr, reason, false);
+}
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
 /**
@@ -1638,19 +1648,15 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
  * Returns 1 if writeback was started, 0 if not.
  */
-int try_to_writeback_inodes_sb_nr(struct super_block *sb,
-				  unsigned long nr,
-				  enum wb_reason reason)
+bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+				   enum wb_reason reason)
 {
-	if (writeback_in_progress(&sb->s_bdi->wb))
-		return 1;
-
 	if (!down_read_trylock(&sb->s_umount))
-		return 0;
+		return false;
 
-	writeback_inodes_sb_nr(sb, nr, reason);
+	__writeback_inodes_sb_nr(sb, nr, reason, true);
 	up_read(&sb->s_umount);
-	return 1;
+	return true;
 }
 EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
 
@@ -1662,7 +1668,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
  * Implement by try_to_writeback_inodes_sb_nr()
  * Returns 1 if writeback was started, 0 if not.
  */
-int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 {
 	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
 }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a6b9db7fcee8..23af355d5471 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -93,9 +93,9 @@ struct bdi_writeback;
 void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 							enum wb_reason reason);
-int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
-int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
-				  enum wb_reason reason);
+bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
+				   enum wb_reason reason);
 void sync_inodes_sb(struct super_block *);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
-- 
cgit v1.2.3


From bafc0dba1e20d84578d7098d32caf63441e5743d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 2 Jun 2015 08:37:23 -0600
Subject: buffer, writeback: make __block_write_full_page() honor cgroup
 writeback

[__]block_write_full_page() is used to implement ->writepage in
various filesystems.  All writeback logic is now updated to handle
cgroup writeback and the block cgroup to issue IOs for is encoded in
writeback_control and can be retrieved from the inode; however,
[__]block_write_full_page() currently ignores the blkcg indicated by
inode and issues all bio's without explicit blkcg association.

This patch adds submit_bh_blkcg() which associates the bio with the
specified blkio cgroup before issuing and uses it in
__block_write_full_page() so that the issued bio's are associated with
inode_to_wb_blkcg_css(inode).

v2: Updated for per-inode wb association.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/buffer.c                 | 26 ++++++++++++++++++++------
 include/linux/backing-dev.h | 12 ++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 23b640d5d6e9..b85e94134ea6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -30,6 +30,7 @@
 #include <linux/quotaops.h>
 #include <linux/highmem.h>
 #include <linux/export.h>
+#include <linux/backing-dev.h>
 #include <linux/writeback.h>
 #include <linux/hash.h>
 #include <linux/suspend.h>
@@ -44,6 +45,9 @@
 #include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
+static int submit_bh_blkcg(int rw, struct buffer_head *bh,
+			   unsigned long bio_flags,
+			   struct cgroup_subsys_state *blkcg_css);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -1704,8 +1708,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
-	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC : WRITE);
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+	struct cgroup_subsys_state *blkcg_css = inode_to_wb_blkcg_css(inode);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1794,7 +1798,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh(write_op, bh);
+			submit_bh_blkcg(write_op, bh, 0, blkcg_css);
 			nr_underway++;
 		}
 		bh = next;
@@ -1848,7 +1852,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh(write_op, bh);
+			submit_bh_blkcg(write_op, bh, 0, blkcg_css);
 			nr_underway++;
 		}
 		bh = next;
@@ -3013,7 +3017,9 @@ void guard_bio_eod(int rw, struct bio *bio)
 	}
 }
 
-int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+static int submit_bh_blkcg(int rw, struct buffer_head *bh,
+			   unsigned long bio_flags,
+			   struct cgroup_subsys_state *blkcg_css)
 {
 	struct bio *bio;
 
@@ -3035,6 +3041,9 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
+	if (blkcg_css)
+		bio_associate_blkcg(bio, blkcg_css);
+
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
 	bio->bi_io_vec[0].bv_page = bh->b_page;
@@ -3059,11 +3068,16 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	submit_bio(rw, bio);
 	return 0;
 }
+
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+{
+	return submit_bh_blkcg(rw, bh, bio_flags, NULL);
+}
 EXPORT_SYMBOL_GPL(_submit_bh);
 
 int submit_bh(int rw, struct buffer_head *bh)
 {
-	return _submit_bh(rw, bh, 0);
+	return submit_bh_blkcg(rw, bh, 0, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 9cc11e5b97ca..e9d7373f5f93 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -393,6 +393,12 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 	return inode->i_wb;
 }
 
+static inline struct cgroup_subsys_state *
+inode_to_wb_blkcg_css(struct inode *inode)
+{
+	return inode_to_wb(inode)->blkcg_css;
+}
+
 struct wb_iter {
 	int			start_blkcg_id;
 	struct radix_tree_iter	tree_iter;
@@ -510,6 +516,12 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 {
 }
 
+static inline struct cgroup_subsys_state *
+inode_to_wb_blkcg_css(struct inode *inode)
+{
+	return blkcg_root_css;
+}
+
 struct wb_iter {
 	int		next_id;
 };
-- 
cgit v1.2.3


From 0d960a383ae7aa791b2833e122ba7519d264cf92 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 18:23:19 -0400
Subject: writeback: clean up wb_dirty_limit()

The function name wb_dirty_limit(), its argument @dirty and the local
variable @wb_dirty are mortally confusing given that the function
calculates per-wb threshold value not dirty pages, especially given
that @dirty and @wb_dirty are used elsewhere for dirty pages.

Let's rename the function to wb_calc_thresh() and wb_dirty to
wb_thresh.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c         |  2 +-
 include/linux/writeback.h |  2 +-
 mm/backing-dev.c          |  6 +++---
 mm/page-writeback.c       | 30 +++++++++++++++---------------
 4 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 881ea5d97c00..b1b3b8184500 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1081,7 +1081,7 @@ static bool over_bground_thresh(struct bdi_writeback *wb)
 	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
 		return true;
 
-	if (wb_stat(wb, WB_RECLAIMABLE) > wb_dirty_limit(wb, background_thresh))
+	if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(wb, background_thresh))
 		return true;
 
 	return false;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 23af355d5471..0435c85d4cfa 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -155,7 +155,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 				      void __user *, size_t *, loff_t *);
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
-unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty);
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
 void __wb_update_bandwidth(struct bdi_writeback *wb,
 			   unsigned long thresh,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ad5608d01e8c..9c8b7b5a0eee 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -49,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	struct bdi_writeback *wb = &bdi->wb;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
-	unsigned long bdi_thresh;
+	unsigned long wb_thresh;
 	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
 	struct inode *inode;
 
@@ -67,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
-	bdi_thresh = wb_dirty_limit(wb, dirty_thresh);
+	wb_thresh = wb_calc_thresh(wb, dirty_thresh);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	seq_printf(m,
@@ -87,7 +87,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "state:              %10lx\n",
 		   (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
 		   (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
-		   K(bdi_thresh),
+		   K(wb_thresh),
 		   K(dirty_thresh),
 		   K(background_thresh),
 		   (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 70cf98dc3423..c7745a7fe11e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -556,7 +556,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
 }
 
 /**
- * wb_dirty_limit - @wb's share of dirty throttling threshold
+ * wb_calc_thresh - @wb's share of dirty throttling threshold
  * @wb: bdi_writeback to query
  * @dirty: global dirty limit in pages
  *
@@ -577,28 +577,28 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
  * The wb's share of dirty limit will be adapting to its throughput and
  * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
  */
-unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty)
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
 {
-	u64 wb_dirty;
+	u64 wb_thresh;
 	long numerator, denominator;
 	unsigned long wb_min_ratio, wb_max_ratio;
 
 	/*
-	 * Calculate this BDI's share of the dirty ratio.
+	 * Calculate this BDI's share of the thresh ratio.
 	 */
 	wb_writeout_fraction(wb, &numerator, &denominator);
 
-	wb_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-	wb_dirty *= numerator;
-	do_div(wb_dirty, denominator);
+	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+	wb_thresh *= numerator;
+	do_div(wb_thresh, denominator);
 
 	wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);
 
-	wb_dirty += (dirty * wb_min_ratio) / 100;
-	if (wb_dirty > (dirty * wb_max_ratio) / 100)
-		wb_dirty = dirty * wb_max_ratio / 100;
+	wb_thresh += (thresh * wb_min_ratio) / 100;
+	if (wb_thresh > (thresh * wb_max_ratio) / 100)
+		wb_thresh = thresh * wb_max_ratio / 100;
 
-	return wb_dirty;
+	return wb_thresh;
 }
 
 /*
@@ -750,7 +750,7 @@ static unsigned long wb_position_ratio(struct bdi_writeback *wb,
 	 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
 	 * limits are set by default to 10% and 20% (background and throttle).
 	 * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
-	 * wb_dirty_limit(wb, bg_thresh) is about ~4K pages. wb_setpoint is
+	 * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
 	 * about ~6K pages (as the average of background and throttle wb
 	 * limits). The 3rd order polynomial will provide positive feedback if
 	 * wb_dirty is under wb_setpoint and vice versa.
@@ -1115,7 +1115,7 @@ static void wb_update_dirty_ratelimit(struct bdi_writeback *wb,
 	 *
 	 * We rampup dirty_ratelimit forcibly if wb_dirty is low because
 	 * it's possible that wb_thresh is close to zero due to inactivity
-	 * of backing device (see the implementation of wb_dirty_limit()).
+	 * of backing device (see the implementation of wb_calc_thresh()).
 	 */
 	if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
 		dirty = wb_dirty;
@@ -1123,7 +1123,7 @@ static void wb_update_dirty_ratelimit(struct bdi_writeback *wb,
 			setpoint = wb_dirty + 1;
 		else
 			setpoint = (wb_thresh +
-				    wb_dirty_limit(wb, bg_thresh)) / 2;
+				    wb_calc_thresh(wb, bg_thresh)) / 2;
 	}
 
 	if (dirty < setpoint) {
@@ -1352,7 +1352,7 @@ static inline void wb_dirty_limits(struct bdi_writeback *wb,
 	 *   wb_position_ratio() will let the dirtier task progress
 	 *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
 	 */
-	*wb_thresh = wb_dirty_limit(wb, dirty_thresh);
+	*wb_thresh = wb_calc_thresh(wb, dirty_thresh);
 
 	if (wb_bg_thresh)
 		*wb_bg_thresh = dirty_thresh ? div_u64((u64)*wb_thresh *
-- 
cgit v1.2.3


From 8a73179956e649df0d4b3250db17734f272d8266 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 18:23:20 -0400
Subject: writeback: reorganize [__]wb_update_bandwidth()

__wb_update_bandwidth() is called from two places -
fs/fs-writeback.c::balance_dirty_pages() and
mm/page-writeback.c::wb_writeback().  The latter updates only the
write bandwidth while the former also deals with the dirty ratelimit.
The two callsites are distinguished by whether @thresh parameter is
zero or not, which is cryptic.  In addition, the two files define
their own different versions of wb_update_bandwidth() on top of
__wb_update_bandwidth(), which is confusing to say the least.  This
patch cleans up [__]wb_update_bandwidth() in the following ways.

* __wb_update_bandwidth() now takes explicit @update_ratelimit
  parameter to gate dirty ratelimit handling.

* mm/page-writeback.c::wb_update_bandwidth() is flattened into its
  caller - balance_dirty_pages().

* fs/fs-writeback.c::wb_update_bandwidth() is moved to
  mm/page-writeback.c and __wb_update_bandwidth() is made static.

* While at it, add a lockdep assertion to __wb_update_bandwidth().

Except for the lockdep addition, this is pure reorganization and
doesn't introduce any behavioral changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c         | 10 ----------
 include/linux/writeback.h |  9 +--------
 mm/page-writeback.c       | 45 ++++++++++++++++++++++-----------------------
 3 files changed, 23 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b1b3b8184500..cd89484486f6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1087,16 +1087,6 @@ static bool over_bground_thresh(struct bdi_writeback *wb)
 	return false;
 }
 
-/*
- * Called under wb->list_lock. If there are multiple wb per bdi,
- * only the flusher working on the first wb should do it.
- */
-static void wb_update_bandwidth(struct bdi_writeback *wb,
-				unsigned long start_time)
-{
-	__wb_update_bandwidth(wb, 0, 0, 0, 0, 0, start_time);
-}
-
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 0435c85d4cfa..80adf3d88d9d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -157,14 +157,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
-void __wb_update_bandwidth(struct bdi_writeback *wb,
-			   unsigned long thresh,
-			   unsigned long bg_thresh,
-			   unsigned long dirty,
-			   unsigned long bdi_thresh,
-			   unsigned long bdi_dirty,
-			   unsigned long start_time);
-
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c7745a7fe11e..bebdd41b8d8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1160,19 +1160,22 @@ static void wb_update_dirty_ratelimit(struct bdi_writeback *wb,
 	trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
 }
 
-void __wb_update_bandwidth(struct bdi_writeback *wb,
-			   unsigned long thresh,
-			   unsigned long bg_thresh,
-			   unsigned long dirty,
-			   unsigned long wb_thresh,
-			   unsigned long wb_dirty,
-			   unsigned long start_time)
+static void __wb_update_bandwidth(struct bdi_writeback *wb,
+				  unsigned long thresh,
+				  unsigned long bg_thresh,
+				  unsigned long dirty,
+				  unsigned long wb_thresh,
+				  unsigned long wb_dirty,
+				  unsigned long start_time,
+				  bool update_ratelimit)
 {
 	unsigned long now = jiffies;
 	unsigned long elapsed = now - wb->bw_time_stamp;
 	unsigned long dirtied;
 	unsigned long written;
 
+	lockdep_assert_held(&wb->list_lock);
+
 	/*
 	 * rate-limit, only update once every 200ms.
 	 */
@@ -1189,7 +1192,7 @@ void __wb_update_bandwidth(struct bdi_writeback *wb,
 	if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
 		goto snapshot;
 
-	if (thresh) {
+	if (update_ratelimit) {
 		global_update_bandwidth(thresh, dirty, now);
 		wb_update_dirty_ratelimit(wb, thresh, bg_thresh, dirty,
 					  wb_thresh, wb_dirty,
@@ -1203,20 +1206,9 @@ snapshot:
 	wb->bw_time_stamp = now;
 }
 
-static void wb_update_bandwidth(struct bdi_writeback *wb,
-				unsigned long thresh,
-				unsigned long bg_thresh,
-				unsigned long dirty,
-				unsigned long wb_thresh,
-				unsigned long wb_dirty,
-				unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
 {
-	if (time_is_after_eq_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
-		return;
-	spin_lock(&wb->list_lock);
-	__wb_update_bandwidth(wb, thresh, bg_thresh, dirty,
-			      wb_thresh, wb_dirty, start_time);
-	spin_unlock(&wb->list_lock);
+	__wb_update_bandwidth(wb, 0, 0, 0, 0, 0, start_time, false);
 }
 
 /*
@@ -1467,8 +1459,15 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (dirty_exceeded && !wb->dirty_exceeded)
 			wb->dirty_exceeded = 1;
 
-		wb_update_bandwidth(wb, dirty_thresh, background_thresh,
-				    nr_dirty, wb_thresh, wb_dirty, start_time);
+		if (time_is_before_jiffies(wb->bw_time_stamp +
+					   BANDWIDTH_INTERVAL)) {
+			spin_lock(&wb->list_lock);
+			__wb_update_bandwidth(wb, dirty_thresh,
+					      background_thresh, nr_dirty,
+					      wb_thresh, wb_dirty, start_time,
+					      true);
+			spin_unlock(&wb->list_lock);
+		}
 
 		dirty_ratelimit = wb->dirty_ratelimit;
 		pos_ratio = wb_position_ratio(wb, dirty_thresh,
-- 
cgit v1.2.3


From 380c27ca33ebecc9da35aa90c8b3a9154f90aac2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 18:23:21 -0400
Subject: writeback: implement wb_domain

Dirtyable memory is distributed to a wb (bdi_writeback) according to
the relative bandwidth the wb is writing out in the whole system.
This distribution is global - each wb is measured against all other
wb's and gets the proportinately sized portion of the memory in the
whole system.

For cgroup writeback, the amount of dirtyable memory is scoped by
memcg and thus each wb would need to be measured and controlled in its
memcg.  IOW, a wb will belong to two writeback domains - the global
and memcg domains.

Currently, what constitutes the global writeback domain are scattered
across a number of global states.  This patch starts collecting them
into struct wb_domain.

* fprop_global which serves as the basis for proportional bandwidth
  measurement and its period timer are moved into struct wb_domain.

* global_wb_domain hosts the states for the global domain.

* While at it, flatten wb_writeout_fraction() into its callers.  This
  thin wrapper doesn't provide any actual benefits while getting in
  the way.

This is pure reorganization and doesn't introduce any behavioral
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/writeback.h | 32 +++++++++++++++++++++
 mm/page-writeback.c       | 72 ++++++++++++++++++-----------------------------
 2 files changed, 59 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 80adf3d88d9d..3148db1296a2 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/fs.h>
+#include <linux/flex_proportions.h>
 
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
 
@@ -86,6 +87,36 @@ struct writeback_control {
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
 };
 
+/*
+ * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
+ * and are measured against each other in.  There always is one global
+ * domain, global_wb_domain, that every wb in the system is a member of.
+ * This allows measuring the relative bandwidth of each wb to distribute
+ * dirtyable memory accordingly.
+ */
+struct wb_domain {
+	/*
+	 * Scale the writeback cache size proportional to the relative
+	 * writeout speed.
+	 *
+	 * We do this by keeping a floating proportion between BDIs, based
+	 * on page writeback completions [end_page_writeback()]. Those
+	 * devices that write out pages fastest will get the larger share,
+	 * while the slower will get a smaller share.
+	 *
+	 * We use page writeout completions because we are interested in
+	 * getting rid of dirty pages. Having them written out is the
+	 * primary goal.
+	 *
+	 * We introduce a concept of time, a period over which we measure
+	 * these events, because demand can/will vary over time. The length
+	 * of this period itself is measured in page writeback completions.
+	 */
+	struct fprop_global completions;
+	struct timer_list period_timer;	/* timer for aging of completions */
+	unsigned long period_time;
+};
+
 /*
  * fs/fs-writeback.c
  */	
@@ -120,6 +151,7 @@ static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
 bool zone_dirty_ok(struct zone *zone);
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
 
 extern unsigned long global_dirty_limit;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bebdd41b8d8e..08e1737edb39 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -124,29 +124,7 @@ EXPORT_SYMBOL(laptop_mode);
 
 unsigned long global_dirty_limit;
 
-/*
- * Scale the writeback cache size proportional to the relative writeout speeds.
- *
- * We do this by keeping a floating proportion between BDIs, based on page
- * writeback completions [end_page_writeback()]. Those devices that write out
- * pages fastest will get the larger share, while the slower will get a smaller
- * share.
- *
- * We use page writeout completions because we are interested in getting rid of
- * dirty pages. Having them written out is the primary goal.
- *
- * We introduce a concept of time, a period over which we measure these events,
- * because demand can/will vary over time. The length of this period itself is
- * measured in page writeback completions.
- *
- */
-static struct fprop_global writeout_completions;
-
-static void writeout_period(unsigned long t);
-/* Timer for aging of writeout_completions */
-static struct timer_list writeout_period_timer =
-		TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
-static unsigned long writeout_period_time = 0;
+static struct wb_domain global_wb_domain;
 
 /*
  * Length of period for aging writeout fractions of bdis. This is an
@@ -433,24 +411,26 @@ static unsigned long wp_next_time(unsigned long cur_time)
 }
 
 /*
- * Increment the BDI's writeout completion count and the global writeout
+ * Increment the wb's writeout completion count and the global writeout
  * completion count. Called from test_clear_page_writeback().
  */
 static inline void __wb_writeout_inc(struct bdi_writeback *wb)
 {
+	struct wb_domain *dom = &global_wb_domain;
+
 	__inc_wb_stat(wb, WB_WRITTEN);
-	__fprop_inc_percpu_max(&writeout_completions, &wb->completions,
+	__fprop_inc_percpu_max(&dom->completions, &wb->completions,
 			       wb->bdi->max_prop_frac);
 	/* First event after period switching was turned off? */
-	if (!unlikely(writeout_period_time)) {
+	if (!unlikely(dom->period_time)) {
 		/*
 		 * We can race with other __bdi_writeout_inc calls here but
 		 * it does not cause any harm since the resulting time when
 		 * timer will fire and what is in writeout_period_time will be
 		 * roughly the same.
 		 */
-		writeout_period_time = wp_next_time(jiffies);
-		mod_timer(&writeout_period_timer, writeout_period_time);
+		dom->period_time = wp_next_time(jiffies);
+		mod_timer(&dom->period_timer, dom->period_time);
 	}
 }
 
@@ -464,38 +444,38 @@ void wb_writeout_inc(struct bdi_writeback *wb)
 }
 EXPORT_SYMBOL_GPL(wb_writeout_inc);
 
-/*
- * Obtain an accurate fraction of the BDI's portion.
- */
-static void wb_writeout_fraction(struct bdi_writeback *wb,
-				 long *numerator, long *denominator)
-{
-	fprop_fraction_percpu(&writeout_completions, &wb->completions,
-				numerator, denominator);
-}
-
 /*
  * On idle system, we can be called long after we scheduled because we use
  * deferred timers so count with missed periods.
  */
 static void writeout_period(unsigned long t)
 {
-	int miss_periods = (jiffies - writeout_period_time) /
+	struct wb_domain *dom = (void *)t;
+	int miss_periods = (jiffies - dom->period_time) /
 						 VM_COMPLETIONS_PERIOD_LEN;
 
-	if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
-		writeout_period_time = wp_next_time(writeout_period_time +
+	if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+		dom->period_time = wp_next_time(dom->period_time +
 				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
-		mod_timer(&writeout_period_timer, writeout_period_time);
+		mod_timer(&dom->period_timer, dom->period_time);
 	} else {
 		/*
 		 * Aging has zeroed all fractions. Stop wasting CPU on period
 		 * updates.
 		 */
-		writeout_period_time = 0;
+		dom->period_time = 0;
 	}
 }
 
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+{
+	memset(dom, 0, sizeof(*dom));
+	init_timer_deferrable(&dom->period_timer);
+	dom->period_timer.function = writeout_period;
+	dom->period_timer.data = (unsigned long)dom;
+	return fprop_global_init(&dom->completions, gfp);
+}
+
 /*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
@@ -579,6 +559,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
  */
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
 {
+	struct wb_domain *dom = &global_wb_domain;
 	u64 wb_thresh;
 	long numerator, denominator;
 	unsigned long wb_min_ratio, wb_max_ratio;
@@ -586,7 +567,8 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
 	/*
 	 * Calculate this BDI's share of the thresh ratio.
 	 */
-	wb_writeout_fraction(wb, &numerator, &denominator);
+	fprop_fraction_percpu(&dom->completions, &wb->completions,
+			      &numerator, &denominator);
 
 	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
 	wb_thresh *= numerator;
@@ -1831,7 +1813,7 @@ void __init page_writeback_init(void)
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
-	fprop_global_init(&writeout_completions, GFP_KERNEL);
+	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
 }
 
 /**
-- 
cgit v1.2.3


From dcc25ae76eb7b8ff883eaaab57e30e8f2f085be3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 18:23:22 -0400
Subject: writeback: move global_dirty_limit into wb_domain

This patch is a part of the series to define wb_domain which
represents a domain that wb's (bdi_writeback's) belong to and are
measured against each other in.  This will enable IO backpressure
propagation for cgroup writeback.

global_dirty_limit exists to regulate the global dirty threshold which
is a property of the wb_domain.  This patch moves hard_dirty_limit,
dirty_lock, and update_time into wb_domain.

This is pure reorganization and doesn't introduce any behavioral
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c                |  2 +-
 include/linux/writeback.h        | 17 ++++++++++++++-
 include/trace/events/writeback.h |  7 +++---
 mm/page-writeback.c              | 46 ++++++++++++++++++++--------------------
 4 files changed, 44 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cd89484486f6..51c8a5b14cdf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -887,7 +887,7 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
 		pages = LONG_MAX;
 	else {
 		pages = min(wb->avg_write_bandwidth / 2,
-			    global_dirty_limit / DIRTY_SCOPE);
+			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
 		pages = min(pages, work->nr_pages);
 		pages = round_down(pages + MIN_WRITEBACK_PAGES,
 				   MIN_WRITEBACK_PAGES);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3148db1296a2..5fdd4e1805e6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -95,6 +95,8 @@ struct writeback_control {
  * dirtyable memory accordingly.
  */
 struct wb_domain {
+	spinlock_t lock;
+
 	/*
 	 * Scale the writeback cache size proportional to the relative
 	 * writeout speed.
@@ -115,6 +117,19 @@ struct wb_domain {
 	struct fprop_global completions;
 	struct timer_list period_timer;	/* timer for aging of completions */
 	unsigned long period_time;
+
+	/*
+	 * The dirtyable memory and dirty threshold could be suddenly
+	 * knocked down by a large amount (eg. on the startup of KVM in a
+	 * swapless system). This may throw the system into deep dirty
+	 * exceeded state and throttle heavy/light dirtiers alike. To
+	 * retain good responsiveness, maintain global_dirty_limit for
+	 * tracking slowly down to the knocked down dirty threshold.
+	 *
+	 * Both fields are protected by ->lock.
+	 */
+	unsigned long dirty_limit_tstamp;
+	unsigned long dirty_limit;
 };
 
 /*
@@ -153,7 +168,7 @@ void throttle_vm_writeout(gfp_t gfp_mask);
 bool zone_dirty_ok(struct zone *zone);
 int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
 
-extern unsigned long global_dirty_limit;
+extern struct wb_domain global_wb_domain;
 
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 9b876f6cc81a..bec69995968f 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -361,7 +361,7 @@ TRACE_EVENT(global_dirty_state,
 		__entry->nr_written	= global_page_state(NR_WRITTEN);
 		__entry->background_thresh = background_thresh;
 		__entry->dirty_thresh	= dirty_thresh;
-		__entry->dirty_limit = global_dirty_limit;
+		__entry->dirty_limit	= global_wb_domain.dirty_limit;
 	),
 
 	TP_printk("dirty=%lu writeback=%lu unstable=%lu "
@@ -463,8 +463,9 @@ TRACE_EVENT(balance_dirty_pages,
 		unsigned long freerun = (thresh + bg_thresh) / 2;
 		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
 
-		__entry->limit		= global_dirty_limit;
-		__entry->setpoint	= (global_dirty_limit + freerun) / 2;
+		__entry->limit		= global_wb_domain.dirty_limit;
+		__entry->setpoint	= (global_wb_domain.dirty_limit +
+						freerun) / 2;
 		__entry->dirty		= dirty;
 		__entry->bdi_setpoint	= __entry->setpoint *
 						bdi_thresh / (thresh + 1);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 08e1737edb39..27e60ba8e688 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -122,9 +122,7 @@ EXPORT_SYMBOL(laptop_mode);
 
 /* End of sysctl-exported parameters */
 
-unsigned long global_dirty_limit;
-
-static struct wb_domain global_wb_domain;
+struct wb_domain global_wb_domain;
 
 /*
  * Length of period for aging writeout fractions of bdis. This is an
@@ -470,9 +468,15 @@ static void writeout_period(unsigned long t)
 int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
 {
 	memset(dom, 0, sizeof(*dom));
+
+	spin_lock_init(&dom->lock);
+
 	init_timer_deferrable(&dom->period_timer);
 	dom->period_timer.function = writeout_period;
 	dom->period_timer.data = (unsigned long)dom;
+
+	dom->dirty_limit_tstamp = jiffies;
+
 	return fprop_global_init(&dom->completions, gfp);
 }
 
@@ -532,7 +536,9 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh,
 
 static unsigned long hard_dirty_limit(unsigned long thresh)
 {
-	return max(thresh, global_dirty_limit);
+	struct wb_domain *dom = &global_wb_domain;
+
+	return max(thresh, dom->dirty_limit);
 }
 
 /**
@@ -916,17 +922,10 @@ out:
 	wb->avg_write_bandwidth = avg;
 }
 
-/*
- * The global dirtyable memory and dirty threshold could be suddenly knocked
- * down by a large amount (eg. on the startup of KVM in a swapless system).
- * This may throw the system into deep dirty exceeded state and throttle
- * heavy/light dirtiers alike. To retain good responsiveness, maintain
- * global_dirty_limit for tracking slowly down to the knocked down dirty
- * threshold.
- */
 static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
 {
-	unsigned long limit = global_dirty_limit;
+	struct wb_domain *dom = &global_wb_domain;
+	unsigned long limit = dom->dirty_limit;
 
 	/*
 	 * Follow up in one step.
@@ -939,7 +938,7 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
 	/*
 	 * Follow down slowly. Use the higher one as the target, because thresh
 	 * may drop below dirty. This is exactly the reason to introduce
-	 * global_dirty_limit which is guaranteed to lie above the dirty pages.
+	 * dom->dirty_limit which is guaranteed to lie above the dirty pages.
 	 */
 	thresh = max(thresh, dirty);
 	if (limit > thresh) {
@@ -948,28 +947,27 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
 	}
 	return;
 update:
-	global_dirty_limit = limit;
+	dom->dirty_limit = limit;
 }
 
 static void global_update_bandwidth(unsigned long thresh,
 				    unsigned long dirty,
 				    unsigned long now)
 {
-	static DEFINE_SPINLOCK(dirty_lock);
-	static unsigned long update_time = INITIAL_JIFFIES;
+	struct wb_domain *dom = &global_wb_domain;
 
 	/*
 	 * check locklessly first to optimize away locking for the most time
 	 */
-	if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+	if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
 		return;
 
-	spin_lock(&dirty_lock);
-	if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+	spin_lock(&dom->lock);
+	if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
 		update_dirty_limit(thresh, dirty);
-		update_time = now;
+		dom->dirty_limit_tstamp = now;
 	}
-	spin_unlock(&dirty_lock);
+	spin_unlock(&dom->lock);
 }
 
 /*
@@ -1761,10 +1759,12 @@ void laptop_sync_completion(void)
 
 void writeback_set_ratelimit(void)
 {
+	struct wb_domain *dom = &global_wb_domain;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
+
 	global_dirty_limits(&background_thresh, &dirty_thresh);
-	global_dirty_limit = dirty_thresh;
+	dom->dirty_limit = dirty_thresh;
 	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
 	if (ratelimit_pages < 16)
 		ratelimit_pages = 16;
-- 
cgit v1.2.3


From aa661bbe1e61ce80ca4ae98804f673ede94b0827 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 18:23:31 -0400
Subject: writeback: move over_bground_thresh() to mm/page-writeback.c

and rename it to wb_over_bg_thresh().  The function is closely tied to
the dirty throttling mechanism implemented in page-writeback.c.  This
relocation will allow future updates necessary for cgroup writeback
support.

While at it, add function comment.

This is pure reorganization and doesn't introduce any behavioral
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c         | 20 ++------------------
 include/linux/writeback.h |  1 +
 mm/page-writeback.c       | 23 +++++++++++++++++++++++
 3 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 51c8a5b14cdf..da355879ba7c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1071,22 +1071,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 	return nr_pages - work.nr_pages;
 }
 
-static bool over_bground_thresh(struct bdi_writeback *wb)
-{
-	unsigned long background_thresh, dirty_thresh;
-
-	global_dirty_limits(&background_thresh, &dirty_thresh);
-
-	if (global_page_state(NR_FILE_DIRTY) +
-	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
-		return true;
-
-	if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(wb, background_thresh))
-		return true;
-
-	return false;
-}
-
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
@@ -1136,7 +1120,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (work->for_background && !over_bground_thresh(wb))
+		if (work->for_background && !wb_over_bg_thresh(wb))
 			break;
 
 		/*
@@ -1227,7 +1211,7 @@ static unsigned long get_nr_dirty_pages(void)
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-	if (over_bground_thresh(wb)) {
+	if (wb_over_bg_thresh(wb)) {
 
 		struct wb_writeback_work work = {
 			.nr_pages	= LONG_MAX,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 5fdd4e1805e6..b57c2786b5aa 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -207,6 +207,7 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
+bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
 				void *data);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c8ac8cea67dc..9d9a896fa7b5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1740,6 +1740,29 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
 
+/**
+ * wb_over_bg_thresh - does @wb need to be written back?
+ * @wb: bdi_writeback of interest
+ *
+ * Determines whether background writeback should keep writing @wb or it's
+ * clean enough.  Returns %true if writeback should continue.
+ */
+bool wb_over_bg_thresh(struct bdi_writeback *wb)
+{
+	unsigned long background_thresh, dirty_thresh;
+
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+
+	if (global_page_state(NR_FILE_DIRTY) +
+	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
+		return true;
+
+	if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(wb, background_thresh))
+		return true;
+
+	return false;
+}
+
 void throttle_vm_writeout(gfp_t gfp_mask)
 {
 	unsigned long background_thresh;
-- 
cgit v1.2.3


From 841710aa6e4acd066ab9fe8c8cb6f4e4e6709d83 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 18:23:33 -0400
Subject: writeback: implement memcg wb_domain

Dirtyable memory is distributed to a wb (bdi_writeback) according to
the relative bandwidth the wb is writing out in the whole system.
This distribution is global - each wb is measured against all other
wb's and gets the proportinately sized portion of the memory in the
whole system.

For cgroup writeback, the amount of dirtyable memory is scoped by
memcg and thus each wb would need to be measured and controlled in its
memcg.  IOW, a wb will belong to two writeback domains - the global
and memcg domains.

The previous patches laid the groundwork to support the two wb_domains
and this patch implements memcg wb_domain.  memcg->cgwb_domain is
initialized on css online and destroyed on css release,
wb->memcg_completions is added, and __wb_writeout_inc() is updated to
increment completions against both global and memcg wb_domains.

The following patches will update balance_dirty_pages() and its
subroutines to actually consider memcg wb_domain for throttling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h |  1 +
 include/linux/memcontrol.h       | 12 +++++++++++-
 include/linux/writeback.h        |  3 +++
 mm/backing-dev.c                 |  9 ++++++++-
 mm/memcontrol.c                  | 39 +++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              | 25 +++++++++++++++++++++++++
 6 files changed, 87 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 97a92fa0cdb5..8d470b73824f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -118,6 +118,7 @@ struct bdi_writeback {
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct percpu_ref refcnt;	/* used only for !root wb's */
+	struct fprop_local_percpu memcg_completions;
 	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
 	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
 	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 662a953ea8ad..e3177bed23ea 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -389,8 +389,18 @@ enum {
 };
 
 #ifdef CONFIG_CGROUP_WRITEBACK
+
 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
-#endif
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+	return NULL;
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
 
 struct sock;
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b57c2786b5aa..04a3786c456f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -167,6 +167,9 @@ static inline void laptop_sync_completion(void) { }
 void throttle_vm_writeout(gfp_t gfp_mask);
 bool zone_dirty_ok(struct zone *zone);
 int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom);
+#endif
 
 extern struct wb_domain global_wb_domain;
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 9c8b7b5a0eee..84ebf7c8d006 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -482,6 +482,7 @@ static void cgwb_release_workfn(struct work_struct *work)
 	css_put(wb->blkcg_css);
 	wb_congested_put(wb->congested);
 
+	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
 	wb_exit(wb);
 	kfree_rcu(wb, rcu);
@@ -548,9 +549,13 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	if (ret)
 		goto err_wb_exit;
 
+	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
+	if (ret)
+		goto err_ref_exit;
+
 	wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
 	if (!wb->congested)
-		goto err_ref_exit;
+		goto err_fprop_exit;
 
 	wb->memcg_css = memcg_css;
 	wb->blkcg_css = blkcg_css;
@@ -587,6 +592,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
 
 err_put_congested:
 	wb_congested_put(wb->congested);
+err_fprop_exit:
+	fprop_local_destroy_percpu(&wb->memcg_completions);
 err_ref_exit:
 	percpu_ref_exit(&wb->refcnt);
 err_wb_exit:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 701cbee9acba..ce113ddf2fb5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -345,6 +345,7 @@ struct mem_cgroup {
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head cgwb_list;
+	struct wb_domain cgwb_domain;
 #endif
 
 	/* List of events which userspace want to receive */
@@ -3994,6 +3995,37 @@ struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
 	return &memcg->cgwb_list;
 }
 
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+	return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+	wb_domain_exit(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+	if (!memcg->css.parent)
+		return NULL;
+
+	return &memcg->cgwb_domain;
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+	return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*
@@ -4380,9 +4412,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (!memcg->stat)
 		goto out_free;
+
+	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+		goto out_free_stat;
+
 	spin_lock_init(&memcg->pcp_counter_lock);
 	return memcg;
 
+out_free_stat:
+	free_percpu(memcg->stat);
 out_free:
 	kfree(memcg);
 	return NULL;
@@ -4409,6 +4447,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_zone_info(memcg, node);
 
 	free_percpu(memcg->stat);
+	memcg_wb_domain_exit(memcg);
 	kfree(memcg);
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a7ba5cee950b..a146e3389e78 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -171,6 +171,11 @@ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *m
 	return mdtc->gdtc;
 }
 
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+	return &wb->memcg_completions;
+}
+
 static void wb_min_max_ratio(struct bdi_writeback *wb,
 			     unsigned long *minp, unsigned long *maxp)
 {
@@ -213,6 +218,11 @@ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *m
 	return NULL;
 }
 
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+	return NULL;
+}
+
 static void wb_min_max_ratio(struct bdi_writeback *wb,
 			     unsigned long *minp, unsigned long *maxp)
 {
@@ -530,9 +540,16 @@ static void wb_domain_writeout_inc(struct wb_domain *dom,
  */
 static inline void __wb_writeout_inc(struct bdi_writeback *wb)
 {
+	struct wb_domain *cgdom;
+
 	__inc_wb_stat(wb, WB_WRITTEN);
 	wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
 			       wb->bdi->max_prop_frac);
+
+	cgdom = mem_cgroup_wb_domain(wb);
+	if (cgdom)
+		wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
+				       wb->bdi->max_prop_frac);
 }
 
 void wb_writeout_inc(struct bdi_writeback *wb)
@@ -583,6 +600,14 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
 	return fprop_global_init(&dom->completions, gfp);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom)
+{
+	del_timer_sync(&dom->period_timer);
+	fprop_global_destroy(&dom->completions);
+}
+#endif
+
 /*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
-- 
cgit v1.2.3


From 2529bb3aadc40a93e642f5f3650f63379a964467 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 18:23:34 -0400
Subject: writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain
 size changes

The amount of available memory to a memcg wb_domain can change as
memcg configuration changes.  A domain's ->dirty_limit exists to
smooth out sudden drops in dirty threshold; however, when a domain's
size actually drops significantly, it hinders the dirty throttling
from adjusting to the new configuration leading to unexpected
behaviors including unnecessary OOM kills.

This patch resolves the issue by adding wb_domain_size_changed() which
resets ->dirty_limit[_tstmp] and making memcg call it on configuration
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/writeback.h | 20 ++++++++++++++++++++
 mm/memcontrol.c           | 12 ++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 04a3786c456f..3b73e97ecfc7 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -132,6 +132,26 @@ struct wb_domain {
 	unsigned long dirty_limit;
 };
 
+/**
+ * wb_domain_size_changed - memory available to a wb_domain has changed
+ * @dom: wb_domain of interest
+ *
+ * This function should be called when the amount of memory available to
+ * @dom has changed.  It resets @dom's dirty limit parameters to prevent
+ * the past values which don't match the current configuration from skewing
+ * dirty throttling.  Without this, when memory size of a wb_domain is
+ * greatly reduced, the dirty throttling logic may allow too many pages to
+ * be dirtied leading to consecutive unnecessary OOMs and may get stuck in
+ * that situation.
+ */
+static inline void wb_domain_size_changed(struct wb_domain *dom)
+{
+	spin_lock(&dom->lock);
+	dom->dirty_limit_tstamp = jiffies;
+	dom->dirty_limit = 0;
+	spin_unlock(&dom->lock);
+}
+
 /*
  * fs/fs-writeback.c
  */	
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ce113ddf2fb5..c0b0406ae5ca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4005,6 +4005,11 @@ static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
 	wb_domain_exit(&memcg->cgwb_domain);
 }
 
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+	wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
@@ -4026,6 +4031,10 @@ static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
 {
 }
 
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*
@@ -4624,6 +4633,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	memcg->low = 0;
 	memcg->high = PAGE_COUNTER_MAX;
 	memcg->soft_limit = PAGE_COUNTER_MAX;
+	memcg_wb_domain_size_changed(memcg);
 }
 
 #ifdef CONFIG_MMU
@@ -5361,6 +5371,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
 	memcg->high = high;
 
+	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
 
@@ -5393,6 +5404,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 	if (err)
 		return err;
 
+	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
 
-- 
cgit v1.2.3


From c2aa723a6093633ae4ec15b08a4db276643cab3e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 18:23:35 -0400
Subject: writeback: implement memcg writeback domain based throttling

While cgroup writeback support now connects memcg and blkcg so that
writeback IOs are properly attributed and controlled, the IO back
pressure propagation mechanism implemented in balance_dirty_pages()
and its subroutines wasn't aware of cgroup writeback.

Processes belonging to a memcg may have access to only subset of total
memory available in the system and not factoring this into dirty
throttling rendered it completely ineffective for processes under
memcg limits and memcg ended up building a separate ad-hoc degenerate
mechanism directly into vmscan code to limit page dirtying.

The previous patches updated balance_dirty_pages() and its subroutines
so that they can deal with multiple wb_domain's (writeback domains)
and defined per-memcg wb_domain.  Processes belonging to a non-root
memcg are bound to two wb_domains, global wb_domain and memcg
wb_domain, and should be throttled according to IO pressures from both
domains.  This patch updates dirty throttling code so that it repeats
similar calculations for the two domains - the differences between the
two are few and minor - and applies the lower of the two sets of
resulting constraints.

wb_over_bg_thresh(), which controls when background writeback
terminates, is also updated to consider both global and memcg
wb_domains.  It returns true if dirty is over bg_thresh for either
domain.

This makes the dirty throttling mechanism operational for memcg
domains including writeback-bandwidth-proportional dirty page
distribution inside them but the ad-hoc memcg throttling mechanism in
vmscan is still in place.  The next patch will rip it out.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/memcontrol.h |   9 +++
 mm/memcontrol.c            |  43 ++++++++++++
 mm/page-writeback.c        | 158 ++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 188 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e3177bed23ea..c3eb19e2bc1c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -392,6 +392,8 @@ enum {
 
 struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+			 unsigned long *pdirty, unsigned long *pwriteback);
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
@@ -400,6 +402,13 @@ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 	return NULL;
 }
 
+static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
+				       unsigned long *pavail,
+				       unsigned long *pdirty,
+				       unsigned long *pwriteback)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 struct sock;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c0b0406ae5ca..f816d91c643b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4020,6 +4020,49 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
 	return &memcg->cgwb_domain;
 }
 
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used".  The available memory is
+ * calculated as the lowest headroom of itself and the ancestors plus the
+ * number of pages already being used for file pages.  Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+			 unsigned long *pdirty, unsigned long *pwriteback)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+	struct mem_cgroup *parent;
+	unsigned long head_room = PAGE_COUNTER_MAX;
+	unsigned long file_pages;
+
+	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+	/* this should eventually include NR_UNSTABLE_NFS */
+	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+	file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+						    (1 << LRU_ACTIVE_FILE));
+	while ((parent = parent_mem_cgroup(memcg))) {
+		unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+		unsigned long used = page_counter_read(&memcg->memory);
+
+		head_room = min(head_room, ceiling - min(ceiling, used));
+		memcg = parent;
+	}
+
+	*pavail = file_pages + head_room;
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a146e3389e78..e8903356f423 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -160,6 +160,14 @@ struct dirty_throttle_control {
 #define GDTC_INIT(__wb)		.dom = &global_wb_domain,		\
 				DTC_INIT_COMMON(__wb)
 #define GDTC_INIT_NO_WB		.dom = &global_wb_domain
+#define MDTC_INIT(__wb, __gdtc)	.dom = mem_cgroup_wb_domain(__wb),	\
+				.gdtc = __gdtc,				\
+				DTC_INIT_COMMON(__wb)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+	return dtc->dom;
+}
 
 static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
 {
@@ -207,6 +215,12 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
 
 #define GDTC_INIT(__wb)		DTC_INIT_COMMON(__wb)
 #define GDTC_INIT_NO_WB
+#define MDTC_INIT(__wb, __gdtc)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+	return false;
+}
 
 static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
 {
@@ -668,6 +682,15 @@ static unsigned long hard_dirty_limit(struct wb_domain *dom,
 	return max(thresh, dom->dirty_limit);
 }
 
+/* memory available to a memcg domain is capped by system-wide clean memory */
+static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+{
+	struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
+	unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+
+	mdtc->avail = min(mdtc->avail, clean);
+}
+
 /**
  * __wb_calc_thresh - @wb's share of dirty throttling threshold
  * @dtc: dirty_throttle_context of interest
@@ -1269,11 +1292,12 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
 	trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
 }
 
-static void __wb_update_bandwidth(struct dirty_throttle_control *dtc,
+static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
+				  struct dirty_throttle_control *mdtc,
 				  unsigned long start_time,
 				  bool update_ratelimit)
 {
-	struct bdi_writeback *wb = dtc->wb;
+	struct bdi_writeback *wb = gdtc->wb;
 	unsigned long now = jiffies;
 	unsigned long elapsed = now - wb->bw_time_stamp;
 	unsigned long dirtied;
@@ -1298,8 +1322,17 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *dtc,
 		goto snapshot;
 
 	if (update_ratelimit) {
-		domain_update_bandwidth(dtc, now);
-		wb_update_dirty_ratelimit(dtc, dirtied, elapsed);
+		domain_update_bandwidth(gdtc, now);
+		wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
+
+		/*
+		 * @mdtc is always NULL if !CGROUP_WRITEBACK but the
+		 * compiler has no way to figure that out.  Help it.
+		 */
+		if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
+			domain_update_bandwidth(mdtc, now);
+			wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
+		}
 	}
 	wb_update_write_bandwidth(wb, elapsed, written);
 
@@ -1313,7 +1346,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
 {
 	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
 
-	__wb_update_bandwidth(&gdtc, start_time, false);
+	__wb_update_bandwidth(&gdtc, NULL, start_time, false);
 }
 
 /*
@@ -1480,7 +1513,11 @@ static void balance_dirty_pages(struct address_space *mapping,
 				unsigned long pages_dirtied)
 {
 	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
 	struct dirty_throttle_control * const gdtc = &gdtc_stor;
+	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+						     &mdtc_stor : NULL;
+	struct dirty_throttle_control *sdtc;
 	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
 	long period;
 	long pause;
@@ -1497,6 +1534,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	for (;;) {
 		unsigned long now = jiffies;
 		unsigned long dirty, thresh, bg_thresh;
+		unsigned long m_dirty, m_thresh, m_bg_thresh;
 
 		/*
 		 * Unstable writes are a feature of certain networked
@@ -1523,6 +1561,32 @@ static void balance_dirty_pages(struct address_space *mapping,
 			bg_thresh = gdtc->bg_thresh;
 		}
 
+		if (mdtc) {
+			unsigned long writeback;
+
+			/*
+			 * If @wb belongs to !root memcg, repeat the same
+			 * basic calculations for the memcg domain.
+			 */
+			mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
+					    &writeback);
+			mdtc_cap_avail(mdtc);
+			mdtc->dirty += writeback;
+
+			domain_dirty_limits(mdtc);
+
+			if (unlikely(strictlimit)) {
+				wb_dirty_limits(mdtc);
+				m_dirty = mdtc->wb_dirty;
+				m_thresh = mdtc->wb_thresh;
+				m_bg_thresh = mdtc->wb_bg_thresh;
+			} else {
+				m_dirty = mdtc->dirty;
+				m_thresh = mdtc->thresh;
+				m_bg_thresh = mdtc->bg_thresh;
+			}
+		}
+
 		/*
 		 * Throttle it only when the background writeback cannot
 		 * catch-up. This avoids (excessively) small writeouts
@@ -1531,18 +1595,31 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * In strictlimit case make decision based on the wb counters
 		 * and limits. Small writeouts when the wb limits are ramping
 		 * up are the price we consciously pay for strictlimit-ing.
+		 *
+		 * If memcg domain is in effect, @dirty should be under
+		 * both global and memcg freerun ceilings.
 		 */
-		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+		    (!mdtc ||
+		     m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+			unsigned long intv = dirty_poll_interval(dirty, thresh);
+			unsigned long m_intv = ULONG_MAX;
+
 			current->dirty_paused_when = now;
 			current->nr_dirtied = 0;
-			current->nr_dirtied_pause =
-				dirty_poll_interval(dirty, thresh);
+			if (mdtc)
+				m_intv = dirty_poll_interval(m_dirty, m_thresh);
+			current->nr_dirtied_pause = min(intv, m_intv);
 			break;
 		}
 
 		if (unlikely(!writeback_in_progress(wb)))
 			wb_start_background_writeback(wb);
 
+		/*
+		 * Calculate global domain's pos_ratio and select the
+		 * global dtc by default.
+		 */
 		if (!strictlimit)
 			wb_dirty_limits(gdtc);
 
@@ -1550,6 +1627,25 @@ static void balance_dirty_pages(struct address_space *mapping,
 			((gdtc->dirty > gdtc->thresh) || strictlimit);
 
 		wb_position_ratio(gdtc);
+		sdtc = gdtc;
+
+		if (mdtc) {
+			/*
+			 * If memcg domain is in effect, calculate its
+			 * pos_ratio.  @wb should satisfy constraints from
+			 * both global and memcg domains.  Choose the one
+			 * w/ lower pos_ratio.
+			 */
+			if (!strictlimit)
+				wb_dirty_limits(mdtc);
+
+			dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+				((mdtc->dirty > mdtc->thresh) || strictlimit);
+
+			wb_position_ratio(mdtc);
+			if (mdtc->pos_ratio < gdtc->pos_ratio)
+				sdtc = mdtc;
+		}
 
 		if (dirty_exceeded && !wb->dirty_exceeded)
 			wb->dirty_exceeded = 1;
@@ -1557,14 +1653,15 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (time_is_before_jiffies(wb->bw_time_stamp +
 					   BANDWIDTH_INTERVAL)) {
 			spin_lock(&wb->list_lock);
-			__wb_update_bandwidth(gdtc, start_time, true);
+			__wb_update_bandwidth(gdtc, mdtc, start_time, true);
 			spin_unlock(&wb->list_lock);
 		}
 
+		/* throttle according to the chosen dtc */
 		dirty_ratelimit = wb->dirty_ratelimit;
-		task_ratelimit = ((u64)dirty_ratelimit * gdtc->pos_ratio) >>
+		task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
 							RATELIMIT_CALC_SHIFT;
-		max_pause = wb_max_pause(wb, gdtc->wb_dirty);
+		max_pause = wb_max_pause(wb, sdtc->wb_dirty);
 		min_pause = wb_min_pause(wb, max_pause,
 					 task_ratelimit, dirty_ratelimit,
 					 &nr_dirtied_pause);
@@ -1587,11 +1684,11 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 */
 		if (pause < min_pause) {
 			trace_balance_dirty_pages(bdi,
-						  gdtc->thresh,
-						  gdtc->bg_thresh,
-						  gdtc->dirty,
-						  gdtc->wb_thresh,
-						  gdtc->wb_dirty,
+						  sdtc->thresh,
+						  sdtc->bg_thresh,
+						  sdtc->dirty,
+						  sdtc->wb_thresh,
+						  sdtc->wb_dirty,
 						  dirty_ratelimit,
 						  task_ratelimit,
 						  pages_dirtied,
@@ -1616,11 +1713,11 @@ static void balance_dirty_pages(struct address_space *mapping,
 
 pause:
 		trace_balance_dirty_pages(bdi,
-					  gdtc->thresh,
-					  gdtc->bg_thresh,
-					  gdtc->dirty,
-					  gdtc->wb_thresh,
-					  gdtc->wb_dirty,
+					  sdtc->thresh,
+					  sdtc->bg_thresh,
+					  sdtc->dirty,
+					  sdtc->wb_thresh,
+					  sdtc->wb_dirty,
 					  dirty_ratelimit,
 					  task_ratelimit,
 					  pages_dirtied,
@@ -1651,7 +1748,7 @@ pause:
 		 * more page. However wb_dirty has accounting errors.  So use
 		 * the larger and more IO friendly wb_stat_error.
 		 */
-		if (gdtc->wb_dirty <= wb_stat_error(wb))
+		if (sdtc->wb_dirty <= wb_stat_error(wb))
 			break;
 
 		if (fatal_signal_pending(current))
@@ -1775,7 +1872,10 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
 bool wb_over_bg_thresh(struct bdi_writeback *wb)
 {
 	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
 	struct dirty_throttle_control * const gdtc = &gdtc_stor;
+	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+						     &mdtc_stor : NULL;
 
 	/*
 	 * Similar to balance_dirty_pages() but ignores pages being written
@@ -1792,6 +1892,20 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
 	if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
 		return true;
 
+	if (mdtc) {
+		unsigned long writeback;
+
+		mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
+		mdtc_cap_avail(mdtc);
+		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */
+
+		if (mdtc->dirty > mdtc->bg_thresh)
+			return true;
+
+		if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+			return true;
+	}
+
 	return false;
 }
 
-- 
cgit v1.2.3


From 21c6321fbb3a3787af07f1bc031d713a707fb69c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 28 May 2015 14:50:49 -0400
Subject: writeback: relocate wb[_try]_get(), wb_put(),
 inode_{attach|detach}_wb()

Currently, majority of cgroup writeback support including all the
above functions are implemented in include/linux/backing-dev.h and
mm/backing-dev.c; however, the portion closely related to writeback
logic implemented in include/linux/writeback.h and mm/page-writeback.c
will expand to support foreign writeback detection and correction.

This patch moves wb[_try]_get() and wb_put() to
include/linux/backing-dev-defs.h so that they can be used from
writeback.h and inode_{attach|detach}_wb() to writeback.h and
page-writeback.c.

This is pure reorganization and doesn't introduce any functional
changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c                | 31 +++++++++++++++
 include/linux/backing-dev-defs.h | 50 ++++++++++++++++++++++++
 include/linux/backing-dev.h      | 82 ----------------------------------------
 include/linux/writeback.h        | 46 ++++++++++++++++++++++
 mm/backing-dev.c                 | 30 ---------------
 5 files changed, 127 insertions(+), 112 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index da355879ba7c..cf6ccfb01e03 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/tracepoint.h>
 #include <linux/device.h>
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -213,6 +214,36 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = NULL;
+
+	if (inode_cgwb_enabled(inode)) {
+		struct cgroup_subsys_state *memcg_css;
+
+		if (page) {
+			memcg_css = mem_cgroup_css_from_page(page);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+		} else {
+			/* must pin memcg_css, see wb_get_create() */
+			memcg_css = task_get_css(current, memory_cgrp_id);
+			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+			css_put(memcg_css);
+		}
+	}
+
+	if (!wb)
+		wb = &bdi->wb;
+
+	/*
+	 * There may be multiple instances of this function racing to
+	 * update the same inode.  Use cmpxchg() to tell the winner.
+	 */
+	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+		wb_put(wb);
+}
+
 /**
  * inode_congested - test whether an inode is congested
  * @inode: inode to test for congestion
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 8d470b73824f..e047b496a0b9 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -186,4 +186,54 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 	set_wb_congested(bdi->wb.congested, sync);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		return percpu_ref_tryget(&wb->refcnt);
+	return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+	if (wb != &wb->bdi->wb)
+		percpu_ref_put(&wb->refcnt);
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+	return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 #endif	/* __LINUX_BACKING_DEV_DEFS_H */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e9d7373f5f93..5c978a924157 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -243,7 +243,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested);
 struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 				    struct cgroup_subsys_state *memcg_css,
 				    gfp_t gfp);
-void __inode_attach_wb(struct inode *inode, struct page *page);
 void wb_memcg_offline(struct mem_cgroup *memcg);
 void wb_blkcg_offline(struct blkcg *blkcg);
 int inode_congested(struct inode *inode, int cong_bits);
@@ -264,37 +263,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 		(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
 }
 
-/**
- * wb_tryget - try to increment a wb's refcount
- * @wb: bdi_writeback to get
- */
-static inline bool wb_tryget(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		return percpu_ref_tryget(&wb->refcnt);
-	return true;
-}
-
-/**
- * wb_get - increment a wb's refcount
- * @wb: bdi_writeback to get
- */
-static inline void wb_get(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		percpu_ref_get(&wb->refcnt);
-}
-
-/**
- * wb_put - decrement a wb's refcount
- * @wb: bdi_writeback to put
- */
-static inline void wb_put(struct bdi_writeback *wb)
-{
-	if (wb != &wb->bdi->wb)
-		percpu_ref_put(&wb->refcnt);
-}
-
 /**
  * wb_find_current - find wb for %current on a bdi
  * @bdi: bdi of interest
@@ -353,35 +321,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return wb;
 }
 
-/**
- * inode_attach_wb - associate an inode with its wb
- * @inode: inode of interest
- * @page: page being dirtied (may be NULL)
- *
- * If @inode doesn't have its wb, associate it with the wb matching the
- * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
- * @inode->i_lock.
- */
-static inline void inode_attach_wb(struct inode *inode, struct page *page)
-{
-	if (!inode->i_wb)
-		__inode_attach_wb(inode, page);
-}
-
-/**
- * inode_detach_wb - disassociate an inode from its wb
- * @inode: inode of interest
- *
- * @inode is being freed.  Detach from its wb.
- */
-static inline void inode_detach_wb(struct inode *inode)
-{
-	if (inode->i_wb) {
-		wb_put(inode->i_wb);
-		inode->i_wb = NULL;
-	}
-}
-
 /**
  * inode_to_wb - determine the wb of an inode
  * @inode: inode of interest
@@ -471,19 +410,6 @@ static inline void wb_congested_put(struct bdi_writeback_congested *congested)
 {
 }
 
-static inline bool wb_tryget(struct bdi_writeback *wb)
-{
-	return true;
-}
-
-static inline void wb_get(struct bdi_writeback *wb)
-{
-}
-
-static inline void wb_put(struct bdi_writeback *wb)
-{
-}
-
 static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
 {
 	return &bdi->wb;
@@ -495,14 +421,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return &bdi->wb;
 }
 
-static inline void inode_attach_wb(struct inode *inode, struct page *page)
-{
-}
-
-static inline void inode_detach_wb(struct inode *inode)
-{
-}
-
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
 	return &inode_to_bdi(inode)->wb;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3b73e97ecfc7..6726b7e56beb 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -8,6 +8,7 @@
 #include <linux/workqueue.h>
 #include <linux/fs.h>
 #include <linux/flex_proportions.h>
+#include <linux/backing-dev-defs.h>
 
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
 
@@ -173,6 +174,51 @@ static inline void wait_on_inode(struct inode *inode)
 	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+void __inode_attach_wb(struct inode *inode, struct page *page);
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+	if (!inode->i_wb)
+		__inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed.  Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+	if (inode->i_wb) {
+		wb_put(inode->i_wb);
+		inode->i_wb = NULL;
+	}
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * mm/page-writeback.c
  */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 84ebf7c8d006..887d72a85b5e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -660,36 +660,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 	return wb;
 }
 
-void __inode_attach_wb(struct inode *inode, struct page *page)
-{
-	struct backing_dev_info *bdi = inode_to_bdi(inode);
-	struct bdi_writeback *wb = NULL;
-
-	if (inode_cgwb_enabled(inode)) {
-		struct cgroup_subsys_state *memcg_css;
-
-		if (page) {
-			memcg_css = mem_cgroup_css_from_page(page);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-		} else {
-			/* must pin memcg_css, see wb_get_create() */
-			memcg_css = task_get_css(current, memory_cgrp_id);
-			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
-			css_put(memcg_css);
-		}
-	}
-
-	if (!wb)
-		wb = &bdi->wb;
-
-	/*
-	 * There may be multiple instances of this function racing to
-	 * update the same inode.  Use cmpxchg() to tell the winner.
-	 */
-	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
-		wb_put(wb);
-}
-
 static void cgwb_bdi_init(struct backing_dev_info *bdi)
 {
 	bdi->wb.memcg_css = mem_cgroup_root_css;
-- 
cgit v1.2.3


From b16b1deb553adcd7b3b7ce3e6d6fd1b923f314da Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 2 Jun 2015 08:39:48 -0600
Subject: writeback: make writeback_control track the inode being written back

Currently, for cgroup writeback, the IO submission paths directly
associate the bio's with the blkcg from inode_to_wb_blkcg_css();
however, it'd be necessary to keep more writeback context to implement
foreign inode writeback detection.  wbc (writeback_control) is the
natural fit for the extra context - it persists throughout the
writeback of each inode and is passed all the way down to IO
submission paths.

This patch adds wbc_attach_and_unlock_inode(), wbc_detach_inode(), and
wbc_attach_fdatawrite_inode() which are used to associate wbc with the
inode being written back.  IO submission paths now use wbc_init_bio()
instead of directly associating bio's with blkcg themselves.  This
leaves inode_to_wb_blkcg_css() w/o any user.  The function is removed.

wbc currently only tracks the associated wb (bdi_writeback).  Future
patches will add more for foreign inode detection.  The association is
established under i_lock which will be depended upon when migrating
foreign inodes to other wb's.

As currently, once established, inode to wb association never changes,
going through wbc when initializing bio's doesn't cause any behavior
changes.

v2: submit_blk_blkcg() now checks whether the wbc is associated with a
    wb before dereferencing it.  This can happen when pageout() is
    writing pages directly without going through the usual writeback
    path.  As pageout() path is single-threaded, we don't want it to
    be blocked behind a slow cgroup and ultimately want it to delegate
    actual writing to the usual writeback path.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/buffer.c                 | 25 ++++++++---------
 fs/fs-writeback.c           | 37 ++++++++++++++++++++++--
 fs/mpage.c                  |  2 +-
 include/linux/backing-dev.h | 12 --------
 include/linux/writeback.h   | 68 +++++++++++++++++++++++++++++++++++++++++++++
 mm/filemap.c                |  2 ++
 6 files changed, 118 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index b85e94134ea6..d883c799fb45 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -45,9 +45,9 @@
 #include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
-static int submit_bh_blkcg(int rw, struct buffer_head *bh,
-			   unsigned long bio_flags,
-			   struct cgroup_subsys_state *blkcg_css);
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+			 unsigned long bio_flags,
+			 struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -1709,7 +1709,6 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
-	struct cgroup_subsys_state *blkcg_css = inode_to_wb_blkcg_css(inode);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1798,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh_blkcg(write_op, bh, 0, blkcg_css);
+			submit_bh_wbc(write_op, bh, 0, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -1852,7 +1851,7 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh_blkcg(write_op, bh, 0, blkcg_css);
+			submit_bh_wbc(write_op, bh, 0, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -3017,11 +3016,11 @@ void guard_bio_eod(int rw, struct bio *bio)
 	}
 }
 
-static int submit_bh_blkcg(int rw, struct buffer_head *bh,
-			   unsigned long bio_flags,
-			   struct cgroup_subsys_state *blkcg_css)
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+			 unsigned long bio_flags, struct writeback_control *wbc)
 {
 	struct bio *bio;
+	int ret = 0;
 
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
@@ -3041,8 +3040,8 @@ static int submit_bh_blkcg(int rw, struct buffer_head *bh,
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
-	if (blkcg_css)
-		bio_associate_blkcg(bio, blkcg_css);
+	if (wbc)
+		wbc_init_bio(wbc, bio);
 
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
@@ -3071,13 +3070,13 @@ static int submit_bh_blkcg(int rw, struct buffer_head *bh,
 
 int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 {
-	return submit_bh_blkcg(rw, bh, bio_flags, NULL);
+	return submit_bh_wbc(rw, bh, bio_flags, NULL);
 }
 EXPORT_SYMBOL_GPL(_submit_bh);
 
 int submit_bh(int rw, struct buffer_head *bh)
 {
-	return submit_bh_blkcg(rw, bh, 0, NULL);
+	return submit_bh_wbc(rw, bh, 0, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index cf6ccfb01e03..755e8ef8d1f0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -244,6 +244,37 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
 		wb_put(wb);
 }
 
+/**
+ * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * @inode is locked and about to be written back under the control of @wbc.
+ * Record @inode's writeback context into @wbc and unlock the i_lock.  On
+ * writeback completion, wbc_detach_inode() should be called.  This is used
+ * to track the cgroup writeback context.
+ */
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+				 struct inode *inode)
+{
+	wbc->wb = inode_to_wb(inode);
+	wb_get(wbc->wb);
+	spin_unlock(&inode->i_lock);
+}
+
+/**
+ * wbc_detach_inode - disassociate wbc from its target inode
+ * @wbc: writeback_control of interest
+ *
+ * To be called after a writeback attempt of an inode finishes and undoes
+ * wbc_attach_and_unlock_inode().  Can be called under any context.
+ */
+void wbc_detach_inode(struct writeback_control *wbc)
+{
+	wb_put(wbc->wb);
+	wbc->wb = NULL;
+}
+
 /**
  * inode_congested - test whether an inode is congested
  * @inode: inode to test for congestion
@@ -877,10 +908,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
 	inode->i_state |= I_SYNC;
-	spin_unlock(&inode->i_lock);
+	wbc_attach_and_unlock_inode(wbc, inode);
 
 	ret = __writeback_single_inode(inode, wbc);
 
+	wbc_detach_inode(wbc);
 	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
 	/*
@@ -1013,7 +1045,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 			continue;
 		}
 		inode->i_state |= I_SYNC;
-		spin_unlock(&inode->i_lock);
+		wbc_attach_and_unlock_inode(&wbc, inode);
 
 		write_chunk = writeback_chunk_size(wb, work);
 		wbc.nr_to_write = write_chunk;
@@ -1025,6 +1057,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		 */
 		__writeback_single_inode(inode, &wbc);
 
+		wbc_detach_inode(&wbc);
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
 		wrote += write_chunk - wbc.nr_to_write;
 		spin_lock(&wb->list_lock);
diff --git a/fs/mpage.c b/fs/mpage.c
index a3ccb0bd465a..388fde6ac255 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -606,7 +606,7 @@ alloc_new:
 		if (bio == NULL)
 			goto confused;
 
-		bio_associate_blkcg(bio, inode_to_wb_blkcg_css(inode));
+		wbc_init_bio(wbc, bio);
 	}
 
 	/*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5c978a924157..b1d2489a6536 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -332,12 +332,6 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 	return inode->i_wb;
 }
 
-static inline struct cgroup_subsys_state *
-inode_to_wb_blkcg_css(struct inode *inode)
-{
-	return inode_to_wb(inode)->blkcg_css;
-}
-
 struct wb_iter {
 	int			start_blkcg_id;
 	struct radix_tree_iter	tree_iter;
@@ -434,12 +428,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
 {
 }
 
-static inline struct cgroup_subsys_state *
-inode_to_wb_blkcg_css(struct inode *inode)
-{
-	return blkcg_root_css;
-}
-
 struct wb_iter {
 	int		next_id;
 };
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 6726b7e56beb..8f964e558af5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -86,6 +86,9 @@ struct writeback_control {
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
+#ifdef CONFIG_CGROUP_WRITEBACK
+	struct bdi_writeback *wb;	/* wb this writeback is issued under */
+#endif
 };
 
 /*
@@ -176,7 +179,14 @@ static inline void wait_on_inode(struct inode *inode)
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+#include <linux/cgroup.h>
+#include <linux/bio.h>
+
 void __inode_attach_wb(struct inode *inode, struct page *page);
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+				 struct inode *inode)
+	__releases(&inode->i_lock);
+void wbc_detach_inode(struct writeback_control *wbc);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -207,6 +217,44 @@ static inline void inode_detach_wb(struct inode *inode)
 	}
 }
 
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	inode_attach_wb(inode, NULL);
+	wbc_attach_and_unlock_inode(wbc, inode);
+}
+
+/**
+ * wbc_init_bio - writeback specific initializtion of bio
+ * @wbc: writeback_control for the writeback in progress
+ * @bio: bio to be initialized
+ *
+ * @bio is a part of the writeback in progress controlled by @wbc.  Perform
+ * writeback specific initialization.  This is used to apply the cgroup
+ * writeback context.
+ */
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+	/*
+	 * pageout() path doesn't attach @wbc to the inode being written
+	 * out.  This is intentional as we don't want the function to block
+	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (wbc->wb)
+		bio_associate_blkcg(bio, wbc->wb->blkcg_css);
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline void inode_attach_wb(struct inode *inode, struct page *page)
@@ -217,6 +265,26 @@ static inline void inode_detach_wb(struct inode *inode)
 {
 }
 
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+	__releases(&inode->i_lock)
+{
+	spin_unlock(&inode->i_lock);
+}
+
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+					       struct inode *inode)
+{
+}
+
+static inline void wbc_detach_inode(struct writeback_control *wbc)
+{
+}
+
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b1443dc3ad0..2f065b19b635 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -290,7 +290,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	if (!mapping_cap_writeback_dirty(mapping))
 		return 0;
 
+	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
 	ret = do_writepages(mapping, &wbc);
+	wbc_detach_inode(&wbc);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 2a81490811d0296d390c571bb64eaa93e5ed7def Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 28 May 2015 14:50:51 -0400
Subject: writeback: implement foreign cgroup inode detection

As concurrent write sharing of an inode is expected to be very rare
and memcg only tracks page ownership on first-use basis severely
confining the usefulness of such sharing, cgroup writeback tracks
ownership per-inode.  While the support for concurrent write sharing
of an inode is deemed unnecessary, an inode being written to by
different cgroups at different points in time is a lot more common,
and, more importantly, charging only by first-use can too readily lead
to grossly incorrect behaviors (single foreign page can lead to
gigabytes of writeback to be incorrectly attributed).

To resolve this issue, cgroup writeback detects the majority dirtier
of an inode and will transfer the ownership to it.  To avoid
unnnecessary oscillation, the detection mechanism keeps track of
history and gives out the switch verdict only if the foreign usage
pattern is stable over a certain amount of time and/or writeback
attempts.

The detection mechanism has fairly low space and computation overhead.
It adds 8 bytes to struct inode (one int and two u16's) and minimal
amount of calculation per IO.  The detection mechanism converges to
the correct answer usually in several seconds of IO time when there's
a clear majority dirtier.  Even when there isn't, it can reach an
acceptable answer fairly quickly under most circumstances.

Please see wb_detach_inode() for more details.

This patch only implements detection.  Following patches will
implement actual switching.

v2: wbc_account_io() now checks whether the wbc is associated with a
    wb before dereferencing it.  This can happen when pageout() is
    writing pages directly without going through the usual writeback
    path.  As pageout() path is single-threaded, we don't want it to
    be blocked behind a slow cgroup and ultimately want it to delegate
    actual writing to the usual writeback path.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/buffer.c               |   4 +-
 fs/fs-writeback.c         | 177 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/mpage.c                |   1 +
 include/linux/fs.h        |   5 ++
 include/linux/writeback.h |  16 +++++
 5 files changed, 200 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index d883c799fb45..aca687f966d7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3040,8 +3040,10 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh,
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
-	if (wbc)
+	if (wbc) {
 		wbc_init_bio(wbc, bio);
+		wbc_account_io(wbc, bh->b_page, bh->b_size);
+	}
 
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 755e8ef8d1f0..f98d40333c85 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -214,6 +214,20 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+/* parameters for foreign inode detection, see wb_detach_inode() */
+#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, upto 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV	2	/* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */
+
+#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+					/* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
+					/* if foreign slots >= 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
+					/* one round can affect upto 5 slots */
+
 void __inode_attach_wb(struct inode *inode, struct page *page)
 {
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -258,23 +272,182 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 				 struct inode *inode)
 {
 	wbc->wb = inode_to_wb(inode);
+	wbc->inode = inode;
+
+	wbc->wb_id = wbc->wb->memcg_css->id;
+	wbc->wb_lcand_id = inode->i_wb_frn_winner;
+	wbc->wb_tcand_id = 0;
+	wbc->wb_bytes = 0;
+	wbc->wb_lcand_bytes = 0;
+	wbc->wb_tcand_bytes = 0;
+
 	wb_get(wbc->wb);
 	spin_unlock(&inode->i_lock);
 }
 
 /**
- * wbc_detach_inode - disassociate wbc from its target inode
- * @wbc: writeback_control of interest
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
  *
  * To be called after a writeback attempt of an inode finishes and undoes
  * wbc_attach_and_unlock_inode().  Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on first-use basis severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode.  While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first-use can too readily lead to grossly incorrect
+ * behaviors (single foreign page can lead to gigabytes of writeback to be
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it.  To avoid unnnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using Boyer-Moore majority vote algorithm.  In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate).  Keeping track of the historical winner helps the algorithm
+ * to semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
  */
 void wbc_detach_inode(struct writeback_control *wbc)
 {
+	struct bdi_writeback *wb = wbc->wb;
+	struct inode *inode = wbc->inode;
+	u16 history = inode->i_wb_frn_history;
+	unsigned long avg_time = inode->i_wb_frn_avg_time;
+	unsigned long max_bytes, max_time;
+	int max_id;
+
+	/* pick the winner of this round */
+	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_id;
+		max_bytes = wbc->wb_bytes;
+	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+		max_id = wbc->wb_lcand_id;
+		max_bytes = wbc->wb_lcand_bytes;
+	} else {
+		max_id = wbc->wb_tcand_id;
+		max_bytes = wbc->wb_tcand_bytes;
+	}
+
+	/*
+	 * Calculate the amount of IO time the winner consumed and fold it
+	 * into the running average kept per inode.  If the consumed IO
+	 * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for
+	 * deciding whether to switch or not.  This is to prevent one-off
+	 * small dirtiers from skewing the verdict.
+	 */
+	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+				wb->avg_write_bandwidth);
+	if (avg_time)
+		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+	else
+		avg_time = max_time;	/* immediate catch up on first run */
+
+	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+		int slots;
+
+		/*
+		 * The switch verdict is reached if foreign wb's consume
+		 * more than a certain proportion of IO time in a
+		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
+		 * history mask where each bit represents one sixteenth of
+		 * the period.  Determine the number of slots to shift into
+		 * history from @max_time.
+		 */
+		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+		history <<= slots;
+		if (wbc->wb_id != max_id)
+			history |= (1U << slots) - 1;
+
+		/*
+		 * Switch if the current wb isn't the consistent winner.
+		 * If there are multiple closely competing dirtiers, the
+		 * inode may switch across them repeatedly over time, which
+		 * is okay.  The main goal is avoiding keeping an inode on
+		 * the wrong wb for an extended period of time.
+		 */
+		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
+			/* switch */
+			max_id = 0;
+			avg_time = 0;
+			history = 0;
+		}
+	}
+
+	/*
+	 * Multiple instances of this function may race to update the
+	 * following fields but we don't mind occassional inaccuracies.
+	 */
+	inode->i_wb_frn_winner = max_id;
+	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+	inode->i_wb_frn_history = history;
+
 	wb_put(wbc->wb);
 	wbc->wb = NULL;
 }
 
+/**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to written out during the writeback
+ * controlled by @wbc.  Keep the book for foreign inode detection.  See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes)
+{
+	int id;
+
+	/*
+	 * pageout() path doesn't attach @wbc to the inode being written
+	 * out.  This is intentional as we don't want the function to block
+	 * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+	 * regular writeback instead of writing things out itself.
+	 */
+	if (!wbc->wb)
+		return;
+
+	rcu_read_lock();
+	id = mem_cgroup_css_from_page(page)->id;
+	rcu_read_unlock();
+
+	if (id == wbc->wb_id) {
+		wbc->wb_bytes += bytes;
+		return;
+	}
+
+	if (id == wbc->wb_lcand_id)
+		wbc->wb_lcand_bytes += bytes;
+
+	/* Boyer-Moore majority vote algorithm */
+	if (!wbc->wb_tcand_bytes)
+		wbc->wb_tcand_id = id;
+	if (id == wbc->wb_tcand_id)
+		wbc->wb_tcand_bytes += bytes;
+	else
+		wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
+}
+
 /**
  * inode_congested - test whether an inode is congested
  * @inode: inode to test for congestion
diff --git a/fs/mpage.c b/fs/mpage.c
index 388fde6ac255..ca0244b69de8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -614,6 +614,7 @@ alloc_new:
 	 * the confused fail path above (OOM) will be very confused when
 	 * it finds all bh marked clean (i.e. it will not write anything)
 	 */
+	wbc_account_io(wbc, page, PAGE_SIZE);
 	length = first_unmapped << blkbits;
 	if (bio_add_page(bio, page, length, 0) < length) {
 		bio = mpage_bio_submit(WRITE, bio);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 67a42ec95065..740126d7c44e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -638,6 +638,11 @@ struct inode {
 	struct list_head	i_wb_list;	/* backing dev IO list */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int			i_wb_frn_winner;
+	u16			i_wb_frn_avg_time;
+	u16			i_wb_frn_history;
 #endif
 	struct list_head	i_lru;		/* inode LRU list */
 	struct list_head	i_sb_list;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8f964e558af5..b333c945e571 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -88,6 +88,15 @@ struct writeback_control {
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
+	struct inode *inode;		/* inode being written out */
+
+	/* foreign inode detection, see wbc_detach_inode() */
+	int wb_id;			/* current wb id */
+	int wb_lcand_id;		/* last foreign candidate wb id */
+	int wb_tcand_id;		/* this foreign candidate wb id */
+	size_t wb_bytes;		/* bytes written by current wb */
+	size_t wb_lcand_bytes;		/* bytes written by last candidate */
+	size_t wb_tcand_bytes;		/* bytes written by this candidate */
 #endif
 };
 
@@ -187,6 +196,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 				 struct inode *inode)
 	__releases(&inode->i_lock);
 void wbc_detach_inode(struct writeback_control *wbc);
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+		    size_t bytes);
 
 /**
  * inode_attach_wb - associate an inode with its wb
@@ -285,6 +296,11 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 {
 }
 
+static inline void wbc_account_io(struct writeback_control *wbc,
+				  struct page *page, size_t bytes)
+{
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 /*
-- 
cgit v1.2.3


From 682aa8e1a6a1504a4caaa62e6c2c9daae3757210 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 28 May 2015 14:50:53 -0400
Subject: writeback: implement unlocked_inode_to_wb transaction and use it for
 stat updates

The mechanism for detecting whether an inode should switch its wb
(bdi_writeback) association is now in place.  This patch build the
framework for the actual switching.

This patch adds a new inode flag I_WB_SWITCHING, which has two
functions.  First, the easy one, it ensures that there's only one
switching in progress for a give inode.  Second, it's used as a
mechanism to synchronize wb stat updates.

The two stats, WB_RECLAIMABLE and WB_WRITEBACK, aren't event counters
but track the current number of dirty pages and pages under writeback
respectively.  As such, when an inode is moved from one wb to another,
the inode's portion of those stats have to be transferred together;
unfortunately, this is a bit tricky as those stat updates are percpu
operations which are performed without holding any lock in some
places.

This patch solves the problem in a similar way as memcg.  Each such
lockless stat updates are wrapped in transaction surrounded by
unlocked_inode_to_wb_begin/end().  During normal operation, they map
to rcu_read_lock/unlock(); however, if I_WB_SWITCHING is asserted,
mapping->tree_lock is grabbed across the transaction.

In turn, the switching path sets I_WB_SWITCHING and waits for a RCU
grace period to pass before actually starting to switch, which
guarantees that all stat update paths are synchronizing against
mapping->tree_lock.

This patch still doesn't implement the actual switching.

v3: Updated on top of the recent cancel_dirty_page() updates.
    unlocked_inode_to_wb_begin() now nests inside
    mem_cgroup_begin_page_stat() to match the locking order.

v2: The i_wb access transaction will be used for !stat accesses too.
    Function names and comments updated accordingly.

    s/inode_wb_stat_unlocked_{begin|end}/unlocked_inode_to_wb_{begin|end}/
    s/switch_wb/switch_wbs/

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           | 117 +++++++++++++++++++++++++++++++++++++++++---
 include/linux/backing-dev.h |  54 ++++++++++++++++++++
 include/linux/fs.h          |   6 +++
 include/linux/mm.h          |   3 +-
 mm/filemap.c                |   3 +-
 mm/page-writeback.c         |  27 +++++++---
 6 files changed, 196 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f67b956fb321..08f5496fcf1b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -308,6 +308,115 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
 	return locked_inode_to_wb_and_lock_list(inode);
 }
 
+struct inode_switch_wbs_context {
+	struct inode		*inode;
+	struct bdi_writeback	*new_wb;
+
+	struct rcu_head		rcu_head;
+	struct work_struct	work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+	struct inode_switch_wbs_context *isw =
+		container_of(work, struct inode_switch_wbs_context, work);
+	struct inode *inode = isw->inode;
+	struct bdi_writeback *new_wb = isw->new_wb;
+
+	/*
+	 * By the time control reaches here, RCU grace period has passed
+	 * since I_WB_SWITCH assertion and all wb stat update transactions
+	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+	 * synchronizing against mapping->tree_lock.
+	 */
+	spin_lock(&inode->i_lock);
+
+	inode->i_wb_frn_winner = 0;
+	inode->i_wb_frn_avg_time = 0;
+	inode->i_wb_frn_history = 0;
+
+	/*
+	 * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+	 * ensures that the new wb is visible if they see !I_WB_SWITCH.
+	 */
+	smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+	spin_unlock(&inode->i_lock);
+
+	iput(inode);
+	wb_put(new_wb);
+	kfree(isw);
+}
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+	struct inode_switch_wbs_context *isw = container_of(rcu_head,
+				struct inode_switch_wbs_context, rcu_head);
+
+	/* needs to grab bh-unsafe locks, bounce to work item */
+	INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+	schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id.  The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct cgroup_subsys_state *memcg_css;
+	struct inode_switch_wbs_context *isw;
+
+	/* noop if seems to be already in progress */
+	if (inode->i_state & I_WB_SWITCH)
+		return;
+
+	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+	if (!isw)
+		return;
+
+	/* find and pin the new wb */
+	rcu_read_lock();
+	memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+	if (memcg_css)
+		isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+	rcu_read_unlock();
+	if (!isw->new_wb)
+		goto out_free;
+
+	/* while holding I_WB_SWITCH, no one else can update the association */
+	spin_lock(&inode->i_lock);
+	if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+	    inode_to_wb(inode) == isw->new_wb) {
+		spin_unlock(&inode->i_lock);
+		goto out_free;
+	}
+	inode->i_state |= I_WB_SWITCH;
+	spin_unlock(&inode->i_lock);
+
+	ihold(inode);
+	isw->inode = inode;
+
+	/*
+	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
+	 * the RCU protected stat update paths to grab the mapping's
+	 * tree_lock so that stat transfer can synchronize against them.
+	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+	 */
+	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+	return;
+
+out_free:
+	if (isw->new_wb)
+		wb_put(isw->new_wb);
+	kfree(isw);
+}
+
 /**
  * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
  * @wbc: writeback_control of interest
@@ -433,12 +542,8 @@ void wbc_detach_inode(struct writeback_control *wbc)
 		 * is okay.  The main goal is avoiding keeping an inode on
 		 * the wrong wb for an extended period of time.
 		 */
-		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) {
-			/* switch */
-			max_id = 0;
-			avg_time = 0;
-			history = 0;
-		}
+		if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+			inode_switch_wbs(inode, max_id);
 	}
 
 	/*
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1d2489a6536..73ffa32e58ee 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -332,6 +332,50 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 	return inode->i_wb;
 }
 
+/**
+ * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
+ * @inode: target inode
+ * @lockedp: temp bool output param, to be passed to the end function
+ *
+ * The caller wants to access the wb associated with @inode but isn't
+ * holding inode->i_lock, mapping->tree_lock or wb->list_lock.  This
+ * function determines the wb associated with @inode and ensures that the
+ * association doesn't change until the transaction is finished with
+ * unlocked_inode_to_wb_end().
+ *
+ * The caller must call unlocked_inode_to_wb_end() with *@lockdep
+ * afterwards and can't sleep during transaction.  IRQ may or may not be
+ * disabled on return.
+ */
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+	rcu_read_lock();
+
+	/*
+	 * Paired with store_release in inode_switch_wb_work_fn() and
+	 * ensures that we see the new wb if we see cleared I_WB_SWITCH.
+	 */
+	*lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+
+	if (unlikely(*lockedp))
+		spin_lock_irq(&inode->i_mapping->tree_lock);
+	return inode_to_wb(inode);
+}
+
+/**
+ * unlocked_inode_to_wb_end - end inode wb access transaction
+ * @inode: target inode
+ * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ */
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+	if (unlikely(locked))
+		spin_unlock_irq(&inode->i_mapping->tree_lock);
+
+	rcu_read_unlock();
+}
+
 struct wb_iter {
 	int			start_blkcg_id;
 	struct radix_tree_iter	tree_iter;
@@ -420,6 +464,16 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 	return &inode_to_bdi(inode)->wb;
 }
 
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+	return inode_to_wb(inode);
+}
+
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+}
+
 static inline void wb_memcg_offline(struct mem_cgroup *memcg)
 {
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 740126d7c44e..b5e1dcfbc5e3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1815,6 +1815,11 @@ struct super_operations {
  *
  * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
  *
+ * I_WB_SWITCH		Cgroup bdi_writeback switching in progress.  Used to
+ *			synchronize competing switching instances and to tell
+ *			wb stat updates to grab mapping->tree_lock.  See
+ *			inode_switch_wb_work_fn() for details.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC		(1 << 0)
@@ -1834,6 +1839,7 @@ struct super_operations {
 #define I_DIRTY_TIME		(1 << 11)
 #define __I_DIRTY_TIME_EXPIRED	12
 #define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
+#define I_WB_SWITCH		(1 << 13)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f48d979ced4b..4024543b4203 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@ struct anon_vma_chain;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct bdi_writeback;
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1214,7 +1215,7 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
 void account_page_dirtied(struct page *page, struct address_space *mapping,
 			  struct mem_cgroup *memcg);
 void account_page_cleaned(struct page *page, struct address_space *mapping,
-			  struct mem_cgroup *memcg);
+			  struct mem_cgroup *memcg, struct bdi_writeback *wb);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
 void cancel_dirty_page(struct page *page);
diff --git a/mm/filemap.c b/mm/filemap.c
index 2f065b19b635..bfc1ab053b12 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -213,7 +213,8 @@ void __delete_from_page_cache(struct page *page, void *shadow,
 	 * anyway will be cleared before returning page into buddy allocator.
 	 */
 	if (WARN_ON_ONCE(PageDirty(page)))
-		account_page_cleaned(page, mapping, memcg);
+		account_page_cleaned(page, mapping, memcg,
+				     inode_to_wb(mapping->host));
 }
 
 /**
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e8903356f423..e1514d5b4e9b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2422,12 +2422,12 @@ EXPORT_SYMBOL(account_page_dirtied);
  * Caller must hold mem_cgroup_begin_page_stat().
  */
 void account_page_cleaned(struct page *page, struct address_space *mapping,
-			  struct mem_cgroup *memcg)
+			  struct mem_cgroup *memcg, struct bdi_writeback *wb)
 {
 	if (mapping_cap_account_dirty(mapping)) {
 		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
-		dec_wb_stat(inode_to_wb(mapping->host), WB_RECLAIMABLE);
+		dec_wb_stat(wb, WB_RECLAIMABLE);
 		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
 	}
 }
@@ -2490,11 +2490,15 @@ void account_page_redirty(struct page *page)
 	struct address_space *mapping = page->mapping;
 
 	if (mapping && mapping_cap_account_dirty(mapping)) {
-		struct bdi_writeback *wb = inode_to_wb(mapping->host);
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
+		bool locked;
 
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
 		current->nr_dirtied--;
 		dec_zone_page_state(page, NR_DIRTIED);
 		dec_wb_stat(wb, WB_DIRTIED);
+		unlocked_inode_to_wb_end(inode, locked);
 	}
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2597,13 +2601,18 @@ void cancel_dirty_page(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 
 	if (mapping_cap_account_dirty(mapping)) {
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
 		struct mem_cgroup *memcg;
+		bool locked;
 
 		memcg = mem_cgroup_begin_page_stat(page);
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
 
 		if (TestClearPageDirty(page))
-			account_page_cleaned(page, mapping, memcg);
+			account_page_cleaned(page, mapping, memcg, wb);
 
+		unlocked_inode_to_wb_end(inode, locked);
 		mem_cgroup_end_page_stat(memcg);
 	} else {
 		ClearPageDirty(page);
@@ -2628,12 +2637,16 @@ EXPORT_SYMBOL(cancel_dirty_page);
 int clear_page_dirty_for_io(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
-	struct mem_cgroup *memcg;
 	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
 
 	if (mapping && mapping_cap_account_dirty(mapping)) {
+		struct inode *inode = mapping->host;
+		struct bdi_writeback *wb;
+		struct mem_cgroup *memcg;
+		bool locked;
+
 		/*
 		 * Yes, Virginia, this is indeed insane.
 		 *
@@ -2670,12 +2683,14 @@ int clear_page_dirty_for_io(struct page *page)
 		 * exclusion.
 		 */
 		memcg = mem_cgroup_begin_page_stat(page);
+		wb = unlocked_inode_to_wb_begin(inode, &locked);
 		if (TestClearPageDirty(page)) {
 			mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
 			dec_zone_page_state(page, NR_FILE_DIRTY);
-			dec_wb_stat(inode_to_wb(mapping->host), WB_RECLAIMABLE);
+			dec_wb_stat(wb, WB_RECLAIMABLE);
 			ret = 1;
 		}
+		unlocked_inode_to_wb_end(inode, locked);
 		mem_cgroup_end_page_stat(memcg);
 		return ret;
 	}
-- 
cgit v1.2.3


From aaa2cacf8184e2a92accb8e443b1608d65f9a13f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 28 May 2015 14:50:55 -0400
Subject: writeback: add lockdep annotation to inode_to_wb()

With the previous three patches, all operations which acquire wb from
inode are either under one of inode->i_lock, mapping->tree_lock or
wb->list_lock or protected by unlocked_inode_to_wb transaction.  This
will be depended upon by foreign inode wb switching.

This patch adds lockdep assertion to inode_to_wb() so that usages
outside the above list locks can be caught easily.  There are three
exceptions.

* locked_inode_to_wb_and_lock_list() is holding wb->list_lock but the
  wb may not be the inode's.  Ensuring that is the function's role
  after all.  Updated to deref inode->i_wb directly.

* inode_wb_stat_unlocked_begin() is usually protected by combination
  of !I_WB_SWITCH and rcu_read_lock().  Updated to deref inode->i_wb
  directly.

* inode_congested() wants to test whether inode->i_wb is set before
  starting the transaction.  Added inode_to_wb_is_valid() which tests
  inode->i_wb directly.

v5: might_lock() removed.  It annotates that the lock is grabbed w/
    irq enabled which isn't the case and triggering lockdep warning
    spuriously.

v4: might_lock() added to unlocked_inode_to_wb_begin().

v3: inode_congested() conversion added.

v2: locked_inode_to_wb_and_lock_list() was missing in the first
    version.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c           |  5 +++--
 include/linux/backing-dev.h | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25458fa4753d..6b99deeb9de0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -285,7 +285,8 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
 		spin_lock(&wb->list_lock);
 		wb_put(wb);		/* not gonna deref it anymore */
 
-		if (likely(wb == inode_to_wb(inode)))
+		/* i_wb may have changed inbetween, can't use inode_to_wb() */
+		if (likely(wb == inode->i_wb))
 			return wb;	/* @inode already has ref */
 
 		spin_unlock(&wb->list_lock);
@@ -622,7 +623,7 @@ int inode_congested(struct inode *inode, int cong_bits)
 	 * Once set, ->i_wb never becomes NULL while the inode is alive.
 	 * Start transaction iff ->i_wb is visible.
 	 */
-	if (inode && inode_to_wb(inode)) {
+	if (inode && inode_to_wb_is_valid(inode)) {
 		struct bdi_writeback *wb;
 		bool locked, congested;
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 73ffa32e58ee..dfce80869145 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -321,14 +321,34 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return wb;
 }
 
+/**
+ * inode_to_wb_is_valid - test whether an inode has a wb associated
+ * @inode: inode of interest
+ *
+ * Returns %true if @inode has a wb associated.  May be called without any
+ * locking.
+ */
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+	return inode->i_wb;
+}
+
 /**
  * inode_to_wb - determine the wb of an inode
  * @inode: inode of interest
  *
- * Returns the wb @inode is currently associated with.
+ * Returns the wb @inode is currently associated with.  The caller must be
+ * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+ * associated wb's list_lock.
  */
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks &&
+		     (!lockdep_is_held(&inode->i_lock) &&
+		      !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+		      !lockdep_is_held(&inode->i_wb->list_lock)));
+#endif
 	return inode->i_wb;
 }
 
@@ -360,7 +380,12 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
 
 	if (unlikely(*lockedp))
 		spin_lock_irq(&inode->i_mapping->tree_lock);
-	return inode_to_wb(inode);
+
+	/*
+	 * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
+	 * inode_to_wb() will bark.  Deref directly.
+	 */
+	return inode->i_wb;
 }
 
 /**
@@ -459,6 +484,11 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
 	return &bdi->wb;
 }
 
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+	return true;
+}
+
 static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
 {
 	return &inode_to_bdi(inode)->wb;
-- 
cgit v1.2.3


From e8a7abf5a5bd302a1e06a3c21a629eaa4cba57d6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 28 May 2015 14:50:57 -0400
Subject: writeback: disassociate inodes from dying bdi_writebacks

For the purpose of foreign inode detection, wb's (bdi_writeback's) are
identified by the associated memcg ID.  As we create a separate wb for
each memcg, this is enough to identify the active wb's; however, when
blkcg is enabled or disabled higher up in the hierarchy, the mapping
between memcg and blkcg changes which in turn creates a new wb to
service the new mapping.  The old wb is unlinked from index and
released after all references are drained.  The foreign inode
detection logic can't detect this condition because both the old and
new wb's point to the same memcg and thus never decides to move inodes
attached to the old wb to the new one.

This patch adds logic to initiate switching immediately in
wbc_attach_and_unlock_inode() if the associated wb is dying.  We can
make the usual foreign detection logic to distinguish the different
wb's mapped to the memcg but the dying wb is never gonna be in active
service again and there's no point in tracking the usage history and
reaching the switch verdict after enough data points are collected.
It's already known that the wb has to be switched.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c                |  7 +++++++
 include/linux/backing-dev-defs.h | 16 ++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5eeb24a8082f..f60de54d2042 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -525,6 +525,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 
 	wb_get(wbc->wb);
 	spin_unlock(&inode->i_lock);
+
+	/*
+	 * A dying wb indicates that the memcg-blkcg mapping has changed
+	 * and a new wb is already serving the memcg.  Switch immediately.
+	 */
+	if (unlikely(wb_dying(wbc->wb)))
+		inode_switch_wbs(inode, wbc->wb_id);
 }
 
 /**
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e047b496a0b9..a48d90e3bcbb 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -219,6 +219,17 @@ static inline void wb_put(struct bdi_writeback *wb)
 		percpu_ref_put(&wb->refcnt);
 }
 
+/**
+ * wb_dying - is a wb dying?
+ * @wb: bdi_writeback of interest
+ *
+ * Returns whether @wb is unlinked and being drained.
+ */
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+	return percpu_ref_is_dying(&wb->refcnt);
+}
+
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool wb_tryget(struct bdi_writeback *wb)
@@ -234,6 +245,11 @@ static inline void wb_put(struct bdi_writeback *wb)
 {
 }
 
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+	return false;
+}
+
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 #endif	/* __LINUX_BACKING_DEV_DEFS_H */
-- 
cgit v1.2.3


From be3ef76e9d9b97962c70bd6351787d29071ae481 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 2 Jun 2015 14:30:11 +0200
Subject: clockevents: Rename state to state_use_accessors

The only sensible way to make abuse of core internal fields obvious
and easy to grep for.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/clockchips.h  | 14 +++++++-------
 kernel/time/tick-internal.h |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 64214ad85af9..597a1e836f22 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -87,7 +87,7 @@ enum clock_event_state {
  * @mult:		nanosecond to cycles multiplier
  * @shift:		nanoseconds to cycles divisor (power of two)
  * @mode:		operating mode, relevant only to ->set_mode(), OBSOLETE
- * @state:		current state of the device, assigned by the core code
+ * @state_use_accessors:current state of the device, assigned by the core code
  * @features:		features
  * @retries:		number of forced programming retries
  * @set_mode:		legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
@@ -117,7 +117,7 @@ struct clock_event_device {
 	u32			mult;
 	u32			shift;
 	enum clock_event_mode	mode;
-	enum clock_event_state	state;
+	enum clock_event_state	state_use_accessors;
 	unsigned int		features;
 	unsigned long		retries;
 
@@ -152,27 +152,27 @@ struct clock_event_device {
 /* Helpers to verify state of a clockevent device */
 static inline bool clockevent_state_detached(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_DETACHED;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED;
 }
 
 static inline bool clockevent_state_shutdown(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_SHUTDOWN;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN;
 }
 
 static inline bool clockevent_state_periodic(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_PERIODIC;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC;
 }
 
 static inline bool clockevent_state_oneshot(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_ONESHOT;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT;
 }
 
 static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED;
 }
 
 /*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4461de9bb4b8..ec2208aabdd1 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -38,13 +38,13 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 
 static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev)
 {
-	return dev->state;
+	return dev->state_use_accessors;
 }
 
 static inline void clockevent_set_state(struct clock_event_device *dev,
 					enum clock_event_state state)
 {
-	dev->state = state;
+	dev->state_use_accessors = state;
 }
 
 extern void clockevents_shutdown(struct clock_event_device *dev);
-- 
cgit v1.2.3


From 24bbd929e6b9e62afd263c42b4318d3b603c956c Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Mon, 1 Jun 2015 13:40:31 +0200
Subject: of/fdt: split off FDT self reservation from memreserve processing

This splits off the reservation of the memory occupied by the FDT
binary itself from the processing of the memory reservations it
contains. This is necessary because the physical address of the FDT,
which is needed to perform the reservation, may not be known to the
FDT driver core, i.e., it may be mapped outside the linear direct
mapping, in which case __pa() returns a bogus value.

Cc: Russell King <linux@arm.linux.org.uk>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/mm/init.c         |  1 +
 arch/arm64/mm/init.c       |  1 +
 arch/powerpc/kernel/prom.c |  1 +
 drivers/of/fdt.c           | 19 ++++++++++++++-----
 include/linux/of_fdt.h     |  2 ++
 5 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index be92fa0f2f35..8a63b4cdc0f2 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -268,6 +268,7 @@ void __init arm_memblock_init(const struct machine_desc *mdesc)
 	if (mdesc->reserve)
 		mdesc->reserve();
 
+	early_init_fdt_reserve_self();
 	early_init_fdt_scan_reserved_mem();
 
 	/* reserve memory for DMA contiguous allocations */
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 597831bdddf3..89a05f467ffb 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -170,6 +170,7 @@ void __init arm64_memblock_init(void)
 		memblock_reserve(__virt_to_phys(initrd_start), initrd_end - initrd_start);
 #endif
 
+	early_init_fdt_reserve_self();
 	early_init_fdt_scan_reserved_mem();
 
 	/* 4GB maximum for 32-bit only capable devices */
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 308c5e15676b..51ea36f79307 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -573,6 +573,7 @@ static void __init early_reserve_mem_dt(void)
 	int len;
 	const __be32 *prop;
 
+	early_init_fdt_reserve_self();
 	early_init_fdt_scan_reserved_mem();
 
 	dt_root = of_get_flat_dt_root();
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index cde35c5d0191..f2dd23a32267 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -580,11 +580,6 @@ void __init early_init_fdt_scan_reserved_mem(void)
 	if (!initial_boot_params)
 		return;
 
-	/* Reserve the dtb region */
-	early_init_dt_reserve_memory_arch(__pa(initial_boot_params),
-					  fdt_totalsize(initial_boot_params),
-					  0);
-
 	/* Process header /memreserve/ fields */
 	for (n = 0; ; n++) {
 		fdt_get_mem_rsv(initial_boot_params, n, &base, &size);
@@ -597,6 +592,20 @@ void __init early_init_fdt_scan_reserved_mem(void)
 	fdt_init_reserved_mem();
 }
 
+/**
+ * early_init_fdt_reserve_self() - reserve the memory used by the FDT blob
+ */
+void __init early_init_fdt_reserve_self(void)
+{
+	if (!initial_boot_params)
+		return;
+
+	/* Reserve the dtb region */
+	early_init_dt_reserve_memory_arch(__pa(initial_boot_params),
+					  fdt_totalsize(initial_boot_params),
+					  0);
+}
+
 /**
  * of_scan_flat_dt - scan flattened tree blob and call callback on each.
  * @it: callback function
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 587ee507965d..fd627a58068f 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -64,6 +64,7 @@ extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
 extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
 				     int depth, void *data);
 extern void early_init_fdt_scan_reserved_mem(void);
+extern void early_init_fdt_reserve_self(void);
 extern void early_init_dt_add_memory_arch(u64 base, u64 size);
 extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size,
 					     bool no_map);
@@ -91,6 +92,7 @@ extern u64 fdt_translate_address(const void *blob, int node_offset);
 extern void of_fdt_limit_memory(int limit);
 #else /* CONFIG_OF_FLATTREE */
 static inline void early_init_fdt_scan_reserved_mem(void) {}
+static inline void early_init_fdt_reserve_self(void) {}
 static inline const char *of_flat_dt_get_machine_name(void) { return NULL; }
 static inline void unflatten_device_tree(void) {}
 static inline void unflatten_and_copy_device_tree(void) {}
-- 
cgit v1.2.3


From 632dda833e28fe43049a01b4bc53e409176e7843 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 May 2015 14:04:50 -0400
Subject: SUNRPC: Clean up bc_send()

Clean up: Merge bc_send() into bc_svc_process().

Note: even thought this touches svc.c, it is a client-side change.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/bc_xprt.h |  1 -
 net/sunrpc/Makefile            |  2 +-
 net/sunrpc/bc_svc.c            | 63 ------------------------------------------
 net/sunrpc/svc.c               | 33 ++++++++++++++++------
 4 files changed, 26 insertions(+), 73 deletions(-)
 delete mode 100644 net/sunrpc/bc_svc.c

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index 2ca67b55e0fe..8df43c9f11dc 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -37,7 +37,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
-int bc_send(struct rpc_rqst *req);
 
 /*
  * Determine if a shared backchannel is in use
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 15e6f6c23c5d..1b8e68d0e690 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -15,6 +15,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
 	    svc_xprt.o
 sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
-sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
+sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
deleted file mode 100644
index 15c7a8a1c24f..000000000000
--- a/net/sunrpc/bc_svc.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/******************************************************************************
-
-(c) 2007 Network Appliance, Inc.  All Rights Reserved.
-(c) 2009 NetApp.  All Rights Reserved.
-
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-******************************************************************************/
-
-/*
- * The NFSv4.1 callback service helper routines.
- * They implement the transport level processing required to send the
- * reply over an existing open connection previously established by the client.
- */
-
-#include <linux/module.h>
-
-#include <linux/sunrpc/xprt.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/sunrpc/bc_xprt.h>
-
-#define RPCDBG_FACILITY	RPCDBG_SVCDSP
-
-/* Empty callback ops */
-static const struct rpc_call_ops nfs41_callback_ops = {
-};
-
-
-/*
- * Send the callback reply
- */
-int bc_send(struct rpc_rqst *req)
-{
-	struct rpc_task *task;
-	int ret;
-
-	dprintk("RPC:       bc_send req= %p\n", req);
-	task = rpc_run_bc_task(req, &nfs41_callback_ops);
-	if (IS_ERR(task))
-		ret = PTR_ERR(task);
-	else {
-		WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
-		ret = task->tk_status;
-		rpc_put_task(task);
-	}
-	dprintk("RPC:       bc_send ret= %d\n", ret);
-	return ret;
-}
-
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 78974e4d9ad2..e144902d382e 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1350,6 +1350,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 {
 	struct kvec	*argv = &rqstp->rq_arg.head[0];
 	struct kvec	*resv = &rqstp->rq_res.head[0];
+	static const struct rpc_call_ops reply_ops = { };
+	struct rpc_task *task;
+	int error;
+
+	dprintk("svc: %s(%p)\n", __func__, req);
 
 	/* Build the svc_rqst used by the common processing routine */
 	rqstp->rq_xprt = serv->sv_bc_xprt;
@@ -1372,21 +1377,33 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 
 	/*
 	 * Skip the next two words because they've already been
-	 * processed in the trasport
+	 * processed in the transport
 	 */
 	svc_getu32(argv);	/* XID */
 	svc_getnl(argv);	/* CALLDIR */
 
-	/* Returns 1 for send, 0 for drop */
-	if (svc_process_common(rqstp, argv, resv)) {
-		memcpy(&req->rq_snd_buf, &rqstp->rq_res,
-						sizeof(req->rq_snd_buf));
-		return bc_send(req);
-	} else {
-		/* drop request */
+	/* Parse and execute the bc call */
+	if (!svc_process_common(rqstp, argv, resv)) {
+		/* Processing error: drop the request */
 		xprt_free_bc_request(req);
 		return 0;
 	}
+
+	/* Finally, send the reply synchronously */
+	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
+	task = rpc_run_bc_task(req, &reply_ops);
+	if (IS_ERR(task)) {
+		error = PTR_ERR(task);
+		goto out;
+	}
+
+	WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
+	error = task->tk_status;
+	rpc_put_task(task);
+
+out:
+	dprintk("svc: %s(), error=%d\n", __func__, error);
+	return error;
 }
 EXPORT_SYMBOL_GPL(bc_svc_process);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
-- 
cgit v1.2.3


From 1e25aa9641e8f3fa39cd5e46b4afcafd7f12a44b Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 1 Jun 2015 16:36:27 -0700
Subject: hid-sensor: Fix suspend/resume delay

By default all the sensors are runtime suspended state (lowest power
state). During Linux suspend process, all the run time suspended
devices are resumed and then suspended. This caused all sensors to
power up and introduced delay in suspend time, when we introduced
runtime PM for HID sensors. The opposite process happens during resume
process.

To fix this, we do powerup process of the sensors only when the request
is issued from user (raw or tiggerred). In this way when runtime,
resume calls for powerup it will simply return as this will not match
user requested state.

Note this is a regression fix as the increase in suspend / resume
times can be substantial (report of 8 seconds on Len's laptop!)

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Tested-by: Len Brown <len.brown@intel.com>
Cc: <Stable@vger.kernel.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 drivers/iio/common/hid-sensors/hid-sensor-trigger.c | 11 ++++++++++-
 include/linux/hid-sensor-hub.h                      |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
index 610fc98f88ef..595511022795 100644
--- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
+++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c
@@ -36,6 +36,8 @@ static int _hid_sensor_power_state(struct hid_sensor_common *st, bool state)
 	s32 poll_value = 0;
 
 	if (state) {
+		if (!atomic_read(&st->user_requested_state))
+			return 0;
 		if (sensor_hub_device_open(st->hsdev))
 			return -EIO;
 
@@ -52,8 +54,12 @@ static int _hid_sensor_power_state(struct hid_sensor_common *st, bool state)
 
 		poll_value = hid_sensor_read_poll_value(st);
 	} else {
-		if (!atomic_dec_and_test(&st->data_ready))
+		int val;
+
+		val = atomic_dec_if_positive(&st->data_ready);
+		if (val < 0)
 			return 0;
+
 		sensor_hub_device_close(st->hsdev);
 		state_val = hid_sensor_get_usage_index(st->hsdev,
 			st->power_state.report_id,
@@ -92,9 +98,11 @@ EXPORT_SYMBOL(hid_sensor_power_state);
 
 int hid_sensor_power_state(struct hid_sensor_common *st, bool state)
 {
+
 #ifdef CONFIG_PM
 	int ret;
 
+	atomic_set(&st->user_requested_state, state);
 	if (state)
 		ret = pm_runtime_get_sync(&st->pdev->dev);
 	else {
@@ -109,6 +117,7 @@ int hid_sensor_power_state(struct hid_sensor_common *st, bool state)
 
  	return 0;
 #else
+	atomic_set(&st->user_requested_state, state);
 	return _hid_sensor_power_state(st, state);
 #endif
 }
diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h
index 0408421d885f..cd224dfd94d8 100644
--- a/include/linux/hid-sensor-hub.h
+++ b/include/linux/hid-sensor-hub.h
@@ -230,6 +230,7 @@ struct hid_sensor_common {
 	struct platform_device *pdev;
 	unsigned usage_id;
 	atomic_t data_ready;
+	atomic_t user_requested_state;
 	struct iio_trigger *trigger;
 	struct hid_sensor_hub_attribute_info poll;
 	struct hid_sensor_hub_attribute_info report_state;
-- 
cgit v1.2.3


From cfaed10d1f27d036b72bbdc6b1e59ea28c38ec7f Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 1 Jun 2015 11:15:25 -0500
Subject: scatterlist: introduce sg_nents_for_len

When performing a dma_map_sg() call, the number of sg entries to map is
required. Using sg_nents to retrieve the number of sg entries will
return the total number of entries in the sg list up to the entry marked
as the end. If there happen to be unused entries in the list, these will
still be counted. Some dma_map_sg() implementations will not handle the
unused entries correctly (lib/swiotlb.c) and execute a BUG_ON.

The sg_nents_for_len() function will traverse the sg list and return the
number of entries required to satisfy the supplied length argument. This
can then be supplied to the dma_map_sg() call to successfully map the
sg.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/scatterlist.h |  1 +
 lib/scatterlist.c           | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index ed8f9e70df9b..a0edb992c9c3 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -221,6 +221,7 @@ static inline void *sg_virt(struct scatterlist *sg)
 }
 
 int sg_nents(struct scatterlist *sg);
+int sg_nents_for_len(struct scatterlist *sg, u64 len);
 struct scatterlist *sg_next(struct scatterlist *);
 struct scatterlist *sg_last(struct scatterlist *s, unsigned int);
 void sg_init_table(struct scatterlist *, unsigned int);
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index c9f2e8c6ccc9..99fbc2f238c4 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -56,6 +56,38 @@ int sg_nents(struct scatterlist *sg)
 }
 EXPORT_SYMBOL(sg_nents);
 
+/**
+ * sg_nents_for_len - return total count of entries in scatterlist
+ *                    needed to satisfy the supplied length
+ * @sg:		The scatterlist
+ * @len:	The total required length
+ *
+ * Description:
+ * Determines the number of entries in sg that are required to meet
+ * the supplied length, taking into acount chaining as well
+ *
+ * Returns:
+ *   the number of sg entries needed, negative error on failure
+ *
+ **/
+int sg_nents_for_len(struct scatterlist *sg, u64 len)
+{
+	int nents;
+	u64 total;
+
+	if (!len)
+		return 0;
+
+	for (nents = 0, total = 0; sg; sg = sg_next(sg)) {
+		nents++;
+		total += sg->length;
+		if (total >= len)
+			return nents;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(sg_nents_for_len);
 
 /**
  * sg_last - return the last scatterlist entry in a list
-- 
cgit v1.2.3


From d6472302f242559d45dcf4ebace62508dc4d8aeb Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 2 Jun 2015 19:01:38 +1000
Subject: x86/mm: Decouple <linux/vmalloc.h> from <asm/io.h>

Nothing in <asm/io.h> uses anything from <linux/vmalloc.h>, so
remove it from there and fix up the resulting build problems
triggered on x86 {64|32}-bit {def|allmod|allno}configs.

The breakages were triggering in places where x86 builds relied
on vmalloc() facilities but did not include <linux/vmalloc.h>
explicitly and relied on the implicit inclusion via <asm/io.h>.

Also add:

  - <linux/init.h> to <linux/io.h>
  - <asm/pgtable_types> to <asm/io.h>

... which were two other implicit header file dependencies.

Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
[ Tidied up the changelog. ]
Acked-by: David Miller <davem@davemloft.net>
Acked-by: Takashi Iwai <tiwai@suse.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Vinod Koul <vinod.koul@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Anton Vorontsov <anton@enomsg.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Colin Cross <ccross@android.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: James E.J. Bottomley <JBottomley@odin.com>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Kristen Carlson Accardi <kristen@linux.intel.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Suma Ramars <sramars@cisco.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/io.h          | 3 +--
 arch/x86/kernel/crash.c            | 1 +
 arch/x86/kernel/machine_kexec_64.c | 1 +
 arch/x86/mm/pageattr-test.c        | 1 +
 arch/x86/mm/pageattr.c             | 1 +
 arch/x86/xen/p2m.c                 | 1 +
 drivers/acpi/apei/erst.c           | 1 +
 drivers/cpufreq/intel_pstate.c     | 1 +
 drivers/dma/mic_x100_dma.c         | 1 +
 drivers/net/hyperv/netvsc.c        | 1 +
 drivers/net/hyperv/rndis_filter.c  | 1 +
 drivers/scsi/fnic/fnic_debugfs.c   | 1 +
 drivers/scsi/fnic/fnic_trace.c     | 1 +
 include/linux/io.h                 | 1 +
 sound/pci/asihpi/hpioctl.c         | 1 +
 15 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index a2b97404922d..a94463063b46 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,6 +40,7 @@
 #include <linux/compiler.h>
 #include <asm/page.h>
 #include <asm/early_ioremap.h>
+#include <asm/pgtable_types.h>
 
 #define build_mmio_read(name, size, type, reg, barrier) \
 static inline type name(const volatile void __iomem *addr) \
@@ -198,8 +199,6 @@ extern void set_iounmap_nonlazy(void);
 
 #include <asm-generic/iomap.h>
 
-#include <linux/vmalloc.h>
-
 /*
  * Convert a virtual cached pointer to an uncached pointer
  */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c76d3e37c6e1..e068d6683dba 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -22,6 +22,7 @@
 #include <linux/elfcore.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 #include <asm/processor.h>
 #include <asm/hardirq.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 415480d3ea84..11546b462fa6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
 #include <linux/ftrace.h>
 #include <linux/io.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 
 #include <asm/init.h>
 #include <asm/pgtable.h>
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 6629f397b467..8ff686aa7e8c 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -9,6 +9,7 @@
 #include <linux/random.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 70d221fe2eb4..fae3c5366ac0 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -14,6 +14,7 @@
 #include <linux/percpu.h>
 #include <linux/gfp.h>
 #include <linux/pci.h>
+#include <linux/vmalloc.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index b47124d4cd67..8b7f18e200aa 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -67,6 +67,7 @@
 #include <linux/seq_file.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 #include <asm/cache.h>
 #include <asm/setup.h>
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index ed65e9c4b5b0..3670bbab57a3 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -35,6 +35,7 @@
 #include <linux/nmi.h>
 #include <linux/hardirq.h>
 #include <linux/pstore.h>
+#include <linux/vmalloc.h>
 #include <acpi/apei.h>
 
 #include "apei-internal.h"
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 6414661ac1c4..2ba53f4f6af2 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -26,6 +26,7 @@
 #include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/acpi.h>
+#include <linux/vmalloc.h>
 #include <trace/events/power.h>
 
 #include <asm/div64.h>
diff --git a/drivers/dma/mic_x100_dma.c b/drivers/dma/mic_x100_dma.c
index 6de2e677be04..74d9db05a5ad 100644
--- a/drivers/dma/mic_x100_dma.c
+++ b/drivers/dma/mic_x100_dma.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/io.h>
 #include <linux/seq_file.h>
+#include <linux/vmalloc.h>
 
 #include "mic_x100_dma.h"
 
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index ea091bc5ff09..1e09243d5449 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/netdevice.h>
 #include <linux/if_ether.h>
+#include <linux/vmalloc.h>
 #include <asm/sync_bitops.h>
 
 #include "hyperv_net.h"
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 9118cea91882..35a482d526d9 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -27,6 +27,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_vlan.h>
 #include <linux/nls.h>
+#include <linux/vmalloc.h>
 
 #include "hyperv_net.h"
 
diff --git a/drivers/scsi/fnic/fnic_debugfs.c b/drivers/scsi/fnic/fnic_debugfs.c
index 5980c10c734d..d6498fabe628 100644
--- a/drivers/scsi/fnic/fnic_debugfs.c
+++ b/drivers/scsi/fnic/fnic_debugfs.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/debugfs.h>
+#include <linux/vmalloc.h>
 #include "fnic.h"
 
 static struct dentry *fnic_trace_debugfs_root;
diff --git a/drivers/scsi/fnic/fnic_trace.c b/drivers/scsi/fnic/fnic_trace.c
index 65a9bde26974..4e15c4bf0795 100644
--- a/drivers/scsi/fnic/fnic_trace.c
+++ b/drivers/scsi/fnic/fnic_trace.c
@@ -21,6 +21,7 @@
 #include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/time.h>
+#include <linux/vmalloc.h>
 #include "fnic_io.h"
 #include "fnic.h"
 
diff --git a/include/linux/io.h b/include/linux/io.h
index 04cce4da3685..fb5a99800e77 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -19,6 +19,7 @@
 #define _LINUX_IO_H
 
 #include <linux/types.h>
+#include <linux/init.h>
 #include <asm/io.h>
 #include <asm/page.h>
 
diff --git a/sound/pci/asihpi/hpioctl.c b/sound/pci/asihpi/hpioctl.c
index 6610bd096fc9..d17937b92331 100644
--- a/sound/pci/asihpi/hpioctl.c
+++ b/sound/pci/asihpi/hpioctl.c
@@ -32,6 +32,7 @@
 #include <linux/pci.h>
 #include <linux/stringify.h>
 #include <linux/module.h>
+#include <linux/vmalloc.h>
 
 #ifdef MODULE_FIRMWARE
 MODULE_FIRMWARE("asihpi/dsp5000.bin");
-- 
cgit v1.2.3


From da7049f834c3582c1ed1a04889bda5b4121973c0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 26 May 2015 13:49:07 -0400
Subject: svcrdma: Remove svc_rdma_xdr_decode_deferred_req()

svc_rdma_xdr_decode_deferred_req() indexes an array with an
un-byte-swapped value off the wire. Fortunately this function
isn't used anywhere, so simply remove it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h        |  1 -
 net/sunrpc/xprtrdma/svc_rdma_marshal.c | 56 ----------------------------------
 2 files changed, 57 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index df8edf8ec914..8ad9b6d9d4e0 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -182,7 +182,6 @@ struct svcxprt_rdma {
 
 /* svc_rdma_marshal.c */
 extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
-extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
 				     struct rpcrdma_msg *,
 				     enum rpcrdma_errcode, u32 *);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index b681855cf970..45e22c9549d2 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -211,62 +211,6 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
 	return hdr_len;
 }
 
-int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
-{
-	struct rpcrdma_msg *rmsgp = NULL;
-	struct rpcrdma_read_chunk *ch;
-	struct rpcrdma_write_array *ary;
-	u32 *va;
-	u32 hdrlen;
-
-	dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
-		rqstp);
-	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-
-	/* Pull in the extra for the padded case and bump our pointer */
-	if (rmsgp->rm_type == RDMA_MSGP) {
-		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
-		rqstp->rq_arg.head[0].iov_base = va;
-		hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
-		rqstp->rq_arg.head[0].iov_len -= hdrlen;
-		return hdrlen;
-	}
-
-	/*
-	 * Skip all chunks to find RPC msg. These were previously processed
-	 */
-	va = &rmsgp->rm_body.rm_chunks[0];
-
-	/* Skip read-list */
-	for (ch = (struct rpcrdma_read_chunk *)va;
-	     ch->rc_discrim != xdr_zero; ch++);
-	va = (u32 *)&ch->rc_position;
-
-	/* Skip write-list */
-	ary = (struct rpcrdma_write_array *)va;
-	if (ary->wc_discrim == xdr_zero)
-		va = (u32 *)&ary->wc_nchunks;
-	else
-		/*
-		 * rs_length is the 2nd 4B field in wc_target and taking its
-		 * address skips the list terminator
-		 */
-		va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
-
-	/* Skip reply-array */
-	ary = (struct rpcrdma_write_array *)va;
-	if (ary->wc_discrim == xdr_zero)
-		va = (u32 *)&ary->wc_nchunks;
-	else
-		va = (u32 *)&ary->wc_array[ary->wc_nchunks];
-
-	rqstp->rq_arg.head[0].iov_base = va;
-	hdrlen = (unsigned long)va - (unsigned long)rmsgp;
-	rqstp->rq_arg.head[0].iov_len -= hdrlen;
-
-	return hdrlen;
-}
-
 int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
 			      struct rpcrdma_msg *rmsgp,
 			      enum rpcrdma_errcode err, u32 *va)
-- 
cgit v1.2.3


From e842f2903908934187af7232fb5b21da527d1757 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 4 Jun 2015 09:18:18 +1000
Subject: dax: don't abuse get_block mapping for endio callbacks

dax_fault() currently relies on the get_block callback to attach an
io completion callback to the mapping buffer head so that it can
run unwritten extent conversion after zeroing allocated blocks.

Instead of this hack, pass the conversion callback directly into
dax_fault() similar to the get_block callback. When the filesystem
allocates unwritten extents, it will set the buffer_unwritten()
flag, and hence the dax_fault code can call the completion function
in the contexts where it is necessary without overloading the
mapping buffer head.

Note: The changes to ext4 to use this interface are suspect at best.
In fact, the way ext4 did this end_io assignment in the first place
looks suspect because it only set a completion callback when there
wasn't already some other write() call taking place on the same
inode. The ext4 end_io code looks rather intricate and fragile with
all it's reference counting and passing to different contexts for
modification via inode private pointers that aren't protected by
locks...

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c           | 21 +++++++++++++++------
 fs/ext2/file.c     |  4 ++--
 fs/ext4/file.c     | 16 ++++++++++++++--
 fs/ext4/inode.c    | 21 +++++++--------------
 include/linux/fs.h |  6 ++++--
 5 files changed, 42 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 6f65f00e58ec..4bb5b7cd5dfd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -309,14 +309,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  out:
 	i_mmap_unlock_read(mapping);
 
-	if (bh->b_end_io)
-		bh->b_end_io(bh, 1);
-
 	return error;
 }
 
 static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -417,7 +414,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		page_cache_release(page);
 	}
 
+	/*
+	 * If we successfully insert the new mapping over an unwritten extent,
+	 * we need to ensure we convert the unwritten extent. If there is an
+	 * error inserting the mapping, the filesystem needs to leave it as
+	 * unwritten to prevent exposure of the stale underlying data to
+	 * userspace, but we still need to call the completion function so
+	 * the private resources on the mapping buffer can be released. We
+	 * indicate what the callback should do via the uptodate variable, same
+	 * as for normal BH based IO completions.
+	 */
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
+	if (buffer_unwritten(&bh))
+		complete_unwritten(&bh, !error);
 
  out:
 	if (error == -ENOMEM)
@@ -445,7 +454,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+	      get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	int result;
 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +463,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block);
+	result = do_dax_fault(vma, vmf, get_block, complete_unwritten);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3a0a6c6406d0..3b57c9f83c9b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -28,12 +28,12 @@
 #ifdef CONFIG_FS_DAX
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext2_get_block);
+	return dax_fault(vma, vmf, ext2_get_block, NULL);
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext2_get_block);
+	return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 0613c256c344..f713cfcc43a2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -192,15 +192,27 @@ out:
 }
 
 #ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+	struct inode *inode = bh->b_assoc_map->host;
+	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+	int err;
+	if (!uptodate)
+		return;
+	WARN_ON(!buffer_unwritten(bh));
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext4_get_block);
+	return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 					/* Is this the right get_block? */
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext4_get_block);
+	return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 55b187c3bac1..7c38ed3494cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -656,18 +656,6 @@ has_zeroout:
 	return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+		if (IS_DAX(inode) && buffer_unwritten(bh)) {
+			/*
+			 * dgc: I suspect unwritten conversion on ext4+DAX is
+			 * fundamentally broken here when there are concurrent
+			 * read/write in progress on this inode.
+			 */
+			WARN_ON_ONCE(io_end);
 			bh->b_assoc_map = inode->i_mapping;
 			bh->b_private = (void *)(unsigned long)iblock;
-			bh->b_end_io = ext4_end_io_unwritten;
 		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 35ec87e490b1..c9b4cca9e08d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 			ssize_t bytes, void *private);
+typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC		0x00000001
 #define MAY_WRITE		0x00000002
@@ -2627,9 +2628,10 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+		dax_iodone_t);
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
+#define dax_mkwrite(vma, vmf, gb, iod)	dax_fault(vma, vmf, gb, iod)
 
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
-- 
cgit v1.2.3


From ce5c5d554dc47a4fb4360c84b72231fea081e7a0 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 4 Jun 2015 09:18:18 +1000
Subject: dax: expose __dax_fault for filesystems with locking constraints

Some filesystems cannot call dax_fault() directly because they have
different locking and/or allocation constraints in the page fault IO
path. To handle this, we need to follow the same model as the
generic block_page_mkwrite code, where the internals are exposed via
__block_page_mkwrite() so that filesystems can wrap the correct
locking and operations around the outside.

This is loosely based on a patch originally from Matthew Willcox.
Unlike the original patch, it does not change ext4 code, error
returns or unwritten extent conversion handling.  It also adds a
__dax_mkwrite() wrapper for .page_mkwrite implementations to do the
right thing, too.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/dax.c           | 15 +++++++++++++--
 include/linux/fs.h |  5 ++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index 4bb5b7cd5dfd..99b5fbc38992 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -312,7 +312,17 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	struct file *file = vma->vm_file;
@@ -443,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 	goto out;
 }
+EXPORT_SYMBOL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -463,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block, complete_unwritten);
+	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c9b4cca9e08d..5784377e7c56 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2630,8 +2630,11 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
 		dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+		dax_iodone_t);
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod)	dax_fault(vma, vmf, gb, iod)
+#define dax_mkwrite(vma, vmf, gb, iod)		dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod)	__dax_fault(vma, vmf, gb, iod)
 
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
-- 
cgit v1.2.3


From c8fff7bc5bba6bd59cad40441c189c4efe7190f6 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Wed, 8 Apr 2015 19:59:20 +0300
Subject: of: return NUMA_NO_NODE from fallback of_node_to_nid()

Node 0 might be offline as well as any other numa node,
in this case kernel cannot handle memory allocation and crashes.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Fixes: 0c3f061c195c ("of: implement of_node_to_nid as a weak function")
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 drivers/of/base.c  | 2 +-
 include/linux/of.h | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 99764db0875a..8e39b38ca206 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -89,7 +89,7 @@ EXPORT_SYMBOL(of_n_size_cells);
 #ifdef CONFIG_NUMA
 int __weak of_node_to_nid(struct device_node *np)
 {
-	return numa_node_id();
+	return NUMA_NO_NODE;
 }
 #endif
 
diff --git a/include/linux/of.h b/include/linux/of.h
index ddeaae6d2083..ab071742c0c4 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -667,7 +667,10 @@ static inline void of_property_clear_flag(struct property *p, unsigned long flag
 #if defined(CONFIG_OF) && defined(CONFIG_NUMA)
 extern int of_node_to_nid(struct device_node *np);
 #else
-static inline int of_node_to_nid(struct device_node *device) { return 0; }
+static inline int of_node_to_nid(struct device_node *device)
+{
+	return NUMA_NO_NODE;
+}
 #endif
 
 static inline struct device_node *of_find_matching_node(
-- 
cgit v1.2.3


From 12f7c14aa602f15ad60e5a9da459271f63b92917 Mon Sep 17 00:00:00 2001
From: Masanari Iida <standby24x7@gmail.com>
Date: Thu, 4 Jun 2015 00:01:21 +0900
Subject: crypto: doc - Fix typo in crypto-API.xml

This patch fix some typos found in crypto-API.xml.
It is because the file is generated from comments in sources,
so I had to fix typo in sources.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Acked-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/aead.h  | 2 +-
 include/crypto/hash.h  | 2 +-
 include/crypto/rng.h   | 2 +-
 include/linux/crypto.h | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 5cb0066be30b..7169ad04acc0 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -33,7 +33,7 @@
  *
  * The example code provided for the asynchronous block cipher operation
  * applies here as well. Naturally all *ablkcipher* symbols must be exchanged
- * the *aead* pendants discussed in the following. In addtion, for the AEAD
+ * the *aead* pendants discussed in the following. In addition, for the AEAD
  * operation, the aead_request_set_assoc function must be used to set the
  * pointer to the associated data memory location before performing the
  * encryption or decryption operation. In case of an encryption, the associated
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 98abda9ed3aa..57c8a6ee33c2 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -66,7 +66,7 @@ struct ahash_request {
 /**
  * struct ahash_alg - asynchronous message digest definition
  * @init: Initialize the transformation context. Intended only to initialize the
- *	  state of the HASH transformation at the begining. This shall fill in
+ *	  state of the HASH transformation at the beginning. This shall fill in
  *	  the internal structures used during the entire duration of the whole
  *	  transformation. No data processing happens at this point.
  * @update: Push a chunk of data into the driver for transformation. This
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index c5d4684429f5..b95ede354a66 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -28,7 +28,7 @@ struct crypto_rng;
  *		if provided to the call.
  * @seed:	Seed or reseed the random number generator.  With the
  *		invocation of this function call, the random number
- *		generator shall become ready fo generation.  If the
+ *		generator shall become ready for generation.  If the
  *		random number generator requires a seed for setting
  *		up a new state, the seed must be provided by the
  *		consumer while invoking this function. The required
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 7d290a91c6f9..25a4b71d6d1f 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -451,7 +451,7 @@ struct compress_alg {
  *		     transformation algorithm.
  * @cra_type: Type of the cryptographic transformation. This is a pointer to
  *	      struct crypto_type, which implements callbacks common for all
- *	      trasnformation types. There are multiple options:
+ *	      transformation types. There are multiple options:
  *	      &crypto_blkcipher_type, &crypto_ablkcipher_type,
  *	      &crypto_ahash_type, &crypto_aead_type, &crypto_rng_type.
  *	      This field might be empty. In that case, there are no common
-- 
cgit v1.2.3


From 64d6067057d9658acb8675afcfba549abdb7fc16 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 7 May 2015 11:36:11 +0200
Subject: KVM: x86: stubs for SMM support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds the interface between x86.c and the emulator: the
SMBASE register, a new emulator flag, the RSM instruction.  It also
adds a new request bit that will be used by the KVM_SMI ioctl.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  4 +++
 arch/x86/include/asm/kvm_host.h    |  1 +
 arch/x86/kvm/emulate.c             | 10 +++++-
 arch/x86/kvm/lapic.c               |  4 ++-
 arch/x86/kvm/svm.c                 |  1 +
 arch/x86/kvm/x86.c                 | 64 ++++++++++++++++++++++++++++++++++++--
 include/linux/kvm_host.h           |  1 +
 7 files changed, 81 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7410879a41f7..e16466ec473c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -193,6 +193,8 @@ struct x86_emulate_ops {
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+	u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
+	void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
 	int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
 	int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
 	int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
@@ -264,6 +266,8 @@ enum x86emul_mode {
 
 /* These match some of the HF_* flags defined in kvm_host.h  */
 #define X86EMUL_GUEST_MASK           (1 << 5) /* VCPU is in guest-mode */
+#define X86EMUL_SMM_MASK             (1 << 6)
+#define X86EMUL_SMM_INSIDE_NMI_MASK  (1 << 7)
 
 struct x86_emulate_ctxt {
 	const struct x86_emulate_ops *ops;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d52d7aea375f..12a7318887ad 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -368,6 +368,7 @@ struct kvm_vcpu_arch {
 	int32_t apic_arb_prio;
 	int mp_state;
 	u64 ia32_misc_enable_msr;
+	u64 smbase;
 	bool tpr_access_reporting;
 	u64 ia32_xss;
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a1c6c25552e9..e763a9b8c26b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2259,6 +2259,14 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+	if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+		return emulate_ud(ctxt);
+
+	return X86EMUL_UNHANDLEABLE;
+}
+
 static void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct desc_struct *cs, struct desc_struct *ss)
@@ -4197,7 +4205,7 @@ static const struct opcode twobyte_table[256] = {
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
-	DI(ImplicitOps, rsm),
+	II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
 	F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c789e00dfa8b..b8e47e2b1d94 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -808,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 
 	case APIC_DM_SMI:
-		apic_debug("Ignoring guest SMI\n");
+		result = 1;
+		kvm_make_request(KVM_REQ_SMI, vcpu);
+		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_NMI:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a08df4145173..c48748cf638e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3394,6 +3394,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_MWAIT]			= mwait_interception,
 	[SVM_EXIT_XSETBV]			= xsetbv_interception,
 	[SVM_EXIT_NPF]				= pf_interception,
+	[SVM_EXIT_RSM]                          = emulate_on_interception,
 };
 
 static void dump_vmcb(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aa46ac1ff48b..ab977e763812 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -954,6 +954,7 @@ static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
 	MSR_IA32_MCG_CTL,
+	MSR_IA32_SMBASE,
 };
 
 static unsigned num_emulated_msrs;
@@ -2282,6 +2283,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
+	case MSR_IA32_SMBASE:
+		if (!msr_info->host_initiated)
+			return 1;
+		vcpu->arch.smbase = data;
+		break;
 	case MSR_KVM_WALL_CLOCK_NEW:
 	case MSR_KVM_WALL_CLOCK:
 		vcpu->kvm->arch.wall_clock = data;
@@ -2679,6 +2685,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_MISC_ENABLE:
 		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
 		break;
+	case MSR_IA32_SMBASE:
+		if (!msr_info->host_initiated)
+			return 1;
+		msr_info->data = vcpu->arch.smbase;
+		break;
 	case MSR_IA32_PERF_STATUS:
 		/* TSC increment by tick */
 		msr_info->data = 1000ULL;
@@ -3103,6 +3114,8 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
 {
+	kvm_make_request(KVM_REQ_SMI, vcpu);
+
 	return 0;
 }
 
@@ -5129,6 +5142,20 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 	return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
 }
 
+static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+	return vcpu->arch.smbase;
+}
+
+static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+	vcpu->arch.smbase = smbase;
+}
+
 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
 			      u32 pmc)
 {
@@ -5214,6 +5241,8 @@ static const struct x86_emulate_ops emulate_ops = {
 	.cpl                 = emulator_get_cpl,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
+	.get_smbase          = emulator_get_smbase,
+	.set_smbase          = emulator_set_smbase,
 	.set_msr             = emulator_set_msr,
 	.get_msr             = emulator_get_msr,
 	.check_pmc	     = emulator_check_pmc,
@@ -5276,6 +5305,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 		     cs_db				? X86EMUL_MODE_PROT32 :
 							  X86EMUL_MODE_PROT16;
 	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+	BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+	BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
 	ctxt->emul_flags = vcpu->arch.hflags;
 
 	init_decode_cache(ctxt);
@@ -5445,9 +5476,24 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
 
-void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+static void kvm_smm_changed(struct kvm_vcpu *vcpu)
 {
+	if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
+		if (unlikely(vcpu->arch.smi_pending)) {
+			kvm_make_request(KVM_REQ_SMI, vcpu);
+			vcpu->arch.smi_pending = 0;
+		}
+	}
+}
+
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+	unsigned changed = vcpu->arch.hflags ^ emul_flags;
+
 	vcpu->arch.hflags = emul_flags;
+
+	if (changed & HF_SMM_MASK)
+		kvm_smm_changed(vcpu);
 }
 
 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
@@ -6341,6 +6387,16 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+	if (is_smm(vcpu)) {
+		vcpu->arch.smi_pending = true;
+		return;
+	}
+
+	printk_once(KERN_DEBUG "Ignoring guest SMI\n");
+}
+
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
 	u64 eoi_exit_bitmap[4];
@@ -6449,6 +6505,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
+		if (kvm_check_request(KVM_REQ_SMI, vcpu))
+			process_smi(vcpu);
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
 		if (kvm_check_request(KVM_REQ_PMU, vcpu))
@@ -7363,8 +7421,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	kvm_async_pf_hash_reset(vcpu);
 	vcpu->arch.apf.halted = false;
 
-	if (!init_event)
+	if (!init_event) {
 		kvm_pmu_reset(vcpu);
+		vcpu->arch.smbase = 0x30000;
+	}
 
 	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
 	vcpu->arch.regs_avail = ~0;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a8bcbc9c6078..b019fee6d941 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -134,6 +134,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_ENABLE_IBS        23
 #define KVM_REQ_DISABLE_IBS       24
 #define KVM_REQ_APIC_PAGE_RELOAD  25
+#define KVM_REQ_SMI               26
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
-- 
cgit v1.2.3


From 9e7c8f8c62c1e1cda203b5bfaba4575b141e42e7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 4 Jun 2015 16:22:16 -0400
Subject: signals: don't abuse __flush_signals() in
 selinux_bprm_committed_creds()

selinux_bprm_committed_creds()->__flush_signals() is not right, we
shouldn't clear TIF_SIGPENDING unconditionally. There can be other
reasons for signal_pending(): freezing(), JOBCTL_PENDING_MASK, and
potentially more.

Also change this code to check fatal_signal_pending() rather than
SIGNAL_GROUP_EXIT, it looks a bit better.

Now we can kill __flush_signals() before it finds another buggy user.

Note: this code looks racy, we can flush a signal which was sent after
the task SID has been updated.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 include/linux/sched.h    |  1 -
 kernel/signal.c          | 13 ++++---------
 security/selinux/hooks.c |  6 ++++--
 3 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8222ae40ecb0..4f84aade8b4d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2373,7 +2373,6 @@ extern void sched_dead(struct task_struct *p);
 
 extern void proc_caches_init(void);
 extern void flush_signals(struct task_struct *);
-extern void __flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
diff --git a/kernel/signal.c b/kernel/signal.c
index d51c5ddd855c..d4972504f2f1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -414,21 +414,16 @@ void flush_sigqueue(struct sigpending *queue)
 }
 
 /*
- * Flush all pending signals for a task.
+ * Flush all pending signals for this kthread.
  */
-void __flush_signals(struct task_struct *t)
-{
-	clear_tsk_thread_flag(t, TIF_SIGPENDING);
-	flush_sigqueue(&t->pending);
-	flush_sigqueue(&t->signal->shared_pending);
-}
-
 void flush_signals(struct task_struct *t)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&t->sighand->siglock, flags);
-	__flush_signals(t);
+	clear_tsk_thread_flag(t, TIF_SIGPENDING);
+	flush_sigqueue(&t->pending);
+	flush_sigqueue(&t->signal->shared_pending);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 99c4a00cce4e..8abbd548ece9 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2416,10 +2416,12 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 		for (i = 0; i < 3; i++)
 			do_setitimer(i, &itimer, NULL);
 		spin_lock_irq(&current->sighand->siglock);
-		if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
-			__flush_signals(current);
+		if (!fatal_signal_pending(current)) {
+			flush_sigqueue(&current->pending);
+			flush_sigqueue(&current->signal->shared_pending);
 			flush_signal_handlers(current, 1);
 			sigemptyset(&current->blocked);
+			recalc_sigpending();
 		}
 		spin_unlock_irq(&current->sighand->siglock);
 	}
-- 
cgit v1.2.3


From 30b7e246a6222f1fbad39b1451273375306fe1e2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Jun 2015 11:21:10 -0400
Subject: svcrdma: Keep rpcrdma_msg fields in network byte-order

Fields in struct rpcrdma_msg are __be32. Don't byte-swap these
fields when decoding RPC calls and then swap them back for the
reply. For the most part, they can be left alone.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          |  2 +-
 net/sunrpc/xprtrdma/svc_rdma_marshal.c   | 84 +++++++++++++++-----------------
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  |  2 +-
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  2 +-
 4 files changed, 42 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 8ad9b6d9d4e0..c03ca0a1b743 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -184,7 +184,7 @@ struct svcxprt_rdma {
 extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
 				     struct rpcrdma_msg *,
-				     enum rpcrdma_errcode, u32 *);
+				     enum rpcrdma_errcode, __be32 *);
 extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
 extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
 extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 45e22c9549d2..e2fca7617242 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -50,12 +50,12 @@
 /*
  * Decodes a read chunk list. The expected format is as follows:
  *    descrim  : xdr_one
- *    position : u32 offset into XDR stream
- *    handle   : u32 RKEY
+ *    position : __be32 offset into XDR stream
+ *    handle   : __be32 RKEY
  *    . . .
  *  end-of-list: xdr_zero
  */
-static u32 *decode_read_list(u32 *va, u32 *vaend)
+static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
 {
 	struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
 
@@ -67,20 +67,20 @@ static u32 *decode_read_list(u32 *va, u32 *vaend)
 		}
 		ch++;
 	}
-	return (u32 *)&ch->rc_position;
+	return &ch->rc_position;
 }
 
 /*
  * Decodes a write chunk list. The expected format is as follows:
  *    descrim  : xdr_one
  *    nchunks  : <count>
- *       handle   : u32 RKEY              ---+
- *       length   : u32 <len of segment>     |
+ *       handle   : __be32 RKEY           ---+
+ *       length   : __be32 <len of segment>  |
  *       offset   : remove va                + <count>
  *       . . .                               |
  *                                        ---+
  */
-static u32 *decode_write_list(u32 *va, u32 *vaend)
+static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
 {
 	unsigned long start, end;
 	int nchunks;
@@ -90,14 +90,14 @@ static u32 *decode_write_list(u32 *va, u32 *vaend)
 
 	/* Check for not write-array */
 	if (ary->wc_discrim == xdr_zero)
-		return (u32 *)&ary->wc_nchunks;
+		return &ary->wc_nchunks;
 
 	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
 	    (unsigned long)vaend) {
 		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
 		return NULL;
 	}
-	nchunks = ntohl(ary->wc_nchunks);
+	nchunks = be32_to_cpu(ary->wc_nchunks);
 
 	start = (unsigned long)&ary->wc_array[0];
 	end = (unsigned long)vaend;
@@ -112,10 +112,10 @@ static u32 *decode_write_list(u32 *va, u32 *vaend)
 	 * rs_length is the 2nd 4B field in wc_target and taking its
 	 * address skips the list terminator
 	 */
-	return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length;
+	return &ary->wc_array[nchunks].wc_target.rs_length;
 }
 
-static u32 *decode_reply_array(u32 *va, u32 *vaend)
+static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
 {
 	unsigned long start, end;
 	int nchunks;
@@ -124,14 +124,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend)
 
 	/* Check for no reply-array */
 	if (ary->wc_discrim == xdr_zero)
-		return (u32 *)&ary->wc_nchunks;
+		return &ary->wc_nchunks;
 
 	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
 	    (unsigned long)vaend) {
 		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
 		return NULL;
 	}
-	nchunks = ntohl(ary->wc_nchunks);
+	nchunks = be32_to_cpu(ary->wc_nchunks);
 
 	start = (unsigned long)&ary->wc_array[0];
 	end = (unsigned long)vaend;
@@ -142,15 +142,14 @@ static u32 *decode_reply_array(u32 *va, u32 *vaend)
 			ary, nchunks, vaend);
 		return NULL;
 	}
-	return (u32 *)&ary->wc_array[nchunks];
+	return (__be32 *)&ary->wc_array[nchunks];
 }
 
 int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
 			    struct svc_rqst *rqstp)
 {
 	struct rpcrdma_msg *rmsgp = NULL;
-	u32 *va;
-	u32 *vaend;
+	__be32 *va, *vaend;
 	u32 hdr_len;
 
 	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
@@ -162,22 +161,17 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
 		return -EINVAL;
 	}
 
-	/* Decode the header */
-	rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
-	rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
-	rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
-	rmsgp->rm_type = ntohl(rmsgp->rm_type);
-
-	if (rmsgp->rm_vers != RPCRDMA_VERSION)
+	if (rmsgp->rm_vers != rpcrdma_version)
 		return -ENOSYS;
 
 	/* Pull in the extra for the padded case and bump our pointer */
-	if (rmsgp->rm_type == RDMA_MSGP) {
+	if (rmsgp->rm_type == rdma_msgp) {
 		int hdrlen;
+
 		rmsgp->rm_body.rm_padded.rm_align =
-			ntohl(rmsgp->rm_body.rm_padded.rm_align);
+			be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
 		rmsgp->rm_body.rm_padded.rm_thresh =
-			ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+			be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
 
 		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
 		rqstp->rq_arg.head[0].iov_base = va;
@@ -192,7 +186,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
 	 * chunk list and a reply chunk list.
 	 */
 	va = &rmsgp->rm_body.rm_chunks[0];
-	vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+	vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
 	va = decode_read_list(va, vaend);
 	if (!va)
 		return -EINVAL;
@@ -213,18 +207,18 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
 
 int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
 			      struct rpcrdma_msg *rmsgp,
-			      enum rpcrdma_errcode err, u32 *va)
+			      enum rpcrdma_errcode err, __be32 *va)
 {
-	u32 *startp = va;
+	__be32 *startp = va;
 
-	*va++ = htonl(rmsgp->rm_xid);
-	*va++ = htonl(rmsgp->rm_vers);
-	*va++ = htonl(xprt->sc_max_requests);
-	*va++ = htonl(RDMA_ERROR);
-	*va++ = htonl(err);
+	*va++ = rmsgp->rm_xid;
+	*va++ = rmsgp->rm_vers;
+	*va++ = cpu_to_be32(xprt->sc_max_requests);
+	*va++ = rdma_error;
+	*va++ = cpu_to_be32(err);
 	if (err == ERR_VERS) {
-		*va++ = htonl(RPCRDMA_VERSION);
-		*va++ = htonl(RPCRDMA_VERSION);
+		*va++ = rpcrdma_version;
+		*va++ = rpcrdma_version;
 	}
 
 	return (int)((unsigned long)va - (unsigned long)startp);
@@ -241,7 +235,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
 		&rmsgp->rm_body.rm_chunks[1];
 	if (wr_ary->wc_discrim)
 		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
+			&wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
 			wc_target.rs_length;
 	else
 		wr_ary = (struct rpcrdma_write_array *)
@@ -250,7 +244,7 @@ int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
 	/* skip reply array */
 	if (wr_ary->wc_discrim)
 		wr_ary = (struct rpcrdma_write_array *)
-			&wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
+			&wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
 	else
 		wr_ary = (struct rpcrdma_write_array *)
 			&wr_ary->wc_nchunks;
@@ -269,7 +263,7 @@ void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
 	ary = (struct rpcrdma_write_array *)
 		&rmsgp->rm_body.rm_chunks[1];
 	ary->wc_discrim = xdr_one;
-	ary->wc_nchunks = htonl(chunks);
+	ary->wc_nchunks = cpu_to_be32(chunks);
 
 	/* write-list terminator */
 	ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
@@ -282,7 +276,7 @@ void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
 				 int chunks)
 {
 	ary->wc_discrim = xdr_one;
-	ary->wc_nchunks = htonl(chunks);
+	ary->wc_nchunks = cpu_to_be32(chunks);
 }
 
 void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
@@ -294,7 +288,7 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
 	struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
 	seg->rs_handle = rs_handle;
 	seg->rs_offset = rs_offset;
-	seg->rs_length = htonl(write_len);
+	seg->rs_length = cpu_to_be32(write_len);
 }
 
 void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
@@ -302,10 +296,10 @@ void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
 				  struct rpcrdma_msg *rdma_resp,
 				  enum rpcrdma_proc rdma_type)
 {
-	rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
-	rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
-	rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
-	rdma_resp->rm_type = htonl(rdma_type);
+	rdma_resp->rm_xid = rdma_argp->rm_xid;
+	rdma_resp->rm_vers = rdma_argp->rm_vers;
+	rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
+	rdma_resp->rm_type = cpu_to_be32(rdma_type);
 
 	/* Encode <nul> chunks lists */
 	rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f9f13a32ddb8..ac93ce01a729 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -85,7 +85,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 
 	/* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */
 	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-	if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG)
+	if (rmsgp->rm_type == rdma_nomsg)
 		rqstp->rq_arg.pages = &rqstp->rq_pages[0];
 	else
 		rqstp->rq_arg.pages = &rqstp->rq_pages[1];
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index f609c1c2d38d..be084938e3ff 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -1319,7 +1319,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	struct ib_send_wr err_wr;
 	struct page *p;
 	struct svc_rdma_op_ctxt *ctxt;
-	u32 *va;
+	__be32 *va;
 	int length;
 	int ret;
 
-- 
cgit v1.2.3


From b7e0b9a965a116341b4ef86ab98ea2843b218271 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Jun 2015 11:21:20 -0400
Subject: svcrdma: Replace GFP_KERNEL in a loop with GFP_NOFAIL

At the 2015 LSF/MM, it was requested that memory allocation
call sites that request GFP_KERNEL allocations in a loop should be
annotated with __GFP_NOFAIL.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          |  1 -
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    |  2 +-
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 32 ++++++--------------------------
 3 files changed, 7 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index c03ca0a1b743..d26384b22126 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -211,7 +211,6 @@ extern int svc_rdma_sendto(struct svc_rqst *);
 extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
 extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
 				enum rpcrdma_errcode);
-struct page *svc_rdma_get_page(void);
 extern int svc_rdma_post_recv(struct svcxprt_rdma *);
 extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
 extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 109e9670be8c..d25cd430f9ff 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -517,7 +517,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	inline_bytes = rqstp->rq_res.len;
 
 	/* Create the RDMA response header */
-	res_page = svc_rdma_get_page();
+	res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
 	rdma_resp = page_address(res_page);
 	reply_ary = svc_rdma_get_reply_array(rdma_argp);
 	if (reply_ary)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index be084938e3ff..1ed4740ff0b5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -99,12 +99,8 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 {
 	struct svc_rdma_op_ctxt *ctxt;
 
-	while (1) {
-		ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, GFP_KERNEL);
-		if (ctxt)
-			break;
-		schedule_timeout_uninterruptible(msecs_to_jiffies(500));
-	}
+	ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep,
+				GFP_KERNEL | __GFP_NOFAIL);
 	ctxt->xprt = xprt;
 	INIT_LIST_HEAD(&ctxt->dto_q);
 	ctxt->count = 0;
@@ -156,12 +152,8 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
 struct svc_rdma_req_map *svc_rdma_get_req_map(void)
 {
 	struct svc_rdma_req_map *map;
-	while (1) {
-		map = kmem_cache_alloc(svc_rdma_map_cachep, GFP_KERNEL);
-		if (map)
-			break;
-		schedule_timeout_uninterruptible(msecs_to_jiffies(500));
-	}
+	map = kmem_cache_alloc(svc_rdma_map_cachep,
+			       GFP_KERNEL | __GFP_NOFAIL);
 	map->count = 0;
 	return map;
 }
@@ -490,18 +482,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 	return cma_xprt;
 }
 
-struct page *svc_rdma_get_page(void)
-{
-	struct page *page;
-
-	while ((page = alloc_page(GFP_KERNEL)) == NULL) {
-		/* If we can't get memory, wait a bit and try again */
-		printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
-		schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
-	}
-	return page;
-}
-
 int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 {
 	struct ib_recv_wr recv_wr, *bad_recv_wr;
@@ -520,7 +500,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 			pr_err("svcrdma: Too many sges (%d)\n", sge_no);
 			goto err_put_ctxt;
 		}
-		page = svc_rdma_get_page();
+		page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
 		ctxt->pages[sge_no] = page;
 		pa = ib_dma_map_page(xprt->sc_cm_id->device,
 				     page, 0, PAGE_SIZE,
@@ -1323,7 +1303,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	int length;
 	int ret;
 
-	p = svc_rdma_get_page();
+	p = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
 	va = page_address(p);
 
 	/* XDR encode error */
-- 
cgit v1.2.3


From 0380a3f37540ad0582b3c749a74fc127af914689 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Jun 2015 11:21:32 -0400
Subject: svcrdma: Add a separate "max data segs macro for svcrdma

The server and client maximum are architecturally independent.
Allow changing one without affecting the other.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          | 7 +++++++
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +-
 net/sunrpc/xprtrdma/xprt_rdma.h          | 6 ------
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index d26384b22126..cb94ee4181d4 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -172,6 +172,13 @@ struct svcxprt_rdma {
 #define RDMAXPRT_SQ_PENDING	2
 #define RDMAXPRT_CONN_PENDING	3
 
+#define RPCRDMA_MAX_SVC_SEGS	(64)	/* server max scatter/gather */
+#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_SVC_SEGS << PAGE_SHIFT)
+#define RPCRDMA_MAXPAYLOAD	RPCSVC_MAXPAYLOAD
+#else
+#define RPCRDMA_MAXPAYLOAD	(RPCRDMA_MAX_SVC_SEGS << PAGE_SHIFT)
+#endif
+
 #define RPCRDMA_LISTEN_BACKLOG  10
 /* The default ORD value is based on two outstanding full-size writes with a
  * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ.  */
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 1ed4740ff0b5..3b4c2ff66a2f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -91,7 +91,7 @@ struct svc_xprt_class svc_rdma_class = {
 	.xcl_name = "rdma",
 	.xcl_owner = THIS_MODULE,
 	.xcl_ops = &svc_rdma_ops,
-	.xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA,
+	.xcl_max_payload = RPCRDMA_MAXPAYLOAD,
 	.xcl_ident = XPRT_TRANSPORT_RDMA,
 };
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 78e0b8beaa36..e60907b0e1f1 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -487,10 +487,4 @@ extern struct kmem_cache *svc_rdma_ctxt_cachep;
 /* Workqueue created in svc_rdma.c */
 extern struct workqueue_struct *svc_rdma_wq;
 
-#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
-#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
-#else
-#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
-#endif
-
 #endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */
-- 
cgit v1.2.3


From 42aecaa9bb2bd57eb8d61b4565cee5d3640863fb Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Thu, 4 Jun 2015 09:16:39 -0700
Subject: net: Get skb hash over flow_keys structure

This patch changes flow hashing to use jhash2 over the flow_keys
structure instead just doing jhash_3words over src, dst, and ports.
This method will allow us take more input into the hashing function
so that we can include full IPv6 addresses, VLAN, flow labels etc.
without needing to resort to xor'ing which makes for a poor hash.

Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       |  2 +-
 include/net/flow_dissector.h | 21 ++++++++++++++---
 include/net/ip.h             |  2 ++
 include/net/ipv6.h           |  2 ++
 net/core/flow_dissector.c    | 54 +++++++++++++++++++++++++++++++++-----------
 net/sched/cls_flower.c       |  2 ++
 6 files changed, 66 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6b41c15efa27..cc612fc0a894 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1943,7 +1943,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb,
 	if (skb_transport_header_was_set(skb))
 		return;
 	else if (skb_flow_dissect_flow_keys(skb, &keys))
-		skb_set_transport_header(skb, keys.basic.thoff);
+		skb_set_transport_header(skb, keys.control.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
 }
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index bac9c1421f58..cba6a10b214a 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -6,6 +6,15 @@
 #include <linux/in6.h>
 #include <uapi/linux/if_ether.h>
 
+/**
+ * struct flow_dissector_key_control:
+ * @thoff: Transport header offset
+ */
+struct flow_dissector_key_control {
+	u16	thoff;
+	u16	padding;
+};
+
 /**
  * struct flow_dissector_key_basic:
  * @thoff: Transport header offset
@@ -13,9 +22,9 @@
  * @ip_proto: Transport header protocol (eg. TCP/UDP)
  */
 struct flow_dissector_key_basic {
-	u16	thoff;
 	__be16	n_proto;
 	u8	ip_proto;
+	u8	padding;
 };
 
 /**
@@ -70,6 +79,7 @@ struct flow_dissector_key_eth_addrs {
 };
 
 enum flow_dissector_key_id {
+	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
 	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
 	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */
 	FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */
@@ -109,11 +119,16 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb,
 }
 
 struct flow_keys {
-	struct flow_dissector_key_addrs addrs;
-	struct flow_dissector_key_ports ports;
+	struct flow_dissector_key_control control;
+#define FLOW_KEYS_HASH_START_FIELD basic
 	struct flow_dissector_key_basic basic;
+	struct flow_dissector_key_ports ports;
+	struct flow_dissector_key_addrs addrs;
 };
 
+#define FLOW_KEYS_HASH_OFFSET		\
+	offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD)
+
 extern struct flow_dissector flow_keys_dissector;
 extern struct flow_dissector flow_keys_buf_dissector;
 
diff --git a/include/net/ip.h b/include/net/ip.h
index 9b976cf99122..16cfc87fed6c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -360,6 +360,8 @@ static inline void inet_set_txhash(struct sock *sk)
 	struct inet_sock *inet = inet_sk(sk);
 	struct flow_keys keys;
 
+	memset(&keys, 0, sizeof(keys));
+
 	keys.addrs.src = inet->inet_saddr;
 	keys.addrs.dst = inet->inet_daddr;
 	keys.ports.src = inet->inet_sport;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 35d485c78080..474ca466a091 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -699,6 +699,8 @@ static inline void ip6_set_txhash(struct sock *sk)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct flow_keys keys;
 
+	memset(&keys, 0, sizeof(keys));
+
 	keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr);
 	keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
 	keys.ports.src = inet->inet_sport;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0763795bea8f..55b5f2962afa 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -57,9 +57,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 		flow_dissector->offset[key->key_id] = key->offset;
 	}
 
-	/* Ensure that the dissector always includes basic key. That way
-	 * we are able to avoid handling lack of it in fast path.
+	/* Ensure that the dissector always includes control and basic key.
+	 * That way we are able to avoid handling lack of these in fast path.
 	 */
+	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
+					    FLOW_DISSECTOR_KEY_CONTROL));
 	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
 					    FLOW_DISSECTOR_KEY_BASIC));
 }
@@ -120,6 +122,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen)
 {
+	struct flow_dissector_key_control *key_control;
 	struct flow_dissector_key_basic *key_basic;
 	struct flow_dissector_key_addrs *key_addrs;
 	struct flow_dissector_key_ports *key_ports;
@@ -132,6 +135,13 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		hlen = skb_headlen(skb);
 	}
 
+	/* It is ensured by skb_flow_dissector_init() that control key will
+	 * be always present.
+	 */
+	key_control = skb_flow_dissector_target(flow_dissector,
+						FLOW_DISSECTOR_KEY_CONTROL,
+						target_container);
+
 	/* It is ensured by skb_flow_dissector_init() that basic key will
 	 * be always present.
 	 */
@@ -219,7 +229,7 @@ flow_label:
 
 			key_basic->n_proto = proto;
 			key_basic->ip_proto = ip_proto;
-			key_basic->thoff = (u16)nhoff;
+			key_control->thoff = (u16)nhoff;
 
 			if (skb_flow_dissector_uses_key(flow_dissector,
 							FLOW_DISSECTOR_KEY_PORTS)) {
@@ -275,7 +285,7 @@ flow_label:
 		if (!hdr)
 			return false;
 		key_basic->n_proto = proto;
-		key_basic->thoff = (u16)nhoff;
+		key_control->thoff = (u16)nhoff;
 
 		if (skb_flow_dissector_uses_key(flow_dissector,
 						FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) {
@@ -288,7 +298,7 @@ flow_label:
 		return true;
 	}
 	case htons(ETH_P_FCOE):
-		key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
+		key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
 		/* fall through */
 	default:
 		return false;
@@ -345,7 +355,7 @@ flow_label:
 
 	key_basic->n_proto = proto;
 	key_basic->ip_proto = ip_proto;
-	key_basic->thoff = (u16) nhoff;
+	key_control->thoff = (u16)nhoff;
 
 	if (skb_flow_dissector_uses_key(flow_dissector,
 					FLOW_DISSECTOR_KEY_PORTS)) {
@@ -366,9 +376,21 @@ static __always_inline void __flow_hash_secret_init(void)
 	net_get_random_once(&hashrnd, sizeof(hashrnd));
 }
 
-static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval)
+static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval)
+{
+	return jhash2(words, length, keyval);
+}
+
+static inline void *flow_keys_hash_start(struct flow_keys *flow)
 {
-	return jhash_3words(a, b, c, keyval);
+	BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
+	return (void *)flow + FLOW_KEYS_HASH_OFFSET;
+}
+
+static inline size_t flow_keys_hash_length(struct flow_keys *flow)
+{
+	BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
+	return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32);
 }
 
 static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
@@ -383,10 +405,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 		swap(keys->ports.src, keys->ports.dst);
 	}
 
-	hash = __flow_hash_3words((__force u32)keys->addrs.dst,
-				  (__force u32)keys->addrs.src,
-				  (__force u32)keys->ports.ports,
-				  keyval);
+	hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys),
+				 flow_keys_hash_length(keys), keyval);
 	if (!hash)
 		hash = 1;
 
@@ -473,7 +493,7 @@ EXPORT_SYMBOL(skb_get_hash_perturb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen)
 {
-	u32 poff = keys->basic.thoff;
+	u32 poff = keys->control.thoff;
 
 	switch (keys->basic.ip_proto) {
 	case IPPROTO_TCP: {
@@ -536,6 +556,10 @@ u32 skb_get_poff(const struct sk_buff *skb)
 }
 
 static const struct flow_dissector_key flow_keys_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
+		.offset = offsetof(struct flow_keys, control),
+	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_BASIC,
 		.offset = offsetof(struct flow_keys, basic),
@@ -555,6 +579,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 };
 
 static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
+		.offset = offsetof(struct flow_keys, control),
+	},
 	{
 		.key_id = FLOW_DISSECTOR_KEY_BASIC,
 		.offset = offsetof(struct flow_keys, basic),
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 8c8f34ef6980..5a7d66c59684 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -25,6 +25,7 @@
 
 struct fl_flow_key {
 	int	indev_ifindex;
+	struct flow_dissector_key_control control;
 	struct flow_dissector_key_basic basic;
 	struct flow_dissector_key_eth_addrs eth;
 	union {
@@ -347,6 +348,7 @@ static void fl_init_dissector(struct cls_fl_head *head,
 	struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
 	size_t cnt = 0;
 
+	FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
 	FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
 	FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
-- 
cgit v1.2.3


From 01949d0109ee5fae33752f0db99a36f1619e1873 Mon Sep 17 00:00:00 2001
From: Haggai Abramonvsky <hagaya@mellanox.com>
Date: Thu, 4 Jun 2015 19:30:38 +0300
Subject: net/mlx5_core: Enable XRCs and SRQs when using ISSI > 0

When working in ISSI > 0 mode, the model exposed by the device for
XRCs and SRQs is different. XRCs use XRC SRQs and plain SRQs are based
on RPM (Receive Memory Pool).

Add helper functions to create, modify, query, and arm XRC SRQs and RMPs.

Signed-off-by: Haggai Abramovsky <hagaya@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/srq.c                   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/srq.c      | 444 ++++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 154 +++++++
 drivers/net/ethernet/mellanox/mlx5/core/transobj.h |  11 +
 include/linux/mlx5/driver.h                        |   6 +-
 include/linux/mlx5/mlx5_ifc.h                      |  16 +-
 7 files changed, 564 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index e8e8e942fa4a..e008505e96e9 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -302,7 +302,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 
 	in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn);
 	in->ctx.db_record = cpu_to_be64(srq->db.dma);
-	err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen);
+	err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen, is_xrc);
 	kvfree(in);
 	if (err) {
 		mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 87e9e606596a..07540f732df1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_MLX5_CORE)		+= mlx5_core.o
 
 mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o   \
-		mad.o
-mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o vport.o transobj.o \
+		mad.o transobj.o
+mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o vport.o \
 		en_main.o en_flow_table.o en_ethtool.o en_tx.o en_rx.o \
 		en_txrx.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
index f9d25dcd03c1..c48f504ccbeb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
@@ -37,6 +37,7 @@
 #include <linux/mlx5/srq.h>
 #include <rdma/ib_verbs.h>
 #include "mlx5_core.h"
+#include "transobj.h"
 
 void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
 {
@@ -62,6 +63,74 @@ void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
 		complete(&srq->free);
 }
 
+static int get_pas_size(void *srqc)
+{
+	u32 log_page_size = MLX5_GET(srqc, srqc, log_page_size) + 12;
+	u32 log_srq_size  = MLX5_GET(srqc, srqc, log_srq_size);
+	u32 log_rq_stride = MLX5_GET(srqc, srqc, log_rq_stride);
+	u32 page_offset   = MLX5_GET(srqc, srqc, page_offset);
+	u32 po_quanta	  = 1 << (log_page_size - 6);
+	u32 rq_sz	  = 1 << (log_srq_size + 4 + log_rq_stride);
+	u32 page_size	  = 1 << log_page_size;
+	u32 rq_sz_po      = rq_sz + (page_offset * po_quanta);
+	u32 rq_num_pas	  = (rq_sz_po + page_size - 1) / page_size;
+
+	return rq_num_pas * sizeof(u64);
+}
+
+static void rmpc_srqc_reformat(void *srqc, void *rmpc, bool srqc_to_rmpc)
+{
+	void *wq = MLX5_ADDR_OF(rmpc, rmpc, wq);
+
+	if (srqc_to_rmpc) {
+		switch (MLX5_GET(srqc, srqc, state)) {
+		case MLX5_SRQC_STATE_GOOD:
+			MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+			break;
+		case MLX5_SRQC_STATE_ERROR:
+			MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_ERR);
+			break;
+		default:
+			pr_warn("%s: %d: Unknown srq state = 0x%x\n", __func__,
+				__LINE__, MLX5_GET(srqc, srqc, state));
+			MLX5_SET(rmpc, rmpc, state, MLX5_GET(srqc, srqc, state));
+		}
+
+		MLX5_SET(wq,   wq, wq_signature,  MLX5_GET(srqc,  srqc, wq_signature));
+		MLX5_SET(wq,   wq, log_wq_pg_sz,  MLX5_GET(srqc,  srqc, log_page_size));
+		MLX5_SET(wq,   wq, log_wq_stride, MLX5_GET(srqc,  srqc, log_rq_stride) + 4);
+		MLX5_SET(wq,   wq, log_wq_sz,     MLX5_GET(srqc,  srqc, log_srq_size));
+		MLX5_SET(wq,   wq, page_offset,   MLX5_GET(srqc,  srqc, page_offset));
+		MLX5_SET(wq,   wq, lwm,           MLX5_GET(srqc,  srqc, lwm));
+		MLX5_SET(wq,   wq, pd,            MLX5_GET(srqc,  srqc, pd));
+		MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(srqc,	  srqc, dbr_addr));
+	} else {
+		switch (MLX5_GET(rmpc, rmpc, state)) {
+		case MLX5_RMPC_STATE_RDY:
+			MLX5_SET(srqc, srqc, state, MLX5_SRQC_STATE_GOOD);
+			break;
+		case MLX5_RMPC_STATE_ERR:
+			MLX5_SET(srqc, srqc, state, MLX5_SRQC_STATE_ERROR);
+			break;
+		default:
+			pr_warn("%s: %d: Unknown rmp state = 0x%x\n",
+				__func__, __LINE__,
+				MLX5_GET(rmpc, rmpc, state));
+			MLX5_SET(srqc, srqc, state,
+				 MLX5_GET(rmpc, rmpc, state));
+		}
+
+		MLX5_SET(srqc,   srqc, wq_signature,   MLX5_GET(wq,   wq, wq_signature));
+		MLX5_SET(srqc,   srqc, log_page_size,  MLX5_GET(wq,   wq, log_wq_pg_sz));
+		MLX5_SET(srqc,   srqc, log_rq_stride,  MLX5_GET(wq,   wq, log_wq_stride) - 4);
+		MLX5_SET(srqc,   srqc, log_srq_size,   MLX5_GET(wq,   wq, log_wq_sz));
+		MLX5_SET(srqc,   srqc, page_offset,    MLX5_GET(wq,   wq, page_offset));
+		MLX5_SET(srqc,   srqc, lwm,	       MLX5_GET(wq,   wq, lwm));
+		MLX5_SET(srqc,   srqc, pd,	       MLX5_GET(wq,   wq, pd));
+		MLX5_SET64(srqc, srqc, dbr_addr,       MLX5_GET64(wq, wq, dbr_addr));
+	}
+}
+
 struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
 {
 	struct mlx5_srq_table *table = &dev->priv.srq_table;
@@ -79,26 +148,311 @@ struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
 }
 EXPORT_SYMBOL(mlx5_core_get_srq);
 
-int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-			 struct mlx5_create_srq_mbox_in *in, int inlen)
+static int create_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			  struct mlx5_create_srq_mbox_in *in, int inlen)
 {
 	struct mlx5_create_srq_mbox_out out;
-	struct mlx5_srq_table *table = &dev->priv.srq_table;
-	struct mlx5_destroy_srq_mbox_in din;
-	struct mlx5_destroy_srq_mbox_out dout;
 	int err;
 
 	memset(&out, 0, sizeof(out));
+
 	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_SRQ);
-	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
-	if (err)
-		return err;
 
-	if (out.hdr.status)
-		return mlx5_cmd_status_to_err(&out.hdr);
+	err = mlx5_cmd_exec_check_status(dev, (u32 *)in, inlen, (u32 *)(&out),
+					 sizeof(out));
 
 	srq->srqn = be32_to_cpu(out.srqn) & 0xffffff;
 
+	return err;
+}
+
+static int destroy_srq_cmd(struct mlx5_core_dev *dev,
+			   struct mlx5_core_srq *srq)
+{
+	struct mlx5_destroy_srq_mbox_in in;
+	struct mlx5_destroy_srq_mbox_out out;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
+	in.srqn = cpu_to_be32(srq->srqn);
+
+	return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), sizeof(in),
+					  (u32 *)(&out), sizeof(out));
+}
+
+static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+		       u16 lwm, int is_srq)
+{
+	struct mlx5_arm_srq_mbox_in	in;
+	struct mlx5_arm_srq_mbox_out	out;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ARM_RQ);
+	in.hdr.opmod = cpu_to_be16(!!is_srq);
+	in.srqn = cpu_to_be32(srq->srqn);
+	in.lwm = cpu_to_be16(lwm);
+
+	return mlx5_cmd_exec_check_status(dev, (u32 *)(&in),
+					  sizeof(in), (u32 *)(&out),
+					  sizeof(out));
+}
+
+static int query_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_query_srq_mbox_out *out)
+{
+	struct mlx5_query_srq_mbox_in in;
+
+	memset(&in, 0, sizeof(in));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SRQ);
+	in.srqn = cpu_to_be32(srq->srqn);
+
+	return mlx5_cmd_exec_check_status(dev, (u32 *)(&in), sizeof(in),
+					  (u32 *)out, sizeof(*out));
+}
+
+static int create_xrc_srq_cmd(struct mlx5_core_dev *dev,
+			      struct mlx5_core_srq *srq,
+			      struct mlx5_create_srq_mbox_in *in,
+			      int srq_inlen)
+{
+	u32 create_out[MLX5_ST_SZ_DW(create_xrc_srq_out)];
+	void *create_in;
+	void *srqc;
+	void *xrc_srqc;
+	void *pas;
+	int pas_size;
+	int inlen;
+	int err;
+
+	srqc	  = MLX5_ADDR_OF(create_srq_in, in, srq_context_entry);
+	pas_size  = get_pas_size(srqc);
+	inlen	  = MLX5_ST_SZ_BYTES(create_xrc_srq_in) + pas_size;
+	create_in = mlx5_vzalloc(inlen);
+	if (!create_in)
+		return -ENOMEM;
+
+	xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, create_in,
+				xrc_srq_context_entry);
+	pas	 = MLX5_ADDR_OF(create_xrc_srq_in, create_in, pas);
+
+	memcpy(xrc_srqc, srqc, MLX5_ST_SZ_BYTES(srqc));
+	memcpy(pas, in->pas, pas_size);
+	/* 0xffffff means we ask to work with cqe version 0 */
+	MLX5_SET(xrc_srqc,	    xrc_srqc,  user_index, 0xffffff);
+	MLX5_SET(create_xrc_srq_in, create_in, opcode,
+		 MLX5_CMD_OP_CREATE_XRC_SRQ);
+
+	memset(create_out, 0, sizeof(create_out));
+	err = mlx5_cmd_exec_check_status(dev, create_in, inlen, create_out,
+					 sizeof(create_out));
+	if (err)
+		goto out;
+
+	srq->srqn = MLX5_GET(create_xrc_srq_out, create_out, xrc_srqn);
+out:
+	kvfree(create_in);
+	return err;
+}
+
+static int destroy_xrc_srq_cmd(struct mlx5_core_dev *dev,
+			       struct mlx5_core_srq *srq)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)];
+	u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)];
+
+	memset(xrcsrq_in, 0, sizeof(xrcsrq_in));
+	memset(xrcsrq_out, 0, sizeof(xrcsrq_out));
+
+	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode,
+		 MLX5_CMD_OP_DESTROY_XRC_SRQ);
+	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+
+	return mlx5_cmd_exec_check_status(dev, xrcsrq_in, sizeof(xrcsrq_in),
+					  xrcsrq_out, sizeof(xrcsrq_out));
+}
+
+static int arm_xrc_srq_cmd(struct mlx5_core_dev *dev,
+			   struct mlx5_core_srq *srq, u16 lwm)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)];
+	u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)];
+
+	memset(xrcsrq_in, 0, sizeof(xrcsrq_in));
+	memset(xrcsrq_out, 0, sizeof(xrcsrq_out));
+
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, opcode,   MLX5_CMD_OP_ARM_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod,   MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm,      lwm);
+
+	return  mlx5_cmd_exec_check_status(dev, xrcsrq_in, sizeof(xrcsrq_in),
+					   xrcsrq_out, sizeof(xrcsrq_out));
+}
+
+static int query_xrc_srq_cmd(struct mlx5_core_dev *dev,
+			     struct mlx5_core_srq *srq,
+			     struct mlx5_query_srq_mbox_out *out)
+{
+	u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)];
+	u32 *xrcsrq_out;
+	void *srqc;
+	void *xrc_srqc;
+	int err;
+
+	xrcsrq_out = mlx5_vzalloc(MLX5_ST_SZ_BYTES(query_xrc_srq_out));
+	if (!xrcsrq_out)
+		return -ENOMEM;
+	memset(xrcsrq_in, 0, sizeof(xrcsrq_in));
+
+	MLX5_SET(query_xrc_srq_in, xrcsrq_in, opcode,
+		 MLX5_CMD_OP_QUERY_XRC_SRQ);
+	MLX5_SET(query_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+	err =  mlx5_cmd_exec_check_status(dev, xrcsrq_in, sizeof(xrcsrq_in),
+					  xrcsrq_out,
+					  MLX5_ST_SZ_BYTES(query_xrc_srq_out));
+	if (err)
+		goto out;
+
+	xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, xrcsrq_out,
+				xrc_srq_context_entry);
+	srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry);
+	memcpy(srqc, xrc_srqc, MLX5_ST_SZ_BYTES(srqc));
+
+out:
+	kvfree(xrcsrq_out);
+	return err;
+}
+
+static int create_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			  struct mlx5_create_srq_mbox_in *in, int srq_inlen)
+{
+	void *create_in;
+	void *rmpc;
+	void *srqc;
+	int pas_size;
+	int inlen;
+	int err;
+
+	srqc = MLX5_ADDR_OF(create_srq_in, in, srq_context_entry);
+	pas_size = get_pas_size(srqc);
+	inlen = MLX5_ST_SZ_BYTES(create_rmp_in) + pas_size;
+	create_in = mlx5_vzalloc(inlen);
+	if (!create_in)
+		return -ENOMEM;
+
+	rmpc = MLX5_ADDR_OF(create_rmp_in, create_in, ctx);
+
+	memcpy(MLX5_ADDR_OF(rmpc, rmpc, wq.pas), in->pas, pas_size);
+	rmpc_srqc_reformat(srqc, rmpc, true);
+
+	err = mlx5_core_create_rmp(dev, create_in, inlen, &srq->srqn);
+
+	kvfree(create_in);
+	return err;
+}
+
+static int destroy_rmp_cmd(struct mlx5_core_dev *dev,
+			   struct mlx5_core_srq *srq)
+{
+	return mlx5_core_destroy_rmp(dev, srq->srqn);
+}
+
+static int arm_rmp_cmd(struct mlx5_core_dev *dev,
+		       struct mlx5_core_srq *srq,
+		       u16 lwm)
+{
+	void *in;
+	void *rmpc;
+	void *wq;
+	void *bitmask;
+	int err;
+
+	in = mlx5_vzalloc(MLX5_ST_SZ_BYTES(modify_rmp_in));
+	if (!in)
+		return -ENOMEM;
+
+	rmpc =	  MLX5_ADDR_OF(modify_rmp_in,   in,   ctx);
+	bitmask = MLX5_ADDR_OF(modify_rmp_in,   in,   bitmask);
+	wq   =	  MLX5_ADDR_OF(rmpc,	        rmpc, wq);
+
+	MLX5_SET(modify_rmp_in, in,	 rmp_state, MLX5_RMPC_STATE_RDY);
+	MLX5_SET(modify_rmp_in, in,	 rmpn,      srq->srqn);
+	MLX5_SET(wq,		wq,	 lwm,	    lwm);
+	MLX5_SET(rmp_bitmask,	bitmask, lwm,	    1);
+	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
+
+	err = mlx5_core_modify_rmp(dev, in, MLX5_ST_SZ_BYTES(modify_rmp_in));
+
+	kvfree(in);
+	return err;
+}
+
+static int query_rmp_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_query_srq_mbox_out *out)
+{
+	u32 *rmp_out;
+	void *rmpc;
+	void *srqc;
+	int err;
+
+	rmp_out =  mlx5_vzalloc(MLX5_ST_SZ_BYTES(query_rmp_out));
+	if (!rmp_out)
+		return -ENOMEM;
+
+	err = mlx5_core_query_rmp(dev, srq->srqn, rmp_out);
+	if (err)
+		goto out;
+
+	srqc = MLX5_ADDR_OF(query_srq_out, out,	    srq_context_entry);
+	rmpc = MLX5_ADDR_OF(query_rmp_out, rmp_out, rmp_context);
+	rmpc_srqc_reformat(srqc, rmpc, false);
+
+out:
+	kvfree(rmp_out);
+	return err;
+}
+
+static int create_srq_split(struct mlx5_core_dev *dev,
+			    struct mlx5_core_srq *srq,
+			    struct mlx5_create_srq_mbox_in *in,
+			    int inlen, int is_xrc)
+{
+	if (!dev->issi)
+		return create_srq_cmd(dev, srq, in, inlen);
+	else if (srq->common.res == MLX5_RES_XSRQ)
+		return create_xrc_srq_cmd(dev, srq, in, inlen);
+	else
+		return create_rmp_cmd(dev, srq, in, inlen);
+}
+
+static int destroy_srq_split(struct mlx5_core_dev *dev,
+			     struct mlx5_core_srq *srq)
+{
+	if (!dev->issi)
+		return destroy_srq_cmd(dev, srq);
+	else if (srq->common.res == MLX5_RES_XSRQ)
+		return destroy_xrc_srq_cmd(dev, srq);
+	else
+		return destroy_rmp_cmd(dev, srq);
+}
+
+int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_create_srq_mbox_in *in, int inlen,
+			 int is_xrc)
+{
+	int err;
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+
+	srq->common.res = is_xrc ? MLX5_RES_XSRQ : MLX5_RES_SRQ;
+
+	err = create_srq_split(dev, srq, in, inlen, is_xrc);
+	if (err)
+		return err;
+
 	atomic_set(&srq->refcount, 1);
 	init_completion(&srq->free);
 
@@ -107,25 +461,20 @@ int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 	spin_unlock_irq(&table->lock);
 	if (err) {
 		mlx5_core_warn(dev, "err %d, srqn 0x%x\n", err, srq->srqn);
-		goto err_cmd;
+		goto err_destroy_srq_split;
 	}
 
 	return 0;
 
-err_cmd:
-	memset(&din, 0, sizeof(din));
-	memset(&dout, 0, sizeof(dout));
-	din.srqn = cpu_to_be32(srq->srqn);
-	din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
-	mlx5_cmd_exec(dev, &din, sizeof(din), &dout, sizeof(dout));
+err_destroy_srq_split:
+	destroy_srq_split(dev, srq);
+
 	return err;
 }
 EXPORT_SYMBOL(mlx5_core_create_srq);
 
 int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
 {
-	struct mlx5_destroy_srq_mbox_in in;
-	struct mlx5_destroy_srq_mbox_out out;
 	struct mlx5_srq_table *table = &dev->priv.srq_table;
 	struct mlx5_core_srq *tmp;
 	int err;
@@ -142,17 +491,10 @@ int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
 		return -EINVAL;
 	}
 
-	memset(&in, 0, sizeof(in));
-	memset(&out, 0, sizeof(out));
-	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
-	in.srqn = cpu_to_be32(srq->srqn);
-	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	err = destroy_srq_split(dev, srq);
 	if (err)
 		return err;
 
-	if (out.hdr.status)
-		return mlx5_cmd_status_to_err(&out.hdr);
-
 	if (atomic_dec_and_test(&srq->refcount))
 		complete(&srq->free);
 	wait_for_completion(&srq->free);
@@ -164,48 +506,24 @@ EXPORT_SYMBOL(mlx5_core_destroy_srq);
 int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 			struct mlx5_query_srq_mbox_out *out)
 {
-	struct mlx5_query_srq_mbox_in in;
-	int err;
-
-	memset(&in, 0, sizeof(in));
-	memset(out, 0, sizeof(*out));
-
-	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SRQ);
-	in.srqn = cpu_to_be32(srq->srqn);
-	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
-	if (err)
-		return err;
-
-	if (out->hdr.status)
-		return mlx5_cmd_status_to_err(&out->hdr);
-
-	return err;
+	if (!dev->issi)
+		return query_srq_cmd(dev, srq, out);
+	else if (srq->common.res == MLX5_RES_XSRQ)
+		return query_xrc_srq_cmd(dev, srq, out);
+	else
+		return query_rmp_cmd(dev, srq, out);
 }
 EXPORT_SYMBOL(mlx5_core_query_srq);
 
 int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 		      u16 lwm, int is_srq)
 {
-	struct mlx5_arm_srq_mbox_in	in;
-	struct mlx5_arm_srq_mbox_out	out;
-	int err;
-
-	memset(&in, 0, sizeof(in));
-	memset(&out, 0, sizeof(out));
-
-	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ARM_RQ);
-	in.hdr.opmod = cpu_to_be16(!!is_srq);
-	in.srqn = cpu_to_be32(srq->srqn);
-	in.lwm = cpu_to_be16(lwm);
-
-	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
-	if (err)
-		return err;
-
-	if (out.hdr.status)
-		return mlx5_cmd_status_to_err(&out.hdr);
-
-	return err;
+	if (!dev->issi)
+		return arm_srq_cmd(dev, srq, lwm, is_srq);
+	else if (srq->common.res == MLX5_RES_XSRQ)
+		return arm_xrc_srq_cmd(dev, srq, lwm);
+	else
+		return arm_rmp_cmd(dev, srq, lwm);
 }
 EXPORT_SYMBOL(mlx5_core_arm_srq);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index d918816ac4d8..7a120283de2c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -169,3 +169,157 @@ void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn)
 
 	mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
 }
+
+int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			 u32 *rmpn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_rmp_out)];
+	int err;
+
+	MLX5_SET(create_rmp_in, in, opcode, MLX5_CMD_OP_CREATE_RMP);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*rmpn = MLX5_GET(create_rmp_out, out, rmpn);
+
+	return err;
+}
+
+int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_rmp_out)];
+
+	MLX5_SET(modify_rmp_in, in, opcode, MLX5_CMD_OP_MODIFY_RMP);
+
+	memset(out, 0, sizeof(out));
+	return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+
+int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)];
+	u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP);
+	MLX5_SET(destroy_rmp_in, in, rmpn, rmpn);
+
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					  sizeof(out));
+}
+
+int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out)
+{
+	u32 in[MLX5_ST_SZ_DW(query_rmp_in)];
+	int outlen = MLX5_ST_SZ_BYTES(query_rmp_out);
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(query_rmp_in, in, opcode, MLX5_CMD_OP_QUERY_RMP);
+	MLX5_SET(query_rmp_in, in, rmpn,   rmpn);
+
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen);
+}
+
+int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm)
+{
+	void *in;
+	void *rmpc;
+	void *wq;
+	void *bitmask;
+	int  err;
+
+	in = mlx5_vzalloc(MLX5_ST_SZ_BYTES(modify_rmp_in));
+	if (!in)
+		return -ENOMEM;
+
+	rmpc    = MLX5_ADDR_OF(modify_rmp_in,   in,   ctx);
+	bitmask = MLX5_ADDR_OF(modify_rmp_in,   in,   bitmask);
+	wq      = MLX5_ADDR_OF(rmpc,	        rmpc, wq);
+
+	MLX5_SET(modify_rmp_in, in,	 rmp_state, MLX5_RMPC_STATE_RDY);
+	MLX5_SET(modify_rmp_in, in,	 rmpn,      rmpn);
+	MLX5_SET(wq,		wq,	 lwm,	    lwm);
+	MLX5_SET(rmp_bitmask,	bitmask, lwm,	    1);
+	MLX5_SET(rmpc,		rmpc,	 state,	    MLX5_RMPC_STATE_RDY);
+
+	err =  mlx5_core_modify_rmp(dev, in, MLX5_ST_SZ_BYTES(modify_rmp_in));
+
+	kvfree(in);
+
+	return err;
+}
+
+int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			  u32 *xsrqn)
+{
+	u32 out[MLX5_ST_SZ_DW(create_xrc_srq_out)];
+	int err;
+
+	MLX5_SET(create_xrc_srq_in, in, opcode,     MLX5_CMD_OP_CREATE_XRC_SRQ);
+
+	memset(out, 0, sizeof(out));
+	err = mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+	if (!err)
+		*xsrqn = MLX5_GET(create_xrc_srq_out, out, xrc_srqn);
+
+	return err;
+}
+
+int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 xsrqn)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)];
+	u32 out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(destroy_xrc_srq_in, in, opcode,   MLX5_CMD_OP_DESTROY_XRC_SRQ);
+	MLX5_SET(destroy_xrc_srq_in, in, xrc_srqn, xsrqn);
+
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					  sizeof(out));
+}
+
+int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u32 *out)
+{
+	u32 in[MLX5_ST_SZ_DW(query_xrc_srq_in)];
+	void *srqc;
+	void *xrc_srqc;
+	int err;
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(query_xrc_srq_in, in, opcode,   MLX5_CMD_OP_QUERY_XRC_SRQ);
+	MLX5_SET(query_xrc_srq_in, in, xrc_srqn, xsrqn);
+
+	err =  mlx5_cmd_exec_check_status(dev, in, sizeof(in),
+					  out,
+					  MLX5_ST_SZ_BYTES(query_xrc_srq_out));
+	if (!err) {
+		xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, out,
+					xrc_srq_context_entry);
+		srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry);
+		memcpy(srqc, xrc_srqc, MLX5_ST_SZ_BYTES(srqc));
+	}
+
+	return err;
+}
+
+int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u16 lwm)
+{
+	u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)];
+	u32 out[MLX5_ST_SZ_DW(arm_xrc_srq_out)];
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(arm_xrc_srq_in, in, opcode,   MLX5_CMD_OP_ARM_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, in, xrc_srqn, xsrqn);
+	MLX5_SET(arm_xrc_srq_in, in, lwm,      lwm);
+	MLX5_SET(arm_xrc_srq_in, in, op_mod,
+		 MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
+
+	return  mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
+					   sizeof(out));
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
index b71d77f61a1d..90322c11361f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
@@ -47,5 +47,16 @@ void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn);
 int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
 			 u32 *tisn);
 void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn);
+int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			 u32 *rmpn);
+int mlx5_core_modify_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen);
+int mlx5_core_destroy_rmp(struct mlx5_core_dev *dev, u32 rmpn);
+int mlx5_core_query_rmp(struct mlx5_core_dev *dev, u32 rmpn, u32 *out);
+int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
+int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen,
+			  u32 *rmpn);
+int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 rmpn);
+int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u32 *out);
+int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
 
 #endif /* __TRANSOBJ_H__ */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7fa26f03acc1..ba9f212c94bb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -339,6 +339,8 @@ struct mlx5_core_mr {
 
 enum mlx5_res_type {
 	MLX5_RES_QP,
+	MLX5_RES_SRQ,
+	MLX5_RES_XSRQ,
 };
 
 struct mlx5_core_rsc_common {
@@ -348,6 +350,7 @@ struct mlx5_core_rsc_common {
 };
 
 struct mlx5_core_srq {
+	struct mlx5_core_rsc_common	common; /* must be first */
 	u32		srqn;
 	int		max;
 	int		max_gs;
@@ -640,7 +643,8 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 				 struct mlx5_cmd_mailbox *head);
 int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
-			 struct mlx5_create_srq_mbox_in *in, int inlen);
+			 struct mlx5_create_srq_mbox_in *in, int inlen,
+			 int is_xrc);
 int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq);
 int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 			struct mlx5_query_srq_mbox_out *out);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index b27e9f6e090a..dbe2b32c0539 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -2022,12 +2022,9 @@ struct mlx5_ifc_srqc_bits {
 
 	u8         reserved_9[0x40];
 
-	u8         db_record_addr_h[0x20];
-
-	u8         db_record_addr_l[0x1e];
-	u8         reserved_10[0x2];
+	u8         dbr_addr[0x40];
 
-	u8         reserved_11[0x80];
+	u8         reserved_10[0x80];
 };
 
 enum {
@@ -4167,6 +4164,13 @@ struct mlx5_ifc_modify_rmp_out_bits {
 	u8         reserved_1[0x40];
 };
 
+struct mlx5_ifc_rmp_bitmask_bits {
+	u8	   reserved[0x20];
+
+	u8         reserved1[0x1f];
+	u8         lwm[0x1];
+};
+
 struct mlx5_ifc_modify_rmp_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_0[0x10];
@@ -4180,7 +4184,7 @@ struct mlx5_ifc_modify_rmp_in_bits {
 
 	u8         reserved_3[0x20];
 
-	u8         modify_bitmask[0x40];
+	struct mlx5_ifc_rmp_bitmask_bits bitmask;
 
 	u8         reserved_4[0x40];
 
-- 
cgit v1.2.3


From d18a9470f89727f870db944a36223bf1bb15bdc1 Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Thu, 4 Jun 2015 19:30:40 +0300
Subject: net/mlx5_core: Make the vport helpers available for the IB driver too

Move the vport header file to be under include/linux/mlx5, such that
the mlx5 IB can use it as well.

Also add nic_ prefix to the vport NIC commands to differeniate between
HCA vport commands and NIC vport commands.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/vport.c   |  6 ++--
 drivers/net/ethernet/mellanox/mlx5/core/vport.h   | 41 -----------------------
 include/linux/mlx5/vport.h                        | 41 +++++++++++++++++++++++
 5 files changed, 47 insertions(+), 45 deletions(-)
 delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/vport.h
 create mode 100644 include/linux/mlx5/vport.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index cbb3c7cb53f7..e9edb7210de1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -35,7 +35,7 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/qp.h>
 #include <linux/mlx5/cq.h>
-#include "vport.h"
+#include <linux/mlx5/vport.h>
 #include "wq.h"
 #include "transobj.h"
 #include "mlx5_core.h"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 1b592913d4d7..edba09cca600 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1715,7 +1715,7 @@ static void mlx5e_set_netdev_dev_addr(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 
-	mlx5_query_vport_mac_address(priv->mdev, netdev->dev_addr);
+	mlx5_query_nic_vport_mac_address(priv->mdev, netdev->dev_addr);
 }
 
 static void mlx5e_build_netdev(struct net_device *netdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index ba374b9a6c87..32c4e03f6d3c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -33,7 +33,7 @@
 #include <linux/export.h>
 #include <linux/etherdevice.h>
 #include <linux/mlx5/driver.h>
-#include "vport.h"
+#include <linux/mlx5/vport.h>
 #include "mlx5_core.h"
 
 u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod)
@@ -55,8 +55,9 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod)
 
 	return MLX5_GET(query_vport_state_out, out, state);
 }
+EXPORT_SYMBOL(mlx5_query_vport_state);
 
-void mlx5_query_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr)
+void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr)
 {
 	u32  in[MLX5_ST_SZ_DW(query_nic_vport_context_in)];
 	u32 *out;
@@ -82,3 +83,4 @@ void mlx5_query_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr)
 
 	kvfree(out);
 }
+EXPORT_SYMBOL(mlx5_query_nic_vport_mac_address);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.h b/drivers/net/ethernet/mellanox/mlx5/core/vport.h
deleted file mode 100644
index c05ca2c3419d..000000000000
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __MLX5_VPORT_H__
-#define __MLX5_VPORT_H__
-
-#include <linux/mlx5/driver.h>
-
-u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod);
-void mlx5_query_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr);
-
-#endif /* __MLX5_VPORT_H__ */
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
new file mode 100644
index 000000000000..99d0e9f85432
--- /dev/null
+++ b/include/linux/mlx5/vport.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_VPORT_H__
+#define __MLX5_VPORT_H__
+
+#include <linux/mlx5/driver.h>
+
+u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod);
+void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr);
+
+#endif /* __MLX5_VPORT_H__ */
-- 
cgit v1.2.3


From 707c4602cda6624940761b66a4119f1909492385 Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Thu, 4 Jun 2015 19:30:41 +0300
Subject: net/mlx5_core: Add new query HCA vport commands

Added the implementation for the following commands:

1. QUERY_HCA_VPORT_GID
2. QUERY_HCA_VPORT_PKEY
3. QUERY_HCA_VPORT_CONTEXT

They will be needed when we move to work with ISSI > 0 in the IB driver too.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c   |  10 +-
 drivers/net/ethernet/mellanox/mlx5/core/vport.c  | 259 +++++++++++++++++++++++
 include/linux/mlx5/device.h                      |  13 ++
 include/linux/mlx5/driver.h                      |  45 ++++
 include/linux/mlx5/mlx5_ifc.h                    |  23 +-
 include/linux/mlx5/vport.h                       |  14 ++
 7 files changed, 349 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 07540f732df1..26a68b8af2c5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -2,7 +2,7 @@ obj-$(CONFIG_MLX5_CORE)		+= mlx5_core.o
 
 mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o   \
-		mad.o transobj.o
-mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o vport.o \
+		mad.o transobj.o vport.o
+mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o flow_table.o \
 		en_main.o en_flow_table.o en_ethtool.o en_tx.o en_rx.o \
 		en_txrx.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 11c7216a2517..58354122dfe4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -284,14 +284,6 @@ static u16 to_fw_pkey_sz(u32 size)
 	}
 }
 
-static u16 to_sw_pkey_sz(int pkey_sz)
-{
-	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
-		return 0;
-
-	return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz;
-}
-
 int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type,
 		       enum mlx5_cap_mode cap_mode)
 {
@@ -386,7 +378,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 	       MLX5_ST_SZ_BYTES(cmd_hca_cap));
 
 	mlx5_core_dbg(dev, "Current Pkey table size %d Setting new size %d\n",
-		      to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)),
+		      mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size)),
 		      128);
 	/* we limit the size of the pkey table to 128 entries for now */
 	MLX5_SET(cmd_hca_cap, set_hca_cap, pkey_table_size,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 32c4e03f6d3c..20150ffa8b16 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -84,3 +84,262 @@ void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr)
 	kvfree(out);
 }
 EXPORT_SYMBOL(mlx5_query_nic_vport_mac_address);
+
+int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport,
+			     u8 port_num, u16  vf_num, u16 gid_index,
+			     union ib_gid *gid)
+{
+	int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_in);
+	int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_gid_out);
+	int is_group_manager;
+	void *out = NULL;
+	void *in = NULL;
+	union ib_gid *tmp;
+	int tbsz;
+	int nout;
+	int err;
+
+	is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager);
+	tbsz = mlx5_get_gid_table_len(MLX5_CAP_GEN(dev, gid_table_size));
+	mlx5_core_dbg(dev, "vf_num %d, index %d, gid_table_size %d\n",
+		      vf_num, gid_index, tbsz);
+
+	if (gid_index > tbsz && gid_index != 0xffff)
+		return -EINVAL;
+
+	if (gid_index == 0xffff)
+		nout = tbsz;
+	else
+		nout = 1;
+
+	out_sz += nout * sizeof(*gid);
+
+	in = kzalloc(in_sz, GFP_KERNEL);
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(query_hca_vport_gid_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_GID);
+	if (other_vport) {
+		if (is_group_manager) {
+			MLX5_SET(query_hca_vport_gid_in, in, vport_number, vf_num);
+			MLX5_SET(query_hca_vport_gid_in, in, other_vport, 1);
+		} else {
+			err = -EPERM;
+			goto out;
+		}
+	}
+	MLX5_SET(query_hca_vport_gid_in, in, gid_index, gid_index);
+
+	if (MLX5_CAP_GEN(dev, num_ports) == 2)
+		MLX5_SET(query_hca_vport_gid_in, in, port_num, port_num);
+
+	err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz);
+	if (err)
+		goto out;
+
+	err = mlx5_cmd_status_to_err_v2(out);
+	if (err)
+		goto out;
+
+	tmp = out + MLX5_ST_SZ_BYTES(query_hca_vport_gid_out);
+	gid->global.subnet_prefix = tmp->global.subnet_prefix;
+	gid->global.interface_id = tmp->global.interface_id;
+
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_gid);
+
+int mlx5_query_hca_vport_pkey(struct mlx5_core_dev *dev, u8 other_vport,
+			      u8 port_num, u16 vf_num, u16 pkey_index,
+			      u16 *pkey)
+{
+	int in_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_in);
+	int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_pkey_out);
+	int is_group_manager;
+	void *out = NULL;
+	void *in = NULL;
+	void *pkarr;
+	int nout;
+	int tbsz;
+	int err;
+	int i;
+
+	is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager);
+
+	tbsz = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(dev, pkey_table_size));
+	if (pkey_index > tbsz && pkey_index != 0xffff)
+		return -EINVAL;
+
+	if (pkey_index == 0xffff)
+		nout = tbsz;
+	else
+		nout = 1;
+
+	out_sz += nout * MLX5_ST_SZ_BYTES(pkey);
+
+	in = kzalloc(in_sz, GFP_KERNEL);
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!in || !out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	MLX5_SET(query_hca_vport_pkey_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY);
+	if (other_vport) {
+		if (is_group_manager) {
+			MLX5_SET(query_hca_vport_pkey_in, in, vport_number, vf_num);
+			MLX5_SET(query_hca_vport_pkey_in, in, other_vport, 1);
+		} else {
+			err = -EPERM;
+			goto out;
+		}
+	}
+	MLX5_SET(query_hca_vport_pkey_in, in, pkey_index, pkey_index);
+
+	if (MLX5_CAP_GEN(dev, num_ports) == 2)
+		MLX5_SET(query_hca_vport_pkey_in, in, port_num, port_num);
+
+	err = mlx5_cmd_exec(dev, in, in_sz, out, out_sz);
+	if (err)
+		goto out;
+
+	err = mlx5_cmd_status_to_err_v2(out);
+	if (err)
+		goto out;
+
+	pkarr = MLX5_ADDR_OF(query_hca_vport_pkey_out, out, pkey);
+	for (i = 0; i < nout; i++, pkey++, pkarr += MLX5_ST_SZ_BYTES(pkey))
+		*pkey = MLX5_GET_PR(pkey, pkarr, pkey);
+
+out:
+	kfree(in);
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_pkey);
+
+int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev,
+				 u8 other_vport, u8 port_num,
+				 u16 vf_num,
+				 struct mlx5_hca_vport_context *rep)
+{
+	int out_sz = MLX5_ST_SZ_BYTES(query_hca_vport_context_out);
+	int in[MLX5_ST_SZ_DW(query_hca_vport_context_in)];
+	int is_group_manager;
+	void *out;
+	void *ctx;
+	int err;
+
+	is_group_manager = MLX5_CAP_GEN(dev, vport_group_manager);
+
+	memset(in, 0, sizeof(in));
+	out = kzalloc(out_sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(query_hca_vport_context_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT);
+
+	if (other_vport) {
+		if (is_group_manager) {
+			MLX5_SET(query_hca_vport_context_in, in, other_vport, 1);
+			MLX5_SET(query_hca_vport_context_in, in, vport_number, vf_num);
+		} else {
+			err = -EPERM;
+			goto ex;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev, num_ports) == 2)
+		MLX5_SET(query_hca_vport_context_in, in, port_num, port_num);
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out,  out_sz);
+	if (err)
+		goto ex;
+	err = mlx5_cmd_status_to_err_v2(out);
+	if (err)
+		goto ex;
+
+	ctx = MLX5_ADDR_OF(query_hca_vport_context_out, out, hca_vport_context);
+	rep->field_select = MLX5_GET_PR(hca_vport_context, ctx, field_select);
+	rep->sm_virt_aware = MLX5_GET_PR(hca_vport_context, ctx, sm_virt_aware);
+	rep->has_smi = MLX5_GET_PR(hca_vport_context, ctx, has_smi);
+	rep->has_raw = MLX5_GET_PR(hca_vport_context, ctx, has_raw);
+	rep->policy = MLX5_GET_PR(hca_vport_context, ctx, vport_state_policy);
+	rep->phys_state = MLX5_GET_PR(hca_vport_context, ctx,
+				      port_physical_state);
+	rep->vport_state = MLX5_GET_PR(hca_vport_context, ctx, vport_state);
+	rep->port_physical_state = MLX5_GET_PR(hca_vport_context, ctx,
+					       port_physical_state);
+	rep->port_guid = MLX5_GET64_PR(hca_vport_context, ctx, port_guid);
+	rep->node_guid = MLX5_GET64_PR(hca_vport_context, ctx, node_guid);
+	rep->cap_mask1 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask1);
+	rep->cap_mask1_perm = MLX5_GET_PR(hca_vport_context, ctx,
+					  cap_mask1_field_select);
+	rep->cap_mask2 = MLX5_GET_PR(hca_vport_context, ctx, cap_mask2);
+	rep->cap_mask2_perm = MLX5_GET_PR(hca_vport_context, ctx,
+					  cap_mask2_field_select);
+	rep->lid = MLX5_GET_PR(hca_vport_context, ctx, lid);
+	rep->init_type_reply = MLX5_GET_PR(hca_vport_context, ctx,
+					   init_type_reply);
+	rep->lmc = MLX5_GET_PR(hca_vport_context, ctx, lmc);
+	rep->subnet_timeout = MLX5_GET_PR(hca_vport_context, ctx,
+					  subnet_timeout);
+	rep->sm_lid = MLX5_GET_PR(hca_vport_context, ctx, sm_lid);
+	rep->sm_sl = MLX5_GET_PR(hca_vport_context, ctx, sm_sl);
+	rep->qkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx,
+						  qkey_violation_counter);
+	rep->pkey_violation_counter = MLX5_GET_PR(hca_vport_context, ctx,
+						  pkey_violation_counter);
+	rep->grh_required = MLX5_GET_PR(hca_vport_context, ctx, grh_required);
+	rep->sys_image_guid = MLX5_GET64_PR(hca_vport_context, ctx,
+					    system_image_guid);
+
+ex:
+	kfree(out);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_context);
+
+int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev,
+					   __be64 *sys_image_guid)
+{
+	struct mlx5_hca_vport_context *rep;
+	int err;
+
+	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+	if (!rep)
+		return -ENOMEM;
+
+	err = mlx5_query_hca_vport_context(dev, 0, 1, 0, rep);
+	if (!err)
+		*sys_image_guid = rep->sys_image_guid;
+
+	kfree(rep);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_system_image_guid);
+
+int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev,
+				   u64 *node_guid)
+{
+	struct mlx5_hca_vport_context *rep;
+	int err;
+
+	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+	if (!rep)
+		return -ENOMEM;
+
+	err = mlx5_query_hca_vport_context(dev, 0, 1, 0, rep);
+	if (!err)
+		*node_guid = rep->node_guid;
+
+	kfree(rep);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_node_guid);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index b288c538347a..b2c43508a737 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -99,6 +99,12 @@ __mlx5_mask(typ, fld))
 
 #define MLX5_GET64(typ, p, fld) be64_to_cpu(*((__be64 *)(p) + __mlx5_64_off(typ, fld)))
 
+#define MLX5_GET64_PR(typ, p, fld) ({ \
+	u64 ___t = MLX5_GET64(typ, p, fld); \
+	pr_debug(#fld " = 0x%llx\n", ___t); \
+	___t; \
+})
+
 enum {
 	MLX5_MAX_COMMANDS		= 32,
 	MLX5_CMD_DATA_BLOCK_SIZE	= 512,
@@ -1172,4 +1178,11 @@ enum {
 	MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR	= 0x40,
 };
 
+static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
+{
+	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
+		return 0;
+	return MLX5_MIN_PKEY_TABLE_SIZE << pkey_sz;
+}
+
 #endif /* MLX5_DEVICE_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ba9f212c94bb..8ab8b8af5c32 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -553,6 +553,41 @@ struct mlx5_pas {
 	u8	log_sz;
 };
 
+enum port_state_policy {
+	MLX5_AAA_000
+};
+
+enum phy_port_state {
+	MLX5_AAA_111
+};
+
+struct mlx5_hca_vport_context {
+	u32			field_select;
+	bool			sm_virt_aware;
+	bool			has_smi;
+	bool			has_raw;
+	enum port_state_policy	policy;
+	enum phy_port_state	phys_state;
+	enum ib_port_state	vport_state;
+	u8			port_physical_state;
+	u64			sys_image_guid;
+	u64			port_guid;
+	u64			node_guid;
+	u32			cap_mask1;
+	u32			cap_mask1_perm;
+	u32			cap_mask2;
+	u32			cap_mask2_perm;
+	u16			lid;
+	u8			init_type_reply; /* bitmask: see ib spec 14.2.5.6 InitTypeReply */
+	u8			lmc;
+	u8			subnet_timeout;
+	u16			sm_lid;
+	u8			sm_sl;
+	u16			qkey_violation_counter;
+	u16			pkey_violation_counter;
+	bool			grh_required;
+};
+
 static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset)
 {
 		return buf->direct.buf + offset;
@@ -792,4 +827,14 @@ struct mlx5_profile {
 	} mr_cache[MAX_MR_CACHE_ENTRIES];
 };
 
+static inline int mlx5_get_gid_table_len(u16 param)
+{
+	if (param > 4) {
+		pr_warn("gid table length is zero\n");
+		return 0;
+	}
+
+	return 8 * (1 << param);
+}
+
 #endif /* MLX5_DRIVER_H */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index dbe2b32c0539..f06d054ad021 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -2221,12 +2221,15 @@ struct mlx5_ifc_hca_vport_context_bits {
 	u8         has_smi[0x1];
 	u8         has_raw[0x1];
 	u8         grh_required[0x1];
-	u8         reserved_1[0x10];
-	u8         port_state_policy[0x4];
-	u8         phy_port_state[0x4];
+	u8         reserved_1[0xc];
+	u8         port_physical_state[0x4];
+	u8         vport_state_policy[0x4];
+	u8         port_state[0x4];
 	u8         vport_state[0x4];
 
-	u8         reserved_2[0x60];
+	u8         reserved_2[0x20];
+
+	u8         system_image_guid[0x40];
 
 	u8         port_guid[0x40];
 
@@ -3490,7 +3493,8 @@ struct mlx5_ifc_query_hca_vport_pkey_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_2[0xf];
+	u8         reserved_2[0xb];
+	u8         port_num[0x4];
 	u8         vport_number[0x10];
 
 	u8         reserved_3[0x10];
@@ -3519,7 +3523,8 @@ struct mlx5_ifc_query_hca_vport_gid_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_2[0xf];
+	u8         reserved_2[0xb];
+	u8         port_num[0x4];
 	u8         vport_number[0x10];
 
 	u8         reserved_3[0x10];
@@ -3545,7 +3550,8 @@ struct mlx5_ifc_query_hca_vport_context_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_2[0xf];
+	u8         reserved_2[0xb];
+	u8         port_num[0x4];
 	u8         vport_number[0x10];
 
 	u8         reserved_3[0x20];
@@ -4243,7 +4249,8 @@ struct mlx5_ifc_modify_hca_vport_context_in_bits {
 	u8         op_mod[0x10];
 
 	u8         other_vport[0x1];
-	u8         reserved_2[0xf];
+	u8         reserved_2[0xb];
+	u8         port_num[0x4];
 	u8         vport_number[0x10];
 
 	u8         reserved_3[0x20];
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 99d0e9f85432..67882a834efb 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -37,5 +37,19 @@
 
 u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod);
 void mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u8 *addr);
+int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport,
+			     u8 port_num, u16  vf_num, u16 gid_index,
+			     union ib_gid *gid);
+int mlx5_query_hca_vport_pkey(struct mlx5_core_dev *dev, u8 other_vport,
+			      u8 port_num, u16 vf_num, u16 pkey_index,
+			      u16 *pkey);
+int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev,
+				 u8 other_vport, u8 port_num,
+				 u16 vf_num,
+				 struct mlx5_hca_vport_context *rep);
+int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev,
+					   __be64 *sys_image_guid);
+int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev,
+				   u64 *node_guid);
 
 #endif /* __MLX5_VPORT_H__ */
-- 
cgit v1.2.3


From 211e6c80e5a68ef39a81484583e8efbf9774627d Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Thu, 4 Jun 2015 19:30:42 +0300
Subject: net/mlx5_core: Get vendor-id using the query adapter command

Add two wrapper functions to the query adapter command:

1. mlx5_query_board_id -- replaces the old mlx5_cmd_query_adapter.

2. mlx5_core_query_vendor_id -- retrieves the vendor_id from the
   query_adapter command.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c       | 59 ++++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  4 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  2 +-
 include/linux/mlx5/driver.h                        |  1 +
 include/linux/mlx5/mlx5_ifc.h                      |  7 ++-
 5 files changed, 53 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 801ccadd709a..ba87442ef776 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -35,34 +35,63 @@
 #include <linux/module.h>
 #include "mlx5_core.h"
 
-int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev)
+int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev, u32 *out, int outlen)
 {
-	struct mlx5_cmd_query_adapter_mbox_out *out;
-	struct mlx5_cmd_query_adapter_mbox_in in;
+	u32 in[MLX5_ST_SZ_DW(query_adapter_in)];
+
+	memset(in, 0, sizeof(in));
+
+	MLX5_SET(query_adapter_in, in, opcode, MLX5_CMD_OP_QUERY_ADAPTER);
+
+	return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen);
+}
+
+int mlx5_query_board_id(struct mlx5_core_dev *dev)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_adapter_out);
 	int err;
 
-	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	out = kzalloc(outlen, GFP_KERNEL);
 	if (!out)
 		return -ENOMEM;
 
-	memset(&in, 0, sizeof(in));
-	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_ADAPTER);
-	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
+	err = mlx5_cmd_query_adapter(dev, out, outlen);
 	if (err)
-		goto out_out;
+		goto out;
 
-	if (out->hdr.status) {
-		err = mlx5_cmd_status_to_err(&out->hdr);
-		goto out_out;
-	}
+	memcpy(dev->board_id,
+	       MLX5_ADDR_OF(query_adapter_out, out,
+			    query_adapter_struct.vsd_contd_psid),
+	       MLX5_FLD_SZ_BYTES(query_adapter_out,
+				 query_adapter_struct.vsd_contd_psid));
 
-	memcpy(dev->board_id, out->vsd_psid, sizeof(out->vsd_psid));
-
-out_out:
+out:
 	kfree(out);
+	return err;
+}
+
+int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id)
+{
+	u32 *out;
+	int outlen = MLX5_ST_SZ_BYTES(query_adapter_out);
+	int err;
+
+	out = kzalloc(outlen, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
 
+	err = mlx5_cmd_query_adapter(mdev, out, outlen);
+	if (err)
+		goto out;
+
+	*vendor_id = MLX5_GET(query_adapter_out, out,
+			      query_adapter_struct.ieee_vendor_id);
+out:
+	kfree(out);
 	return err;
 }
+EXPORT_SYMBOL(mlx5_core_query_vendor_id);
 
 int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 58354122dfe4..afad529838de 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -768,9 +768,9 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 		goto err_stop_poll;
 	}
 
-	err = mlx5_cmd_query_adapter(dev);
+	err = mlx5_query_board_id(dev);
 	if (err) {
-		dev_err(&pdev->dev, "query adapter failed\n");
+		dev_err(&pdev->dev, "query board id failed\n");
 		goto err_stop_poll;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 6983c1047255..fc88ecaecb4b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -78,7 +78,7 @@ static inline int mlx5_cmd_exec_check_status(struct mlx5_core_dev *dev, u32 *in,
 }
 
 int mlx5_query_hca_caps(struct mlx5_core_dev *dev);
-int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev);
+int mlx5_query_board_id(struct mlx5_core_dev *dev);
 int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8ab8b8af5c32..b90fb9336d21 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -817,6 +817,7 @@ struct mlx5_interface {
 void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol);
 int mlx5_register_interface(struct mlx5_interface *intf);
 void mlx5_unregister_interface(struct mlx5_interface *intf);
+int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id);
 
 struct mlx5_profile {
 	u64	mask;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f06d054ad021..6d2f6fee041c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -2470,9 +2470,12 @@ union mlx5_ifc_cong_control_roce_ecn_auto_bits {
 };
 
 struct mlx5_ifc_query_adapter_param_block_bits {
-	u8         reserved_0[0xe0];
+	u8         reserved_0[0xc0];
 
-	u8         reserved_1[0x10];
+	u8         reserved_1[0x8];
+	u8         ieee_vendor_id[0x18];
+
+	u8         reserved_2[0x10];
 	u8         vsd_vendor_id[0x10];
 
 	u8         vsd[208][0x8];
-- 
cgit v1.2.3


From e760152d08da78aa160e68ac90bf8f3f10aff462 Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Thu, 4 Jun 2015 19:30:43 +0300
Subject: net/mlx5_core: Use port number in the query port mtu helpers

Extend the function prototypes for max and operational mtu to take the
local port number. In the Ethernet driver is this hard coded to one,
since ConnectX4 Ethernet devices are always function-per-port.
The IB driver also serves older devices (ConnectIB) which isn't such,
and hence the part can vary.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/port.c    | 15 +++++++++------
 include/linux/mlx5/driver.h                       |  6 ++++--
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index edba09cca600..7348c5173aa9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1386,7 +1386,7 @@ int mlx5e_open_locked(struct net_device *netdev)
 		return err;
 	}
 
-	err = mlx5_query_port_oper_mtu(mdev, &actual_mtu);
+	err = mlx5_query_port_oper_mtu(mdev, &actual_mtu, 1);
 	if (err) {
 		netdev_err(netdev, "%s: mlx5_query_port_oper_mtu failed %d\n",
 			   __func__, err);
@@ -1614,7 +1614,7 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 	int max_mtu;
 	int err = 0;
 
-	err = mlx5_query_port_max_mtu(mdev, &max_mtu);
+	err = mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 7d3d0f9f328d..d9498aae5ab5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -213,7 +213,8 @@ int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status)
 }
 
 static int mlx5_query_port_mtu(struct mlx5_core_dev *dev,
-			       int *admin_mtu, int *max_mtu, int *oper_mtu)
+			       int *admin_mtu, int *max_mtu, int *oper_mtu,
+			       u8 local_port)
 {
 	u32 in[MLX5_ST_SZ_DW(pmtu_reg)];
 	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
@@ -221,7 +222,7 @@ static int mlx5_query_port_mtu(struct mlx5_core_dev *dev,
 
 	memset(in, 0, sizeof(in));
 
-	MLX5_SET(pmtu_reg, in, local_port, 1);
+	MLX5_SET(pmtu_reg, in, local_port, local_port);
 
 	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
 				   sizeof(out), MLX5_REG_PMTU, 0, 0);
@@ -253,14 +254,16 @@ int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu)
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_mtu);
 
-int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu)
+int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu,
+			    u8 local_port)
 {
-	return mlx5_query_port_mtu(dev, NULL, max_mtu, NULL);
+	return mlx5_query_port_mtu(dev, NULL, max_mtu, NULL, local_port);
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_max_mtu);
 
-int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu)
+int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
+			     u8 local_port)
 {
-	return mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu);
+	return mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, local_port);
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b90fb9336d21..cd09784b6999 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -751,8 +751,10 @@ int mlx5_set_port_status(struct mlx5_core_dev *dev,
 int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status);
 
 int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu);
-int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu);
-int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu);
+int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu,
+			    u8 local_port);
+int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
+			     u8 local_port);
 
 int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
-- 
cgit v1.2.3


From a05bdefa4081d43f9c86c3bb693d0492a21590da Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Thu, 4 Jun 2015 19:30:44 +0300
Subject: net/mlx5_core: Use port number when querying port ptys

Until now, mlx5_query_port_ptys always queried port number one.

Added new argument in the function's prototype so we can also query
the second port. This will be needed  when thr helper will be invoked
from the IB driver on non FPP (Function-Per-Port) devices.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/port.c       | 8 ++++----
 include/linux/mlx5/driver.h                          | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index de7aec8abca1..388938482ff9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -543,7 +543,7 @@ static int mlx5e_get_settings(struct net_device *netdev,
 	u32 eth_proto_oper;
 	int err;
 
-	err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN);
+	err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1);
 
 	if (err) {
 		netdev_err(netdev, "%s: query port ptys failed: %d\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index d9498aae5ab5..cbbce40d853c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -104,13 +104,13 @@ int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps)
 EXPORT_SYMBOL_GPL(mlx5_set_port_caps);
 
 int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
-			 int ptys_size, int proto_mask)
+			 int ptys_size, int proto_mask, u8 local_port)
 {
 	u32 in[MLX5_ST_SZ_DW(ptys_reg)];
 	int err;
 
 	memset(in, 0, sizeof(in));
-	MLX5_SET(ptys_reg, in, local_port, 1);
+	MLX5_SET(ptys_reg, in, local_port, local_port);
 	MLX5_SET(ptys_reg, in, proto_mask, proto_mask);
 
 	err = mlx5_core_access_reg(dev, in, sizeof(in), ptys,
@@ -126,7 +126,7 @@ int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	int err;
 
-	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask);
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, 1);
 	if (err)
 		return err;
 
@@ -145,7 +145,7 @@ int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
 	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
 	int err;
 
-	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask);
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, 1);
 	if (err)
 		return err;
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cd09784b6999..e4b814f64014 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -739,7 +739,7 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
 
 int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
 int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
-			 int ptys_size, int proto_mask);
+			 int ptys_size, int proto_mask, u8 local_port);
 int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
 			      u32 *proto_cap, int proto_mask);
 int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
-- 
cgit v1.2.3


From a124d13ef59e09941fc0924fd7c29ae6d7cd77a3 Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Thu, 4 Jun 2015 19:30:45 +0300
Subject: net/mlx5_core: Add more query port helpers

Add the following helpers:

1. mlx5_query_port_proto_oper -- queries the port speed port mask
2. mlx5_query_port_link_width_oper - queries the port link with bitmask
3. mlx5_query_port_vl_hw_cap - queries the Virtual Lanes supported on this port

These helpers will be used from the IB driver when working in ISSI > 0 mode.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 67 ++++++++++++++++++++++++++
 include/linux/mlx5/driver.h                    |  8 +++
 2 files changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index cbbce40d853c..619d3baf19ea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -158,6 +158,42 @@ int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_proto_admin);
 
+int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev,
+				    u8 *link_width_oper, u8 local_port)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_IB, local_port);
+	if (err)
+		return err;
+
+	*link_width_oper = MLX5_GET(ptys_reg, out, ib_link_width_oper);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_link_width_oper);
+
+int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
+			       u8 *proto_oper, int proto_mask,
+			       u8 local_port)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), proto_mask, local_port);
+	if (err)
+		return err;
+
+	if (proto_mask == MLX5_PTYS_EN)
+		*proto_oper = MLX5_GET(ptys_reg, out, eth_proto_oper);
+	else
+		*proto_oper = MLX5_GET(ptys_reg, out, ib_proto_oper);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_proto_oper);
+
 int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 			int proto_mask)
 {
@@ -267,3 +303,34 @@ int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
 	return mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, local_port);
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu);
+
+static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc,
+				int pvlc_size,  u8 local_port)
+{
+	u32 in[MLX5_ST_SZ_DW(pvlc_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(ptys_reg, in, local_port, local_port);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), pvlc,
+				   pvlc_size, MLX5_REG_PVLC, 0, 0);
+
+	return err;
+}
+
+int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
+			      u8 *vl_hw_cap, u8 local_port)
+{
+	u32 out[MLX5_ST_SZ_DW(pvlc_reg)];
+	int err;
+
+	err = mlx5_query_port_pvlc(dev, out, sizeof(out), local_port);
+	if (err)
+		return err;
+
+	*vl_hw_cap = MLX5_GET(pvlc_reg, out, vl_hw_cap);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e4b814f64014..6093bde16b94 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -107,6 +107,7 @@ enum {
 	MLX5_REG_PUDE		 = 0x5009,
 	MLX5_REG_PMPE		 = 0x5010,
 	MLX5_REG_PELC		 = 0x500e,
+	MLX5_REG_PVLC		 = 0x500f,
 	MLX5_REG_PMLP		 = 0, /* TBD */
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
@@ -744,6 +745,11 @@ int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev,
 			      u32 *proto_cap, int proto_mask);
 int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev,
 				u32 *proto_admin, int proto_mask);
+int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev,
+				    u8 *link_width_oper, u8 local_port);
+int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
+			       u8 *proto_oper, int proto_mask,
+			       u8 local_port);
 int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 			int proto_mask);
 int mlx5_set_port_status(struct mlx5_core_dev *dev,
@@ -755,6 +761,8 @@ int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu,
 			    u8 local_port);
 int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
 			     u8 local_port);
+int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
+			      u8 *vl_hw_cap, u8 local_port);
 
 int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
-- 
cgit v1.2.3


From 03fbf488cece461468d3abb795f5e5f055e00040 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Thu, 4 Jun 2015 16:55:10 +0300
Subject: spi: pxa2xx: Differentiate Intel LPSS types

Intel LPSS SPI properties differ between between platforms. Now private
registers offset 0x400 or 0x800 is autodetected but there is need to
support also other offset and handle a few other differences.

Prepare for that by splitting the LPSS_SSP type into compatible hardware
types and set it now based on PCI or ACPI ID. That type will be used to set
properties that differ between current and upcoming platforms.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-pxa2xx-pci.c |  8 ++++----
 drivers/spi/spi-pxa2xx.c     | 44 ++++++++++++++++++++++++++++++--------------
 include/linux/pxa2xx_ssp.h   |  3 ++-
 3 files changed, 36 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c
index fa7399e84bbb..3cfd4357489a 100644
--- a/drivers/spi/spi-pxa2xx-pci.c
+++ b/drivers/spi/spi-pxa2xx-pci.c
@@ -62,7 +62,7 @@ static struct pxa_spi_info spi_info_configs[] = {
 		.max_clk_rate = 3686400,
 	},
 	[PORT_BYT] = {
-		.type = LPSS_SSP,
+		.type = LPSS_BYT_SSP,
 		.port_id = 0,
 		.num_chipselect = 1,
 		.max_clk_rate = 50000000,
@@ -70,7 +70,7 @@ static struct pxa_spi_info spi_info_configs[] = {
 		.rx_param = &byt_rx_param,
 	},
 	[PORT_BSW0] = {
-		.type = LPSS_SSP,
+		.type = LPSS_BYT_SSP,
 		.port_id = 0,
 		.num_chipselect = 1,
 		.max_clk_rate = 50000000,
@@ -78,7 +78,7 @@ static struct pxa_spi_info spi_info_configs[] = {
 		.rx_param = &bsw0_rx_param,
 	},
 	[PORT_BSW1] = {
-		.type = LPSS_SSP,
+		.type = LPSS_BYT_SSP,
 		.port_id = 1,
 		.num_chipselect = 1,
 		.max_clk_rate = 50000000,
@@ -86,7 +86,7 @@ static struct pxa_spi_info spi_info_configs[] = {
 		.rx_param = &bsw1_rx_param,
 	},
 	[PORT_BSW2] = {
-		.type = LPSS_SSP,
+		.type = LPSS_BYT_SSP,
 		.port_id = 2,
 		.num_chipselect = 1,
 		.max_clk_rate = 50000000,
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index e3223ac75a7c..a85b7496a3cd 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -74,7 +74,13 @@ MODULE_ALIAS("platform:pxa2xx-spi");
 
 static bool is_lpss_ssp(const struct driver_data *drv_data)
 {
-	return drv_data->ssp_type == LPSS_SSP;
+	switch (drv_data->ssp_type) {
+	case LPSS_LPT_SSP:
+	case LPSS_BYT_SSP:
+		return true;
+	default:
+		return false;
+	}
 }
 
 static bool is_quark_x1000_ssp(const struct driver_data *drv_data)
@@ -1085,7 +1091,8 @@ static int setup(struct spi_device *spi)
 		tx_hi_thres = 0;
 		rx_thres = RX_THRESH_QUARK_X1000_DFLT;
 		break;
-	case LPSS_SSP:
+	case LPSS_LPT_SSP:
+	case LPSS_BYT_SSP:
 		tx_thres = LPSS_TX_LOTHRESH_DFLT;
 		tx_hi_thres = LPSS_TX_HITHRESH_DFLT;
 		rx_thres = LPSS_RX_THRESH_DFLT;
@@ -1242,6 +1249,18 @@ static void cleanup(struct spi_device *spi)
 }
 
 #ifdef CONFIG_ACPI
+
+static struct acpi_device_id pxa2xx_spi_acpi_match[] = {
+	{ "INT33C0", LPSS_LPT_SSP },
+	{ "INT33C1", LPSS_LPT_SSP },
+	{ "INT3430", LPSS_LPT_SSP },
+	{ "INT3431", LPSS_LPT_SSP },
+	{ "80860F0E", LPSS_BYT_SSP },
+	{ "8086228E", LPSS_BYT_SSP },
+	{ },
+};
+MODULE_DEVICE_TABLE(acpi, pxa2xx_spi_acpi_match);
+
 static struct pxa2xx_spi_master *
 pxa2xx_spi_acpi_get_pdata(struct platform_device *pdev)
 {
@@ -1249,12 +1268,19 @@ pxa2xx_spi_acpi_get_pdata(struct platform_device *pdev)
 	struct acpi_device *adev;
 	struct ssp_device *ssp;
 	struct resource *res;
-	int devid;
+	const struct acpi_device_id *id;
+	int devid, type;
 
 	if (!ACPI_HANDLE(&pdev->dev) ||
 	    acpi_bus_get_device(ACPI_HANDLE(&pdev->dev), &adev))
 		return NULL;
 
+	id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev);
+	if (id)
+		type = (int)id->driver_data;
+	else
+		return NULL;
+
 	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
 	if (!pdata)
 		return NULL;
@@ -1272,7 +1298,7 @@ pxa2xx_spi_acpi_get_pdata(struct platform_device *pdev)
 
 	ssp->clk = devm_clk_get(&pdev->dev, NULL);
 	ssp->irq = platform_get_irq(pdev, 0);
-	ssp->type = LPSS_SSP;
+	ssp->type = type;
 	ssp->pdev = pdev;
 
 	ssp->port_id = -1;
@@ -1285,16 +1311,6 @@ pxa2xx_spi_acpi_get_pdata(struct platform_device *pdev)
 	return pdata;
 }
 
-static struct acpi_device_id pxa2xx_spi_acpi_match[] = {
-	{ "INT33C0", 0 },
-	{ "INT33C1", 0 },
-	{ "INT3430", 0 },
-	{ "INT3431", 0 },
-	{ "80860F0E", 0 },
-	{ "8086228E", 0 },
-	{ },
-};
-MODULE_DEVICE_TABLE(acpi, pxa2xx_spi_acpi_match);
 #else
 static inline struct pxa2xx_spi_master *
 pxa2xx_spi_acpi_get_pdata(struct platform_device *pdev)
diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index dab545bb66b3..95a4b3bd7a5c 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -194,8 +194,9 @@ enum pxa_ssp_type {
 	PXA168_SSP,
 	PXA910_SSP,
 	CE4100_SSP,
-	LPSS_SSP,
 	QUARK_X1000_SSP,
+	LPSS_LPT_SSP,
+	LPSS_BYT_SSP,
 };
 
 struct ssp_device {
-- 
cgit v1.2.3


From dccf7369652f3934456345aab6a92fa905177886 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Thu, 4 Jun 2015 16:55:11 +0300
Subject: spi: pxa2xx: Prepare for new Intel LPSS SPI type

Some of the Intel LPSS SPI properties will be different in upcoming
platforms compared to existing Lynxpoint and BayTrail/Braswell. LPSS SPI
private registers will be at different offset and there will be changes in
individual registers and default FIFO thresholds too.

Add configuration for these differences and use them in runtime based on
LPSS SSP type. With this change private registers offset autodetection
becomes needless.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-pxa2xx.c   | 107 +++++++++++++++++++++++++--------------------
 include/linux/pxa2xx_ssp.h |   2 +-
 2 files changed, 60 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index a85b7496a3cd..3fec31dbf972 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -60,18 +60,51 @@ MODULE_ALIAS("platform:pxa2xx-spi");
 				| QUARK_X1000_SSCR1_TFT		\
 				| SSCR1_SPH | SSCR1_SPO | SSCR1_LBM)
 
-#define LPSS_RX_THRESH_DFLT	64
-#define LPSS_TX_LOTHRESH_DFLT	160
-#define LPSS_TX_HITHRESH_DFLT	224
-
-/* Offset from drv_data->lpss_base */
-#define GENERAL_REG		0x08
 #define GENERAL_REG_RXTO_HOLDOFF_DISABLE BIT(24)
-#define SSP_REG			0x0c
-#define SPI_CS_CONTROL		0x18
 #define SPI_CS_CONTROL_SW_MODE	BIT(0)
 #define SPI_CS_CONTROL_CS_HIGH	BIT(1)
 
+struct lpss_config {
+	/* LPSS offset from drv_data->ioaddr */
+	unsigned offset;
+	/* Register offsets from drv_data->lpss_base or -1 */
+	int reg_general;
+	int reg_ssp;
+	int reg_cs_ctrl;
+	/* FIFO thresholds */
+	u32 rx_threshold;
+	u32 tx_threshold_lo;
+	u32 tx_threshold_hi;
+};
+
+/* Keep these sorted with enum pxa_ssp_type */
+static const struct lpss_config lpss_platforms[] = {
+	{	/* LPSS_LPT_SSP */
+		.offset = 0x800,
+		.reg_general = 0x08,
+		.reg_ssp = 0x0c,
+		.reg_cs_ctrl = 0x18,
+		.rx_threshold = 64,
+		.tx_threshold_lo = 160,
+		.tx_threshold_hi = 224,
+	},
+	{	/* LPSS_BYT_SSP */
+		.offset = 0x400,
+		.reg_general = 0x08,
+		.reg_ssp = 0x0c,
+		.reg_cs_ctrl = 0x18,
+		.rx_threshold = 64,
+		.tx_threshold_lo = 160,
+		.tx_threshold_hi = 224,
+	},
+};
+
+static inline const struct lpss_config
+*lpss_get_config(const struct driver_data *drv_data)
+{
+	return &lpss_platforms[drv_data->ssp_type - LPSS_LPT_SSP];
+}
+
 static bool is_lpss_ssp(const struct driver_data *drv_data)
 {
 	switch (drv_data->ssp_type) {
@@ -198,63 +231,39 @@ static void __lpss_ssp_write_priv(struct driver_data *drv_data,
  */
 static void lpss_ssp_setup(struct driver_data *drv_data)
 {
-	unsigned offset = 0x400;
-	u32 value, orig;
-
-	/*
-	 * Perform auto-detection of the LPSS SSP private registers. They
-	 * can be either at 1k or 2k offset from the base address.
-	 */
-	orig = readl(drv_data->ioaddr + offset + SPI_CS_CONTROL);
-
-	/* Test SPI_CS_CONTROL_SW_MODE bit enabling */
-	value = orig | SPI_CS_CONTROL_SW_MODE;
-	writel(value, drv_data->ioaddr + offset + SPI_CS_CONTROL);
-	value = readl(drv_data->ioaddr + offset + SPI_CS_CONTROL);
-	if (value != (orig | SPI_CS_CONTROL_SW_MODE)) {
-		offset = 0x800;
-		goto detection_done;
-	}
-
-	orig = readl(drv_data->ioaddr + offset + SPI_CS_CONTROL);
-
-	/* Test SPI_CS_CONTROL_SW_MODE bit disabling */
-	value = orig & ~SPI_CS_CONTROL_SW_MODE;
-	writel(value, drv_data->ioaddr + offset + SPI_CS_CONTROL);
-	value = readl(drv_data->ioaddr + offset + SPI_CS_CONTROL);
-	if (value != (orig & ~SPI_CS_CONTROL_SW_MODE)) {
-		offset = 0x800;
-		goto detection_done;
-	}
+	const struct lpss_config *config;
+	u32 value;
 
-detection_done:
-	/* Now set the LPSS base */
-	drv_data->lpss_base = drv_data->ioaddr + offset;
+	config = lpss_get_config(drv_data);
+	drv_data->lpss_base = drv_data->ioaddr + config->offset;
 
 	/* Enable software chip select control */
 	value = SPI_CS_CONTROL_SW_MODE | SPI_CS_CONTROL_CS_HIGH;
-	__lpss_ssp_write_priv(drv_data, SPI_CS_CONTROL, value);
+	__lpss_ssp_write_priv(drv_data, config->reg_cs_ctrl, value);
 
 	/* Enable multiblock DMA transfers */
 	if (drv_data->master_info->enable_dma) {
-		__lpss_ssp_write_priv(drv_data, SSP_REG, 1);
+		__lpss_ssp_write_priv(drv_data, config->reg_ssp, 1);
 
-		value = __lpss_ssp_read_priv(drv_data, GENERAL_REG);
+		value = __lpss_ssp_read_priv(drv_data, config->reg_general);
 		value |= GENERAL_REG_RXTO_HOLDOFF_DISABLE;
-		__lpss_ssp_write_priv(drv_data, GENERAL_REG, value);
+		__lpss_ssp_write_priv(drv_data, config->reg_general, value);
 	}
 }
 
 static void lpss_ssp_cs_control(struct driver_data *drv_data, bool enable)
 {
+	const struct lpss_config *config;
 	u32 value;
 
-	value = __lpss_ssp_read_priv(drv_data, SPI_CS_CONTROL);
+	config = lpss_get_config(drv_data);
+
+	value = __lpss_ssp_read_priv(drv_data, config->reg_cs_ctrl);
 	if (enable)
 		value &= ~SPI_CS_CONTROL_CS_HIGH;
 	else
 		value |= SPI_CS_CONTROL_CS_HIGH;
-	__lpss_ssp_write_priv(drv_data, SPI_CS_CONTROL, value);
+	__lpss_ssp_write_priv(drv_data, config->reg_cs_ctrl, value);
 }
 
 static void cs_assert(struct driver_data *drv_data)
@@ -1081,6 +1090,7 @@ static int setup(struct spi_device *spi)
 {
 	struct pxa2xx_spi_chip *chip_info = NULL;
 	struct chip_data *chip;
+	const struct lpss_config *config;
 	struct driver_data *drv_data = spi_master_get_devdata(spi->master);
 	unsigned int clk_div;
 	uint tx_thres, tx_hi_thres, rx_thres;
@@ -1093,9 +1103,10 @@ static int setup(struct spi_device *spi)
 		break;
 	case LPSS_LPT_SSP:
 	case LPSS_BYT_SSP:
-		tx_thres = LPSS_TX_LOTHRESH_DFLT;
-		tx_hi_thres = LPSS_TX_HITHRESH_DFLT;
-		rx_thres = LPSS_RX_THRESH_DFLT;
+		config = lpss_get_config(drv_data);
+		tx_thres = config->tx_threshold_lo;
+		tx_hi_thres = config->tx_threshold_hi;
+		rx_thres = config->rx_threshold;
 		break;
 	default:
 		tx_thres = TX_THRESH_DFLT;
diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index 95a4b3bd7a5c..0485bab061fd 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -195,7 +195,7 @@ enum pxa_ssp_type {
 	PXA910_SSP,
 	CE4100_SSP,
 	QUARK_X1000_SSP,
-	LPSS_LPT_SSP,
+	LPSS_LPT_SSP, /* Keep LPSS types sorted with lpss_platforms[] */
 	LPSS_BYT_SSP,
 };
 
-- 
cgit v1.2.3


From 0f41979164a52a1ca0f0a601f90000fc18e3a396 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 1 Jun 2015 22:59:08 -0400
Subject: SUNRPC: Remove unused argument 'tk_ops' in rpc_run_bc_task

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/sched.h | 3 +--
 net/sunrpc/clnt.c            | 6 ++----
 net/sunrpc/svc.c             | 3 +--
 3 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 5f1e6bd4c316..2aa68976a482 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -205,8 +205,7 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-				const struct rpc_call_ops *ops);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
 void		rpc_put_task(struct rpc_task *);
 void		rpc_put_task_async(struct rpc_task *);
 void		rpc_exit_task(struct rpc_task *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index fa70f663eb92..f6717170480e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1031,15 +1031,13 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
  * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
  * rpc_execute against it
  * @req: RPC request
- * @tk_ops: RPC call ops
  */
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-				const struct rpc_call_ops *tk_ops)
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
 {
 	struct rpc_task *task;
 	struct xdr_buf *xbufp = &req->rq_snd_buf;
 	struct rpc_task_setup task_setup_data = {
-		.callback_ops = tk_ops,
+		.callback_ops = &rpc_default_ops,
 	};
 
 	dprintk("RPC: rpc_run_bc_task req= %p\n", req);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e144902d382e..51c8cad5e765 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1350,7 +1350,6 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 {
 	struct kvec	*argv = &rqstp->rq_arg.head[0];
 	struct kvec	*resv = &rqstp->rq_res.head[0];
-	static const struct rpc_call_ops reply_ops = { };
 	struct rpc_task *task;
 	int error;
 
@@ -1391,7 +1390,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 
 	/* Finally, send the reply synchronously */
 	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
-	task = rpc_run_bc_task(req, &reply_ops);
+	task = rpc_run_bc_task(req);
 	if (IS_ERR(task)) {
 		error = PTR_ERR(task);
 		goto out;
-- 
cgit v1.2.3


From 0d2a970d0ae55086520e1e58e572a7acd519429c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 4 Jun 2015 15:37:10 -0400
Subject: SUNRPC: Fix a backchannel race

We need to allow the server to send a new request immediately after we've
replied to the previous one. Right now, there is a window between the
send and the release of the old request in rpc_put_task(), where the
server could send us a new backchannel RPC call, and we have no
request to service it.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h   |  3 ++-
 net/sunrpc/backchannel_rqst.c | 37 ++++++++++++++++++++++++-------------
 net/sunrpc/svc.c              |  6 +++++-
 3 files changed, 31 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 8b93ef53df3c..bc9c39d6d30d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -212,7 +212,8 @@ struct rpc_xprt {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	struct svc_serv		*bc_serv;       /* The RPC service which will */
 						/* process the callback */
-	unsigned int		bc_alloc_count;	/* Total number of preallocs */
+	int			bc_alloc_count;	/* Total number of preallocs */
+	atomic_t		bc_free_slots;
 	spinlock_t		bc_pa_lock;	/* Protects the preallocated
 						 * items */
 	struct list_head	bc_pa_list;	/* List of preallocated
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 1baa41099469..9825ff0f91d6 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
 {
-	return xprt->bc_alloc_count > 0;
+	return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
 }
 
 static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+	atomic_add(n, &xprt->bc_free_slots);
 	xprt->bc_alloc_count += n;
 }
 
 static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+	atomic_sub(n, &xprt->bc_free_slots);
 	return xprt->bc_alloc_count -= n;
 }
 
@@ -232,9 +234,15 @@ static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
 	struct rpc_rqst *req = NULL;
 
 	dprintk("RPC:       allocate a backchannel request\n");
-	if (list_empty(&xprt->bc_pa_list))
+	if (atomic_read(&xprt->bc_free_slots) <= 0)
 		goto not_found;
-
+	if (list_empty(&xprt->bc_pa_list)) {
+		req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
+		if (!req)
+			goto not_found;
+		/* Note: this 'free' request adds it to xprt->bc_pa_list */
+		xprt_free_bc_request(req);
+	}
 	req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
 				rq_bc_pa_list);
 	req->rq_reply_bytes_recvd = 0;
@@ -260,11 +268,21 @@ void xprt_free_bc_request(struct rpc_rqst *req)
 
 	req->rq_connect_cookie = xprt->connect_cookie - 1;
 	smp_mb__before_atomic();
-	WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
 	clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
 	smp_mb__after_atomic();
 
-	if (!xprt_need_to_requeue(xprt)) {
+	/*
+	 * Return it to the list of preallocations so that it
+	 * may be reused by a new callback request.
+	 */
+	spin_lock_bh(&xprt->bc_pa_lock);
+	if (xprt_need_to_requeue(xprt)) {
+		list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+		xprt->bc_alloc_count++;
+		req = NULL;
+	}
+	spin_unlock_bh(&xprt->bc_pa_lock);
+	if (req != NULL) {
 		/*
 		 * The last remaining session was destroyed while this
 		 * entry was in use.  Free the entry and don't attempt
@@ -275,14 +293,6 @@ void xprt_free_bc_request(struct rpc_rqst *req)
 		xprt_free_allocation(req);
 		return;
 	}
-
-	/*
-	 * Return it to the list of preallocations so that it
-	 * may be reused by a new callback request.
-	 */
-	spin_lock_bh(&xprt->bc_pa_lock);
-	list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
-	spin_unlock_bh(&xprt->bc_pa_lock);
 }
 
 /*
@@ -326,6 +336,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
 	spin_lock(&xprt->bc_pa_lock);
 	list_del(&req->rq_bc_pa_list);
+	xprt->bc_alloc_count--;
 	spin_unlock(&xprt->bc_pa_lock);
 
 	req->rq_private_buf.len = copied;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 51c8cad5e765..b47f02f60b9c 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1351,6 +1351,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	struct kvec	*argv = &rqstp->rq_arg.head[0];
 	struct kvec	*resv = &rqstp->rq_res.head[0];
 	struct rpc_task *task;
+	int proc_error;
 	int error;
 
 	dprintk("svc: %s(%p)\n", __func__, req);
@@ -1382,7 +1383,10 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	svc_getnl(argv);	/* CALLDIR */
 
 	/* Parse and execute the bc call */
-	if (!svc_process_common(rqstp, argv, resv)) {
+	proc_error = svc_process_common(rqstp, argv, resv);
+
+	atomic_inc(&req->rq_xprt->bc_free_slots);
+	if (!proc_error) {
 		/* Processing error: drop the request */
 		xprt_free_bc_request(req);
 		return 0;
-- 
cgit v1.2.3


From 8e73485c7959fd25650761eab04db1e72ea14c23 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sun, 17 May 2015 13:58:53 +0200
Subject: KVM: add vcpu-specific functions to read/write/translate GFNs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need to hide SMRAM from guests not running in SMM.  Therefore, all
uses of kvm_read_guest* and kvm_write_guest* must be changed to use
different address spaces, depending on whether the VCPU is in system
management mode.  We need to introduce a new family of functions for
this purpose.

For now, the VCPU-based functions have the same behavior as the
existing per-VM ones, they just accept a different type for the
first argument.  Later however they will be changed to use one of many
"struct kvm_memslots" stored in struct kvm, through an architecture hook.
VM-based functions will unconditionally use the first memslots pointer.

Whenever possible, this patch introduces slot-based functions with an
__ prefix, with two wrappers for generic and vcpu-based actions.
The exceptions are kvm_read_guest and kvm_write_guest, which are copied
into the new functions kvm_vcpu_read_guest and kvm_vcpu_write_guest.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h |  24 +++++++
 virt/kvm/kvm_main.c      | 167 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 178 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b019fee6d941..ba1ea43998e4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -471,6 +471,11 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 			|| lockdep_is_held(&kvm->slots_lock));
 }
 
+static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
+{
+	return kvm_memslots(vcpu->kvm);
+}
+
 static inline struct kvm_memory_slot *
 id_to_memslot(struct kvm_memslots *slots, int id)
 {
@@ -576,6 +581,25 @@ int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
+struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
+struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);
+pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn);
+pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn);
+unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
+unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
+int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
+			     int len);
+int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
+			       unsigned long len);
+int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
+			unsigned long len);
+int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data,
+			      int offset, int len);
+int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
+			 unsigned long len);
+void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
+
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6c8e124006ad..3a121cedcc77 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1100,6 +1100,11 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
+struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
+}
+
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
@@ -1175,6 +1180,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
+unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
+
 /*
  * If writable is set to false, the hva returned by this function is only
  * allowed to be read.
@@ -1197,6 +1208,13 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
 	return gfn_to_hva_memslot_prot(slot, gfn, writable);
 }
 
+unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
+{
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+	return gfn_to_hva_memslot_prot(slot, gfn, writable);
+}
+
 static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	unsigned long start, int write, struct page **page)
 {
@@ -1412,12 +1430,24 @@ pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
+pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
+
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
 	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
+pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
+
 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 			    struct page **pages, int nr_pages)
 {
@@ -1458,6 +1488,16 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
+struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	pfn_t pfn;
+
+	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
+
+	return kvm_pfn_to_page(pfn);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
+
 void kvm_release_page_clean(struct page *page)
 {
 	WARN_ON(is_error_page(page));
@@ -1520,13 +1560,13 @@ static int next_segment(unsigned long len, int offset)
 		return len;
 }
 
-int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
-			int len)
+static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
+				 void *data, int offset, int len)
 {
 	int r;
 	unsigned long addr;
 
-	addr = gfn_to_hva_prot(kvm, gfn, NULL);
+	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
 	r = __copy_from_user(data, (void __user *)addr + offset, len);
@@ -1534,8 +1574,25 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 		return -EFAULT;
 	return 0;
 }
+
+int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
+			int len)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+	return __kvm_read_guest_page(slot, gfn, data, offset, len);
+}
 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
 
+int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
+			     int offset, int len)
+{
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+	return __kvm_read_guest_page(slot, gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
+
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -1556,15 +1613,33 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest);
 
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
-			  unsigned long len)
+int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
 {
-	int r;
-	unsigned long addr;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int seg;
 	int offset = offset_in_page(gpa);
+	int ret;
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;
+		len -= seg;
+		data += seg;
+		++gfn;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
 
-	addr = gfn_to_hva_prot(kvm, gfn, NULL);
+static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
+			           void *data, int offset, unsigned long len)
+{
+	int r;
+	unsigned long addr;
+
+	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
 	pagefault_disable();
@@ -1574,16 +1649,35 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
 		return -EFAULT;
 	return 0;
 }
-EXPORT_SYMBOL(kvm_read_guest_atomic);
 
-int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
-			 int offset, int len)
+int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
+			  unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	int offset = offset_in_page(gpa);
+
+	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
+
+int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
+			       void *data, unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	int offset = offset_in_page(gpa);
+
+	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
+
+static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
+			          const void *data, int offset, int len)
 {
 	int r;
-	struct kvm_memory_slot *memslot;
 	unsigned long addr;
 
-	memslot = gfn_to_memslot(kvm, gfn);
 	addr = gfn_to_hva_memslot(memslot, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
@@ -1593,8 +1687,25 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 	mark_page_dirty_in_slot(memslot, gfn);
 	return 0;
 }
+
+int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
+			 const void *data, int offset, int len)
+{
+	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+
+	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+}
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
 
+int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+			      const void *data, int offset, int len)
+{
+	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
+
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 		    unsigned long len)
 {
@@ -1616,6 +1727,27 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest);
 
+int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
+		         unsigned long len)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int seg;
+	int offset = offset_in_page(gpa);
+	int ret;
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;
+		len -= seg;
+		data += seg;
+		++gfn;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
+
 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 			      gpa_t gpa, unsigned long len)
 {
@@ -1750,6 +1882,15 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(mark_page_dirty);
 
+void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm_memory_slot *memslot;
+
+	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	mark_page_dirty_in_slot(memslot, gfn);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
+
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
 	if (kvm_arch_vcpu_runnable(vcpu)) {
-- 
cgit v1.2.3


From f481b069e674378758c73761827e83ab05c46b52 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Sun, 17 May 2015 17:30:37 +0200
Subject: KVM: implement multiple address spaces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Only two ioctls have to be modified; the address space id is
placed in the higher 16 bits of their slot id argument.

As of this patch, no architecture defines more than one
address space; x86 will be the first.

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt        | 12 ++++++
 arch/powerpc/include/asm/kvm_book3s_64.h |  2 +-
 include/linux/kvm_host.h                 | 26 ++++++++++--
 include/uapi/linux/kvm.h                 |  1 +
 virt/kvm/kvm_main.c                      | 70 ++++++++++++++++++++------------
 5 files changed, 79 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 2ddefd58b1aa..461956a0ee8e 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -254,6 +254,11 @@ since the last call to this ioctl.  Bit 0 is the first page in the
 memory slot.  Ensure the entire structure is cleared to avoid padding
 issues.
 
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies
+the address space for which you want to return the dirty bitmap.
+They must be less than the value that KVM_CHECK_EXTENSION returns for
+the KVM_CAP_MULTI_ADDRESS_SPACE capability.
+
 
 4.9 KVM_SET_MEMORY_ALIAS
 
@@ -924,6 +929,13 @@ slot.  When changing an existing slot, it may be moved in the guest
 physical memory space, or its flags may be modified.  It may not be
 resized.  Slots may not overlap in guest physical address space.
 
+If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot"
+specifies the address space which is being modified.  They must be
+less than the value that KVM_CHECK_EXTENSION returns for the
+KVM_CAP_MULTI_ADDRESS_SPACE capability.  Slots in separate address spaces
+are unrelated; the restriction on overlapping slots only applies within
+each address space.
+
 Memory for the region is taken starting at the address denoted by the
 field userspace_addr, which must point at user addressable memory for
 the entire memory slot size.  Any object may back this memory, including
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 3536d12eb798..2aa79c864e91 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -430,7 +430,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
  */
 static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 {
-	return rcu_dereference_raw_notrace(kvm->memslots);
+	return rcu_dereference_raw_notrace(kvm->memslots[0]);
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ba1ea43998e4..9564fd78c547 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -44,6 +44,10 @@
 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS	2
 
+#ifndef KVM_ADDRESS_SPACE_NUM
+#define KVM_ADDRESS_SPACE_NUM	1
+#endif
+
 /*
  * For the normal pfn, the highest 12 bits should be zero,
  * so we can mask bit 62 ~ bit 52  to indicate the error pfn,
@@ -331,6 +335,13 @@ struct kvm_kernel_irq_routing_entry {
 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 #endif
 
+#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+#endif
+
 /*
  * Note:
  * memslots are not sorted by id anymore, please use id_to_memslot()
@@ -349,7 +360,7 @@ struct kvm {
 	spinlock_t mmu_lock;
 	struct mutex slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
-	struct kvm_memslots *memslots;
+	struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
 	struct srcu_struct srcu;
 	struct srcu_struct irq_srcu;
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
@@ -464,16 +475,23 @@ void kvm_exit(void);
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 
-static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
+static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
 {
-	return rcu_dereference_check(kvm->memslots,
+	return rcu_dereference_check(kvm->memslots[as_id],
 			srcu_read_lock_held(&kvm->srcu)
 			|| lockdep_is_held(&kvm->slots_lock));
 }
 
+static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
+{
+	return __kvm_memslots(kvm, 0);
+}
+
 static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
 {
-	return kvm_memslots(vcpu->kvm);
+	int as_id = kvm_arch_vcpu_memslots_id(vcpu);
+
+	return __kvm_memslots(vcpu->kvm, as_id);
 }
 
 static inline struct kvm_memory_slot *
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index eace8babd227..5ff1038437e3 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -816,6 +816,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_HWRNG 115
 #define KVM_CAP_DISABLE_QUIRKS 116
 #define KVM_CAP_X86_SMM 117
+#define KVM_CAP_MULTI_ADDRESS_SPACE 118
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3a121cedcc77..848af90b8091 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -518,9 +518,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
 	r = -ENOMEM;
-	kvm->memslots = kvm_alloc_memslots();
-	if (!kvm->memslots)
-		goto out_err_no_srcu;
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		kvm->memslots[i] = kvm_alloc_memslots();
+		if (!kvm->memslots[i])
+			goto out_err_no_srcu;
+	}
 
 	if (init_srcu_struct(&kvm->srcu))
 		goto out_err_no_srcu;
@@ -562,7 +564,8 @@ out_err_no_srcu:
 out_err_no_disable:
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kfree(kvm->buses[i]);
-	kvm_free_memslots(kvm, kvm->memslots);
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+		kvm_free_memslots(kvm, kvm->memslots[i]);
 	kvm_arch_free_vm(kvm);
 	return ERR_PTR(r);
 }
@@ -612,7 +615,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #endif
 	kvm_arch_destroy_vm(kvm);
 	kvm_destroy_devices(kvm);
-	kvm_free_memslots(kvm, kvm->memslots);
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+		kvm_free_memslots(kvm, kvm->memslots[i]);
 	cleanup_srcu_struct(&kvm->irq_srcu);
 	cleanup_srcu_struct(&kvm->srcu);
 	kvm_arch_free_vm(kvm);
@@ -729,9 +733,9 @@ static int check_memory_region_flags(const struct kvm_userspace_memory_region *m
 }
 
 static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
-		struct kvm_memslots *slots)
+		int as_id, struct kvm_memslots *slots)
 {
-	struct kvm_memslots *old_memslots = kvm_memslots(kvm);
+	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
 
 	/*
 	 * Set the low bit in the generation, which disables SPTE caching
@@ -740,7 +744,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	WARN_ON(old_memslots->generation & 1);
 	slots->generation = old_memslots->generation + 1;
 
-	rcu_assign_pointer(kvm->memslots, slots);
+	rcu_assign_pointer(kvm->memslots[as_id], slots);
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*
@@ -772,6 +776,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	struct kvm_memory_slot *slot;
 	struct kvm_memory_slot old, new;
 	struct kvm_memslots *slots = NULL, *old_memslots;
+	int as_id, id;
 	enum kvm_mr_change change;
 
 	r = check_memory_region_flags(mem);
@@ -779,24 +784,27 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		goto out;
 
 	r = -EINVAL;
+	as_id = mem->slot >> 16;
+	id = (u16)mem->slot;
+
 	/* General sanity checks */
 	if (mem->memory_size & (PAGE_SIZE - 1))
 		goto out;
 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 		goto out;
 	/* We can read the guest memory with __xxx_user() later on. */
-	if ((mem->slot < KVM_USER_MEM_SLOTS) &&
+	if ((id < KVM_USER_MEM_SLOTS) &&
 	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
 	     !access_ok(VERIFY_WRITE,
 			(void __user *)(unsigned long)mem->userspace_addr,
 			mem->memory_size)))
 		goto out;
-	if (mem->slot >= KVM_MEM_SLOTS_NUM)
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
 		goto out;
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
 		goto out;
 
-	slot = id_to_memslot(kvm_memslots(kvm), mem->slot);
+	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
 	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
 	npages = mem->memory_size >> PAGE_SHIFT;
 
@@ -805,7 +813,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	new = old = *slot;
 
-	new.id = mem->slot;
+	new.id = id;
 	new.base_gfn = base_gfn;
 	new.npages = npages;
 	new.flags = mem->flags;
@@ -840,9 +848,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
 		/* Check for overlaps */
 		r = -EEXIST;
-		kvm_for_each_memslot(slot, kvm_memslots(kvm)) {
+		kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
 			if ((slot->id >= KVM_USER_MEM_SLOTS) ||
-			    (slot->id == mem->slot))
+			    (slot->id == id))
 				continue;
 			if (!((base_gfn + npages <= slot->base_gfn) ||
 			      (base_gfn >= slot->base_gfn + slot->npages)))
@@ -871,13 +879,13 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
 	if (!slots)
 		goto out_free;
-	memcpy(slots, kvm_memslots(kvm), sizeof(struct kvm_memslots));
+	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
 
 	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-		slot = id_to_memslot(slots, mem->slot);
+		slot = id_to_memslot(slots, id);
 		slot->flags |= KVM_MEMSLOT_INVALID;
 
-		old_memslots = install_new_memslots(kvm, slots);
+		old_memslots = install_new_memslots(kvm, as_id, slots);
 
 		/* slot was deleted or moved, clear iommu mapping */
 		kvm_iommu_unmap_pages(kvm, &old);
@@ -909,7 +917,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	}
 
 	update_memslots(slots, &new);
-	old_memslots = install_new_memslots(kvm, slots);
+	old_memslots = install_new_memslots(kvm, as_id, slots);
 
 	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
 
@@ -956,7 +964,7 @@ EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
 					  struct kvm_userspace_memory_region *mem)
 {
-	if (mem->slot >= KVM_USER_MEM_SLOTS)
+	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
 	return kvm_set_memory_region(kvm, mem);
@@ -967,16 +975,18 @@ int kvm_get_dirty_log(struct kvm *kvm,
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	int r, i;
+	int r, i, as_id, id;
 	unsigned long n;
 	unsigned long any = 0;
 
 	r = -EINVAL;
-	if (log->slot >= KVM_USER_MEM_SLOTS)
+	as_id = log->slot >> 16;
+	id = (u16)log->slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
 		goto out;
 
-	slots = kvm_memslots(kvm);
-	memslot = id_to_memslot(slots, log->slot);
+	slots = __kvm_memslots(kvm, as_id);
+	memslot = id_to_memslot(slots, id);
 	r = -ENOENT;
 	if (!memslot->dirty_bitmap)
 		goto out;
@@ -1027,17 +1037,19 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	int r, i;
+	int r, i, as_id, id;
 	unsigned long n;
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap_buffer;
 
 	r = -EINVAL;
-	if (log->slot >= KVM_USER_MEM_SLOTS)
+	as_id = log->slot >> 16;
+	id = (u16)log->slot;
+	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
 		goto out;
 
-	slots = kvm_memslots(kvm);
-	memslot = id_to_memslot(slots, log->slot);
+	slots = __kvm_memslots(kvm, as_id);
+	memslot = id_to_memslot(slots, id);
 
 	dirty_bitmap = memslot->dirty_bitmap;
 	r = -ENOENT;
@@ -2619,6 +2631,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 	case KVM_CAP_IRQ_ROUTING:
 		return KVM_MAX_IRQ_ROUTES;
+#endif
+#if KVM_ADDRESS_SPACE_NUM > 1
+	case KVM_CAP_MULTI_ADDRESS_SPACE:
+		return KVM_ADDRESS_SPACE_NUM;
 #endif
 	default:
 		break;
-- 
cgit v1.2.3


From 3f21c265cd5f7ae867cc0e86a1f6d5093f1963cc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 5 Jun 2015 10:57:37 -0600
Subject: block: add blk_set_queue_dying() to blkdev.h

We export this function and NVMe wants to use it, but for some reason
it was never added to the block header. Do that.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ccaa9aecd593..a31380c35918 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1024,6 +1024,7 @@ bool __must_check blk_get_queue(struct request_queue *);
 struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
+extern void blk_set_queue_dying(struct request_queue *);
 
 /*
  * block layer runtime pm functions
-- 
cgit v1.2.3


From a5768aa887fb636f0cc4c83a2f1242506aaf50f6 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 1 Jun 2015 14:28:14 -0600
Subject: NVMe: Automatic namespace rescan

Namespaces may be dynamically allocated and deleted or attached and
detached. This has the driver rescan the device for namespace changes
after each device reset or namespace change asynchronous event.

There could potentially be many detached namespaces that we don't want
polluting /dev/ with unusable block handles, so this will delete disks
if the namespace is not active as indicated by the response from identify
namespace. This also skips adding the disk if no capacity is provisioned
to the namespace in the first place.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 159 ++++++++++++++++++++++++++++++++++++----------
 include/linux/nvme.h      |   1 +
 include/uapi/linux/nvme.h |   4 ++
 3 files changed, 132 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index cae7cac6cc43..2072ae81c13a 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -29,6 +29,7 @@
 #include <linux/kdev_t.h>
 #include <linux/kthread.h>
 #include <linux/kernel.h>
+#include <linux/list_sort.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
@@ -300,9 +301,16 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
 
 	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
 		++nvmeq->dev->event_limit;
-	if (status == NVME_SC_SUCCESS)
-		dev_warn(nvmeq->q_dmadev,
-			"async event result %08x\n", result);
+	if (status != NVME_SC_SUCCESS)
+		return;
+
+	switch (result & 0xff07) {
+	case NVME_AER_NOTICE_NS_CHANGED:
+		dev_info(nvmeq->q_dmadev, "rescanning\n");
+		schedule_work(&nvmeq->dev->scan_work);
+	default:
+		dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
+	}
 }
 
 static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
@@ -1923,8 +1931,13 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	unsigned short bs;
 
 	if (nvme_identify_ns(dev, ns->ns_id, &id)) {
-		dev_warn(dev->dev, "%s: Identify failure\n", __func__);
-		return 0;
+		dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__,
+						dev->instance, ns->ns_id);
+		return -ENODEV;
+	}
+	if (id->ncap == 0) {
+		kfree(id);
+		return -ENODEV;
 	}
 
 	old_ms = ns->ms;
@@ -1958,7 +1971,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 								!ns->ext)
 		nvme_init_integrity(ns);
 
-	if (id->ncap == 0 || (ns->ms && !blk_get_integrity(disk)))
+	if (ns->ms && !blk_get_integrity(disk))
 		set_capacity(disk, 0);
 	else
 		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
@@ -2073,11 +2086,16 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
 	 * requires it.
 	 */
 	set_capacity(disk, 0);
-	nvme_revalidate_disk(ns->disk);
+	if (nvme_revalidate_disk(ns->disk))
+		goto out_free_disk;
+
 	add_disk(ns->disk);
 	if (ns->ms)
 		revalidate_disk(ns->disk);
 	return;
+ out_free_disk:
+	kfree(disk);
+	list_del(&ns->list);
  out_free_queue:
 	blk_cleanup_queue(ns->queue);
  out_free_ns:
@@ -2194,6 +2212,99 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	return result;
 }
 
+static void nvme_free_namespace(struct nvme_ns *ns)
+{
+	list_del(&ns->list);
+
+	spin_lock(&dev_list_lock);
+	ns->disk->private_data = NULL;
+	spin_unlock(&dev_list_lock);
+
+	put_disk(ns->disk);
+	kfree(ns);
+}
+
+static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
+	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
+
+	return nsa->ns_id - nsb->ns_id;
+}
+
+static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid)
+{
+	struct nvme_ns *ns;
+
+	list_for_each_entry(ns, &dev->namespaces, list) {
+		if (ns->ns_id == nsid)
+			return ns;
+		if (ns->ns_id > nsid)
+			break;
+	}
+	return NULL;
+}
+
+static inline bool nvme_io_incapable(struct nvme_dev *dev)
+{
+	return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS ||
+							dev->online_queues < 2);
+}
+
+static void nvme_ns_remove(struct nvme_ns *ns)
+{
+	bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue);
+
+	if (kill)
+		blk_set_queue_dying(ns->queue);
+	if (ns->disk->flags & GENHD_FL_UP) {
+		if (blk_get_integrity(ns->disk))
+			blk_integrity_unregister(ns->disk);
+		del_gendisk(ns->disk);
+	}
+	if (kill || !blk_queue_dying(ns->queue)) {
+		blk_mq_abort_requeue_list(ns->queue);
+		blk_cleanup_queue(ns->queue);
+        }
+}
+
+static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
+{
+	struct nvme_ns *ns, *next;
+	unsigned i;
+
+	for (i = 1; i <= nn; i++) {
+		ns = nvme_find_ns(dev, i);
+		if (ns) {
+			if (revalidate_disk(ns->disk)) {
+				nvme_ns_remove(ns);
+				nvme_free_namespace(ns);
+			}
+		} else
+			nvme_alloc_ns(dev, i);
+	}
+	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+		if (ns->ns_id > nn) {
+			nvme_ns_remove(ns);
+			nvme_free_namespace(ns);
+		}
+	}
+	list_sort(NULL, &dev->namespaces, ns_cmp);
+}
+
+static void nvme_dev_scan(struct work_struct *work)
+{
+	struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
+	struct nvme_id_ctrl *ctrl;
+
+	if (!dev->tagset.tags)
+		return;
+	if (nvme_identify_ctrl(dev, &ctrl))
+		return;
+	nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn));
+	kfree(ctrl);
+}
+
 /*
  * Return: error value if an error occurred setting up the queues or calling
  * Identify Device.  0 if these succeeded, even if adding some of the
@@ -2204,7 +2315,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	int res;
-	unsigned nn, i;
+	unsigned nn;
 	struct nvme_id_ctrl *ctrl;
 	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
 
@@ -2250,9 +2361,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	if (blk_mq_alloc_tag_set(&dev->tagset))
 		return 0;
 
-	for (i = 1; i <= nn; i++)
-		nvme_alloc_ns(dev, i);
-
+	schedule_work(&dev->scan_work);
 	return 0;
 }
 
@@ -2552,17 +2661,8 @@ static void nvme_dev_remove(struct nvme_dev *dev)
 {
 	struct nvme_ns *ns;
 
-	list_for_each_entry(ns, &dev->namespaces, list) {
-		if (ns->disk->flags & GENHD_FL_UP) {
-			if (blk_get_integrity(ns->disk))
-				blk_integrity_unregister(ns->disk);
-			del_gendisk(ns->disk);
-		}
-		if (!blk_queue_dying(ns->queue)) {
-			blk_mq_abort_requeue_list(ns->queue);
-			blk_cleanup_queue(ns->queue);
-		}
-	}
+	list_for_each_entry(ns, &dev->namespaces, list)
+		nvme_ns_remove(ns);
 }
 
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -2621,16 +2721,8 @@ static void nvme_free_namespaces(struct nvme_dev *dev)
 {
 	struct nvme_ns *ns, *next;
 
-	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
-		list_del(&ns->list);
-
-		spin_lock(&dev_list_lock);
-		ns->disk->private_data = NULL;
-		spin_unlock(&dev_list_lock);
-
-		put_disk(ns->disk);
-		kfree(ns);
-	}
+	list_for_each_entry_safe(ns, next, &dev->namespaces, list)
+		nvme_free_namespace(ns);
 }
 
 static void nvme_free_dev(struct kref *kref)
@@ -2814,6 +2906,7 @@ static int nvme_dev_resume(struct nvme_dev *dev)
 		spin_unlock(&dev_list_lock);
 	} else {
 		nvme_unfreeze_queues(dev);
+		schedule_work(&dev->scan_work);
 		nvme_set_irq_hints(dev);
 	}
 	return 0;
@@ -2935,6 +3028,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto put_dev;
 
 	INIT_LIST_HEAD(&dev->node);
+	INIT_WORK(&dev->scan_work, nvme_dev_scan);
 	INIT_WORK(&dev->probe_work, nvme_async_probe);
 	schedule_work(&dev->probe_work);
 	return 0;
@@ -3007,6 +3101,7 @@ static void nvme_remove(struct pci_dev *pdev)
 	pci_set_drvdata(pdev, NULL);
 	flush_work(&dev->probe_work);
 	flush_work(&dev->reset_work);
+	flush_work(&dev->scan_work);
 	device_remove_file(dev->device, &dev_attr_reset_controller);
 	nvme_dev_shutdown(dev);
 	nvme_dev_remove(dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 986bf8ad8e93..c0d94ed8ce9a 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -92,6 +92,7 @@ struct nvme_dev {
 	work_func_t reset_workfn;
 	struct work_struct reset_work;
 	struct work_struct probe_work;
+	struct work_struct scan_work;
 	char name[12];
 	char serial[20];
 	char model[40];
diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h
index b660dc2fadfb..732b32e92b02 100644
--- a/include/uapi/linux/nvme.h
+++ b/include/uapi/linux/nvme.h
@@ -179,6 +179,10 @@ enum {
 	NVME_SMART_CRIT_VOLATILE_MEMORY	= 1 << 4,
 };
 
+enum {
+	NVME_AER_NOTICE_NS_CHANGED	= 0x0002,
+};
+
 struct nvme_lba_range_type {
 	__u8			type;
 	__u8			attributes;
-- 
cgit v1.2.3


From 2e61dfb3602b904966491f260f62c01b9895936a Mon Sep 17 00:00:00 2001
From: Dinh Nguyen <dinguyen@opensource.altera.com>
Date: Fri, 5 Jun 2015 11:26:13 -0500
Subject: clk: of: helper for filling parent clock array and return num of
 parents

Sprinkled all through the platform clock drivers are code like this to
fill the clock parent array:

for (i = 0; i < num_parents; ++i)
	parent_names[i] = of_clk_get_parent_name(np, i);

The of_clk_parent_fill() will do the same as the code above, and while
at it, return the number of parents as well since the logic of the
function is to the walk the clock node to look for the parent.

Signed-off-by: Dinh Nguyen <dinguyen@opensource.altera.com>
[sboyd@codeaurora.org: Fixed kernel-doc]
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk.c            | 21 +++++++++++++++++++++
 include/linux/clk-provider.h |  2 ++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index a522dbbc01b9..ee222e28bd97 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -3016,6 +3016,27 @@ const char *of_clk_get_parent_name(struct device_node *np, int index)
 }
 EXPORT_SYMBOL_GPL(of_clk_get_parent_name);
 
+/**
+ * of_clk_parent_fill() - Fill @parents with names of @np's parents and return
+ * number of parents
+ * @np: Device node pointer associated with clock provider
+ * @parents: pointer to char array that hold the parents' names
+ * @size: size of the @parents array
+ *
+ * Return: number of parents for the clock node.
+ */
+int of_clk_parent_fill(struct device_node *np, const char **parents,
+		       unsigned int size)
+{
+	unsigned int i = 0;
+
+	while (i < size && (parents[i] = of_clk_get_parent_name(np, i)) != NULL)
+		i++;
+
+	return i;
+}
+EXPORT_SYMBOL_GPL(of_clk_parent_fill);
+
 struct clock_provider {
 	of_clk_init_cb_t clk_init_cb;
 	struct device_node *np;
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 5378c2aba4d2..2e5df069ca34 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -626,6 +626,8 @@ struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec,
 				  void *data);
 struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data);
 int of_clk_get_parent_count(struct device_node *np);
+int of_clk_parent_fill(struct device_node *np, const char **parents,
+		       unsigned int size);
 const char *of_clk_get_parent_name(struct device_node *np, int index);
 
 void of_clk_init(const struct of_device_id *matches);
-- 
cgit v1.2.3


From d691f9e8d4405c334aa10d556e73c8bf44cb0e01 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 4 Jun 2015 10:11:54 -0700
Subject: bpf: allow programs to write to certain skb fields

allow programs read/write skb->mark, tc_index fields and
((struct qdisc_skb_cb *)cb)->data.

mark and tc_index are generically useful in TC.
cb[0]-cb[4] are primarily used to pass arguments from one
program to another called via bpf_tail_call() which can
be seen in sockex3_kern.c example.

All fields of 'struct __sk_buff' are readable to socket and tc_cls_act progs.
mark, tc_index are writeable from tc_cls_act only.
cb[0]-cb[4] are writeable by both sockets and tc_cls_act.

Add verifier tests and improve sample code.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h         |  3 +-
 include/uapi/linux/bpf.h    |  2 +
 kernel/bpf/verifier.c       | 37 +++++++++++++-----
 net/core/filter.c           | 94 +++++++++++++++++++++++++++++++++++++++------
 samples/bpf/sockex3_kern.c  | 35 ++++++-----------
 samples/bpf/test_verifier.c | 84 +++++++++++++++++++++++++++++++++++++++-
 6 files changed, 207 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ca854e5bb2f7..2235aee8096a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -105,7 +105,8 @@ struct bpf_verifier_ops {
 	 */
 	bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
 
-	u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off,
+	u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg,
+				  int src_reg, int ctx_off,
 				  struct bpf_insn *insn);
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 42aa19abab86..602f05b7a275 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -248,6 +248,8 @@ struct __sk_buff {
 	__u32 priority;
 	__u32 ingress_ifindex;
 	__u32 ifindex;
+	__u32 tc_index;
+	__u32 cb[5];
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cfd9a40b9a5a..039d866fd36a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1692,6 +1692,8 @@ static int do_check(struct verifier_env *env)
 			}
 
 		} else if (class == BPF_STX) {
+			enum bpf_reg_type dst_reg_type;
+
 			if (BPF_MODE(insn->code) == BPF_XADD) {
 				err = check_xadd(env, insn);
 				if (err)
@@ -1700,11 +1702,6 @@ static int do_check(struct verifier_env *env)
 				continue;
 			}
 
-			if (BPF_MODE(insn->code) != BPF_MEM ||
-			    insn->imm != 0) {
-				verbose("BPF_STX uses reserved fields\n");
-				return -EINVAL;
-			}
 			/* check src1 operand */
 			err = check_reg_arg(regs, insn->src_reg, SRC_OP);
 			if (err)
@@ -1714,6 +1711,8 @@ static int do_check(struct verifier_env *env)
 			if (err)
 				return err;
 
+			dst_reg_type = regs[insn->dst_reg].type;
+
 			/* check that memory (dst_reg + off) is writeable */
 			err = check_mem_access(env, insn->dst_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_WRITE,
@@ -1721,6 +1720,15 @@ static int do_check(struct verifier_env *env)
 			if (err)
 				return err;
 
+			if (insn->imm == 0) {
+				insn->imm = dst_reg_type;
+			} else if (dst_reg_type != insn->imm &&
+				   (dst_reg_type == PTR_TO_CTX ||
+				    insn->imm == PTR_TO_CTX)) {
+				verbose("same insn cannot be used with different pointers\n");
+				return -EINVAL;
+			}
+
 		} else if (class == BPF_ST) {
 			if (BPF_MODE(insn->code) != BPF_MEM ||
 			    insn->src_reg != BPF_REG_0) {
@@ -1839,12 +1847,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (BPF_CLASS(insn->code) == BPF_LDX &&
-		    (BPF_MODE(insn->code) != BPF_MEM ||
-		     insn->imm != 0)) {
+		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
 			verbose("BPF_LDX uses reserved fields\n");
 			return -EINVAL;
 		}
 
+		if (BPF_CLASS(insn->code) == BPF_STX &&
+		    ((BPF_MODE(insn->code) != BPF_MEM &&
+		      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
+			verbose("BPF_STX uses reserved fields\n");
+			return -EINVAL;
+		}
+
 		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
 			struct bpf_map *map;
 			struct fd f;
@@ -1967,12 +1981,17 @@ static int convert_ctx_accesses(struct verifier_env *env)
 	struct bpf_prog *new_prog;
 	u32 cnt;
 	int i;
+	enum bpf_access_type type;
 
 	if (!env->prog->aux->ops->convert_ctx_access)
 		return 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (insn->code != (BPF_LDX | BPF_MEM | BPF_W))
+		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
+			type = BPF_READ;
+		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
+			type = BPF_WRITE;
+		else
 			continue;
 
 		if (insn->imm != PTR_TO_CTX) {
@@ -1982,7 +2001,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
 		}
 
 		cnt = env->prog->aux->ops->
-			convert_ctx_access(insn->dst_reg, insn->src_reg,
+			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
 					   insn->off, insn_buf);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
diff --git a/net/core/filter.c b/net/core/filter.c
index 36a69e33d76b..d271c06bf01f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -46,6 +46,7 @@
 #include <linux/seccomp.h>
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
+#include <net/sch_generic.h>
 
 /**
  *	sk_filter - run a packet through a socket filter
@@ -1463,13 +1464,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	}
 }
 
-static bool sk_filter_is_valid_access(int off, int size,
-				      enum bpf_access_type type)
+static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
-	/* only read is allowed */
-	if (type != BPF_READ)
-		return false;
-
 	/* check bounds */
 	if (off < 0 || off >= sizeof(struct __sk_buff))
 		return false;
@@ -1485,8 +1481,42 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return true;
 }
 
-static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
-					struct bpf_insn *insn_buf)
+static bool sk_filter_is_valid_access(int off, int size,
+				      enum bpf_access_type type)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct __sk_buff, cb[0]) ...
+			offsetof(struct __sk_buff, cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	return __is_valid_access(off, size, type);
+}
+
+static bool tc_cls_act_is_valid_access(int off, int size,
+				       enum bpf_access_type type)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct __sk_buff, mark):
+		case offsetof(struct __sk_buff, tc_index):
+		case offsetof(struct __sk_buff, cb[0]) ...
+			offsetof(struct __sk_buff, cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+	return __is_valid_access(off, size, type);
+}
+
+static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
+				      int src_reg, int ctx_off,
+				      struct bpf_insn *insn_buf)
 {
 	struct bpf_insn *insn = insn_buf;
 
@@ -1538,7 +1568,15 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
 		break;
 
 	case offsetof(struct __sk_buff, mark):
-		return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn);
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+					      offsetof(struct sk_buff, mark));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+					      offsetof(struct sk_buff, mark));
+		break;
 
 	case offsetof(struct __sk_buff, pkt_type):
 		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
@@ -1553,6 +1591,38 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
 	case offsetof(struct __sk_buff, vlan_tci):
 		return convert_skb_access(SKF_AD_VLAN_TAG,
 					  dst_reg, src_reg, insn);
+
+	case offsetof(struct __sk_buff, cb[0]) ...
+		offsetof(struct __sk_buff, cb[4]):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+
+		ctx_off -= offsetof(struct __sk_buff, cb[0]);
+		ctx_off += offsetof(struct sk_buff, cb);
+		ctx_off += offsetof(struct qdisc_skb_cb, data);
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
+		break;
+
+	case offsetof(struct __sk_buff, tc_index):
+#ifdef CONFIG_NET_SCHED
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
+					      offsetof(struct sk_buff, tc_index));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
+					      offsetof(struct sk_buff, tc_index));
+		break;
+#else
+		if (type == BPF_WRITE)
+			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
+		else
+			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
+		break;
+#endif
 	}
 
 	return insn - insn_buf;
@@ -1561,13 +1631,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
 static const struct bpf_verifier_ops sk_filter_ops = {
 	.get_func_proto = sk_filter_func_proto,
 	.is_valid_access = sk_filter_is_valid_access,
-	.convert_ctx_access = sk_filter_convert_ctx_access,
+	.convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops tc_cls_act_ops = {
 	.get_func_proto = tc_cls_act_func_proto,
-	.is_valid_access = sk_filter_is_valid_access,
-	.convert_ctx_access = sk_filter_convert_ctx_access,
+	.is_valid_access = tc_cls_act_is_valid_access,
+	.convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
index 2625b987944f..41ae2fd21b13 100644
--- a/samples/bpf/sockex3_kern.c
+++ b/samples/bpf/sockex3_kern.c
@@ -89,7 +89,6 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
 
 struct globals {
 	struct flow_keys flow;
-	__u32 nhoff;
 };
 
 struct bpf_map_def SEC("maps") percpu_map = {
@@ -139,7 +138,7 @@ static void update_stats(struct __sk_buff *skb, struct globals *g)
 static __always_inline void parse_ip_proto(struct __sk_buff *skb,
 					   struct globals *g, __u32 ip_proto)
 {
-	__u32 nhoff = g->nhoff;
+	__u32 nhoff = skb->cb[0];
 	int poff;
 
 	switch (ip_proto) {
@@ -165,7 +164,7 @@ static __always_inline void parse_ip_proto(struct __sk_buff *skb,
 		if (gre_flags & GRE_SEQ)
 			nhoff += 4;
 
-		g->nhoff = nhoff;
+		skb->cb[0] = nhoff;
 		parse_eth_proto(skb, gre_proto);
 		break;
 	}
@@ -195,7 +194,7 @@ PROG(PARSE_IP)(struct __sk_buff *skb)
 	if (!g)
 		return 0;
 
-	nhoff = g->nhoff;
+	nhoff = skb->cb[0];
 
 	if (unlikely(ip_is_fragment(skb, nhoff)))
 		return 0;
@@ -210,7 +209,7 @@ PROG(PARSE_IP)(struct __sk_buff *skb)
 	verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
 	nhoff += (verlen & 0xF) << 2;
 
-	g->nhoff = nhoff;
+	skb->cb[0] = nhoff;
 	parse_ip_proto(skb, g, ip_proto);
 	return 0;
 }
@@ -223,7 +222,7 @@ PROG(PARSE_IPV6)(struct __sk_buff *skb)
 	if (!g)
 		return 0;
 
-	nhoff = g->nhoff;
+	nhoff = skb->cb[0];
 
 	ip_proto = load_byte(skb,
 			     nhoff + offsetof(struct ipv6hdr, nexthdr));
@@ -233,25 +232,21 @@ PROG(PARSE_IPV6)(struct __sk_buff *skb)
 				     nhoff + offsetof(struct ipv6hdr, daddr));
 	nhoff += sizeof(struct ipv6hdr);
 
-	g->nhoff = nhoff;
+	skb->cb[0] = nhoff;
 	parse_ip_proto(skb, g, ip_proto);
 	return 0;
 }
 
 PROG(PARSE_VLAN)(struct __sk_buff *skb)
 {
-	struct globals *g = this_cpu_globals();
 	__u32 nhoff, proto;
 
-	if (!g)
-		return 0;
-
-	nhoff = g->nhoff;
+	nhoff = skb->cb[0];
 
 	proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
 						h_vlan_encapsulated_proto));
 	nhoff += sizeof(struct vlan_hdr);
-	g->nhoff = nhoff;
+	skb->cb[0] = nhoff;
 
 	parse_eth_proto(skb, proto);
 
@@ -260,17 +255,13 @@ PROG(PARSE_VLAN)(struct __sk_buff *skb)
 
 PROG(PARSE_MPLS)(struct __sk_buff *skb)
 {
-	struct globals *g = this_cpu_globals();
 	__u32 nhoff, label;
 
-	if (!g)
-		return 0;
-
-	nhoff = g->nhoff;
+	nhoff = skb->cb[0];
 
 	label = load_word(skb, nhoff);
 	nhoff += sizeof(struct mpls_label);
-	g->nhoff = nhoff;
+	skb->cb[0] = nhoff;
 
 	if (label & MPLS_LS_S_MASK) {
 		__u8 verlen = load_byte(skb, nhoff);
@@ -288,14 +279,10 @@ PROG(PARSE_MPLS)(struct __sk_buff *skb)
 SEC("socket/0")
 int main_prog(struct __sk_buff *skb)
 {
-	struct globals *g = this_cpu_globals();
 	__u32 nhoff = ETH_HLEN;
 	__u32 proto = load_half(skb, 12);
 
-	if (!g)
-		return 0;
-
-	g->nhoff = nhoff;
+	skb->cb[0] = nhoff;
 	parse_eth_proto(skb, proto);
 	return 0;
 }
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index 12f3780af73f..693605997abc 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -29,6 +29,7 @@ struct bpf_test {
 		ACCEPT,
 		REJECT
 	} result;
+	enum bpf_prog_type prog_type;
 };
 
 static struct bpf_test tests[] = {
@@ -743,6 +744,84 @@ static struct bpf_test tests[] = {
 		.errstr = "different pointers",
 		.result = REJECT,
 	},
+	{
+		"check skb->mark is not writeable by sockets",
+		.insns = {
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check skb->tc_index is not writeable by sockets",
+		.insns = {
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, tc_index)),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check non-u32 access to cb",
+		.insns = {
+			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+	},
+	{
+		"check out of range skb->cb access",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[60])),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "invalid bpf_context access",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_SCHED_ACT,
+	},
+	{
+		"write skb fields from socket prog",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, tc_index)),
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+	},
+	{
+		"write skb fields from tc_cls_act prog",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, tc_index)),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, tc_index)),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
+	},
 };
 
 static int probe_filter_length(struct bpf_insn *fp)
@@ -775,6 +854,7 @@ static int test(void)
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
 		struct bpf_insn *prog = tests[i].insns;
+		int prog_type = tests[i].prog_type;
 		int prog_len = probe_filter_length(prog);
 		int *fixup = tests[i].fixup;
 		int map_fd = -1;
@@ -789,8 +869,8 @@ static int test(void)
 		}
 		printf("#%d %s ", i, tests[i].descr);
 
-		prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog,
-					prog_len * sizeof(struct bpf_insn),
+		prog_fd = bpf_prog_load(prog_type ?: BPF_PROG_TYPE_SOCKET_FILTER,
+					prog, prog_len * sizeof(struct bpf_insn),
 					"GPL", 0);
 
 		if (tests[i].result == ACCEPT) {
-- 
cgit v1.2.3


From 4eaca0a887eaee04fc7a3866d0f5b51b34030dfa Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Jun 2015 17:39:08 +0200
Subject: preempt: Use preempt_schedule_context() as the official tracing
 preemption point

preempt_schedule_context() is a tracing safe preemption point but it's
only used when CONFIG_CONTEXT_TRACKING=y. Other configs have tracing
recursion issues since commit:

  b30f0e3ffedf ("sched/preempt: Optimize preemption operations on __schedule() callers")

introduced function based preemp_count_*() ops.

Lets make it available on all configs and give it a more appropriate
name for its new position.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1433432349-1021-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/preempt.h   | 8 +++-----
 arch/x86/kernel/i386_ksyms_32.c  | 4 +---
 arch/x86/kernel/x8664_ksyms_64.c | 4 +---
 arch/x86/lib/thunk_32.S          | 4 +---
 arch/x86/lib/thunk_64.S          | 4 +---
 include/asm-generic/preempt.h    | 7 ++-----
 include/linux/preempt.h          | 6 +-----
 kernel/sched/core.c              | 8 +++-----
 8 files changed, 13 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 8f3271842533..dca71714f860 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -99,11 +99,9 @@ static __always_inline bool should_resched(void)
   extern asmlinkage void ___preempt_schedule(void);
 # define __preempt_schedule() asm ("call ___preempt_schedule")
   extern asmlinkage void preempt_schedule(void);
-# ifdef CONFIG_CONTEXT_TRACKING
-    extern asmlinkage void ___preempt_schedule_context(void);
-#   define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
-    extern asmlinkage void preempt_schedule_context(void);
-# endif
+  extern asmlinkage void ___preempt_schedule_notrace(void);
+# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace")
+  extern asmlinkage void preempt_schedule_notrace(void);
 #endif
 
 #endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 05fd74f537d6..64341aa485ae 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -40,7 +40,5 @@ EXPORT_SYMBOL(empty_zero_page);
 
 #ifdef CONFIG_PREEMPT
 EXPORT_SYMBOL(___preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
-EXPORT_SYMBOL(___preempt_schedule_context);
-#endif
+EXPORT_SYMBOL(___preempt_schedule_notrace);
 #endif
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 37d8fa4438f0..a0695be19864 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -75,7 +75,5 @@ EXPORT_SYMBOL(native_load_gs_index);
 
 #ifdef CONFIG_PREEMPT
 EXPORT_SYMBOL(___preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
-EXPORT_SYMBOL(___preempt_schedule_context);
-#endif
+EXPORT_SYMBOL(___preempt_schedule_notrace);
 #endif
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 5eb715087b80..e407941d0488 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -38,8 +38,6 @@
 
 #ifdef CONFIG_PREEMPT
 	THUNK ___preempt_schedule, preempt_schedule
-#ifdef CONFIG_CONTEXT_TRACKING
-	THUNK ___preempt_schedule_context, preempt_schedule_context
-#endif
+	THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
 #endif
 
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index f89ba4e93025..2198902329b5 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -49,9 +49,7 @@
 
 #ifdef CONFIG_PREEMPT
 	THUNK ___preempt_schedule, preempt_schedule
-#ifdef CONFIG_CONTEXT_TRACKING
-	THUNK ___preempt_schedule_context, preempt_schedule_context
-#endif
+	THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
 #endif
 
 #if defined(CONFIG_TRACE_IRQFLAGS) \
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index eb6f9e6c3075..d0a7a4753db2 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -79,11 +79,8 @@ static __always_inline bool should_resched(void)
 #ifdef CONFIG_PREEMPT
 extern asmlinkage void preempt_schedule(void);
 #define __preempt_schedule() preempt_schedule()
-
-#ifdef CONFIG_CONTEXT_TRACKING
-extern asmlinkage void preempt_schedule_context(void);
-#define __preempt_schedule_context() preempt_schedule_context()
-#endif
+extern asmlinkage void preempt_schedule_notrace(void);
+#define __preempt_schedule_notrace() preempt_schedule_notrace()
 #endif /* CONFIG_PREEMPT */
 
 #endif /* __ASM_PREEMPT_H */
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index a1a00e14c14f..7686dd63bc35 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -204,15 +204,11 @@ do { \
 
 #ifdef CONFIG_PREEMPT
 
-#ifndef CONFIG_CONTEXT_TRACKING
-#define __preempt_schedule_context() __preempt_schedule()
-#endif
-
 #define preempt_enable_notrace() \
 do { \
 	barrier(); \
 	if (unlikely(__preempt_count_dec_and_test())) \
-		__preempt_schedule_context(); \
+		__preempt_schedule_notrace(); \
 } while (0)
 #else
 #define preempt_enable_notrace() \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4e925ea10c0c..af0a5a6cee98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2937,9 +2937,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
 
-#ifdef CONFIG_CONTEXT_TRACKING
 /**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
  *
  * The tracing infrastructure uses preempt_enable_notrace to prevent
  * recursion and tracing preempt enabling caused by the tracing
@@ -2952,7 +2951,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * instead of preempt_schedule() to exit user context if needed before
  * calling the scheduler.
  */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 {
 	enum ctx_state prev_ctx;
 
@@ -2980,8 +2979,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
 		__preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 	} while (need_resched());
 }
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 
 #endif /* CONFIG_PREEMPT */
 
-- 
cgit v1.2.3


From 9a92e3dc6ad02208a014d0d8404ebbd697e3d5ef Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Jun 2015 17:39:09 +0200
Subject: preempt: Reorganize the notrace definitions a bit

preempt.h has two seperate "#ifdef CONFIG_PREEMPT" sections: one to
define preempt_enable() and another to define preempt_enable_notrace().

Lets gather both.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1433432349-1021-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/preempt.h | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 7686dd63bc35..0f1534acaf60 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -175,48 +175,46 @@ do { \
 		__preempt_schedule(); \
 } while (0)
 
+#define preempt_enable_notrace() \
+do { \
+	barrier(); \
+	if (unlikely(__preempt_count_dec_and_test())) \
+		__preempt_schedule_notrace(); \
+} while (0)
+
 #define preempt_check_resched() \
 do { \
 	if (should_resched()) \
 		__preempt_schedule(); \
 } while (0)
 
-#else
+#else /* !CONFIG_PREEMPT */
 #define preempt_enable() \
 do { \
 	barrier(); \
 	preempt_count_dec(); \
 } while (0)
-#define preempt_check_resched() do { } while (0)
-#endif
-
-#define preempt_disable_notrace() \
-do { \
-	__preempt_count_inc(); \
-	barrier(); \
-} while (0)
 
-#define preempt_enable_no_resched_notrace() \
+#define preempt_enable_notrace() \
 do { \
 	barrier(); \
 	__preempt_count_dec(); \
 } while (0)
 
-#ifdef CONFIG_PREEMPT
+#define preempt_check_resched() do { } while (0)
+#endif /* CONFIG_PREEMPT */
 
-#define preempt_enable_notrace() \
+#define preempt_disable_notrace() \
 do { \
+	__preempt_count_inc(); \
 	barrier(); \
-	if (unlikely(__preempt_count_dec_and_test())) \
-		__preempt_schedule_notrace(); \
 } while (0)
-#else
-#define preempt_enable_notrace() \
+
+#define preempt_enable_no_resched_notrace() \
 do { \
 	barrier(); \
 	__preempt_count_dec(); \
 } while (0)
-#endif
 
 #else /* !CONFIG_PREEMPT_COUNT */
 
-- 
cgit v1.2.3


From 21509084f999d7accd32e45961ef76853112e978 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Wed, 6 May 2015 15:33:49 -0400
Subject: perf/x86/intel: Handle multiple records in the PEBS buffer

When the PEBS interrupt threshold is larger than one record and the
machine supports multiple PEBS events, the records of these events are
mixed up and we need to demultiplex them.

Demuxing the records is hard because the hardware is deficient. The
hardware has two issues that, when combined, create impossible
scenarios to demux.

The first issue is that the 'status' field of the PEBS record is a copy
of the GLOBAL_STATUS MSR at PEBS assist time. To see why this is a
problem let us first describe the regular PEBS cycle:

A) the CTRn value reaches 0:
  - the corresponding bit in GLOBAL_STATUS gets set
  - we start arming the hardware assist
  < some unspecified amount of time later -- this could cover multiple
    events of interest >

B) the hardware assist is armed, any next event will trigger it

C) a matching event happens:
  - the hardware assist triggers and generates a PEBS record
    this includes a copy of GLOBAL_STATUS at this moment
  - if we auto-reload we (re)set CTRn
  - we clear the relevant bit in GLOBAL_STATUS

Now consider the following chain of events:

  A0, B0, A1, C0

The event generated for counter 0 will include a status with counter 1
set, even though its not at all related to the record. A similar thing
can happen with a !PEBS event if it just happens to overflow at the
right moment.

The second issue is that the hardware will only emit one record for two
or more counters if the event that triggers the assist is 'close'. The
'close' can be several cycles. In some cases even the complete assist,
if the event is something that doesn't need retirement.

For instance, consider this chain of events:

  A0, B0, A1, B1, C01

Where C01 is an event that triggers both hardware assists, we will
generate but a single record, but again with both counters listed in the
status field.

This time the record pertains to both events.

Note that these two cases are different but undistinguishable with the
data as generated. Therefore demuxing records with multiple PEBS bits
(we can safely ignore status bits for !PEBS counters) is impossible.

Furthermore we cannot emit the record to both events because that might
cause a data leak -- the events might not have the same privileges -- so
what this patch does is discard such events.

The assumption/hope is that such discards will be rare.

Here lists some possible ways you may get high discard rate.

  - when you count the same thing multiple times. But it is not a useful
    configuration.
  - you can be unfortunate if you measure with a userspace only PEBS
    event along with either a kernel or unrestricted PEBS event. Imagine
    the event triggering and setting the overflow flag right before
    entering the kernel. Then all kernel side events will end up with
    multiple bits set.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
[ Changelog improvements. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1430940834-8964-4-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 142 +++++++++++++++++++++---------
 include/linux/perf_event.h                |  13 +++
 kernel/events/core.c                      |   6 +-
 kernel/events/internal.h                  |   9 --
 4 files changed, 116 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a5fe561c4902..72529c237e6e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -872,6 +872,9 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	int fll, fst, dsrc;
 	int fl = event->hw.flags;
 
+	if (pebs == NULL)
+		return;
+
 	sample_type = event->attr.sample_type;
 	dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
 
@@ -966,19 +969,68 @@ static void setup_pebs_sample_data(struct perf_event *event,
 		data->br_stack = &cpuc->lbr_stack;
 }
 
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	void *at;
+	u64 pebs_status;
+
+	if (base == NULL)
+		return NULL;
+
+	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+		struct pebs_record_nhm *p = at;
+
+		if (test_bit(bit, (unsigned long *)&p->status)) {
+
+			if (p->status == (1 << bit))
+				return at;
+
+			/* clear non-PEBS bit and re-check */
+			pebs_status = p->status & cpuc->pebs_enabled;
+			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+			if (pebs_status == (1 << bit))
+				return at;
+		}
+	}
+	return NULL;
+}
+
 static void __intel_pmu_pebs_event(struct perf_event *event,
-				   struct pt_regs *iregs, void *__pebs)
+				   struct pt_regs *iregs,
+				   void *base, void *top,
+				   int bit, int count)
 {
 	struct perf_sample_data data;
 	struct pt_regs regs;
+	int i;
+	void *at = get_next_pebs_record_by_bit(base, top, bit);
 
-	if (!intel_pmu_save_and_restart(event))
+	if (!intel_pmu_save_and_restart(event) &&
+	    !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
 		return;
 
-	setup_pebs_sample_data(event, iregs, __pebs, &data, &regs);
+	if (count > 1) {
+		for (i = 0; i < count - 1; i++) {
+			setup_pebs_sample_data(event, iregs, at, &data, &regs);
+			perf_event_output(event, &data, &regs);
+			at += x86_pmu.pebs_record_size;
+			at = get_next_pebs_record_by_bit(at, top, bit);
+		}
+	}
+
+	setup_pebs_sample_data(event, iregs, at, &data, &regs);
 
-	if (perf_event_overflow(event, &data, &regs))
+	/*
+	 * All but the last records are processed.
+	 * The last one is left to be able to call the overflow handler.
+	 */
+	if (perf_event_overflow(event, &data, &regs)) {
 		x86_pmu_stop(event, 0);
+		return;
+	}
+
 }
 
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -1008,72 +1060,78 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (!event->attr.precise_ip)
 		return;
 
-	n = top - at;
+	n = (top - at) / x86_pmu.pebs_record_size;
 	if (n <= 0)
 		return;
 
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
-	at += n - 1;
-
-	__intel_pmu_pebs_event(event, iregs, at);
+	__intel_pmu_pebs_event(event, iregs, at, top, 0, n);
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
-	struct perf_event *event = NULL;
-	void *at, *top;
-	u64 status = 0;
+	struct perf_event *event;
+	void *base, *at, *top;
 	int bit;
+	short counts[MAX_PEBS_EVENTS] = {};
 
 	if (!x86_pmu.pebs_active)
 		return;
 
-	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+	base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
 
 	ds->pebs_index = ds->pebs_buffer_base;
 
-	if (unlikely(at > top))
+	if (unlikely(base >= top))
 		return;
 
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
-		  "Unexpected number of pebs records %ld\n",
-		  (long)(top - at) / x86_pmu.pebs_record_size);
-
-	for (; at < top; at += x86_pmu.pebs_record_size) {
+	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
 		struct pebs_record_nhm *p = at;
 
-		for_each_set_bit(bit, (unsigned long *)&p->status,
-				 x86_pmu.max_pebs_events) {
-			event = cpuc->events[bit];
-			if (!test_bit(bit, cpuc->active_mask))
-				continue;
-
-			WARN_ON_ONCE(!event);
-
-			if (!event->attr.precise_ip)
-				continue;
+		bit = find_first_bit((unsigned long *)&p->status,
+					x86_pmu.max_pebs_events);
+		if (bit >= x86_pmu.max_pebs_events)
+			continue;
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+		/*
+		 * The PEBS hardware does not deal well with the situation
+		 * when events happen near to each other and multiple bits
+		 * are set. But it should happen rarely.
+		 *
+		 * If these events include one PEBS and multiple non-PEBS
+		 * events, it doesn't impact PEBS record. The record will
+		 * be handled normally. (slow path)
+		 *
+		 * If these events include two or more PEBS events, the
+		 * records for the events can be collapsed into a single
+		 * one, and it's not possible to reconstruct all events
+		 * that caused the PEBS record. It's called collision.
+		 * If collision happened, the record will be dropped.
+		 *
+		 */
+		if (p->status != (1 << bit)) {
+			u64 pebs_status;
 
-			if (__test_and_set_bit(bit, (unsigned long *)&status))
+			/* slow path */
+			pebs_status = p->status & cpuc->pebs_enabled;
+			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+			if (pebs_status != (1 << bit))
 				continue;
-
-			break;
 		}
+		counts[bit]++;
+	}
 
-		if (!event || bit >= x86_pmu.max_pebs_events)
+	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+		if (counts[bit] == 0)
 			continue;
+		event = cpuc->events[bit];
+		WARN_ON_ONCE(!event);
+		WARN_ON_ONCE(!event->attr.precise_ip);
 
-		__intel_pmu_pebs_event(event, iregs, at);
+		__intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]);
 	}
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 06580028cee6..5f192e1bc98e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -730,6 +730,19 @@ extern int perf_event_overflow(struct perf_event *event,
 				 struct perf_sample_data *data,
 				 struct pt_regs *regs);
 
+extern void perf_event_output(struct perf_event *event,
+				struct perf_sample_data *data,
+				struct pt_regs *regs);
+
+extern void
+perf_event_header__init_id(struct perf_event_header *header,
+			   struct perf_sample_data *data,
+			   struct perf_event *event);
+extern void
+perf_event__output_id_sample(struct perf_event *event,
+			     struct perf_output_handle *handle,
+			     struct perf_sample_data *sample);
+
 static inline bool is_sampling_event(struct perf_event *event)
 {
 	return event->attr.sample_period != 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eddf1ed4155e..e499b4e43aff 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5381,9 +5381,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 }
 
-static void perf_event_output(struct perf_event *event,
-				struct perf_sample_data *data,
-				struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+			struct perf_sample_data *data,
+			struct pt_regs *regs)
 {
 	struct perf_output_handle handle;
 	struct perf_event_header header;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 9f6ce9ba4a04..2deb24c7a40d 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -72,15 +72,6 @@ static inline bool rb_has_aux(struct ring_buffer *rb)
 void perf_event_aux_event(struct perf_event *event, unsigned long head,
 			  unsigned long size, u64 flags);
 
-extern void
-perf_event_header__init_id(struct perf_event_header *header,
-			   struct perf_sample_data *data,
-			   struct perf_event *event);
-extern void
-perf_event__output_id_sample(struct perf_event *event,
-			     struct perf_output_handle *handle,
-			     struct perf_sample_data *sample);
-
 extern struct page *
 perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
 
-- 
cgit v1.2.3


From f38b0dbb491a6987e198aa6b428db8692a6480f8 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Sun, 10 May 2015 15:13:14 -0400
Subject: perf/x86/intel: Introduce PERF_RECORD_LOST_SAMPLES

After enlarging the PEBS interrupt threshold, there may be some mixed up
PEBS samples which are discarded by the kernel.

This patch makes the kernel emit a PERF_RECORD_LOST_SAMPLES record with
the number of possible discarded records when it is impossible to demux
the samples.

It makes sure the user is not left in the dark about such discards.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1431285195-14269-8-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 20 ++++++++++++++++---
 include/linux/perf_event.h                |  3 +++
 include/uapi/linux/perf_event.h           | 12 +++++++++++
 kernel/events/core.c                      | 33 +++++++++++++++++++++++++++++++
 4 files changed, 65 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 266079a3a646..34d0c4816141 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -1126,6 +1126,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	void *base, *at, *top;
 	int bit;
 	short counts[MAX_PEBS_EVENTS] = {};
+	short error[MAX_PEBS_EVENTS] = {};
 
 	if (!x86_pmu.pebs_active)
 		return;
@@ -1169,20 +1170,33 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 			/* slow path */
 			pebs_status = p->status & cpuc->pebs_enabled;
 			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
-			if (pebs_status != (1 << bit))
+			if (pebs_status != (1 << bit)) {
+				u8 i;
+
+				for_each_set_bit(i, (unsigned long *)&pebs_status,
+						 MAX_PEBS_EVENTS)
+					error[i]++;
 				continue;
+			}
 		}
 		counts[bit]++;
 	}
 
 	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
-		if (counts[bit] == 0)
+		if ((counts[bit] == 0) && (error[bit] == 0))
 			continue;
 		event = cpuc->events[bit];
 		WARN_ON_ONCE(!event);
 		WARN_ON_ONCE(!event->attr.precise_ip);
 
-		__intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]);
+		/* log dropped samples number */
+		if (error[bit])
+			perf_log_lost_samples(event, error[bit]);
+
+		if (counts[bit]) {
+			__intel_pmu_pebs_event(event, iregs, base,
+					       top, bit, counts[bit]);
+		}
 	}
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5f192e1bc98e..a204d5266f5f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -743,6 +743,9 @@ perf_event__output_id_sample(struct perf_event *event,
 			     struct perf_output_handle *handle,
 			     struct perf_sample_data *sample);
 
+extern void
+perf_log_lost_samples(struct perf_event *event, u64 lost);
+
 static inline bool is_sampling_event(struct perf_event *event)
 {
 	return event->attr.sample_period != 0;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c4622f1ce046..613ed9ad588f 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -802,6 +802,18 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_ITRACE_START		= 12,
 
+	/*
+	 * Records the dropped/lost sample number.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *
+	 *	u64				lost;
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_LOST_SAMPLES		= 13,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e499b4e43aff..9e0773d5d110 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5974,6 +5974,39 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
 	perf_output_end(&handle);
 }
 
+/*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				lost;
+	} lost_samples_event = {
+		.header = {
+			.type = PERF_RECORD_LOST_SAMPLES,
+			.misc = 0,
+			.size = sizeof(lost_samples_event),
+		},
+		.lost		= lost,
+	};
+
+	perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+	ret = perf_output_begin(&handle, event,
+				lost_samples_event.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, lost_samples_event);
+	perf_event__output_id_sample(event, &handle, &sample);
+	perf_output_end(&handle);
+}
+
 /*
  * IRQ throttle logging
  */
-- 
cgit v1.2.3


From 7cf7fa529d0b6b514949cc67b39e3ce406c37006 Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Sun, 7 Jun 2015 15:44:23 +0300
Subject: net/mlx5_core: Fix static checker warnings around system guid query
 flow

Fix static checker warnings in the flow of system guid query.

Fixes: 707c4602cda6 ('net/mlx5_core: Add new query HCA vport commands')
Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c    | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/vport.c | 2 +-
 include/linux/mlx5/vport.h                      | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index ba87442ef776..9335e5ae18cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -35,7 +35,8 @@
 #include <linux/module.h>
 #include "mlx5_core.h"
 
-int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev, u32 *out, int outlen)
+static int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev, u32 *out,
+				  int outlen)
 {
 	u32 in[MLX5_ST_SZ_DW(query_adapter_in)];
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 20150ffa8b16..b94177ebcf3a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -307,7 +307,7 @@ ex:
 EXPORT_SYMBOL_GPL(mlx5_query_hca_vport_context);
 
 int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev,
-					   __be64 *sys_image_guid)
+					   u64 *sys_image_guid)
 {
 	struct mlx5_hca_vport_context *rep;
 	int err;
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 67882a834efb..967e0fd06e89 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -48,7 +48,7 @@ int mlx5_query_hca_vport_context(struct mlx5_core_dev *dev,
 				 u16 vf_num,
 				 struct mlx5_hca_vport_context *rep);
 int mlx5_query_hca_vport_system_image_guid(struct mlx5_core_dev *dev,
-					   __be64 *sys_image_guid);
+					   u64 *sys_image_guid);
 int mlx5_query_hca_vport_node_guid(struct mlx5_core_dev *dev,
 				   u64 *node_guid);
 
-- 
cgit v1.2.3


From cb4a316752709be4a644f070440a8be470d92b7d Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 6 Jun 2015 10:02:14 +1000
Subject: cgroup: use bitmask to filter for_each_subsys

Add a new macro for_each_subsys_which that allows all enabled cgroup
subsystems to be filtered by a bitmask, such that mask & (1 << ssid)
determines if the subsystem is to be processed in the loop body (where
ssid is the unique id of the subsystem).

Also replace the need_forkexit_callback with two separate bitmasks for
each callback to make (ss->{fork,exit}) checks unnecessary.

tj: add a short comment for "if (!CGROUP_SUBSYS_COUNT)".

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 include/linux/cgroup-defs.h |  2 ++
 kernel/cgroup.c             | 53 ++++++++++++++++++++++++++++-----------------
 2 files changed, 35 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 26d1cea7929f..c5588c438448 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -490,6 +490,8 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 
 #else	/* CONFIG_CGROUPS */
 
+#define CGROUP_SUBSYS_COUNT 0
+
 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
 static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0fd5227958fe..3a973519068f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -178,12 +178,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
  */
 static u64 css_serial_nr_next = 1;
 
-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+/*
+ * These bitmask flags indicate whether tasks in the fork and exit paths have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
  */
-static int need_forkexit_callback __read_mostly;
+static unsigned long have_fork_callback __read_mostly;
+static unsigned long have_exit_callback __read_mostly;
 
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
@@ -412,6 +413,24 @@ static int notify_on_release(const struct cgroup *cgrp)
 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
 	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 
+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_maskp: a pointer to the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * mask is set to 1.
+ */
+#define for_each_subsys_which(ss, ssid, ss_maskp)			\
+	if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */	\
+		;							\
+	else								\
+		for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)	\
+			if (((ss) = cgroup_subsys[ssid]) && false)	\
+				break;					\
+			else
+
 /* iterate across the hierarchies */
 #define for_each_root(root)						\
 	list_for_each_entry((root), &cgroup_roots, root_list)
@@ -4914,7 +4933,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 	 * init_css_set is in the subsystem's root cgroup. */
 	init_css_set.subsys[ss->id] = css;
 
-	need_forkexit_callback |= ss->fork || ss->exit;
+	have_fork_callback |= (bool)ss->fork << ss->id;
+	have_exit_callback |= (bool)ss->exit << ss->id;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -5225,11 +5245,8 @@ void cgroup_post_fork(struct task_struct *child)
 	 * css_set; otherwise, @child might change state between ->fork()
 	 * and addition to css_set.
 	 */
-	if (need_forkexit_callback) {
-		for_each_subsys(ss, i)
-			if (ss->fork)
-				ss->fork(child);
-	}
+	for_each_subsys_which(ss, i, &have_fork_callback)
+		ss->fork(child);
 }
 
 /**
@@ -5273,16 +5290,12 @@ void cgroup_exit(struct task_struct *tsk)
 	cset = task_css_set(tsk);
 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
-	if (need_forkexit_callback) {
-		/* see cgroup_post_fork() for details */
-		for_each_subsys(ss, i) {
-			if (ss->exit) {
-				struct cgroup_subsys_state *old_css = cset->subsys[i];
-				struct cgroup_subsys_state *css = task_css(tsk, i);
+	/* see cgroup_post_fork() for details */
+	for_each_subsys_which(ss, i, &have_exit_callback) {
+		struct cgroup_subsys_state *old_css = cset->subsys[i];
+		struct cgroup_subsys_state *css = task_css(tsk, i);
 
-				ss->exit(css, old_css, tsk);
-			}
-		}
+		ss->exit(css, old_css, tsk);
 	}
 
 	if (put_cset)
-- 
cgit v1.2.3


From 3846c15820a1841225d0245afda4875af23dfbbe Mon Sep 17 00:00:00 2001
From: Peter Jones <pjones@redhat.com>
Date: Fri, 5 Jun 2015 15:14:54 -0400
Subject: efi: Work around ia64 build problem with ESRT driver

So, I'm told this problem exists in the world:

 > Subject: Build error in -next due to 'efi: Add esrt support'
 >
 > Building ia64:defconfig ... failed
 > --------------
 > Error log:
 >
 > drivers/firmware/efi/esrt.c:28:31: fatal error: asm/early_ioremap.h: No such file or directory
 >

I'm not really sure how it's okay that we have things in asm-generic on
some platforms but not others - is having it the same everywhere not the
whole point of asm-generic?

That said, ia64 doesn't have early_ioremap.h .  So instead, since it's
difficult to imagine new IA64 machines with UEFI 2.5, just don't build
this code there.

To me this looks like a workaround - doing something like:

generic-y += early_ioremap.h

in arch/ia64/include/asm/Kbuild would appear to be more correct, but
ia64 has its own early_memremap() decl in arch/ia64/include/asm/io.h ,
and it's a macro.  So adding the above /and/ requiring that asm/io.h be
included /after/ asm/early_ioremap.h in all cases would fix it, but
that's pretty ugly as well.  Since I'm not going to spend the rest of my
life rectifying ia64 headers vs "generic" headers that aren't generic,
it's much simpler to just not build there.

Note that I've only actually tried to build this patch on x86_64, but
esrt.o still gets built there, and that would seem to demonstrate that
the conditional building is working correctly at all the places the code
built before.  I no longer have any ia64 machines handy to test that the
exclusion actually works there.

Signed-off-by: Peter Jones <pjones@redhat.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
(Compile-)Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 drivers/firmware/efi/Kconfig  | 5 +++++
 drivers/firmware/efi/Makefile | 3 ++-
 include/linux/efi.h           | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index 8de4da5c9ab6..54071c148340 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -18,6 +18,11 @@ config EFI_VARS
 	  Subsequent efibootmgr releases may be found at:
 	  <http://github.com/vathpela/efibootmgr>
 
+config EFI_ESRT
+	bool
+	depends on EFI && !IA64
+	default y
+
 config EFI_VARS_PSTORE
 	tristate "Register efivars backend for pstore"
 	depends on EFI_VARS && PSTORE
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index 26eabbc55341..6fd3da938717 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -1,8 +1,9 @@
 #
 # Makefile for linux kernel
 #
-obj-$(CONFIG_EFI)			+= efi.o esrt.o vars.o reboot.o
+obj-$(CONFIG_EFI)			+= efi.o vars.o reboot.o
 obj-$(CONFIG_EFI_VARS)			+= efivars.o
+obj-$(CONFIG_EFI_ESRT)			+= esrt.o
 obj-$(CONFIG_EFI_VARS_PSTORE)		+= efi-pstore.o
 obj-$(CONFIG_UEFI_CPER)			+= cper.o
 obj-$(CONFIG_EFI_RUNTIME_MAP)		+= runtime-map.o
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 024c27e7c0fa..2092965afca3 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -879,7 +879,11 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon
 #endif
 extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
 extern int efi_config_init(efi_config_table_type_t *arch_tables);
+#ifdef CONFIG_EFI_ESRT
 extern void __init efi_esrt_init(void);
+#else
+static inline void efi_esrt_init(void) { }
+#endif
 extern int efi_config_parse_tables(void *config_tables, int count, int sz,
 				   efi_config_table_type_t *arch_tables);
 extern u64 efi_get_iobase (void);
-- 
cgit v1.2.3


From a8077d6573530a91d5674a28cdedbed39c391ff0 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Sun, 7 Jun 2015 13:15:30 +0200
Subject: bcma: make calls to PCI hostmode functions config-safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 include/linux/bcma/bcma_driver_pci.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h
index 5ba6918ca20b..9657f11d48a7 100644
--- a/include/linux/bcma/bcma_driver_pci.h
+++ b/include/linux/bcma/bcma_driver_pci.h
@@ -246,7 +246,18 @@ static inline void bcma_core_pci_power_save(struct bcma_bus *bus, bool up)
 }
 #endif
 
+#ifdef CONFIG_BCMA_DRIVER_PCI_HOSTMODE
 extern int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev);
 extern int bcma_core_pci_plat_dev_init(struct pci_dev *dev);
+#else
+static inline int bcma_core_pci_pcibios_map_irq(const struct pci_dev *dev)
+{
+	return -ENOTSUPP;
+}
+static inline int bcma_core_pci_plat_dev_init(struct pci_dev *dev)
+{
+	return -ENOTSUPP;
+}
+#endif
 
 #endif /* LINUX_BCMA_DRIVER_PCI_H_ */
-- 
cgit v1.2.3


From 01d72a95188880b22190e937ed8718ed4b45bdce Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 4 Jun 2015 16:38:17 -0500
Subject: PCI: Remove unused pci_dma_burst_advice()

pci_dma_burst_advice() was added by e24c2d963a60 ("[PATCH] PCI: DMA
bursting advice") but apparently never used.  Remove it.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Michal Simek <monstr@monstr.eu>	# microblaze
CC: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/pci.h       | 16 ----------------
 arch/arm/include/asm/pci.h         | 10 ----------
 arch/frv/include/asm/pci.h         | 10 ----------
 arch/ia64/include/asm/pci.h        | 19 -------------------
 arch/microblaze/include/asm/pci.h  | 10 ----------
 arch/mips/include/asm/pci.h        | 10 ----------
 arch/parisc/include/asm/pci.h      | 19 -------------------
 arch/powerpc/include/asm/pci.h     | 30 ------------------------------
 arch/sh/include/asm/pci.h          | 18 ------------------
 arch/sparc/include/asm/pci_32.h    | 10 ----------
 arch/sparc/include/asm/pci_64.h    | 19 -------------------
 arch/unicore32/include/asm/pci.h   | 10 ----------
 arch/x86/include/asm/pci.h         |  7 -------
 drivers/net/ethernet/sun/cassini.c |  1 -
 include/linux/pci.h                | 11 -----------
 15 files changed, 200 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h
index f7f680f7457d..8b02afeb6319 100644
--- a/arch/alpha/include/asm/pci.h
+++ b/arch/alpha/include/asm/pci.h
@@ -71,22 +71,6 @@ extern void pcibios_set_master(struct pci_dev *dev);
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_BOUNDARY;
-	*strategy_parameter = cacheline_size;
-}
 #endif
 
 /* TODO: integrate with include/asm-generic/pci.h ? */
diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h
index 585dc33a7a24..a5635444ca41 100644
--- a/arch/arm/include/asm/pci.h
+++ b/arch/arm/include/asm/pci.h
@@ -31,16 +31,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
  */
 #define PCI_DMA_BUS_IS_PHYS     (1)
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                                enum pci_mmap_state mmap_state, int write_combine);
diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h
index 2035a4d3f9b9..a6d4ed042c70 100644
--- a/arch/frv/include/asm/pci.h
+++ b/arch/frv/include/asm/pci.h
@@ -41,16 +41,6 @@ extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
 /* Return the index of the PCI controller for device PDEV. */
 #define pci_controller_num(PDEV)	(0)
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 /*
  *	These are pretty much arbitrary with the CoMEM implementation.
  *	We have the whole address space to ourselves.
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index 32ea19ac5713..b897fae1f0ca 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -52,25 +52,6 @@ extern unsigned long ia64_max_iommu_merge_mask;
 
 #include <asm-generic/pci-dma-compat.h>
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_MULTIPLE;
-	*strategy_parameter = cacheline_size;
-}
-#endif
-
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma,
 				enum pci_mmap_state mmap_state, int write_combine);
diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index 95b03886e0a8..fdf2e75d7033 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -44,16 +44,6 @@ struct pci_dev;
  */
 #define pcibios_assign_all_busses()	0
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 extern int pci_domain_nr(struct pci_bus *bus);
 
 /* Decide whether to display the domain number in /proc */
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index d9692993fc83..70dcc5498128 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -113,16 +113,6 @@ struct pci_dev;
  */
 extern unsigned int PCI_DMA_BUS_IS_PHYS;
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 #ifdef CONFIG_PCI_DOMAINS
 #define pci_domain_nr(bus) ((struct pci_controller *)(bus)->sysdata)->index
 
diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h
index 20df2b04fc09..bf5e044281d6 100644
--- a/arch/parisc/include/asm/pci.h
+++ b/arch/parisc/include/asm/pci.h
@@ -196,25 +196,6 @@ static inline void pcibios_register_hba(struct pci_hba_data *x)
 /* export the pci_ DMA API in terms of the dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_MULTIPLE;
-	*strategy_parameter = cacheline_size;
-}
-#endif
-
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 {
 	return channel ? 15 : 14;
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 4aef8d660999..99dc432b256a 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -71,36 +71,6 @@ extern struct dma_map_ops *get_pci_dma_ops(void);
  */
 #define PCI_DISABLE_MWI
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_MULTIPLE;
-	*strategy_parameter = cacheline_size;
-}
-#endif
-
-#else /* 32-bit */
-
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
 #endif /* CONFIG_PPC64 */
 
 extern int pci_domain_nr(struct pci_bus *bus);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5b4511552998..e343dbd02e41 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -86,24 +86,6 @@ extern void pcibios_set_master(struct pci_dev *dev);
  * direct memory write.
  */
 #define PCI_DISABLE_MWI
-
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-
-	if (byte == 0)
-		cacheline_size = L1_CACHE_BYTES;
-	else
-		cacheline_size = byte << 2;
-
-	*strat = PCI_DMA_BURST_MULTIPLE;
-	*strategy_parameter = cacheline_size;
-}
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h
index 53e9b4987db0..b7c092df3134 100644
--- a/arch/sparc/include/asm/pci_32.h
+++ b/arch/sparc/include/asm/pci_32.h
@@ -22,16 +22,6 @@
 
 struct pci_dev;
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 #endif /* __KERNEL__ */
 
 #ifndef CONFIG_LEON_PCI
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index bd00a6226169..022d16008a00 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -31,25 +31,6 @@
 #define PCI64_REQUIRED_MASK	(~(u64)0)
 #define PCI64_ADDR_BASE		0xfffc000000000000UL
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_BOUNDARY;
-	*strategy_parameter = cacheline_size;
-}
-#endif
-
 /* Return the index of the PCI controller for device PDEV. */
 
 int pci_domain_nr(struct pci_bus *bus);
diff --git a/arch/unicore32/include/asm/pci.h b/arch/unicore32/include/asm/pci.h
index 654407e98619..38b3f3785c3c 100644
--- a/arch/unicore32/include/asm/pci.h
+++ b/arch/unicore32/include/asm/pci.h
@@ -18,16 +18,6 @@
 #include <asm-generic/pci.h>
 #include <mach/hardware.h> /* for PCIBIOS_MIN_* */
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	enum pci_mmap_state mmap_state, int write_combine);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 4e370a5d8117..e51c229a29a4 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -80,13 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 
 #ifdef CONFIG_PCI
 extern void early_quirks(void);
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
 #else
 static inline void early_quirks(void) { }
 #endif
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index 3dc1f68b322d..6ce973187225 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -3058,7 +3058,6 @@ static void cas_init_mac(struct cas *cp)
 	/* setup core arbitration weight register */
 	writel(CAWR_RR_DIS, cp->regs + REG_CAWR);
 
-	/* XXX Use pci_dma_burst_advice() */
 #if !defined(CONFIG_SPARC64) && !defined(CONFIG_ALPHA)
 	/* set the infinite burst register for chips that don't have
 	 * pci issues.
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 353db8dc4c6e..08fb4e307d68 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1197,15 +1197,6 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
 #define	pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
 #define	pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
 
-enum pci_dma_burst_strategy {
-	PCI_DMA_BURST_INFINITY,	/* make bursts as large as possible,
-				   strategy_parameter is N/A */
-	PCI_DMA_BURST_BOUNDARY, /* disconnect at every strategy_parameter
-				   byte boundaries */
-	PCI_DMA_BURST_MULTIPLE, /* disconnect at some multiple of
-				   strategy_parameter byte boundaries */
-};
-
 struct msix_entry {
 	u32	vector;	/* kernel uses to write allocated vector */
 	u16	entry;	/* driver uses to specify entry, OS writes */
@@ -1430,8 +1421,6 @@ static inline int pci_request_regions(struct pci_dev *dev, const char *res_name)
 { return -EIO; }
 static inline void pci_release_regions(struct pci_dev *dev) { }
 
-#define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0)
-
 static inline void pci_block_cfg_access(struct pci_dev *dev) { }
 static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev)
 { return 0; }
-- 
cgit v1.2.3


From d711b8b30c803b1b2aedf6a3474758798078f9e1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 6 Jun 2015 11:30:00 +0200
Subject: hrtimers: Make sure hrtimer_resolution is unsigned int
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

... in the !CONFIG_HIGH_RES_TIMERS case too. And thus fix warnings like
this one:

net/sched/sch_api.c: In function ‘psched_show’:
net/sched/sch_api.c:1891:6: warning: format ‘%x’ expects argument of type ‘unsigned int’, but argument 6 has type ‘long int’ [-Wformat=]
      (u32)NSEC_PER_SEC / hrtimer_resolution);

Signed-off-by: Borislav Petkov <bp@suse.de>
Link: http://lkml.kernel.org/r/1433583000-32090-1-git-send-email-bp@alien8.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 470d876c2eda..3f82a7edc03d 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -309,7 +309,7 @@ extern unsigned int hrtimer_resolution;
 # define MONOTONIC_RES_NSEC	LOW_RES_NSEC
 # define KTIME_MONOTONIC_RES	KTIME_LOW_RES
 
-#define hrtimer_resolution	LOW_RES_NSEC
+#define hrtimer_resolution	(unsigned int)LOW_RES_NSEC
 
 static inline void hrtimer_peek_ahead_timers(void) { }
 
-- 
cgit v1.2.3


From 2c1296d92ac0367364bcb73a43c12a0bdfbfee75 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 28 May 2015 18:41:32 +0200
Subject: iommu: Add iommu_get_domain_for_dev function

This function can be used to request the current domain a
device is attached to.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 18 ++++++++++++++++++
 include/linux/iommu.h |  6 ++++++
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 7bce522c2367..a0a38bd2668b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1076,6 +1076,24 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(iommu_detach_device);
 
+struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
+{
+	struct iommu_domain *domain;
+	struct iommu_group *group;
+
+	group = iommu_group_get(dev);
+	/* FIXME: Remove this when groups a mandatory for iommu drivers */
+	if (group == NULL)
+		return NULL;
+
+	domain = group->domain;
+
+	iommu_group_put(group);
+
+	return domain;
+}
+EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev);
+
 /*
  * IOMMU groups are really the natrual working unit of the IOMMU, but
  * the IOMMU API works on domains and devices.  Bridge that gap by
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0546b8710ce3..683a1c4b15e7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -193,6 +193,7 @@ extern int iommu_attach_device(struct iommu_domain *domain,
 			       struct device *dev);
 extern void iommu_detach_device(struct iommu_domain *domain,
 				struct device *dev);
+extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
 		     phys_addr_t paddr, size_t size, int prot);
 extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova,
@@ -332,6 +333,11 @@ static inline void iommu_detach_device(struct iommu_domain *domain,
 {
 }
 
+static inline struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
+{
+	return NULL;
+}
+
 static inline int iommu_map(struct iommu_domain *domain, unsigned long iova,
 			    phys_addr_t paddr, int gfp_order, int prot)
 {
-- 
cgit v1.2.3


From a1015c2b99b94cf521603b41debf167114031456 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 28 May 2015 18:41:33 +0200
Subject: iommu: Introduce direct mapped region handling

Add two new functions to the IOMMU-API to allow the IOMMU
drivers to export the requirements for direct mapped regions
per device.
This is useful for exporting the information in Intel VT-d's
RMRR entries or AMD-Vi's unity mappings.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 16 ++++++++++++++++
 include/linux/iommu.h | 31 +++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index a0a38bd2668b..6b8d6e7771e1 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1469,3 +1469,19 @@ int iommu_domain_set_attr(struct iommu_domain *domain,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_domain_set_attr);
+
+void iommu_get_dm_regions(struct device *dev, struct list_head *list)
+{
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+	if (ops && ops->get_dm_regions)
+		ops->get_dm_regions(dev, list);
+}
+
+void iommu_put_dm_regions(struct device *dev, struct list_head *list)
+{
+	const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+	if (ops && ops->put_dm_regions)
+		ops->put_dm_regions(dev, list);
+}
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 683a1c4b15e7..689499904166 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -114,6 +114,20 @@ enum iommu_attr {
 	DOMAIN_ATTR_MAX,
 };
 
+/**
+ * struct iommu_dm_region - descriptor for a direct mapped memory region
+ * @list: Linked list pointers
+ * @start: System physical start address of the region
+ * @length: Length of the region in bytes
+ * @prot: IOMMU Protection flags (READ/WRITE/...)
+ */
+struct iommu_dm_region {
+	struct list_head	list;
+	phys_addr_t		start;
+	size_t			length;
+	int			prot;
+};
+
 #ifdef CONFIG_IOMMU_API
 
 /**
@@ -159,6 +173,10 @@ struct iommu_ops {
 	int (*domain_set_attr)(struct iommu_domain *domain,
 			       enum iommu_attr attr, void *data);
 
+	/* Request/Free a list of direct mapping requirements for a device */
+	void (*get_dm_regions)(struct device *dev, struct list_head *list);
+	void (*put_dm_regions)(struct device *dev, struct list_head *list);
+
 	/* Window handling functions */
 	int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr,
 				    phys_addr_t paddr, u64 size, int prot);
@@ -205,6 +223,9 @@ extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t io
 extern void iommu_set_fault_handler(struct iommu_domain *domain,
 			iommu_fault_handler_t handler, void *token);
 
+extern void iommu_get_dm_regions(struct device *dev, struct list_head *list);
+extern void iommu_put_dm_regions(struct device *dev, struct list_head *list);
+
 extern int iommu_attach_group(struct iommu_domain *domain,
 			      struct iommu_group *group);
 extern void iommu_detach_group(struct iommu_domain *domain,
@@ -379,6 +400,16 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain,
 {
 }
 
+static inline void iommu_get_dm_regions(struct device *dev,
+					struct list_head *list)
+{
+}
+
+static inline void iommu_put_dm_regions(struct device *dev,
+					struct list_head *list)
+{
+}
+
 static inline int iommu_attach_group(struct iommu_domain *domain,
 				     struct iommu_group *group)
 {
-- 
cgit v1.2.3


From 6827ca83695d5e41ad31b0719788ee65f00ca4b3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 28 May 2015 18:41:35 +0200
Subject: iommu: Add function to query the default domain of a group

This will be used to handle unity mappings in the iommu
drivers.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 5 +++++
 include/linux/iommu.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ffad1eaf450d..224c6dd2d249 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -837,6 +837,11 @@ struct iommu_group *iommu_group_get_for_dev(struct device *dev)
 	return group;
 }
 
+struct iommu_domain *iommu_group_default_domain(struct iommu_group *group)
+{
+	return group->default_domain;
+}
+
 static int add_iommu_group(struct device *dev, void *data)
 {
 	struct iommu_callback_data *cb = data;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 689499904166..b944b2be4fa2 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -249,6 +249,7 @@ extern int iommu_group_unregister_notifier(struct iommu_group *group,
 					   struct notifier_block *nb);
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_group *iommu_group_get_for_dev(struct device *dev);
+extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
 
 extern int iommu_domain_get_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
-- 
cgit v1.2.3


From 5179f0ce2f96f155e3bda93b3b82f912dbaddad2 Mon Sep 17 00:00:00 2001
From: Steve Twiss <stwiss.opensource@diasemi.com>
Date: Mon, 8 Jun 2015 16:26:20 -0700
Subject: Input: add OnKey driver for DA9063 MFD part

This adds OnKey driver support for DA9063.

Signed-off-by: Steve Twiss <stwiss.opensource@diasemi.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/misc/Kconfig        |  10 ++
 drivers/input/misc/Makefile       |   1 +
 drivers/input/misc/da9063_onkey.c | 226 ++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/da9063/pdata.h  |   1 +
 4 files changed, 238 insertions(+)
 create mode 100644 drivers/input/misc/da9063_onkey.c

(limited to 'include/linux')

diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
index e5c4de279001..d4f0a817e858 100644
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -610,6 +610,16 @@ config INPUT_DA9055_ONKEY
 	  To compile this driver as a module, choose M here: the module
 	  will be called da9055_onkey.
 
+config INPUT_DA9063_ONKEY
+	tristate "Dialog DA9063 OnKey"
+	depends on MFD_DA9063
+	help
+	  Support the ONKEY of Dialog DA9063 Power Management IC as an
+	  input device reporting power button statue.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called da9063_onkey.
+
 config INPUT_DM355EVM
 	tristate "TI DaVinci DM355 EVM Keypad and IR Remote"
 	depends on MFD_DM355EVM_MSP
diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile
index d3179472474d..53df07dcc23c 100644
--- a/drivers/input/misc/Makefile
+++ b/drivers/input/misc/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_INPUT_CMA3000_I2C)		+= cma3000_d0x_i2c.o
 obj-$(CONFIG_INPUT_COBALT_BTNS)		+= cobalt_btns.o
 obj-$(CONFIG_INPUT_DA9052_ONKEY)	+= da9052_onkey.o
 obj-$(CONFIG_INPUT_DA9055_ONKEY)	+= da9055_onkey.o
+obj-$(CONFIG_INPUT_DA9063_ONKEY)	+= da9063_onkey.o
 obj-$(CONFIG_INPUT_DM355EVM)		+= dm355evm_keys.o
 obj-$(CONFIG_INPUT_E3X0_BUTTON)		+= e3x0-button.o
 obj-$(CONFIG_INPUT_DRV260X_HAPTICS)	+= drv260x.o
diff --git a/drivers/input/misc/da9063_onkey.c b/drivers/input/misc/da9063_onkey.c
new file mode 100644
index 000000000000..f577585ef999
--- /dev/null
+++ b/drivers/input/misc/da9063_onkey.c
@@ -0,0 +1,226 @@
+/*
+ * OnKey device driver for DA9063
+ * Copyright (C) 2015  Dialog Semiconductor Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/workqueue.h>
+#include <linux/regmap.h>
+#include <linux/of.h>
+#include <linux/mfd/da9063/core.h>
+#include <linux/mfd/da9063/pdata.h>
+#include <linux/mfd/da9063/registers.h>
+
+struct da9063_onkey {
+	struct da9063 *hw;
+	struct delayed_work work;
+	struct input_dev *input;
+	struct device *dev;
+	bool key_power;
+};
+
+static void da9063_poll_on(struct work_struct *work)
+{
+	struct da9063_onkey *onkey = container_of(work, struct da9063_onkey,
+						  work.work);
+	unsigned int val;
+	int fault_log = 0;
+	bool poll = true;
+	int error;
+
+	/* Poll to see when the pin is released */
+	error = regmap_read(onkey->hw->regmap, DA9063_REG_STATUS_A, &val);
+	if (error) {
+		dev_err(onkey->dev,
+			"Failed to read ON status: %d\n", error);
+		goto err_poll;
+	}
+
+	if (!(val & DA9063_NONKEY)) {
+		error = regmap_update_bits(onkey->hw->regmap,
+					   DA9063_REG_CONTROL_B,
+					   DA9063_NONKEY_LOCK, 0);
+		if (error) {
+			dev_err(onkey->dev,
+				"Failed to reset the Key Delay %d\n", error);
+			goto err_poll;
+		}
+
+		input_report_key(onkey->input, KEY_POWER, 0);
+		input_sync(onkey->input);
+
+		poll = false;
+	}
+
+	/*
+	 * If the fault log KEY_RESET is detected, then clear it
+	 * and shut down the system.
+	 */
+	error = regmap_read(onkey->hw->regmap,
+			    DA9063_REG_FAULT_LOG, &fault_log);
+	if (error) {
+		dev_warn(&onkey->input->dev,
+			 "Cannot read FAULT_LOG: %d\n", error);
+	} else if (fault_log & DA9063_KEY_RESET) {
+		error = regmap_write(onkey->hw->regmap,
+				     DA9063_REG_FAULT_LOG,
+				     DA9063_KEY_RESET);
+		if (error) {
+			dev_warn(&onkey->input->dev,
+				 "Cannot reset KEY_RESET fault log: %d\n",
+				 error);
+		} else {
+			/* at this point we do any S/W housekeeping
+			 * and then send shutdown command
+			 */
+			dev_dbg(&onkey->input->dev,
+				 "Sending SHUTDOWN to DA9063 ...\n");
+			error = regmap_write(onkey->hw->regmap,
+					     DA9063_REG_CONTROL_F,
+					     DA9063_SHUTDOWN);
+			if (error)
+				dev_err(&onkey->input->dev,
+					"Cannot SHUTDOWN DA9063: %d\n",
+					error);
+		}
+	}
+
+err_poll:
+	if (poll)
+		schedule_delayed_work(&onkey->work, msecs_to_jiffies(50));
+}
+
+static irqreturn_t da9063_onkey_irq_handler(int irq, void *data)
+{
+	struct da9063_onkey *onkey = data;
+	unsigned int val;
+	int error;
+
+	error = regmap_read(onkey->hw->regmap, DA9063_REG_STATUS_A, &val);
+	if (onkey->key_power && !error && (val & DA9063_NONKEY)) {
+		input_report_key(onkey->input, KEY_POWER, 1);
+		input_sync(onkey->input);
+		schedule_delayed_work(&onkey->work, 0);
+		dev_dbg(onkey->dev, "KEY_POWER pressed.\n");
+	} else {
+		input_report_key(onkey->input, KEY_SLEEP, 1);
+		input_sync(onkey->input);
+		input_report_key(onkey->input, KEY_SLEEP, 0);
+		input_sync(onkey->input);
+		dev_dbg(onkey->dev, "KEY_SLEEP pressed.\n");
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void da9063_cancel_poll(void *data)
+{
+	struct da9063_onkey *onkey = data;
+
+	cancel_delayed_work_sync(&onkey->work);
+}
+
+static int da9063_onkey_probe(struct platform_device *pdev)
+{
+	struct da9063 *da9063 = dev_get_drvdata(pdev->dev.parent);
+	struct da9063_pdata *pdata = dev_get_platdata(da9063->dev);
+	struct da9063_onkey *onkey;
+	int irq;
+	int error;
+
+	onkey = devm_kzalloc(&pdev->dev, sizeof(struct da9063_onkey),
+			     GFP_KERNEL);
+	if (!onkey) {
+		dev_err(&pdev->dev, "Failed to allocate memory.\n");
+		return -ENOMEM;
+	}
+
+	onkey->dev = &pdev->dev;
+	onkey->hw = da9063;
+
+	if (pdata)
+		onkey->key_power = pdata->key_power;
+	else
+		onkey->key_power =
+			!of_property_read_bool(pdev->dev.of_node,
+					       "dlg,disable-key-power");
+
+	onkey->input = devm_input_allocate_device(&pdev->dev);
+	if (!onkey->input) {
+		dev_err(&pdev->dev, "Failed to allocated input device.\n");
+		return -ENOMEM;
+	}
+
+	onkey->input->name = DA9063_DRVNAME_ONKEY;
+	onkey->input->phys = DA9063_DRVNAME_ONKEY "/input0";
+	onkey->input->dev.parent = &pdev->dev;
+
+	if (onkey->key_power)
+		input_set_capability(onkey->input, EV_KEY, KEY_POWER);
+
+	input_set_capability(onkey->input, EV_KEY, KEY_SLEEP);
+
+	INIT_DELAYED_WORK(&onkey->work, da9063_poll_on);
+
+	error = devm_add_action(&pdev->dev, da9063_cancel_poll, onkey);
+	if (error) {
+		dev_err(&pdev->dev,
+			"Failed to add cancel poll action: %d\n",
+			error);
+		return error;
+	}
+
+	irq = platform_get_irq_byname(pdev, "ONKEY");
+	if (irq < 0) {
+		error = irq;
+		dev_err(&pdev->dev, "Failed to get platform IRQ: %d\n", error);
+		return error;
+	}
+
+	error = devm_request_threaded_irq(&pdev->dev, irq,
+					  NULL, da9063_onkey_irq_handler,
+					  IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+					  "ONKEY", onkey);
+	if (error) {
+		dev_err(&pdev->dev,
+			"Failed to request IRQ %d: %d\n", irq, error);
+		return error;
+	}
+
+	error = input_register_device(onkey->input);
+	if (error) {
+		dev_err(&pdev->dev,
+			"Failed to register input device: %d\n", error);
+		return error;
+	}
+
+	platform_set_drvdata(pdev, onkey);
+	return 0;
+}
+
+static struct platform_driver da9063_onkey_driver = {
+	.probe	= da9063_onkey_probe,
+	.driver	= {
+		.name	= DA9063_DRVNAME_ONKEY,
+	},
+};
+module_platform_driver(da9063_onkey_driver);
+
+MODULE_AUTHOR("S Twiss <stwiss.opensource@diasemi.com>");
+MODULE_DESCRIPTION("Onkey device driver for Dialog DA9063");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:" DA9063_DRVNAME_ONKEY);
diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h
index 95c8742215a7..612383bd80ae 100644
--- a/include/linux/mfd/da9063/pdata.h
+++ b/include/linux/mfd/da9063/pdata.h
@@ -103,6 +103,7 @@ struct da9063;
 struct da9063_pdata {
 	int				(*init)(struct da9063 *da9063);
 	int				irq_base;
+	bool				key_power;
 	unsigned			flags;
 	struct da9063_regulators_pdata	*regulators_pdata;
 	struct led_platform_data	*leds_pdata;
-- 
cgit v1.2.3


From b1999477ed91c3c33891acfe0e18a4457e5c4915 Mon Sep 17 00:00:00 2001
From: Guo Zeng <Guo.Zeng@csr.com>
Date: Tue, 14 Apr 2015 11:55:55 +0000
Subject: ARM: prima2: move to use REGMAP APIs for rtciobrg

all devices behind rtciobrg needs a special way to access. currently they
are using a platform-specific API.
this patch moves to REGMAP, then clients can use regmap APIs to read/write.
for the moment, old APIs are still kept, once all clients move to regmap,
old APIs will be dropped.

this patch also does minor clean for comments, authors statement.

Signed-off-by: Guo Zeng <Guo.Zeng@csr.com>
Signed-off-by: Barry Song <Baohua.Song@csr.com>
---
 arch/arm/mach-prima2/Kconfig         |  1 +
 arch/arm/mach-prima2/rtciobrg.c      | 48 +++++++++++++++++++++++++++++++++---
 include/linux/rtc/sirfsoc_rtciobrg.h |  4 +++
 3 files changed, 50 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-prima2/Kconfig b/arch/arm/mach-prima2/Kconfig
index e03d8b5c9ad0..9ab8932403e5 100644
--- a/arch/arm/mach-prima2/Kconfig
+++ b/arch/arm/mach-prima2/Kconfig
@@ -4,6 +4,7 @@ menuconfig ARCH_SIRF
 	select ARCH_REQUIRE_GPIOLIB
 	select GENERIC_IRQ_CHIP
 	select NO_IOPORT_MAP
+	select REGMAP
 	select PINCTRL
 	select PINCTRL_SIRF
 	help
diff --git a/arch/arm/mach-prima2/rtciobrg.c b/arch/arm/mach-prima2/rtciobrg.c
index 8f66d8f7ca75..d4852d24dc7d 100644
--- a/arch/arm/mach-prima2/rtciobrg.c
+++ b/arch/arm/mach-prima2/rtciobrg.c
@@ -1,5 +1,5 @@
 /*
- * RTC I/O Bridge interfaces for CSR SiRFprimaII
+ * RTC I/O Bridge interfaces for CSR SiRFprimaII/atlas7
  * ARM access the registers of SYSRTC, GPSRTC and PWRC through this module
  *
  * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/io.h>
+#include <linux/regmap.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_device.h>
@@ -66,6 +67,7 @@ u32 sirfsoc_rtc_iobrg_readl(u32 addr)
 {
 	unsigned long flags, val;
 
+	/* TODO: add hwspinlock to sync with M3 */
 	spin_lock_irqsave(&rtciobrg_lock, flags);
 
 	val = __sirfsoc_rtc_iobrg_readl(addr);
@@ -90,6 +92,7 @@ void sirfsoc_rtc_iobrg_writel(u32 val, u32 addr)
 {
 	unsigned long flags;
 
+	 /* TODO: add hwspinlock to sync with M3 */
 	spin_lock_irqsave(&rtciobrg_lock, flags);
 
 	sirfsoc_rtc_iobrg_pre_writel(val, addr);
@@ -102,6 +105,45 @@ void sirfsoc_rtc_iobrg_writel(u32 val, u32 addr)
 }
 EXPORT_SYMBOL_GPL(sirfsoc_rtc_iobrg_writel);
 
+
+static int regmap_iobg_regwrite(void *context, unsigned int reg,
+				   unsigned int val)
+{
+	sirfsoc_rtc_iobrg_writel(val, reg);
+	return 0;
+}
+
+static int regmap_iobg_regread(void *context, unsigned int reg,
+				  unsigned int *val)
+{
+	*val = (u32)sirfsoc_rtc_iobrg_readl(reg);
+	return 0;
+}
+
+static struct regmap_bus regmap_iobg = {
+	.reg_write = regmap_iobg_regwrite,
+	.reg_read = regmap_iobg_regread,
+};
+
+/**
+ * devm_regmap_init_iobg(): Initialise managed register map
+ *
+ * @iobg: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+struct regmap *devm_regmap_init_iobg(struct device *dev,
+				    const struct regmap_config *config)
+{
+	const struct regmap_bus *bus = &regmap_iobg;
+
+	return devm_regmap_init(dev, bus, dev, config);
+}
+EXPORT_SYMBOL_GPL(devm_regmap_init_iobg);
+
 static const struct of_device_id rtciobrg_ids[] = {
 	{ .compatible = "sirf,prima2-rtciobg" },
 	{}
@@ -132,7 +174,7 @@ static int __init sirfsoc_rtciobrg_init(void)
 }
 postcore_initcall(sirfsoc_rtciobrg_init);
 
-MODULE_AUTHOR("Zhiwu Song <zhiwu.song@csr.com>, "
-		"Barry Song <baohua.song@csr.com>");
+MODULE_AUTHOR("Zhiwu Song <zhiwu.song@csr.com>");
+MODULE_AUTHOR("Barry Song <baohua.song@csr.com>");
 MODULE_DESCRIPTION("CSR SiRFprimaII rtc io bridge");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/rtc/sirfsoc_rtciobrg.h b/include/linux/rtc/sirfsoc_rtciobrg.h
index 2c92e1c8e055..aefd997262e4 100644
--- a/include/linux/rtc/sirfsoc_rtciobrg.h
+++ b/include/linux/rtc/sirfsoc_rtciobrg.h
@@ -9,10 +9,14 @@
 #ifndef _SIRFSOC_RTC_IOBRG_H_
 #define _SIRFSOC_RTC_IOBRG_H_
 
+struct regmap_config;
+
 extern void sirfsoc_rtc_iobrg_besyncing(void);
 
 extern u32 sirfsoc_rtc_iobrg_readl(u32 addr);
 
 extern void sirfsoc_rtc_iobrg_writel(u32 val, u32 addr);
+struct regmap *devm_regmap_init_iobg(struct device *dev,
+				    const struct regmap_config *config);
 
 #endif
-- 
cgit v1.2.3


From ae60d6a0e3a9197d37f8c8c4584a8ecd18518cd6 Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Thu, 28 May 2015 19:09:55 +0200
Subject: time: Refactor usecs_to_jiffies

Refactor the usecs_to_jiffies conditional code part in time.c and
jiffies.h putting it into conditional functions rather than #ifdefs
to improve readability. This is analogous to the msecs_to_jiffies()
cleanup in commit ca42aaf0c861 ("time: Refactor msecs_to_jiffies")

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Cc: Masahiro Yamada <yamada.m@jp.panasonic.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Paul Turner <pjt@google.com>
Cc: Michal Marek <mmarek@suse.cz>
Link: http://lkml.kernel.org/r/1432832996-12129-1-git-send-email-hofrat@osadl.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/jiffies.h | 27 ++++++++++++++++++++++++++-
 kernel/time/time.c      | 13 +++----------
 2 files changed, 29 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 3bde5eb8568b..a316ebea0a89 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -362,7 +362,32 @@ static inline unsigned long msecs_to_jiffies(const unsigned int m)
 	}
 }
 
-extern unsigned long usecs_to_jiffies(const unsigned int u);
+extern unsigned long __usecs_to_jiffies(const unsigned int u);
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+	return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
+}
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+	return u * (HZ / USEC_PER_SEC);
+}
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+#else
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+	return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
+		>> USEC_TO_HZ_SHR32;
+}
+#endif
+
+static inline unsigned long usecs_to_jiffies(const unsigned int u)
+{
+	return __usecs_to_jiffies(u);
+}
+
 extern unsigned long timespec_to_jiffies(const struct timespec *value);
 extern void jiffies_to_timespec(const unsigned long jiffies,
 				struct timespec *value);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 972e3bbac963..85d5bb1d67eb 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -522,20 +522,13 @@ unsigned long __msecs_to_jiffies(const unsigned int m)
 }
 EXPORT_SYMBOL(__msecs_to_jiffies);
 
-unsigned long usecs_to_jiffies(const unsigned int u)
+unsigned long __usecs_to_jiffies(const unsigned int u)
 {
 	if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
 		return MAX_JIFFY_OFFSET;
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
-	return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
-	return u * (HZ / USEC_PER_SEC);
-#else
-	return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
-		>> USEC_TO_HZ_SHR32;
-#endif
+	return _usecs_to_jiffies(u);
 }
-EXPORT_SYMBOL(usecs_to_jiffies);
+EXPORT_SYMBOL(__usecs_to_jiffies);
 
 /*
  * The TICK_NSEC - 1 rounds up the value to the next resolution.  Note
-- 
cgit v1.2.3


From c569a23d65ac2900d9998d3fe04044fe95be6b2f Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Thu, 28 May 2015 19:09:56 +0200
Subject: time: Allow gcc to fold usecs_to_jiffies(constant)

To allow constant folding in usecs_to_jiffies() conditionally calls
the HZ dependent _usecs_to_jiffies() helpers or, when gcc can not
figure out constant folding, __usecs_to_jiffies, which is the renamed
original usecs_to_jiffies() function.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Cc: Masahiro Yamada <yamada.m@jp.panasonic.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Paul Turner <pjt@google.com>
Cc: Michal Marek <mmarek@suse.cz>
Link: http://lkml.kernel.org/r/1432832996-12129-2-git-send-email-hofrat@osadl.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/jiffies.h | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index a316ebea0a89..535fd3bb1ba8 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -383,9 +383,37 @@ static inline unsigned long _usecs_to_jiffies(const unsigned int u)
 }
 #endif
 
+/**
+ * usecs_to_jiffies: - convert microseconds to jiffies
+ * @u:	time in microseconds
+ *
+ * conversion is done as follows:
+ *
+ * - 'too large' values [that would result in larger than
+ *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ *   the input value by a factor or dividing it with a factor and
+ *   handling any 32-bit overflows as for msecs_to_jiffies.
+ *
+ * usecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __usecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the HZ range specific helpers _usecs_to_jiffies() are called both
+ * directly here and from __msecs_to_jiffies() in the case where
+ * constant folding is not possible.
+ */
 static inline unsigned long usecs_to_jiffies(const unsigned int u)
 {
-	return __usecs_to_jiffies(u);
+	if (__builtin_constant_p(u)) {
+		if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
+			return MAX_JIFFY_OFFSET;
+		return _usecs_to_jiffies(u);
+	} else {
+		return __usecs_to_jiffies(u);
+	}
 }
 
 extern unsigned long timespec_to_jiffies(const struct timespec *value);
-- 
cgit v1.2.3


From ed06aeefdac348cfb91a3db5fe1067e3202afd70 Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Tue, 9 Jun 2015 22:26:05 +0200
Subject: nfc: st-nci: Rename st21nfcb to st-nci

STMicroelectronics NFC NCI chips family is extending
with the new ST21NFCC using the AMS AS39230 RF booster.
The st21nfcb driver is relevant for this solution and
might be with future products.

Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 .../devicetree/bindings/net/nfc/st-nci.txt         |  33 +
 .../devicetree/bindings/net/nfc/st21nfcb.txt       |  33 -
 drivers/nfc/Kconfig                                |   2 +-
 drivers/nfc/Makefile                               |   2 +-
 drivers/nfc/st-nci/Kconfig                         |  23 +
 drivers/nfc/st-nci/Makefile                        |   9 +
 drivers/nfc/st-nci/core.c                          | 179 ++++++
 drivers/nfc/st-nci/i2c.c                           | 385 +++++++++++
 drivers/nfc/st-nci/ndlc.c                          | 313 +++++++++
 drivers/nfc/st-nci/ndlc.h                          |  60 ++
 drivers/nfc/st-nci/st-nci.h                        |  50 ++
 drivers/nfc/st-nci/st-nci_se.c                     | 714 +++++++++++++++++++++
 drivers/nfc/st-nci/st-nci_se.h                     |  61 ++
 drivers/nfc/st21nfcb/Kconfig                       |  22 -
 drivers/nfc/st21nfcb/Makefile                      |   9 -
 drivers/nfc/st21nfcb/i2c.c                         | 384 -----------
 drivers/nfc/st21nfcb/ndlc.c                        | 313 ---------
 drivers/nfc/st21nfcb/ndlc.h                        |  60 --
 drivers/nfc/st21nfcb/st21nfcb.c                    | 179 ------
 drivers/nfc/st21nfcb/st21nfcb.h                    |  50 --
 drivers/nfc/st21nfcb/st21nfcb_se.c                 | 713 --------------------
 drivers/nfc/st21nfcb/st21nfcb_se.h                 |  61 --
 include/linux/platform_data/st-nci.h               |  29 +
 include/linux/platform_data/st21nfcb.h             |  29 -
 include/linux/platform_data/st_nci.h               |  29 +
 25 files changed, 1887 insertions(+), 1855 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/nfc/st-nci.txt
 delete mode 100644 Documentation/devicetree/bindings/net/nfc/st21nfcb.txt
 create mode 100644 drivers/nfc/st-nci/Kconfig
 create mode 100644 drivers/nfc/st-nci/Makefile
 create mode 100644 drivers/nfc/st-nci/core.c
 create mode 100644 drivers/nfc/st-nci/i2c.c
 create mode 100644 drivers/nfc/st-nci/ndlc.c
 create mode 100644 drivers/nfc/st-nci/ndlc.h
 create mode 100644 drivers/nfc/st-nci/st-nci.h
 create mode 100644 drivers/nfc/st-nci/st-nci_se.c
 create mode 100644 drivers/nfc/st-nci/st-nci_se.h
 delete mode 100644 drivers/nfc/st21nfcb/Kconfig
 delete mode 100644 drivers/nfc/st21nfcb/Makefile
 delete mode 100644 drivers/nfc/st21nfcb/i2c.c
 delete mode 100644 drivers/nfc/st21nfcb/ndlc.c
 delete mode 100644 drivers/nfc/st21nfcb/ndlc.h
 delete mode 100644 drivers/nfc/st21nfcb/st21nfcb.c
 delete mode 100644 drivers/nfc/st21nfcb/st21nfcb.h
 delete mode 100644 drivers/nfc/st21nfcb/st21nfcb_se.c
 delete mode 100644 drivers/nfc/st21nfcb/st21nfcb_se.h
 create mode 100644 include/linux/platform_data/st-nci.h
 delete mode 100644 include/linux/platform_data/st21nfcb.h
 create mode 100644 include/linux/platform_data/st_nci.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/nfc/st-nci.txt b/Documentation/devicetree/bindings/net/nfc/st-nci.txt
new file mode 100644
index 000000000000..d707588ed734
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/nfc/st-nci.txt
@@ -0,0 +1,33 @@
+* STMicroelectronics SAS. ST NCI NFC Controller
+
+Required properties:
+- compatible: Should be "st,st21nfcb-i2c" or "st,st21nfcc-i2c".
+- clock-frequency: I²C work frequency.
+- reg: address on the bus
+- interrupt-parent: phandle for the interrupt gpio controller
+- interrupts: GPIO interrupt to which the chip is connected
+- reset-gpios: Output GPIO pin used to reset the ST21NFCB
+
+Optional SoC Specific Properties:
+- pinctrl-names: Contains only one value - "default".
+- pintctrl-0: Specifies the pin control groups used for this controller.
+
+Example (for ARM-based BeagleBoard xM with ST21NFCB on I2C2):
+
+&i2c2 {
+
+	status = "okay";
+
+	st21nfcb: st21nfcb@8 {
+
+		compatible = "st,st21nfcb-i2c";
+
+		reg = <0x08>;
+		clock-frequency = <400000>;
+
+		interrupt-parent = <&gpio5>;
+		interrupts = <2 IRQ_TYPE_LEVEL_HIGH>;
+
+		reset-gpios = <&gpio5 29 GPIO_ACTIVE_HIGH>;
+	};
+};
diff --git a/Documentation/devicetree/bindings/net/nfc/st21nfcb.txt b/Documentation/devicetree/bindings/net/nfc/st21nfcb.txt
deleted file mode 100644
index bb237072dbe9..000000000000
--- a/Documentation/devicetree/bindings/net/nfc/st21nfcb.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-* STMicroelectronics SAS. ST21NFCB NFC Controller
-
-Required properties:
-- compatible: Should be "st,st21nfcb-i2c".
-- clock-frequency: I²C work frequency.
-- reg: address on the bus
-- interrupt-parent: phandle for the interrupt gpio controller
-- interrupts: GPIO interrupt to which the chip is connected
-- reset-gpios: Output GPIO pin used to reset the ST21NFCB
-
-Optional SoC Specific Properties:
-- pinctrl-names: Contains only one value - "default".
-- pintctrl-0: Specifies the pin control groups used for this controller.
-
-Example (for ARM-based BeagleBoard xM with ST21NFCB on I2C2):
-
-&i2c2 {
-
-	status = "okay";
-
-	st21nfcb: st21nfcb@8 {
-
-		compatible = "st,st21nfcb-i2c";
-
-		reg = <0x08>;
-		clock-frequency = <400000>;
-
-		interrupt-parent = <&gpio5>;
-		interrupts = <2 IRQ_TYPE_LEVEL_HIGH>;
-
-		reset-gpios = <&gpio5 29 GPIO_ACTIVE_HIGH>;
-	};
-};
diff --git a/drivers/nfc/Kconfig b/drivers/nfc/Kconfig
index 107714e4405f..722673cb785b 100644
--- a/drivers/nfc/Kconfig
+++ b/drivers/nfc/Kconfig
@@ -72,6 +72,6 @@ source "drivers/nfc/pn544/Kconfig"
 source "drivers/nfc/microread/Kconfig"
 source "drivers/nfc/nfcmrvl/Kconfig"
 source "drivers/nfc/st21nfca/Kconfig"
-source "drivers/nfc/st21nfcb/Kconfig"
+source "drivers/nfc/st-nci/Kconfig"
 source "drivers/nfc/nxp-nci/Kconfig"
 endmenu
diff --git a/drivers/nfc/Makefile b/drivers/nfc/Makefile
index 13b648baf175..368b6dfe71b3 100644
--- a/drivers/nfc/Makefile
+++ b/drivers/nfc/Makefile
@@ -12,5 +12,5 @@ obj-$(CONFIG_NFC_PORT100)	+= port100.o
 obj-$(CONFIG_NFC_MRVL)		+= nfcmrvl/
 obj-$(CONFIG_NFC_TRF7970A)	+= trf7970a.o
 obj-$(CONFIG_NFC_ST21NFCA)  	+= st21nfca/
-obj-$(CONFIG_NFC_ST21NFCB)	+= st21nfcb/
+obj-$(CONFIG_NFC_ST_NCI)	+= st-nci/
 obj-$(CONFIG_NFC_NXP_NCI)	+= nxp-nci/
diff --git a/drivers/nfc/st-nci/Kconfig b/drivers/nfc/st-nci/Kconfig
new file mode 100644
index 000000000000..fc3904c946ee
--- /dev/null
+++ b/drivers/nfc/st-nci/Kconfig
@@ -0,0 +1,23 @@
+config NFC_ST_NCI
+	tristate "STMicroelectronics ST NCI NFC driver"
+	depends on NFC_NCI
+	default n
+	---help---
+	  STMicroelectronics NFC NCI chips core driver. It implements the chipset
+	  NCI logic and hooks into the NFC kernel APIs. Physical layers will
+	  register against it.
+
+	  To compile this driver as a module, choose m here. The module will
+	  be called st-nci.
+	  Say N if unsure.
+
+config NFC_ST_NCI_I2C
+	tristate "NFC ST NCI i2c support"
+	depends on NFC_ST_NCI && I2C
+	---help---
+	  This module adds support for an I2C interface to the
+	  STMicroelectronics NFC NCI chips familly.
+	  Select this if your platform is using the i2c bus.
+
+	  If you choose to build a module, it'll be called st-nci_i2c.
+	  Say N if unsure.
diff --git a/drivers/nfc/st-nci/Makefile b/drivers/nfc/st-nci/Makefile
new file mode 100644
index 000000000000..0df157df3a94
--- /dev/null
+++ b/drivers/nfc/st-nci/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for ST21NFCB NCI based NFC driver
+#
+
+st-nci-objs = ndlc.o core.o st-nci_se.o
+obj-$(CONFIG_NFC_ST_NCI)     += st-nci.o
+
+st-nci_i2c-objs = i2c.o
+obj-$(CONFIG_NFC_ST_NCI_I2C) += st-nci_i2c.o
diff --git a/drivers/nfc/st-nci/core.c b/drivers/nfc/st-nci/core.c
new file mode 100644
index 000000000000..c419d3943973
--- /dev/null
+++ b/drivers/nfc/st-nci/core.c
@@ -0,0 +1,179 @@
+/*
+ * NCI based Driver for STMicroelectronics NFC Chip
+ *
+ * Copyright (C) 2014-2015  STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/nfc.h>
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+#include <linux/gpio.h>
+#include <linux/delay.h>
+
+#include "st-nci.h"
+#include "st-nci_se.h"
+
+#define DRIVER_DESC "NCI NFC driver for ST_NCI"
+
+#define ST_NCI1_X_PROPRIETARY_ISO15693 0x83
+
+static int st_nci_init(struct nci_dev *ndev)
+{
+	struct nci_mode_set_cmd cmd;
+
+	cmd.cmd_type = ST_NCI_SET_NFC_MODE;
+	cmd.mode = 1;
+
+	return nci_prop_cmd(ndev, ST_NCI_CORE_PROP,
+			sizeof(struct nci_mode_set_cmd), (__u8 *)&cmd);
+}
+
+static int st_nci_open(struct nci_dev *ndev)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+	int r;
+
+	if (test_and_set_bit(ST_NCI_RUNNING, &info->flags))
+		return 0;
+
+	r = ndlc_open(info->ndlc);
+	if (r)
+		clear_bit(ST_NCI_RUNNING, &info->flags);
+
+	return r;
+}
+
+static int st_nci_close(struct nci_dev *ndev)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	if (!test_bit(ST_NCI_RUNNING, &info->flags))
+		return 0;
+
+	ndlc_close(info->ndlc);
+
+	clear_bit(ST_NCI_RUNNING, &info->flags);
+
+	return 0;
+}
+
+static int st_nci_send(struct nci_dev *ndev, struct sk_buff *skb)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	skb->dev = (void *)ndev;
+
+	if (!test_bit(ST_NCI_RUNNING, &info->flags))
+		return -EBUSY;
+
+	return ndlc_send(info->ndlc, skb);
+}
+
+static __u32 st_nci_get_rfprotocol(struct nci_dev *ndev,
+					 __u8 rf_protocol)
+{
+	return rf_protocol == ST_NCI1_X_PROPRIETARY_ISO15693 ?
+		NFC_PROTO_ISO15693_MASK : 0;
+}
+
+static int st_nci_prop_rsp_packet(struct nci_dev *ndev,
+					struct sk_buff *skb)
+{
+	__u8 status = skb->data[0];
+
+	nci_req_complete(ndev, status);
+	return 0;
+}
+
+static struct nci_prop_ops st_nci_prop_ops[] = {
+	{
+		.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY,
+					  ST_NCI_CORE_PROP),
+		.rsp = st_nci_prop_rsp_packet,
+	},
+};
+
+static struct nci_ops st_nci_ops = {
+	.init = st_nci_init,
+	.open = st_nci_open,
+	.close = st_nci_close,
+	.send = st_nci_send,
+	.get_rfprotocol = st_nci_get_rfprotocol,
+	.discover_se = st_nci_discover_se,
+	.enable_se = st_nci_enable_se,
+	.disable_se = st_nci_disable_se,
+	.se_io = st_nci_se_io,
+	.hci_load_session = st_nci_hci_load_session,
+	.hci_event_received = st_nci_hci_event_received,
+	.hci_cmd_received = st_nci_hci_cmd_received,
+	.prop_ops = st_nci_prop_ops,
+	.n_prop_ops = ARRAY_SIZE(st_nci_prop_ops),
+};
+
+int st_nci_probe(struct llt_ndlc *ndlc, int phy_headroom,
+		       int phy_tailroom)
+{
+	struct st_nci_info *info;
+	int r;
+	u32 protocols;
+
+	info = devm_kzalloc(ndlc->dev,
+			sizeof(struct st_nci_info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	protocols = NFC_PROTO_JEWEL_MASK
+		| NFC_PROTO_MIFARE_MASK
+		| NFC_PROTO_FELICA_MASK
+		| NFC_PROTO_ISO14443_MASK
+		| NFC_PROTO_ISO14443_B_MASK
+		| NFC_PROTO_ISO15693_MASK
+		| NFC_PROTO_NFC_DEP_MASK;
+
+	ndlc->ndev = nci_allocate_device(&st_nci_ops, protocols,
+					phy_headroom, phy_tailroom);
+	if (!ndlc->ndev) {
+		pr_err("Cannot allocate nfc ndev\n");
+		return -ENOMEM;
+	}
+	info->ndlc = ndlc;
+
+	nci_set_drvdata(ndlc->ndev, info);
+
+	r = nci_register_device(ndlc->ndev);
+	if (r) {
+		pr_err("Cannot register nfc device to nci core\n");
+		nci_free_device(ndlc->ndev);
+		return r;
+	}
+
+	return st_nci_se_init(ndlc->ndev);
+}
+EXPORT_SYMBOL_GPL(st_nci_probe);
+
+void st_nci_remove(struct nci_dev *ndev)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	ndlc_close(info->ndlc);
+
+	nci_unregister_device(ndev);
+	nci_free_device(ndev);
+}
+EXPORT_SYMBOL_GPL(st_nci_remove);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/nfc/st-nci/i2c.c b/drivers/nfc/st-nci/i2c.c
new file mode 100644
index 000000000000..06175ce769bb
--- /dev/null
+++ b/drivers/nfc/st-nci/i2c.c
@@ -0,0 +1,385 @@
+/*
+ * I2C Link Layer for ST NCI NFC controller familly based Driver
+ * Copyright (C) 2014-2015 STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <linux/of_irq.h>
+#include <linux/of_gpio.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/nfc.h>
+#include <linux/platform_data/st_nci.h>
+
+#include "ndlc.h"
+
+#define DRIVER_DESC "NCI NFC driver for ST21NFCB"
+
+/* ndlc header */
+#define ST21NFCB_FRAME_HEADROOM	1
+#define ST21NFCB_FRAME_TAILROOM 0
+
+#define ST_NCI_I2C_MIN_SIZE 4   /* PCB(1) + NCI Packet header(3) */
+#define ST_NCI_I2C_MAX_SIZE 250 /* req 4.2.1 */
+
+#define ST_NCI_I2C_DRIVER_NAME "st_nci_i2c"
+
+static struct i2c_device_id st_nci_i2c_id_table[] = {
+	{ST_NCI_DRIVER_NAME, 0},
+	{}
+};
+MODULE_DEVICE_TABLE(i2c, st_nci_i2c_id_table);
+
+struct st_nci_i2c_phy {
+	struct i2c_client *i2c_dev;
+	struct llt_ndlc *ndlc;
+
+	unsigned int gpio_reset;
+	unsigned int irq_polarity;
+};
+
+#define I2C_DUMP_SKB(info, skb)					\
+do {								\
+	pr_debug("%s:\n", info);				\
+	print_hex_dump(KERN_DEBUG, "i2c: ", DUMP_PREFIX_OFFSET,	\
+		       16, 1, (skb)->data, (skb)->len, 0);	\
+} while (0)
+
+static int st_nci_i2c_enable(void *phy_id)
+{
+	struct st_nci_i2c_phy *phy = phy_id;
+
+	gpio_set_value(phy->gpio_reset, 0);
+	usleep_range(10000, 15000);
+	gpio_set_value(phy->gpio_reset, 1);
+	usleep_range(80000, 85000);
+
+	if (phy->ndlc->powered == 0)
+		enable_irq(phy->i2c_dev->irq);
+
+	return 0;
+}
+
+static void st_nci_i2c_disable(void *phy_id)
+{
+	struct st_nci_i2c_phy *phy = phy_id;
+
+	disable_irq_nosync(phy->i2c_dev->irq);
+}
+
+/*
+ * Writing a frame must not return the number of written bytes.
+ * It must return either zero for success, or <0 for error.
+ * In addition, it must not alter the skb
+ */
+static int st_nci_i2c_write(void *phy_id, struct sk_buff *skb)
+{
+	int r = -1;
+	struct st_nci_i2c_phy *phy = phy_id;
+	struct i2c_client *client = phy->i2c_dev;
+
+	I2C_DUMP_SKB("st_nci_i2c_write", skb);
+
+	if (phy->ndlc->hard_fault != 0)
+		return phy->ndlc->hard_fault;
+
+	r = i2c_master_send(client, skb->data, skb->len);
+	if (r < 0) {  /* Retry, chip was in standby */
+		usleep_range(1000, 4000);
+		r = i2c_master_send(client, skb->data, skb->len);
+	}
+
+	if (r >= 0) {
+		if (r != skb->len)
+			r = -EREMOTEIO;
+		else
+			r = 0;
+	}
+
+	return r;
+}
+
+/*
+ * Reads an ndlc frame and returns it in a newly allocated sk_buff.
+ * returns:
+ * frame size : if received frame is complete (find ST21NFCB_SOF_EOF at
+ * end of read)
+ * -EAGAIN : if received frame is incomplete (not find ST21NFCB_SOF_EOF
+ * at end of read)
+ * -EREMOTEIO : i2c read error (fatal)
+ * -EBADMSG : frame was incorrect and discarded
+ * (value returned from st_nci_i2c_repack)
+ * -EIO : if no ST21NFCB_SOF_EOF is found after reaching
+ * the read length end sequence
+ */
+static int st_nci_i2c_read(struct st_nci_i2c_phy *phy,
+				 struct sk_buff **skb)
+{
+	int r;
+	u8 len;
+	u8 buf[ST_NCI_I2C_MAX_SIZE];
+	struct i2c_client *client = phy->i2c_dev;
+
+	r = i2c_master_recv(client, buf, ST_NCI_I2C_MIN_SIZE);
+	if (r < 0) {  /* Retry, chip was in standby */
+		usleep_range(1000, 4000);
+		r = i2c_master_recv(client, buf, ST_NCI_I2C_MIN_SIZE);
+	}
+
+	if (r != ST_NCI_I2C_MIN_SIZE)
+		return -EREMOTEIO;
+
+	len = be16_to_cpu(*(__be16 *) (buf + 2));
+	if (len > ST_NCI_I2C_MAX_SIZE) {
+		nfc_err(&client->dev, "invalid frame len\n");
+		return -EBADMSG;
+	}
+
+	*skb = alloc_skb(ST_NCI_I2C_MIN_SIZE + len, GFP_KERNEL);
+	if (*skb == NULL)
+		return -ENOMEM;
+
+	skb_reserve(*skb, ST_NCI_I2C_MIN_SIZE);
+	skb_put(*skb, ST_NCI_I2C_MIN_SIZE);
+	memcpy((*skb)->data, buf, ST_NCI_I2C_MIN_SIZE);
+
+	if (!len)
+		return 0;
+
+	r = i2c_master_recv(client, buf, len);
+	if (r != len) {
+		kfree_skb(*skb);
+		return -EREMOTEIO;
+	}
+
+	skb_put(*skb, len);
+	memcpy((*skb)->data + ST_NCI_I2C_MIN_SIZE, buf, len);
+
+	I2C_DUMP_SKB("i2c frame read", *skb);
+
+	return 0;
+}
+
+/*
+ * Reads an ndlc frame from the chip.
+ *
+ * On ST21NFCB, IRQ goes in idle state when read starts.
+ */
+static irqreturn_t st_nci_irq_thread_fn(int irq, void *phy_id)
+{
+	struct st_nci_i2c_phy *phy = phy_id;
+	struct i2c_client *client;
+	struct sk_buff *skb = NULL;
+	int r;
+
+	if (!phy || !phy->ndlc || irq != phy->i2c_dev->irq) {
+		WARN_ON_ONCE(1);
+		return IRQ_NONE;
+	}
+
+	client = phy->i2c_dev;
+	dev_dbg(&client->dev, "IRQ\n");
+
+	if (phy->ndlc->hard_fault)
+		return IRQ_HANDLED;
+
+	if (!phy->ndlc->powered) {
+		st_nci_i2c_disable(phy);
+		return IRQ_HANDLED;
+	}
+
+	r = st_nci_i2c_read(phy, &skb);
+	if (r == -EREMOTEIO || r == -ENOMEM || r == -EBADMSG)
+		return IRQ_HANDLED;
+
+	ndlc_recv(phy->ndlc, skb);
+
+	return IRQ_HANDLED;
+}
+
+static struct nfc_phy_ops i2c_phy_ops = {
+	.write = st_nci_i2c_write,
+	.enable = st_nci_i2c_enable,
+	.disable = st_nci_i2c_disable,
+};
+
+#ifdef CONFIG_OF
+static int st_nci_i2c_of_request_resources(struct i2c_client *client)
+{
+	struct st_nci_i2c_phy *phy = i2c_get_clientdata(client);
+	struct device_node *pp;
+	int gpio;
+	int r;
+
+	pp = client->dev.of_node;
+	if (!pp)
+		return -ENODEV;
+
+	/* Get GPIO from device tree */
+	gpio = of_get_named_gpio(pp, "reset-gpios", 0);
+	if (gpio < 0) {
+		nfc_err(&client->dev,
+			"Failed to retrieve reset-gpios from device tree\n");
+		return gpio;
+	}
+
+	/* GPIO request and configuration */
+	r = devm_gpio_request_one(&client->dev, gpio,
+				GPIOF_OUT_INIT_HIGH, "clf_reset");
+	if (r) {
+		nfc_err(&client->dev, "Failed to request reset pin\n");
+		return r;
+	}
+	phy->gpio_reset = gpio;
+
+	phy->irq_polarity = irq_get_trigger_type(client->irq);
+
+	return 0;
+}
+#else
+static int st_nci_i2c_of_request_resources(struct i2c_client *client)
+{
+	return -ENODEV;
+}
+#endif
+
+static int st_nci_i2c_request_resources(struct i2c_client *client)
+{
+	struct st_nci_nfc_platform_data *pdata;
+	struct st_nci_i2c_phy *phy = i2c_get_clientdata(client);
+	int r;
+
+	pdata = client->dev.platform_data;
+	if (pdata == NULL) {
+		nfc_err(&client->dev, "No platform data\n");
+		return -EINVAL;
+	}
+
+	/* store for later use */
+	phy->gpio_reset = pdata->gpio_reset;
+	phy->irq_polarity = pdata->irq_polarity;
+
+	r = devm_gpio_request_one(&client->dev,
+			phy->gpio_reset, GPIOF_OUT_INIT_HIGH, "clf_reset");
+	if (r) {
+		pr_err("%s : reset gpio_request failed\n", __FILE__);
+		return r;
+	}
+
+	return 0;
+}
+
+static int st_nci_i2c_probe(struct i2c_client *client,
+				  const struct i2c_device_id *id)
+{
+	struct st_nci_i2c_phy *phy;
+	struct st_nci_nfc_platform_data *pdata;
+	int r;
+
+	dev_dbg(&client->dev, "%s\n", __func__);
+	dev_dbg(&client->dev, "IRQ: %d\n", client->irq);
+
+	if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
+		nfc_err(&client->dev, "Need I2C_FUNC_I2C\n");
+		return -ENODEV;
+	}
+
+	phy = devm_kzalloc(&client->dev, sizeof(struct st_nci_i2c_phy),
+			   GFP_KERNEL);
+	if (!phy)
+		return -ENOMEM;
+
+	phy->i2c_dev = client;
+
+	i2c_set_clientdata(client, phy);
+
+	pdata = client->dev.platform_data;
+	if (!pdata && client->dev.of_node) {
+		r = st_nci_i2c_of_request_resources(client);
+		if (r) {
+			nfc_err(&client->dev, "No platform data\n");
+			return r;
+		}
+	} else if (pdata) {
+		r = st_nci_i2c_request_resources(client);
+		if (r) {
+			nfc_err(&client->dev,
+				"Cannot get platform resources\n");
+			return r;
+		}
+	} else {
+		nfc_err(&client->dev,
+			"st21nfcb platform resources not available\n");
+		return -ENODEV;
+	}
+
+	r = ndlc_probe(phy, &i2c_phy_ops, &client->dev,
+			ST21NFCB_FRAME_HEADROOM, ST21NFCB_FRAME_TAILROOM,
+			&phy->ndlc);
+	if (r < 0) {
+		nfc_err(&client->dev, "Unable to register ndlc layer\n");
+		return r;
+	}
+
+	r = devm_request_threaded_irq(&client->dev, client->irq, NULL,
+				st_nci_irq_thread_fn,
+				phy->irq_polarity | IRQF_ONESHOT,
+				ST_NCI_DRIVER_NAME, phy);
+	if (r < 0)
+		nfc_err(&client->dev, "Unable to register IRQ handler\n");
+
+	return r;
+}
+
+static int st_nci_i2c_remove(struct i2c_client *client)
+{
+	struct st_nci_i2c_phy *phy = i2c_get_clientdata(client);
+
+	dev_dbg(&client->dev, "%s\n", __func__);
+
+	ndlc_remove(phy->ndlc);
+
+	return 0;
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id of_st_nci_i2c_match[] = {
+	{ .compatible = "st,st21nfcb-i2c", },
+	{ .compatible = "st,st21nfcb_i2c", },
+	{ .compatible = "st,st21nfcc-i2c", },
+	{}
+};
+MODULE_DEVICE_TABLE(of, of_st_nci_i2c_match);
+#endif
+
+static struct i2c_driver st_nci_i2c_driver = {
+	.driver = {
+		.owner = THIS_MODULE,
+		.name = ST_NCI_I2C_DRIVER_NAME,
+		.of_match_table = of_match_ptr(of_st_nci_i2c_match),
+	},
+	.probe = st_nci_i2c_probe,
+	.id_table = st_nci_i2c_id_table,
+	.remove = st_nci_i2c_remove,
+};
+
+module_i2c_driver(st_nci_i2c_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c
new file mode 100644
index 000000000000..56c6a4cb4c96
--- /dev/null
+++ b/drivers/nfc/st-nci/ndlc.c
@@ -0,0 +1,313 @@
+/*
+ * Low Level Transport (NDLC) Driver for STMicroelectronics NFC Chip
+ *
+ * Copyright (C) 2014-2015  STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/sched.h>
+#include <net/nfc/nci_core.h>
+
+#include "ndlc.h"
+#include "st-nci.h"
+
+#define NDLC_TIMER_T1		100
+#define NDLC_TIMER_T1_WAIT	400
+#define NDLC_TIMER_T2		1200
+
+#define PCB_TYPE_DATAFRAME		0x80
+#define PCB_TYPE_SUPERVISOR		0xc0
+#define PCB_TYPE_MASK			PCB_TYPE_SUPERVISOR
+
+#define PCB_SYNC_ACK			0x20
+#define PCB_SYNC_NACK			0x10
+#define PCB_SYNC_WAIT			0x30
+#define PCB_SYNC_NOINFO			0x00
+#define PCB_SYNC_MASK			PCB_SYNC_WAIT
+
+#define PCB_DATAFRAME_RETRANSMIT_YES	0x00
+#define PCB_DATAFRAME_RETRANSMIT_NO	0x04
+#define PCB_DATAFRAME_RETRANSMIT_MASK	PCB_DATAFRAME_RETRANSMIT_NO
+
+#define PCB_SUPERVISOR_RETRANSMIT_YES	0x00
+#define PCB_SUPERVISOR_RETRANSMIT_NO	0x02
+#define PCB_SUPERVISOR_RETRANSMIT_MASK	PCB_SUPERVISOR_RETRANSMIT_NO
+
+#define PCB_FRAME_CRC_INFO_PRESENT	0x08
+#define PCB_FRAME_CRC_INFO_NOTPRESENT	0x00
+#define PCB_FRAME_CRC_INFO_MASK		PCB_FRAME_CRC_INFO_PRESENT
+
+#define NDLC_DUMP_SKB(info, skb)                                 \
+do {                                                             \
+	pr_debug("%s:\n", info);                                 \
+	print_hex_dump(KERN_DEBUG, "ndlc: ", DUMP_PREFIX_OFFSET, \
+			16, 1, skb->data, skb->len, 0);          \
+} while (0)
+
+int ndlc_open(struct llt_ndlc *ndlc)
+{
+	/* toggle reset pin */
+	ndlc->ops->enable(ndlc->phy_id);
+	ndlc->powered = 1;
+	return 0;
+}
+EXPORT_SYMBOL(ndlc_open);
+
+void ndlc_close(struct llt_ndlc *ndlc)
+{
+	struct nci_mode_set_cmd cmd;
+
+	cmd.cmd_type = ST_NCI_SET_NFC_MODE;
+	cmd.mode = 0;
+
+	/* toggle reset pin */
+	ndlc->ops->enable(ndlc->phy_id);
+
+	nci_prop_cmd(ndlc->ndev, ST_NCI_CORE_PROP,
+		     sizeof(struct nci_mode_set_cmd), (__u8 *)&cmd);
+
+	ndlc->powered = 0;
+	ndlc->ops->disable(ndlc->phy_id);
+}
+EXPORT_SYMBOL(ndlc_close);
+
+int ndlc_send(struct llt_ndlc *ndlc, struct sk_buff *skb)
+{
+	/* add ndlc header */
+	u8 pcb = PCB_TYPE_DATAFRAME | PCB_DATAFRAME_RETRANSMIT_NO |
+		PCB_FRAME_CRC_INFO_NOTPRESENT;
+
+	*skb_push(skb, 1) = pcb;
+	skb_queue_tail(&ndlc->send_q, skb);
+
+	schedule_work(&ndlc->sm_work);
+
+	return 0;
+}
+EXPORT_SYMBOL(ndlc_send);
+
+static void llt_ndlc_send_queue(struct llt_ndlc *ndlc)
+{
+	struct sk_buff *skb;
+	int r;
+	unsigned long time_sent;
+
+	if (ndlc->send_q.qlen)
+		pr_debug("sendQlen=%d unackQlen=%d\n",
+			 ndlc->send_q.qlen, ndlc->ack_pending_q.qlen);
+
+	while (ndlc->send_q.qlen) {
+		skb = skb_dequeue(&ndlc->send_q);
+		NDLC_DUMP_SKB("ndlc frame written", skb);
+		r = ndlc->ops->write(ndlc->phy_id, skb);
+		if (r < 0) {
+			ndlc->hard_fault = r;
+			break;
+		}
+		time_sent = jiffies;
+		*(unsigned long *)skb->cb = time_sent;
+
+		skb_queue_tail(&ndlc->ack_pending_q, skb);
+
+		/* start timer t1 for ndlc aknowledge */
+		ndlc->t1_active = true;
+		mod_timer(&ndlc->t1_timer, time_sent +
+			msecs_to_jiffies(NDLC_TIMER_T1));
+		/* start timer t2 for chip availability */
+		ndlc->t2_active = true;
+		mod_timer(&ndlc->t2_timer, time_sent +
+			msecs_to_jiffies(NDLC_TIMER_T2));
+	}
+}
+
+static void llt_ndlc_requeue_data_pending(struct llt_ndlc *ndlc)
+{
+	struct sk_buff *skb;
+	u8 pcb;
+
+	while ((skb = skb_dequeue_tail(&ndlc->ack_pending_q))) {
+		pcb = skb->data[0];
+		switch (pcb & PCB_TYPE_MASK) {
+		case PCB_TYPE_SUPERVISOR:
+			skb->data[0] = (pcb & ~PCB_SUPERVISOR_RETRANSMIT_MASK) |
+				PCB_SUPERVISOR_RETRANSMIT_YES;
+			break;
+		case PCB_TYPE_DATAFRAME:
+			skb->data[0] = (pcb & ~PCB_DATAFRAME_RETRANSMIT_MASK) |
+				PCB_DATAFRAME_RETRANSMIT_YES;
+			break;
+		default:
+			pr_err("UNKNOWN Packet Control Byte=%d\n", pcb);
+			kfree_skb(skb);
+			continue;
+		}
+		skb_queue_head(&ndlc->send_q, skb);
+	}
+}
+
+static void llt_ndlc_rcv_queue(struct llt_ndlc *ndlc)
+{
+	struct sk_buff *skb;
+	u8 pcb;
+	unsigned long time_sent;
+
+	if (ndlc->rcv_q.qlen)
+		pr_debug("rcvQlen=%d\n", ndlc->rcv_q.qlen);
+
+	while ((skb = skb_dequeue(&ndlc->rcv_q)) != NULL) {
+		pcb = skb->data[0];
+		skb_pull(skb, 1);
+		if ((pcb & PCB_TYPE_MASK) == PCB_TYPE_SUPERVISOR) {
+			switch (pcb & PCB_SYNC_MASK) {
+			case PCB_SYNC_ACK:
+				del_timer_sync(&ndlc->t1_timer);
+				del_timer_sync(&ndlc->t2_timer);
+				ndlc->t2_active = false;
+				ndlc->t1_active = false;
+				break;
+			case PCB_SYNC_NACK:
+				llt_ndlc_requeue_data_pending(ndlc);
+				llt_ndlc_send_queue(ndlc);
+				/* start timer t1 for ndlc aknowledge */
+				time_sent = jiffies;
+				ndlc->t1_active = true;
+				mod_timer(&ndlc->t1_timer, time_sent +
+					msecs_to_jiffies(NDLC_TIMER_T1));
+				break;
+			case PCB_SYNC_WAIT:
+				time_sent = jiffies;
+				ndlc->t1_active = true;
+				mod_timer(&ndlc->t1_timer, time_sent +
+					  msecs_to_jiffies(NDLC_TIMER_T1_WAIT));
+				break;
+			default:
+				pr_err("UNKNOWN Packet Control Byte=%d\n", pcb);
+				kfree_skb(skb);
+				break;
+			}
+		} else {
+			nci_recv_frame(ndlc->ndev, skb);
+		}
+	}
+}
+
+static void llt_ndlc_sm_work(struct work_struct *work)
+{
+	struct llt_ndlc *ndlc = container_of(work, struct llt_ndlc, sm_work);
+
+	llt_ndlc_send_queue(ndlc);
+	llt_ndlc_rcv_queue(ndlc);
+
+	if (ndlc->t1_active && timer_pending(&ndlc->t1_timer) == 0) {
+		pr_debug
+		    ("Handle T1(recv SUPERVISOR) elapsed (T1 now inactive)\n");
+		ndlc->t1_active = false;
+
+		llt_ndlc_requeue_data_pending(ndlc);
+		llt_ndlc_send_queue(ndlc);
+	}
+
+	if (ndlc->t2_active && timer_pending(&ndlc->t2_timer) == 0) {
+		pr_debug("Handle T2(recv DATA) elapsed (T2 now inactive)\n");
+		ndlc->t2_active = false;
+		ndlc->t1_active = false;
+		del_timer_sync(&ndlc->t1_timer);
+		del_timer_sync(&ndlc->t2_timer);
+		ndlc_close(ndlc);
+		ndlc->hard_fault = -EREMOTEIO;
+	}
+}
+
+void ndlc_recv(struct llt_ndlc *ndlc, struct sk_buff *skb)
+{
+	if (skb == NULL) {
+		pr_err("NULL Frame -> link is dead\n");
+		ndlc->hard_fault = -EREMOTEIO;
+		ndlc_close(ndlc);
+	} else {
+		NDLC_DUMP_SKB("incoming frame", skb);
+		skb_queue_tail(&ndlc->rcv_q, skb);
+	}
+
+	schedule_work(&ndlc->sm_work);
+}
+EXPORT_SYMBOL(ndlc_recv);
+
+static void ndlc_t1_timeout(unsigned long data)
+{
+	struct llt_ndlc *ndlc = (struct llt_ndlc *)data;
+
+	pr_debug("\n");
+
+	schedule_work(&ndlc->sm_work);
+}
+
+static void ndlc_t2_timeout(unsigned long data)
+{
+	struct llt_ndlc *ndlc = (struct llt_ndlc *)data;
+
+	pr_debug("\n");
+
+	schedule_work(&ndlc->sm_work);
+}
+
+int ndlc_probe(void *phy_id, struct nfc_phy_ops *phy_ops, struct device *dev,
+	       int phy_headroom, int phy_tailroom, struct llt_ndlc **ndlc_id)
+{
+	struct llt_ndlc *ndlc;
+
+	ndlc = devm_kzalloc(dev, sizeof(struct llt_ndlc), GFP_KERNEL);
+	if (!ndlc)
+		return -ENOMEM;
+
+	ndlc->ops = phy_ops;
+	ndlc->phy_id = phy_id;
+	ndlc->dev = dev;
+	ndlc->powered = 0;
+
+	*ndlc_id = ndlc;
+
+	/* initialize timers */
+	init_timer(&ndlc->t1_timer);
+	ndlc->t1_timer.data = (unsigned long)ndlc;
+	ndlc->t1_timer.function = ndlc_t1_timeout;
+
+	init_timer(&ndlc->t2_timer);
+	ndlc->t2_timer.data = (unsigned long)ndlc;
+	ndlc->t2_timer.function = ndlc_t2_timeout;
+
+	skb_queue_head_init(&ndlc->rcv_q);
+	skb_queue_head_init(&ndlc->send_q);
+	skb_queue_head_init(&ndlc->ack_pending_q);
+
+	INIT_WORK(&ndlc->sm_work, llt_ndlc_sm_work);
+
+	return st_nci_probe(ndlc, phy_headroom, phy_tailroom);
+}
+EXPORT_SYMBOL(ndlc_probe);
+
+void ndlc_remove(struct llt_ndlc *ndlc)
+{
+	st_nci_remove(ndlc->ndev);
+
+	/* cancel timers */
+	del_timer_sync(&ndlc->t1_timer);
+	del_timer_sync(&ndlc->t2_timer);
+	ndlc->t2_active = false;
+	ndlc->t1_active = false;
+
+	skb_queue_purge(&ndlc->rcv_q);
+	skb_queue_purge(&ndlc->send_q);
+}
+EXPORT_SYMBOL(ndlc_remove);
diff --git a/drivers/nfc/st-nci/ndlc.h b/drivers/nfc/st-nci/ndlc.h
new file mode 100644
index 000000000000..6361005ef003
--- /dev/null
+++ b/drivers/nfc/st-nci/ndlc.h
@@ -0,0 +1,60 @@
+/*
+ * NCI based Driver for STMicroelectronics NFC Chip
+ *
+ * Copyright (C) 2014-2015  STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LOCAL_NDLC_H_
+#define __LOCAL_NDLC_H_
+
+#include <linux/skbuff.h>
+#include <net/nfc/nfc.h>
+
+/* Low Level Transport description */
+struct llt_ndlc {
+	struct nci_dev *ndev;
+	struct nfc_phy_ops *ops;
+	void *phy_id;
+
+	struct timer_list t1_timer;
+	bool t1_active;
+
+	struct timer_list t2_timer;
+	bool t2_active;
+
+	struct sk_buff_head rcv_q;
+	struct sk_buff_head send_q;
+	struct sk_buff_head ack_pending_q;
+
+	struct work_struct sm_work;
+
+	struct device *dev;
+
+	/*
+	 * < 0 if hardware error occurred
+	 * and prevents normal operation.
+	 */
+	int hard_fault;
+	int powered;
+};
+
+int ndlc_open(struct llt_ndlc *ndlc);
+void ndlc_close(struct llt_ndlc *ndlc);
+int ndlc_send(struct llt_ndlc *ndlc, struct sk_buff *skb);
+void ndlc_recv(struct llt_ndlc *ndlc, struct sk_buff *skb);
+int ndlc_probe(void *phy_id, struct nfc_phy_ops *phy_ops, struct device *dev,
+	int phy_headroom, int phy_tailroom, struct llt_ndlc **ndlc_id);
+void ndlc_remove(struct llt_ndlc *ndlc);
+#endif /* __LOCAL_NDLC_H__ */
diff --git a/drivers/nfc/st-nci/st-nci.h b/drivers/nfc/st-nci/st-nci.h
new file mode 100644
index 000000000000..850a2395deb7
--- /dev/null
+++ b/drivers/nfc/st-nci/st-nci.h
@@ -0,0 +1,50 @@
+/*
+ * NCI based Driver for STMicroelectronics NFC Chip
+ *
+ * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LOCAL_ST_NCI_H_
+#define __LOCAL_ST_NCI_H_
+
+#include "st-nci_se.h"
+#include "ndlc.h"
+
+/* Define private flags: */
+#define ST_NCI_RUNNING			1
+
+#define ST_NCI_CORE_PROP                0x01
+#define ST_NCI_SET_NFC_MODE             0x02
+
+struct nci_mode_set_cmd {
+	u8 cmd_type;
+	u8 mode;
+} __packed;
+
+struct nci_mode_set_rsp {
+	u8 status;
+} __packed;
+
+struct st_nci_info {
+	struct llt_ndlc *ndlc;
+	unsigned long flags;
+	struct st_nci_se_info se_info;
+};
+
+void st_nci_remove(struct nci_dev *ndev);
+int st_nci_probe(struct llt_ndlc *ndlc, int phy_headroom,
+		int phy_tailroom);
+
+#endif /* __LOCAL_ST_NCI_H_ */
diff --git a/drivers/nfc/st-nci/st-nci_se.c b/drivers/nfc/st-nci/st-nci_se.c
new file mode 100644
index 000000000000..97addfa96c6f
--- /dev/null
+++ b/drivers/nfc/st-nci/st-nci_se.c
@@ -0,0 +1,714 @@
+/*
+ * Secure Element driver for STMicroelectronics NFC NCI chip
+ *
+ * Copyright (C) 2014-2015 STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/nfc.h>
+#include <linux/delay.h>
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+
+#include "st-nci.h"
+#include "st-nci_se.h"
+
+struct st_nci_pipe_info {
+	u8 pipe_state;
+	u8 src_host_id;
+	u8 src_gate_id;
+	u8 dst_host_id;
+	u8 dst_gate_id;
+} __packed;
+
+/* Hosts */
+#define ST_NCI_HOST_CONTROLLER_ID     0x00
+#define ST_NCI_TERMINAL_HOST_ID       0x01
+#define ST_NCI_UICC_HOST_ID           0x02
+#define ST_NCI_ESE_HOST_ID            0xc0
+
+/* Gates */
+#define ST_NCI_DEVICE_MGNT_GATE       0x01
+#define ST_NCI_APDU_READER_GATE       0xf0
+#define ST_NCI_CONNECTIVITY_GATE      0x41
+
+/* Pipes */
+#define ST_NCI_DEVICE_MGNT_PIPE               0x02
+
+/* Connectivity pipe only */
+#define ST_NCI_SE_COUNT_PIPE_UICC             0x01
+/* Connectivity + APDU Reader pipe */
+#define ST_NCI_SE_COUNT_PIPE_EMBEDDED         0x02
+
+#define ST_NCI_SE_TO_HOT_PLUG			1000 /* msecs */
+#define ST_NCI_SE_TO_PIPES			2000
+
+#define ST_NCI_EVT_HOT_PLUG_IS_INHIBITED(x)   (x->data[0] & 0x80)
+
+#define NCI_HCI_APDU_PARAM_ATR                     0x01
+#define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY       0x01
+#define NCI_HCI_ADMIN_PARAM_WHITELIST              0x03
+#define NCI_HCI_ADMIN_PARAM_HOST_LIST              0x04
+
+#define ST_NCI_EVT_SE_HARD_RESET		0x20
+#define ST_NCI_EVT_TRANSMIT_DATA		0x10
+#define ST_NCI_EVT_WTX_REQUEST		0x11
+#define ST_NCI_EVT_SE_SOFT_RESET		0x11
+#define ST_NCI_EVT_SE_END_OF_APDU_TRANSFER	0x21
+#define ST_NCI_EVT_HOT_PLUG			0x03
+
+#define ST_NCI_SE_MODE_OFF                    0x00
+#define ST_NCI_SE_MODE_ON                     0x01
+
+#define ST_NCI_EVT_CONNECTIVITY       0x10
+#define ST_NCI_EVT_TRANSACTION        0x12
+
+#define ST_NCI_DM_GETINFO             0x13
+#define ST_NCI_DM_GETINFO_PIPE_LIST   0x02
+#define ST_NCI_DM_GETINFO_PIPE_INFO   0x01
+#define ST_NCI_DM_PIPE_CREATED        0x02
+#define ST_NCI_DM_PIPE_OPEN           0x04
+#define ST_NCI_DM_RF_ACTIVE           0x80
+#define ST_NCI_DM_DISCONNECT          0x30
+
+#define ST_NCI_DM_IS_PIPE_OPEN(p) \
+	((p & 0x0f) == (ST_NCI_DM_PIPE_CREATED | ST_NCI_DM_PIPE_OPEN))
+
+#define ST_NCI_ATR_DEFAULT_BWI        0x04
+
+/*
+ * WT = 2^BWI/10[s], convert into msecs and add a secure
+ * room by increasing by 2 this timeout
+ */
+#define ST_NCI_BWI_TO_TIMEOUT(x)      ((1 << x) * 200)
+#define ST_NCI_ATR_GET_Y_FROM_TD(x)   (x >> 4)
+
+/* If TA is present bit 0 is set */
+#define ST_NCI_ATR_TA_PRESENT(x) (x & 0x01)
+/* If TB is present bit 1 is set */
+#define ST_NCI_ATR_TB_PRESENT(x) (x & 0x02)
+
+#define ST_NCI_NUM_DEVICES           256
+
+static DECLARE_BITMAP(dev_mask, ST_NCI_NUM_DEVICES);
+
+/* Here are the mandatory pipe for st_nci */
+static struct nci_hci_gate st_nci_gates[] = {
+	{NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PIPE,
+					ST_NCI_HOST_CONTROLLER_ID},
+	{NCI_HCI_LINK_MGMT_GATE, NCI_HCI_LINK_MGMT_PIPE,
+					ST_NCI_HOST_CONTROLLER_ID},
+	{ST_NCI_DEVICE_MGNT_GATE, ST_NCI_DEVICE_MGNT_PIPE,
+					ST_NCI_HOST_CONTROLLER_ID},
+
+	/* Secure element pipes are created by secure element host */
+	{ST_NCI_CONNECTIVITY_GATE, NCI_HCI_DO_NOT_OPEN_PIPE,
+					ST_NCI_HOST_CONTROLLER_ID},
+	{ST_NCI_APDU_READER_GATE, NCI_HCI_DO_NOT_OPEN_PIPE,
+					ST_NCI_HOST_CONTROLLER_ID},
+};
+
+static u8 st_nci_se_get_bwi(struct nci_dev *ndev)
+{
+	int i;
+	u8 td;
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	/* Bits 8 to 5 of the first TB for T=1 encode BWI from zero to nine */
+	for (i = 1; i < ST_NCI_ESE_MAX_LENGTH; i++) {
+		td = ST_NCI_ATR_GET_Y_FROM_TD(info->se_info.atr[i]);
+		if (ST_NCI_ATR_TA_PRESENT(td))
+			i++;
+		if (ST_NCI_ATR_TB_PRESENT(td)) {
+			i++;
+			return info->se_info.atr[i] >> 4;
+		}
+	}
+	return ST_NCI_ATR_DEFAULT_BWI;
+}
+
+static void st_nci_se_get_atr(struct nci_dev *ndev)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+	int r;
+	struct sk_buff *skb;
+
+	r = nci_hci_get_param(ndev, ST_NCI_APDU_READER_GATE,
+				NCI_HCI_APDU_PARAM_ATR, &skb);
+	if (r < 0)
+		return;
+
+	if (skb->len <= ST_NCI_ESE_MAX_LENGTH) {
+		memcpy(info->se_info.atr, skb->data, skb->len);
+
+		info->se_info.wt_timeout =
+			ST_NCI_BWI_TO_TIMEOUT(st_nci_se_get_bwi(ndev));
+	}
+	kfree_skb(skb);
+}
+
+int st_nci_hci_load_session(struct nci_dev *ndev)
+{
+	int i, j, r;
+	struct sk_buff *skb_pipe_list, *skb_pipe_info;
+	struct st_nci_pipe_info *dm_pipe_info;
+	u8 pipe_list[] = { ST_NCI_DM_GETINFO_PIPE_LIST,
+			ST_NCI_TERMINAL_HOST_ID};
+	u8 pipe_info[] = { ST_NCI_DM_GETINFO_PIPE_INFO,
+			ST_NCI_TERMINAL_HOST_ID, 0};
+
+	/* On ST_NCI device pipes number are dynamics
+	 * If pipes are already created, hci_dev_up will fail.
+	 * Doing a clear all pipe is a bad idea because:
+	 * - It does useless EEPROM cycling
+	 * - It might cause issue for secure elements support
+	 * (such as removing connectivity or APDU reader pipe)
+	 * A better approach on ST_NCI is to:
+	 * - get a pipe list for each host.
+	 * (eg: ST_NCI_HOST_CONTROLLER_ID for now).
+	 * (TODO Later on UICC HOST and eSE HOST)
+	 * - get pipe information
+	 * - match retrieved pipe list in st_nci_gates
+	 * ST_NCI_DEVICE_MGNT_GATE is a proprietary gate
+	 * with ST_NCI_DEVICE_MGNT_PIPE.
+	 * Pipe can be closed and need to be open.
+	 */
+	r = nci_hci_connect_gate(ndev, ST_NCI_HOST_CONTROLLER_ID,
+				ST_NCI_DEVICE_MGNT_GATE,
+				ST_NCI_DEVICE_MGNT_PIPE);
+	if (r < 0)
+		goto free_info;
+
+	/* Get pipe list */
+	r = nci_hci_send_cmd(ndev, ST_NCI_DEVICE_MGNT_GATE,
+			ST_NCI_DM_GETINFO, pipe_list, sizeof(pipe_list),
+			&skb_pipe_list);
+	if (r < 0)
+		goto free_info;
+
+	/* Complete the existing gate_pipe table */
+	for (i = 0; i < skb_pipe_list->len; i++) {
+		pipe_info[2] = skb_pipe_list->data[i];
+		r = nci_hci_send_cmd(ndev, ST_NCI_DEVICE_MGNT_GATE,
+					ST_NCI_DM_GETINFO, pipe_info,
+					sizeof(pipe_info), &skb_pipe_info);
+
+		if (r)
+			continue;
+
+		/*
+		 * Match pipe ID and gate ID
+		 * Output format from ST21NFC_DM_GETINFO is:
+		 * - pipe state (1byte)
+		 * - source hid (1byte)
+		 * - source gid (1byte)
+		 * - destination hid (1byte)
+		 * - destination gid (1byte)
+		 */
+		dm_pipe_info = (struct st_nci_pipe_info *)skb_pipe_info->data;
+		if (dm_pipe_info->dst_gate_id == ST_NCI_APDU_READER_GATE &&
+		    dm_pipe_info->src_host_id != ST_NCI_ESE_HOST_ID) {
+			pr_err("Unexpected apdu_reader pipe on host %x\n",
+			       dm_pipe_info->src_host_id);
+			continue;
+		}
+
+		for (j = 0; (j < ARRAY_SIZE(st_nci_gates)) &&
+		     (st_nci_gates[j].gate != dm_pipe_info->dst_gate_id); j++)
+			;
+
+		if (j < ARRAY_SIZE(st_nci_gates) &&
+		    st_nci_gates[j].gate == dm_pipe_info->dst_gate_id &&
+		    ST_NCI_DM_IS_PIPE_OPEN(dm_pipe_info->pipe_state)) {
+			st_nci_gates[j].pipe = pipe_info[2];
+
+			ndev->hci_dev->gate2pipe[st_nci_gates[j].gate] =
+						st_nci_gates[j].pipe;
+			ndev->hci_dev->pipes[st_nci_gates[j].pipe].gate =
+						st_nci_gates[j].gate;
+			ndev->hci_dev->pipes[st_nci_gates[j].pipe].host =
+						dm_pipe_info->src_host_id;
+		}
+	}
+
+	memcpy(ndev->hci_dev->init_data.gates, st_nci_gates,
+	       sizeof(st_nci_gates));
+
+free_info:
+	kfree_skb(skb_pipe_info);
+	kfree_skb(skb_pipe_list);
+	return r;
+}
+EXPORT_SYMBOL_GPL(st_nci_hci_load_session);
+
+static void st_nci_hci_admin_event_received(struct nci_dev *ndev,
+					      u8 event, struct sk_buff *skb)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	switch (event) {
+	case ST_NCI_EVT_HOT_PLUG:
+		if (info->se_info.se_active) {
+			if (!ST_NCI_EVT_HOT_PLUG_IS_INHIBITED(skb)) {
+				del_timer_sync(&info->se_info.se_active_timer);
+				info->se_info.se_active = false;
+				complete(&info->se_info.req_completion);
+			} else {
+				mod_timer(&info->se_info.se_active_timer,
+				      jiffies +
+				      msecs_to_jiffies(ST_NCI_SE_TO_PIPES));
+			}
+		}
+	break;
+	}
+}
+
+static int st_nci_hci_apdu_reader_event_received(struct nci_dev *ndev,
+						   u8 event,
+						   struct sk_buff *skb)
+{
+	int r = 0;
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	pr_debug("apdu reader gate event: %x\n", event);
+
+	switch (event) {
+	case ST_NCI_EVT_TRANSMIT_DATA:
+		del_timer_sync(&info->se_info.bwi_timer);
+		info->se_info.bwi_active = false;
+		info->se_info.cb(info->se_info.cb_context,
+				 skb->data, skb->len, 0);
+	break;
+	case ST_NCI_EVT_WTX_REQUEST:
+		mod_timer(&info->se_info.bwi_timer, jiffies +
+			  msecs_to_jiffies(info->se_info.wt_timeout));
+	break;
+	}
+
+	kfree_skb(skb);
+	return r;
+}
+
+/*
+ * Returns:
+ * <= 0: driver handled the event, skb consumed
+ *    1: driver does not handle the event, please do standard processing
+ */
+static int st_nci_hci_connectivity_event_received(struct nci_dev *ndev,
+						u8 host, u8 event,
+						struct sk_buff *skb)
+{
+	int r = 0;
+	struct device *dev = &ndev->nfc_dev->dev;
+	struct nfc_evt_transaction *transaction;
+
+	pr_debug("connectivity gate event: %x\n", event);
+
+	switch (event) {
+	case ST_NCI_EVT_CONNECTIVITY:
+
+	break;
+	case ST_NCI_EVT_TRANSACTION:
+		/* According to specification etsi 102 622
+		 * 11.2.2.4 EVT_TRANSACTION Table 52
+		 * Description  Tag     Length
+		 * AID          81      5 to 16
+		 * PARAMETERS   82      0 to 255
+		 */
+		if (skb->len < NFC_MIN_AID_LENGTH + 2 &&
+		    skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG)
+			return -EPROTO;
+
+		transaction = (struct nfc_evt_transaction *)devm_kzalloc(dev,
+					    skb->len - 2, GFP_KERNEL);
+
+		transaction->aid_len = skb->data[1];
+		memcpy(transaction->aid, &skb->data[2], transaction->aid_len);
+
+		/* Check next byte is PARAMETERS tag (82) */
+		if (skb->data[transaction->aid_len + 2] !=
+		    NFC_EVT_TRANSACTION_PARAMS_TAG)
+			return -EPROTO;
+
+		transaction->params_len = skb->data[transaction->aid_len + 3];
+		memcpy(transaction->params, skb->data +
+		       transaction->aid_len + 4, transaction->params_len);
+
+		r = nfc_se_transaction(ndev->nfc_dev, host, transaction);
+		break;
+	default:
+		return 1;
+	}
+	kfree_skb(skb);
+	return r;
+}
+
+void st_nci_hci_event_received(struct nci_dev *ndev, u8 pipe,
+				 u8 event, struct sk_buff *skb)
+{
+	u8 gate = ndev->hci_dev->pipes[pipe].gate;
+	u8 host = ndev->hci_dev->pipes[pipe].host;
+
+	switch (gate) {
+	case NCI_HCI_ADMIN_GATE:
+		st_nci_hci_admin_event_received(ndev, event, skb);
+	break;
+	case ST_NCI_APDU_READER_GATE:
+		st_nci_hci_apdu_reader_event_received(ndev, event, skb);
+	break;
+	case ST_NCI_CONNECTIVITY_GATE:
+		st_nci_hci_connectivity_event_received(ndev, host, event,
+							 skb);
+	break;
+	}
+}
+EXPORT_SYMBOL_GPL(st_nci_hci_event_received);
+
+
+void st_nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, u8 cmd,
+			       struct sk_buff *skb)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+	u8 gate = ndev->hci_dev->pipes[pipe].gate;
+
+	pr_debug("cmd: %x\n", cmd);
+
+	switch (cmd) {
+	case NCI_HCI_ANY_OPEN_PIPE:
+		if (gate != ST_NCI_APDU_READER_GATE &&
+		    ndev->hci_dev->pipes[pipe].host != ST_NCI_UICC_HOST_ID)
+			ndev->hci_dev->count_pipes++;
+
+		if (ndev->hci_dev->count_pipes ==
+		    ndev->hci_dev->expected_pipes) {
+			del_timer_sync(&info->se_info.se_active_timer);
+			info->se_info.se_active = false;
+			ndev->hci_dev->count_pipes = 0;
+			complete(&info->se_info.req_completion);
+		}
+	break;
+	}
+}
+EXPORT_SYMBOL_GPL(st_nci_hci_cmd_received);
+
+/*
+ * Remarks: On some early st_nci firmware, nci_nfcee_mode_set(0)
+ * is rejected
+ */
+static int st_nci_control_se(struct nci_dev *ndev, u8 se_idx,
+				   u8 state)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+	int r;
+	struct sk_buff *sk_host_list;
+	u8 host_id;
+
+	switch (se_idx) {
+	case ST_NCI_UICC_HOST_ID:
+		ndev->hci_dev->count_pipes = 0;
+		ndev->hci_dev->expected_pipes = ST_NCI_SE_COUNT_PIPE_UICC;
+		break;
+	case ST_NCI_ESE_HOST_ID:
+		ndev->hci_dev->count_pipes = 0;
+		ndev->hci_dev->expected_pipes = ST_NCI_SE_COUNT_PIPE_EMBEDDED;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/*
+	 * Wait for an EVT_HOT_PLUG in order to
+	 * retrieve a relevant host list.
+	 */
+	reinit_completion(&info->se_info.req_completion);
+	r = nci_nfcee_mode_set(ndev, se_idx, NCI_NFCEE_ENABLE);
+	if (r != NCI_STATUS_OK)
+		return r;
+
+	mod_timer(&info->se_info.se_active_timer, jiffies +
+		msecs_to_jiffies(ST_NCI_SE_TO_HOT_PLUG));
+	info->se_info.se_active = true;
+
+	/* Ignore return value and check in any case the host_list */
+	wait_for_completion_interruptible(&info->se_info.req_completion);
+
+	/* There might be some "collision" after receiving a HOT_PLUG event
+	 * This may cause the CLF to not answer to the next hci command.
+	 * There is no possible synchronization to prevent this.
+	 * Adding a small delay is the only way to solve the issue.
+	 */
+	usleep_range(3000, 5000);
+
+	r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE,
+			NCI_HCI_ADMIN_PARAM_HOST_LIST, &sk_host_list);
+	if (r != NCI_HCI_ANY_OK)
+		return r;
+
+	host_id = sk_host_list->data[sk_host_list->len - 1];
+	kfree_skb(sk_host_list);
+	if (state == ST_NCI_SE_MODE_ON && host_id == se_idx)
+		return se_idx;
+	else if (state == ST_NCI_SE_MODE_OFF && host_id != se_idx)
+		return se_idx;
+
+	return -1;
+}
+
+int st_nci_disable_se(struct nci_dev *ndev, u32 se_idx)
+{
+	int r;
+
+	pr_debug("st_nci_disable_se\n");
+
+	if (se_idx == NFC_SE_EMBEDDED) {
+		r = nci_hci_send_event(ndev, ST_NCI_APDU_READER_GATE,
+				ST_NCI_EVT_SE_END_OF_APDU_TRANSFER, NULL, 0);
+		if (r < 0)
+			return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(st_nci_disable_se);
+
+int st_nci_enable_se(struct nci_dev *ndev, u32 se_idx)
+{
+	int r;
+
+	pr_debug("st_nci_enable_se\n");
+
+	if (se_idx == ST_NCI_HCI_HOST_ID_ESE) {
+		r = nci_hci_send_event(ndev, ST_NCI_APDU_READER_GATE,
+				ST_NCI_EVT_SE_SOFT_RESET, NULL, 0);
+		if (r < 0)
+			return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(st_nci_enable_se);
+
+static int st_nci_hci_network_init(struct nci_dev *ndev)
+{
+	struct core_conn_create_dest_spec_params *dest_params;
+	struct dest_spec_params spec_params;
+	struct nci_conn_info    *conn_info;
+	int r, dev_num;
+
+	dest_params =
+		kzalloc(sizeof(struct core_conn_create_dest_spec_params) +
+			sizeof(struct dest_spec_params), GFP_KERNEL);
+	if (dest_params == NULL) {
+		r = -ENOMEM;
+		goto exit;
+	}
+
+	dest_params->type = NCI_DESTINATION_SPECIFIC_PARAM_NFCEE_TYPE;
+	dest_params->length = sizeof(struct dest_spec_params);
+	spec_params.id = ndev->hci_dev->nfcee_id;
+	spec_params.protocol = NCI_NFCEE_INTERFACE_HCI_ACCESS;
+	memcpy(dest_params->value, &spec_params,
+	       sizeof(struct dest_spec_params));
+	r = nci_core_conn_create(ndev, NCI_DESTINATION_NFCEE, 1,
+				 sizeof(struct core_conn_create_dest_spec_params) +
+				 sizeof(struct dest_spec_params),
+				 dest_params);
+	if (r != NCI_STATUS_OK)
+		goto free_dest_params;
+
+	conn_info = ndev->hci_dev->conn_info;
+	if (!conn_info)
+		goto free_dest_params;
+
+	memcpy(ndev->hci_dev->init_data.gates, st_nci_gates,
+	       sizeof(st_nci_gates));
+
+	/*
+	 * Session id must include the driver name + i2c bus addr
+	 * persistent info to discriminate 2 identical chips
+	 */
+	dev_num = find_first_zero_bit(dev_mask, ST_NCI_NUM_DEVICES);
+	if (dev_num >= ST_NCI_NUM_DEVICES) {
+		r = -ENODEV;
+		goto free_dest_params;
+	}
+
+	scnprintf(ndev->hci_dev->init_data.session_id,
+		  sizeof(ndev->hci_dev->init_data.session_id),
+		  "%s%2x", "ST21BH", dev_num);
+
+	r = nci_hci_dev_session_init(ndev);
+	if (r != NCI_HCI_ANY_OK)
+		goto free_dest_params;
+
+	r = nci_nfcee_mode_set(ndev, ndev->hci_dev->conn_info->id,
+			       NCI_NFCEE_ENABLE);
+	if (r != NCI_STATUS_OK)
+		goto free_dest_params;
+
+free_dest_params:
+	kfree(dest_params);
+
+exit:
+	return r;
+}
+
+int st_nci_discover_se(struct nci_dev *ndev)
+{
+	u8 param[2];
+	int r;
+	int se_count = 0;
+
+	pr_debug("st_nci_discover_se\n");
+
+	r = st_nci_hci_network_init(ndev);
+	if (r != 0)
+		return r;
+
+	param[0] = ST_NCI_UICC_HOST_ID;
+	param[1] = ST_NCI_HCI_HOST_ID_ESE;
+	r = nci_hci_set_param(ndev, NCI_HCI_ADMIN_GATE,
+				NCI_HCI_ADMIN_PARAM_WHITELIST,
+				param, sizeof(param));
+	if (r != NCI_HCI_ANY_OK)
+		return r;
+
+	r = st_nci_control_se(ndev, ST_NCI_UICC_HOST_ID,
+				ST_NCI_SE_MODE_ON);
+	if (r == ST_NCI_UICC_HOST_ID) {
+		nfc_add_se(ndev->nfc_dev, ST_NCI_UICC_HOST_ID, NFC_SE_UICC);
+		se_count++;
+	}
+
+	/* Try to enable eSE in order to check availability */
+	r = st_nci_control_se(ndev, ST_NCI_HCI_HOST_ID_ESE,
+				ST_NCI_SE_MODE_ON);
+	if (r == ST_NCI_HCI_HOST_ID_ESE) {
+		nfc_add_se(ndev->nfc_dev, ST_NCI_HCI_HOST_ID_ESE,
+			   NFC_SE_EMBEDDED);
+		se_count++;
+		st_nci_se_get_atr(ndev);
+	}
+
+	return !se_count;
+}
+EXPORT_SYMBOL_GPL(st_nci_discover_se);
+
+int st_nci_se_io(struct nci_dev *ndev, u32 se_idx,
+		       u8 *apdu, size_t apdu_length,
+		       se_io_cb_t cb, void *cb_context)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	pr_debug("\n");
+
+	switch (se_idx) {
+	case ST_NCI_HCI_HOST_ID_ESE:
+		info->se_info.cb = cb;
+		info->se_info.cb_context = cb_context;
+		mod_timer(&info->se_info.bwi_timer, jiffies +
+			  msecs_to_jiffies(info->se_info.wt_timeout));
+		info->se_info.bwi_active = true;
+		return nci_hci_send_event(ndev, ST_NCI_APDU_READER_GATE,
+					ST_NCI_EVT_TRANSMIT_DATA, apdu,
+					apdu_length);
+	default:
+		return -ENODEV;
+	}
+}
+EXPORT_SYMBOL(st_nci_se_io);
+
+static void st_nci_se_wt_timeout(unsigned long data)
+{
+	/*
+	 * No answer from the secure element
+	 * within the defined timeout.
+	 * Let's send a reset request as recovery procedure.
+	 * According to the situation, we first try to send a software reset
+	 * to the secure element. If the next command is still not
+	 * answering in time, we send to the CLF a secure element hardware
+	 * reset request.
+	 */
+	/* hardware reset managed through VCC_UICC_OUT power supply */
+	u8 param = 0x01;
+	struct st_nci_info *info = (struct st_nci_info *) data;
+
+	pr_debug("\n");
+
+	info->se_info.bwi_active = false;
+
+	if (!info->se_info.xch_error) {
+		info->se_info.xch_error = true;
+		nci_hci_send_event(info->ndlc->ndev, ST_NCI_APDU_READER_GATE,
+				ST_NCI_EVT_SE_SOFT_RESET, NULL, 0);
+	} else {
+		info->se_info.xch_error = false;
+		nci_hci_send_event(info->ndlc->ndev, ST_NCI_DEVICE_MGNT_GATE,
+				ST_NCI_EVT_SE_HARD_RESET, &param, 1);
+	}
+	info->se_info.cb(info->se_info.cb_context, NULL, 0, -ETIME);
+}
+
+static void st_nci_se_activation_timeout(unsigned long data)
+{
+	struct st_nci_info *info = (struct st_nci_info *) data;
+
+	pr_debug("\n");
+
+	info->se_info.se_active = false;
+
+	complete(&info->se_info.req_completion);
+}
+
+int st_nci_se_init(struct nci_dev *ndev)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	init_completion(&info->se_info.req_completion);
+	/* initialize timers */
+	init_timer(&info->se_info.bwi_timer);
+	info->se_info.bwi_timer.data = (unsigned long)info;
+	info->se_info.bwi_timer.function = st_nci_se_wt_timeout;
+	info->se_info.bwi_active = false;
+
+	init_timer(&info->se_info.se_active_timer);
+	info->se_info.se_active_timer.data = (unsigned long)info;
+	info->se_info.se_active_timer.function =
+			st_nci_se_activation_timeout;
+	info->se_info.se_active = false;
+
+	info->se_info.xch_error = false;
+
+	info->se_info.wt_timeout =
+		ST_NCI_BWI_TO_TIMEOUT(ST_NCI_ATR_DEFAULT_BWI);
+
+	return 0;
+}
+EXPORT_SYMBOL(st_nci_se_init);
+
+void st_nci_se_deinit(struct nci_dev *ndev)
+{
+	struct st_nci_info *info = nci_get_drvdata(ndev);
+
+	if (info->se_info.bwi_active)
+		del_timer_sync(&info->se_info.bwi_timer);
+	if (info->se_info.se_active)
+		del_timer_sync(&info->se_info.se_active_timer);
+
+	info->se_info.se_active = false;
+	info->se_info.bwi_active = false;
+}
+EXPORT_SYMBOL(st_nci_se_deinit);
+
diff --git a/drivers/nfc/st-nci/st-nci_se.h b/drivers/nfc/st-nci/st-nci_se.h
new file mode 100644
index 000000000000..ea66e879d67f
--- /dev/null
+++ b/drivers/nfc/st-nci/st-nci_se.h
@@ -0,0 +1,61 @@
+/*
+ * Secure Element Driver for STMicroelectronics NFC NCI Chip
+ *
+ * Copyright (C) 2014-2015 STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __LOCAL_ST_NCI_SE_H_
+#define __LOCAL_ST_NCI_SE_H_
+
+/*
+ * ref ISO7816-3 chap 8.1. the initial character TS is followed by a
+ * sequence of at most 32 characters.
+ */
+#define ST_NCI_ESE_MAX_LENGTH	33
+#define ST_NCI_HCI_HOST_ID_ESE	0xc0
+
+struct st_nci_se_info {
+	u8 atr[ST_NCI_ESE_MAX_LENGTH];
+	struct completion req_completion;
+
+	struct timer_list bwi_timer;
+	int wt_timeout; /* in msecs */
+	bool bwi_active;
+
+	struct timer_list se_active_timer;
+	bool se_active;
+
+	bool xch_error;
+
+	se_io_cb_t cb;
+	void *cb_context;
+};
+
+int st_nci_se_init(struct nci_dev *ndev);
+void st_nci_se_deinit(struct nci_dev *ndev);
+
+int st_nci_discover_se(struct nci_dev *ndev);
+int st_nci_enable_se(struct nci_dev *ndev, u32 se_idx);
+int st_nci_disable_se(struct nci_dev *ndev, u32 se_idx);
+int st_nci_se_io(struct nci_dev *ndev, u32 se_idx,
+		u8 *apdu, size_t apdu_length,
+		se_io_cb_t cb, void *cb_context);
+int st_nci_hci_load_session(struct nci_dev *ndev);
+void st_nci_hci_event_received(struct nci_dev *ndev, u8 pipe,
+			u8 event, struct sk_buff *skb);
+void st_nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, u8 cmd,
+			struct sk_buff *skb);
+
+
+#endif /* __LOCAL_ST_NCI_SE_H_ */
diff --git a/drivers/nfc/st21nfcb/Kconfig b/drivers/nfc/st21nfcb/Kconfig
deleted file mode 100644
index e0322dd03a70..000000000000
--- a/drivers/nfc/st21nfcb/Kconfig
+++ /dev/null
@@ -1,22 +0,0 @@
-config NFC_ST21NFCB
-	tristate "STMicroelectronics ST21NFCB NFC driver"
-	depends on NFC_NCI
-	default n
-	---help---
-	  STMicroelectronics ST21NFCB core driver. It implements the chipset
-	  NCI logic and hooks into the NFC kernel APIs. Physical layers will
-	  register against it.
-
-	  To compile this driver as a module, choose m here. The module will
-	  be called st21nfcb.
-	  Say N if unsure.
-
-config NFC_ST21NFCB_I2C
-	tristate "NFC ST21NFCB i2c support"
-	depends on NFC_ST21NFCB && I2C
-	---help---
-	  This module adds support for the STMicroelectronics st21nfcb i2c interface.
-	  Select this if your platform is using the i2c bus.
-
-	  If you choose to build a module, it'll be called st21nfcb_i2c.
-	  Say N if unsure.
diff --git a/drivers/nfc/st21nfcb/Makefile b/drivers/nfc/st21nfcb/Makefile
deleted file mode 100644
index ce659a9e5a1a..000000000000
--- a/drivers/nfc/st21nfcb/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for ST21NFCB NCI based NFC driver
-#
-
-st21nfcb_nci-objs = ndlc.o st21nfcb.o st21nfcb_se.o
-obj-$(CONFIG_NFC_ST21NFCB)     += st21nfcb_nci.o
-
-st21nfcb_i2c-objs = i2c.o
-obj-$(CONFIG_NFC_ST21NFCB_I2C) += st21nfcb_i2c.o
diff --git a/drivers/nfc/st21nfcb/i2c.c b/drivers/nfc/st21nfcb/i2c.c
deleted file mode 100644
index dbc0dfd8ae85..000000000000
--- a/drivers/nfc/st21nfcb/i2c.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * I2C Link Layer for ST21NFCB NCI based Driver
- * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/i2c.h>
-#include <linux/gpio.h>
-#include <linux/of_irq.h>
-#include <linux/of_gpio.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/nfc.h>
-#include <linux/platform_data/st21nfcb.h>
-
-#include "ndlc.h"
-
-#define DRIVER_DESC "NCI NFC driver for ST21NFCB"
-
-/* ndlc header */
-#define ST21NFCB_FRAME_HEADROOM	1
-#define ST21NFCB_FRAME_TAILROOM 0
-
-#define ST21NFCB_NCI_I2C_MIN_SIZE 4   /* PCB(1) + NCI Packet header(3) */
-#define ST21NFCB_NCI_I2C_MAX_SIZE 250 /* req 4.2.1 */
-
-#define ST21NFCB_NCI_I2C_DRIVER_NAME "st21nfcb_nci_i2c"
-
-static struct i2c_device_id st21nfcb_nci_i2c_id_table[] = {
-	{ST21NFCB_NCI_DRIVER_NAME, 0},
-	{}
-};
-MODULE_DEVICE_TABLE(i2c, st21nfcb_nci_i2c_id_table);
-
-struct st21nfcb_i2c_phy {
-	struct i2c_client *i2c_dev;
-	struct llt_ndlc *ndlc;
-
-	unsigned int gpio_reset;
-	unsigned int irq_polarity;
-};
-
-#define I2C_DUMP_SKB(info, skb)					\
-do {								\
-	pr_debug("%s:\n", info);				\
-	print_hex_dump(KERN_DEBUG, "i2c: ", DUMP_PREFIX_OFFSET,	\
-		       16, 1, (skb)->data, (skb)->len, 0);	\
-} while (0)
-
-static int st21nfcb_nci_i2c_enable(void *phy_id)
-{
-	struct st21nfcb_i2c_phy *phy = phy_id;
-
-	gpio_set_value(phy->gpio_reset, 0);
-	usleep_range(10000, 15000);
-	gpio_set_value(phy->gpio_reset, 1);
-	usleep_range(80000, 85000);
-
-	if (phy->ndlc->powered == 0)
-		enable_irq(phy->i2c_dev->irq);
-
-	return 0;
-}
-
-static void st21nfcb_nci_i2c_disable(void *phy_id)
-{
-	struct st21nfcb_i2c_phy *phy = phy_id;
-
-	disable_irq_nosync(phy->i2c_dev->irq);
-}
-
-/*
- * Writing a frame must not return the number of written bytes.
- * It must return either zero for success, or <0 for error.
- * In addition, it must not alter the skb
- */
-static int st21nfcb_nci_i2c_write(void *phy_id, struct sk_buff *skb)
-{
-	int r = -1;
-	struct st21nfcb_i2c_phy *phy = phy_id;
-	struct i2c_client *client = phy->i2c_dev;
-
-	I2C_DUMP_SKB("st21nfcb_nci_i2c_write", skb);
-
-	if (phy->ndlc->hard_fault != 0)
-		return phy->ndlc->hard_fault;
-
-	r = i2c_master_send(client, skb->data, skb->len);
-	if (r < 0) {  /* Retry, chip was in standby */
-		usleep_range(1000, 4000);
-		r = i2c_master_send(client, skb->data, skb->len);
-	}
-
-	if (r >= 0) {
-		if (r != skb->len)
-			r = -EREMOTEIO;
-		else
-			r = 0;
-	}
-
-	return r;
-}
-
-/*
- * Reads an ndlc frame and returns it in a newly allocated sk_buff.
- * returns:
- * frame size : if received frame is complete (find ST21NFCB_SOF_EOF at
- * end of read)
- * -EAGAIN : if received frame is incomplete (not find ST21NFCB_SOF_EOF
- * at end of read)
- * -EREMOTEIO : i2c read error (fatal)
- * -EBADMSG : frame was incorrect and discarded
- * (value returned from st21nfcb_nci_i2c_repack)
- * -EIO : if no ST21NFCB_SOF_EOF is found after reaching
- * the read length end sequence
- */
-static int st21nfcb_nci_i2c_read(struct st21nfcb_i2c_phy *phy,
-				 struct sk_buff **skb)
-{
-	int r;
-	u8 len;
-	u8 buf[ST21NFCB_NCI_I2C_MAX_SIZE];
-	struct i2c_client *client = phy->i2c_dev;
-
-	r = i2c_master_recv(client, buf, ST21NFCB_NCI_I2C_MIN_SIZE);
-	if (r < 0) {  /* Retry, chip was in standby */
-		usleep_range(1000, 4000);
-		r = i2c_master_recv(client, buf, ST21NFCB_NCI_I2C_MIN_SIZE);
-	}
-
-	if (r != ST21NFCB_NCI_I2C_MIN_SIZE)
-		return -EREMOTEIO;
-
-	len = be16_to_cpu(*(__be16 *) (buf + 2));
-	if (len > ST21NFCB_NCI_I2C_MAX_SIZE) {
-		nfc_err(&client->dev, "invalid frame len\n");
-		return -EBADMSG;
-	}
-
-	*skb = alloc_skb(ST21NFCB_NCI_I2C_MIN_SIZE + len, GFP_KERNEL);
-	if (*skb == NULL)
-		return -ENOMEM;
-
-	skb_reserve(*skb, ST21NFCB_NCI_I2C_MIN_SIZE);
-	skb_put(*skb, ST21NFCB_NCI_I2C_MIN_SIZE);
-	memcpy((*skb)->data, buf, ST21NFCB_NCI_I2C_MIN_SIZE);
-
-	if (!len)
-		return 0;
-
-	r = i2c_master_recv(client, buf, len);
-	if (r != len) {
-		kfree_skb(*skb);
-		return -EREMOTEIO;
-	}
-
-	skb_put(*skb, len);
-	memcpy((*skb)->data + ST21NFCB_NCI_I2C_MIN_SIZE, buf, len);
-
-	I2C_DUMP_SKB("i2c frame read", *skb);
-
-	return 0;
-}
-
-/*
- * Reads an ndlc frame from the chip.
- *
- * On ST21NFCB, IRQ goes in idle state when read starts.
- */
-static irqreturn_t st21nfcb_nci_irq_thread_fn(int irq, void *phy_id)
-{
-	struct st21nfcb_i2c_phy *phy = phy_id;
-	struct i2c_client *client;
-	struct sk_buff *skb = NULL;
-	int r;
-
-	if (!phy || !phy->ndlc || irq != phy->i2c_dev->irq) {
-		WARN_ON_ONCE(1);
-		return IRQ_NONE;
-	}
-
-	client = phy->i2c_dev;
-	dev_dbg(&client->dev, "IRQ\n");
-
-	if (phy->ndlc->hard_fault)
-		return IRQ_HANDLED;
-
-	if (!phy->ndlc->powered) {
-		st21nfcb_nci_i2c_disable(phy);
-		return IRQ_HANDLED;
-	}
-
-	r = st21nfcb_nci_i2c_read(phy, &skb);
-	if (r == -EREMOTEIO || r == -ENOMEM || r == -EBADMSG)
-		return IRQ_HANDLED;
-
-	ndlc_recv(phy->ndlc, skb);
-
-	return IRQ_HANDLED;
-}
-
-static struct nfc_phy_ops i2c_phy_ops = {
-	.write = st21nfcb_nci_i2c_write,
-	.enable = st21nfcb_nci_i2c_enable,
-	.disable = st21nfcb_nci_i2c_disable,
-};
-
-#ifdef CONFIG_OF
-static int st21nfcb_nci_i2c_of_request_resources(struct i2c_client *client)
-{
-	struct st21nfcb_i2c_phy *phy = i2c_get_clientdata(client);
-	struct device_node *pp;
-	int gpio;
-	int r;
-
-	pp = client->dev.of_node;
-	if (!pp)
-		return -ENODEV;
-
-	/* Get GPIO from device tree */
-	gpio = of_get_named_gpio(pp, "reset-gpios", 0);
-	if (gpio < 0) {
-		nfc_err(&client->dev,
-			"Failed to retrieve reset-gpios from device tree\n");
-		return gpio;
-	}
-
-	/* GPIO request and configuration */
-	r = devm_gpio_request_one(&client->dev, gpio,
-				GPIOF_OUT_INIT_HIGH, "clf_reset");
-	if (r) {
-		nfc_err(&client->dev, "Failed to request reset pin\n");
-		return r;
-	}
-	phy->gpio_reset = gpio;
-
-	phy->irq_polarity = irq_get_trigger_type(client->irq);
-
-	return 0;
-}
-#else
-static int st21nfcb_nci_i2c_of_request_resources(struct i2c_client *client)
-{
-	return -ENODEV;
-}
-#endif
-
-static int st21nfcb_nci_i2c_request_resources(struct i2c_client *client)
-{
-	struct st21nfcb_nfc_platform_data *pdata;
-	struct st21nfcb_i2c_phy *phy = i2c_get_clientdata(client);
-	int r;
-
-	pdata = client->dev.platform_data;
-	if (pdata == NULL) {
-		nfc_err(&client->dev, "No platform data\n");
-		return -EINVAL;
-	}
-
-	/* store for later use */
-	phy->gpio_reset = pdata->gpio_reset;
-	phy->irq_polarity = pdata->irq_polarity;
-
-	r = devm_gpio_request_one(&client->dev,
-			phy->gpio_reset, GPIOF_OUT_INIT_HIGH, "clf_reset");
-	if (r) {
-		pr_err("%s : reset gpio_request failed\n", __FILE__);
-		return r;
-	}
-
-	return 0;
-}
-
-static int st21nfcb_nci_i2c_probe(struct i2c_client *client,
-				  const struct i2c_device_id *id)
-{
-	struct st21nfcb_i2c_phy *phy;
-	struct st21nfcb_nfc_platform_data *pdata;
-	int r;
-
-	dev_dbg(&client->dev, "%s\n", __func__);
-	dev_dbg(&client->dev, "IRQ: %d\n", client->irq);
-
-	if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) {
-		nfc_err(&client->dev, "Need I2C_FUNC_I2C\n");
-		return -ENODEV;
-	}
-
-	phy = devm_kzalloc(&client->dev, sizeof(struct st21nfcb_i2c_phy),
-			   GFP_KERNEL);
-	if (!phy)
-		return -ENOMEM;
-
-	phy->i2c_dev = client;
-
-	i2c_set_clientdata(client, phy);
-
-	pdata = client->dev.platform_data;
-	if (!pdata && client->dev.of_node) {
-		r = st21nfcb_nci_i2c_of_request_resources(client);
-		if (r) {
-			nfc_err(&client->dev, "No platform data\n");
-			return r;
-		}
-	} else if (pdata) {
-		r = st21nfcb_nci_i2c_request_resources(client);
-		if (r) {
-			nfc_err(&client->dev,
-				"Cannot get platform resources\n");
-			return r;
-		}
-	} else {
-		nfc_err(&client->dev,
-			"st21nfcb platform resources not available\n");
-		return -ENODEV;
-	}
-
-	r = ndlc_probe(phy, &i2c_phy_ops, &client->dev,
-			ST21NFCB_FRAME_HEADROOM, ST21NFCB_FRAME_TAILROOM,
-			&phy->ndlc);
-	if (r < 0) {
-		nfc_err(&client->dev, "Unable to register ndlc layer\n");
-		return r;
-	}
-
-	r = devm_request_threaded_irq(&client->dev, client->irq, NULL,
-				st21nfcb_nci_irq_thread_fn,
-				phy->irq_polarity | IRQF_ONESHOT,
-				ST21NFCB_NCI_DRIVER_NAME, phy);
-	if (r < 0)
-		nfc_err(&client->dev, "Unable to register IRQ handler\n");
-
-	return r;
-}
-
-static int st21nfcb_nci_i2c_remove(struct i2c_client *client)
-{
-	struct st21nfcb_i2c_phy *phy = i2c_get_clientdata(client);
-
-	dev_dbg(&client->dev, "%s\n", __func__);
-
-	ndlc_remove(phy->ndlc);
-
-	return 0;
-}
-
-#ifdef CONFIG_OF
-static const struct of_device_id of_st21nfcb_i2c_match[] = {
-	{ .compatible = "st,st21nfcb-i2c", },
-	{ .compatible = "st,st21nfcb_i2c", },
-	{}
-};
-MODULE_DEVICE_TABLE(of, of_st21nfcb_i2c_match);
-#endif
-
-static struct i2c_driver st21nfcb_nci_i2c_driver = {
-	.driver = {
-		.owner = THIS_MODULE,
-		.name = ST21NFCB_NCI_I2C_DRIVER_NAME,
-		.of_match_table = of_match_ptr(of_st21nfcb_i2c_match),
-	},
-	.probe = st21nfcb_nci_i2c_probe,
-	.id_table = st21nfcb_nci_i2c_id_table,
-	.remove = st21nfcb_nci_i2c_remove,
-};
-
-module_i2c_driver(st21nfcb_nci_i2c_driver);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/nfc/st21nfcb/ndlc.c b/drivers/nfc/st21nfcb/ndlc.c
deleted file mode 100644
index 91e81f37b3a6..000000000000
--- a/drivers/nfc/st21nfcb/ndlc.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Low Level Transport (NDLC) Driver for STMicroelectronics NFC Chip
- *
- * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/sched.h>
-#include <net/nfc/nci_core.h>
-
-#include "ndlc.h"
-#include "st21nfcb.h"
-
-#define NDLC_TIMER_T1		100
-#define NDLC_TIMER_T1_WAIT	400
-#define NDLC_TIMER_T2		1200
-
-#define PCB_TYPE_DATAFRAME		0x80
-#define PCB_TYPE_SUPERVISOR		0xc0
-#define PCB_TYPE_MASK			PCB_TYPE_SUPERVISOR
-
-#define PCB_SYNC_ACK			0x20
-#define PCB_SYNC_NACK			0x10
-#define PCB_SYNC_WAIT			0x30
-#define PCB_SYNC_NOINFO			0x00
-#define PCB_SYNC_MASK			PCB_SYNC_WAIT
-
-#define PCB_DATAFRAME_RETRANSMIT_YES	0x00
-#define PCB_DATAFRAME_RETRANSMIT_NO	0x04
-#define PCB_DATAFRAME_RETRANSMIT_MASK	PCB_DATAFRAME_RETRANSMIT_NO
-
-#define PCB_SUPERVISOR_RETRANSMIT_YES	0x00
-#define PCB_SUPERVISOR_RETRANSMIT_NO	0x02
-#define PCB_SUPERVISOR_RETRANSMIT_MASK	PCB_SUPERVISOR_RETRANSMIT_NO
-
-#define PCB_FRAME_CRC_INFO_PRESENT	0x08
-#define PCB_FRAME_CRC_INFO_NOTPRESENT	0x00
-#define PCB_FRAME_CRC_INFO_MASK		PCB_FRAME_CRC_INFO_PRESENT
-
-#define NDLC_DUMP_SKB(info, skb)                                 \
-do {                                                             \
-	pr_debug("%s:\n", info);                                 \
-	print_hex_dump(KERN_DEBUG, "ndlc: ", DUMP_PREFIX_OFFSET, \
-			16, 1, skb->data, skb->len, 0);          \
-} while (0)
-
-int ndlc_open(struct llt_ndlc *ndlc)
-{
-	/* toggle reset pin */
-	ndlc->ops->enable(ndlc->phy_id);
-	ndlc->powered = 1;
-	return 0;
-}
-EXPORT_SYMBOL(ndlc_open);
-
-void ndlc_close(struct llt_ndlc *ndlc)
-{
-	struct nci_mode_set_cmd cmd;
-
-	cmd.cmd_type = ST21NFCB_NCI_SET_NFC_MODE;
-	cmd.mode = 0;
-
-	/* toggle reset pin */
-	ndlc->ops->enable(ndlc->phy_id);
-
-	nci_prop_cmd(ndlc->ndev, ST21NFCB_NCI_CORE_PROP,
-		     sizeof(struct nci_mode_set_cmd), (__u8 *)&cmd);
-
-	ndlc->powered = 0;
-	ndlc->ops->disable(ndlc->phy_id);
-}
-EXPORT_SYMBOL(ndlc_close);
-
-int ndlc_send(struct llt_ndlc *ndlc, struct sk_buff *skb)
-{
-	/* add ndlc header */
-	u8 pcb = PCB_TYPE_DATAFRAME | PCB_DATAFRAME_RETRANSMIT_NO |
-		PCB_FRAME_CRC_INFO_NOTPRESENT;
-
-	*skb_push(skb, 1) = pcb;
-	skb_queue_tail(&ndlc->send_q, skb);
-
-	schedule_work(&ndlc->sm_work);
-
-	return 0;
-}
-EXPORT_SYMBOL(ndlc_send);
-
-static void llt_ndlc_send_queue(struct llt_ndlc *ndlc)
-{
-	struct sk_buff *skb;
-	int r;
-	unsigned long time_sent;
-
-	if (ndlc->send_q.qlen)
-		pr_debug("sendQlen=%d unackQlen=%d\n",
-			 ndlc->send_q.qlen, ndlc->ack_pending_q.qlen);
-
-	while (ndlc->send_q.qlen) {
-		skb = skb_dequeue(&ndlc->send_q);
-		NDLC_DUMP_SKB("ndlc frame written", skb);
-		r = ndlc->ops->write(ndlc->phy_id, skb);
-		if (r < 0) {
-			ndlc->hard_fault = r;
-			break;
-		}
-		time_sent = jiffies;
-		*(unsigned long *)skb->cb = time_sent;
-
-		skb_queue_tail(&ndlc->ack_pending_q, skb);
-
-		/* start timer t1 for ndlc aknowledge */
-		ndlc->t1_active = true;
-		mod_timer(&ndlc->t1_timer, time_sent +
-			msecs_to_jiffies(NDLC_TIMER_T1));
-		/* start timer t2 for chip availability */
-		ndlc->t2_active = true;
-		mod_timer(&ndlc->t2_timer, time_sent +
-			msecs_to_jiffies(NDLC_TIMER_T2));
-	}
-}
-
-static void llt_ndlc_requeue_data_pending(struct llt_ndlc *ndlc)
-{
-	struct sk_buff *skb;
-	u8 pcb;
-
-	while ((skb = skb_dequeue_tail(&ndlc->ack_pending_q))) {
-		pcb = skb->data[0];
-		switch (pcb & PCB_TYPE_MASK) {
-		case PCB_TYPE_SUPERVISOR:
-			skb->data[0] = (pcb & ~PCB_SUPERVISOR_RETRANSMIT_MASK) |
-				PCB_SUPERVISOR_RETRANSMIT_YES;
-			break;
-		case PCB_TYPE_DATAFRAME:
-			skb->data[0] = (pcb & ~PCB_DATAFRAME_RETRANSMIT_MASK) |
-				PCB_DATAFRAME_RETRANSMIT_YES;
-			break;
-		default:
-			pr_err("UNKNOWN Packet Control Byte=%d\n", pcb);
-			kfree_skb(skb);
-			continue;
-		}
-		skb_queue_head(&ndlc->send_q, skb);
-	}
-}
-
-static void llt_ndlc_rcv_queue(struct llt_ndlc *ndlc)
-{
-	struct sk_buff *skb;
-	u8 pcb;
-	unsigned long time_sent;
-
-	if (ndlc->rcv_q.qlen)
-		pr_debug("rcvQlen=%d\n", ndlc->rcv_q.qlen);
-
-	while ((skb = skb_dequeue(&ndlc->rcv_q)) != NULL) {
-		pcb = skb->data[0];
-		skb_pull(skb, 1);
-		if ((pcb & PCB_TYPE_MASK) == PCB_TYPE_SUPERVISOR) {
-			switch (pcb & PCB_SYNC_MASK) {
-			case PCB_SYNC_ACK:
-				del_timer_sync(&ndlc->t1_timer);
-				del_timer_sync(&ndlc->t2_timer);
-				ndlc->t2_active = false;
-				ndlc->t1_active = false;
-				break;
-			case PCB_SYNC_NACK:
-				llt_ndlc_requeue_data_pending(ndlc);
-				llt_ndlc_send_queue(ndlc);
-				/* start timer t1 for ndlc aknowledge */
-				time_sent = jiffies;
-				ndlc->t1_active = true;
-				mod_timer(&ndlc->t1_timer, time_sent +
-					msecs_to_jiffies(NDLC_TIMER_T1));
-				break;
-			case PCB_SYNC_WAIT:
-				time_sent = jiffies;
-				ndlc->t1_active = true;
-				mod_timer(&ndlc->t1_timer, time_sent +
-					  msecs_to_jiffies(NDLC_TIMER_T1_WAIT));
-				break;
-			default:
-				pr_err("UNKNOWN Packet Control Byte=%d\n", pcb);
-				kfree_skb(skb);
-				break;
-			}
-		} else {
-			nci_recv_frame(ndlc->ndev, skb);
-		}
-	}
-}
-
-static void llt_ndlc_sm_work(struct work_struct *work)
-{
-	struct llt_ndlc *ndlc = container_of(work, struct llt_ndlc, sm_work);
-
-	llt_ndlc_send_queue(ndlc);
-	llt_ndlc_rcv_queue(ndlc);
-
-	if (ndlc->t1_active && timer_pending(&ndlc->t1_timer) == 0) {
-		pr_debug
-		    ("Handle T1(recv SUPERVISOR) elapsed (T1 now inactive)\n");
-		ndlc->t1_active = false;
-
-		llt_ndlc_requeue_data_pending(ndlc);
-		llt_ndlc_send_queue(ndlc);
-	}
-
-	if (ndlc->t2_active && timer_pending(&ndlc->t2_timer) == 0) {
-		pr_debug("Handle T2(recv DATA) elapsed (T2 now inactive)\n");
-		ndlc->t2_active = false;
-		ndlc->t1_active = false;
-		del_timer_sync(&ndlc->t1_timer);
-		del_timer_sync(&ndlc->t2_timer);
-		ndlc_close(ndlc);
-		ndlc->hard_fault = -EREMOTEIO;
-	}
-}
-
-void ndlc_recv(struct llt_ndlc *ndlc, struct sk_buff *skb)
-{
-	if (skb == NULL) {
-		pr_err("NULL Frame -> link is dead\n");
-		ndlc->hard_fault = -EREMOTEIO;
-		ndlc_close(ndlc);
-	} else {
-		NDLC_DUMP_SKB("incoming frame", skb);
-		skb_queue_tail(&ndlc->rcv_q, skb);
-	}
-
-	schedule_work(&ndlc->sm_work);
-}
-EXPORT_SYMBOL(ndlc_recv);
-
-static void ndlc_t1_timeout(unsigned long data)
-{
-	struct llt_ndlc *ndlc = (struct llt_ndlc *)data;
-
-	pr_debug("\n");
-
-	schedule_work(&ndlc->sm_work);
-}
-
-static void ndlc_t2_timeout(unsigned long data)
-{
-	struct llt_ndlc *ndlc = (struct llt_ndlc *)data;
-
-	pr_debug("\n");
-
-	schedule_work(&ndlc->sm_work);
-}
-
-int ndlc_probe(void *phy_id, struct nfc_phy_ops *phy_ops, struct device *dev,
-	       int phy_headroom, int phy_tailroom, struct llt_ndlc **ndlc_id)
-{
-	struct llt_ndlc *ndlc;
-
-	ndlc = devm_kzalloc(dev, sizeof(struct llt_ndlc), GFP_KERNEL);
-	if (!ndlc)
-		return -ENOMEM;
-
-	ndlc->ops = phy_ops;
-	ndlc->phy_id = phy_id;
-	ndlc->dev = dev;
-	ndlc->powered = 0;
-
-	*ndlc_id = ndlc;
-
-	/* initialize timers */
-	init_timer(&ndlc->t1_timer);
-	ndlc->t1_timer.data = (unsigned long)ndlc;
-	ndlc->t1_timer.function = ndlc_t1_timeout;
-
-	init_timer(&ndlc->t2_timer);
-	ndlc->t2_timer.data = (unsigned long)ndlc;
-	ndlc->t2_timer.function = ndlc_t2_timeout;
-
-	skb_queue_head_init(&ndlc->rcv_q);
-	skb_queue_head_init(&ndlc->send_q);
-	skb_queue_head_init(&ndlc->ack_pending_q);
-
-	INIT_WORK(&ndlc->sm_work, llt_ndlc_sm_work);
-
-	return st21nfcb_nci_probe(ndlc, phy_headroom, phy_tailroom);
-}
-EXPORT_SYMBOL(ndlc_probe);
-
-void ndlc_remove(struct llt_ndlc *ndlc)
-{
-	st21nfcb_nci_remove(ndlc->ndev);
-
-	/* cancel timers */
-	del_timer_sync(&ndlc->t1_timer);
-	del_timer_sync(&ndlc->t2_timer);
-	ndlc->t2_active = false;
-	ndlc->t1_active = false;
-
-	skb_queue_purge(&ndlc->rcv_q);
-	skb_queue_purge(&ndlc->send_q);
-}
-EXPORT_SYMBOL(ndlc_remove);
diff --git a/drivers/nfc/st21nfcb/ndlc.h b/drivers/nfc/st21nfcb/ndlc.h
deleted file mode 100644
index cf6a9d9f2983..000000000000
--- a/drivers/nfc/st21nfcb/ndlc.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * NCI based Driver for STMicroelectronics NFC Chip
- *
- * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __LOCAL_NDLC_H_
-#define __LOCAL_NDLC_H_
-
-#include <linux/skbuff.h>
-#include <net/nfc/nfc.h>
-
-/* Low Level Transport description */
-struct llt_ndlc {
-	struct nci_dev *ndev;
-	struct nfc_phy_ops *ops;
-	void *phy_id;
-
-	struct timer_list t1_timer;
-	bool t1_active;
-
-	struct timer_list t2_timer;
-	bool t2_active;
-
-	struct sk_buff_head rcv_q;
-	struct sk_buff_head send_q;
-	struct sk_buff_head ack_pending_q;
-
-	struct work_struct sm_work;
-
-	struct device *dev;
-
-	/*
-	 * < 0 if hardware error occured
-	 * and prevents normal operation.
-	 */
-	int hard_fault;
-	int powered;
-};
-
-int ndlc_open(struct llt_ndlc *ndlc);
-void ndlc_close(struct llt_ndlc *ndlc);
-int ndlc_send(struct llt_ndlc *ndlc, struct sk_buff *skb);
-void ndlc_recv(struct llt_ndlc *ndlc, struct sk_buff *skb);
-int ndlc_probe(void *phy_id, struct nfc_phy_ops *phy_ops, struct device *dev,
-	int phy_headroom, int phy_tailroom, struct llt_ndlc **ndlc_id);
-void ndlc_remove(struct llt_ndlc *ndlc);
-#endif /* __LOCAL_NDLC_H__ */
diff --git a/drivers/nfc/st21nfcb/st21nfcb.c b/drivers/nfc/st21nfcb/st21nfcb.c
deleted file mode 100644
index a16c3a3d3fff..000000000000
--- a/drivers/nfc/st21nfcb/st21nfcb.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * NCI based Driver for STMicroelectronics NFC Chip
- *
- * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/module.h>
-#include <linux/nfc.h>
-#include <net/nfc/nci.h>
-#include <net/nfc/nci_core.h>
-#include <linux/gpio.h>
-#include <linux/delay.h>
-
-#include "st21nfcb.h"
-#include "st21nfcb_se.h"
-
-#define DRIVER_DESC "NCI NFC driver for ST21NFCB"
-
-#define ST21NFCB_NCI1_X_PROPRIETARY_ISO15693 0x83
-
-static int st21nfcb_nci_init(struct nci_dev *ndev)
-{
-	struct nci_mode_set_cmd cmd;
-
-	cmd.cmd_type = ST21NFCB_NCI_SET_NFC_MODE;
-	cmd.mode = 1;
-
-	return nci_prop_cmd(ndev, ST21NFCB_NCI_CORE_PROP,
-			sizeof(struct nci_mode_set_cmd), (__u8 *)&cmd);
-}
-
-static int st21nfcb_nci_open(struct nci_dev *ndev)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-	int r;
-
-	if (test_and_set_bit(ST21NFCB_NCI_RUNNING, &info->flags))
-		return 0;
-
-	r = ndlc_open(info->ndlc);
-	if (r)
-		clear_bit(ST21NFCB_NCI_RUNNING, &info->flags);
-
-	return r;
-}
-
-static int st21nfcb_nci_close(struct nci_dev *ndev)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	if (!test_bit(ST21NFCB_NCI_RUNNING, &info->flags))
-		return 0;
-
-	ndlc_close(info->ndlc);
-
-	clear_bit(ST21NFCB_NCI_RUNNING, &info->flags);
-
-	return 0;
-}
-
-static int st21nfcb_nci_send(struct nci_dev *ndev, struct sk_buff *skb)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	skb->dev = (void *)ndev;
-
-	if (!test_bit(ST21NFCB_NCI_RUNNING, &info->flags))
-		return -EBUSY;
-
-	return ndlc_send(info->ndlc, skb);
-}
-
-static __u32 st21nfcb_nci_get_rfprotocol(struct nci_dev *ndev,
-					 __u8 rf_protocol)
-{
-	return rf_protocol == ST21NFCB_NCI1_X_PROPRIETARY_ISO15693 ?
-		NFC_PROTO_ISO15693_MASK : 0;
-}
-
-static int st21nfcb_nci_prop_rsp_packet(struct nci_dev *ndev,
-					struct sk_buff *skb)
-{
-	__u8 status = skb->data[0];
-
-	nci_req_complete(ndev, status);
-	return 0;
-}
-
-static struct nci_prop_ops st21nfcb_nci_prop_ops[] = {
-	{
-		.opcode = nci_opcode_pack(NCI_GID_PROPRIETARY,
-					  ST21NFCB_NCI_CORE_PROP),
-		.rsp = st21nfcb_nci_prop_rsp_packet,
-	},
-};
-
-static struct nci_ops st21nfcb_nci_ops = {
-	.init = st21nfcb_nci_init,
-	.open = st21nfcb_nci_open,
-	.close = st21nfcb_nci_close,
-	.send = st21nfcb_nci_send,
-	.get_rfprotocol = st21nfcb_nci_get_rfprotocol,
-	.discover_se = st21nfcb_nci_discover_se,
-	.enable_se = st21nfcb_nci_enable_se,
-	.disable_se = st21nfcb_nci_disable_se,
-	.se_io = st21nfcb_nci_se_io,
-	.hci_load_session = st21nfcb_hci_load_session,
-	.hci_event_received = st21nfcb_hci_event_received,
-	.hci_cmd_received = st21nfcb_hci_cmd_received,
-	.prop_ops = st21nfcb_nci_prop_ops,
-	.n_prop_ops = ARRAY_SIZE(st21nfcb_nci_prop_ops),
-};
-
-int st21nfcb_nci_probe(struct llt_ndlc *ndlc, int phy_headroom,
-		       int phy_tailroom)
-{
-	struct st21nfcb_nci_info *info;
-	int r;
-	u32 protocols;
-
-	info = devm_kzalloc(ndlc->dev,
-			sizeof(struct st21nfcb_nci_info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	protocols = NFC_PROTO_JEWEL_MASK
-		| NFC_PROTO_MIFARE_MASK
-		| NFC_PROTO_FELICA_MASK
-		| NFC_PROTO_ISO14443_MASK
-		| NFC_PROTO_ISO14443_B_MASK
-		| NFC_PROTO_ISO15693_MASK
-		| NFC_PROTO_NFC_DEP_MASK;
-
-	ndlc->ndev = nci_allocate_device(&st21nfcb_nci_ops, protocols,
-					phy_headroom, phy_tailroom);
-	if (!ndlc->ndev) {
-		pr_err("Cannot allocate nfc ndev\n");
-		return -ENOMEM;
-	}
-	info->ndlc = ndlc;
-
-	nci_set_drvdata(ndlc->ndev, info);
-
-	r = nci_register_device(ndlc->ndev);
-	if (r) {
-		pr_err("Cannot register nfc device to nci core\n");
-		nci_free_device(ndlc->ndev);
-		return r;
-	}
-
-	return st21nfcb_se_init(ndlc->ndev);
-}
-EXPORT_SYMBOL_GPL(st21nfcb_nci_probe);
-
-void st21nfcb_nci_remove(struct nci_dev *ndev)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	ndlc_close(info->ndlc);
-
-	nci_unregister_device(ndev);
-	nci_free_device(ndev);
-}
-EXPORT_SYMBOL_GPL(st21nfcb_nci_remove);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/nfc/st21nfcb/st21nfcb.h b/drivers/nfc/st21nfcb/st21nfcb.h
deleted file mode 100644
index 710636325c1f..000000000000
--- a/drivers/nfc/st21nfcb/st21nfcb.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * NCI based Driver for STMicroelectronics NFC Chip
- *
- * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __LOCAL_ST21NFCB_H_
-#define __LOCAL_ST21NFCB_H_
-
-#include "st21nfcb_se.h"
-#include "ndlc.h"
-
-/* Define private flags: */
-#define ST21NFCB_NCI_RUNNING			1
-
-#define ST21NFCB_NCI_CORE_PROP                0x01
-#define ST21NFCB_NCI_SET_NFC_MODE             0x02
-
-struct nci_mode_set_cmd {
-	u8 cmd_type;
-	u8 mode;
-} __packed;
-
-struct nci_mode_set_rsp {
-	u8 status;
-} __packed;
-
-struct st21nfcb_nci_info {
-	struct llt_ndlc *ndlc;
-	unsigned long flags;
-	struct st21nfcb_se_info se_info;
-};
-
-void st21nfcb_nci_remove(struct nci_dev *ndev);
-int st21nfcb_nci_probe(struct llt_ndlc *ndlc, int phy_headroom,
-		int phy_tailroom);
-
-#endif /* __LOCAL_ST21NFCB_H_ */
diff --git a/drivers/nfc/st21nfcb/st21nfcb_se.c b/drivers/nfc/st21nfcb/st21nfcb_se.c
deleted file mode 100644
index 24862a525fb5..000000000000
--- a/drivers/nfc/st21nfcb/st21nfcb_se.c
+++ /dev/null
@@ -1,713 +0,0 @@
-/*
- * NCI based Driver for STMicroelectronics NFC Chip
- *
- * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/module.h>
-#include <linux/nfc.h>
-#include <linux/delay.h>
-#include <net/nfc/nci.h>
-#include <net/nfc/nci_core.h>
-
-#include "st21nfcb.h"
-#include "st21nfcb_se.h"
-
-struct st21nfcb_pipe_info {
-	u8 pipe_state;
-	u8 src_host_id;
-	u8 src_gate_id;
-	u8 dst_host_id;
-	u8 dst_gate_id;
-} __packed;
-
-/* Hosts */
-#define ST21NFCB_HOST_CONTROLLER_ID     0x00
-#define ST21NFCB_TERMINAL_HOST_ID       0x01
-#define ST21NFCB_UICC_HOST_ID           0x02
-#define ST21NFCB_ESE_HOST_ID            0xc0
-
-/* Gates */
-#define ST21NFCB_DEVICE_MGNT_GATE       0x01
-#define ST21NFCB_APDU_READER_GATE       0xf0
-#define ST21NFCB_CONNECTIVITY_GATE      0x41
-
-/* Pipes */
-#define ST21NFCB_DEVICE_MGNT_PIPE               0x02
-
-/* Connectivity pipe only */
-#define ST21NFCB_SE_COUNT_PIPE_UICC             0x01
-/* Connectivity + APDU Reader pipe */
-#define ST21NFCB_SE_COUNT_PIPE_EMBEDDED         0x02
-
-#define ST21NFCB_SE_TO_HOT_PLUG			1000 /* msecs */
-#define ST21NFCB_SE_TO_PIPES			2000
-
-#define ST21NFCB_EVT_HOT_PLUG_IS_INHIBITED(x)   (x->data[0] & 0x80)
-
-#define NCI_HCI_APDU_PARAM_ATR                     0x01
-#define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY       0x01
-#define NCI_HCI_ADMIN_PARAM_WHITELIST              0x03
-#define NCI_HCI_ADMIN_PARAM_HOST_LIST              0x04
-
-#define ST21NFCB_EVT_SE_HARD_RESET		0x20
-#define ST21NFCB_EVT_TRANSMIT_DATA		0x10
-#define ST21NFCB_EVT_WTX_REQUEST		0x11
-#define ST21NFCB_EVT_SE_SOFT_RESET		0x11
-#define ST21NFCB_EVT_SE_END_OF_APDU_TRANSFER	0x21
-#define ST21NFCB_EVT_HOT_PLUG			0x03
-
-#define ST21NFCB_SE_MODE_OFF                    0x00
-#define ST21NFCB_SE_MODE_ON                     0x01
-
-#define ST21NFCB_EVT_CONNECTIVITY       0x10
-#define ST21NFCB_EVT_TRANSACTION        0x12
-
-#define ST21NFCB_DM_GETINFO             0x13
-#define ST21NFCB_DM_GETINFO_PIPE_LIST   0x02
-#define ST21NFCB_DM_GETINFO_PIPE_INFO   0x01
-#define ST21NFCB_DM_PIPE_CREATED        0x02
-#define ST21NFCB_DM_PIPE_OPEN           0x04
-#define ST21NFCB_DM_RF_ACTIVE           0x80
-#define ST21NFCB_DM_DISCONNECT          0x30
-
-#define ST21NFCB_DM_IS_PIPE_OPEN(p) \
-	((p & 0x0f) == (ST21NFCB_DM_PIPE_CREATED | ST21NFCB_DM_PIPE_OPEN))
-
-#define ST21NFCB_ATR_DEFAULT_BWI        0x04
-
-/*
- * WT = 2^BWI/10[s], convert into msecs and add a secure
- * room by increasing by 2 this timeout
- */
-#define ST21NFCB_BWI_TO_TIMEOUT(x)      ((1 << x) * 200)
-#define ST21NFCB_ATR_GET_Y_FROM_TD(x)   (x >> 4)
-
-/* If TA is present bit 0 is set */
-#define ST21NFCB_ATR_TA_PRESENT(x) (x & 0x01)
-/* If TB is present bit 1 is set */
-#define ST21NFCB_ATR_TB_PRESENT(x) (x & 0x02)
-
-#define ST21NFCB_NUM_DEVICES           256
-
-static DECLARE_BITMAP(dev_mask, ST21NFCB_NUM_DEVICES);
-
-/* Here are the mandatory pipe for st21nfcb */
-static struct nci_hci_gate st21nfcb_gates[] = {
-	{NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PIPE,
-					ST21NFCB_HOST_CONTROLLER_ID},
-	{NCI_HCI_LINK_MGMT_GATE, NCI_HCI_LINK_MGMT_PIPE,
-					ST21NFCB_HOST_CONTROLLER_ID},
-	{ST21NFCB_DEVICE_MGNT_GATE, ST21NFCB_DEVICE_MGNT_PIPE,
-					ST21NFCB_HOST_CONTROLLER_ID},
-
-	/* Secure element pipes are created by secure element host */
-	{ST21NFCB_CONNECTIVITY_GATE, NCI_HCI_DO_NOT_OPEN_PIPE,
-					ST21NFCB_HOST_CONTROLLER_ID},
-	{ST21NFCB_APDU_READER_GATE, NCI_HCI_DO_NOT_OPEN_PIPE,
-					ST21NFCB_HOST_CONTROLLER_ID},
-};
-
-static u8 st21nfcb_se_get_bwi(struct nci_dev *ndev)
-{
-	int i;
-	u8 td;
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	/* Bits 8 to 5 of the first TB for T=1 encode BWI from zero to nine */
-	for (i = 1; i < ST21NFCB_ESE_MAX_LENGTH; i++) {
-		td = ST21NFCB_ATR_GET_Y_FROM_TD(info->se_info.atr[i]);
-		if (ST21NFCB_ATR_TA_PRESENT(td))
-			i++;
-		if (ST21NFCB_ATR_TB_PRESENT(td)) {
-			i++;
-			return info->se_info.atr[i] >> 4;
-		}
-	}
-	return ST21NFCB_ATR_DEFAULT_BWI;
-}
-
-static void st21nfcb_se_get_atr(struct nci_dev *ndev)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-	int r;
-	struct sk_buff *skb;
-
-	r = nci_hci_get_param(ndev, ST21NFCB_APDU_READER_GATE,
-				NCI_HCI_APDU_PARAM_ATR, &skb);
-	if (r < 0)
-		return;
-
-	if (skb->len <= ST21NFCB_ESE_MAX_LENGTH) {
-		memcpy(info->se_info.atr, skb->data, skb->len);
-
-		info->se_info.wt_timeout =
-			ST21NFCB_BWI_TO_TIMEOUT(st21nfcb_se_get_bwi(ndev));
-	}
-	kfree_skb(skb);
-}
-
-int st21nfcb_hci_load_session(struct nci_dev *ndev)
-{
-	int i, j, r;
-	struct sk_buff *skb_pipe_list, *skb_pipe_info;
-	struct st21nfcb_pipe_info *dm_pipe_info;
-	u8 pipe_list[] = { ST21NFCB_DM_GETINFO_PIPE_LIST,
-			ST21NFCB_TERMINAL_HOST_ID};
-	u8 pipe_info[] = { ST21NFCB_DM_GETINFO_PIPE_INFO,
-			ST21NFCB_TERMINAL_HOST_ID, 0};
-
-	/* On ST21NFCB device pipes number are dynamics
-	 * If pipes are already created, hci_dev_up will fail.
-	 * Doing a clear all pipe is a bad idea because:
-	 * - It does useless EEPROM cycling
-	 * - It might cause issue for secure elements support
-	 * (such as removing connectivity or APDU reader pipe)
-	 * A better approach on ST21NFCB is to:
-	 * - get a pipe list for each host.
-	 * (eg: ST21NFCB_HOST_CONTROLLER_ID for now).
-	 * (TODO Later on UICC HOST and eSE HOST)
-	 * - get pipe information
-	 * - match retrieved pipe list in st21nfcb_gates
-	 * ST21NFCB_DEVICE_MGNT_GATE is a proprietary gate
-	 * with ST21NFCB_DEVICE_MGNT_PIPE.
-	 * Pipe can be closed and need to be open.
-	 */
-	r = nci_hci_connect_gate(ndev, ST21NFCB_HOST_CONTROLLER_ID,
-				ST21NFCB_DEVICE_MGNT_GATE,
-				ST21NFCB_DEVICE_MGNT_PIPE);
-	if (r < 0)
-		goto free_info;
-
-	/* Get pipe list */
-	r = nci_hci_send_cmd(ndev, ST21NFCB_DEVICE_MGNT_GATE,
-			ST21NFCB_DM_GETINFO, pipe_list, sizeof(pipe_list),
-			&skb_pipe_list);
-	if (r < 0)
-		goto free_info;
-
-	/* Complete the existing gate_pipe table */
-	for (i = 0; i < skb_pipe_list->len; i++) {
-		pipe_info[2] = skb_pipe_list->data[i];
-		r = nci_hci_send_cmd(ndev, ST21NFCB_DEVICE_MGNT_GATE,
-					ST21NFCB_DM_GETINFO, pipe_info,
-					sizeof(pipe_info), &skb_pipe_info);
-
-		if (r)
-			continue;
-
-		/*
-		 * Match pipe ID and gate ID
-		 * Output format from ST21NFC_DM_GETINFO is:
-		 * - pipe state (1byte)
-		 * - source hid (1byte)
-		 * - source gid (1byte)
-		 * - destination hid (1byte)
-		 * - destination gid (1byte)
-		 */
-		dm_pipe_info = (struct st21nfcb_pipe_info *)skb_pipe_info->data;
-		if (dm_pipe_info->dst_gate_id == ST21NFCB_APDU_READER_GATE &&
-		    dm_pipe_info->src_host_id != ST21NFCB_ESE_HOST_ID) {
-			pr_err("Unexpected apdu_reader pipe on host %x\n",
-			       dm_pipe_info->src_host_id);
-			continue;
-		}
-
-		for (j = 0; (j < ARRAY_SIZE(st21nfcb_gates)) &&
-		     (st21nfcb_gates[j].gate != dm_pipe_info->dst_gate_id); j++)
-			;
-
-		if (j < ARRAY_SIZE(st21nfcb_gates) &&
-		    st21nfcb_gates[j].gate == dm_pipe_info->dst_gate_id &&
-		    ST21NFCB_DM_IS_PIPE_OPEN(dm_pipe_info->pipe_state)) {
-			st21nfcb_gates[j].pipe = pipe_info[2];
-
-			ndev->hci_dev->gate2pipe[st21nfcb_gates[j].gate] =
-						st21nfcb_gates[j].pipe;
-			ndev->hci_dev->pipes[st21nfcb_gates[j].pipe].gate =
-						st21nfcb_gates[j].gate;
-			ndev->hci_dev->pipes[st21nfcb_gates[j].pipe].host =
-						dm_pipe_info->src_host_id;
-		}
-	}
-
-	memcpy(ndev->hci_dev->init_data.gates, st21nfcb_gates,
-	       sizeof(st21nfcb_gates));
-
-free_info:
-	kfree_skb(skb_pipe_info);
-	kfree_skb(skb_pipe_list);
-	return r;
-}
-EXPORT_SYMBOL_GPL(st21nfcb_hci_load_session);
-
-static void st21nfcb_hci_admin_event_received(struct nci_dev *ndev,
-					      u8 event, struct sk_buff *skb)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	switch (event) {
-	case ST21NFCB_EVT_HOT_PLUG:
-		if (info->se_info.se_active) {
-			if (!ST21NFCB_EVT_HOT_PLUG_IS_INHIBITED(skb)) {
-				del_timer_sync(&info->se_info.se_active_timer);
-				info->se_info.se_active = false;
-				complete(&info->se_info.req_completion);
-			} else {
-				mod_timer(&info->se_info.se_active_timer,
-				      jiffies +
-				      msecs_to_jiffies(ST21NFCB_SE_TO_PIPES));
-			}
-		}
-	break;
-	}
-}
-
-static int st21nfcb_hci_apdu_reader_event_received(struct nci_dev *ndev,
-						   u8 event,
-						   struct sk_buff *skb)
-{
-	int r = 0;
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	pr_debug("apdu reader gate event: %x\n", event);
-
-	switch (event) {
-	case ST21NFCB_EVT_TRANSMIT_DATA:
-		del_timer_sync(&info->se_info.bwi_timer);
-		info->se_info.bwi_active = false;
-		info->se_info.cb(info->se_info.cb_context,
-				 skb->data, skb->len, 0);
-	break;
-	case ST21NFCB_EVT_WTX_REQUEST:
-		mod_timer(&info->se_info.bwi_timer, jiffies +
-			  msecs_to_jiffies(info->se_info.wt_timeout));
-	break;
-	}
-
-	kfree_skb(skb);
-	return r;
-}
-
-/*
- * Returns:
- * <= 0: driver handled the event, skb consumed
- *    1: driver does not handle the event, please do standard processing
- */
-static int st21nfcb_hci_connectivity_event_received(struct nci_dev *ndev,
-						u8 host, u8 event,
-						struct sk_buff *skb)
-{
-	int r = 0;
-	struct device *dev = &ndev->nfc_dev->dev;
-	struct nfc_evt_transaction *transaction;
-
-	pr_debug("connectivity gate event: %x\n", event);
-
-	switch (event) {
-	case ST21NFCB_EVT_CONNECTIVITY:
-
-	break;
-	case ST21NFCB_EVT_TRANSACTION:
-		/* According to specification etsi 102 622
-		 * 11.2.2.4 EVT_TRANSACTION Table 52
-		 * Description  Tag     Length
-		 * AID          81      5 to 16
-		 * PARAMETERS   82      0 to 255
-		 */
-		if (skb->len < NFC_MIN_AID_LENGTH + 2 &&
-		    skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG)
-			return -EPROTO;
-
-		transaction = (struct nfc_evt_transaction *)devm_kzalloc(dev,
-					    skb->len - 2, GFP_KERNEL);
-
-		transaction->aid_len = skb->data[1];
-		memcpy(transaction->aid, &skb->data[2], transaction->aid_len);
-
-		/* Check next byte is PARAMETERS tag (82) */
-		if (skb->data[transaction->aid_len + 2] !=
-		    NFC_EVT_TRANSACTION_PARAMS_TAG)
-			return -EPROTO;
-
-		transaction->params_len = skb->data[transaction->aid_len + 3];
-		memcpy(transaction->params, skb->data +
-		       transaction->aid_len + 4, transaction->params_len);
-
-		r = nfc_se_transaction(ndev->nfc_dev, host, transaction);
-		break;
-	default:
-		return 1;
-	}
-	kfree_skb(skb);
-	return r;
-}
-
-void st21nfcb_hci_event_received(struct nci_dev *ndev, u8 pipe,
-				 u8 event, struct sk_buff *skb)
-{
-	u8 gate = ndev->hci_dev->pipes[pipe].gate;
-	u8 host = ndev->hci_dev->pipes[pipe].host;
-
-	switch (gate) {
-	case NCI_HCI_ADMIN_GATE:
-		st21nfcb_hci_admin_event_received(ndev, event, skb);
-	break;
-	case ST21NFCB_APDU_READER_GATE:
-		st21nfcb_hci_apdu_reader_event_received(ndev, event, skb);
-	break;
-	case ST21NFCB_CONNECTIVITY_GATE:
-		st21nfcb_hci_connectivity_event_received(ndev, host, event,
-							 skb);
-	break;
-	}
-}
-EXPORT_SYMBOL_GPL(st21nfcb_hci_event_received);
-
-
-void st21nfcb_hci_cmd_received(struct nci_dev *ndev, u8 pipe, u8 cmd,
-			       struct sk_buff *skb)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-	u8 gate = ndev->hci_dev->pipes[pipe].gate;
-
-	pr_debug("cmd: %x\n", cmd);
-
-	switch (cmd) {
-	case NCI_HCI_ANY_OPEN_PIPE:
-		if (gate != ST21NFCB_APDU_READER_GATE &&
-		    ndev->hci_dev->pipes[pipe].host != ST21NFCB_UICC_HOST_ID)
-			ndev->hci_dev->count_pipes++;
-
-		if (ndev->hci_dev->count_pipes ==
-		    ndev->hci_dev->expected_pipes) {
-			del_timer_sync(&info->se_info.se_active_timer);
-			info->se_info.se_active = false;
-			ndev->hci_dev->count_pipes = 0;
-			complete(&info->se_info.req_completion);
-		}
-	break;
-	}
-}
-EXPORT_SYMBOL_GPL(st21nfcb_hci_cmd_received);
-
-/*
- * Remarks: On some early st21nfcb firmware, nci_nfcee_mode_set(0)
- * is rejected
- */
-static int st21nfcb_nci_control_se(struct nci_dev *ndev, u8 se_idx,
-				   u8 state)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-	int r;
-	struct sk_buff *sk_host_list;
-	u8 host_id;
-
-	switch (se_idx) {
-	case ST21NFCB_UICC_HOST_ID:
-		ndev->hci_dev->count_pipes = 0;
-		ndev->hci_dev->expected_pipes = ST21NFCB_SE_COUNT_PIPE_UICC;
-		break;
-	case ST21NFCB_ESE_HOST_ID:
-		ndev->hci_dev->count_pipes = 0;
-		ndev->hci_dev->expected_pipes = ST21NFCB_SE_COUNT_PIPE_EMBEDDED;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	/*
-	 * Wait for an EVT_HOT_PLUG in order to
-	 * retrieve a relevant host list.
-	 */
-	reinit_completion(&info->se_info.req_completion);
-	r = nci_nfcee_mode_set(ndev, se_idx, NCI_NFCEE_ENABLE);
-	if (r != NCI_STATUS_OK)
-		return r;
-
-	mod_timer(&info->se_info.se_active_timer, jiffies +
-		msecs_to_jiffies(ST21NFCB_SE_TO_HOT_PLUG));
-	info->se_info.se_active = true;
-
-	/* Ignore return value and check in any case the host_list */
-	wait_for_completion_interruptible(&info->se_info.req_completion);
-
-	/* There might be some "collision" after receiving a HOT_PLUG event
-	 * This may cause the CLF to not answer to the next hci command.
-	 * There is no possible synchronization to prevent this.
-	 * Adding a small delay is the only way to solve the issue.
-	 */
-	usleep_range(3000, 5000);
-
-	r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE,
-			NCI_HCI_ADMIN_PARAM_HOST_LIST, &sk_host_list);
-	if (r != NCI_HCI_ANY_OK)
-		return r;
-
-	host_id = sk_host_list->data[sk_host_list->len - 1];
-	kfree_skb(sk_host_list);
-	if (state == ST21NFCB_SE_MODE_ON && host_id == se_idx)
-		return se_idx;
-	else if (state == ST21NFCB_SE_MODE_OFF && host_id != se_idx)
-		return se_idx;
-
-	return -1;
-}
-
-int st21nfcb_nci_disable_se(struct nci_dev *ndev, u32 se_idx)
-{
-	int r;
-
-	pr_debug("st21nfcb_nci_disable_se\n");
-
-	if (se_idx == NFC_SE_EMBEDDED) {
-		r = nci_hci_send_event(ndev, ST21NFCB_APDU_READER_GATE,
-				ST21NFCB_EVT_SE_END_OF_APDU_TRANSFER, NULL, 0);
-		if (r < 0)
-			return r;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(st21nfcb_nci_disable_se);
-
-int st21nfcb_nci_enable_se(struct nci_dev *ndev, u32 se_idx)
-{
-	int r;
-
-	pr_debug("st21nfcb_nci_enable_se\n");
-
-	if (se_idx == ST21NFCB_HCI_HOST_ID_ESE) {
-		r = nci_hci_send_event(ndev, ST21NFCB_APDU_READER_GATE,
-				ST21NFCB_EVT_SE_SOFT_RESET, NULL, 0);
-		if (r < 0)
-			return r;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(st21nfcb_nci_enable_se);
-
-static int st21nfcb_hci_network_init(struct nci_dev *ndev)
-{
-	struct core_conn_create_dest_spec_params *dest_params;
-	struct dest_spec_params spec_params;
-	struct nci_conn_info    *conn_info;
-	int r, dev_num;
-
-	dest_params =
-		kzalloc(sizeof(struct core_conn_create_dest_spec_params) +
-			sizeof(struct dest_spec_params), GFP_KERNEL);
-	if (dest_params == NULL) {
-		r = -ENOMEM;
-		goto exit;
-	}
-
-	dest_params->type = NCI_DESTINATION_SPECIFIC_PARAM_NFCEE_TYPE;
-	dest_params->length = sizeof(struct dest_spec_params);
-	spec_params.id = ndev->hci_dev->nfcee_id;
-	spec_params.protocol = NCI_NFCEE_INTERFACE_HCI_ACCESS;
-	memcpy(dest_params->value, &spec_params, sizeof(struct dest_spec_params));
-	r = nci_core_conn_create(ndev, NCI_DESTINATION_NFCEE, 1,
-				 sizeof(struct core_conn_create_dest_spec_params) +
-				 sizeof(struct dest_spec_params),
-				 dest_params);
-	if (r != NCI_STATUS_OK)
-		goto free_dest_params;
-
-	conn_info = ndev->hci_dev->conn_info;
-	if (!conn_info)
-		goto free_dest_params;
-
-	memcpy(ndev->hci_dev->init_data.gates, st21nfcb_gates,
-	       sizeof(st21nfcb_gates));
-
-	/*
-	 * Session id must include the driver name + i2c bus addr
-	 * persistent info to discriminate 2 identical chips
-	 */
-	dev_num = find_first_zero_bit(dev_mask, ST21NFCB_NUM_DEVICES);
-	if (dev_num >= ST21NFCB_NUM_DEVICES) {
-		r = -ENODEV;
-		goto free_dest_params;
-	}
-
-	scnprintf(ndev->hci_dev->init_data.session_id,
-		  sizeof(ndev->hci_dev->init_data.session_id),
-		  "%s%2x", "ST21BH", dev_num);
-
-	r = nci_hci_dev_session_init(ndev);
-	if (r != NCI_HCI_ANY_OK)
-		goto free_dest_params;
-
-	r = nci_nfcee_mode_set(ndev, ndev->hci_dev->conn_info->id,
-			       NCI_NFCEE_ENABLE);
-	if (r != NCI_STATUS_OK)
-		goto free_dest_params;
-
-free_dest_params:
-	kfree(dest_params);
-
-exit:
-	return r;
-}
-
-int st21nfcb_nci_discover_se(struct nci_dev *ndev)
-{
-	u8 param[2];
-	int r;
-	int se_count = 0;
-
-	pr_debug("st21nfcb_nci_discover_se\n");
-
-	r = st21nfcb_hci_network_init(ndev);
-	if (r != 0)
-		return r;
-
-	param[0] = ST21NFCB_UICC_HOST_ID;
-	param[1] = ST21NFCB_HCI_HOST_ID_ESE;
-	r = nci_hci_set_param(ndev, NCI_HCI_ADMIN_GATE,
-				NCI_HCI_ADMIN_PARAM_WHITELIST,
-				param, sizeof(param));
-	if (r != NCI_HCI_ANY_OK)
-		return r;
-
-	r = st21nfcb_nci_control_se(ndev, ST21NFCB_UICC_HOST_ID,
-				ST21NFCB_SE_MODE_ON);
-	if (r == ST21NFCB_UICC_HOST_ID) {
-		nfc_add_se(ndev->nfc_dev, ST21NFCB_UICC_HOST_ID, NFC_SE_UICC);
-		se_count++;
-	}
-
-	/* Try to enable eSE in order to check availability */
-	r = st21nfcb_nci_control_se(ndev, ST21NFCB_HCI_HOST_ID_ESE,
-				ST21NFCB_SE_MODE_ON);
-	if (r == ST21NFCB_HCI_HOST_ID_ESE) {
-		nfc_add_se(ndev->nfc_dev, ST21NFCB_HCI_HOST_ID_ESE,
-			   NFC_SE_EMBEDDED);
-		se_count++;
-		st21nfcb_se_get_atr(ndev);
-	}
-
-	return !se_count;
-}
-EXPORT_SYMBOL_GPL(st21nfcb_nci_discover_se);
-
-int st21nfcb_nci_se_io(struct nci_dev *ndev, u32 se_idx,
-		       u8 *apdu, size_t apdu_length,
-		       se_io_cb_t cb, void *cb_context)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	pr_debug("\n");
-
-	switch (se_idx) {
-	case ST21NFCB_HCI_HOST_ID_ESE:
-		info->se_info.cb = cb;
-		info->se_info.cb_context = cb_context;
-		mod_timer(&info->se_info.bwi_timer, jiffies +
-			  msecs_to_jiffies(info->se_info.wt_timeout));
-		info->se_info.bwi_active = true;
-		return nci_hci_send_event(ndev, ST21NFCB_APDU_READER_GATE,
-					ST21NFCB_EVT_TRANSMIT_DATA, apdu,
-					apdu_length);
-	default:
-		return -ENODEV;
-	}
-}
-EXPORT_SYMBOL(st21nfcb_nci_se_io);
-
-static void st21nfcb_se_wt_timeout(unsigned long data)
-{
-	/*
-	 * No answer from the secure element
-	 * within the defined timeout.
-	 * Let's send a reset request as recovery procedure.
-	 * According to the situation, we first try to send a software reset
-	 * to the secure element. If the next command is still not
-	 * answering in time, we send to the CLF a secure element hardware
-	 * reset request.
-	 */
-	/* hardware reset managed through VCC_UICC_OUT power supply */
-	u8 param = 0x01;
-	struct st21nfcb_nci_info *info = (struct st21nfcb_nci_info *) data;
-
-	pr_debug("\n");
-
-	info->se_info.bwi_active = false;
-
-	if (!info->se_info.xch_error) {
-		info->se_info.xch_error = true;
-		nci_hci_send_event(info->ndlc->ndev, ST21NFCB_APDU_READER_GATE,
-				ST21NFCB_EVT_SE_SOFT_RESET, NULL, 0);
-	} else {
-		info->se_info.xch_error = false;
-		nci_hci_send_event(info->ndlc->ndev, ST21NFCB_DEVICE_MGNT_GATE,
-				ST21NFCB_EVT_SE_HARD_RESET, &param, 1);
-	}
-	info->se_info.cb(info->se_info.cb_context, NULL, 0, -ETIME);
-}
-
-static void st21nfcb_se_activation_timeout(unsigned long data)
-{
-	struct st21nfcb_nci_info *info = (struct st21nfcb_nci_info *) data;
-
-	pr_debug("\n");
-
-	info->se_info.se_active = false;
-
-	complete(&info->se_info.req_completion);
-}
-
-int st21nfcb_se_init(struct nci_dev *ndev)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	init_completion(&info->se_info.req_completion);
-	/* initialize timers */
-	init_timer(&info->se_info.bwi_timer);
-	info->se_info.bwi_timer.data = (unsigned long)info;
-	info->se_info.bwi_timer.function = st21nfcb_se_wt_timeout;
-	info->se_info.bwi_active = false;
-
-	init_timer(&info->se_info.se_active_timer);
-	info->se_info.se_active_timer.data = (unsigned long)info;
-	info->se_info.se_active_timer.function =
-			st21nfcb_se_activation_timeout;
-	info->se_info.se_active = false;
-
-	info->se_info.xch_error = false;
-
-	info->se_info.wt_timeout =
-		ST21NFCB_BWI_TO_TIMEOUT(ST21NFCB_ATR_DEFAULT_BWI);
-
-	return 0;
-}
-EXPORT_SYMBOL(st21nfcb_se_init);
-
-void st21nfcb_se_deinit(struct nci_dev *ndev)
-{
-	struct st21nfcb_nci_info *info = nci_get_drvdata(ndev);
-
-	if (info->se_info.bwi_active)
-		del_timer_sync(&info->se_info.bwi_timer);
-	if (info->se_info.se_active)
-		del_timer_sync(&info->se_info.se_active_timer);
-
-	info->se_info.se_active = false;
-	info->se_info.bwi_active = false;
-}
-EXPORT_SYMBOL(st21nfcb_se_deinit);
-
diff --git a/drivers/nfc/st21nfcb/st21nfcb_se.h b/drivers/nfc/st21nfcb/st21nfcb_se.h
deleted file mode 100644
index 52a323872bea..000000000000
--- a/drivers/nfc/st21nfcb/st21nfcb_se.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * NCI based Driver for STMicroelectronics NFC Chip
- *
- * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __LOCAL_ST21NFCB_SE_H_
-#define __LOCAL_ST21NFCB_SE_H_
-
-/*
- * ref ISO7816-3 chap 8.1. the initial character TS is followed by a
- * sequence of at most 32 characters.
- */
-#define ST21NFCB_ESE_MAX_LENGTH		33
-#define ST21NFCB_HCI_HOST_ID_ESE	0xc0
-
-struct st21nfcb_se_info {
-	u8 atr[ST21NFCB_ESE_MAX_LENGTH];
-	struct completion req_completion;
-
-	struct timer_list bwi_timer;
-	int wt_timeout; /* in msecs */
-	bool bwi_active;
-
-	struct timer_list se_active_timer;
-	bool se_active;
-
-	bool xch_error;
-
-	se_io_cb_t cb;
-	void *cb_context;
-};
-
-int st21nfcb_se_init(struct nci_dev *ndev);
-void st21nfcb_se_deinit(struct nci_dev *ndev);
-
-int st21nfcb_nci_discover_se(struct nci_dev *ndev);
-int st21nfcb_nci_enable_se(struct nci_dev *ndev, u32 se_idx);
-int st21nfcb_nci_disable_se(struct nci_dev *ndev, u32 se_idx);
-int st21nfcb_nci_se_io(struct nci_dev *ndev, u32 se_idx,
-					u8 *apdu, size_t apdu_length,
-					se_io_cb_t cb, void *cb_context);
-int st21nfcb_hci_load_session(struct nci_dev *ndev);
-void st21nfcb_hci_event_received(struct nci_dev *ndev, u8 pipe,
-				 u8 event, struct sk_buff *skb);
-void st21nfcb_hci_cmd_received(struct nci_dev *ndev, u8 pipe, u8 cmd,
-			       struct sk_buff *skb);
-
-
-#endif /* __LOCAL_ST21NFCB_NCI_H_ */
diff --git a/include/linux/platform_data/st-nci.h b/include/linux/platform_data/st-nci.h
new file mode 100644
index 000000000000..d9d400a297bd
--- /dev/null
+++ b/include/linux/platform_data/st-nci.h
@@ -0,0 +1,29 @@
+/*
+ * Driver include for ST NCI NFC chip family.
+ *
+ * Copyright (C) 2014-2015  STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ST_NCI_H_
+#define _ST_NCI_H_
+
+#define ST_NCI_DRIVER_NAME "st_nci"
+
+struct st_nci_nfc_platform_data {
+	unsigned int gpio_reset;
+	unsigned int irq_polarity;
+};
+
+#endif /* _ST_NCI_H_ */
diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h
deleted file mode 100644
index b023373d9874..000000000000
--- a/include/linux/platform_data/st21nfcb.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Driver include for the ST21NFCB NFC chip.
- *
- * Copyright (C) 2014  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _ST21NFCB_NCI_H_
-#define _ST21NFCB_NCI_H_
-
-#define ST21NFCB_NCI_DRIVER_NAME "st21nfcb_nci"
-
-struct st21nfcb_nfc_platform_data {
-	unsigned int gpio_reset;
-	unsigned int irq_polarity;
-};
-
-#endif /* _ST21NFCB_NCI_H_ */
diff --git a/include/linux/platform_data/st_nci.h b/include/linux/platform_data/st_nci.h
new file mode 100644
index 000000000000..d9d400a297bd
--- /dev/null
+++ b/include/linux/platform_data/st_nci.h
@@ -0,0 +1,29 @@
+/*
+ * Driver include for ST NCI NFC chip family.
+ *
+ * Copyright (C) 2014-2015  STMicroelectronics SAS. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ST_NCI_H_
+#define _ST_NCI_H_
+
+#define ST_NCI_DRIVER_NAME "st_nci"
+
+struct st_nci_nfc_platform_data {
+	unsigned int gpio_reset;
+	unsigned int irq_polarity;
+};
+
+#endif /* _ST_NCI_H_ */
-- 
cgit v1.2.3


From 205a525c334295e3cd4cc7755fd2c0398e3a787f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 9 Jun 2015 18:19:39 +0800
Subject: random: Add callback API for random pool readiness

The get_blocking_random_bytes API is broken because the wait can
be arbitrarily long (potentially forever) so there is no safe way
of calling it from within the kernel.

This patch replaces it with a callback API instead.  The callback
is invoked potentially from interrupt context so the user needs
to schedule their own work thread if necessary.

In addition to adding callbacks, they can also be removed as
otherwise this opens up a way for user-space to allocate kernel
memory with no bound (by opening algif_rng descriptors and then
closing them).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/random.c  | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/random.h |  9 ++++++
 2 files changed, 87 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 159d0700f7d8..a1576ed1d88e 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -409,6 +409,9 @@ static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
 static DECLARE_WAIT_QUEUE_HEAD(urandom_init_wait);
 static struct fasync_struct *fasync;
 
+static DEFINE_SPINLOCK(random_ready_list_lock);
+static LIST_HEAD(random_ready_list);
+
 /**********************************************************************
  *
  * OS independent entropy store.   Here are the functions which handle
@@ -589,6 +592,22 @@ static void fast_mix(struct fast_pool *f)
 	f->count++;
 }
 
+static void process_random_ready_list(void)
+{
+	unsigned long flags;
+	struct random_ready_callback *rdy, *tmp;
+
+	spin_lock_irqsave(&random_ready_list_lock, flags);
+	list_for_each_entry_safe(rdy, tmp, &random_ready_list, list) {
+		struct module *owner = rdy->owner;
+
+		list_del_init(&rdy->list);
+		rdy->func(rdy);
+		module_put(owner);
+	}
+	spin_unlock_irqrestore(&random_ready_list_lock, flags);
+}
+
 /*
  * Credit (or debit) the entropy store with n bits of entropy.
  * Use credit_entropy_bits_safe() if the value comes from userspace
@@ -660,6 +679,7 @@ retry:
 		r->entropy_total = 0;
 		if (r == &nonblocking_pool) {
 			prandom_reseed_late();
+			process_random_ready_list();
 			wake_up_all(&urandom_init_wait);
 			pr_notice("random: %s pool is initialized\n", r->name);
 		}
@@ -1256,6 +1276,64 @@ void get_blocking_random_bytes(void *buf, int nbytes)
 }
 EXPORT_SYMBOL(get_blocking_random_bytes);
 
+/*
+ * Add a callback function that will be invoked when the nonblocking
+ * pool is initialised.
+ *
+ * returns: 0 if callback is successfully added
+ *	    -EALREADY if pool is already initialised (callback not called)
+ *	    -ENOENT if module for callback is not alive
+ */
+int add_random_ready_callback(struct random_ready_callback *rdy)
+{
+	struct module *owner;
+	unsigned long flags;
+	int err = -EALREADY;
+
+	if (likely(nonblocking_pool.initialized))
+		return err;
+
+	owner = rdy->owner;
+	if (!try_module_get(owner))
+		return -ENOENT;
+
+	spin_lock_irqsave(&random_ready_list_lock, flags);
+	if (nonblocking_pool.initialized)
+		goto out;
+
+	owner = NULL;
+
+	list_add(&rdy->list, &random_ready_list);
+	err = 0;
+
+out:
+	spin_unlock_irqrestore(&random_ready_list_lock, flags);
+
+	module_put(owner);
+
+	return err;
+}
+EXPORT_SYMBOL(add_random_ready_callback);
+
+/*
+ * Delete a previously registered readiness callback function.
+ */
+void del_random_ready_callback(struct random_ready_callback *rdy)
+{
+	unsigned long flags;
+	struct module *owner = NULL;
+
+	spin_lock_irqsave(&random_ready_list_lock, flags);
+	if (!list_empty(&rdy->list)) {
+		list_del_init(&rdy->list);
+		owner = rdy->owner;
+	}
+	spin_unlock_irqrestore(&random_ready_list_lock, flags);
+
+	module_put(owner);
+}
+EXPORT_SYMBOL(del_random_ready_callback);
+
 /*
  * This function will use the architecture-specific hardware random
  * number generator if it is available.  The arch-specific hw RNG will
diff --git a/include/linux/random.h b/include/linux/random.h
index 796267d56901..30e2aca0b16a 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -6,8 +6,15 @@
 #ifndef _LINUX_RANDOM_H
 #define _LINUX_RANDOM_H
 
+#include <linux/list.h>
 #include <uapi/linux/random.h>
 
+struct random_ready_callback {
+	struct list_head list;
+	void (*func)(struct random_ready_callback *rdy);
+	struct module *owner;
+};
+
 extern void add_device_randomness(const void *, unsigned int);
 extern void add_input_randomness(unsigned int type, unsigned int code,
 				 unsigned int value);
@@ -15,6 +22,8 @@ extern void add_interrupt_randomness(int irq, int irq_flags);
 
 extern void get_random_bytes(void *buf, int nbytes);
 extern void get_blocking_random_bytes(void *buf, int nbytes);
+extern int add_random_ready_callback(struct random_ready_callback *rdy);
+extern void del_random_ready_callback(struct random_ready_callback *rdy);
 extern void get_random_bytes_arch(void *buf, int nbytes);
 void generate_random_uuid(unsigned char uuid_out[16]);
 extern int random_int_secret_init(void);
-- 
cgit v1.2.3


From c2719503f5e1e6213d716bb078bdad01e28ebcbf Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 9 Jun 2015 18:19:42 +0800
Subject: random: Remove kernel blocking API

This patch removes the kernel blocking API as it has been completely
replaced by the callback API.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/char/random.c  | 12 ------------
 include/linux/random.h |  1 -
 2 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index a1576ed1d88e..d0da5d852d41 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1264,18 +1264,6 @@ void get_random_bytes(void *buf, int nbytes)
 }
 EXPORT_SYMBOL(get_random_bytes);
 
-/*
- * Equivalent function to get_random_bytes with the difference that this
- * function blocks the request until the nonblocking_pool is initialized.
- */
-void get_blocking_random_bytes(void *buf, int nbytes)
-{
-	if (unlikely(nonblocking_pool.initialized == 0))
-		wait_event(urandom_init_wait, nonblocking_pool.initialized);
-	extract_entropy(&nonblocking_pool, buf, nbytes, 0, 0);
-}
-EXPORT_SYMBOL(get_blocking_random_bytes);
-
 /*
  * Add a callback function that will be invoked when the nonblocking
  * pool is initialised.
diff --git a/include/linux/random.h b/include/linux/random.h
index 30e2aca0b16a..e651874df2c9 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -21,7 +21,6 @@ extern void add_input_randomness(unsigned int type, unsigned int code,
 extern void add_interrupt_randomness(int irq, int irq_flags);
 
 extern void get_random_bytes(void *buf, int nbytes);
-extern void get_blocking_random_bytes(void *buf, int nbytes);
 extern int add_random_ready_callback(struct random_ready_callback *rdy);
 extern void del_random_ready_callback(struct random_ready_callback *rdy);
 extern void get_random_bytes_arch(void *buf, int nbytes);
-- 
cgit v1.2.3


From 0bb979472a7401022109e81dd89d777adea58710 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 10 Jun 2015 08:01:20 -0600
Subject: cfq-iosched: fix the setting of IOPS mode on SSDs

A previous commit wanted to make CFQ default to IOPS mode on
non-rotational storage, however it did so when the queue was
initialized and the non-rotational flag is only set later on
in the probe.

Add an elevator hook that gets called off the add_disk() path,
at that point we know that feature probing has finished, and
we can reliably check for the various flags that drivers can
set.

Fixes: 41c0126b ("block: Make CFQ default to IOPS mode on SSDs")
Tested-by: Romain Francoise <romain@orebokech.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c      | 15 ++++++++++++++-
 block/elevator.c         |  2 ++
 include/linux/elevator.h |  2 ++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c808ad87652d..d1d0cb235cd2 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4508,7 +4508,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_target_latency = cfq_target_latency;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
-	cfqd->cfq_slice_idle = blk_queue_nonrot(q) ? 0 : cfq_slice_idle;
+	cfqd->cfq_slice_idle = cfq_slice_idle;
 	cfqd->cfq_group_idle = cfq_group_idle;
 	cfqd->cfq_latency = 1;
 	cfqd->hw_tag = -1;
@@ -4525,6 +4525,18 @@ out_free:
 	return ret;
 }
 
+static void cfq_registered_queue(struct request_queue *q)
+{
+	struct elevator_queue *e = q->elevator;
+	struct cfq_data *cfqd = e->elevator_data;
+
+	/*
+	 * Default to IOPS mode with no idling for SSDs
+	 */
+	if (blk_queue_nonrot(q))
+		cfqd->cfq_slice_idle = 0;
+}
+
 /*
  * sysfs parts below -->
  */
@@ -4640,6 +4652,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
+		.elevator_registered_fn =	cfq_registered_queue,
 	},
 	.icq_size	=	sizeof(struct cfq_io_cq),
 	.icq_align	=	__alignof__(struct cfq_io_cq),
diff --git a/block/elevator.c b/block/elevator.c
index 59794d0d38e3..5f0452734a40 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -810,6 +810,8 @@ int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
+		if (e->type->ops.elevator_registered_fn)
+			e->type->ops.elevator_registered_fn(q);
 	}
 	return error;
 }
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 45a91474487d..638b324f0291 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -39,6 +39,7 @@ typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct reques
 typedef int (elevator_init_fn) (struct request_queue *,
 				struct elevator_type *e);
 typedef void (elevator_exit_fn) (struct elevator_queue *);
+typedef void (elevator_registered_fn) (struct request_queue *);
 
 struct elevator_ops
 {
@@ -68,6 +69,7 @@ struct elevator_ops
 
 	elevator_init_fn *elevator_init_fn;
 	elevator_exit_fn *elevator_exit_fn;
+	elevator_registered_fn *elevator_registered_fn;
 };
 
 #define ELV_NAME_MAX	(16)
-- 
cgit v1.2.3


From 5c6e3a97e969e978368df83239583771c936efea Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Mon, 8 Jun 2015 10:09:48 +0900
Subject: power_supply: sysfs: Bring back write to writeable properties

The fix for NULL pointer exception related to calling uevent for not
finished probe caused to set all writeable properties as non-writeable.
This was caused by checking if property is writeable before the initial
increase of power supply usage counter and in the same time using
wrapper over property_is_writeable(). The wrapper returns ENODEV if the
usage counter is still 0.

The call trace looked like:
  device probe:
    power_supply_register()
      use_cnt = 0;
      device_add()
        create sysfs entries
          power_supply_attr_is_visible()
            power_supply_property_is_writeable()
              if (use_cnt == 0) return -ENODEV;
      use_cnt++;

Replace the usage of wrapper with direct call to property_is_writeable()
from driver. This should be safe call during device probe because
implementations of this callback just return 0/1 for different
properties and they do not access any of the driver's internal data.

Fixes: 8e59c7f23410 ("power_supply: Fix NULL pointer dereference during bq27x00_battery probe")
Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/power_supply_sysfs.c | 2 +-
 include/linux/power_supply.h       | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index af026806cba5..ed2d7fd0c734 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -223,7 +223,7 @@ static umode_t power_supply_attr_is_visible(struct kobject *kobj,
 
 		if (property == attrno) {
 			if (psy->desc->property_is_writeable &&
-			    power_supply_property_is_writeable(psy, property) > 0)
+			    psy->desc->property_is_writeable(psy, property) > 0)
 				mode |= S_IWUSR;
 
 			return mode;
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index a80f1fd01ddb..0395bcb18ddb 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -206,6 +206,11 @@ struct power_supply_desc {
 	int (*set_property)(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    const union power_supply_propval *val);
+	/*
+	 * property_is_writeable() will be called during registration
+	 * of power supply. If this happens during device probe then it must
+	 * not access internal data of device (because probe did not end).
+	 */
 	int (*property_is_writeable)(struct power_supply *psy,
 				     enum power_supply_property psp);
 	void (*external_power_changed)(struct power_supply *psy);
-- 
cgit v1.2.3


From fe27e1dfe9962b07215ee01445926306ddbb7c25 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 9 Jun 2015 23:37:56 +0200
Subject: power: Add devm_power_supply_get_by_phandle() helper function

This commit adds a resource-managed version of the
power_supply_get_by_phandle() function.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/power_supply_core.c | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/power_supply.h      |  5 +++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c
index 4bc0c7f459a5..a44d4554af4f 100644
--- a/drivers/power/power_supply_core.c
+++ b/drivers/power/power_supply_core.c
@@ -446,6 +446,45 @@ struct power_supply *power_supply_get_by_phandle(struct device_node *np,
 	return psy;
 }
 EXPORT_SYMBOL_GPL(power_supply_get_by_phandle);
+
+static void devm_power_supply_put(struct device *dev, void *res)
+{
+	struct power_supply **psy = res;
+
+	power_supply_put(*psy);
+}
+
+/**
+ * devm_power_supply_get_by_phandle() - Resource managed version of
+ *  power_supply_get_by_phandle()
+ * @dev: Pointer to device holding phandle property
+ * @phandle_name: Name of property holding a power supply phandle
+ *
+ * Return: On success returns a reference to a power supply with
+ * matching name equals to value under @property, NULL or ERR_PTR otherwise.
+ */
+struct power_supply *devm_power_supply_get_by_phandle(struct device *dev,
+						      const char *property)
+{
+	struct power_supply **ptr, *psy;
+
+	if (!dev->of_node)
+		return ERR_PTR(-ENODEV);
+
+	ptr = devres_alloc(devm_power_supply_put, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	psy = power_supply_get_by_phandle(dev->of_node, property);
+	if (IS_ERR_OR_NULL(psy)) {
+		devres_free(ptr);
+	} else {
+		*ptr = psy;
+		devres_add(dev, ptr);
+	}
+	return psy;
+}
+EXPORT_SYMBOL_GPL(devm_power_supply_get_by_phandle);
 #endif /* CONFIG_OF */
 
 int power_supply_get_property(struct power_supply *psy,
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 0395bcb18ddb..ef9f1592185d 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -292,10 +292,15 @@ extern void power_supply_put(struct power_supply *psy);
 #ifdef CONFIG_OF
 extern struct power_supply *power_supply_get_by_phandle(struct device_node *np,
 							const char *property);
+extern struct power_supply *devm_power_supply_get_by_phandle(
+				    struct device *dev, const char *property);
 #else /* !CONFIG_OF */
 static inline struct power_supply *
 power_supply_get_by_phandle(struct device_node *np, const char *property)
 { return NULL; }
+static inline struct power_supply *
+devm_power_supply_get_by_phandle(struct device *dev, const char *property)
+{ return NULL; }
 #endif /* CONFIG_OF */
 extern void power_supply_changed(struct power_supply *psy);
 extern int power_supply_am_i_supplied(struct power_supply *psy);
-- 
cgit v1.2.3


From 3037e9ea780027d41baaaabb68a749e49e7c8260 Mon Sep 17 00:00:00 2001
From: Daniel Thompson <daniel.thompson@linaro.org>
Date: Wed, 10 Jun 2015 21:04:54 +0100
Subject: clk: fixed: Add comment to clk_fixed_set_rate

Currently it is not made explicit why clk_fixed_set_rate() can ignore
its arguments and unconditionally return success. Add a comment
to explain this.

We also mark the clk_ops table const since it should never be
modified at runtime.

Suggested-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
---
 drivers/clk/clk-fixed-factor.c | 8 +++++++-
 include/linux/clk-provider.h   | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-fixed-factor.c b/drivers/clk/clk-fixed-factor.c
index e186db263d5e..fccabe497f6e 100644
--- a/drivers/clk/clk-fixed-factor.c
+++ b/drivers/clk/clk-fixed-factor.c
@@ -55,10 +55,16 @@ static long clk_factor_round_rate(struct clk_hw *hw, unsigned long rate,
 static int clk_factor_set_rate(struct clk_hw *hw, unsigned long rate,
 				unsigned long parent_rate)
 {
+	/*
+	 * We must report success but we can do so unconditionally because
+	 * clk_factor_round_rate returns values that ensure this call is a
+	 * nop.
+	 */
+
 	return 0;
 }
 
-struct clk_ops clk_fixed_factor_ops = {
+const struct clk_ops clk_fixed_factor_ops = {
 	.round_rate = clk_factor_round_rate,
 	.set_rate = clk_factor_set_rate,
 	.recalc_rate = clk_factor_recalc_rate,
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 2e5df069ca34..4a943d13625b 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -459,7 +459,7 @@ struct clk_fixed_factor {
 	unsigned int	div;
 };
 
-extern struct clk_ops clk_fixed_factor_ops;
+extern const struct clk_ops clk_fixed_factor_ops;
 struct clk *clk_register_fixed_factor(struct device *dev, const char *name,
 		const char *parent_name, unsigned long flags,
 		unsigned int mult, unsigned int div);
-- 
cgit v1.2.3


From b064a8fa77dfead647564c46ac8fc5b13bd1ab73 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 10 Jun 2015 01:33:36 +0200
Subject: ACPI / init: Switch over platform to the ACPI mode later

Commit 73f7d1ca3263 "ACPI / init: Run acpi_early_init() before
timekeeping_init()" moved the ACPI subsystem initialization,
including the ACPI mode enabling, to an earlier point in the
initialization sequence, to allow the timekeeping subsystem
use ACPI early.  Unfortunately, that resulted in boot regressions
on some systems and the early ACPI initialization was moved toward
its original position in the kernel initialization code by commit
c4e1acbb35e4 "ACPI / init: Invoke early ACPI initialization later".

However, that turns out to be insufficient, as boot is still broken
on the Tyan S8812 mainboard.

To fix that issue, split the ACPI early initialization code into
two pieces so the majority of it still located in acpi_early_init()
and the part switching over the platform into the ACPI mode goes into
a new function, acpi_subsystem_init(), executed at the original early
ACPI initialization spot.

That fixes the Tyan S8812 boot problem, but still allows ACPI
tables to be loaded earlier which is useful to the EFI code in
efi_enter_virtual_mode().

Link: https://bugzilla.kernel.org/show_bug.cgi?id=97141
Fixes: 73f7d1ca3263 "ACPI / init: Run acpi_early_init() before timekeeping_init()"
Reported-and-tested-by: Marius Tolzmann <tolzmann@molgen.mpg.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Toshi Kani <toshi.kani@hp.com>
Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Reviewed-by: Lee, Chun-Yi <jlee@suse.com>
---
 drivers/acpi/bus.c   | 56 ++++++++++++++++++++++++++++++++++++++--------------
 include/linux/acpi.h |  2 ++
 init/main.c          |  1 +
 3 files changed, 44 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index c412fdb28d34..513e7230e3d0 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -470,6 +470,16 @@ static int __init acpi_bus_init_irq(void)
 	return 0;
 }
 
+/**
+ * acpi_early_init - Initialize ACPICA and populate the ACPI namespace.
+ *
+ * The ACPI tables are accessible after this, but the handling of events has not
+ * been initialized and the global lock is not available yet, so AML should not
+ * be executed at this point.
+ *
+ * Doing this before switching the EFI runtime services to virtual mode allows
+ * the EfiBootServices memory to be freed slightly earlier on boot.
+ */
 void __init acpi_early_init(void)
 {
 	acpi_status status;
@@ -533,26 +543,42 @@ void __init acpi_early_init(void)
 		acpi_gbl_FADT.sci_interrupt = acpi_sci_override_gsi;
 	}
 #endif
+	return;
+
+ error0:
+	disable_acpi();
+}
+
+/**
+ * acpi_subsystem_init - Finalize the early initialization of ACPI.
+ *
+ * Switch over the platform to the ACPI mode (if possible), initialize the
+ * handling of ACPI events, install the interrupt and global lock handlers.
+ *
+ * Doing this too early is generally unsafe, but at the same time it needs to be
+ * done before all things that really depend on ACPI.  The right spot appears to
+ * be before finalizing the EFI initialization.
+ */
+void __init acpi_subsystem_init(void)
+{
+	acpi_status status;
+
+	if (acpi_disabled)
+		return;
 
 	status = acpi_enable_subsystem(~ACPI_NO_ACPI_ENABLE);
 	if (ACPI_FAILURE(status)) {
 		printk(KERN_ERR PREFIX "Unable to enable ACPI\n");
-		goto error0;
+		disable_acpi();
+	} else {
+		/*
+		 * If the system is using ACPI then we can be reasonably
+		 * confident that any regulators are managed by the firmware
+		 * so tell the regulator core it has everything it needs to
+		 * know.
+		 */
+		regulator_has_full_constraints();
 	}
-
-	/*
-	 * If the system is using ACPI then we can be reasonably
-	 * confident that any regulators are managed by the firmware
-	 * so tell the regulator core it has everything it needs to
-	 * know.
-	 */
-	regulator_has_full_constraints();
-
-	return;
-
-      error0:
-	disable_acpi();
-	return;
 }
 
 static int __init acpi_bus_init(void)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e35e29c..4550be3bb63b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -440,6 +440,7 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle,
 #define ACPI_OST_SC_INSERT_NOT_SUPPORTED	0x82
 
 extern void acpi_early_init(void);
+extern void acpi_subsystem_init(void);
 
 extern int acpi_nvs_register(__u64 start, __u64 size);
 
@@ -494,6 +495,7 @@ static inline const char *acpi_dev_name(struct acpi_device *adev)
 }
 
 static inline void acpi_early_init(void) { }
+static inline void acpi_subsystem_init(void) { }
 
 static inline int early_acpi_boot_init(void)
 {
diff --git a/init/main.c b/init/main.c
index 2115055faeac..2a89545e0a5d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -664,6 +664,7 @@ asmlinkage __visible void __init start_kernel(void)
 
 	check_bugs();
 
+	acpi_subsystem_init();
 	sfi_init_late();
 
 	if (efi_enabled(EFI_RUNTIME_SERVICES)) {
-- 
cgit v1.2.3


From 3c87ef6efb40f0e339d705c194b2224f854ec38e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Wed, 3 Jun 2015 16:14:25 -0400
Subject: sunrpc: keep a count of swapfiles associated with the rpc_clnt

Jerome reported seeing a warning pop when working with a swapfile on
NFS. The nfs_swap_activate can end up calling sk_set_memalloc while
holding the rcu_read_lock and that function can sleep.

To fix that, we need to take a reference to the xprt while holding the
rcu_read_lock, set the socket up for swapping and then drop that
reference. But, xprt_put is not exported and having NFS deal with the
underlying xprt is a bit of layering violation anyway.

Fix this by adding a set of activate/deactivate functions that take a
rpc_clnt pointer instead of an rpc_xprt, and have nfs_swap_activate and
nfs_swap_deactivate call those.

Also, add a per-rpc_clnt atomic counter to keep track of the number of
active swapfiles associated with it. When the counter does a 0->1
transition, we enable swapping on the xprt, when we do a 1->0 transition
we disable swapping on it.

This also allows us to be a bit more selective with the RPC_TASK_SWAPPER
flag. If non-swapper and swapper clnts are sharing a xprt, then we only
need to flag the tasks from the swapper clnt with that flag.

Acked-by: Mel Gorman <mgorman@suse.de>
Reported-by: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c                | 15 ++--------
 include/linux/sunrpc/clnt.h  |  1 +
 include/linux/sunrpc/sched.h | 16 +++++++++++
 net/sunrpc/clnt.c            | 67 ++++++++++++++++++++++++++++++++++++++------
 net/sunrpc/xprtsock.c        |  3 +-
 5 files changed, 78 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8b8d83a526ce..cc4fa1ed61fc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page)
 	return nfs_wb_page(inode, page);
 }
 
-#ifdef CONFIG_NFS_SWAP
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 						sector_t *span)
 {
-	int ret;
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
 	*span = sis->pages;
 
-	rcu_read_lock();
-	ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
-	rcu_read_unlock();
-
-	return ret;
+	return rpc_clnt_swap_activate(clnt);
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
-	rcu_read_lock();
-	xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
-	rcu_read_unlock();
+	rpc_clnt_swap_deactivate(clnt);
 }
-#endif
 
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
@@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
-#ifdef CONFIG_NFS_SWAP
 	.swap_activate = nfs_swap_activate,
 	.swap_deactivate = nfs_swap_deactivate,
-#endif
 };
 
 /*
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 598ba80ec30c..131032f15cc1 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -56,6 +56,7 @@ struct rpc_clnt {
 	struct rpc_rtt *	cl_rtt;		/* RTO estimator data */
 	const struct rpc_timeout *cl_timeout;	/* Timeout strategy */
 
+	atomic_t		cl_swapper;	/* swapfile count */
 	int			cl_nodelen;	/* nodename length */
 	char 			cl_nodename[UNX_MAXNODENAME+1];
 	struct rpc_pipe_dir_head cl_pipedir_objects;
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 2aa68976a482..d703f0ef37d8 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -268,4 +268,20 @@ static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q,
 }
 #endif
 
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int rpc_clnt_swap_activate(struct rpc_clnt *clnt);
+void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt);
+#else
+static inline int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+	return -EINVAL;
+}
+
+static inline void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+}
+#endif /* CONFIG_SUNRPC_SWAP */
+
 #endif /* _LINUX_SUNRPC_SCHED_H_ */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f6717170480e..0ba65156a62b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
 			task->tk_flags |= RPC_TASK_SOFT;
 		if (clnt->cl_noretranstimeo)
 			task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
-		if (sk_memalloc_socks()) {
-			struct rpc_xprt *xprt;
-
-			rcu_read_lock();
-			xprt = rcu_dereference(clnt->cl_xprt);
-			if (xprt->swapper)
-				task->tk_flags |= RPC_TASK_SWAPPER;
-			rcu_read_unlock();
-		}
+		if (atomic_read(&clnt->cl_swapper))
+			task->tk_flags |= RPC_TASK_SWAPPER;
 		/* Add to the client's list of all tasks */
 		spin_lock(&clnt->cl_lock);
 		list_add_tail(&task->tk_task, &clnt->cl_tasks);
@@ -2479,3 +2472,59 @@ void rpc_show_tasks(struct net *net)
 	spin_unlock(&sn->rpc_client_lock);
 }
 #endif
+
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+	int ret = 0;
+	struct rpc_xprt	*xprt;
+
+	if (atomic_inc_return(&clnt->cl_swapper) == 1) {
+retry:
+		rcu_read_lock();
+		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+		rcu_read_unlock();
+		if (!xprt) {
+			/*
+			 * If we didn't get a reference, then we likely are
+			 * racing with a migration event. Wait for a grace
+			 * period and try again.
+			 */
+			synchronize_rcu();
+			goto retry;
+		}
+
+		ret = xs_swapper(xprt, 1);
+		xprt_put(xprt);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate);
+
+void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt	*xprt;
+
+	if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) {
+retry:
+		rcu_read_lock();
+		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+		rcu_read_unlock();
+		if (!xprt) {
+			/*
+			 * If we didn't get a reference, then we likely are
+			 * racing with a migration event. Wait for a grace
+			 * period and try again.
+			 */
+			synchronize_rcu();
+			goto retry;
+		}
+
+		xs_swapper(xprt, 0);
+		xprt_put(xprt);
+	}
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
+#endif /* CONFIG_SUNRPC_SWAP */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b142feef0cf4..f344c0f8974c 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1955,7 +1955,7 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		msleep_interruptible(15000);
 }
 
-#ifdef CONFIG_SUNRPC_SWAP
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
@@ -1987,7 +1987,6 @@ int xs_swapper(struct rpc_xprt *xprt, int enable)
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(xs_swapper);
 #else
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
-- 
cgit v1.2.3


From 8e2281330f9930bccf77cf04027ec60b6cc0fb34 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Wed, 3 Jun 2015 16:14:26 -0400
Subject: sunrpc: make xprt->swapper an atomic_t

Split xs_swapper into enable/disable functions and eliminate the
"enable" flag.

Currently, it's racy if you have multiple swapon/swapoff operations
running in parallel over the same xprt. Also fix it so that we only
set it to a memalloc socket on a 0->1 transition and only clear it
on a 1->0 transition.

Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h |  5 +++--
 net/sunrpc/clnt.c           |  4 ++--
 net/sunrpc/xprtsock.c       | 38 +++++++++++++++++++++++++-------------
 3 files changed, 30 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index bc9c39d6d30d..9a3308703c15 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -180,7 +180,7 @@ struct rpc_xprt {
 	atomic_t		num_reqs;	/* total slots */
 	unsigned long		state;		/* transport state */
 	unsigned char		resvport   : 1; /* use a reserved port */
-	unsigned int		swapper;	/* we're swapping over this
+	atomic_t		swapper;	/* we're swapping over this
 						   transport */
 	unsigned int		bind_index;	/* bind function index */
 
@@ -346,7 +346,8 @@ void			xprt_release_rqst_cong(struct rpc_task *task);
 void			xprt_disconnect_done(struct rpc_xprt *xprt);
 void			xprt_force_disconnect(struct rpc_xprt *xprt);
 void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
-int			xs_swapper(struct rpc_xprt *xprt, int enable);
+int			xs_swapper_enable(struct rpc_xprt *xprt);
+void			xs_swapper_disable(struct rpc_xprt *xprt);
 
 bool			xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
 void			xprt_unlock_connect(struct rpc_xprt *, void *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0ba65156a62b..801044aeeedd 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2495,7 +2495,7 @@ retry:
 			goto retry;
 		}
 
-		ret = xs_swapper(xprt, 1);
+		ret = xs_swapper_enable(xprt);
 		xprt_put(xprt);
 	}
 	return ret;
@@ -2522,7 +2522,7 @@ retry:
 			goto retry;
 		}
 
-		xs_swapper(xprt, 0);
+		xs_swapper_disable(xprt);
 		xprt_put(xprt);
 	}
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index f344c0f8974c..6538e6fbb7ba 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1961,31 +1961,43 @@ static void xs_set_memalloc(struct rpc_xprt *xprt)
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
 			xprt);
 
-	if (xprt->swapper)
+	if (atomic_read(&xprt->swapper))
 		sk_set_memalloc(transport->inet);
 }
 
 /**
- * xs_swapper - Tag this transport as being used for swap.
+ * xs_swapper_enable - Tag this transport as being used for swap.
  * @xprt: transport to tag
- * @enable: enable/disable
  *
+ * Take a reference to this transport on behalf of the rpc_clnt, and
+ * optionally mark it for swapping if it wasn't already.
  */
-int xs_swapper(struct rpc_xprt *xprt, int enable)
+int
+xs_swapper_enable(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
 			xprt);
-	int err = 0;
 
-	if (enable) {
-		xprt->swapper++;
-		xs_set_memalloc(xprt);
-	} else if (xprt->swapper) {
-		xprt->swapper--;
-		sk_clear_memalloc(transport->inet);
-	}
+	if (atomic_inc_return(&xprt->swapper) == 1)
+		sk_set_memalloc(transport->inet);
+	return 0;
+}
 
-	return err;
+/**
+ * xs_swapper_disable - Untag this transport as being used for swap.
+ * @xprt: transport to tag
+ *
+ * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
+ * swapper refcount goes to 0, untag the socket as a memalloc socket.
+ */
+void
+xs_swapper_disable(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+			xprt);
+
+	if (atomic_dec_and_test(&xprt->swapper))
+		sk_clear_memalloc(transport->inet);
 }
 #else
 static void xs_set_memalloc(struct rpc_xprt *xprt)
-- 
cgit v1.2.3


From d67fa4d85a2143b46052b2e9ccc6749a4c97b2de Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Wed, 3 Jun 2015 16:14:29 -0400
Subject: sunrpc: turn swapper_enable/disable functions into rpc_xprt_ops

RDMA xprts don't have a sock_xprt, but an rdma_xprt, so the
xs_swapper_enable/disable functions will likely oops when fed an RDMA
xprt. Turn these functions into rpc_xprt_ops so that that doesn't
occur. For now the RDMA versions are no-ops that just return -EINVAL
on an attempt to swapon.

Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h     | 16 ++++++++++++++--
 net/sunrpc/clnt.c               |  4 ++--
 net/sunrpc/xprtrdma/transport.c | 15 ++++++++++++++-
 net/sunrpc/xprtsock.c           | 31 +++++++++++++++++++++++++------
 4 files changed, 55 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 9a3308703c15..b6096592b1d4 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -133,6 +133,8 @@ struct rpc_xprt_ops {
 	void		(*close)(struct rpc_xprt *xprt);
 	void		(*destroy)(struct rpc_xprt *xprt);
 	void		(*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
+	int		(*enable_swap)(struct rpc_xprt *xprt);
+	void		(*disable_swap)(struct rpc_xprt *xprt);
 };
 
 /*
@@ -328,6 +330,18 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
 	return p + xprt->tsh_size;
 }
 
+static inline int
+xprt_enable_swap(struct rpc_xprt *xprt)
+{
+	return xprt->ops->enable_swap(xprt);
+}
+
+static inline void
+xprt_disable_swap(struct rpc_xprt *xprt)
+{
+	xprt->ops->disable_swap(xprt);
+}
+
 /*
  * Transport switch helper functions
  */
@@ -346,8 +360,6 @@ void			xprt_release_rqst_cong(struct rpc_task *task);
 void			xprt_disconnect_done(struct rpc_xprt *xprt);
 void			xprt_force_disconnect(struct rpc_xprt *xprt);
 void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
-int			xs_swapper_enable(struct rpc_xprt *xprt);
-void			xs_swapper_disable(struct rpc_xprt *xprt);
 
 bool			xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
 void			xprt_unlock_connect(struct rpc_xprt *, void *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 801044aeeedd..576d6ae39f25 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2495,7 +2495,7 @@ retry:
 			goto retry;
 		}
 
-		ret = xs_swapper_enable(xprt);
+		ret = xprt_enable_swap(xprt);
 		xprt_put(xprt);
 	}
 	return ret;
@@ -2522,7 +2522,7 @@ retry:
 			goto retry;
 		}
 
-		xs_swapper_disable(xprt);
+		xprt_disable_swap(xprt);
 		xprt_put(xprt);
 	}
 }
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 54f23b1be986..ebf6fe759f0e 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -682,6 +682,17 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 	   r_xprt->rx_stats.bad_reply_count);
 }
 
+static int
+xprt_rdma_enable_swap(struct rpc_xprt *xprt)
+{
+	return -EINVAL;
+}
+
+static void
+xprt_rdma_disable_swap(struct rpc_xprt *xprt)
+{
+}
+
 /*
  * Plumbing for rpc transport switch and kernel module
  */
@@ -700,7 +711,9 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
 	.send_request		= xprt_rdma_send_request,
 	.close			= xprt_rdma_close,
 	.destroy		= xprt_rdma_destroy,
-	.print_stats		= xprt_rdma_print_stats
+	.print_stats		= xprt_rdma_print_stats,
+	.enable_swap		= xprt_rdma_enable_swap,
+	.disable_swap		= xprt_rdma_disable_swap,
 };
 
 static struct xprt_class xprt_rdma = {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e0efd8514886..bf20726d4ab5 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1980,14 +1980,14 @@ static void xs_set_memalloc(struct rpc_xprt *xprt)
 }
 
 /**
- * xs_swapper_enable - Tag this transport as being used for swap.
+ * xs_enable_swap - Tag this transport as being used for swap.
  * @xprt: transport to tag
  *
  * Take a reference to this transport on behalf of the rpc_clnt, and
  * optionally mark it for swapping if it wasn't already.
  */
-int
-xs_swapper_enable(struct rpc_xprt *xprt)
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
 
@@ -2002,14 +2002,14 @@ xs_swapper_enable(struct rpc_xprt *xprt)
 }
 
 /**
- * xs_swapper_disable - Untag this transport as being used for swap.
+ * xs_disable_swap - Untag this transport as being used for swap.
  * @xprt: transport to tag
  *
  * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
  * swapper refcount goes to 0, untag the socket as a memalloc socket.
  */
-void
-xs_swapper_disable(struct rpc_xprt *xprt)
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
 
@@ -2025,6 +2025,17 @@ xs_swapper_disable(struct rpc_xprt *xprt)
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
 }
+
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
+{
+	return -EINVAL;
+}
+
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
+{
+}
 #endif
 
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@@ -2488,6 +2499,8 @@ static struct rpc_xprt_ops xs_local_ops = {
 	.close			= xs_close,
 	.destroy		= xs_destroy,
 	.print_stats		= xs_local_print_stats,
+	.enable_swap		= xs_enable_swap,
+	.disable_swap		= xs_disable_swap,
 };
 
 static struct rpc_xprt_ops xs_udp_ops = {
@@ -2507,6 +2520,8 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.close			= xs_close,
 	.destroy		= xs_destroy,
 	.print_stats		= xs_udp_print_stats,
+	.enable_swap		= xs_enable_swap,
+	.disable_swap		= xs_disable_swap,
 };
 
 static struct rpc_xprt_ops xs_tcp_ops = {
@@ -2523,6 +2538,8 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.close			= xs_tcp_shutdown,
 	.destroy		= xs_destroy,
 	.print_stats		= xs_tcp_print_stats,
+	.enable_swap		= xs_enable_swap,
+	.disable_swap		= xs_disable_swap,
 };
 
 /*
@@ -2540,6 +2557,8 @@ static struct rpc_xprt_ops bc_tcp_ops = {
 	.close			= bc_close,
 	.destroy		= bc_destroy,
 	.print_stats		= xs_tcp_print_stats,
+	.enable_swap		= xs_enable_swap,
+	.disable_swap		= xs_disable_swap,
 };
 
 static int xs_init_anyaddr(const int family, struct sockaddr *sap)
-- 
cgit v1.2.3


From 11598b8ff2b97cf034d0c025cf125c92b574bafc Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Wed, 10 Jun 2015 16:54:28 -0400
Subject: NFS: Remove unused nfs_rw_ops->rw_release() function

This was only ever set to nfs_writeback_release_common(), a function
which is completely empty.  Let's just drop this function pointer and
simplify the code a bit.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c        | 2 --
 fs/nfs/write.c           | 6 ------
 include/linux/nfs_page.h | 1 -
 3 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 282b39369510..e9953ab1ac40 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -690,8 +690,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
 static void nfs_pgio_release(void *calldata)
 {
 	struct nfs_pgio_header *hdr = calldata;
-	if (hdr->rw_ops->rw_release)
-		hdr->rw_ops->rw_release(hdr);
 	nfs_pgio_data_destroy(hdr);
 	hdr->completion_ops->completion(hdr);
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index dfc19f1575a1..d0f1f210342e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1347,11 +1347,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
-{
-	/* do nothing! */
-}
-
 /*
  * Special version of should_remove_suid() that ignores capabilities.
  */
@@ -2012,7 +2007,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = {
 	.rw_mode		= FMODE_WRITE,
 	.rw_alloc_header	= nfs_writehdr_alloc,
 	.rw_free_header		= nfs_writehdr_free,
-	.rw_release		= nfs_writeback_release_common,
 	.rw_done		= nfs_writeback_done,
 	.rw_result		= nfs_writeback_result,
 	.rw_initiate		= nfs_initiate_write,
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 3eb072dbce83..f2f650f136ee 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -67,7 +67,6 @@ struct nfs_rw_ops {
 	const fmode_t rw_mode;
 	struct nfs_pgio_header *(*rw_alloc_header)(void);
 	void (*rw_free_header)(struct nfs_pgio_header *);
-	void (*rw_release)(struct nfs_pgio_header *);
 	int  (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
 			struct inode *);
 	void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
-- 
cgit v1.2.3


From 4a06825839889cc1756d0dd8a52d6b1071ee0263 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 May 2015 14:02:25 -0400
Subject: SUNRPC: Transport fault injection

It has been exceptionally useful to exercise the logic that handles
local immediate errors and RDMA connection loss.  To enable
developers to test this regularly and repeatably, add logic to
simulate connection loss every so often.

Fault injection is disabled by default. It is enabled with

  $ sudo echo xxx > /sys/kernel/debug/sunrpc/inject_fault/disconnect

where "xxx" is a large positive number of transport method calls
before a disconnect. A value of several thousand is usually a good
number that allows reasonable forward progress while still causing a
lot of connection drops.

These hooks are disabled when SUNRPC_DEBUG is turned off.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h     | 19 ++++++++++
 net/sunrpc/clnt.c               |  1 +
 net/sunrpc/debugfs.c            | 77 +++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprt.c               |  2 ++
 net/sunrpc/xprtrdma/transport.c | 11 ++++++
 net/sunrpc/xprtsock.c           | 10 ++++++
 6 files changed, 120 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index b6096592b1d4..0fb9acbb4780 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -135,6 +135,7 @@ struct rpc_xprt_ops {
 	void		(*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
 	int		(*enable_swap)(struct rpc_xprt *xprt);
 	void		(*disable_swap)(struct rpc_xprt *xprt);
+	void		(*inject_disconnect)(struct rpc_xprt *xprt);
 };
 
 /*
@@ -244,6 +245,7 @@ struct rpc_xprt {
 	const char		*address_strings[RPC_DISPLAY_MAX];
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	struct dentry		*debugfs;		/* debugfs directory */
+	atomic_t		inject_disconnect;
 #endif
 };
 
@@ -445,6 +447,23 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
 	return test_and_set_bit(XPRT_BINDING, &xprt->state);
 }
 
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+extern unsigned int rpc_inject_disconnect;
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+	if (!rpc_inject_disconnect)
+		return;
+	if (atomic_dec_return(&xprt->inject_disconnect))
+		return;
+	atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
+	xprt->ops->inject_disconnect(xprt);
+}
+#else
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+}
+#endif
+
 #endif /* __KERNEL__*/
 
 #endif /* _LINUX_SUNRPC_XPRT_H */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 576d6ae39f25..f41ed882ed3c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1605,6 +1605,7 @@ call_allocate(struct rpc_task *task)
 					req->rq_callsize + req->rq_rcvsize);
 	if (req->rq_buffer != NULL)
 		return;
+	xprt_inject_disconnect(xprt);
 
 	dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index 82962f7e6e88..7cc1b8a6ef6d 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -10,9 +10,12 @@
 #include "netns.h"
 
 static struct dentry *topdir;
+static struct dentry *rpc_fault_dir;
 static struct dentry *rpc_clnt_dir;
 static struct dentry *rpc_xprt_dir;
 
+unsigned int rpc_inject_disconnect;
+
 struct rpc_clnt_iter {
 	struct rpc_clnt	*clnt;
 	loff_t		pos;
@@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
 		debugfs_remove_recursive(xprt->debugfs);
 		xprt->debugfs = NULL;
 	}
+
+	atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
 }
 
 void
@@ -266,11 +271,78 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
 	xprt->debugfs = NULL;
 }
 
+static int
+fault_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = kmalloc(128, GFP_KERNEL);
+	if (!filp->private_data)
+		return -ENOMEM;
+	return 0;
+}
+
+static int
+fault_release(struct inode *inode, struct file *filp)
+{
+	kfree(filp->private_data);
+	return 0;
+}
+
+static ssize_t
+fault_disconnect_read(struct file *filp, char __user *user_buf,
+		      size_t len, loff_t *offset)
+{
+	char *buffer = (char *)filp->private_data;
+	size_t size;
+
+	size = sprintf(buffer, "%u\n", rpc_inject_disconnect);
+	return simple_read_from_buffer(user_buf, len, offset, buffer, size);
+}
+
+static ssize_t
+fault_disconnect_write(struct file *filp, const char __user *user_buf,
+		       size_t len, loff_t *offset)
+{
+	char buffer[16];
+
+	len = min(len, sizeof(buffer) - 1);
+	if (copy_from_user(buffer, user_buf, len))
+		return -EFAULT;
+	buffer[len] = '\0';
+	if (kstrtouint(buffer, 10, &rpc_inject_disconnect))
+		return -EINVAL;
+	return len;
+}
+
+static const struct file_operations fault_disconnect_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fault_open,
+	.read		= fault_disconnect_read,
+	.write		= fault_disconnect_write,
+	.release	= fault_release,
+};
+
+static struct dentry *
+inject_fault_dir(struct dentry *topdir)
+{
+	struct dentry *faultdir;
+
+	faultdir = debugfs_create_dir("inject_fault", topdir);
+	if (!faultdir)
+		return NULL;
+
+	if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir,
+				 NULL, &fault_disconnect_fops))
+		return NULL;
+
+	return faultdir;
+}
+
 void __exit
 sunrpc_debugfs_exit(void)
 {
 	debugfs_remove_recursive(topdir);
 	topdir = NULL;
+	rpc_fault_dir = NULL;
 	rpc_clnt_dir = NULL;
 	rpc_xprt_dir = NULL;
 }
@@ -282,6 +354,10 @@ sunrpc_debugfs_init(void)
 	if (!topdir)
 		return;
 
+	rpc_fault_dir = inject_fault_dir(topdir);
+	if (!rpc_fault_dir)
+		goto out_remove;
+
 	rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
 	if (!rpc_clnt_dir)
 		goto out_remove;
@@ -294,5 +370,6 @@ sunrpc_debugfs_init(void)
 out_remove:
 	debugfs_remove_recursive(topdir);
 	topdir = NULL;
+	rpc_fault_dir = NULL;
 	rpc_clnt_dir = NULL;
 }
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1d4fe24af06a..e1fb538e10e0 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -967,6 +967,7 @@ void xprt_transmit(struct rpc_task *task)
 		task->tk_status = status;
 		return;
 	}
+	xprt_inject_disconnect(xprt);
 
 	dprintk("RPC: %5u xmit complete\n", task->tk_pid);
 	task->tk_flags |= RPC_TASK_SENT;
@@ -1285,6 +1286,7 @@ void xprt_release(struct rpc_task *task)
 	spin_unlock_bh(&xprt->transport_lock);
 	if (req->rq_buffer)
 		xprt->ops->buf_free(req->rq_buffer);
+	xprt_inject_disconnect(xprt);
 	if (req->rq_cred != NULL)
 		put_rpccred(req->rq_cred);
 	task->tk_rqstp = NULL;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ebf6fe759f0e..b8aac23b1b0c 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -246,6 +246,16 @@ xprt_rdma_connect_worker(struct work_struct *work)
 	xprt_clear_connecting(xprt);
 }
 
+static void
+xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
+						   rx_xprt);
+
+	pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);
+	rdma_disconnect(r_xprt->rx_ia.ri_id);
+}
+
 /*
  * xprt_rdma_destroy
  *
@@ -714,6 +724,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
 	.print_stats		= xprt_rdma_print_stats,
 	.enable_swap		= xprt_rdma_enable_swap,
 	.disable_swap		= xprt_rdma_disable_swap,
+	.inject_disconnect	= xprt_rdma_inject_disconnect
 };
 
 static struct xprt_class xprt_rdma = {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf20726d4ab5..fda8ec8c74c0 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -866,6 +866,13 @@ static void xs_close(struct rpc_xprt *xprt)
 	xprt_disconnect_done(xprt);
 }
 
+static void xs_inject_disconnect(struct rpc_xprt *xprt)
+{
+	dprintk("RPC:       injecting transport disconnect on xprt=%p\n",
+		xprt);
+	xprt_disconnect_done(xprt);
+}
+
 static void xs_xprt_free(struct rpc_xprt *xprt)
 {
 	xs_free_peer_addresses(xprt);
@@ -2522,6 +2529,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.print_stats		= xs_udp_print_stats,
 	.enable_swap		= xs_enable_swap,
 	.disable_swap		= xs_disable_swap,
+	.inject_disconnect	= xs_inject_disconnect,
 };
 
 static struct rpc_xprt_ops xs_tcp_ops = {
@@ -2540,6 +2548,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 	.enable_swap		= xs_enable_swap,
 	.disable_swap		= xs_disable_swap,
+	.inject_disconnect	= xs_inject_disconnect,
 };
 
 /*
@@ -2559,6 +2568,7 @@ static struct rpc_xprt_ops bc_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 	.enable_swap		= xs_enable_swap,
 	.disable_swap		= xs_disable_swap,
+	.inject_disconnect	= xs_inject_disconnect,
 };
 
 static int xs_init_anyaddr(const int family, struct sockaddr *sap)
-- 
cgit v1.2.3


From 4f822c625f9c68267ffee7519519e304858a46c3 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 10 Jun 2015 18:07:57 -0700
Subject: net: phy: broadcom: include phy.h for brcmphy.h

We utilize inline functions from the PHY library, make sure that we do
include phy.h in brcmphy.h in order for the code including brcmphy.h not
to have to resolve this inclusion dependency.

Fixes: 705314797b8b ("net: phy: broadcom: move shadow 0x1C register accessors to brcmphy.h")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 656da2a12ffe..abb6106f839d 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_BRCMPHY_H
 #define _LINUX_BRCMPHY_H
 
+#include <linux/phy.h>
+
 #define PHY_ID_BCM50610			0x0143bd60
 #define PHY_ID_BCM50610M		0x0143bd70
 #define PHY_ID_BCM5241			0x0143bc30
-- 
cgit v1.2.3


From 8bc84b79265f66d5af736473db582e74b28e099d Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Wed, 10 Jun 2015 18:07:58 -0700
Subject: net: phy: broadcom: define Broadcom pseudo-PHY address in brcmphy.h

Define the pseudo-PHY address (30) which is used by all Broadcom
Ethernet switches in a shared header file.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index abb6106f839d..697ca7795bd9 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -3,6 +3,11 @@
 
 #include <linux/phy.h>
 
+/* All Broadcom Ethernet switches have a pseudo-PHY at address 30 which is used
+ * to configure the switch internal registers via MDIO accesses.
+ */
+#define BRCM_PSEUDO_PHY_ADDR           30
+
 #define PHY_ID_BCM50610			0x0143bd60
 #define PHY_ID_BCM50610M		0x0143bd70
 #define PHY_ID_BCM5241			0x0143bc30
-- 
cgit v1.2.3


From d290f1e70d85a9a4d124594c6a3d769329960bdc Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Thu, 28 May 2015 18:41:36 +0200
Subject: iommu: Introduce iommu_request_dm_for_dev()

This function can be called by an IOMMU driver to request
that a device's default domain is direct mapped.

Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/iommu.h |  6 ++++++
 2 files changed, 59 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 224c6dd2d249..3b1a2551a747 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1538,3 +1538,56 @@ void iommu_put_dm_regions(struct device *dev, struct list_head *list)
 	if (ops && ops->put_dm_regions)
 		ops->put_dm_regions(dev, list);
 }
+
+/* Request that a device is direct mapped by the IOMMU */
+int iommu_request_dm_for_dev(struct device *dev)
+{
+	struct iommu_domain *dm_domain;
+	struct iommu_group *group;
+	int ret;
+
+	/* Device must already be in a group before calling this function */
+	group = iommu_group_get_for_dev(dev);
+	if (!group)
+		return -EINVAL;
+
+	mutex_lock(&group->mutex);
+
+	/* Check if the default domain is already direct mapped */
+	ret = 0;
+	if (group->default_domain &&
+	    group->default_domain->type == IOMMU_DOMAIN_IDENTITY)
+		goto out;
+
+	/* Don't change mappings of existing devices */
+	ret = -EBUSY;
+	if (iommu_group_device_count(group) != 1)
+		goto out;
+
+	/* Allocate a direct mapped domain */
+	ret = -ENOMEM;
+	dm_domain = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_IDENTITY);
+	if (!dm_domain)
+		goto out;
+
+	/* Attach the device to the domain */
+	ret = __iommu_attach_group(dm_domain, group);
+	if (ret) {
+		iommu_domain_free(dm_domain);
+		goto out;
+	}
+
+	/* Make the direct mapped domain the default for this group */
+	if (group->default_domain)
+		iommu_domain_free(group->default_domain);
+	group->default_domain = dm_domain;
+
+	pr_info("Using direct mapping for device %s\n", dev_name(dev));
+
+	ret = 0;
+out:
+	mutex_unlock(&group->mutex);
+	iommu_group_put(group);
+
+	return ret;
+}
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b944b2be4fa2..dc767f7c3704 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -225,6 +225,7 @@ extern void iommu_set_fault_handler(struct iommu_domain *domain,
 
 extern void iommu_get_dm_regions(struct device *dev, struct list_head *list);
 extern void iommu_put_dm_regions(struct device *dev, struct list_head *list);
+extern int iommu_request_dm_for_dev(struct device *dev);
 
 extern int iommu_attach_group(struct iommu_domain *domain,
 			      struct iommu_group *group);
@@ -411,6 +412,11 @@ static inline void iommu_put_dm_regions(struct device *dev,
 {
 }
 
+static inline int iommu_request_dm_for_dev(struct device *dev)
+{
+	return -ENODEV;
+}
+
 static inline int iommu_attach_group(struct iommu_domain *domain,
 				     struct iommu_group *group)
 {
-- 
cgit v1.2.3


From dfabde206aa10ae71a89ba75e68b1f58a6336a05 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Mon, 11 May 2015 17:08:50 +0100
Subject: mailbox: Add ability for clients to request channels by name

This patch supplies a new framework API; mbox_request_channel_byname().

It works by supplying the usual client pointer as the first argument and
a string as the second.  The API will search the client's node for a
'mbox-names' property then request a channel in the normal way using the
requested string's index as the expected second 'index' argument.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/mailbox.c      | 29 +++++++++++++++++++++++++++++
 include/linux/mailbox_client.h |  2 ++
 2 files changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
index c3c42d42d017..c7fdb57fd166 100644
--- a/drivers/mailbox/mailbox.c
+++ b/drivers/mailbox/mailbox.c
@@ -362,6 +362,35 @@ struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index)
 }
 EXPORT_SYMBOL_GPL(mbox_request_channel);
 
+struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl,
+					      const char *name)
+{
+	struct device_node *np = cl->dev->of_node;
+	struct property *prop;
+	const char *mbox_name;
+	int index = 0;
+
+	if (!np) {
+		dev_err(cl->dev, "%s() currently only supports DT\n", __func__);
+		return ERR_PTR(-ENOSYS);
+	}
+
+	if (!of_get_property(np, "mbox-names", NULL)) {
+		dev_err(cl->dev,
+			"%s() requires an \"mbox-names\" property\n", __func__);
+		return ERR_PTR(-ENOSYS);
+	}
+
+	of_property_for_each_string(np, "mbox-names", prop, mbox_name) {
+		if (!strncmp(name, mbox_name, strlen(name)))
+			break;
+		index++;
+	}
+
+	return mbox_request_channel(cl, index);
+}
+EXPORT_SYMBOL_GPL(mbox_request_channel_byname);
+
 /**
  * mbox_free_channel - The client relinquishes control of a mailbox
  *			channel by this call.
diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h
index 1726ccbd8009..44348710953f 100644
--- a/include/linux/mailbox_client.h
+++ b/include/linux/mailbox_client.h
@@ -40,6 +40,8 @@ struct mbox_client {
 	void (*tx_done)(struct mbox_client *cl, void *mssg, int r);
 };
 
+struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl,
+					      const char *name);
 struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index);
 int mbox_send_message(struct mbox_chan *chan, void *mssg);
 void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */
-- 
cgit v1.2.3


From dc14bdef8762a8098b1da881b611d722e24fe787 Mon Sep 17 00:00:00 2001
From: Vincent Cuissard <cuissard@marvell.com>
Date: Thu, 11 Jun 2015 14:00:19 +0200
Subject: NFC: nfcmrvl: add platform_data and DT configuration

Declare nfcmrvl platform_data structure and few DT parameters
for nfcmrvl driver.

Signed-off-by: Vincent Cuissard <cuissard@marvell.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/nfc/nfcmrvl/main.c            | 66 +++++++++++++++++++++++++++--------
 drivers/nfc/nfcmrvl/nfcmrvl.h         | 36 +++++++++++++------
 drivers/nfc/nfcmrvl/usb.c             |  6 +++-
 include/linux/platform_data/nfcmrvl.h | 31 ++++++++++++++++
 4 files changed, 113 insertions(+), 26 deletions(-)
 create mode 100644 include/linux/platform_data/nfcmrvl.h

(limited to 'include/linux')

diff --git a/drivers/nfc/nfcmrvl/main.c b/drivers/nfc/nfcmrvl/main.c
index 708aad28c567..e317a69a4560 100644
--- a/drivers/nfc/nfcmrvl/main.c
+++ b/drivers/nfc/nfcmrvl/main.c
@@ -19,6 +19,7 @@
 #include <linux/module.h>
 #include <linux/gpio.h>
 #include <linux/delay.h>
+#include <linux/of_gpio.h>
 #include <linux/nfc.h>
 #include <net/nfc/nci.h>
 #include <net/nfc/nci_core.h>
@@ -65,7 +66,7 @@ static int nfcmrvl_nci_send(struct nci_dev *ndev, struct sk_buff *skb)
 	if (!test_bit(NFCMRVL_NCI_RUNNING, &priv->flags))
 		return -EBUSY;
 
-	if (priv->hci_muxed) {
+	if (priv->config.hci_muxed) {
 		unsigned char *hdr;
 		unsigned char len = skb->len;
 
@@ -92,9 +93,9 @@ static struct nci_ops nfcmrvl_nci_ops = {
 };
 
 struct nfcmrvl_private *nfcmrvl_nci_register_dev(void *drv_data,
-						 struct nfcmrvl_if_ops *ops,
-						 struct device *dev,
-						 unsigned int flags)
+				struct nfcmrvl_if_ops *ops,
+				struct device *dev,
+				struct nfcmrvl_platform_data *pdata)
 {
 	struct nfcmrvl_private *priv;
 	int rc;
@@ -108,23 +109,24 @@ struct nfcmrvl_private *nfcmrvl_nci_register_dev(void *drv_data,
 	priv->drv_data = drv_data;
 	priv->if_ops = ops;
 	priv->dev = dev;
-	priv->hci_muxed = (flags & NFCMRVL_DEV_FLAG_HCI_MUXED) ? 1 : 0;
-	priv->reset_n_io = NFCMRVL_DEV_FLAG_GET_RESET_N_IO(flags);
 
-	if (priv->reset_n_io) {
+	memcpy(&priv->config, pdata, sizeof(*pdata));
+
+	if (priv->config.reset_n_io) {
 		rc = devm_gpio_request_one(dev,
-					   priv->reset_n_io,
+					   priv->config.reset_n_io,
 					   GPIOF_OUT_INIT_LOW,
 					   "nfcmrvl_reset_n");
 		if (rc < 0)
 			nfc_err(dev, "failed to request reset_n io\n");
 	}
 
-	if (priv->hci_muxed)
+	if (priv->config.hci_muxed)
 		headroom = NFCMRVL_HCI_EVENT_HEADER_SIZE;
 
 	protocols = NFC_PROTO_JEWEL_MASK
-		| NFC_PROTO_MIFARE_MASK | NFC_PROTO_FELICA_MASK
+		| NFC_PROTO_MIFARE_MASK
+		| NFC_PROTO_FELICA_MASK
 		| NFC_PROTO_ISO14443_MASK
 		| NFC_PROTO_ISO14443_B_MASK
 		| NFC_PROTO_NFC_DEP_MASK;
@@ -169,7 +171,7 @@ EXPORT_SYMBOL_GPL(nfcmrvl_nci_unregister_dev);
 
 int nfcmrvl_nci_recv_frame(struct nfcmrvl_private *priv, struct sk_buff *skb)
 {
-	if (priv->hci_muxed) {
+	if (priv->config.hci_muxed) {
 		if (skb->data[0] == NFCMRVL_HCI_EVENT_CODE &&
 		    skb->data[1] == NFCMRVL_HCI_NFC_EVENT_CODE) {
 			/* Data packet, let's extract NCI payload */
@@ -200,15 +202,51 @@ void nfcmrvl_chip_reset(struct nfcmrvl_private *priv)
 	 * To be improved.
 	 */
 
-	if (priv->reset_n_io) {
+	if (priv->config.reset_n_io) {
 		nfc_info(priv->dev, "reset the chip\n");
-		gpio_set_value(priv->reset_n_io, 0);
+		gpio_set_value(priv->config.reset_n_io, 0);
 		usleep_range(5000, 10000);
-		gpio_set_value(priv->reset_n_io, 1);
+		gpio_set_value(priv->config.reset_n_io, 1);
 	} else
 		nfc_info(priv->dev, "no reset available on this interface\n");
 }
 
+#ifdef CONFIG_OF
+
+int nfcmrvl_parse_dt(struct device_node *node,
+		     struct nfcmrvl_platform_data *pdata)
+{
+	int reset_n_io;
+
+	reset_n_io = of_get_named_gpio(node, "reset-n-io", 0);
+	if (reset_n_io < 0) {
+		pr_info("no reset-n-io config\n");
+		reset_n_io = 0;
+	} else if (!gpio_is_valid(reset_n_io)) {
+		pr_err("invalid reset-n-io GPIO\n");
+		return reset_n_io;
+	}
+	pdata->reset_n_io = reset_n_io;
+
+	if (of_find_property(node, "hci-muxed", NULL))
+		pdata->hci_muxed = 1;
+	else
+		pdata->hci_muxed = 0;
+
+	return 0;
+}
+
+#else
+
+int nfcmrvl_parse_dt(struct device_node *node,
+		     struct nfcmrvl_platform_data *pdata)
+{
+	return -ENODEV;
+}
+
+#endif
+EXPORT_SYMBOL_GPL(nfcmrvl_parse_dt);
+
 MODULE_AUTHOR("Marvell International Ltd.");
 MODULE_DESCRIPTION("Marvell NFC driver ver " VERSION);
 MODULE_VERSION(VERSION);
diff --git a/drivers/nfc/nfcmrvl/nfcmrvl.h b/drivers/nfc/nfcmrvl/nfcmrvl.h
index 2edae9a4b6bd..214412bd6110 100644
--- a/drivers/nfc/nfcmrvl/nfcmrvl.h
+++ b/drivers/nfc/nfcmrvl/nfcmrvl.h
@@ -16,6 +16,11 @@
  * this warranty disclaimer.
  **/
 
+#ifndef _NFCMRVL_H_
+#define _NFCMRVL_H_
+
+#include <linux/platform_data/nfcmrvl.h>
+
 /* Define private flags: */
 #define NFCMRVL_NCI_RUNNING			1
 
@@ -38,22 +43,25 @@
 #define NFCMRVL_HCI_OGF				0x81
 #define NFCMRVL_HCI_OCF				0xFE
 
-#define NFCMRVL_DEV_FLAG_HCI_MUXED		(1 << 0)
-#define NFCMRVL_DEV_FLAG_SET_RESET_N_IO(X)	((X) << 16)
-#define NFCMRVL_DEV_FLAG_GET_RESET_N_IO(X)	((X) >> 16)
 
 struct nfcmrvl_private {
 
-	/* Tell if NCI packets are encapsulated in HCI ones */
-	int hci_muxed;
+	unsigned long flags;
+
+	/* Platform configuration */
+	struct nfcmrvl_platform_data config;
+
 	struct nci_dev *ndev;
 
-	/* Reset IO (0 if not available) */
-	int reset_n_io;
+	/*
+	** PHY related information
+	*/
 
-	unsigned long flags;
+	/* PHY driver context */
 	void *drv_data;
+	/* PHY device */
 	struct device *dev;
+	/* Low level driver ops */
 	struct nfcmrvl_if_ops *if_ops;
 };
 
@@ -66,8 +74,14 @@ struct nfcmrvl_if_ops {
 void nfcmrvl_nci_unregister_dev(struct nfcmrvl_private *priv);
 int nfcmrvl_nci_recv_frame(struct nfcmrvl_private *priv, struct sk_buff *skb);
 struct nfcmrvl_private *nfcmrvl_nci_register_dev(void *drv_data,
-						 struct nfcmrvl_if_ops *ops,
-						 struct device *dev,
-						 unsigned int flags);
+				struct nfcmrvl_if_ops *ops,
+				struct device *dev,
+				struct nfcmrvl_platform_data *pdata);
+
 
 void nfcmrvl_chip_reset(struct nfcmrvl_private *priv);
+
+int nfcmrvl_parse_dt(struct device_node *node,
+		     struct nfcmrvl_platform_data *pdata);
+
+#endif
diff --git a/drivers/nfc/nfcmrvl/usb.c b/drivers/nfc/nfcmrvl/usb.c
index c4046b681bfa..aa3f3c1cff0a 100644
--- a/drivers/nfc/nfcmrvl/usb.c
+++ b/drivers/nfc/nfcmrvl/usb.c
@@ -302,6 +302,10 @@ static int nfcmrvl_probe(struct usb_interface *intf,
 	struct nfcmrvl_private *priv;
 	int i;
 	struct usb_device *udev = interface_to_usbdev(intf);
+	struct nfcmrvl_platform_data config;
+
+	/* No configuration for USB */
+	memset(&config, 0, sizeof(config));
 
 	nfc_info(&udev->dev, "intf %p id %p\n", intf, id);
 
@@ -339,7 +343,7 @@ static int nfcmrvl_probe(struct usb_interface *intf,
 	init_usb_anchor(&drv_data->deferred);
 
 	priv = nfcmrvl_nci_register_dev(drv_data, &usb_ops,
-					&drv_data->udev->dev, 0);
+					&drv_data->udev->dev, &config);
 	if (IS_ERR(priv))
 		return PTR_ERR(priv);
 
diff --git a/include/linux/platform_data/nfcmrvl.h b/include/linux/platform_data/nfcmrvl.h
new file mode 100644
index 000000000000..106cfe5ed589
--- /dev/null
+++ b/include/linux/platform_data/nfcmrvl.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2015, Marvell International Ltd.
+ *
+ * This software file (the "File") is distributed by Marvell International
+ * Ltd. under the terms of the GNU General Public License Version 2, June 1991
+ * (the "License").  You may use, redistribute and/or modify this File in
+ * accordance with the terms and conditions of the License, a copy of which
+ * is available on the worldwide web at
+ * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
+ *
+ * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY DISCLAIMED.  The License provides additional details about
+ * this warranty disclaimer.
+ */
+
+#ifndef _NFCMRVL_PTF_H_
+#define _NFCMRVL_PTF_H_
+
+struct nfcmrvl_platform_data {
+	/*
+	 * Generic
+	 */
+
+	/* GPIO that is wired to RESET_N signal */
+	unsigned int reset_n_io;
+	/* Tell if transport is muxed in HCI one */
+	unsigned int hci_muxed;
+};
+
+#endif /* _NFCMRVL_PTF_H_ */
-- 
cgit v1.2.3


From e097dc624f784debbde49701a493bf920bc422c7 Mon Sep 17 00:00:00 2001
From: Vincent Cuissard <cuissard@marvell.com>
Date: Thu, 11 Jun 2015 14:00:20 +0200
Subject: NFC: nfcmrvl: add UART driver

Add support of Marvell NFC chip controlled over UART

Signed-off-by: Vincent Cuissard <cuissard@marvell.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 .../devicetree/bindings/net/nfc/nfcmrvl.txt        |  29 +++
 drivers/nfc/nfcmrvl/Kconfig                        |  11 +
 drivers/nfc/nfcmrvl/Makefile                       |   3 +
 drivers/nfc/nfcmrvl/nfcmrvl.h                      |   7 +
 drivers/nfc/nfcmrvl/uart.c                         | 225 +++++++++++++++++++++
 include/linux/platform_data/nfcmrvl.h              |   9 +
 net/nfc/nci/uart.c                                 |   1 -
 7 files changed, 284 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/devicetree/bindings/net/nfc/nfcmrvl.txt
 create mode 100644 drivers/nfc/nfcmrvl/uart.c

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/nfc/nfcmrvl.txt b/Documentation/devicetree/bindings/net/nfc/nfcmrvl.txt
new file mode 100644
index 000000000000..7c4a0cc370cf
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/nfc/nfcmrvl.txt
@@ -0,0 +1,29 @@
+* Marvell International Ltd. NCI NFC Controller
+
+Required properties:
+- compatible: Should be "mrvl,nfc-uart".
+
+Optional SoC specific properties:
+- pinctrl-names: Contains only one value - "default".
+- pintctrl-0: Specifies the pin control groups used for this controller.
+- reset-n-io: Output GPIO pin used to reset the chip (active low).
+- hci-muxed: Specifies that the chip is muxing NCI over HCI frames.
+
+Optional UART-based chip specific properties:
+- flow-control: Specifies that the chip is using RTS/CTS.
+- break-control: Specifies that the chip needs specific break management.
+
+Example (for ARM-based BeagleBoard Black with 88W8887 on UART5):
+
+&uart5 {
+	status = "okay";
+
+	nfcmrvluart: nfcmrvluart@5 {
+		compatible = "mrvl,nfc-uart";
+
+		reset-n-io = <&gpio3 16 0>;
+
+		hci-muxed;
+		flow-control;
+        }
+};
diff --git a/drivers/nfc/nfcmrvl/Kconfig b/drivers/nfc/nfcmrvl/Kconfig
index 5e18afd9abe2..796be2411440 100644
--- a/drivers/nfc/nfcmrvl/Kconfig
+++ b/drivers/nfc/nfcmrvl/Kconfig
@@ -21,3 +21,14 @@ config NFC_MRVL_USB
 
 	  Say Y here to compile support for Marvell NFC-over-USB driver
 	  into the kernel or say M to compile it as module.
+
+config NFC_MRVL_UART
+	tristate "Marvell NFC-over-UART driver"
+	depends on NFC_MRVL && NFC_NCI_UART
+	help
+	  Marvell NFC-over-UART driver.
+
+	  This driver provides support for Marvell NFC-over-UART devices
+
+	  Say Y here to compile support for Marvell NFC-over-UART driver
+	  into the kernel or say M to compile it as module.
diff --git a/drivers/nfc/nfcmrvl/Makefile b/drivers/nfc/nfcmrvl/Makefile
index 97a0de72dc01..775196274d1f 100644
--- a/drivers/nfc/nfcmrvl/Makefile
+++ b/drivers/nfc/nfcmrvl/Makefile
@@ -7,3 +7,6 @@ obj-$(CONFIG_NFC_MRVL) += nfcmrvl.o
 
 nfcmrvl_usb-y += usb.o
 obj-$(CONFIG_NFC_MRVL_USB) += nfcmrvl_usb.o
+
+nfcmrvl_uart-y += uart.o
+obj-$(CONFIG_NFC_MRVL_UART) += nfcmrvl_uart.o
diff --git a/drivers/nfc/nfcmrvl/nfcmrvl.h b/drivers/nfc/nfcmrvl/nfcmrvl.h
index 214412bd6110..09780d57c9b8 100644
--- a/drivers/nfc/nfcmrvl/nfcmrvl.h
+++ b/drivers/nfc/nfcmrvl/nfcmrvl.h
@@ -43,6 +43,11 @@
 #define NFCMRVL_HCI_OGF				0x81
 #define NFCMRVL_HCI_OCF				0xFE
 
+enum nfcmrvl_phy {
+	NFCMRVL_PHY_USB		= 0,
+	NFCMRVL_PHY_UART	= 1,
+};
+
 
 struct nfcmrvl_private {
 
@@ -61,6 +66,8 @@ struct nfcmrvl_private {
 	void *drv_data;
 	/* PHY device */
 	struct device *dev;
+	/* PHY type */
+	enum nfcmrvl_phy phy;
 	/* Low level driver ops */
 	struct nfcmrvl_if_ops *if_ops;
 };
diff --git a/drivers/nfc/nfcmrvl/uart.c b/drivers/nfc/nfcmrvl/uart.c
new file mode 100644
index 000000000000..61442d6528a6
--- /dev/null
+++ b/drivers/nfc/nfcmrvl/uart.c
@@ -0,0 +1,225 @@
+/**
+ * Marvell NFC-over-UART driver
+ *
+ * Copyright (C) 2015, Marvell International Ltd.
+ *
+ * This software file (the "File") is distributed by Marvell International
+ * Ltd. under the terms of the GNU General Public License Version 2, June 1991
+ * (the "License").  You may use, redistribute and/or modify this File in
+ * accordance with the terms and conditions of the License, a copy of which
+ * is available on the worldwide web at
+ * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
+ *
+ * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
+ * ARE EXPRESSLY DISCLAIMED.  The License provides additional details about
+ * this warranty disclaimer.
+ */
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/of_gpio.h>
+#include <net/nfc/nci.h>
+#include <net/nfc/nci_core.h>
+#include "nfcmrvl.h"
+
+static unsigned int hci_muxed;
+static unsigned int flow_control;
+static unsigned int break_control;
+static unsigned int reset_n_io;
+
+/*
+** NFCMRVL NCI OPS
+*/
+
+static int nfcmrvl_uart_nci_open(struct nfcmrvl_private *priv)
+{
+	return 0;
+}
+
+static int nfcmrvl_uart_nci_close(struct nfcmrvl_private *priv)
+{
+	return 0;
+}
+
+static int nfcmrvl_uart_nci_send(struct nfcmrvl_private *priv,
+				 struct sk_buff *skb)
+{
+	struct nci_uart *nu = priv->drv_data;
+
+	return nu->ops.send(nu, skb);
+}
+
+static struct nfcmrvl_if_ops uart_ops = {
+	.nci_open = nfcmrvl_uart_nci_open,
+	.nci_close = nfcmrvl_uart_nci_close,
+	.nci_send = nfcmrvl_uart_nci_send,
+};
+
+#ifdef CONFIG_OF
+
+static int nfcmrvl_uart_parse_dt(struct device_node *node,
+				 struct nfcmrvl_platform_data *pdata)
+{
+	struct device_node *matched_node;
+	int ret;
+
+	matched_node = of_find_compatible_node(node, NULL, "mrvl,nfc-uart");
+	if (!matched_node)
+		return -ENODEV;
+
+	ret = nfcmrvl_parse_dt(matched_node, pdata);
+	if (ret < 0) {
+		pr_err("Failed to get generic entries\n");
+		return ret;
+	}
+
+	if (of_find_property(matched_node, "flow-control", NULL))
+		pdata->flow_control = 1;
+	else
+		pdata->flow_control = 0;
+
+	if (of_find_property(matched_node, "break-control", NULL))
+		pdata->break_control = 1;
+	else
+		pdata->break_control = 0;
+
+	return 0;
+}
+
+#else
+
+static int nfcmrvl_uart_parse_dt(struct device_node *node,
+				 struct nfcmrvl_platform_data *pdata)
+{
+	return -ENODEV;
+}
+
+#endif
+
+/*
+** NCI UART OPS
+*/
+
+static int nfcmrvl_nci_uart_open(struct nci_uart *nu)
+{
+	struct nfcmrvl_private *priv;
+	struct nfcmrvl_platform_data *pdata = NULL;
+	struct nfcmrvl_platform_data config;
+
+	/*
+	 * Platform data cannot be used here since usually it is already used
+	 * by low level serial driver. We can try to retrieve serial device
+	 * and check if DT entries were added.
+	 */
+
+	if (nu->tty->dev->parent && nu->tty->dev->parent->of_node)
+		if (nfcmrvl_uart_parse_dt(nu->tty->dev->parent->of_node,
+					  &config) == 0)
+			pdata = &config;
+
+	if (!pdata) {
+		pr_info("No platform data / DT -> fallback to module params\n");
+		config.hci_muxed = hci_muxed;
+		config.reset_n_io = reset_n_io;
+		config.flow_control = flow_control;
+		config.break_control = break_control;
+		pdata = &config;
+	}
+
+	priv = nfcmrvl_nci_register_dev(nu, &uart_ops, nu->tty->dev, pdata);
+	if (IS_ERR(priv))
+		return PTR_ERR(priv);
+
+	priv->phy = NFCMRVL_PHY_UART;
+
+	nu->drv_data = priv;
+	nu->ndev = priv->ndev;
+
+	/* Set BREAK */
+	if (priv->config.break_control && nu->tty->ops->break_ctl)
+		nu->tty->ops->break_ctl(nu->tty, -1);
+
+	return 0;
+}
+
+static void nfcmrvl_nci_uart_close(struct nci_uart *nu)
+{
+	nfcmrvl_nci_unregister_dev((struct nfcmrvl_private *)nu->drv_data);
+}
+
+static int nfcmrvl_nci_uart_recv(struct nci_uart *nu, struct sk_buff *skb)
+{
+	return nfcmrvl_nci_recv_frame((struct nfcmrvl_private *)nu->drv_data,
+				      skb);
+}
+
+static void nfcmrvl_nci_uart_tx_start(struct nci_uart *nu)
+{
+	struct nfcmrvl_private *priv = (struct nfcmrvl_private *)nu->drv_data;
+
+	/* Remove BREAK to wake up the NFCC */
+	if (priv->config.break_control && nu->tty->ops->break_ctl) {
+		nu->tty->ops->break_ctl(nu->tty, 0);
+		usleep_range(3000, 5000);
+	}
+}
+
+static void nfcmrvl_nci_uart_tx_done(struct nci_uart *nu)
+{
+	struct nfcmrvl_private *priv = (struct nfcmrvl_private *)nu->drv_data;
+
+	/*
+	** To ensure that if the NFCC goes in DEEP SLEEP sate we can wake him
+	** up. we set BREAK. Once we will be ready to send again we will remove
+	** it.
+	*/
+	if (priv->config.break_control && nu->tty->ops->break_ctl)
+		nu->tty->ops->break_ctl(nu->tty, -1);
+}
+
+static struct nci_uart nfcmrvl_nci_uart = {
+	.owner  = THIS_MODULE,
+	.name   = "nfcmrvl_uart",
+	.driver = NCI_UART_DRIVER_MARVELL,
+	.ops	= {
+		.open		= nfcmrvl_nci_uart_open,
+		.close		= nfcmrvl_nci_uart_close,
+		.recv		= nfcmrvl_nci_uart_recv,
+		.tx_start	= nfcmrvl_nci_uart_tx_start,
+		.tx_done	= nfcmrvl_nci_uart_tx_done,
+	}
+};
+
+/*
+** Module init
+*/
+
+static int nfcmrvl_uart_init_module(void)
+{
+	return nci_uart_register(&nfcmrvl_nci_uart);
+}
+
+static void nfcmrvl_uart_exit_module(void)
+{
+	nci_uart_unregister(&nfcmrvl_nci_uart);
+}
+
+module_init(nfcmrvl_uart_init_module);
+module_exit(nfcmrvl_uart_exit_module);
+
+MODULE_AUTHOR("Marvell International Ltd.");
+MODULE_DESCRIPTION("Marvell NFC-over-UART");
+MODULE_LICENSE("GPL v2");
+
+module_param(flow_control, uint, 0);
+MODULE_PARM_DESC(flow_control, "Tell if UART needs flow control at init.");
+
+module_param(break_control, uint, 0);
+MODULE_PARM_DESC(break_control, "Tell if UART driver must drive break signal.");
+
+module_param(hci_muxed, uint, 0);
+MODULE_PARM_DESC(hci_muxed, "Tell if transport is muxed in HCI one.");
+
+module_param(reset_n_io, uint, 0);
+MODULE_PARM_DESC(reset_n_io, "GPIO that is wired to RESET_N signal.");
diff --git a/include/linux/platform_data/nfcmrvl.h b/include/linux/platform_data/nfcmrvl.h
index 106cfe5ed589..ac91707dabcb 100644
--- a/include/linux/platform_data/nfcmrvl.h
+++ b/include/linux/platform_data/nfcmrvl.h
@@ -26,6 +26,15 @@ struct nfcmrvl_platform_data {
 	unsigned int reset_n_io;
 	/* Tell if transport is muxed in HCI one */
 	unsigned int hci_muxed;
+
+	/*
+	 * UART specific
+	 */
+
+	/* Tell if UART needs flow control at init */
+	unsigned int flow_control;
+	/* Tell if firmware supports break control for power management */
+	unsigned int break_control;
 };
 
 #endif /* _NFCMRVL_PTF_H_ */
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 70c279543074..2c48810af5bb 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -12,7 +12,6 @@
  * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
  * ARE EXPRESSLY DISCLAIMED.  The License provides additional details about
  * this warranty disclaimer.
-
  */
 
 /* Inspired (hugely) by HCI LDISC implementation in Bluetooth.
-- 
cgit v1.2.3


From facc9699f0fe7d65a92cc09e175662659306066d Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 11 Jun 2015 14:47:27 +0300
Subject: net/mlx5e: Fix HW MTU settings

Previously we configured HW MTU to be netdev->mtu, actually we
need to configure netdev->mtu + (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN).

Also, query MTU can not fail, hence make the relevant helper a
void functionm, add mlx5e_set_dev_port_mtu, helper function to
handle MTU setting.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/main.c                 |  8 +--
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 61 ++++++++++++-----------
 drivers/net/ethernet/mellanox/mlx5/core/port.c    | 36 ++++++-------
 include/linux/mlx5/driver.h                       | 10 ++--
 5 files changed, 55 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d4dea86052d6..79dadd627e9c 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -446,15 +446,11 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
 	if (err)
 		goto out;
 
-	err = mlx5_query_port_max_mtu(mdev, &max_mtu, port);
-	if (err)
-		goto out;
+	mlx5_query_port_max_mtu(mdev, &max_mtu, port);
 
 	props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
 
-	err = mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
-	if (err)
-		goto out;
+	mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
 
 	props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index e9edb7210de1..71f38bb5dbfc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -57,7 +57,6 @@
 #define MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS      0x20
 #define MLX5E_PARAMS_DEFAULT_MIN_RX_WQES                0x80
 #define MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ         0x7
-#define MLX5E_PARAMS_MIN_MTU                            46
 
 #define MLX5E_TX_CQ_POLL_BUDGET        128
 #define MLX5E_UPDATE_STATS_INTERVAL    200 /* msecs */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 075e5175e779..22b2665e0328 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -277,6 +277,9 @@ static void mlx5e_send_nop(struct mlx5e_sq *sq)
 	mlx5e_tx_notify_hw(sq, wqe);
 }
 
+#define MLX5E_HW2SW_MTU(hwmtu) (hwmtu - (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN))
+#define MLX5E_SW2HW_MTU(swmtu) (swmtu + (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN))
+
 static int mlx5e_create_rq(struct mlx5e_channel *c,
 			   struct mlx5e_rq_param *param,
 			   struct mlx5e_rq *rq)
@@ -305,7 +308,7 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	}
 
 	rq->wqe_sz = (priv->params.lro_en) ? priv->params.lro_wqe_sz :
-				priv->netdev->mtu + ETH_HLEN + VLAN_HLEN;
+					     MLX5E_SW2HW_MTU(priv->netdev->mtu);
 
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
@@ -1367,11 +1370,30 @@ static void mlx5e_close_tirs(struct mlx5e_priv *priv)
 		mlx5e_close_tir(priv, i);
 }
 
-int mlx5e_open_locked(struct net_device *netdev)
+static int mlx5e_set_dev_port_mtu(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
-	int actual_mtu;
+	int hw_mtu;
+	int err;
+
+	err = mlx5_set_port_mtu(mdev, MLX5E_SW2HW_MTU(netdev->mtu), 1);
+	if (err)
+		return err;
+
+	mlx5_query_port_oper_mtu(mdev, &hw_mtu, 1);
+
+	if (MLX5E_HW2SW_MTU(hw_mtu) != netdev->mtu)
+		netdev_warn(netdev, "%s: Port MTU %d is different than netdev mtu %d\n",
+			    __func__, MLX5E_HW2SW_MTU(hw_mtu), netdev->mtu);
+
+	netdev->mtu = MLX5E_HW2SW_MTU(hw_mtu);
+	return 0;
+}
+
+int mlx5e_open_locked(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
 	int num_txqs;
 	int err;
 
@@ -1380,25 +1402,9 @@ int mlx5e_open_locked(struct net_device *netdev)
 	netif_set_real_num_tx_queues(netdev, num_txqs);
 	netif_set_real_num_rx_queues(netdev, priv->params.num_channels);
 
-	err = mlx5_set_port_mtu(mdev, netdev->mtu);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5_set_port_mtu failed %d\n",
-			   __func__, err);
+	err = mlx5e_set_dev_port_mtu(netdev);
+	if (err)
 		return err;
-	}
-
-	err = mlx5_query_port_oper_mtu(mdev, &actual_mtu, 1);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5_query_port_oper_mtu failed %d\n",
-			   __func__, err);
-		return err;
-	}
-
-	if (actual_mtu != netdev->mtu)
-		netdev_warn(netdev, "%s: Failed to set MTU to %d\n",
-			    __func__, netdev->mtu);
-
-	netdev->mtu = actual_mtu;
 
 	err = mlx5e_open_tises(priv);
 	if (err) {
@@ -1613,15 +1619,14 @@ static int mlx5e_change_mtu(struct net_device *netdev, int new_mtu)
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5_core_dev *mdev = priv->mdev;
 	int max_mtu;
-	int err = 0;
+	int err;
 
-	err = mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
-	if (err)
-		return err;
+	mlx5_query_port_max_mtu(mdev, &max_mtu, 1);
 
-	if (new_mtu > max_mtu || new_mtu < MLX5E_PARAMS_MIN_MTU) {
-		netdev_err(netdev, "%s: Bad MTU size, mtu must be [%d-%d]\n",
-			   __func__, MLX5E_PARAMS_MIN_MTU, max_mtu);
+	if (new_mtu > max_mtu) {
+		netdev_err(netdev,
+			   "%s: Bad MTU (%d) > (%d) Max\n",
+			   __func__, new_mtu, max_mtu);
 		return -EINVAL;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 619d3baf19ea..70147999f657 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -248,22 +248,18 @@ int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status)
 	return err;
 }
 
-static int mlx5_query_port_mtu(struct mlx5_core_dev *dev,
-			       int *admin_mtu, int *max_mtu, int *oper_mtu,
-			       u8 local_port)
+static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, int *admin_mtu,
+				int *max_mtu, int *oper_mtu, u8 port)
 {
 	u32 in[MLX5_ST_SZ_DW(pmtu_reg)];
 	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
-	int err;
 
 	memset(in, 0, sizeof(in));
 
-	MLX5_SET(pmtu_reg, in, local_port, local_port);
+	MLX5_SET(pmtu_reg, in, local_port, port);
 
-	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
-				   sizeof(out), MLX5_REG_PMTU, 0, 0);
-	if (err)
-		return err;
+	mlx5_core_access_reg(dev, in, sizeof(in), out,
+			     sizeof(out), MLX5_REG_PMTU, 0, 0);
 
 	if (max_mtu)
 		*max_mtu  = MLX5_GET(pmtu_reg, out, max_mtu);
@@ -271,11 +267,9 @@ static int mlx5_query_port_mtu(struct mlx5_core_dev *dev,
 		*oper_mtu = MLX5_GET(pmtu_reg, out, oper_mtu);
 	if (admin_mtu)
 		*admin_mtu = MLX5_GET(pmtu_reg, out, admin_mtu);
-
-	return 0;
 }
 
-int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu)
+int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port)
 {
 	u32 in[MLX5_ST_SZ_DW(pmtu_reg)];
 	u32 out[MLX5_ST_SZ_DW(pmtu_reg)];
@@ -283,24 +277,24 @@ int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu)
 	memset(in, 0, sizeof(in));
 
 	MLX5_SET(pmtu_reg, in, admin_mtu, mtu);
-	MLX5_SET(pmtu_reg, in, local_port, 1);
+	MLX5_SET(pmtu_reg, in, local_port, port);
 
-	return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
-				    MLX5_REG_PMTU, 0, 1);
+	return mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PMTU, 0, 1);
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_mtu);
 
-int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu,
-			    u8 local_port)
+void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu,
+			     u8 port)
 {
-	return mlx5_query_port_mtu(dev, NULL, max_mtu, NULL, local_port);
+	mlx5_query_port_mtu(dev, NULL, max_mtu, NULL, port);
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_max_mtu);
 
-int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
-			     u8 local_port)
+void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
+			      u8 port)
 {
-	return mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, local_port);
+	mlx5_query_port_mtu(dev, NULL, NULL, oper_mtu, port);
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_oper_mtu);
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6093bde16b94..c0930f8d7021 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -756,11 +756,11 @@ int mlx5_set_port_status(struct mlx5_core_dev *dev,
 			 enum mlx5_port_status status);
 int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status);
 
-int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu);
-int mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu,
-			    u8 local_port);
-int mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
-			     u8 local_port);
+int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
+void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
+void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
+			      u8 port);
+
 int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
 			      u8 *vl_hw_cap, u8 local_port);
 
-- 
cgit v1.2.3


From fc11fbf9a785b25c5d07f05a30d4169ec39818da Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 11 Jun 2015 14:47:28 +0300
Subject: net/mlx5e: Add HW cacheline start padding

Enable HW cacheline start padding and align RX WQE size to cacheline
while considering HW start padding. Also, fix dma_unmap call to use
the correct SKB data buffer size.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  5 ++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 10 +++++-----
 include/linux/mlx5/device.h                       |  4 ++++
 3 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 22b2665e0328..1c62af69ca29 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -309,12 +309,15 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 
 	rq->wqe_sz = (priv->params.lro_en) ? priv->params.lro_wqe_sz :
 					     MLX5E_SW2HW_MTU(priv->netdev->mtu);
+	rq->wqe_sz = SKB_DATA_ALIGN(rq->wqe_sz + MLX5E_NET_IP_ALIGN);
 
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
+		u32 byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
 
 		wqe->data.lkey       = c->mkey_be;
-		wqe->data.byte_count = cpu_to_be32(rq->wqe_sz);
+		wqe->data.byte_count =
+			cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
 	}
 
 	rq->pdev    = c->pdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index ce1317cdabd7..06e7c744ed4a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -45,18 +45,18 @@ static inline int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
 	if (unlikely(!skb))
 		return -ENOMEM;
 
-	skb_reserve(skb, MLX5E_NET_IP_ALIGN);
-
 	dma_addr = dma_map_single(rq->pdev,
 				  /* hw start padding */
-				  skb->data - MLX5E_NET_IP_ALIGN,
-				  /* hw   end padding */
+				  skb->data,
+				  /* hw end padding */
 				  rq->wqe_sz,
 				  DMA_FROM_DEVICE);
 
 	if (unlikely(dma_mapping_error(rq->pdev, dma_addr)))
 		goto err_free_skb;
 
+	skb_reserve(skb, MLX5E_NET_IP_ALIGN);
+
 	*((dma_addr_t *)skb->cb) = dma_addr;
 	wqe->data.addr = cpu_to_be64(dma_addr + MLX5E_NET_IP_ALIGN);
 
@@ -217,7 +217,7 @@ bool mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 
 		dma_unmap_single(rq->pdev,
 				 *((dma_addr_t *)skb->cb),
-				 skb_end_offset(skb),
+				 rq->wqe_sz,
 				 DMA_FROM_DEVICE);
 
 		if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index b2c43508a737..b943cd9e2097 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -131,6 +131,10 @@ enum {
 	MLX5_INLINE_SEG = 0x80000000,
 };
 
+enum {
+	MLX5_HW_START_PADDING = MLX5_INLINE_SEG,
+};
+
 enum {
 	MLX5_MIN_PKEY_TABLE_SIZE = 128,
 	MLX5_MAX_LOG_PKEY_TABLE  = 5,
-- 
cgit v1.2.3


From 833f32d763028c1bb371c64f457788b933773b3e Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Thu, 11 Jun 2015 15:54:55 -0700
Subject: time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap
 second edge

Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.

This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.

However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.

This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)

However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.

So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.

This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.

Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.

While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.

Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/time64.h              |  1 +
 include/linux/timekeeper_internal.h |  2 ++
 kernel/time/ntp.c                   | 42 ++++++++++++++++++++++++++++++-------
 kernel/time/ntp_internal.h          |  1 +
 kernel/time/timekeeping.c           | 23 +++++++++++++++++++-
 5 files changed, 61 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 12d4e82b0276..77b5df2acd2a 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -29,6 +29,7 @@ struct timespec64 {
 #define FSEC_PER_SEC	1000000000000000LL
 
 /* Located here for timespec[64]_valid_strict */
+#define TIME64_MAX			((s64)~((u64)1 << 63))
 #define KTIME_MAX			((s64)~((u64)1 << 63))
 #define KTIME_SEC_MAX			(KTIME_MAX / NSEC_PER_SEC)
 
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index e1f5a1136554..25247220b4b7 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -50,6 +50,7 @@ struct tk_read_base {
  * @offs_tai:		Offset clock monotonic -> clock tai
  * @tai_offset:		The current UTC to TAI offset in seconds
  * @clock_was_set_seq:	The sequence number of clock was set events
+ * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
  * @raw_time:		Monotonic raw base time in timespec64 format
  * @cycle_interval:	Number of clock cycles in one NTP interval
  * @xtime_interval:	Number of clock shifted nano seconds in one NTP
@@ -90,6 +91,7 @@ struct timekeeper {
 	ktime_t			offs_tai;
 	s32			tai_offset;
 	unsigned int		clock_was_set_seq;
+	ktime_t			next_leap_ktime;
 	struct timespec64	raw_time;
 
 	/* The following members are for timekeeping internal use */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7aa216188450..033743e3647a 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -77,6 +77,9 @@ static long			time_adjust;
 /* constant (boot-param configurable) NTP tick adjustment (upscaled)	*/
 static s64			ntp_tick_adj;
 
+/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
+static time64_t			ntp_next_leap_sec = TIME64_MAX;
+
 #ifdef CONFIG_NTP_PPS
 
 /*
@@ -350,6 +353,7 @@ void ntp_clear(void)
 	tick_length	= tick_length_base;
 	time_offset	= 0;
 
+	ntp_next_leap_sec = TIME64_MAX;
 	/* Clear PPS state variables */
 	pps_clear();
 }
@@ -360,6 +364,21 @@ u64 ntp_tick_length(void)
 	return tick_length;
 }
 
+/**
+ * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ *
+ * Provides the time of the next leapsecond against CLOCK_REALTIME in
+ * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ */
+ktime_t ntp_get_next_leap(void)
+{
+	ktime_t ret;
+
+	if ((time_state == TIME_INS) && (time_status & STA_INS))
+		return ktime_set(ntp_next_leap_sec, 0);
+	ret.tv64 = KTIME_MAX;
+	return ret;
+}
 
 /*
  * this routine handles the overflow of the microsecond field
@@ -383,15 +402,21 @@ int second_overflow(unsigned long secs)
 	 */
 	switch (time_state) {
 	case TIME_OK:
-		if (time_status & STA_INS)
+		if (time_status & STA_INS) {
 			time_state = TIME_INS;
-		else if (time_status & STA_DEL)
+			ntp_next_leap_sec = secs + SECS_PER_DAY -
+						(secs % SECS_PER_DAY);
+		} else if (time_status & STA_DEL) {
 			time_state = TIME_DEL;
+			ntp_next_leap_sec = secs + SECS_PER_DAY -
+						 ((secs+1) % SECS_PER_DAY);
+		}
 		break;
 	case TIME_INS:
-		if (!(time_status & STA_INS))
+		if (!(time_status & STA_INS)) {
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_OK;
-		else if (secs % SECS_PER_DAY == 0) {
+		} else if (secs % SECS_PER_DAY == 0) {
 			leap = -1;
 			time_state = TIME_OOP;
 			printk(KERN_NOTICE
@@ -399,19 +424,21 @@ int second_overflow(unsigned long secs)
 		}
 		break;
 	case TIME_DEL:
-		if (!(time_status & STA_DEL))
+		if (!(time_status & STA_DEL)) {
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_OK;
-		else if ((secs + 1) % SECS_PER_DAY == 0) {
+		} else if ((secs + 1) % SECS_PER_DAY == 0) {
 			leap = 1;
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_WAIT;
 			printk(KERN_NOTICE
 				"Clock: deleting leap second 23:59:59 UTC\n");
 		}
 		break;
 	case TIME_OOP:
+		ntp_next_leap_sec = TIME64_MAX;
 		time_state = TIME_WAIT;
 		break;
-
 	case TIME_WAIT:
 		if (!(time_status & (STA_INS | STA_DEL)))
 			time_state = TIME_OK;
@@ -548,6 +575,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
 	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
 		time_state = TIME_OK;
 		time_status = STA_UNSYNC;
+		ntp_next_leap_sec = TIME64_MAX;
 		/* restart PPS frequency calibration */
 		pps_reset_freq_interval();
 	}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index bbd102ad9df7..65430504ca26 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -5,6 +5,7 @@ extern void ntp_init(void);
 extern void ntp_clear(void);
 /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
 extern u64 ntp_tick_length(void);
+extern ktime_t ntp_get_next_leap(void);
 extern int second_overflow(unsigned long secs);
 extern int ntp_validate_timex(struct timex *);
 extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 849b93265904..5d67ffb7e317 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -539,6 +539,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
 
+/*
+ * tk_update_leap_state - helper to update the next_leap_ktime
+ */
+static inline void tk_update_leap_state(struct timekeeper *tk)
+{
+	tk->next_leap_ktime = ntp_get_next_leap();
+	if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+		/* Convert to monotonic time */
+		tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
+}
+
 /*
  * Update the ktime_t based scalar nsec members of the timekeeper
  */
@@ -580,6 +591,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 		ntp_clear();
 	}
 
+	tk_update_leap_state(tk);
 	tk_update_ktime_data(tk);
 
 	update_vsyscall(tk);
@@ -1956,15 +1968,22 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
 
 		base = tk->tkr_mono.base;
 		nsecs = timekeeping_get_ns(&tk->tkr_mono);
+		base = ktime_add_ns(base, nsecs);
+
 		if (*cwsseq != tk->clock_was_set_seq) {
 			*cwsseq = tk->clock_was_set_seq;
 			*offs_real = tk->offs_real;
 			*offs_boot = tk->offs_boot;
 			*offs_tai = tk->offs_tai;
 		}
+
+		/* Handle leapsecond insertion adjustments */
+		if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+			*offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
+
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
-	return ktime_add_ns(base, nsecs);
+	return base;
 }
 
 /**
@@ -2006,6 +2025,8 @@ int do_adjtimex(struct timex *txc)
 		__timekeeping_set_tai_offset(tk, tai);
 		timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 	}
+	tk_update_leap_state(tk);
+
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
-- 
cgit v1.2.3


From 3bf17472226b0041b0c61363bd57a5cadbe620c4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Jun 2015 13:20:29 +0800
Subject: iommu: dmar: Extend struct irte for VT-d Posted-Interrupts

The IRTE (Interrupt Remapping Table Entry) is either an entry for
remapped or for posted interrupts. The hardware distiguishes between
remapped and posted entries by bit 15 in the low 64 bit of the
IRTE. If cleared the entry is remapped, if set it's posted.

The entries have common fields and dependent on the posted bit fields
with different meanings.

Extend struct irte to handle the differences between remap and posted
mode by having three structs in the unions:

        - Shared
        - Remapped
        - Posted

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Feng Wu <feng.wu@intel.com>
Acked-by: Joerg Roedel <joro@8bytes.org>
Cc: jiang.liu@linux.intel.com
Cc: iommu@lists.linux-foundation.org
Cc: dwmw2@infradead.org
Link: http://lkml.kernel.org/r/1433827237-3382-3-git-send-email-feng.wu@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/dmar.h | 70 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 55 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 84737565c1fd..0dbcabcb5f0d 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -185,33 +185,73 @@ static inline int dmar_device_remove(void *handle)
 
 struct irte {
 	union {
+		/* Shared between remapped and posted mode*/
 		struct {
-			__u64	present 	: 1,
-				fpd		: 1,
-				dst_mode	: 1,
-				redir_hint	: 1,
-				trigger_mode	: 1,
-				dlvry_mode	: 3,
-				avail		: 4,
-				__reserved_1	: 4,
-				vector		: 8,
-				__reserved_2	: 8,
-				dest_id		: 32;
+			__u64	present		: 1,  /*  0      */
+				fpd		: 1,  /*  1      */
+				__res0		: 6,  /*  2 -  6 */
+				avail		: 4,  /*  8 - 11 */
+				__res1		: 3,  /* 12 - 14 */
+				pst		: 1,  /* 15      */
+				vector		: 8,  /* 16 - 23 */
+				__res2		: 40; /* 24 - 63 */
+		};
+
+		/* Remapped mode */
+		struct {
+			__u64	r_present	: 1,  /*  0      */
+				r_fpd		: 1,  /*  1      */
+				dst_mode	: 1,  /*  2      */
+				redir_hint	: 1,  /*  3      */
+				trigger_mode	: 1,  /*  4      */
+				dlvry_mode	: 3,  /*  5 -  7 */
+				r_avail		: 4,  /*  8 - 11 */
+				r_res0		: 4,  /* 12 - 15 */
+				r_vector	: 8,  /* 16 - 23 */
+				r_res1		: 8,  /* 24 - 31 */
+				dest_id		: 32; /* 32 - 63 */
+		};
+
+		/* Posted mode */
+		struct {
+			__u64	p_present	: 1,  /*  0      */
+				p_fpd		: 1,  /*  1      */
+				p_res0		: 6,  /*  2 -  7 */
+				p_avail		: 4,  /*  8 - 11 */
+				p_res1		: 2,  /* 12 - 13 */
+				p_urgent	: 1,  /* 14      */
+				p_pst		: 1,  /* 15      */
+				p_vector	: 8,  /* 16 - 23 */
+				p_res2		: 14, /* 24 - 37 */
+				pda_l		: 26; /* 38 - 63 */
 		};
 		__u64 low;
 	};
 
 	union {
+		/* Shared between remapped and posted mode*/
 		struct {
-			__u64	sid		: 16,
-				sq		: 2,
-				svt		: 2,
-				__reserved_3	: 44;
+			__u64	sid		: 16,  /* 64 - 79  */
+				sq		: 2,   /* 80 - 81  */
+				svt		: 2,   /* 82 - 83  */
+				__res3		: 44;  /* 84 - 127 */
+		};
+
+		/* Posted mode*/
+		struct {
+			__u64	p_sid		: 16,  /* 64 - 79  */
+				p_sq		: 2,   /* 80 - 81  */
+				p_svt		: 2,   /* 82 - 83  */
+				p_res3		: 12,  /* 84 - 95  */
+				pda_h		: 32;  /* 96 - 127 */
 		};
 		__u64 high;
 	};
 };
 
+#define PDA_LOW_BIT    26
+#define PDA_HIGH_BIT   32
+
 enum {
 	IRQ_REMAP_XAPIC_MODE,
 	IRQ_REMAP_X2APIC_MODE,
-- 
cgit v1.2.3


From bf56027ff4d9e75bf668ae990fe6204d00a23002 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Jun 2015 13:20:30 +0800
Subject: iommu: dmar: Provide helper to copy shared irte fields

Instead of open coding, provide a helper function to copy the shared
irte fields.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: jiang.liu@linux.intel.com
Cc: iommu@lists.linux-foundation.org
Cc: joro@8bytes.org
Cc: dwmw2@infradead.org
Link: http://lkml.kernel.org/r/1433827237-3382-4-git-send-email-feng.wu@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/dmar.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 0dbcabcb5f0d..e9bc9292bd3a 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -249,6 +249,18 @@ struct irte {
 	};
 };
 
+static inline void dmar_copy_shared_irte(struct irte *dst, struct irte *src)
+{
+	dst->present	= src->present;
+	dst->fpd	= src->fpd;
+	dst->avail	= src->avail;
+	dst->pst	= src->pst;
+	dst->vector	= src->vector;
+	dst->sid	= src->sid;
+	dst->sq		= src->sq;
+	dst->svt	= src->svt;
+}
+
 #define PDA_LOW_BIT    26
 #define PDA_HIGH_BIT   32
 
-- 
cgit v1.2.3


From 07c09787b26db724c94a912a572a9a4fa66008f3 Mon Sep 17 00:00:00 2001
From: Feng Wu <feng.wu@intel.com>
Date: Tue, 9 Jun 2015 13:20:34 +0800
Subject: iommu, x86: Add cap_pi_support() to detect VT-d PI capability

Add helper function to detect VT-d Posted-Interrupts capability.

Signed-off-by: Feng Wu <feng.wu@intel.com>
Reviewed-by: Jiang Liu <jiang.liu@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Acked-by: Joerg Roedel <joro@8bytes.org>
Cc: iommu@lists.linux-foundation.org
Cc: dwmw2@infradead.org
Link: http://lkml.kernel.org/r/1433827237-3382-8-git-send-email-feng.wu@intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/intel-iommu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 0af9b03e2b1c..0c251be39836 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -87,6 +87,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val)
 /*
  * Decoding Capability Register
  */
+#define cap_pi_support(c)	(((c) >> 59) & 1)
 #define cap_read_drain(c)	(((c) >> 55) & 1)
 #define cap_write_drain(c)	(((c) >> 54) & 1)
 #define cap_max_amask_val(c)	(((c) >> 48) & 0x3f)
-- 
cgit v1.2.3


From b6a00fae9760a49114016e4764d09e522a5ba5b6 Mon Sep 17 00:00:00 2001
From: Tim Kryger <tim.kryger@gmail.com>
Date: Tue, 26 May 2015 13:08:16 -0700
Subject: pwm: Add pwmchip_add_with_polarity() API

Add a new function to register a PWM chip with channels that have their
initial polarity as specified by an additional parameter. This benefits
drivers of controllers that by default operate with inversed polarity
by removing the need to modify the polarity during initialization.

Signed-off-by: Tim Kryger <tim.kryger@gmail.com>
Signed-off-by: Jonathan Richardson <jonathar@broadcom.com>
[thierry.reding@gmail.com: export pwmchip_add_with_polarity()]
Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
---
 drivers/pwm/core.c  | 23 ++++++++++++++++++++---
 include/linux/pwm.h |  7 +++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 27cd58d16881..3a7769fe53de 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -223,13 +223,16 @@ void *pwm_get_chip_data(struct pwm_device *pwm)
 EXPORT_SYMBOL_GPL(pwm_get_chip_data);
 
 /**
- * pwmchip_add() - register a new PWM chip
+ * pwmchip_add_with_polarity() - register a new PWM chip
  * @chip: the PWM chip to add
+ * @polarity: initial polarity of PWM channels
  *
  * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
- * will be used.
+ * will be used. The initial polarity for all channels is specified by the
+ * @polarity parameter.
  */
-int pwmchip_add(struct pwm_chip *chip)
+int pwmchip_add_with_polarity(struct pwm_chip *chip,
+			      enum pwm_polarity polarity)
 {
 	struct pwm_device *pwm;
 	unsigned int i;
@@ -259,6 +262,7 @@ int pwmchip_add(struct pwm_chip *chip)
 		pwm->chip = chip;
 		pwm->pwm = chip->base + i;
 		pwm->hwpwm = i;
+		pwm->polarity = polarity;
 
 		radix_tree_insert(&pwm_tree, pwm->pwm, pwm);
 	}
@@ -279,6 +283,19 @@ out:
 	mutex_unlock(&pwm_lock);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(pwmchip_add_with_polarity);
+
+/**
+ * pwmchip_add() - register a new PWM chip
+ * @chip: the PWM chip to add
+ *
+ * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base
+ * will be used. The initial polarity for all channels is normal.
+ */
+int pwmchip_add(struct pwm_chip *chip)
+{
+	return pwmchip_add_with_polarity(chip, PWM_POLARITY_NORMAL);
+}
 EXPORT_SYMBOL_GPL(pwmchip_add);
 
 /**
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index cfe2d8df5be0..36262d08a9da 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -182,6 +182,8 @@ struct pwm_chip {
 int pwm_set_chip_data(struct pwm_device *pwm, void *data);
 void *pwm_get_chip_data(struct pwm_device *pwm);
 
+int pwmchip_add_with_polarity(struct pwm_chip *chip,
+			      enum pwm_polarity polarity);
 int pwmchip_add(struct pwm_chip *chip);
 int pwmchip_remove(struct pwm_chip *chip);
 struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip,
@@ -217,6 +219,11 @@ static inline int pwmchip_add(struct pwm_chip *chip)
 	return -EINVAL;
 }
 
+static inline int pwmchip_add_inversed(struct pwm_chip *chip)
+{
+	return -EINVAL;
+}
+
 static inline int pwmchip_remove(struct pwm_chip *chip)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 22a10bca280073f81e9e2d9fed6f90a3bcf00236 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 11 Jun 2015 17:37:03 -0700
Subject: regulator: Add system_load constraint

Some regulators have a fixed load that isn't captured by
consumers that the kernel knows about. Add a constraint to
support this.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/regulator/regulator.txt | 2 ++
 drivers/regulator/core.c                                  | 2 ++
 drivers/regulator/of_regulator.c                          | 3 +++
 include/linux/regulator/machine.h                         | 3 +++
 4 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt
index abb26b58c83e..553d2d0fe6d9 100644
--- a/Documentation/devicetree/bindings/regulator/regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/regulator.txt
@@ -37,6 +37,8 @@ Optional properties:
 - regulator-initial-mode: initial operating mode. The set of possible operating
   modes depends on the capabilities of every hardware so each device binding
   documentation explains which values the regulator supports.
+- regulator-system-load: Load in uA present on regulator that is not captured by
+  any consumer request.
 
 Deprecated properties:
 - regulator-compatible: If a regulator chip contains multiple
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 443eaab933fc..c8d5e2b05fdf 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -678,6 +678,8 @@ static int drms_uA_update(struct regulator_dev *rdev)
 	list_for_each_entry(sibling, &rdev->consumer_list, list)
 		current_uA += sibling->uA_load;
 
+	current_uA += rdev->constraints->system_load;
+
 	if (rdev->desc->ops->set_load) {
 		/* set the optimum mode for our new total regulator load */
 		err = rdev->desc->ops->set_load(rdev, current_uA);
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 24e812c48d93..482a86f90839 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -95,6 +95,9 @@ static void of_get_regulation_constraints(struct device_node *np,
 		}
 	}
 
+	if (!of_property_read_u32(np, "regulator-system-load", &pval))
+		constraints->system_load = pval;
+
 	for (i = 0; i < ARRAY_SIZE(regulator_states); i++) {
 		switch (i) {
 		case PM_SUSPEND_MEM:
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index b07562e082c4..01526559c8c3 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -75,6 +75,7 @@ struct regulator_state {
  *
  * @min_uA: Smallest current consumers may set.
  * @max_uA: Largest current consumers may set.
+ * @system_load: Load that isn't captured by any consumer requests.
  *
  * @valid_modes_mask: Mask of modes which may be configured by consumers.
  * @valid_ops_mask: Operations which may be performed by consumers.
@@ -112,6 +113,8 @@ struct regulation_constraints {
 	int min_uA;
 	int max_uA;
 
+	int system_load;
+
 	/* valid regulator operating modes for this machine */
 	unsigned int valid_modes_mask;
 
-- 
cgit v1.2.3


From 72b31f7271df34c6aab36c01305287924826678f Mon Sep 17 00:00:00 2001
From: Bernhard Thaler <bernhard.thaler@wvnet.at>
Date: Sat, 30 May 2015 15:27:40 +0200
Subject: netfilter: bridge: detect NAT66 correctly and change MAC address

IPv4 iptables allows to REDIRECT/DNAT/SNAT any traffic over a bridge.

e.g. REDIRECT
$ sysctl -w net.bridge.bridge-nf-call-iptables=1
$ iptables -t nat -A PREROUTING -p tcp -m tcp --dport 8080 \
  -j REDIRECT --to-ports 81

This does not work with ip6tables on a bridge in NAT66 scenario
because the REDIRECT/DNAT/SNAT is not correctly detected.

The bridge pre-routing (finish) netfilter hook has to check for a possible
redirect and then fix the destination mac address. This allows to use the
ip6tables rules for local REDIRECT/DNAT/SNAT REDIRECT similar to the IPv4
iptables version.

e.g. REDIRECT
$ sysctl -w net.bridge.bridge-nf-call-ip6tables=1
$ ip6tables -t nat -A PREROUTING -p tcp -m tcp --dport 8080 \
  -j REDIRECT --to-ports 81

This patch makes it possible to use IPv6 NAT66 on a bridge. It was tested
on a bridge with two interfaces using SNAT/DNAT NAT66 rules.

Reported-by: Artie Hamilton <artiemhamilton@yahoo.com>
Signed-off-by: Sven Eckelmann <sven@open-mesh.com>
[bernhard.thaler@wvnet.at: rebased, add indirect call to ip6_route_input()]
[bernhard.thaler@wvnet.at: rebased, split into separate patches]
Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h |  1 +
 include/linux/skbuff.h         |  6 ++++-
 net/bridge/br_netfilter.c      | 55 +++++++++++++++++++++++++++++++++++-------
 net/ipv6/netfilter.c           |  1 +
 4 files changed, 53 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 64dad1cc1a4b..e2d19694ee8f 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -25,6 +25,7 @@ void ipv6_netfilter_fini(void);
 struct nf_ipv6_ops {
 	int (*chk_addr)(struct net *net, const struct in6_addr *addr,
 			const struct net_device *dev, int strict);
+	void (*route_input)(struct sk_buff *skb);
 };
 
 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cc612fc0a894..f70fc0e6bf7b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -36,6 +36,7 @@
 #include <linux/sched.h>
 #include <net/flow_dissector.h>
 #include <linux/splice.h>
+#include <linux/in6.h>
 
 /* A. Checksumming of received packets by device.
  *
@@ -179,7 +180,10 @@ struct nf_bridge_info {
 		struct net_device *physoutdev;
 		char neigh_header[8];
 	};
-	__be32			ipv4_daddr;
+	union {
+		__be32          ipv4_daddr;
+		struct in6_addr ipv6_daddr;
+	};
 };
 #endif
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 6cb642c43451..9ac0c6417c77 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -326,30 +326,63 @@ free_skb:
 static bool daddr_was_changed(const struct sk_buff *skb,
 			      const struct nf_bridge_info *nf_bridge)
 {
-	return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
+	case htons(ETH_P_IPV6):
+		return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr,
+			      sizeof(ipv6_hdr(skb)->daddr)) != 0;
+	default:
+		return false;
+	}
 }
 
-/* PF_BRIDGE/PRE_ROUTING *********************************************/
-/* Undo the changes made for ip6tables PREROUTING and continue the
- * bridge PRE_ROUTING hook.
+/* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables
+ * PREROUTING and continue the bridge PRE_ROUTING hook. See comment
+ * for br_nf_pre_routing_finish(), same logic is used here but
+ * equivalent IPv6 function ip6_route_input() called indirectly.
  */
 static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 	struct rtable *rt;
+	struct net_device *dev = skb->dev;
+	const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
 
 	if (nf_bridge->pkt_otherhost) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->pkt_otherhost = false;
 	}
 	nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
+	if (daddr_was_changed(skb, nf_bridge)) {
+		skb_dst_drop(skb);
+		v6ops->route_input(skb);
 
-	rt = bridge_parent_rtable(nf_bridge->physindev);
-	if (!rt) {
-		kfree_skb(skb);
-		return 0;
+		if (skb_dst(skb)->error) {
+			kfree_skb(skb);
+			return 0;
+		}
+
+		if (skb_dst(skb)->dev == dev) {
+			skb->dev = nf_bridge->physindev;
+			nf_bridge_update_protocol(skb);
+			nf_bridge_push_encap_header(skb);
+			NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
+				       sk, skb, skb->dev, NULL,
+				       br_nf_pre_routing_finish_bridge,
+				       1);
+			return 0;
+		}
+		ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
+		skb->pkt_type = PACKET_HOST;
+	} else {
+		rt = bridge_parent_rtable(nf_bridge->physindev);
+		if (!rt) {
+			kfree_skb(skb);
+			return 0;
+		}
+		skb_dst_set_noref(skb, &rt->dst);
 	}
-	skb_dst_set_noref(skb, &rt->dst);
 
 	skb->dev = nf_bridge->physindev;
 	nf_bridge_update_protocol(skb);
@@ -579,6 +612,7 @@ static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct nf_hook_state *state)
 {
+	struct nf_bridge_info *nf_bridge;
 	const struct ipv6hdr *hdr;
 	u32 pkt_len;
 
@@ -610,6 +644,9 @@ static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 	if (!setup_pre_routing(skb))
 		return NF_DROP;
 
+	nf_bridge = nf_bridge_info_get(skb);
+	nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr;
+
 	skb->protocol = htons(ETH_P_IPV6);
 	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb,
 		skb->dev, NULL,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index d958718b5031..bbca09fe9553 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -191,6 +191,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr	= ipv6_chk_addr,
+	.route_input    = ip6_route_input
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
-- 
cgit v1.2.3


From 411ffb4fde80705a9a8db4c2d38dbeef6f5bd689 Mon Sep 17 00:00:00 2001
From: Bernhard Thaler <bernhard.thaler@wvnet.at>
Date: Sat, 30 May 2015 15:28:28 +0200
Subject: netfilter: bridge: refactor frag_max_size

Currently frag_max_size is member of br_input_skb_cb and copied back and
forth using IPCB(skb) and BR_INPUT_SKB_CB(skb) each time it is changed or
used.

Attach frag_max_size to nf_bridge_info and set value in pre_routing and
forward functions. Use its value in forward and xmit functions.

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h    |  1 +
 net/bridge/br_netfilter.c | 20 +++++++-------------
 net/bridge/br_private.h   |  1 -
 3 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f70fc0e6bf7b..32b105e682b3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -174,6 +174,7 @@ struct nf_bridge_info {
 		BRNF_PROTO_PPPOE
 	} orig_proto:8;
 	bool			pkt_otherhost;
+	__u16			frag_max_size;
 	unsigned int		mask;
 	struct net_device	*physindev;
 	union {
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 9ac0c6417c77..1f30b28745de 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -440,10 +440,8 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 	struct rtable *rt;
 	int err;
-	int frag_max_size;
 
-	frag_max_size = IPCB(skb)->frag_max_size;
-	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
+	nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
 
 	if (nf_bridge->pkt_otherhost) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -738,11 +736,9 @@ static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
 	struct net_device *in;
 
 	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
-		int frag_max_size;
 
 		if (skb->protocol == htons(ETH_P_IP)) {
-			frag_max_size = IPCB(skb)->frag_max_size;
-			BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
+			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
 		}
 
 		in = nf_bridge->physindev;
@@ -806,12 +802,9 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
 	}
 
 	if (pf == NFPROTO_IPV4) {
-		int frag_max = BR_INPUT_SKB_CB(skb)->frag_max_size;
-
 		if (br_parse_ip_options(skb))
 			return NF_DROP;
-
-		IPCB(skb)->frag_max_size = frag_max;
+		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
 	}
 
 	nf_bridge->physoutdev = skb->dev;
@@ -904,7 +897,7 @@ static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb,
 static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
 {
 	int ret;
-	int frag_max_size;
+	struct nf_bridge_info *nf_bridge;
 	unsigned int mtu_reserved;
 
 	if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) {
@@ -913,17 +906,18 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
 	}
 
 	mtu_reserved = nf_bridge_mtu_reduction(skb);
+	nf_bridge = nf_bridge_info_get(skb);
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
 	if (skb->len + mtu_reserved > skb->dev->mtu) {
 		struct brnf_frag_data *data;
 
-		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
 		if (br_parse_ip_options(skb))
 			/* Drop invalid packet */
 			return NF_DROP;
-		IPCB(skb)->frag_max_size = frag_max_size;
+
+		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
 
 		nf_bridge_update_protocol(skb);
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1f36fa70639b..8cde96e68fd5 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -304,7 +304,6 @@ struct br_input_skb_cb {
 	int mrouters_only;
 #endif
 
-	u16 frag_max_size;
 	bool proxyarp_replied;
 
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
-- 
cgit v1.2.3


From 23c779b9f9161d6568d3b2fca06e70ad182c480c Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 11 Jun 2015 17:37:04 -0700
Subject: regulator: Add pull down support

Some regulators need to be configured to pull down a resistor
when the regulator is disabled. Add an op (set_pull_down) and a
DT property + constraint to support this.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/regulator/regulator.txt | 1 +
 drivers/regulator/core.c                                  | 8 ++++++++
 drivers/regulator/of_regulator.c                          | 2 ++
 include/linux/regulator/driver.h                          | 5 +++++
 include/linux/regulator/machine.h                         | 2 ++
 5 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt
index 553d2d0fe6d9..6c79fd70ab5a 100644
--- a/Documentation/devicetree/bindings/regulator/regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/regulator.txt
@@ -39,6 +39,7 @@ Optional properties:
   documentation explains which values the regulator supports.
 - regulator-system-load: Load in uA present on regulator that is not captured by
   any consumer request.
+- regulator-pull-down: Enable pull down resistor when the regulator is disabled.
 
 Deprecated properties:
 - regulator-compatible: If a regulator chip contains multiple
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index c8d5e2b05fdf..60fcfba52592 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1051,6 +1051,14 @@ static int set_machine_constraints(struct regulator_dev *rdev,
 		}
 	}
 
+	if (rdev->constraints->pull_down && ops->set_pull_down) {
+		ret = ops->set_pull_down(rdev);
+		if (ret < 0) {
+			rdev_err(rdev, "failed to set pull down\n");
+			goto out;
+		}
+	}
+
 	print_constraints(rdev);
 	return 0;
 out:
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 482a86f90839..c3433db0acda 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -67,6 +67,8 @@ static void of_get_regulation_constraints(struct device_node *np,
 	if (!constraints->always_on) /* status change should be possible. */
 		constraints->valid_ops_mask |= REGULATOR_CHANGE_STATUS;
 
+	constraints->pull_down = of_property_read_bool(np, "regulator-pull-down");
+
 	if (of_property_read_bool(np, "regulator-allow-bypass"))
 		constraints->valid_ops_mask |= REGULATOR_CHANGE_BYPASS;
 
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index fffa688ac3a7..76144a337ff7 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -121,6 +121,9 @@ struct regulator_linear_range {
  * @set_suspend_mode: Set the operating mode for the regulator when the
  *                    system is suspended.
  *
+ * @set_pull_down: Configure the regulator to pull down when the regulator
+ *		   is disabled.
+ *
  * This struct describes regulator operations which can be implemented by
  * regulator chip drivers.
  */
@@ -187,6 +190,8 @@ struct regulator_ops {
 
 	/* set regulator suspend operating mode (defined in consumer.h) */
 	int (*set_suspend_mode) (struct regulator_dev *, unsigned int mode);
+
+	int (*set_pull_down) (struct regulator_dev *);
 };
 
 /*
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 01526559c8c3..8ffb0619a03c 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -87,6 +87,7 @@ struct regulator_state {
  *           applied.
  * @apply_uV: Apply the voltage constraint when initialising.
  * @ramp_disable: Disable ramp delay when initialising or when setting voltage.
+ * @pull_down: Enable pull down when regulator is disabled.
  *
  * @input_uV: Input voltage for regulator when supplied by another regulator.
  *
@@ -141,6 +142,7 @@ struct regulation_constraints {
 	unsigned boot_on:1;	/* bootloader/firmware enabled regulator */
 	unsigned apply_uV:1;	/* apply uV constraint if min == max */
 	unsigned ramp_disable:1; /* disable ramp delay */
+	unsigned pull_down:1;	/* pull down resistor when regulator off */
 };
 
 /**
-- 
cgit v1.2.3


From efb6de9b4ba0092b2c55f6a52d16294a8a698edd Mon Sep 17 00:00:00 2001
From: Bernhard Thaler <bernhard.thaler@wvnet.at>
Date: Sat, 30 May 2015 15:30:16 +0200
Subject: netfilter: bridge: forward IPv6 fragmented packets

IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

        modprobe br_netfilter
        brctl addbr br0
        brctl addif br0 eth0
        brctl addif br0 eth2
        ifconfig eth0 up
        ifconfig eth2 up
        ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

        set IPv6 address on host A:
        ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

        set IPv6 address on host B:
        ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

        ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until somebody runs.
"ip6tables -t nat -nvL". As soon as it is run (and netfilter modules are
loaded) IPv6 fragmented packets do not traverse the bridge any more (you
see no more responses in ping's output).

After applying this patch IPv6 fragmented packets traverse the bridge
cleanly in above scenario.

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
[pablo@netfilter.org: small changes to br_nf_dev_queue_xmit]
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h |   2 +
 net/bridge/br_netfilter.c      | 139 +++++++++++++++++++++++++++++------------
 net/bridge/br_private.h        |   6 +-
 net/ipv6/netfilter.c           |   3 +-
 4 files changed, 108 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index e2d19694ee8f..8b7d28f3aada 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -26,6 +26,8 @@ struct nf_ipv6_ops {
 	int (*chk_addr)(struct net *net, const struct in6_addr *addr,
 			const struct net_device *dev, int strict);
 	void (*route_input)(struct sk_buff *skb);
+	int (*fragment)(struct sock *sk, struct sk_buff *skb,
+			int (*output)(struct sock *, struct sk_buff *));
 };
 
 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index d201ea4440c9..535f9dae743e 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -34,6 +34,7 @@
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -320,6 +321,55 @@ bad:
 	return -1;
 }
 
+/* Equivalent to br_validate_ipv4 for IPv6 */
+static int br_validate_ipv6(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 pkt_len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	if (skb->len < ip6h_len)
+		goto drop;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto inhdr_error;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + ip6h_len > skb->len) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		goto drop;
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 static void nf_bridge_update_protocol(struct sk_buff *skb)
 {
 	switch (skb->nf_bridge->orig_proto) {
@@ -405,6 +455,8 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
 
+	nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
+
 	if (nf_bridge->pkt_otherhost) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->pkt_otherhost = false;
@@ -606,35 +658,15 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
 }
 
 /* Replicate the checks that IPv6 does on packet reception and pass the packet
- * to ip6tables, which doesn't support NAT, so things are fairly simple. */
+ * to ip6tables.
+ */
 static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct nf_hook_state *state)
 {
 	struct nf_bridge_info *nf_bridge;
-	const struct ipv6hdr *hdr;
-	u32 pkt_len;
-
-	if (skb->len < sizeof(struct ipv6hdr))
-		return NF_DROP;
-
-	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-		return NF_DROP;
-
-	hdr = ipv6_hdr(skb);
-
-	if (hdr->version != 6)
-		return NF_DROP;
 
-	pkt_len = ntohs(hdr->payload_len);
-
-	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
-		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
-			return NF_DROP;
-		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
-			return NF_DROP;
-	}
-	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+	if (br_validate_ipv6(skb))
 		return NF_DROP;
 
 	nf_bridge_put(skb->nf_bridge);
@@ -738,9 +770,11 @@ static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
 
 	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
 
-		if (skb->protocol == htons(ETH_P_IP)) {
+		if (skb->protocol == htons(ETH_P_IP))
 			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
-		}
+
+		if (skb->protocol == htons(ETH_P_IPV6))
+			nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
 
 		in = nf_bridge->physindev;
 		if (nf_bridge->pkt_otherhost) {
@@ -808,6 +842,12 @@ static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
 		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
 	}
 
+	if (pf == NFPROTO_IPV6) {
+		if (br_validate_ipv6(skb))
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+	}
+
 	nf_bridge->physoutdev = skb->dev;
 	if (pf == NFPROTO_IPV4)
 		skb->protocol = htons(ETH_P_IP);
@@ -855,7 +895,7 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb)
 {
 	struct brnf_frag_data *data;
@@ -875,6 +915,7 @@ static int br_nf_push_frag_xmit(struct sock *sk, struct sk_buff *skb)
 	nf_bridge_info_free(skb);
 	return br_dev_queue_push_xmit(sk, skb);
 }
+#endif
 
 static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb,
 			     int (*output)(struct sock *, struct sk_buff *))
@@ -897,21 +938,23 @@ static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb,
 
 static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
 {
-	int ret;
 	struct nf_bridge_info *nf_bridge;
 	unsigned int mtu_reserved;
 
-	if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP)) {
+	mtu_reserved = nf_bridge_mtu_reduction(skb);
+
+	if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu) {
 		nf_bridge_info_free(skb);
 		return br_dev_queue_push_xmit(sk, skb);
 	}
 
-	mtu_reserved = nf_bridge_mtu_reduction(skb);
 	nf_bridge = nf_bridge_info_get(skb);
+
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
-	if (skb->len + mtu_reserved > skb->dev->mtu) {
+	if (skb->protocol == htons(ETH_P_IP)) {
 		struct brnf_frag_data *data;
 
 		if (br_validate_ipv4(skb))
@@ -928,21 +971,37 @@ static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
 		skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
 						 data->size);
 
-		ret = br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit);
-	} else {
-		nf_bridge_info_free(skb);
-		ret = br_dev_queue_push_xmit(sk, skb);
+		return br_nf_ip_fragment(sk, skb, br_nf_push_frag_xmit);
 	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+		struct brnf_frag_data *data;
 
-	return ret;
-}
-#else
-static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
-{
+		if (br_validate_ipv6(skb))
+			return NF_DROP;
+
+		IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+
+		nf_bridge_update_protocol(skb);
+
+		data = this_cpu_ptr(&brnf_frag_data_storage);
+		data->encap_size = nf_bridge_encap_header_len(skb);
+		data->size = ETH_HLEN + data->encap_size;
+
+		skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
+						 data->size);
+
+		if (v6ops)
+			return v6ops->fragment(sk, skb, br_nf_push_frag_xmit);
+		else
+			return -EMSGSIZE;
+	}
+#endif
 	nf_bridge_info_free(skb);
 	return br_dev_queue_push_xmit(sk, skb);
 }
-#endif
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8cde96e68fd5..5dccced71269 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -18,6 +18,7 @@
 #include <linux/netpoll.h>
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 #include <linux/if_vlan.h>
 
 #define BR_HASH_BITS 8
@@ -214,7 +215,10 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	struct rtable 			fake_rtable;
+	union {
+		struct rtable		fake_rtable;
+		struct rt6_info		fake_rt6_info;
+	};
 	bool				nf_call_iptables;
 	bool				nf_call_ip6tables;
 	bool				nf_call_arptables;
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index bbca09fe9553..b4de08a83e0b 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -191,7 +191,8 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr	= ipv6_chk_addr,
-	.route_input    = ip6_route_input
+	.route_input    = ip6_route_input,
+	.fragment	= ip6_fragment
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
-- 
cgit v1.2.3


From 33b1f31392861947fa2a2a57c3a39ab63b8c9f9d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 5 Jun 2015 13:28:38 +0200
Subject: net: ip_fragment: remove BRIDGE_NETFILTER mtu special handling

since commit d6b915e29f4adea9
("ip_fragment: don't forward defragmented DF packet") the largest
fragment size is available in the IPCB.

Therefore we no longer need to care about 'encapsulation'
overhead of stripped PPPOE/VLAN headers since ip_do_fragment
doesn't use device mtu in such cases.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h | 7 -------
 net/bridge/br_netfilter.c        | 7 +++++++
 net/ipv4/ip_output.c             | 4 ----
 3 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index f2fdb5a52070..6d80fc686323 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -20,13 +20,6 @@ enum nf_br_hook_priorities {
 #define BRNF_BRIDGED_DNAT		0x02
 #define BRNF_NF_BRIDGE_PREROUTING	0x08
 
-static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
-{
-	if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
-		return PPPOE_SES_HLEN;
-	return 0;
-}
-
 int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb);
 
 static inline void br_drop_fake_rtable(struct sk_buff *skb)
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 535f9dae743e..1e62ae5d8f4e 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -936,6 +936,13 @@ static int br_nf_ip_fragment(struct sock *sk, struct sk_buff *skb,
 	return ip_do_fragment(sk, skb, output);
 }
 
+static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
+{
+	if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
+		return PPPOE_SES_HLEN;
+	return 0;
+}
+
 static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f5f5ef1cebd5..19d7e43b5370 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -549,10 +549,6 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
 
 	hlen = iph->ihl * 4;
 	mtu = mtu - hlen;	/* Size of data space */
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	if (skb->nf_bridge)
-		mtu -= nf_bridge_mtu_reduction(skb);
-#endif
 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 
 	/* When frag_list is given, use it. First, check its validity:
-- 
cgit v1.2.3


From 57f66b78860968fc7eddc9ce25f8e57f7e5000bd Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 11 Jun 2015 17:37:05 -0700
Subject: regulator: Add soft start support

Some regulators support a "soft start" feature where the voltage
ramps up slowly when the regulator is enabled. Add an op
(set_soft_start) and a DT property + constraint to support this.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/regulator/regulator.txt | 1 +
 drivers/regulator/core.c                                  | 8 ++++++++
 drivers/regulator/of_regulator.c                          | 3 +++
 include/linux/regulator/driver.h                          | 2 ++
 include/linux/regulator/machine.h                         | 1 +
 5 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt
index 6c79fd70ab5a..4b1df61ccbd7 100644
--- a/Documentation/devicetree/bindings/regulator/regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/regulator.txt
@@ -19,6 +19,7 @@ Optional properties:
   design requires. This property describes the total system ramp time
   required due to the combination of internal ramping of the regulator itself,
   and board design issues such as trace capacitance and load on the supply.
+- regulator-soft-start: Enable soft start so that voltage ramps slowly
 - regulator-state-mem sub-root node for Suspend-to-RAM mode
   : suspend to memory, the device goes to sleep, but all data stored in memory,
   only some external interrupt can wake the device.
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 60fcfba52592..6dfb2d6c19ae 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1059,6 +1059,14 @@ static int set_machine_constraints(struct regulator_dev *rdev,
 		}
 	}
 
+	if (rdev->constraints->soft_start && ops->set_soft_start) {
+		ret = ops->set_soft_start(rdev);
+		if (ret < 0) {
+			rdev_err(rdev, "failed to set soft start\n");
+			goto out;
+		}
+	}
+
 	print_constraints(rdev);
 	return 0;
 out:
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index c3433db0acda..207da037cd2d 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -84,6 +84,9 @@ static void of_get_regulation_constraints(struct device_node *np,
 	if (!ret)
 		constraints->enable_time = pval;
 
+	constraints->soft_start = of_property_read_bool(np,
+					"regulator-soft-start");
+
 	if (!of_property_read_u32(np, "regulator-initial-mode", &pval)) {
 		if (desc && desc->of_map_mode) {
 			ret = desc->of_map_mode(pval);
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 76144a337ff7..e0635d0894aa 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -161,6 +161,8 @@ struct regulator_ops {
 				     unsigned int old_selector,
 				     unsigned int new_selector);
 
+	int (*set_soft_start) (struct regulator_dev *);
+
 	/* report regulator status ... most other accessors report
 	 * control inputs, this reports results of combining inputs
 	 * from Linux (and other sources) with the actual load.
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 8ffb0619a03c..7f7d0a3fe1e1 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -142,6 +142,7 @@ struct regulation_constraints {
 	unsigned boot_on:1;	/* bootloader/firmware enabled regulator */
 	unsigned apply_uV:1;	/* apply uV constraint if min == max */
 	unsigned ramp_disable:1; /* disable ramp delay */
+	unsigned soft_start:1;	/* ramp voltage slowly */
 	unsigned pull_down:1;	/* pull down resistor when regulator off */
 };
 
-- 
cgit v1.2.3


From 36e4f839de59b6216a16cdf5c1d3263f4dbd9421 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 11 Jun 2015 17:37:06 -0700
Subject: regulator: Add input current limit support

Some regulators can limit their input current (typically annotated
as ilim). Add an op (set_input_current_limit) and a DT property +
constraint to support this.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/regulator/regulator.txt | 1 +
 drivers/regulator/core.c                                  | 9 +++++++++
 drivers/regulator/of_regulator.c                          | 4 ++++
 include/linux/regulator/driver.h                          | 3 +++
 include/linux/regulator/machine.h                         | 2 ++
 5 files changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt
index 4b1df61ccbd7..e29db73e55f4 100644
--- a/Documentation/devicetree/bindings/regulator/regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/regulator.txt
@@ -7,6 +7,7 @@ Optional properties:
 - regulator-microvolt-offset: Offset applied to voltages to compensate for voltage drops
 - regulator-min-microamp: smallest current consumers may set
 - regulator-max-microamp: largest current consumers may set
+- regulator-input-current-limit-microamp: maximum input current regulator allows
 - regulator-always-on: boolean, regulator should never be disabled
 - regulator-boot-on: bootloader/firmware enabled regulator
 - regulator-allow-bypass: allow the regulator to go into bypass mode
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 6dfb2d6c19ae..ba565416d1d0 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1008,6 +1008,15 @@ static int set_machine_constraints(struct regulator_dev *rdev,
 	if (ret != 0)
 		goto out;
 
+	if (rdev->constraints->ilim_uA && ops->set_input_current_limit) {
+		ret = ops->set_input_current_limit(rdev,
+						   rdev->constraints->ilim_uA);
+		if (ret < 0) {
+			rdev_err(rdev, "failed to set input limit\n");
+			goto out;
+		}
+	}
+
 	/* do we need to setup our suspend state */
 	if (rdev->constraints->initial_state) {
 		ret = suspend_prepare(rdev, rdev->constraints->initial_state);
diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c
index 207da037cd2d..f025c1047d0a 100644
--- a/drivers/regulator/of_regulator.c
+++ b/drivers/regulator/of_regulator.c
@@ -58,6 +58,10 @@ static void of_get_regulation_constraints(struct device_node *np,
 	if (!of_property_read_u32(np, "regulator-max-microamp", &pval))
 		constraints->max_uA = pval;
 
+	if (!of_property_read_u32(np, "regulator-input-current-limit-microamp",
+				  &pval))
+		constraints->ilim_uA = pval;
+
 	/* Current change possible? */
 	if (constraints->min_uA != constraints->max_uA)
 		constraints->valid_ops_mask |= REGULATOR_CHANGE_CURRENT;
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index e0635d0894aa..125264f8be93 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -91,6 +91,7 @@ struct regulator_linear_range {
  * @set_current_limit: Configure a limit for a current-limited regulator.
  *                     The driver should select the current closest to max_uA.
  * @get_current_limit: Get the configured limit for a current-limited regulator.
+ * @set_input_current_limit: Configure an input limit.
  *
  * @set_mode: Set the configured operating mode for the regulator.
  * @get_mode: Get the configured operating mode for the regulator.
@@ -145,6 +146,8 @@ struct regulator_ops {
 				 int min_uA, int max_uA);
 	int (*get_current_limit) (struct regulator_dev *);
 
+	int (*set_input_current_limit) (struct regulator_dev *, int lim_uA);
+
 	/* enable/disable regulator */
 	int (*enable) (struct regulator_dev *);
 	int (*disable) (struct regulator_dev *);
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 7f7d0a3fe1e1..85a3b457de51 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -75,6 +75,7 @@ struct regulator_state {
  *
  * @min_uA: Smallest current consumers may set.
  * @max_uA: Largest current consumers may set.
+ * @ilim_uA: Maximum input current.
  * @system_load: Load that isn't captured by any consumer requests.
  *
  * @valid_modes_mask: Mask of modes which may be configured by consumers.
@@ -113,6 +114,7 @@ struct regulation_constraints {
 	/* current output range (inclusive) - for current control */
 	int min_uA;
 	int max_uA;
+	int ilim_uA;
 
 	int system_load;
 
-- 
cgit v1.2.3


From 71ae0dff02d756e4d2ca710b79f2ff5390029a5f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 11 Jun 2015 01:34:54 +0200
Subject: netfilter: xtables: use percpu rule counters

The binary arp/ip/ip6tables ruleset is stored per cpu.

The only reason left as to why we need percpu duplication are the rule
counters embedded into ipt_entry et al -- since each cpu has its own copy
of the rules, all counters can be lockless.

The downside is that the more cpus are supported, the more memory is
required.  Rules are not just duplicated per online cpu but for each
possible cpu, i.e. if maxcpu is 144, then rule is duplicated 144 times,
not for the e.g. 64 cores present.

To save some memory and also improve utilization of shared caches it
would be preferable to only store the rule blob once.

So we first need to separate counters and the rule blob.

Instead of using entry->counters, allocate this percpu and store the
percpu address in entry->counters.pcnt on CONFIG_SMP.

This change makes no sense as-is; it is merely an intermediate step to
remove the percpu duplication of the rule set in a followup patch.

Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 49 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/netfilter/arp_tables.c    | 32 +++++++++++++++++++++----
 net/ipv4/netfilter/ip_tables.c     | 31 ++++++++++++++++++++----
 net/ipv6/netfilter/ip6_tables.c    | 31 ++++++++++++++++++++----
 4 files changed, 129 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 09f38206c18f..b77ab9f17641 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -353,6 +353,55 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 	return ret;
 }
 
+
+/* On SMP, ip(6)t_entry->counters.pcnt holds address of the
+ * real (percpu) counter.  On !SMP, its just the packet count,
+ * so nothing needs to be done there.
+ *
+ * xt_percpu_counter_alloc returns the address of the percpu
+ * counter, or 0 on !SMP.
+ *
+ * Hence caller must use IS_ERR_VALUE to check for error, this
+ * allows us to return 0 for single core systems without forcing
+ * callers to deal with SMP vs. NONSMP issues.
+ */
+static inline u64 xt_percpu_counter_alloc(void)
+{
+	if (nr_cpu_ids > 1) {
+		void __percpu *res = alloc_percpu(struct xt_counters);
+
+		if (res == NULL)
+			return (u64) -ENOMEM;
+
+		return (__force u64) res;
+	}
+
+	return 0;
+}
+static inline void xt_percpu_counter_free(u64 pcnt)
+{
+	if (nr_cpu_ids > 1)
+		free_percpu((void __percpu *) pcnt);
+}
+
+static inline struct xt_counters *
+xt_get_this_cpu_counter(struct xt_counters *cnt)
+{
+	if (nr_cpu_ids > 1)
+		return this_cpu_ptr((void __percpu *) cnt->pcnt);
+
+	return cnt;
+}
+
+static inline struct xt_counters *
+xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
+{
+	if (nr_cpu_ids > 1)
+		return per_cpu_ptr((void __percpu *) cnt->pcnt, cpu);
+
+	return cnt;
+}
+
 struct nf_hook_ops *xt_hook_link(const struct xt_table *, nf_hookfn *);
 void xt_hook_unlink(const struct xt_table *, struct nf_hook_ops *);
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index a61200754f4b..0ada09ae6e81 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -289,13 +289,15 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	arp = arp_hdr(skb);
 	do {
 		const struct xt_entry_target *t;
+		struct xt_counters *counter;
 
 		if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
 			e = arpt_next_entry(e);
 			continue;
 		}
 
-		ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+		counter = xt_get_this_cpu_counter(&e->counters);
+		ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1);
 
 		t = arpt_get_target_c(e);
 
@@ -521,6 +523,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 	if (ret)
 		return ret;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
+
 	t = arpt_get_target(e);
 	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
 					t->u.user.revision);
@@ -538,6 +544,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 err:
 	module_put(t->u.kernel.target->me);
 out:
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
@@ -614,6 +622,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
+	xt_percpu_counter_free(e->counters.pcnt);
 }
 
 /* Checks and translates the user-supplied table segment (held in
@@ -723,13 +732,15 @@ static void get_counters(const struct xt_table_info *t,
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
 
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
 			do {
 				start = read_seqcount_begin(s);
-				bcnt = iter->counters.bcnt;
-				pcnt = iter->counters.pcnt;
+				bcnt = tmp->bcnt;
+				pcnt = tmp->pcnt;
 			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -1186,7 +1197,10 @@ static int do_add_counters(struct net *net, const void __user *user,
 	loc_cpu_entry = private->entries[curcpu];
 	addend = xt_write_recseq_begin();
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
-		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		struct xt_counters *tmp;
+
+		tmp = xt_get_this_cpu_counter(&iter->counters);
+		ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
 	xt_write_recseq_end(addend);
@@ -1416,9 +1430,17 @@ static int translate_compat_table(const char *name,
 
 	i = 0;
 	xt_entry_foreach(iter1, entry1, newinfo->size) {
+		iter1->counters.pcnt = xt_percpu_counter_alloc();
+		if (IS_ERR_VALUE(iter1->counters.pcnt)) {
+			ret = -ENOMEM;
+			break;
+		}
+
 		ret = check_target(iter1, name);
-		if (ret != 0)
+		if (ret != 0) {
+			xt_percpu_counter_free(iter1->counters.pcnt);
 			break;
+		}
 		++i;
 		if (strcmp(arpt_get_target(iter1)->u.user.name,
 		    XT_ERROR_TARGET) == 0)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e7abf5145edc..d190b10dabfe 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -345,6 +345,7 @@ ipt_do_table(struct sk_buff *skb,
 	do {
 		const struct xt_entry_target *t;
 		const struct xt_entry_match *ematch;
+		struct xt_counters *counter;
 
 		IP_NF_ASSERT(e);
 		if (!ip_packet_match(ip, indev, outdev,
@@ -361,7 +362,8 @@ ipt_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters, skb->len, 1);
+		counter = xt_get_this_cpu_counter(&e->counters);
+		ADD_COUNTER(*counter, skb->len, 1);
 
 		t = ipt_get_target(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -665,6 +667,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 	if (ret)
 		return ret;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
+
 	j = 0;
 	mtpar.net	= net;
 	mtpar.table     = name;
@@ -691,6 +697,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 	ret = check_target(e, net, name);
 	if (ret)
 		goto err;
+
 	return 0;
  err:
 	module_put(t->u.kernel.target->me);
@@ -700,6 +707,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 			break;
 		cleanup_match(ematch, net);
 	}
+
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
@@ -784,6 +794,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
+	xt_percpu_counter_free(e->counters.pcnt);
 }
 
 /* Checks and translates the user-supplied table segment (held in
@@ -888,13 +899,15 @@ get_counters(const struct xt_table_info *t,
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
 
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
 			do {
 				start = read_seqcount_begin(s);
-				bcnt = iter->counters.bcnt;
-				pcnt = iter->counters.pcnt;
+				bcnt = tmp->bcnt;
+				pcnt = tmp->pcnt;
 			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -1374,7 +1387,10 @@ do_add_counters(struct net *net, const void __user *user,
 	loc_cpu_entry = private->entries[curcpu];
 	addend = xt_write_recseq_begin();
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
-		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		struct xt_counters *tmp;
+
+		tmp = xt_get_this_cpu_counter(&iter->counters);
+		ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
 	xt_write_recseq_end(addend);
@@ -1608,6 +1624,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
 	unsigned int j;
 	int ret = 0;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
+
 	j = 0;
 	mtpar.net	= net;
 	mtpar.table     = name;
@@ -1632,6 +1652,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
 			break;
 		cleanup_match(ematch, net);
 	}
+
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index cdd085f8b770..a1190ee14723 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -367,6 +367,7 @@ ip6t_do_table(struct sk_buff *skb,
 	do {
 		const struct xt_entry_target *t;
 		const struct xt_entry_match *ematch;
+		struct xt_counters *counter;
 
 		IP_NF_ASSERT(e);
 		acpar.thoff = 0;
@@ -384,7 +385,8 @@ ip6t_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters, skb->len, 1);
+		counter = xt_get_this_cpu_counter(&e->counters);
+		ADD_COUNTER(*counter, skb->len, 1);
 
 		t = ip6t_get_target_c(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -679,6 +681,10 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 	if (ret)
 		return ret;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
+
 	j = 0;
 	mtpar.net	= net;
 	mtpar.table     = name;
@@ -714,6 +720,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 			break;
 		cleanup_match(ematch, net);
 	}
+
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
@@ -797,6 +806,8 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
+
+	xt_percpu_counter_free(e->counters.pcnt);
 }
 
 /* Checks and translates the user-supplied table segment (held in
@@ -901,13 +912,15 @@ get_counters(const struct xt_table_info *t,
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
 
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
 			do {
 				start = read_seqcount_begin(s);
-				bcnt = iter->counters.bcnt;
-				pcnt = iter->counters.pcnt;
+				bcnt = tmp->bcnt;
+				pcnt = tmp->pcnt;
 			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -1374,7 +1387,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 		goto free;
 	}
 
-
 	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
@@ -1388,7 +1400,10 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	addend = xt_write_recseq_begin();
 	loc_cpu_entry = private->entries[curcpu];
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
-		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		struct xt_counters *tmp;
+
+		tmp = xt_get_this_cpu_counter(&iter->counters);
+		ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
 	xt_write_recseq_end(addend);
@@ -1621,6 +1636,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net,
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
 	j = 0;
 	mtpar.net	= net;
 	mtpar.table     = name;
@@ -1645,6 +1663,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net,
 			break;
 		cleanup_match(ematch, net);
 	}
+
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 482cfc318559e2527dfd8513582d2fdb276e47c2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 11 Jun 2015 01:34:55 +0200
Subject: netfilter: xtables: avoid percpu ruleset duplication

We store the rule blob per (possible) cpu.  Unfortunately this means we can
waste lot of memory on big smp machines. ipt_entry structure ('rule head')
is 112 byte, so e.g. with maxcpu=64 one single rule eats
close to 8k RAM.

Since previous patch made counters percpu it appears there is nothing
left in the rule blob that needs to be percpu.

On my test system (144 possible cpus, 400k dummy rules) this
change saves close to 9 Gigabyte of RAM.

Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  4 +--
 net/ipv4/netfilter/arp_tables.c    | 50 +++++++++--------------------
 net/ipv4/netfilter/ip_tables.c     | 64 ++++++++++---------------------------
 net/ipv6/netfilter/ip6_tables.c    | 65 ++++++++++----------------------------
 net/netfilter/x_tables.c           | 23 +++++---------
 5 files changed, 57 insertions(+), 149 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index b77ab9f17641..9969d79dcde1 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -224,9 +224,9 @@ struct xt_table_info {
 	unsigned int stacksize;
 	unsigned int __percpu *stackptr;
 	void ***jumpstack;
-	/* ipt_entry tables: one per CPU */
+
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-	void *entries[1];
+	void *entries;
 };
 
 #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 0ada09ae6e81..d75c139393fc 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -275,7 +275,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	 * pointer.
 	 */
 	smp_read_barrier_depends();
-	table_base = private->entries[smp_processor_id()];
+	table_base = private->entries;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -711,12 +711,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i) {
-		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
-			memcpy(newinfo->entries[i], entry0, newinfo->size);
-	}
-
 	return ret;
 }
 
@@ -731,7 +725,7 @@ static void get_counters(const struct xt_table_info *t,
 		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
-		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+		xt_entry_foreach(iter, t->entries, t->size) {
 			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
@@ -785,7 +779,7 @@ static int copy_entries_to_user(unsigned int total_size,
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	/* ... then copy entire thing ... */
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
@@ -880,10 +874,10 @@ static int compat_table_info(const struct xt_table_info *info,
 	if (!newinfo || !info)
 		return -EINVAL;
 
-	/* we dont care about newinfo->entries[] */
+	/* we dont care about newinfo->entries */
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
-	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	loc_cpu_entry = info->entries;
 	xt_compat_init_offsets(NFPROTO_ARP, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1048,7 +1042,7 @@ static int __do_replace(struct net *net, const char *name,
 	get_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
-	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	loc_cpu_old_entry = oldinfo->entries;
 	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
 		cleanup_entry(iter);
 
@@ -1095,8 +1089,7 @@ static int do_replace(struct net *net, const void __user *user,
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1126,7 +1119,7 @@ static int do_replace(struct net *net, const void __user *user,
 static int do_add_counters(struct net *net, const void __user *user,
 			   unsigned int len, int compat)
 {
-	unsigned int i, curcpu;
+	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1136,7 +1129,6 @@ static int do_add_counters(struct net *net, const void __user *user,
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
-	void *loc_cpu_entry;
 	struct arpt_entry *iter;
 	unsigned int addend;
 #ifdef CONFIG_COMPAT
@@ -1192,11 +1184,9 @@ static int do_add_counters(struct net *net, const void __user *user,
 	}
 
 	i = 0;
-	/* Choose the copy that is on our node */
-	curcpu = smp_processor_id();
-	loc_cpu_entry = private->entries[curcpu];
+
 	addend = xt_write_recseq_begin();
-	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+	xt_entry_foreach(iter,  private->entries, private->size) {
 		struct xt_counters *tmp;
 
 		tmp = xt_get_this_cpu_counter(&iter->counters);
@@ -1410,7 +1400,7 @@ static int translate_compat_table(const char *name,
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
-	entry1 = newinfo->entries[raw_smp_processor_id()];
+	entry1 = newinfo->entries;
 	pos = entry1;
 	size = total_size;
 	xt_entry_foreach(iter0, entry0, total_size) {
@@ -1470,11 +1460,6 @@ static int translate_compat_table(const char *name,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i)
-		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
-			memcpy(newinfo->entries[i], entry1, newinfo->size);
-
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1533,8 +1518,7 @@ static int compat_do_replace(struct net *net, void __user *user,
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
 		ret = -EFAULT;
 		goto free_newinfo;
@@ -1631,7 +1615,6 @@ static int compat_copy_entries_to_user(unsigned int total_size,
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
-	void *loc_cpu_entry;
 	unsigned int i = 0;
 	struct arpt_entry *iter;
 
@@ -1639,11 +1622,9 @@ static int compat_copy_entries_to_user(unsigned int total_size,
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy on our node/cpu */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	pos = userptr;
 	size = total_size;
-	xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+	xt_entry_foreach(iter, private->entries, total_size) {
 		ret = compat_copy_entry_to_user(iter, &pos,
 						&size, counters, i++);
 		if (ret != 0)
@@ -1812,8 +1793,7 @@ struct xt_table *arpt_register_table(struct net *net,
 		goto out;
 	}
 
-	/* choose the copy on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(newinfo, loc_cpu_entry, repl);
@@ -1844,7 +1824,7 @@ void arpt_unregister_table(struct xt_table *table)
 	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	xt_entry_foreach(iter, loc_cpu_entry, private->size)
 		cleanup_entry(iter);
 	if (private->number > private->initial_entries)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d190b10dabfe..6151500c2df8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -254,15 +254,13 @@ static void trace_packet(const struct sk_buff *skb,
 			 const struct xt_table_info *private,
 			 const struct ipt_entry *e)
 {
-	const void *table_base;
 	const struct ipt_entry *root;
 	const char *hookname, *chainname, *comment;
 	const struct ipt_entry *iter;
 	unsigned int rulenum = 0;
 	struct net *net = dev_net(in ? in : out);
 
-	table_base = private->entries[smp_processor_id()];
-	root = get_entry(table_base, private->hook_entry[hook]);
+	root = get_entry(private->entries, private->hook_entry[hook]);
 
 	hookname = chainname = hooknames[hook];
 	comment = comments[NF_IP_TRACE_COMMENT_RULE];
@@ -331,7 +329,7 @@ ipt_do_table(struct sk_buff *skb,
 	 * pointer.
 	 */
 	smp_read_barrier_depends();
-	table_base = private->entries[cpu];
+	table_base = private->entries;
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
 	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
@@ -877,12 +875,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i) {
-		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
-			memcpy(newinfo->entries[i], entry0, newinfo->size);
-	}
-
 	return ret;
 }
 
@@ -898,7 +890,7 @@ get_counters(const struct xt_table_info *t,
 		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
-		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+		xt_entry_foreach(iter, t->entries, t->size) {
 			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
@@ -946,17 +938,13 @@ copy_entries_to_user(unsigned int total_size,
 	struct xt_counters *counters;
 	const struct xt_table_info *private = table->private;
 	int ret = 0;
-	const void *loc_cpu_entry;
+	void *loc_cpu_entry;
 
 	counters = alloc_counters(table);
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy that is on our node/cpu, ...
-	 * This choice is lazy (because current thread is
-	 * allowed to migrate to another cpu)
-	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
@@ -1070,10 +1058,10 @@ static int compat_table_info(const struct xt_table_info *info,
 	if (!newinfo || !info)
 		return -EINVAL;
 
-	/* we dont care about newinfo->entries[] */
+	/* we dont care about newinfo->entries */
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
-	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	loc_cpu_entry = info->entries;
 	xt_compat_init_offsets(AF_INET, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1194,7 +1182,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct xt_table *t;
 	struct xt_table_info *oldinfo;
 	struct xt_counters *counters;
-	void *loc_cpu_old_entry;
 	struct ipt_entry *iter;
 
 	ret = 0;
@@ -1237,8 +1224,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	get_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
-	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
-	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+	xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
 		cleanup_entry(iter, net);
 
 	xt_free_table_info(oldinfo);
@@ -1284,8 +1270,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1316,7 +1301,7 @@ static int
 do_add_counters(struct net *net, const void __user *user,
                 unsigned int len, int compat)
 {
-	unsigned int i, curcpu;
+	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1326,7 +1311,6 @@ do_add_counters(struct net *net, const void __user *user,
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
-	void *loc_cpu_entry;
 	struct ipt_entry *iter;
 	unsigned int addend;
 #ifdef CONFIG_COMPAT
@@ -1382,11 +1366,8 @@ do_add_counters(struct net *net, const void __user *user,
 	}
 
 	i = 0;
-	/* Choose the copy that is on our node */
-	curcpu = smp_processor_id();
-	loc_cpu_entry = private->entries[curcpu];
 	addend = xt_write_recseq_begin();
-	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+	xt_entry_foreach(iter, private->entries, private->size) {
 		struct xt_counters *tmp;
 
 		tmp = xt_get_this_cpu_counter(&iter->counters);
@@ -1739,7 +1720,7 @@ translate_compat_table(struct net *net,
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
-	entry1 = newinfo->entries[raw_smp_processor_id()];
+	entry1 = newinfo->entries;
 	pos = entry1;
 	size = total_size;
 	xt_entry_foreach(iter0, entry0, total_size) {
@@ -1791,11 +1772,6 @@ translate_compat_table(struct net *net,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i)
-		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
-			memcpy(newinfo->entries[i], entry1, newinfo->size);
-
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1842,8 +1818,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1914,7 +1889,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
-	const void *loc_cpu_entry;
 	unsigned int i = 0;
 	struct ipt_entry *iter;
 
@@ -1922,14 +1896,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy that is on our node/cpu, ...
-	 * This choice is lazy (because current thread is
-	 * allowed to migrate to another cpu)
-	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	pos = userptr;
 	size = total_size;
-	xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+	xt_entry_foreach(iter, private->entries, total_size) {
 		ret = compat_copy_entry_to_user(iter, &pos,
 						&size, counters, i++);
 		if (ret != 0)
@@ -2104,8 +2073,7 @@ struct xt_table *ipt_register_table(struct net *net,
 		goto out;
 	}
 
-	/* choose the copy on our node/cpu, but dont care about preemption */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2136,7 +2104,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table)
 	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	xt_entry_foreach(iter, loc_cpu_entry, private->size)
 		cleanup_entry(iter, net);
 	if (private->number > private->initial_entries)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index a1190ee14723..80a7f0d38aa4 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -283,15 +283,13 @@ static void trace_packet(const struct sk_buff *skb,
 			 const struct xt_table_info *private,
 			 const struct ip6t_entry *e)
 {
-	const void *table_base;
 	const struct ip6t_entry *root;
 	const char *hookname, *chainname, *comment;
 	const struct ip6t_entry *iter;
 	unsigned int rulenum = 0;
 	struct net *net = dev_net(in ? in : out);
 
-	table_base = private->entries[smp_processor_id()];
-	root = get_entry(table_base, private->hook_entry[hook]);
+	root = get_entry(private->entries, private->hook_entry[hook]);
 
 	hookname = chainname = hooknames[hook];
 	comment = comments[NF_IP6_TRACE_COMMENT_RULE];
@@ -357,7 +355,7 @@ ip6t_do_table(struct sk_buff *skb,
 	 */
 	smp_read_barrier_depends();
 	cpu        = smp_processor_id();
-	table_base = private->entries[cpu];
+	table_base = private->entries;
 	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
 	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
@@ -890,12 +888,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i) {
-		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
-			memcpy(newinfo->entries[i], entry0, newinfo->size);
-	}
-
 	return ret;
 }
 
@@ -911,7 +903,7 @@ get_counters(const struct xt_table_info *t,
 		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
-		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+		xt_entry_foreach(iter, t->entries, t->size) {
 			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
@@ -959,17 +951,13 @@ copy_entries_to_user(unsigned int total_size,
 	struct xt_counters *counters;
 	const struct xt_table_info *private = table->private;
 	int ret = 0;
-	const void *loc_cpu_entry;
+	void *loc_cpu_entry;
 
 	counters = alloc_counters(table);
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy that is on our node/cpu, ...
-	 * This choice is lazy (because current thread is
-	 * allowed to migrate to another cpu)
-	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
@@ -1083,10 +1071,10 @@ static int compat_table_info(const struct xt_table_info *info,
 	if (!newinfo || !info)
 		return -EINVAL;
 
-	/* we dont care about newinfo->entries[] */
+	/* we dont care about newinfo->entries */
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
-	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	loc_cpu_entry = info->entries;
 	xt_compat_init_offsets(AF_INET6, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1207,7 +1195,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct xt_table *t;
 	struct xt_table_info *oldinfo;
 	struct xt_counters *counters;
-	const void *loc_cpu_old_entry;
 	struct ip6t_entry *iter;
 
 	ret = 0;
@@ -1250,8 +1237,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	get_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
-	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
-	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+	xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
 		cleanup_entry(iter, net);
 
 	xt_free_table_info(oldinfo);
@@ -1297,8 +1283,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1329,7 +1314,7 @@ static int
 do_add_counters(struct net *net, const void __user *user, unsigned int len,
 		int compat)
 {
-	unsigned int i, curcpu;
+	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1339,7 +1324,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
-	const void *loc_cpu_entry;
 	struct ip6t_entry *iter;
 	unsigned int addend;
 #ifdef CONFIG_COMPAT
@@ -1395,11 +1379,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	}
 
 	i = 0;
-	/* Choose the copy that is on our node */
-	curcpu = smp_processor_id();
 	addend = xt_write_recseq_begin();
-	loc_cpu_entry = private->entries[curcpu];
-	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+	xt_entry_foreach(iter, private->entries, private->size) {
 		struct xt_counters *tmp;
 
 		tmp = xt_get_this_cpu_counter(&iter->counters);
@@ -1407,7 +1388,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 		++i;
 	}
 	xt_write_recseq_end(addend);
-
  unlock_up_free:
 	local_bh_enable();
 	xt_table_unlock(t);
@@ -1750,7 +1730,7 @@ translate_compat_table(struct net *net,
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
-	entry1 = newinfo->entries[raw_smp_processor_id()];
+	entry1 = newinfo->entries;
 	pos = entry1;
 	size = total_size;
 	xt_entry_foreach(iter0, entry0, total_size) {
@@ -1802,11 +1782,6 @@ translate_compat_table(struct net *net,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i)
-		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
-			memcpy(newinfo->entries[i], entry1, newinfo->size);
-
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1853,8 +1828,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1925,7 +1899,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
-	const void *loc_cpu_entry;
 	unsigned int i = 0;
 	struct ip6t_entry *iter;
 
@@ -1933,14 +1906,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy that is on our node/cpu, ...
-	 * This choice is lazy (because current thread is
-	 * allowed to migrate to another cpu)
-	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	pos = userptr;
 	size = total_size;
-	xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+	xt_entry_foreach(iter, private->entries, total_size) {
 		ret = compat_copy_entry_to_user(iter, &pos,
 						&size, counters, i++);
 		if (ret != 0)
@@ -2115,8 +2083,7 @@ struct xt_table *ip6t_register_table(struct net *net,
 		goto out;
 	}
 
-	/* choose the copy on our node/cpu, but dont care about preemption */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2146,7 +2113,7 @@ void ip6t_unregister_table(struct net *net, struct xt_table *table)
 	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	xt_entry_foreach(iter, loc_cpu_entry, private->size)
 		cleanup_entry(iter, net);
 	if (private->number > private->initial_entries)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 83032464a4bd..6062ce3e862c 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -659,7 +659,6 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
 struct xt_table_info *xt_alloc_table_info(unsigned int size)
 {
 	struct xt_table_info *newinfo;
-	int cpu;
 
 	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
 	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
@@ -671,19 +670,14 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 
 	newinfo->size = size;
 
-	for_each_possible_cpu(cpu) {
-		if (size <= PAGE_SIZE)
-			newinfo->entries[cpu] = kmalloc_node(size,
-							GFP_KERNEL,
-							cpu_to_node(cpu));
-		else
-			newinfo->entries[cpu] = vmalloc_node(size,
-							cpu_to_node(cpu));
+	if (size <= PAGE_SIZE)
+		newinfo->entries = kmalloc(size, GFP_KERNEL);
+	else
+		newinfo->entries = vmalloc(size);
 
-		if (newinfo->entries[cpu] == NULL) {
-			xt_free_table_info(newinfo);
-			return NULL;
-		}
+	if (newinfo->entries == NULL) {
+		xt_free_table_info(newinfo);
+		return NULL;
 	}
 
 	return newinfo;
@@ -694,8 +688,7 @@ void xt_free_table_info(struct xt_table_info *info)
 {
 	int cpu;
 
-	for_each_possible_cpu(cpu)
-		kvfree(info->entries[cpu]);
+	kvfree(info->entries);
 
 	if (info->jumpstack != NULL) {
 		for_each_possible_cpu(cpu)
-- 
cgit v1.2.3


From 87d001ef5366c4a24f7a1340246c4ce68190581c Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Wed, 27 May 2015 16:01:52 +0200
Subject: dmaengine: Move icg helpers to global header

Now that we can have ICGs set for both the source and destination (using
the icg field of struct data_chunk) or for only the source or the
destination (using the dst_icg or src_icg respectively), and that these
fields can be ignored depending on other parameters (src_inc, src_sgl,
etc.), the logic to get the actual ICG value can be quite tricky.

The XDMAC driver was already implementing it, but since we will need it in
other drivers, we can move it to the main header file.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Acked-by: Ludovic Desroches <ludovic.desroches@atmel.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/at_xdmac.c    | 46 ++++------------------------------------------
 include/linux/dmaengine.h | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 9b602a67d40d..80e46e571bdd 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -862,20 +862,8 @@ at_xdmac_interleaved_queue_desc(struct dma_chan *chan,
 
 	desc->lld.mbr_sa = src;
 	desc->lld.mbr_da = dst;
-
-	if (xt->src_inc && xt->src_sgl) {
-		if (chunk->src_icg)
-			desc->lld.mbr_sus = chunk->src_icg;
-		else
-			desc->lld.mbr_sus = chunk->icg;
-	}
-
-	if (xt->dst_inc && xt->dst_sgl) {
-		if (chunk->dst_icg)
-			desc->lld.mbr_dus = chunk->dst_icg;
-		else
-			desc->lld.mbr_dus = chunk->icg;
-	}
+	desc->lld.mbr_sus = dmaengine_get_src_icg(xt, chunk);
+	desc->lld.mbr_dus = dmaengine_get_dst_icg(xt, chunk);
 
 	desc->lld.mbr_ubc = AT_XDMAC_MBR_UBC_NDV3
 		| AT_XDMAC_MBR_UBC_NDEN
@@ -895,32 +883,6 @@ at_xdmac_interleaved_queue_desc(struct dma_chan *chan,
 	return desc;
 }
 
-static size_t at_xdmac_get_icg(bool inc, bool sgl, size_t icg, size_t dir_icg)
-{
-	if (inc) {
-		if (dir_icg)
-			return dir_icg;
-		else if (sgl)
-			return icg;
-	}
-
-	return 0;
-}
-
-static size_t at_xdmac_get_dst_icg(struct dma_interleaved_template *xt,
-				   struct data_chunk *chunk)
-{
-	return at_xdmac_get_icg(xt->dst_inc, xt->dst_sgl,
-				chunk->icg, chunk->dst_icg);
-}
-
-static size_t at_xdmac_get_src_icg(struct dma_interleaved_template *xt,
-				   struct data_chunk *chunk)
-{
-	return at_xdmac_get_icg(xt->src_inc, xt->src_sgl,
-				chunk->icg, chunk->src_icg);
-}
-
 static struct dma_async_tx_descriptor *
 at_xdmac_prep_interleaved(struct dma_chan *chan,
 			  struct dma_interleaved_template *xt,
@@ -950,8 +912,8 @@ at_xdmac_prep_interleaved(struct dma_chan *chan,
 
 		chunk = xt->sgl + i;
 
-		dst_icg = at_xdmac_get_dst_icg(xt, chunk);
-		src_icg = at_xdmac_get_src_icg(xt, chunk);
+		dst_icg = dmaengine_get_dst_icg(xt, chunk);
+		src_icg = dmaengine_get_src_icg(xt, chunk);
 
 		src_skip = chunk->size + src_icg;
 		dst_skip = chunk->size + dst_icg;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index ad419757241f..499c530bcbaa 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -874,6 +874,33 @@ static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags)
 	BUG();
 }
 
+static inline size_t dmaengine_get_icg(bool inc, bool sgl, size_t icg,
+				      size_t dir_icg)
+{
+	if (inc) {
+		if (dir_icg)
+			return dir_icg;
+		else if (sgl)
+			return icg;
+	}
+
+	return 0;
+}
+
+static inline size_t dmaengine_get_dst_icg(struct dma_interleaved_template *xt,
+					   struct data_chunk *chunk)
+{
+	return dmaengine_get_icg(xt->dst_inc, xt->dst_sgl,
+				 chunk->icg, chunk->dst_icg);
+}
+
+static inline size_t dmaengine_get_src_icg(struct dma_interleaved_template *xt,
+					   struct data_chunk *chunk)
+{
+	return dmaengine_get_icg(xt->src_inc, xt->src_sgl,
+				 chunk->icg, chunk->src_icg);
+}
+
 /* --- public DMA engine API --- */
 
 #ifdef CONFIG_DMA_ENGINE
-- 
cgit v1.2.3


From 4983a501afede12f95d26e1e213f8f2e9eda1871 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Mon, 18 May 2015 13:46:15 +0200
Subject: dmaengine: Revert "drivers/dma: remove unused support for MEMSET
 operations"

This reverts commit 48a9db462d99494583dad829969616ac90a8df4e.

Some platforms actually need support for the memset operations. Bring it back.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Vinod Koul <vinod.koul@intel.com>
---
 drivers/dma/dmaengine.c   |  2 ++
 include/linux/dmaengine.h | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index a4c860dabf91..c0793818bb99 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -832,6 +832,8 @@ int dma_async_device_register(struct dma_device *device)
 		!device->device_prep_dma_pq);
 	BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) &&
 		!device->device_prep_dma_pq_val);
+	BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
+		!device->device_prep_dma_memset);
 	BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) &&
 		!device->device_prep_dma_interrupt);
 	BUG_ON(dma_has_cap(DMA_SG, device->cap_mask) &&
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index ad419757241f..19face3168b4 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -65,6 +65,7 @@ enum dma_transaction_type {
 	DMA_PQ,
 	DMA_XOR_VAL,
 	DMA_PQ_VAL,
+	DMA_MEMSET,
 	DMA_INTERRUPT,
 	DMA_SG,
 	DMA_PRIVATE,
@@ -570,6 +571,7 @@ struct dma_tx_state {
  * @copy_align: alignment shift for memcpy operations
  * @xor_align: alignment shift for xor operations
  * @pq_align: alignment shift for pq operations
+ * @fill_align: alignment shift for memset operations
  * @dev_id: unique device ID
  * @dev: struct device reference for dma mapping api
  * @src_addr_widths: bit mask of src addr widths the device supports
@@ -588,6 +590,7 @@ struct dma_tx_state {
  * @device_prep_dma_xor_val: prepares a xor validation operation
  * @device_prep_dma_pq: prepares a pq operation
  * @device_prep_dma_pq_val: prepares a pqzero_sum operation
+ * @device_prep_dma_memset: prepares a memset operation
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
  * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio.
@@ -620,6 +623,7 @@ struct dma_device {
 	u8 copy_align;
 	u8 xor_align;
 	u8 pq_align;
+	u8 fill_align;
 	#define DMA_HAS_PQ_CONTINUE (1 << 15)
 
 	int dev_id;
@@ -650,6 +654,9 @@ struct dma_device {
 		struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
 		unsigned int src_cnt, const unsigned char *scf, size_t len,
 		enum sum_check_flags *pqres, unsigned long flags);
+	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
+		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
+		unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)(
 		struct dma_chan *chan, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_sg)(
@@ -745,6 +752,17 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma(
 	return chan->device->device_prep_interleaved_dma(chan, xt, flags);
 }
 
+static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset(
+		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
+		unsigned long flags)
+{
+	if (!chan || !chan->device)
+		return NULL;
+
+	return chan->device->device_prep_dma_memset(chan, dest, value,
+						    len, flags);
+}
+
 static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg(
 		struct dma_chan *chan,
 		struct scatterlist *dst_sg, unsigned int dst_nents,
@@ -820,6 +838,12 @@ static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1,
 	return dmaengine_check_align(dev->pq_align, off1, off2, len);
 }
 
+static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1,
+				       size_t off2, size_t len)
+{
+	return dmaengine_check_align(dev->fill_align, off1, off2, len);
+}
+
 static inline void
 dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue)
 {
-- 
cgit v1.2.3


From 7bbf1dd24b17b9ec4f47c43ce4e05bf190745553 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:10 +0800
Subject: genirq: Enhance irq_data_to_desc() to support hierarchy irqdomain

For irq associated with hierarchy irqdomains, there will be multiple
irq_datas for one irq_desc. So enhance irq_data_to_desc() to support
hierarchy irqdomain. Also export irq_data_to_desc() as an inline
function for later reuse.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1433145945-789-2-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdesc.h | 9 +++++++++
 kernel/irq/internals.h  | 2 --
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index dd1109fb241e..a113a8dc7438 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -93,6 +93,15 @@ struct irq_desc {
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
 
+static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
+{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+	return irq_to_desc(data->irq);
+#else
+	return container_of(data, struct irq_desc, irq_data);
+#endif
+}
+
 static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc)
 {
 	return &desc->irq_data;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af936..b93d434e70bd 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,8 +59,6 @@ enum {
 #include "debug.h"
 #include "settings.h"
 
-#define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
-
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
 extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
-- 
cgit v1.2.3


From 0d0b4c866bcce647f40d73efe5e90aeeb079050a Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:12 +0800
Subject: genirq: Introduce struct irq_common_data to host shared irq data

With the introduction of hierarchy irqdomain, struct irq_data becomes
per-chip instead of per-irq and there may be multiple irq_datas
associated with the same irq. Some per-irq data stored in struct
irq_data now may get duplicated into multiple irq_datas, and causes
inconsistent view.

So introduce struct irq_common_data to host per-irq common data and to
achieve consistent view among irq_chips.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1433145945-789-4-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h     | 54 +++++++++++++++++++++++++++++--------------------
 include/linux/irqdesc.h |  3 ++-
 kernel/irq/internals.h  | 10 ++++-----
 kernel/irq/irqdesc.c    |  1 +
 kernel/irq/irqdomain.c  |  1 +
 5 files changed, 41 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 48cb7d1aa58f..3c7fbe44edae 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -126,13 +126,21 @@ struct msi_desc;
 struct irq_domain;
 
 /**
- * struct irq_data - per irq and irq chip data passed down to chip functions
+ * struct irq_common_data - per irq data shared by all irqchips
+ * @state_use_accessors: status information for irq chip functions.
+ *			Use accessor functions to deal with it
+ */
+struct irq_common_data {
+	unsigned int		state_use_accessors;
+};
+
+/**
+ * struct irq_data - per irq chip data passed down to chip functions
  * @mask:		precomputed bitmask for accessing the chip registers
  * @irq:		interrupt number
  * @hwirq:		hardware interrupt number, local to the interrupt domain
  * @node:		node index useful for balancing
- * @state_use_accessors: status information for irq chip functions.
- *			Use accessor functions to deal with it
+ * @common:		point to data shared by all irqchips
  * @chip:		low level interrupt hardware access
  * @domain:		Interrupt translation domain; responsible for mapping
  *			between hwirq number and linux irq number.
@@ -153,7 +161,7 @@ struct irq_data {
 	unsigned int		irq;
 	unsigned long		hwirq;
 	unsigned int		node;
-	unsigned int		state_use_accessors;
+	struct irq_common_data	*common;
 	struct irq_chip		*chip;
 	struct irq_domain	*domain;
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
@@ -166,7 +174,7 @@ struct irq_data {
 };
 
 /*
- * Bit masks for irq_data.state
+ * Bit masks for irq_common_data.state_use_accessors
  *
  * IRQD_TRIGGER_MASK		- Mask for the trigger type bits
  * IRQD_SETAFFINITY_PENDING	- Affinity setting is pending
@@ -198,34 +206,36 @@ enum {
 	IRQD_WAKEUP_ARMED		= (1 << 19),
 };
 
+#define __irqd_to_state(d)		((d)->common->state_use_accessors)
+
 static inline bool irqd_is_setaffinity_pending(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_SETAFFINITY_PENDING;
+	return __irqd_to_state(d) & IRQD_SETAFFINITY_PENDING;
 }
 
 static inline bool irqd_is_per_cpu(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_PER_CPU;
+	return __irqd_to_state(d) & IRQD_PER_CPU;
 }
 
 static inline bool irqd_can_balance(struct irq_data *d)
 {
-	return !(d->state_use_accessors & (IRQD_PER_CPU | IRQD_NO_BALANCING));
+	return !(__irqd_to_state(d) & (IRQD_PER_CPU | IRQD_NO_BALANCING));
 }
 
 static inline bool irqd_affinity_was_set(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_AFFINITY_SET;
+	return __irqd_to_state(d) & IRQD_AFFINITY_SET;
 }
 
 static inline void irqd_mark_affinity_was_set(struct irq_data *d)
 {
-	d->state_use_accessors |= IRQD_AFFINITY_SET;
+	__irqd_to_state(d) |= IRQD_AFFINITY_SET;
 }
 
 static inline u32 irqd_get_trigger_type(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_TRIGGER_MASK;
+	return __irqd_to_state(d) & IRQD_TRIGGER_MASK;
 }
 
 /*
@@ -233,43 +243,43 @@ static inline u32 irqd_get_trigger_type(struct irq_data *d)
  */
 static inline void irqd_set_trigger_type(struct irq_data *d, u32 type)
 {
-	d->state_use_accessors &= ~IRQD_TRIGGER_MASK;
-	d->state_use_accessors |= type & IRQD_TRIGGER_MASK;
+	__irqd_to_state(d) &= ~IRQD_TRIGGER_MASK;
+	__irqd_to_state(d) |= type & IRQD_TRIGGER_MASK;
 }
 
 static inline bool irqd_is_level_type(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_LEVEL;
+	return __irqd_to_state(d) & IRQD_LEVEL;
 }
 
 static inline bool irqd_is_wakeup_set(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_WAKEUP_STATE;
+	return __irqd_to_state(d) & IRQD_WAKEUP_STATE;
 }
 
 static inline bool irqd_can_move_in_process_context(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_MOVE_PCNTXT;
+	return __irqd_to_state(d) & IRQD_MOVE_PCNTXT;
 }
 
 static inline bool irqd_irq_disabled(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_IRQ_DISABLED;
+	return __irqd_to_state(d) & IRQD_IRQ_DISABLED;
 }
 
 static inline bool irqd_irq_masked(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_IRQ_MASKED;
+	return __irqd_to_state(d) & IRQD_IRQ_MASKED;
 }
 
 static inline bool irqd_irq_inprogress(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_IRQ_INPROGRESS;
+	return __irqd_to_state(d) & IRQD_IRQ_INPROGRESS;
 }
 
 static inline bool irqd_is_wakeup_armed(struct irq_data *d)
 {
-	return d->state_use_accessors & IRQD_WAKEUP_ARMED;
+	return __irqd_to_state(d) & IRQD_WAKEUP_ARMED;
 }
 
 
@@ -280,12 +290,12 @@ static inline bool irqd_is_wakeup_armed(struct irq_data *d)
  */
 static inline void irqd_set_chained_irq_inprogress(struct irq_data *d)
 {
-	d->state_use_accessors |= IRQD_IRQ_INPROGRESS;
+	__irqd_to_state(d) |= IRQD_IRQ_INPROGRESS;
 }
 
 static inline void irqd_clr_chained_irq_inprogress(struct irq_data *d)
 {
-	d->state_use_accessors &= ~IRQD_IRQ_INPROGRESS;
+	__irqd_to_state(d) &= ~IRQD_IRQ_INPROGRESS;
 }
 
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index a113a8dc7438..c52d1480f272 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -17,7 +17,7 @@ struct pt_regs;
 
 /**
  * struct irq_desc - interrupt descriptor
- * @irq_data:		per irq and chip data passed down to chip functions
+ * @irq_common_data:	per irq and chip data passed down to chip functions
  * @kstat_irqs:		irq stats per cpu
  * @handle_irq:		highlevel irq-events handler
  * @preflow_handler:	handler called before the flow handler (currently used by sparc)
@@ -47,6 +47,7 @@ struct pt_regs;
  * @name:		flow handler name for /proc/interrupts output
  */
 struct irq_desc {
+	struct irq_common_data	irq_common_data;
 	struct irq_data		irq_data;
 	unsigned int __percpu	*kstat_irqs;
 	irq_flow_handler_t	handle_irq;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b93d434e70bd..a1ed80d11800 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -168,27 +168,27 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
  */
 static inline void irqd_set_move_pending(struct irq_data *d)
 {
-	d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+	__irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING;
 }
 
 static inline void irqd_clr_move_pending(struct irq_data *d)
 {
-	d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+	__irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING;
 }
 
 static inline void irqd_clear(struct irq_data *d, unsigned int mask)
 {
-	d->state_use_accessors &= ~mask;
+	__irqd_to_state(d) &= ~mask;
 }
 
 static inline void irqd_set(struct irq_data *d, unsigned int mask)
 {
-	d->state_use_accessors |= mask;
+	__irqd_to_state(d) |= mask;
 }
 
 static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
 {
-	return d->state_use_accessors & mask;
+	return __irqd_to_state(d) & mask;
 }
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 99793b9b6d23..eac1aac906ea 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -76,6 +76,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 {
 	int cpu;
 
+	desc->irq_data.common = &desc->irq_common_data;
 	desc->irq_data.irq = irq;
 	desc->irq_data.chip = &no_irq_chip;
 	desc->irq_data.chip_data = NULL;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 7fac311057b8..3552b8750efd 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -834,6 +834,7 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
 	if (irq_data) {
 		child->parent_data = irq_data;
 		irq_data->irq = child->irq;
+		irq_data->common = child->common;
 		irq_data->node = child->node;
 		irq_data->domain = domain;
 	}
-- 
cgit v1.2.3


From 6783011b48096b9a0c239d0f7645f93070b6eefd Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:13 +0800
Subject: genirq: Introduce helper function irq_data_get_node()

Introduce helper function irq_data_get_node() and variants thereof to
hide struct irq_data implementation details.

Convert the core code to use them.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Link: http://lkml.kernel.org/r/1433145945-789-5-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h    | 5 +++++
 kernel/irq/internals.h | 5 +++++
 kernel/irq/irqdesc.c   | 8 +-------
 kernel/irq/irqdomain.c | 3 ++-
 kernel/irq/manage.c    | 2 +-
 kernel/irq/proc.c      | 2 +-
 6 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 3c7fbe44edae..b3b82a5344c8 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -640,6 +640,11 @@ static inline u32 irq_get_trigger_type(unsigned int irq)
 	return d ? irqd_get_trigger_type(d) : 0;
 }
 
+static inline int irq_data_get_node(struct irq_data *d)
+{
+	return d->node;
+}
+
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a1ed80d11800..4834ee828c41 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -197,6 +197,11 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
 	__this_cpu_inc(kstat.irqs_sum);
 }
 
+static inline int irq_desc_get_node(struct irq_desc *desc)
+{
+	return irq_data_get_node(&desc->irq_data);
+}
+
 #ifdef CONFIG_PM_SLEEP
 bool irq_pm_check_wakeup(struct irq_desc *desc);
 void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index eac1aac906ea..b18d3f1d73d9 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -59,16 +59,10 @@ static void desc_smp_init(struct irq_desc *desc, int node)
 #endif
 }
 
-static inline int desc_node(struct irq_desc *desc)
-{
-	return desc->irq_data.node;
-}
-
 #else
 static inline int
 alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
 static inline void desc_smp_init(struct irq_desc *desc, int node) { }
-static inline int desc_node(struct irq_desc *desc) { return 0; }
 #endif
 
 static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
@@ -300,7 +294,7 @@ static void free_desc(unsigned int irq)
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
-	desc_set_defaults(irq, desc, desc_node(desc), NULL);
+	desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3552b8750efd..1b06dfed4574 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -830,7 +830,8 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
 {
 	struct irq_data *irq_data;
 
-	irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+	irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL,
+				irq_data_get_node(child));
 	if (irq_data) {
 		child->parent_data = irq_data;
 		irq_data->irq = child->irq;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b1c7e8f46bfb..f9744853b656 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -363,7 +363,7 @@ static int
 setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
 {
 	struct cpumask *set = irq_default_affinity;
-	int node = desc->irq_data.node;
+	int node = irq_desc_get_node(desc);
 
 	/* Excludes PER_CPU and NO_BALANCE interrupts */
 	if (!irq_can_set_affinity(irq))
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index df2f4642d1e7..0e97c142ce40 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -241,7 +241,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long) m->private);
 
-	seq_printf(m, "%d\n", desc->irq_data.node);
+	seq_printf(m, "%d\n", irq_desc_get_node(desc));
 	return 0;
 }
 
-- 
cgit v1.2.3


From c64301a230a64dfc2fcf4581cd98a2d703f3c057 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:23 +0800
Subject: genirq: Introduce helper function irq_data_get_affinity_mask()

Introduce helper function irq_data_get_affinity_mask() and
irq_get_affinity_mask() to hide implementation details,
so we could move field 'affinity' from struct irq_data into
struct irq_common_data later.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Link: http://lkml.kernel.org/r/1433145945-789-15-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index b3b82a5344c8..1e0ccef205ed 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -645,6 +645,18 @@ static inline int irq_data_get_node(struct irq_data *d)
 	return d->node;
 }
 
+static inline struct cpumask *irq_get_affinity_mask(int irq)
+{
+	struct irq_data *d = irq_get_irq_data(irq);
+
+	return d ? d->affinity : NULL;
+}
+
+static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
+{
+	return d->affinity;
+}
+
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-- 
cgit v1.2.3


From 7e53df111beea8db2543424d07bdee2a630698c3 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 26 May 2015 11:53:04 -0400
Subject: xprtrdma: Remove rpcrdma_ia::ri_memreg_strategy

Clean up: This field is no longer used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Devesh Sharma <devesh.sharma@avagotech.com>
Tested-By: Devesh Sharma <devesh.sharma@avagotech.com>
Reviewed-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprtrdma.h | 3 ++-
 net/sunrpc/xprtrdma/verbs.c     | 3 ---
 net/sunrpc/xprtrdma/xprt_rdma.h | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index c984c85981ea..b17613052cc3 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -56,7 +56,8 @@
 
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
-/* memory registration strategies */
+/* Memory registration strategies, by number.
+ * This is part of a kernel / user space API. Do not remove. */
 enum rpcrdma_memreg {
 	RPCRDMA_BOUNCEBUFFERS = 0,
 	RPCRDMA_REGISTER,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index db9303a6a145..cc1a52609974 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -665,9 +665,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	dprintk("RPC:       %s: memory registration strategy is '%s'\n",
 		__func__, ia->ri_ops->ro_displayname);
 
-	/* Else will do memory reg/dereg for each chunk */
-	ia->ri_memreg_strategy = memreg;
-
 	rwlock_init(&ia->ri_qplock);
 	return 0;
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index f19376d35750..3ecee38bf1a0 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,7 +70,6 @@ struct rpcrdma_ia {
 	int			ri_have_dma_lkey;
 	struct completion	ri_done;
 	int			ri_async_rc;
-	enum rpcrdma_memreg	ri_memreg_strategy;
 	unsigned int		ri_max_frmr_depth;
 	struct ib_device_attr	ri_devattr;
 	struct ib_qp_attr	ri_qp_attr;
-- 
cgit v1.2.3


From 52033cfb5aab2a54e238e93c9e52f61c2c5708aa Mon Sep 17 00:00:00 2001
From: Matan Barak <matanb@mellanox.com>
Date: Thu, 11 Jun 2015 16:35:26 +0300
Subject: IB/mlx4: Add mmap call to map the hardware clock

In order to read the HCA's cycle counter efficiently in
user space, we need to map the HCA's register.
This is done through mmap call.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx4/main.c         | 18 +++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/main.c | 19 +++++++++++++++++++
 include/linux/mlx4/device.h               |  9 +++++++++
 3 files changed, 45 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 6cd8cd5bf1c5..2b107070b0ce 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -716,8 +716,24 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 				       dev->dev->caps.num_uars,
 				       PAGE_SIZE, vma->vm_page_prot))
 			return -EAGAIN;
-	} else
+	} else if (vma->vm_pgoff == 3) {
+		struct mlx4_clock_params params;
+		int ret = mlx4_get_internal_clock_params(dev->dev, &params);
+
+		if (ret)
+			return ret;
+
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		if (io_remap_pfn_range(vma, vma->vm_start,
+				       (pci_resource_start(dev->dev->persist->pdev,
+							   params.bar) +
+					params.offset)
+				       >> PAGE_SHIFT,
+				       PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+	} else {
 		return -EINVAL;
+	}
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index ced5ecab5aa7..70de39c6a397 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1674,6 +1674,25 @@ static int map_internal_clock(struct mlx4_dev *dev)
 	return 0;
 }
 
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+				   struct mlx4_clock_params *params)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (mlx4_is_slave(dev))
+		return -ENOTSUPP;
+
+	if (!params)
+		return -EINVAL;
+
+	params->bar = priv->fw.clock_bar;
+	params->offset = priv->fw.clock_offset;
+	params->size = MLX4_CLOCK_SIZE;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params);
+
 static void unmap_internal_clock(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 83e80ab94500..f94984fb8bb2 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -829,6 +829,12 @@ struct mlx4_dev {
 	struct mlx4_vf_dev     *dev_vfs;
 };
 
+struct mlx4_clock_params {
+	u64 offset;
+	u8 bar;
+	u8 size;
+};
+
 struct mlx4_eqe {
 	u8			reserved1;
 	u8			type;
@@ -1485,4 +1491,7 @@ int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
 			 enum mlx4_access_reg_method method,
 			 struct mlx4_ptys_reg *ptys_reg);
 
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+				   struct mlx4_clock_params *params);
+
 #endif /* MLX4_DEVICE_H */
-- 
cgit v1.2.3


From c0300089fd2dbeebef5ab9b6d66b4e6cedf8500a Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Tue, 28 Apr 2015 17:32:34 +0800
Subject: PCI: Remove unused pci_scan_bus_parented()

No one uses pci_scan_bus_parented() any more, remove it.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/probe.c | 19 -------------------
 include/linux/pci.h |  2 --
 2 files changed, 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 6675a7a1b9fc..7d6a61c0d5ad 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2094,25 +2094,6 @@ struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
 }
 EXPORT_SYMBOL(pci_scan_root_bus);
 
-/* Deprecated; use pci_scan_root_bus() instead */
-struct pci_bus *pci_scan_bus_parented(struct device *parent,
-		int bus, struct pci_ops *ops, void *sysdata)
-{
-	LIST_HEAD(resources);
-	struct pci_bus *b;
-
-	pci_add_resource(&resources, &ioport_resource);
-	pci_add_resource(&resources, &iomem_resource);
-	pci_add_resource(&resources, &busn_resource);
-	b = pci_create_root_bus(parent, bus, ops, sysdata, &resources);
-	if (b)
-		pci_scan_child_bus(b);
-	else
-		pci_free_resource_list(&resources);
-	return b;
-}
-EXPORT_SYMBOL(pci_scan_bus_parented);
-
 struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops,
 					void *sysdata)
 {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 353db8dc4c6e..1ec0d5d9723c 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -773,8 +773,6 @@ void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res,
 void pcibios_scan_specific_bus(int busn);
 struct pci_bus *pci_find_bus(int domain, int busnr);
 void pci_bus_add_devices(const struct pci_bus *bus);
-struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus,
-				      struct pci_ops *ops, void *sysdata);
 struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata);
 struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
 				    struct pci_ops *ops, void *sysdata,
-- 
cgit v1.2.3


From 73b6ecdb93e8e77752cae9077c424fcdc6f23c39 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Fri, 12 Jun 2015 11:10:06 +0900
Subject: extcon: Redefine the unique id of supported external connectors
 without 'enum extcon' type

This patch just redefine the unique id of supported external connectors without
'enum extcon' type. Because unique id would be used on devictree file(*.dts) to
indicate the specific external connectors like key number of input framework.
So, I have the plan to move this definitions to following header file which
includes the unique id of supported external connectors.
- include/dt-bindings/extcon/extcon.h

Fixes: 2a9de9c0f08d ("extcon: Use the unique id for external connector instead of string")
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/extcon/extcon-adc-jack.c |  2 +-
 drivers/extcon/extcon-arizona.c  |  4 +-
 drivers/extcon/extcon-axp288.c   |  4 +-
 drivers/extcon/extcon-max14577.c |  2 +-
 drivers/extcon/extcon-max77693.c |  4 +-
 drivers/extcon/extcon-max77843.c |  2 +-
 drivers/extcon/extcon-max8997.c  |  2 +-
 drivers/extcon/extcon-palmas.c   |  2 +-
 drivers/extcon/extcon-rt8973a.c  |  4 +-
 drivers/extcon/extcon-sm5502.c   |  4 +-
 drivers/extcon/extcon-usb-gpio.c |  2 +-
 drivers/extcon/extcon.c          | 24 +++++------
 drivers/usb/phy/phy-tahvo.c      |  2 +-
 include/linux/extcon.h           | 90 +++++++++++++++++++---------------------
 14 files changed, 71 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/extcon-adc-jack.c b/drivers/extcon/extcon-adc-jack.c
index 5bf08ec1cacf..7fc0ae1912f8 100644
--- a/drivers/extcon/extcon-adc-jack.c
+++ b/drivers/extcon/extcon-adc-jack.c
@@ -40,7 +40,7 @@
 struct adc_jack_data {
 	struct extcon_dev *edev;
 
-	const char **cable_names;
+	const unsigned int **cable_names;
 	struct adc_jack_cond *adc_conditions;
 	int num_conditions;
 
diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c
index 9262b45a4484..ad87f263056f 100644
--- a/drivers/extcon/extcon-arizona.c
+++ b/drivers/extcon/extcon-arizona.c
@@ -118,7 +118,7 @@ static const int arizona_micd_levels[] = {
 	1257,
 };
 
-static const enum extcon arizona_cable[] = {
+static const unsigned int arizona_cable[] = {
 	EXTCON_MECHANICAL,
 	EXTCON_MICROPHONE,
 	EXTCON_HEADPHONE,
@@ -552,7 +552,7 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data)
 	struct arizona_extcon_info *info = data;
 	struct arizona *arizona = info->arizona;
 	int id_gpio = arizona->pdata.hpdet_id_gpio;
-	enum extcon report = EXTCON_HEADPHONE;
+	unsigned int report = EXTCON_HEADPHONE;
 	int ret, reading;
 	bool mic = false;
 
diff --git a/drivers/extcon/extcon-axp288.c b/drivers/extcon/extcon-axp288.c
index 3605aa96c25a..ea962bc547b8 100644
--- a/drivers/extcon/extcon-axp288.c
+++ b/drivers/extcon/extcon-axp288.c
@@ -101,7 +101,7 @@ enum axp288_extcon_irq {
 	EXTCON_IRQ_END,
 };
 
-static const enum extcon axp288_extcon_cables[] = {
+static const unsigned int axp288_extcon_cables[] = {
 	EXTCON_SLOW_CHARGER,
 	EXTCON_CHARGE_DOWNSTREAM,
 	EXTCON_FAST_CHARGER,
@@ -157,7 +157,7 @@ static void axp288_extcon_log_rsi(struct axp288_extcon_info *info)
 static int axp288_handle_chrg_det_event(struct axp288_extcon_info *info)
 {
 	static bool notify_otg, notify_charger;
-	static enum extcon cable;
+	static unsigned int cable;
 	int ret, stat, cfg, pwr_stat;
 	u8 chrg_type;
 	bool vbus_attach = false;
diff --git a/drivers/extcon/extcon-max14577.c b/drivers/extcon/extcon-max14577.c
index e7c3edb5bd4b..df0659d98e5a 100644
--- a/drivers/extcon/extcon-max14577.c
+++ b/drivers/extcon/extcon-max14577.c
@@ -148,7 +148,7 @@ enum max14577_muic_acc_type {
 	MAX14577_MUIC_ADC_OPEN,
 };
 
-static const enum extcon max14577_extcon_cable[] = {
+static const unsigned int max14577_extcon_cable[] = {
 	EXTCON_USB,
 	EXTCON_TA,
 	EXTCON_FAST_CHARGER,
diff --git a/drivers/extcon/extcon-max77693.c b/drivers/extcon/extcon-max77693.c
index 20e796e10e57..f4f3b3d53928 100644
--- a/drivers/extcon/extcon-max77693.c
+++ b/drivers/extcon/extcon-max77693.c
@@ -200,7 +200,7 @@ enum max77693_muic_acc_type {
 /*
  * MAX77693 MUIC device support below list of accessories(external connector)
  */
-static const enum extcon max77693_extcon_cable[] = {
+static const unsigned int max77693_extcon_cable[] = {
 	EXTCON_USB,
 	EXTCON_USB_HOST,
 	EXTCON_TA,
@@ -457,7 +457,7 @@ static int max77693_muic_dock_handler(struct max77693_muic_info *info,
 	int ret = 0;
 	int vbvolt;
 	bool cable_attached;
-	enum extcon dock_id;
+	unsigned int dock_id;
 
 	dev_info(info->dev,
 		"external connector is %s (adc:0x%02x)\n",
diff --git a/drivers/extcon/extcon-max77843.c b/drivers/extcon/extcon-max77843.c
index d78a64d7fc20..fac2f1417a79 100644
--- a/drivers/extcon/extcon-max77843.c
+++ b/drivers/extcon/extcon-max77843.c
@@ -118,7 +118,7 @@ enum max77843_muic_charger_type {
 	MAX77843_MUIC_CHG_GND,
 };
 
-static const enum extcon max77843_extcon_cable[] = {
+static const unsigned int max77843_extcon_cable[] = {
 	EXTCON_USB,
 	EXTCON_USB_HOST,
 	EXTCON_TA,
diff --git a/drivers/extcon/extcon-max8997.c b/drivers/extcon/extcon-max8997.c
index 4d10949c6eb2..7b1ef200b121 100644
--- a/drivers/extcon/extcon-max8997.c
+++ b/drivers/extcon/extcon-max8997.c
@@ -145,7 +145,7 @@ struct max8997_muic_info {
 	int path_uart;
 };
 
-static const enum extcon max8997_extcon_cable[] = {
+static const unsigned int max8997_extcon_cable[] = {
 	EXTCON_USB,
 	EXTCON_USB_HOST,
 	EXTCON_TA,
diff --git a/drivers/extcon/extcon-palmas.c b/drivers/extcon/extcon-palmas.c
index d68954045a33..080d5cc27055 100644
--- a/drivers/extcon/extcon-palmas.c
+++ b/drivers/extcon/extcon-palmas.c
@@ -29,7 +29,7 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 
-static const enum extcon palmas_extcon_cable[] = {
+static const unsigned int palmas_extcon_cable[] = {
 	EXTCON_USB,
 	EXTCON_USB_HOST,
 	EXTCON_NONE,
diff --git a/drivers/extcon/extcon-rt8973a.c b/drivers/extcon/extcon-rt8973a.c
index f2a8672cbf82..92c939221a41 100644
--- a/drivers/extcon/extcon-rt8973a.c
+++ b/drivers/extcon/extcon-rt8973a.c
@@ -90,7 +90,7 @@ static struct reg_data rt8973a_reg_data[] = {
 };
 
 /* List of detectable cables */
-static const enum extcon rt8973a_extcon_cable[] = {
+static const unsigned int rt8973a_extcon_cable[] = {
 	EXTCON_USB,
 	EXTCON_USB_HOST,
 	EXTCON_TA,
@@ -300,7 +300,7 @@ static int rt8973a_muic_cable_handler(struct rt8973a_muic_info *info,
 	static unsigned int prev_cable_type;
 	unsigned int con_sw = DM_DP_SWITCH_UART;
 	int ret, cable_type;
-	enum extcon id;
+	unsigned int id;
 	bool attached = false;
 
 	switch (event) {
diff --git a/drivers/extcon/extcon-sm5502.c b/drivers/extcon/extcon-sm5502.c
index 520693d6fa8a..817dece23b4c 100644
--- a/drivers/extcon/extcon-sm5502.c
+++ b/drivers/extcon/extcon-sm5502.c
@@ -92,7 +92,7 @@ static struct reg_data sm5502_reg_data[] = {
 };
 
 /* List of detectable cables */
-static const enum extcon sm5502_extcon_cable[] = {
+static const unsigned int sm5502_extcon_cable[] = {
 	EXTCON_USB,
 	EXTCON_USB_HOST,
 	EXTCON_TA,
@@ -372,7 +372,7 @@ static int sm5502_muic_cable_handler(struct sm5502_muic_info *info,
 	unsigned int cable_type = SM5502_MUIC_ADC_GROUND;
 	unsigned int con_sw = DM_DP_SWITCH_OPEN;
 	unsigned int vbus_sw = VBUSIN_SWITCH_OPEN;
-	enum extcon id;
+	unsigned int id;
 	int ret;
 
 	/* Get the type of attached or detached cable */
diff --git a/drivers/extcon/extcon-usb-gpio.c b/drivers/extcon/extcon-usb-gpio.c
index 14da94cb57fa..a2a44536a608 100644
--- a/drivers/extcon/extcon-usb-gpio.c
+++ b/drivers/extcon/extcon-usb-gpio.c
@@ -39,7 +39,7 @@ struct usb_extcon_info {
 	struct delayed_work wq_detcable;
 };
 
-static const enum extcon usb_extcon_cable[] = {
+static const unsigned int usb_extcon_cable[] = {
 	EXTCON_USB,
 	EXTCON_USB_HOST,
 	EXTCON_NONE,
diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index fafd428cae7f..76157ab9faf3 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -39,6 +39,8 @@
 #define CABLE_NAME_MAX		30
 
 static const char *extcon_name[] =  {
+	[EXTCON_NONE]		= "NONE",
+
 	/* USB external connector */
 	[EXTCON_USB]		= "USB",
 	[EXTCON_USB_HOST]	= "USB-HOST",
@@ -109,7 +111,7 @@ static int check_mutually_exclusive(struct extcon_dev *edev, u32 new_state)
 	return 0;
 }
 
-static int find_cable_index_by_id(struct extcon_dev *edev, const enum extcon id)
+static int find_cable_index_by_id(struct extcon_dev *edev, const unsigned int id)
 {
 	int i;
 
@@ -124,16 +126,14 @@ static int find_cable_index_by_id(struct extcon_dev *edev, const enum extcon id)
 
 static int find_cable_index_by_name(struct extcon_dev *edev, const char *name)
 {
-	enum extcon id = EXTCON_NONE;
-	int i;
+	unsigned int id = EXTCON_NONE;
+	int i = 0;
 
 	if (edev->max_supported == 0)
 		return -EINVAL;
 
 	/* Find the the number of extcon cable */
-	for (i = 0; i < EXTCON_END; i++) {
-		if (!extcon_name[i])
-			continue;
+	while (extcon_name[i]) {
 		if (!strncmp(extcon_name[i], name, CABLE_NAME_MAX)) {
 			id = i;
 			break;
@@ -337,7 +337,7 @@ EXPORT_SYMBOL_GPL(extcon_set_state);
  * @edev:	the extcon device that has the cable.
  * @id:		the unique id of each external connector in extcon enumeration.
  */
-int extcon_get_cable_state_(struct extcon_dev *edev, const enum extcon id)
+int extcon_get_cable_state_(struct extcon_dev *edev, const unsigned int id)
 {
 	int index;
 
@@ -374,7 +374,7 @@ EXPORT_SYMBOL_GPL(extcon_get_cable_state);
  * @state:		the new cable status. The default semantics is
  *			true: attached / false: detached.
  */
-int extcon_set_cable_state_(struct extcon_dev *edev, enum extcon id,
+int extcon_set_cable_state_(struct extcon_dev *edev, unsigned int id,
 				bool cable_state)
 {
 	u32 state;
@@ -539,7 +539,7 @@ EXPORT_SYMBOL_GPL(extcon_unregister_interest);
  * "old_state", not the current state. The current state can be retrieved
  * by looking at the third pameter (edev pointer)'s state value.
  */
-int extcon_register_notifier(struct extcon_dev *edev, enum extcon id,
+int extcon_register_notifier(struct extcon_dev *edev, unsigned int id,
 			     struct notifier_block *nb)
 {
 	unsigned long flags;
@@ -561,7 +561,7 @@ EXPORT_SYMBOL_GPL(extcon_register_notifier);
  * @id:		the unique id of each external connector in extcon enumeration.
  * @nb:		a notifier block to be registered.
  */
-int extcon_unregister_notifier(struct extcon_dev *edev, enum extcon id,
+int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id,
 				struct notifier_block *nb)
 {
 	unsigned long flags;
@@ -623,7 +623,7 @@ static void dummy_sysfs_dev_release(struct device *dev)
  *
  * Return the pointer of extcon device if success or ERR_PTR(err) if fail
  */
-struct extcon_dev *extcon_dev_allocate(const enum extcon *supported_cable)
+struct extcon_dev *extcon_dev_allocate(const unsigned int *supported_cable)
 {
 	struct extcon_dev *edev;
 
@@ -677,7 +677,7 @@ static void devm_extcon_dev_release(struct device *dev, void *res)
  * or ERR_PTR(err) if fail
  */
 struct extcon_dev *devm_extcon_dev_allocate(struct device *dev,
-					const enum extcon *supported_cable)
+					const unsigned int *supported_cable)
 {
 	struct extcon_dev **ptr, *edev;
 
diff --git a/drivers/usb/phy/phy-tahvo.c b/drivers/usb/phy/phy-tahvo.c
index 1d1bb9ad8ccf..2f777d22184d 100644
--- a/drivers/usb/phy/phy-tahvo.c
+++ b/drivers/usb/phy/phy-tahvo.c
@@ -60,7 +60,7 @@ struct tahvo_usb {
 	struct extcon_dev	extcon;
 };
 
-static const enum extcon tahvo_cable[] = {
+static const unsigned int tahvo_cable[] = {
 	EXTCON_USB,
 	EXTCON_USB_HOST,
 
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index a7b224b20ecc..b16d929fa75f 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -30,41 +30,35 @@
 #include <linux/notifier.h>
 #include <linux/sysfs.h>
 
-enum extcon {
-	EXTCON_NONE		= 0x0,
-
-	/* USB external connector */
-	EXTCON_USB		= 0x1,
-	EXTCON_USB_HOST		= 0x2,
-
-	/* Charger external connector */
-	EXTCON_TA		= 0x10,
-	EXTCON_FAST_CHARGER	= 0x11,
-	EXTCON_SLOW_CHARGER	= 0x12,
-	EXTCON_CHARGE_DOWNSTREAM = 0x13,
-
-	/* Audio/Video external connector */
-	EXTCON_LINE_IN		= 0x20,
-	EXTCON_LINE_OUT		= 0x21,
-	EXTCON_MICROPHONE	= 0x22,
-	EXTCON_HEADPHONE	= 0x23,
-
-	EXTCON_HDMI		= 0x30,
-	EXTCON_MHL		= 0x31,
-	EXTCON_DVI		= 0x32,
-	EXTCON_VGA		= 0x33,
-	EXTCON_SPDIF_IN		= 0x34,
-	EXTCON_SPDIF_OUT	= 0x35,
-	EXTCON_VIDEO_IN		= 0x36,
-	EXTCON_VIDEO_OUT	= 0x37,
-
-	/* Etc external connector */
-	EXTCON_DOCK		= 0x50,
-	EXTCON_JIG		= 0x51,
-	EXTCON_MECHANICAL	= 0x52,
-
-	EXTCON_END,
-};
+/*
+ * Define the unique id of supported external connectors
+ */
+#define EXTCON_NONE			0
+
+#define EXTCON_USB			1	/* USB connector */
+#define EXTCON_USB_HOST			2
+
+#define EXTCON_TA			3	/* Charger connector */
+#define EXTCON_FAST_CHARGER		4
+#define EXTCON_SLOW_CHARGER		5
+#define EXTCON_CHARGE_DOWNSTREAM	6
+
+#define EXTCON_LINE_IN			7	/* Audio/Video connector */
+#define EXTCON_LINE_OUT			8
+#define EXTCON_MICROPHONE		9
+#define EXTCON_HEADPHONE		10
+#define EXTCON_HDMI			11
+#define EXTCON_MHL			12
+#define EXTCON_DVI			13
+#define EXTCON_VGA			14
+#define EXTCON_SPDIF_IN			15
+#define EXTCON_SPDIF_OUT		16
+#define EXTCON_VIDEO_IN			17
+#define EXTCON_VIDEO_OUT		18
+
+#define EXTCON_DOCK			19	/* Misc connector */
+#define EXTCON_JIG			20
+#define EXTCON_MECHANICAL		21
 
 struct extcon_cable;
 
@@ -105,7 +99,7 @@ struct extcon_cable;
 struct extcon_dev {
 	/* Optional user initializing data */
 	const char *name;
-	const enum extcon *supported_cable;
+	const unsigned int *supported_cable;
 	const u32 *mutually_exclusive;
 
 	/* Optional callbacks to override class functions */
@@ -182,10 +176,10 @@ extern struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name);
 /*
  * Following APIs control the memory of extcon device.
  */
-extern struct extcon_dev *extcon_dev_allocate(const enum extcon *cable);
+extern struct extcon_dev *extcon_dev_allocate(const unsigned int *cable);
 extern void extcon_dev_free(struct extcon_dev *edev);
 extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev,
-						   const enum extcon *cable);
+						   const unsigned int *cable);
 extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev);
 
 /*
@@ -206,8 +200,8 @@ extern int extcon_update_state(struct extcon_dev *edev, u32 mask, u32 state);
  * get/set_cable_state access each bit of the 32b encoded state value.
  * They are used to access the status of each cable based on the cable_name.
  */
-extern int extcon_get_cable_state_(struct extcon_dev *edev, enum extcon id);
-extern int extcon_set_cable_state_(struct extcon_dev *edev, enum extcon id,
+extern int extcon_get_cable_state_(struct extcon_dev *edev, unsigned int id);
+extern int extcon_set_cable_state_(struct extcon_dev *edev, unsigned int id,
 				   bool cable_state);
 
 extern int extcon_get_cable_state(struct extcon_dev *edev,
@@ -234,9 +228,9 @@ extern int extcon_unregister_interest(struct extcon_specific_cable_nb *nb);
  * we do not recommend to use this for normal 'notifiee' device drivers who
  * want to be notified by a specific external port of the notifier.
  */
-extern int extcon_register_notifier(struct extcon_dev *edev, enum extcon id,
+extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id,
 				    struct notifier_block *nb);
-extern int extcon_unregister_notifier(struct extcon_dev *edev, enum extcon id,
+extern int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id,
 				    struct notifier_block *nb);
 
 /*
@@ -266,7 +260,7 @@ static inline int devm_extcon_dev_register(struct device *dev,
 static inline void devm_extcon_dev_unregister(struct device *dev,
 					      struct extcon_dev *edev) { }
 
-static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable)
+static inline struct extcon_dev *extcon_dev_allocate(const unsigned int *cable)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -274,7 +268,7 @@ static inline struct extcon_dev *extcon_dev_allocate(const enum extcon *cable)
 static inline void extcon_dev_free(struct extcon_dev *edev) { }
 
 static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev,
-						const enum extcon *cable)
+						const unsigned int *cable)
 {
 	return ERR_PTR(-ENOSYS);
 }
@@ -298,13 +292,13 @@ static inline int extcon_update_state(struct extcon_dev *edev, u32 mask,
 }
 
 static inline int extcon_get_cable_state_(struct extcon_dev *edev,
-					  enum extcon id)
+					  unsigned int id)
 {
 	return 0;
 }
 
 static inline int extcon_set_cable_state_(struct extcon_dev *edev,
-					  enum extcon id, bool cable_state)
+					  unsigned int id, bool cable_state)
 {
 	return 0;
 }
@@ -327,14 +321,14 @@ static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name)
 }
 
 static inline int extcon_register_notifier(struct extcon_dev *edev,
-					enum extcon id,
+					unsigned int id,
 					struct notifier_block *nb)
 {
 	return 0;
 }
 
 static inline int extcon_unregister_notifier(struct extcon_dev *edev,
-					enum extcon id,
+					unsigned int id,
 					struct notifier_block *nb)
 {
 	return 0;
-- 
cgit v1.2.3


From ef73f886b53548d83d71a439f51a0c13ea6c1dae Mon Sep 17 00:00:00 2001
From: Dmitry Kalinkin <dmitry.kalinkin@gmail.com>
Date: Thu, 28 May 2015 15:07:04 +0300
Subject: vme: export vme_check_window()

Signed-off-by: Dmitry Kalinkin <dmitry.kalinkin@gmail.com>
Cc: Igor Alekseev <igor.alekseev@itep.ru>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/vme/vme.c   | 5 +++--
 include/linux/vme.h | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/vme/vme.c b/drivers/vme/vme.c
index 1b78d27aa728..56708915ebbe 100644
--- a/drivers/vme/vme.c
+++ b/drivers/vme/vme.c
@@ -177,8 +177,8 @@ size_t vme_get_size(struct vme_resource *resource)
 }
 EXPORT_SYMBOL(vme_get_size);
 
-static int vme_check_window(u32 aspace, unsigned long long vme_base,
-	unsigned long long size)
+int vme_check_window(u32 aspace, unsigned long long vme_base,
+		     unsigned long long size)
 {
 	int retval = 0;
 
@@ -221,6 +221,7 @@ static int vme_check_window(u32 aspace, unsigned long long vme_base,
 
 	return retval;
 }
+EXPORT_SYMBOL(vme_check_window);
 
 /*
  * Request a slave image with specific attributes, return some unique
diff --git a/include/linux/vme.h b/include/linux/vme.h
index 79242e9c06b8..c0131358f351 100644
--- a/include/linux/vme.h
+++ b/include/linux/vme.h
@@ -120,6 +120,8 @@ void vme_free_consistent(struct vme_resource *, size_t,  void *,
 	dma_addr_t);
 
 size_t vme_get_size(struct vme_resource *);
+int vme_check_window(u32 aspace, unsigned long long vme_base,
+		     unsigned long long size);
 
 struct vme_resource *vme_slave_request(struct vme_dev *, u32, u32);
 int vme_slave_set(struct vme_resource *, int, unsigned long long,
-- 
cgit v1.2.3


From 1e98a0f08abddde87f0f93237f10629ecb4880ef Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 12 Jun 2015 19:31:32 -0700
Subject: flow_dissector: fix ipv6 dst, hop-by-hop and routing ext hdrs

__skb_header_pointer() returns a pointer that must be checked.

Fixes infinite loop reported by Alexei, and add __must_check to
catch these errors earlier.

Fixes: 6a74fcf426f5 ("flow_dissector: add support for dst, hop-by-hop and routing ext hdrs")
Reported-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Tested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    | 9 +++++----
 net/core/flow_dissector.c | 6 ++++--
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cc612fc0a894..a7acc92aa668 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2743,8 +2743,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
 __wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
 		    __wsum csum);
 
-static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset,
-					 int len, void *data, int hlen, void *buffer)
+static inline void * __must_check
+__skb_header_pointer(const struct sk_buff *skb, int offset,
+		     int len, void *data, int hlen, void *buffer)
 {
 	if (hlen - offset >= len)
 		return data + offset;
@@ -2756,8 +2757,8 @@ static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset,
 	return buffer;
 }
 
-static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
-				       int len, void *buffer)
+static inline void * __must_check
+skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
 {
 	return __skb_header_pointer(skb, offset, len, skb->data,
 				    skb_headlen(skb), buffer);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 22e4dffa0c8b..476e5dda59e1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -394,9 +394,11 @@ ip_proto_again:
 
 		opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
 					      data, hlen, &_opthdr);
+		if (!opthdr)
+			return false;
 
-		ip_proto = _opthdr[0];
-		nhoff += (_opthdr[1] + 1) << 3;
+		ip_proto = opthdr[0];
+		nhoff += (opthdr[1] + 1) << 3;
 
 		goto ip_proto_again;
 	}
-- 
cgit v1.2.3


From aaeb6e24f5b6cb6a664fbdec6e08b65c3173c1b3 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Fri, 12 Jun 2015 21:07:54 +0200
Subject: netfilter: ipset: Use MSEC_PER_SEC consistently

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set_timeout.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index 83c2f9e0886c..3c8842bcedaa 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -61,7 +61,7 @@ ip_set_timeout_set(unsigned long *timeout, u32 t)
 		return;
 	}
 
-	*timeout = msecs_to_jiffies(t * 1000) + jiffies;
+	*timeout = msecs_to_jiffies(t * MSEC_PER_SEC) + jiffies;
 	if (*timeout == IPSET_ELEM_PERMANENT)
 		/* Bingo! :-) */
 		(*timeout)--;
@@ -71,7 +71,7 @@ static inline u32
 ip_set_timeout_get(unsigned long *timeout)
 {
 	return *timeout == IPSET_ELEM_PERMANENT ? 0 :
-		jiffies_to_msecs(*timeout - jiffies)/1000;
+		jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC;
 }
 
 #endif	/* __KERNEL__ */
-- 
cgit v1.2.3


From f690cbaed9fe4d77592e24139db7ad790641c4fd Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Fri, 12 Jun 2015 22:11:00 +0200
Subject: netfilter: ipset: Fix cidr handling for hash:*net* types

Commit "Simplify cidr handling for hash:*net* types" broke the cidr
handling for the hash:*net* types when the sets were used by the SET
target: entries with invalid cidr values were added to the sets.
Reported by Jonathan Johnson.

Testsuite entry is added to verify the fix.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h       |  2 --
 net/netfilter/ipset/ip_set_hash_gen.h        | 44 ++++++++++++++++------------
 net/netfilter/ipset/ip_set_hash_ipportnet.c  |  4 +--
 net/netfilter/ipset/ip_set_hash_net.c        |  4 +--
 net/netfilter/ipset/ip_set_hash_netiface.c   |  4 +--
 net/netfilter/ipset/ip_set_hash_netnet.c     |  8 ++---
 net/netfilter/ipset/ip_set_hash_netport.c    |  4 +--
 net/netfilter/ipset/ip_set_hash_netportnet.c |  8 ++---
 8 files changed, 42 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index ffdfdc24952a..a6fe1ce96437 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -545,8 +545,6 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
 	{ .bytes = ULLONG_MAX, .packets = ULLONG_MAX,	\
 	  .timeout = (set)->timeout }
 
-#define IP_SET_INIT_CIDR(a, b) ((a) ? (a) : (b))
-
 #define IPSET_CONCAT(a, b)		a##b
 #define IPSET_TOKEN(a, b)		IPSET_CONCAT(a, b)
 
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 8dd82db05ed0..2f1985e71f6c 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -149,17 +149,21 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
 #endif
 
 /* cidr + 1 is stored in net_prefixes to support /0 */
-#define SCIDR(cidr, i)		(__CIDR(cidr, i) + 1)
+#define NCIDR_PUT(cidr)		((cidr) + 1)
+#define NCIDR_GET(cidr)		((cidr) - 1)
 
 #ifdef IP_SET_HASH_WITH_NETS_PACKED
 /* When cidr is packed with nomatch, cidr - 1 is stored in the data entry */
-#define GCIDR(cidr, i)		(__CIDR(cidr, i) + 1)
-#define NCIDR(cidr)		(cidr)
+#define DCIDR_PUT(cidr)		((cidr) - 1)
+#define DCIDR_GET(cidr, i)	(__CIDR(cidr, i) + 1)
 #else
-#define GCIDR(cidr, i)		(__CIDR(cidr, i))
-#define NCIDR(cidr)		(cidr - 1)
+#define DCIDR_PUT(cidr)		(cidr)
+#define DCIDR_GET(cidr, i)	__CIDR(cidr, i)
 #endif
 
+#define INIT_CIDR(cidr, host_mask)	\
+	DCIDR_PUT(((cidr) ? NCIDR_GET(cidr) : host_mask))
+
 #define SET_HOST_MASK(family)	(family == AF_INET ? 32 : 128)
 
 #ifdef IP_SET_HASH_WITH_NET0
@@ -303,7 +307,8 @@ struct htype {
 
 #ifdef IP_SET_HASH_WITH_NETS
 /* Network cidr size book keeping when the hash stores different
- * sized networks */
+ * sized networks. cidr == real cidr + 1 to support /0.
+ */
 static void
 mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
 {
@@ -498,8 +503,10 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
 				pr_debug("expired %u/%u\n", i, j);
 #ifdef IP_SET_HASH_WITH_NETS
 				for (k = 0; k < IPSET_NET_COUNT; k++)
-					mtype_del_cidr(h, SCIDR(data->cidr, k),
-						       nets_length, k);
+					mtype_del_cidr(h,
+						NCIDR_PUT(DCIDR_GET(data->cidr,
+								    k)),
+						nets_length, k);
 #endif
 				ip_set_ext_destroy(set, data);
 				if (j != n->pos - 1)
@@ -692,9 +699,9 @@ reuse_slot:
 		data = ahash_data(n, j, set->dsize);
 #ifdef IP_SET_HASH_WITH_NETS
 		for (i = 0; i < IPSET_NET_COUNT; i++) {
-			mtype_del_cidr(h, SCIDR(data->cidr, i),
+			mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(data->cidr, i)),
 				       NLEN(set->family), i);
-			mtype_add_cidr(h, SCIDR(d->cidr, i),
+			mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)),
 				       NLEN(set->family), i);
 		}
 #endif
@@ -711,8 +718,8 @@ reuse_slot:
 		data = ahash_data(n, n->pos++, set->dsize);
 #ifdef IP_SET_HASH_WITH_NETS
 		for (i = 0; i < IPSET_NET_COUNT; i++)
-			mtype_add_cidr(h, SCIDR(d->cidr, i), NLEN(set->family),
-				       i);
+			mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)),
+				       NLEN(set->family), i);
 #endif
 		h->elements++;
 	}
@@ -772,8 +779,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		h->elements--;
 #ifdef IP_SET_HASH_WITH_NETS
 		for (j = 0; j < IPSET_NET_COUNT; j++)
-			mtype_del_cidr(h, SCIDR(d->cidr, j), NLEN(set->family),
-				       j);
+			mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
+				       NLEN(set->family), j);
 #endif
 		ip_set_ext_destroy(set, data);
 		if (n->pos + AHASH_INIT_SIZE < n->size) {
@@ -836,12 +843,13 @@ mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
 	for (; j < nets_length && h->nets[j].cidr[0] && !multi; j++) {
 #if IPSET_NET_COUNT == 2
 		mtype_data_reset_elem(d, &orig);
-		mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]), false);
+		mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]), false);
 		for (k = 0; k < nets_length && h->nets[k].cidr[1] && !multi;
 		     k++) {
-			mtype_data_netmask(d, NCIDR(h->nets[k].cidr[1]), true);
+			mtype_data_netmask(d, NCIDR_GET(h->nets[k].cidr[1]),
+					   true);
 #else
-		mtype_data_netmask(d, NCIDR(h->nets[j].cidr[0]));
+		mtype_data_netmask(d, NCIDR_GET(h->nets[j].cidr[0]));
 #endif
 		key = HKEY(d, h->initval, t->htable_bits);
 		n = hbucket(t, key);
@@ -889,7 +897,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	/* If we test an IP address and not a network address,
 	 * try all possible network sizes */
 	for (i = 0; i < IPSET_NET_COUNT; i++)
-		if (GCIDR(d->cidr, i) != SET_HOST_MASK(set->family))
+		if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family))
 			break;
 	if (i == IPSET_NET_COUNT) {
 		ret = mtype_test_cidrs(set, d, ext, mext, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index e38a029f3002..50248debdc8b 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -141,7 +141,7 @@ hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	const struct hash_ipportnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportnet4_elem e = {
-		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+		.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
 	};
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
@@ -389,7 +389,7 @@ hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	const struct hash_ipportnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_ipportnet6_elem e = {
-		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+		.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
 	};
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index d19926ac3a03..089b23fd1a94 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -120,7 +120,7 @@ hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	const struct hash_net *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_net4_elem e = {
-		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+		.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
 	};
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
@@ -288,7 +288,7 @@ hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	const struct hash_net *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_net6_elem e = {
-		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+		.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
 	};
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 7b69fa28d774..aac20768f6f0 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -235,7 +235,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	struct hash_netiface *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netiface4_elem e = {
-		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+		.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
 		.elem = 1,
 	};
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
@@ -469,7 +469,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	struct hash_netiface *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netiface6_elem e = {
-		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
+		.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
 		.elem = 1,
 	};
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 11eee0077b8b..ed9cc45084dd 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -141,8 +141,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	struct hash_netnet4_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
-	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
-	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+	e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+	e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
 	if (adt == IPSET_TEST)
 		e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
 
@@ -364,8 +364,8 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	struct hash_netnet6_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
-	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
-	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+	e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+	e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
 	if (adt == IPSET_TEST)
 		e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
 
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 7a6448cbd8fb..fbaf8138e5d4 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -136,7 +136,7 @@ hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	const struct hash_netport *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netport4_elem e = {
-		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+		.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
 	};
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
@@ -348,7 +348,7 @@ hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	const struct hash_netport *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netport6_elem e = {
-		.cidr = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK) - 1,
+		.cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK),
 	};
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 6eb5f8798304..a828cbc8bed7 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -152,8 +152,8 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	struct hash_netportnet4_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
-	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
-	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+	e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+	e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
 	if (adt == IPSET_TEST)
 		e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK;
 
@@ -418,8 +418,8 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	struct hash_netportnet6_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
-	e.cidr[0] = IP_SET_INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
-	e.cidr[1] = IP_SET_INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
+	e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
+	e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
 	if (adt == IPSET_TEST)
 		e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
 
-- 
cgit v1.2.3


From c4c997839cf92cb1037e43a85cdb4cbf44ed39a5 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 13 Jun 2015 11:59:45 +0200
Subject: netfilter: ipset: Fix parallel resizing and listing of the same set

When elements added to a hash:* type of set and resizing triggered,
parallel listing could start to list the original set (before resizing)
and "continue" with listing the new set. Fix it by references and
using the original hash table for listing. Therefore the destroying of
the original hash table may happen from the resizing or listing functions.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 13 ++++++----
 net/netfilter/ipset/ip_set_core.c      | 29 ++++++++++++----------
 net/netfilter/ipset/ip_set_hash_gen.h  | 44 +++++++++++++++++++++++++++++++---
 3 files changed, 66 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index a6fe1ce96437..5674b6ac6646 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -176,6 +176,9 @@ struct ip_set_type_variant {
 	/* List elements */
 	int (*list)(const struct ip_set *set, struct sk_buff *skb,
 		    struct netlink_callback *cb);
+	/* Keep listing private when resizing runs parallel */
+	void (*uref)(struct ip_set *set, struct netlink_callback *cb,
+		     bool start);
 
 	/* Return true if "b" set is the same as "a"
 	 * according to the create set parameters */
@@ -380,12 +383,12 @@ ip_set_init_counter(struct ip_set_counter *counter,
 
 /* Netlink CB args */
 enum {
-	IPSET_CB_NET = 0,
-	IPSET_CB_DUMP,
-	IPSET_CB_INDEX,
-	IPSET_CB_ARG0,
+	IPSET_CB_NET = 0,	/* net namespace */
+	IPSET_CB_DUMP,		/* dump single set/all sets */
+	IPSET_CB_INDEX,		/* set index */
+	IPSET_CB_PRIVATE,	/* set private data */
+	IPSET_CB_ARG0,		/* type specific */
 	IPSET_CB_ARG1,
-	IPSET_CB_ARG2,
 };
 
 /* register and unregister set references */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 68ae551f2b39..777cac6fd64d 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1211,12 +1211,16 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 static int
 ip_set_dump_done(struct netlink_callback *cb)
 {
-	struct ip_set_net *inst = (struct ip_set_net *)cb->args[IPSET_CB_NET];
 	if (cb->args[IPSET_CB_ARG0]) {
-		pr_debug("release set %s\n",
-			 ip_set(inst, cb->args[IPSET_CB_INDEX])->name);
-		__ip_set_put_byindex(inst,
-			(ip_set_id_t) cb->args[IPSET_CB_INDEX]);
+		struct ip_set_net *inst =
+			(struct ip_set_net *)cb->args[IPSET_CB_NET];
+		ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
+		struct ip_set *set = ip_set(inst, index);
+
+		if (set->variant->uref)
+			set->variant->uref(set, cb, false);
+		pr_debug("release set %s\n", set->name);
+		__ip_set_put_byindex(inst, index);
 	}
 	return 0;
 }
@@ -1247,12 +1251,6 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
 	nla_parse(cda, IPSET_ATTR_CMD_MAX,
 		  attr, nlh->nlmsg_len - min_len, ip_set_setname_policy);
 
-	/* cb->args[IPSET_CB_NET]:	net namespace
-	 *         [IPSET_CB_DUMP]:	dump single set/all sets
-	 *         [IPSET_CB_INDEX]: 	set index
-	 *         [IPSET_CB_ARG0]:	type specific
-	 */
-
 	if (cda[IPSET_ATTR_SETNAME]) {
 		struct ip_set *set;
 
@@ -1359,6 +1357,8 @@ dump_last:
 				goto release_refcount;
 			if (dump_flags & IPSET_FLAG_LIST_HEADER)
 				goto next_set;
+			if (set->variant->uref)
+				set->variant->uref(set, cb, true);
 			/* Fall through and add elements */
 		default:
 			read_lock_bh(&set->lock);
@@ -1375,6 +1375,8 @@ dump_last:
 		dump_type = DUMP_LAST;
 		cb->args[IPSET_CB_DUMP] = dump_type | (dump_flags << 16);
 		cb->args[IPSET_CB_INDEX] = 0;
+		if (set && set->variant->uref)
+			set->variant->uref(set, cb, false);
 		goto dump_last;
 	}
 	goto out;
@@ -1389,7 +1391,10 @@ next_set:
 release_refcount:
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[IPSET_CB_ARG0]) {
-		pr_debug("release set %s\n", ip_set(inst, index)->name);
+		set = ip_set(inst, index);
+		if (set->variant->uref)
+			set->variant->uref(set, cb, false);
+		pr_debug("release set %s\n", set->name);
 		__ip_set_put_byindex(inst, index);
 		cb->args[IPSET_CB_ARG0] = 0;
 	}
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 2f1985e71f6c..5fcf70b0ebc2 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -71,6 +71,8 @@ struct hbucket {
 
 /* The hash table: the table size stored here in order to make resizing easy */
 struct htable {
+	atomic_t ref;		/* References for resizing */
+	atomic_t uref;		/* References for dumping */
 	u8 htable_bits;		/* size of hash table == 2^htable_bits */
 	struct hbucket bucket[0]; /* hashtable buckets */
 };
@@ -207,6 +209,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
 #undef mtype_del
 #undef mtype_test_cidrs
 #undef mtype_test
+#undef mtype_uref
 #undef mtype_expire
 #undef mtype_resize
 #undef mtype_head
@@ -248,6 +251,7 @@ hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
 #define mtype_del		IPSET_TOKEN(MTYPE, _del)
 #define mtype_test_cidrs	IPSET_TOKEN(MTYPE, _test_cidrs)
 #define mtype_test		IPSET_TOKEN(MTYPE, _test)
+#define mtype_uref		IPSET_TOKEN(MTYPE, _uref)
 #define mtype_expire		IPSET_TOKEN(MTYPE, _expire)
 #define mtype_resize		IPSET_TOKEN(MTYPE, _resize)
 #define mtype_head		IPSET_TOKEN(MTYPE, _head)
@@ -595,6 +599,9 @@ retry:
 	t->htable_bits = htable_bits;
 
 	read_lock_bh(&set->lock);
+	/* There can't be another parallel resizing, but dumping is possible */
+	atomic_set(&orig->ref, 1);
+	atomic_inc(&orig->uref);
 	for (i = 0; i < jhash_size(orig->htable_bits); i++) {
 		n = hbucket(orig, i);
 		for (j = 0; j < n->pos; j++) {
@@ -609,6 +616,8 @@ retry:
 #ifdef IP_SET_HASH_WITH_NETS
 				mtype_data_reset_flags(data, &flags);
 #endif
+				atomic_set(&orig->ref, 0);
+				atomic_dec(&orig->uref);
 				read_unlock_bh(&set->lock);
 				mtype_ahash_destroy(set, t, false);
 				if (ret == -EAGAIN)
@@ -631,7 +640,11 @@ retry:
 
 	pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
 		 orig->htable_bits, orig, t->htable_bits, t);
-	mtype_ahash_destroy(set, orig, false);
+	/* If there's nobody else dumping the table, destroy it */
+	if (atomic_dec_and_test(&orig->uref)) {
+		pr_debug("Table destroy by resize %p\n", orig);
+		mtype_ahash_destroy(set, orig, false);
+	}
 
 	return 0;
 }
@@ -961,13 +974,36 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+/* Make possible to run dumping parallel with resizing */
+static void
+mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
+{
+	struct htype *h = set->data;
+	struct htable *t;
+
+	if (start) {
+		rcu_read_lock_bh();
+		t = rcu_dereference_bh_nfnl(h->table);
+		atomic_inc(&t->uref);
+		cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
+		rcu_read_unlock_bh();
+	} else if (cb->args[IPSET_CB_PRIVATE]) {
+		t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
+		if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+			/* Resizing didn't destroy the hash table */
+			pr_debug("Table destroy by dump: %p\n", t);
+			mtype_ahash_destroy(set, t, false);
+		}
+		cb->args[IPSET_CB_PRIVATE] = 0;
+	}
+}
+
 /* Reply a LIST/SAVE request: dump the elements of the specified set */
 static int
 mtype_list(const struct ip_set *set,
 	   struct sk_buff *skb, struct netlink_callback *cb)
 {
-	const struct htype *h = set->data;
-	const struct htable *t = rcu_dereference_bh_nfnl(h->table);
+	const struct htable *t;
 	struct nlattr *atd, *nested;
 	const struct hbucket *n;
 	const struct mtype_elem *e;
@@ -980,6 +1016,7 @@ mtype_list(const struct ip_set *set,
 	if (!atd)
 		return -EMSGSIZE;
 	pr_debug("list hash set %s\n", set->name);
+	t = (const struct htable *)cb->args[IPSET_CB_PRIVATE];
 	for (; cb->args[IPSET_CB_ARG0] < jhash_size(t->htable_bits);
 	     cb->args[IPSET_CB_ARG0]++) {
 		incomplete = skb_tail_pointer(skb);
@@ -1047,6 +1084,7 @@ static const struct ip_set_type_variant mtype_variant = {
 	.flush	= mtype_flush,
 	.head	= mtype_head,
 	.list	= mtype_list,
+	.uref	= mtype_uref,
 	.resize	= mtype_resize,
 	.same_set = mtype_same_set,
 };
-- 
cgit v1.2.3


From b57b2d1fa53fe8563bdfc66a33b844463b9af285 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 13 Jun 2015 14:22:25 +0200
Subject: netfilter: ipset: Prepare the ipset core to use RCU at set level

Replace rwlock_t with spinlock_t in "struct ip_set" and change the locking
accordingly. Convert the comment extension into an rcu-avare object. Also,
simplify the timeout routines.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h         |  9 ++++--
 include/linux/netfilter/ipset/ip_set_comment.h | 38 +++++++++++++++-------
 include/linux/netfilter/ipset/ip_set_timeout.h | 25 ++++++---------
 net/netfilter/ipset/ip_set_core.c              | 44 +++++++++++++-------------
 4 files changed, 66 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 5674b6ac6646..19b4969a25fe 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -108,8 +108,13 @@ struct ip_set_counter {
 	atomic64_t packets;
 };
 
+struct ip_set_comment_rcu {
+	struct rcu_head rcu;
+	char str[0];
+};
+
 struct ip_set_comment {
-	char *str;
+	struct ip_set_comment_rcu __rcu *c;
 };
 
 struct ip_set_skbinfo {
@@ -226,7 +231,7 @@ struct ip_set {
 	/* The name of the set */
 	char name[IPSET_MAXNAMELEN];
 	/* Lock protecting the set data */
-	rwlock_t lock;
+	spinlock_t lock;
 	/* References to the set */
 	u32 ref;
 	/* The core set type */
diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
index 21217ea008d7..8d0248525957 100644
--- a/include/linux/netfilter/ipset/ip_set_comment.h
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -16,41 +16,57 @@ ip_set_comment_uget(struct nlattr *tb)
 	return nla_data(tb);
 }
 
+/* Called from uadd only, protected by the set spinlock.
+ * The kadt functions don't use the comment extensions in any way.
+ */
 static inline void
 ip_set_init_comment(struct ip_set_comment *comment,
 		    const struct ip_set_ext *ext)
 {
+	struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1);
 	size_t len = ext->comment ? strlen(ext->comment) : 0;
 
-	if (unlikely(comment->str)) {
-		kfree(comment->str);
-		comment->str = NULL;
+	if (unlikely(c)) {
+		kfree_rcu(c, rcu);
+		rcu_assign_pointer(comment->c, NULL);
 	}
 	if (!len)
 		return;
 	if (unlikely(len > IPSET_MAX_COMMENT_SIZE))
 		len = IPSET_MAX_COMMENT_SIZE;
-	comment->str = kzalloc(len + 1, GFP_ATOMIC);
-	if (unlikely(!comment->str))
+	c = kzalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
+	if (unlikely(!c))
 		return;
-	strlcpy(comment->str, ext->comment, len + 1);
+	strlcpy(c->str, ext->comment, len + 1);
+	rcu_assign_pointer(comment->c, c);
 }
 
+/* Used only when dumping a set, protected by rcu_read_lock_bh() */
 static inline int
 ip_set_put_comment(struct sk_buff *skb, struct ip_set_comment *comment)
 {
-	if (!comment->str)
+	struct ip_set_comment_rcu *c = rcu_dereference_bh(comment->c);
+
+	if (!c)
 		return 0;
-	return nla_put_string(skb, IPSET_ATTR_COMMENT, comment->str);
+	return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str);
 }
 
+/* Called from uadd/udel, flush or the garbage collectors protected
+ * by the set spinlock.
+ * Called when the set is destroyed and when there can't be any user
+ * of the set data anymore.
+ */
 static inline void
 ip_set_comment_free(struct ip_set_comment *comment)
 {
-	if (unlikely(!comment->str))
+	struct ip_set_comment_rcu *c;
+
+	c = rcu_dereference_protected(comment->c, 1);
+	if (unlikely(!c))
 		return;
-	kfree(comment->str);
-	comment->str = NULL;
+	kfree_rcu(c, rcu);
+	rcu_assign_pointer(comment->c, NULL);
 }
 
 #endif
diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index 3c8842bcedaa..1d6a935c1ac5 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -40,31 +40,26 @@ ip_set_timeout_uget(struct nlattr *tb)
 }
 
 static inline bool
-ip_set_timeout_test(unsigned long timeout)
+ip_set_timeout_expired(unsigned long *t)
 {
-	return timeout == IPSET_ELEM_PERMANENT ||
-	       time_is_after_jiffies(timeout);
-}
-
-static inline bool
-ip_set_timeout_expired(unsigned long *timeout)
-{
-	return *timeout != IPSET_ELEM_PERMANENT &&
-	       time_is_before_jiffies(*timeout);
+	return *t != IPSET_ELEM_PERMANENT && time_is_before_jiffies(*t);
 }
 
 static inline void
-ip_set_timeout_set(unsigned long *timeout, u32 t)
+ip_set_timeout_set(unsigned long *timeout, u32 value)
 {
-	if (!t) {
+	unsigned long t;
+
+	if (!value) {
 		*timeout = IPSET_ELEM_PERMANENT;
 		return;
 	}
 
-	*timeout = msecs_to_jiffies(t * MSEC_PER_SEC) + jiffies;
-	if (*timeout == IPSET_ELEM_PERMANENT)
+	t = msecs_to_jiffies(value * MSEC_PER_SEC) + jiffies;
+	if (t == IPSET_ELEM_PERMANENT)
 		/* Bingo! :-) */
-		(*timeout)--;
+		t--;
+	*timeout = t;
 }
 
 static inline u32
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 87b4182660ba..2b21a1983a98 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -209,15 +209,15 @@ ip_set_type_register(struct ip_set_type *type)
 		pr_warn("ip_set type %s, family %s with revision min %u already registered!\n",
 			type->name, family_name(type->family),
 			type->revision_min);
-		ret = -EINVAL;
-		goto unlock;
+		ip_set_type_unlock();
+		return -EINVAL;
 	}
 	list_add_rcu(&type->list, &ip_set_type_list);
 	pr_debug("type %s, family %s, revision %u:%u registered.\n",
 		 type->name, family_name(type->family),
 		 type->revision_min, type->revision_max);
-unlock:
 	ip_set_type_unlock();
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(ip_set_type_register);
@@ -231,12 +231,12 @@ ip_set_type_unregister(struct ip_set_type *type)
 		pr_warn("ip_set type %s, family %s with revision min %u not registered\n",
 			type->name, family_name(type->family),
 			type->revision_min);
-		goto unlock;
+		ip_set_type_unlock();
+		return;
 	}
 	list_del_rcu(&type->list);
 	pr_debug("type %s, family %s with revision min %u unregistered.\n",
 		 type->name, family_name(type->family), type->revision_min);
-unlock:
 	ip_set_type_unlock();
 
 	synchronize_rcu();
@@ -531,16 +531,16 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return 0;
 
-	read_lock_bh(&set->lock);
+	rcu_read_lock_bh();
 	ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
-	read_unlock_bh(&set->lock);
+	rcu_read_unlock_bh();
 
 	if (ret == -EAGAIN) {
 		/* Type requests element to be completed */
 		pr_debug("element must be completed, ADD is triggered\n");
-		write_lock_bh(&set->lock);
+		spin_lock_bh(&set->lock);
 		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
-		write_unlock_bh(&set->lock);
+		spin_unlock_bh(&set->lock);
 		ret = 1;
 	} else {
 		/* --return-nomatch: invert matched element */
@@ -570,9 +570,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	write_lock_bh(&set->lock);
+	spin_lock_bh(&set->lock);
 	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
-	write_unlock_bh(&set->lock);
+	spin_unlock_bh(&set->lock);
 
 	return ret;
 }
@@ -593,9 +593,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	write_lock_bh(&set->lock);
+	spin_lock_bh(&set->lock);
 	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
-	write_unlock_bh(&set->lock);
+	spin_unlock_bh(&set->lock);
 
 	return ret;
 }
@@ -880,7 +880,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
 	if (!set)
 		return -ENOMEM;
-	rwlock_init(&set->lock);
+	spin_lock_init(&set->lock);
 	strlcpy(set->name, name, IPSET_MAXNAMELEN);
 	set->family = family;
 	set->revision = revision;
@@ -1062,9 +1062,9 @@ ip_set_flush_set(struct ip_set *set)
 {
 	pr_debug("set: %s\n",  set->name);
 
-	write_lock_bh(&set->lock);
+	spin_lock_bh(&set->lock);
 	set->variant->flush(set);
-	write_unlock_bh(&set->lock);
+	spin_unlock_bh(&set->lock);
 }
 
 static int
@@ -1377,9 +1377,9 @@ dump_last:
 				set->variant->uref(set, cb, true);
 			/* Fall through and add elements */
 		default:
-			read_lock_bh(&set->lock);
+			rcu_read_lock_bh();
 			ret = set->variant->list(set, skb, cb);
-			read_unlock_bh(&set->lock);
+			rcu_read_unlock_bh();
 			if (!cb->args[IPSET_CB_ARG0])
 				/* Set is done, proceed with next one */
 				goto next_set;
@@ -1462,9 +1462,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
 	bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
 
 	do {
-		write_lock_bh(&set->lock);
+		spin_lock_bh(&set->lock);
 		ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
-		write_unlock_bh(&set->lock);
+		spin_unlock_bh(&set->lock);
 		retried = true;
 	} while (ret == -EAGAIN &&
 		 set->variant->resize &&
@@ -1644,9 +1644,9 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
 			     set->type->adt_policy))
 		return -IPSET_ERR_PROTOCOL;
 
-	read_lock_bh(&set->lock);
+	rcu_read_lock_bh();
 	ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
-	read_unlock_bh(&set->lock);
+	rcu_read_unlock_bh();
 	/* Userspace can't trigger element to be re-added */
 	if (ret == -EAGAIN)
 		ret = 1;
-- 
cgit v1.2.3


From ca0f6a5cd99e0c6ba4bb78dc402817f636370f26 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Sat, 13 Jun 2015 19:45:33 +0200
Subject: netfilter: ipset: Fix coding styles reported by checkpatch.pl

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h       |   5 +-
 include/uapi/linux/netfilter/ipset/ip_set.h  |   6 +-
 net/netfilter/ipset/ip_set_bitmap_gen.h      |  11 +-
 net/netfilter/ipset/ip_set_bitmap_ip.c       |  12 +-
 net/netfilter/ipset/ip_set_bitmap_ipmac.c    |  21 +--
 net/netfilter/ipset/ip_set_bitmap_port.c     |   7 +-
 net/netfilter/ipset/ip_set_core.c            | 201 +++++++++++++--------------
 net/netfilter/ipset/ip_set_getport.c         |  13 +-
 net/netfilter/ipset/ip_set_hash_gen.h        |  55 ++++----
 net/netfilter/ipset/ip_set_hash_ip.c         |   4 +-
 net/netfilter/ipset/ip_set_hash_ipmark.c     |   9 +-
 net/netfilter/ipset/ip_set_hash_ipport.c     |  14 +-
 net/netfilter/ipset/ip_set_hash_ipportip.c   |  16 ++-
 net/netfilter/ipset/ip_set_hash_ipportnet.c  |  19 ++-
 net/netfilter/ipset/ip_set_hash_mac.c        |   6 +-
 net/netfilter/ipset/ip_set_hash_net.c        |   8 +-
 net/netfilter/ipset/ip_set_hash_netiface.c   |  25 ++--
 net/netfilter/ipset/ip_set_hash_netnet.c     |  46 +++---
 net/netfilter/ipset/ip_set_hash_netport.c    |  19 ++-
 net/netfilter/ipset/ip_set_hash_netportnet.c |  54 +++----
 net/netfilter/ipset/ip_set_list_set.c        |  11 +-
 net/netfilter/ipset/pfxlen.c                 |  16 +--
 net/netfilter/xt_set.c                       |  44 +++---
 23 files changed, 327 insertions(+), 295 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 19b4969a25fe..48bb01edcf30 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -349,12 +349,11 @@ ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo)
 			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
 					  skbinfo->skbmarkmask))) ||
 	       (skbinfo->skbprio &&
-	        nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
+		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
 			      cpu_to_be32(skbinfo->skbprio))) ||
 	       (skbinfo->skbqueue &&
-	        nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
+		nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
 			     cpu_to_be16(skbinfo->skbqueue)));
-
 }
 
 static inline void
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 5ab4e60894cf..63b2e34f1b60 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -15,12 +15,12 @@
 /* The protocol version */
 #define IPSET_PROTOCOL		6
 
-/* The maximum permissible comment length we will accept over netlink */
-#define IPSET_MAX_COMMENT_SIZE	255
-
 /* The max length of strings including NUL: set and type identifiers */
 #define IPSET_MAXNAMELEN	32
 
+/* The maximum permissible comment length we will accept over netlink */
+#define IPSET_MAX_COMMENT_SIZE	255
+
 /* Message types and commands */
 enum ipset_cmd {
 	IPSET_CMD_NONE,
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 86429f369128..d05e759ed0fa 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -41,7 +41,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
 	struct mtype *map = set->data;
 
 	init_timer(&map->gc);
-	map->gc.data = (unsigned long) set;
+	map->gc.data = (unsigned long)set;
 	map->gc.function = gc;
 	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
 	add_timer(&map->gc);
@@ -223,7 +223,7 @@ mtype_list(const struct ip_set *set,
 		if (!test_bit(id, map->members) ||
 		    (SET_WITH_TIMEOUT(set) &&
 #ifdef IP_SET_BITMAP_STORED_TIMEOUT
-		     mtype_is_filled((const struct mtype_elem *) x) &&
+		     mtype_is_filled((const struct mtype_elem *)x) &&
 #endif
 		     ip_set_timeout_expired(ext_timeout(x, set))))
 			continue;
@@ -240,7 +240,7 @@ mtype_list(const struct ip_set *set,
 		if (mtype_do_list(skb, map, id, set->dsize))
 			goto nla_put_failure;
 		if (ip_set_put_extensions(skb, set, x,
-		    mtype_is_filled((const struct mtype_elem *) x)))
+		    mtype_is_filled((const struct mtype_elem *)x)))
 			goto nla_put_failure;
 		ipset_nest_end(skb, nested);
 	}
@@ -266,13 +266,14 @@ out:
 static void
 mtype_gc(unsigned long ul_set)
 {
-	struct ip_set *set = (struct ip_set *) ul_set;
+	struct ip_set *set = (struct ip_set *)ul_set;
 	struct mtype *map = set->data;
 	void *x;
 	u32 id;
 
 	/* We run parallel with other readers (test element)
-	 * but adding/deleting new entries is locked out */
+	 * but adding/deleting new entries is locked out
+	 */
 	spin_lock_bh(&set->lock);
 	for (id = 0; id < map->elements; id++)
 		if (mtype_gc_test(id, map, set->dsize)) {
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index b8ce474c038d..64a564334418 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -59,7 +59,7 @@ struct bitmap_ip_adt_elem {
 static inline u32
 ip_to_id(const struct bitmap_ip *m, u32 ip)
 {
-	return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts;
+	return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip) / m->hosts;
 }
 
 /* Common functions */
@@ -175,8 +175,9 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
 		if (!cidr || cidr > HOST_MASK)
 			return -IPSET_ERR_INVALID_CIDR;
 		ip_set_mask_from_to(ip, ip_to, cidr);
-	} else
+	} else {
 		ip_to = ip;
+	}
 
 	if (ip_to > map->last_ip)
 		return -IPSET_ERR_BITMAP_RANGE;
@@ -187,8 +188,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
@@ -278,8 +279,9 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
 		if (cidr >= HOST_MASK)
 			return -IPSET_ERR_INVALID_CIDR;
 		ip_set_mask_from_to(first_ip, last_ip, cidr);
-	} else
+	} else {
 		return -IPSET_ERR_PROTOCOL;
+	}
 
 	if (tb[IPSET_ATTR_NETMASK]) {
 		netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index fe00e87decc8..1430535118fb 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -90,7 +90,7 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
 		return 0;
 	elem = get_elem(map->extensions, e->id, dsize);
 	if (elem->filled == MAC_FILLED)
-		return e->ether == NULL ||
+		return !e->ether ||
 		       ether_addr_equal(e->ether, elem->ether);
 	/* Trigger kernel to fill out the ethernet address */
 	return -EAGAIN;
@@ -131,7 +131,8 @@ bitmap_ipmac_add_timeout(unsigned long *timeout,
 		/* If MAC is unset yet, we store plain timeout value
 		 * because the timer is not activated yet
 		 * and we can reuse it later when MAC is filled out,
-		 * possibly by the kernel */
+		 * possibly by the kernel
+		 */
 		if (e->ether)
 			ip_set_timeout_set(timeout, t);
 		else
@@ -155,7 +156,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
 				/* memcpy isn't atomic */
 				clear_bit(e->id, map->members);
 				smp_mb__after_atomic();
-				memcpy(elem->ether, e->ether, ETH_ALEN);
+				ether_addr_copy(elem->ether, e->ether);
 			}
 			return IPSET_ADD_FAILED;
 		} else if (!e->ether)
@@ -164,19 +165,18 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
 		/* Fill the MAC address and trigger the timer activation */
 		clear_bit(e->id, map->members);
 		smp_mb__after_atomic();
-		memcpy(elem->ether, e->ether, ETH_ALEN);
+		ether_addr_copy(elem->ether, e->ether);
 		elem->filled = MAC_FILLED;
 		return IPSET_ADD_START_STORED_TIMEOUT;
 	} else if (e->ether) {
 		/* We can store MAC too */
-		memcpy(elem->ether, e->ether, ETH_ALEN);
+		ether_addr_copy(elem->ether, e->ether);
 		elem->filled = MAC_FILLED;
 		return 0;
-	} else {
-		elem->filled = MAC_UNSET;
-		/* MAC is not stored yet, don't start timer */
-		return IPSET_ADD_STORE_PLAIN_TIMEOUT;
 	}
+	elem->filled = MAC_UNSET;
+	/* MAC is not stored yet, don't start timer */
+	return IPSET_ADD_STORE_PLAIN_TIMEOUT;
 }
 
 static inline int
@@ -352,8 +352,9 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
 		if (cidr >= HOST_MASK)
 			return -IPSET_ERR_INVALID_CIDR;
 		ip_set_mask_from_to(first_ip, last_ip, cidr);
-	} else
+	} else {
 		return -IPSET_ERR_PROTOCOL;
+	}
 
 	elements = (u64)last_ip - first_ip + 1;
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index 2d360f951d18..5338ccd5da46 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -162,8 +162,9 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
 			if (port < map->first_port)
 				return -IPSET_ERR_BITMAP_RANGE;
 		}
-	} else
+	} else {
 		port_to = port;
+	}
 
 	if (port_to > map->last_port)
 		return -IPSET_ERR_BITMAP_RANGE;
@@ -174,8 +175,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 2b21a1983a98..338b4047776f 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -35,6 +35,7 @@ struct ip_set_net {
 	bool		is_deleted;	/* deleted by ip_set_net_exit */
 	bool		is_destroyed;	/* all sets are destroyed */
 };
+
 static int ip_set_net_id __read_mostly;
 
 static inline struct ip_set_net *ip_set_pernet(struct net *net)
@@ -60,8 +61,7 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 #define ip_set(inst, id)		\
 	ip_set_dereference((inst)->ip_set_list)[id]
 
-/*
- * The set types are implemented in modules and registered set types
+/* The set types are implemented in modules and registered set types
  * can be found in ip_set_type_list. Adding/deleting types is
  * serialized by ip_set_type_mutex.
  */
@@ -131,7 +131,8 @@ __find_set_type_get(const char *name, u8 family, u8 revision,
 		goto unlock;
 	}
 	/* Make sure the type is already loaded
-	 * but we don't support the revision */
+	 * but we don't support the revision
+	 */
 	list_for_each_entry_rcu(type, &ip_set_type_list, list)
 		if (STRNCMP(type->name, name)) {
 			err = -IPSET_ERR_FIND_TYPE;
@@ -290,7 +291,7 @@ static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = {
 int
 ip_set_get_ipaddr4(struct nlattr *nla,  __be32 *ipaddr)
 {
-	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1];
 
 	if (unlikely(!flag_nested(nla)))
 		return -IPSET_ERR_PROTOCOL;
@@ -307,7 +308,7 @@ EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4);
 int
 ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
 {
-	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1];
+	struct nlattr *tb[IPSET_ATTR_IPADDR_MAX + 1];
 
 	if (unlikely(!flag_nested(nla)))
 		return -IPSET_ERR_PROTOCOL;
@@ -318,7 +319,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
 		return -IPSET_ERR_PROTOCOL;
 
 	memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]),
-		sizeof(struct in6_addr));
+	       sizeof(struct in6_addr));
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
@@ -467,8 +468,7 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
 }
 EXPORT_SYMBOL_GPL(ip_set_put_extensions);
 
-/*
- * Creating/destroying/renaming/swapping affect the existence and
+/* Creating/destroying/renaming/swapping affect the existence and
  * the properties of a set. All of these can be executed from userspace
  * only and serialized by the nfnl mutex indirectly from nfnetlink.
  *
@@ -495,8 +495,7 @@ __ip_set_put(struct ip_set *set)
 	write_unlock_bh(&ip_set_ref_lock);
 }
 
-/*
- * Add, del and test set entries from kernel.
+/* Add, del and test set entries from kernel.
  *
  * The set behind the index must exist and must be referenced
  * so it can't be destroyed (or changed) under our foot.
@@ -524,7 +523,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 			dev_net(par->in ? par->in : par->out), index);
 	int ret = 0;
 
-	BUG_ON(set == NULL);
+	BUG_ON(!set);
 	pr_debug("set %s, index %u\n", set->name, index);
 
 	if (opt->dim < set->type->dimension ||
@@ -563,7 +562,7 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 			dev_net(par->in ? par->in : par->out), index);
 	int ret;
 
-	BUG_ON(set == NULL);
+	BUG_ON(!set);
 	pr_debug("set %s, index %u\n", set->name, index);
 
 	if (opt->dim < set->type->dimension ||
@@ -586,7 +585,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 			dev_net(par->in ? par->in : par->out), index);
 	int ret = 0;
 
-	BUG_ON(set == NULL);
+	BUG_ON(!set);
 	pr_debug("set %s, index %u\n", set->name, index);
 
 	if (opt->dim < set->type->dimension ||
@@ -601,8 +600,7 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 }
 EXPORT_SYMBOL_GPL(ip_set_del);
 
-/*
- * Find set by name, reference it once. The reference makes sure the
+/* Find set by name, reference it once. The reference makes sure the
  * thing pointed to, does not go away under our feet.
  *
  */
@@ -616,7 +614,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
 	rcu_read_lock();
 	for (i = 0; i < inst->ip_set_max; i++) {
 		s = rcu_dereference(inst->ip_set_list)[i];
-		if (s != NULL && STRNCMP(s->name, name)) {
+		if (s && STRNCMP(s->name, name)) {
 			__ip_set_get(s);
 			index = i;
 			*set = s;
@@ -629,8 +627,7 @@ ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
 }
 EXPORT_SYMBOL_GPL(ip_set_get_byname);
 
-/*
- * If the given set pointer points to a valid set, decrement
+/* If the given set pointer points to a valid set, decrement
  * reference count by 1. The caller shall not assume the index
  * to be valid, after calling this function.
  *
@@ -643,7 +640,7 @@ __ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
 
 	rcu_read_lock();
 	set = rcu_dereference(inst->ip_set_list)[index];
-	if (set != NULL)
+	if (set)
 		__ip_set_put(set);
 	rcu_read_unlock();
 }
@@ -657,8 +654,7 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index)
 }
 EXPORT_SYMBOL_GPL(ip_set_put_byindex);
 
-/*
- * Get the name of a set behind a set index.
+/* Get the name of a set behind a set index.
  * We assume the set is referenced, so it does exist and
  * can't be destroyed. The set cannot be renamed due to
  * the referencing either.
@@ -669,7 +665,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index)
 {
 	const struct ip_set *set = ip_set_rcu_get(net, index);
 
-	BUG_ON(set == NULL);
+	BUG_ON(!set);
 	BUG_ON(set->ref == 0);
 
 	/* Referenced, so it's safe */
@@ -677,13 +673,11 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index)
 }
 EXPORT_SYMBOL_GPL(ip_set_name_byindex);
 
-/*
- * Routines to call by external subsystems, which do not
+/* Routines to call by external subsystems, which do not
  * call nfnl_lock for us.
  */
 
-/*
- * Find set by index, reference it once. The reference makes sure the
+/* Find set by index, reference it once. The reference makes sure the
  * thing pointed to, does not go away under our feet.
  *
  * The nfnl mutex is used in the function.
@@ -709,8 +703,7 @@ ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
 }
 EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
 
-/*
- * If the given set pointer points to a valid set, decrement
+/* If the given set pointer points to a valid set, decrement
  * reference count by 1. The caller shall not assume the index
  * to be valid, after calling this function.
  *
@@ -725,15 +718,14 @@ ip_set_nfnl_put(struct net *net, ip_set_id_t index)
 	nfnl_lock(NFNL_SUBSYS_IPSET);
 	if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
 		set = ip_set(inst, index);
-		if (set != NULL)
+		if (set)
 			__ip_set_put(set);
 	}
 	nfnl_unlock(NFNL_SUBSYS_IPSET);
 }
 EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
 
-/*
- * Communication protocol with userspace over netlink.
+/* Communication protocol with userspace over netlink.
  *
  * The commands are serialized by the nfnl mutex.
  */
@@ -760,7 +752,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags,
 
 	nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8),
 			sizeof(*nfmsg), flags);
-	if (nlh == NULL)
+	if (!nlh)
 		return NULL;
 
 	nfmsg = nlmsg_data(nlh);
@@ -793,7 +785,7 @@ find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
 	*id = IPSET_INVALID_ID;
 	for (i = 0; i < inst->ip_set_max; i++) {
 		set = ip_set(inst, i);
-		if (set != NULL && STRNCMP(set->name, name)) {
+		if (set && STRNCMP(set->name, name)) {
 			*id = i;
 			break;
 		}
@@ -819,7 +811,7 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
 	*index = IPSET_INVALID_ID;
 	for (i = 0;  i < inst->ip_set_max; i++) {
 		s = ip_set(inst, i);
-		if (s == NULL) {
+		if (!s) {
 			if (*index == IPSET_INVALID_ID)
 				*index = i;
 		} else if (STRNCMP(name, s->name)) {
@@ -851,18 +843,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	struct ip_set_net *inst = ip_set_pernet(net);
 	struct ip_set *set, *clash = NULL;
 	ip_set_id_t index = IPSET_INVALID_ID;
-	struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
+	struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {};
 	const char *name, *typename;
 	u8 family, revision;
 	u32 flags = flag_exist(nlh);
 	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
-		     attr[IPSET_ATTR_SETNAME] == NULL ||
-		     attr[IPSET_ATTR_TYPENAME] == NULL ||
-		     attr[IPSET_ATTR_REVISION] == NULL ||
-		     attr[IPSET_ATTR_FAMILY] == NULL ||
-		     (attr[IPSET_ATTR_DATA] != NULL &&
+		     !attr[IPSET_ATTR_SETNAME] ||
+		     !attr[IPSET_ATTR_TYPENAME] ||
+		     !attr[IPSET_ATTR_REVISION] ||
+		     !attr[IPSET_ATTR_FAMILY] ||
+		     (attr[IPSET_ATTR_DATA] &&
 		      !flag_nested(attr[IPSET_ATTR_DATA]))))
 		return -IPSET_ERR_PROTOCOL;
 
@@ -873,11 +865,10 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n",
 		 name, typename, family_name(family), revision);
 
-	/*
-	 * First, and without any locks, allocate and initialize
+	/* First, and without any locks, allocate and initialize
 	 * a normal base set structure.
 	 */
-	set = kzalloc(sizeof(struct ip_set), GFP_KERNEL);
+	set = kzalloc(sizeof(*set), GFP_KERNEL);
 	if (!set)
 		return -ENOMEM;
 	spin_lock_init(&set->lock);
@@ -885,21 +876,18 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	set->family = family;
 	set->revision = revision;
 
-	/*
-	 * Next, check that we know the type, and take
+	/* Next, check that we know the type, and take
 	 * a reference on the type, to make sure it stays available
 	 * while constructing our new set.
 	 *
 	 * After referencing the type, we try to create the type
 	 * specific part of the set without holding any locks.
 	 */
-	ret = find_set_type_get(typename, family, revision, &(set->type));
+	ret = find_set_type_get(typename, family, revision, &set->type);
 	if (ret)
 		goto out;
 
-	/*
-	 * Without holding any locks, create private part.
-	 */
+	/* Without holding any locks, create private part. */
 	if (attr[IPSET_ATTR_DATA] &&
 	    nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA],
 			     set->type->create_policy)) {
@@ -913,8 +901,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 
 	/* BTW, ret==0 here. */
 
-	/*
-	 * Here, we have a valid, constructed set and we are protected
+	/* Here, we have a valid, constructed set and we are protected
 	 * by the nfnl mutex. Find the first free index in ip_set_list
 	 * and check clashing.
 	 */
@@ -937,7 +924,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 			/* Wraparound */
 			goto cleanup;
 
-		list = kzalloc(sizeof(struct ip_set *) * i, GFP_KERNEL);
+		list = kcalloc(i, sizeof(struct ip_set *), GFP_KERNEL);
 		if (!list)
 			goto cleanup;
 		/* nfnl mutex is held, both lists are valid */
@@ -951,12 +938,11 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		inst->ip_set_max = i;
 		kfree(tmp);
 		ret = 0;
-	} else if (ret)
+	} else if (ret) {
 		goto cleanup;
+	}
 
-	/*
-	 * Finally! Add our shiny new set to the list, and be done.
-	 */
+	/* Finally! Add our shiny new set to the list, and be done. */
 	pr_debug("create: '%s' created with index %u!\n", set->name, index);
 	ip_set(inst, index) = set;
 
@@ -1018,7 +1004,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	if (!attr[IPSET_ATTR_SETNAME]) {
 		for (i = 0; i < inst->ip_set_max; i++) {
 			s = ip_set(inst, i);
-			if (s != NULL && s->ref) {
+			if (s && s->ref) {
 				ret = -IPSET_ERR_BUSY;
 				goto out;
 			}
@@ -1037,7 +1023,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	} else {
 		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
 				    &i);
-		if (s == NULL) {
+		if (!s) {
 			ret = -ENOENT;
 			goto out;
 		} else if (s->ref) {
@@ -1082,12 +1068,12 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
 	if (!attr[IPSET_ATTR_SETNAME]) {
 		for (i = 0; i < inst->ip_set_max; i++) {
 			s = ip_set(inst, i);
-			if (s != NULL)
+			if (s)
 				ip_set_flush_set(s);
 		}
 	} else {
 		s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
-		if (s == NULL)
+		if (!s)
 			return -ENOENT;
 
 		ip_set_flush_set(s);
@@ -1119,12 +1105,12 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
-		     attr[IPSET_ATTR_SETNAME] == NULL ||
-		     attr[IPSET_ATTR_SETNAME2] == NULL))
+		     !attr[IPSET_ATTR_SETNAME] ||
+		     !attr[IPSET_ATTR_SETNAME2]))
 		return -IPSET_ERR_PROTOCOL;
 
 	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
-	if (set == NULL)
+	if (!set)
 		return -ENOENT;
 
 	read_lock_bh(&ip_set_ref_lock);
@@ -1136,7 +1122,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
 	for (i = 0; i < inst->ip_set_max; i++) {
 		s = ip_set(inst, i);
-		if (s != NULL && STRNCMP(s->name, name2)) {
+		if (s && STRNCMP(s->name, name2)) {
 			ret = -IPSET_ERR_EXIST_SETNAME2;
 			goto out;
 		}
@@ -1168,23 +1154,24 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 	char from_name[IPSET_MAXNAMELEN];
 
 	if (unlikely(protocol_failed(attr) ||
-		     attr[IPSET_ATTR_SETNAME] == NULL ||
-		     attr[IPSET_ATTR_SETNAME2] == NULL))
+		     !attr[IPSET_ATTR_SETNAME] ||
+		     !attr[IPSET_ATTR_SETNAME2]))
 		return -IPSET_ERR_PROTOCOL;
 
 	from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
 			       &from_id);
-	if (from == NULL)
+	if (!from)
 		return -ENOENT;
 
 	to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]),
 			     &to_id);
-	if (to == NULL)
+	if (!to)
 		return -IPSET_ERR_EXIST_SETNAME2;
 
 	/* Features must not change.
-	 * Not an artificial restriction anymore, as we must prevent
-	 * possible loops created by swapping in setlist type of sets. */
+	 * Not an artifical restriction anymore, as we must prevent
+	 * possible loops created by swapping in setlist type of sets.
+	 */
 	if (!(from->type->features == to->type->features &&
 	      from->family == to->family))
 		return -IPSET_ERR_TYPE_MISMATCH;
@@ -1246,7 +1233,7 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
 {
 	struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
 	int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
-	struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+	struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
 	struct nlattr *attr = (void *)nlh + min_len;
 	u32 dump_type;
 	ip_set_id_t index;
@@ -1260,16 +1247,18 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst)
 
 		set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
 				      &index);
-		if (set == NULL)
+		if (!set)
 			return -ENOENT;
 
 		dump_type = DUMP_ONE;
 		cb->args[IPSET_CB_INDEX] = index;
-	} else
+	} else {
 		dump_type = DUMP_ALL;
+	}
 
 	if (cda[IPSET_ATTR_FLAGS]) {
 		u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]);
+
 		dump_type |= (f << 16);
 	}
 	cb->args[IPSET_CB_NET] = (unsigned long)inst;
@@ -1295,7 +1284,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 		if (ret < 0) {
 			nlh = nlmsg_hdr(cb->skb);
 			/* We have to create and send the error message
-			 * manually :-( */
+			 * manually :-(
+			 */
 			if (nlh->nlmsg_flags & NLM_F_ACK)
 				netlink_ack(cb->skb, nlh, ret);
 			return ret;
@@ -1313,7 +1303,7 @@ dump_last:
 	pr_debug("dump type, flag: %u %u index: %ld\n",
 		 dump_type, dump_flags, cb->args[IPSET_CB_INDEX]);
 	for (; cb->args[IPSET_CB_INDEX] < max; cb->args[IPSET_CB_INDEX]++) {
-		index = (ip_set_id_t) cb->args[IPSET_CB_INDEX];
+		index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
 		write_lock_bh(&ip_set_ref_lock);
 		set = ip_set(inst, index);
 		is_destroyed = inst->is_destroyed;
@@ -1480,12 +1470,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
 		size_t payload = min(SIZE_MAX,
 				     sizeof(*errmsg) + nlmsg_len(nlh));
 		int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
-		struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
+		struct nlattr *cda[IPSET_ATTR_CMD_MAX + 1];
 		struct nlattr *cmdattr;
 		u32 *errline;
 
 		skb2 = nlmsg_new(payload, GFP_KERNEL);
-		if (skb2 == NULL)
+		if (!skb2)
 			return -ENOMEM;
 		rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid,
 				  nlh->nlmsg_seq, NLMSG_ERROR, payload, 0);
@@ -1502,7 +1492,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
 
 		*errline = lineno;
 
-		netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT);
+		netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid,
+				MSG_DONTWAIT);
 		/* Signal netlink not to send its ACK/errmsg.  */
 		return -EINTR;
 	}
@@ -1517,25 +1508,25 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
 {
 	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
-	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
 	const struct nlattr *nla;
 	u32 flags = flag_exist(nlh);
 	bool use_lineno;
 	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
-		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     !attr[IPSET_ATTR_SETNAME] ||
 		     !((attr[IPSET_ATTR_DATA] != NULL) ^
 		       (attr[IPSET_ATTR_ADT] != NULL)) ||
-		     (attr[IPSET_ATTR_DATA] != NULL &&
+		     (attr[IPSET_ATTR_DATA] &&
 		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
-		     (attr[IPSET_ATTR_ADT] != NULL &&
+		     (attr[IPSET_ATTR_ADT] &&
 		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
-		       attr[IPSET_ATTR_LINENO] == NULL))))
+		       !attr[IPSET_ATTR_LINENO]))))
 		return -IPSET_ERR_PROTOCOL;
 
 	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
-	if (set == NULL)
+	if (!set)
 		return -ENOENT;
 
 	use_lineno = !!attr[IPSET_ATTR_LINENO];
@@ -1572,25 +1563,25 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
 {
 	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
-	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
 	const struct nlattr *nla;
 	u32 flags = flag_exist(nlh);
 	bool use_lineno;
 	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
-		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     !attr[IPSET_ATTR_SETNAME] ||
 		     !((attr[IPSET_ATTR_DATA] != NULL) ^
 		       (attr[IPSET_ATTR_ADT] != NULL)) ||
-		     (attr[IPSET_ATTR_DATA] != NULL &&
+		     (attr[IPSET_ATTR_DATA] &&
 		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
-		     (attr[IPSET_ATTR_ADT] != NULL &&
+		     (attr[IPSET_ATTR_ADT] &&
 		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
-		       attr[IPSET_ATTR_LINENO] == NULL))))
+		       !attr[IPSET_ATTR_LINENO]))))
 		return -IPSET_ERR_PROTOCOL;
 
 	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
-	if (set == NULL)
+	if (!set)
 		return -ENOENT;
 
 	use_lineno = !!attr[IPSET_ATTR_LINENO];
@@ -1627,17 +1618,17 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
 {
 	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
-	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
 	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
-		     attr[IPSET_ATTR_SETNAME] == NULL ||
-		     attr[IPSET_ATTR_DATA] == NULL ||
+		     !attr[IPSET_ATTR_SETNAME] ||
+		     !attr[IPSET_ATTR_DATA] ||
 		     !flag_nested(attr[IPSET_ATTR_DATA])))
 		return -IPSET_ERR_PROTOCOL;
 
 	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
-	if (set == NULL)
+	if (!set)
 		return -ENOENT;
 
 	if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
@@ -1668,15 +1659,15 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
 	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
-		     attr[IPSET_ATTR_SETNAME] == NULL))
+		     !attr[IPSET_ATTR_SETNAME]))
 		return -IPSET_ERR_PROTOCOL;
 
 	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
-	if (set == NULL)
+	if (!set)
 		return -ENOENT;
 
 	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (skb2 == NULL)
+	if (!skb2)
 		return -ENOMEM;
 
 	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1725,8 +1716,8 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb,
 	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
-		     attr[IPSET_ATTR_TYPENAME] == NULL ||
-		     attr[IPSET_ATTR_FAMILY] == NULL))
+		     !attr[IPSET_ATTR_TYPENAME] ||
+		     !attr[IPSET_ATTR_FAMILY]))
 		return -IPSET_ERR_PROTOCOL;
 
 	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
@@ -1736,7 +1727,7 @@ ip_set_type(struct sock *ctnl, struct sk_buff *skb,
 		return ret;
 
 	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (skb2 == NULL)
+	if (!skb2)
 		return -ENOMEM;
 
 	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1781,11 +1772,11 @@ ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
 	struct nlmsghdr *nlh2;
 	int ret = 0;
 
-	if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL))
+	if (unlikely(!attr[IPSET_ATTR_PROTOCOL]))
 		return -IPSET_ERR_PROTOCOL;
 
 	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (skb2 == NULL)
+	if (!skb2)
 		return -ENOMEM;
 
 	nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
@@ -1913,7 +1904,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 		ret = -EFAULT;
 		goto done;
 	}
-	op = (unsigned int *) data;
+	op = (unsigned int *)data;
 
 	if (*op < IP_SET_OP_VERSION) {
 		/* Check the version at the beginning of operations */
@@ -2025,7 +2016,7 @@ ip_set_net_init(struct net *net)
 	if (inst->ip_set_max >= IPSET_INVALID_ID)
 		inst->ip_set_max = IPSET_INVALID_ID - 1;
 
-	list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);
+	list = kcalloc(inst->ip_set_max, sizeof(struct ip_set *), GFP_KERNEL);
 	if (!list)
 		return -ENOMEM;
 	inst->is_deleted = false;
@@ -2061,11 +2052,11 @@ static struct pernet_operations ip_set_net_ops = {
 	.size	= sizeof(struct ip_set_net)
 };
 
-
 static int __init
 ip_set_init(void)
 {
 	int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+
 	if (ret != 0) {
 		pr_err("ip_set: cannot register with nfnetlink.\n");
 		return ret;
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 1981f021cc60..42c3e3ba1b94 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -30,7 +30,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
 		const struct tcphdr *th;
 
 		th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph);
-		if (th == NULL)
+		if (!th)
 			/* No choice either */
 			return false;
 
@@ -42,7 +42,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
 		const sctp_sctphdr_t *sh;
 
 		sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh);
-		if (sh == NULL)
+		if (!sh)
 			/* No choice either */
 			return false;
 
@@ -55,7 +55,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
 		const struct udphdr *uh;
 
 		uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph);
-		if (uh == NULL)
+		if (!uh)
 			/* No choice either */
 			return false;
 
@@ -67,7 +67,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
 		const struct icmphdr *ic;
 
 		ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
-		if (ic == NULL)
+		if (!ic)
 			return false;
 
 		*port = (__force __be16)htons((ic->type << 8) | ic->code);
@@ -78,7 +78,7 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
 		const struct icmp6hdr *ic;
 
 		ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich);
-		if (ic == NULL)
+		if (!ic)
 			return false;
 
 		*port = (__force __be16)
@@ -116,7 +116,8 @@ ip_set_get_ip4_port(const struct sk_buff *skb, bool src,
 			return false;
 		default:
 			/* Other protocols doesn't have ports,
-			   so we can match fragments */
+			 * so we can match fragments.
+			 */
 			*proto = protocol;
 			return true;
 		}
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index f352cc022010..afe905c208af 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -35,7 +35,7 @@
 /* Number of elements to store in an initial array block */
 #define AHASH_INIT_SIZE			4
 /* Max number of elements to store in an array block */
-#define AHASH_MAX_SIZE			(3*AHASH_INIT_SIZE)
+#define AHASH_MAX_SIZE			(3 * AHASH_INIT_SIZE)
 /* Max muber of elements in the array block when tuned */
 #define AHASH_MAX_TUNED			64
 
@@ -57,6 +57,7 @@ tune_ahash_max(u8 curr, u32 multi)
 	 */
 	return n > curr && n <= AHASH_MAX_TUNED ? n : curr;
 }
+
 #define TUNE_AHASH_MAX(h, multi)	\
 	((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
 #else
@@ -256,7 +257,7 @@ htable_bits(u32 hashsize)
 #endif
 
 #define HKEY(data, initval, htable_bits)			\
-(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval)	\
+(jhash2((u32 *)(data), HKEY_DATALEN / sizeof(u32), initval)	\
 	& jhash_mask(htable_bits))
 
 #ifndef htype
@@ -299,11 +300,11 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
 
 	/* Add in increasing prefix order, so larger cidr first */
 	for (i = 0, j = -1; i < nets_length && h->nets[i].cidr[n]; i++) {
-		if (j != -1)
+		if (j != -1) {
 			continue;
-		else if (h->nets[i].cidr[n] < cidr)
+		} else if (h->nets[i].cidr[n] < cidr) {
 			j = i;
-		else if (h->nets[i].cidr[n] == cidr) {
+		} else if (h->nets[i].cidr[n] == cidr) {
 			h->nets[cidr - 1].nets[n]++;
 			return;
 		}
@@ -322,15 +323,15 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
 	u8 i, j, net_end = nets_length - 1;
 
 	for (i = 0; i < nets_length; i++) {
-	        if (h->nets[i].cidr[n] != cidr)
-	                continue;
+		if (h->nets[i].cidr[n] != cidr)
+			continue;
 		h->nets[cidr - 1].nets[n]--;
 		if (h->nets[cidr - 1].nets[n] > 0)
-                        return;
+			return;
 		for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
-		        h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
+			h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
 		h->nets[j].cidr[n] = 0;
-                return;
+		return;
 	}
 }
 #endif
@@ -426,8 +427,8 @@ mtype_destroy(struct ip_set *set)
 	if (SET_WITH_TIMEOUT(set))
 		del_timer_sync(&h->gc);
 
-	mtype_ahash_destroy(set, __ipset_dereference_protected(h->table, 1),
-			    true);
+	mtype_ahash_destroy(set,
+			    __ipset_dereference_protected(h->table, 1), true);
 	kfree(h);
 
 	set->data = NULL;
@@ -439,7 +440,7 @@ mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
 	struct htype *h = set->data;
 
 	init_timer(&h->gc);
-	h->gc.data = (unsigned long) set;
+	h->gc.data = (unsigned long)set;
 	h->gc.function = gc;
 	h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
 	add_timer(&h->gc);
@@ -530,7 +531,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
 static void
 mtype_gc(unsigned long ul_set)
 {
-	struct ip_set *set = (struct ip_set *) ul_set;
+	struct ip_set *set = (struct ip_set *)ul_set;
 	struct htype *h = set->data;
 
 	pr_debug("called\n");
@@ -544,7 +545,8 @@ mtype_gc(unsigned long ul_set)
 
 /* Resize a hash: create a new hash table with doubling the hashsize
  * and inserting the elements to it. Repeat until we succeed or
- * fail due to memory pressures. */
+ * fail due to memory pressures.
+ */
 static int
 mtype_resize(struct ip_set *set, bool retried)
 {
@@ -687,7 +689,8 @@ cleanup:
 }
 
 /* Add an element to a hash and update the internal counters when succeeded,
- * otherwise report the proper error code. */
+ * otherwise report the proper error code.
+ */
 static int
 mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	  struct ip_set_ext *mext, u32 flags)
@@ -926,7 +929,8 @@ mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
 
 #ifdef IP_SET_HASH_WITH_NETS
 /* Special test function which takes into account the different network
- * sizes added to the set */
+ * sizes added to the set
+ */
 static int
 mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
 		 const struct ip_set_ext *ext,
@@ -1004,7 +1008,8 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	t = rcu_dereference_bh(h->table);
 #ifdef IP_SET_HASH_WITH_NETS
 	/* If we test an IP address and not a network address,
-	 * try all possible network sizes */
+	 * try all possible network sizes
+	 */
 	for (i = 0; i < IPSET_NET_COUNT; i++)
 		if (DCIDR_GET(d->cidr, i) != SET_HOST_MASK(set->family))
 			break;
@@ -1148,8 +1153,8 @@ mtype_list(const struct ip_set *set,
 					nla_nest_cancel(skb, atd);
 					ret = -EMSGSIZE;
 					goto out;
-				} else
-					goto nla_put_failure;
+				}
+				goto nla_put_failure;
 			}
 			if (mtype_data_list(skb, e))
 				goto nla_put_failure;
@@ -1171,8 +1176,9 @@ nla_put_failure:
 			set->name);
 		cb->args[IPSET_CB_ARG0] = 0;
 		ret = -EMSGSIZE;
-	} else
+	} else {
 		ipset_nest_end(skb, atd);
+	}
 out:
 	rcu_read_unlock();
 	return ret;
@@ -1180,12 +1186,13 @@ out:
 
 static int
 IPSET_TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
-	    const struct xt_action_param *par,
-	    enum ipset_adt adt, struct ip_set_adt_opt *opt);
+			  const struct xt_action_param *par,
+			  enum ipset_adt adt, struct ip_set_adt_opt *opt);
 
 static int
 IPSET_TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
-	    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);
+			  enum ipset_adt adt, u32 *lineno, u32 flags,
+			  bool retried);
 
 static const struct ip_set_type_variant mtype_variant = {
 	.kadt	= mtype_kadt,
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index f54d7069d633..9d6bf19f7b78 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -158,8 +158,8 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index f8fbc325ad34..a0695a2ab585 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -155,8 +155,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
@@ -206,7 +206,6 @@ hash_ipmark6_data_next(struct hash_ipmark4_elem *next,
 #define IP_SET_EMIT_CREATE
 #include "ip_set_hash_gen.h"
 
-
 static int
 hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb,
 		  const struct xt_action_param *par,
@@ -268,10 +267,8 @@ hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[],
 	ret = adtfn(set, &e, &ext, &ext, flags);
 	if (ret && !ip_set_eexist(ret, flags))
 		return ret;
-	else
-		ret = 0;
 
-	return ret;
+	return 0;
 }
 
 static struct ip_set_type hash_ipmark_type __read_mostly = {
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index 9a31db8ccca6..9d84b3dff603 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -140,8 +140,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMP))
 		e.port = 0;
@@ -187,8 +188,8 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 			if (ret && !ip_set_eexist(ret, flags))
 				return ret;
-			else
-				ret = 0;
+
+			ret = 0;
 		}
 	}
 	return ret;
@@ -305,8 +306,9 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
 		e.port = 0;
@@ -329,8 +331,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index fc42489f8795..215b7b942038 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -63,7 +63,7 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
 
 static bool
 hash_ipportip4_data_list(struct sk_buff *skb,
-		       const struct hash_ipportip4_elem *data)
+			 const struct hash_ipportip4_elem *data)
 {
 	if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
 	    nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) ||
@@ -147,8 +147,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMP))
 		e.port = 0;
@@ -194,8 +195,8 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 			if (ret && !ip_set_eexist(ret, flags))
 				return ret;
-			else
-				ret = 0;
+
+			ret = 0;
 		}
 	}
 	return ret;
@@ -320,8 +321,9 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
 		e.port = 0;
@@ -344,8 +346,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 2a69b9bf66b8..9ca719625ea3 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -209,14 +209,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMP))
 		e.port = 0;
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
@@ -263,8 +265,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(ip2_from, ip2_to);
 		if (ip2_from + UINT_MAX == ip2_to)
 			return -IPSET_ERR_HASH_RANGE;
-	} else
+	} else {
 		ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1);
+	}
 
 	if (retried)
 		ip = ntohl(h->next.ip);
@@ -287,8 +290,8 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 				if (ret && !ip_set_eexist(ret, flags))
 					return ret;
-				else
-					ret = 0;
+
+				ret = 0;
 				ip2 = ip2_last + 1;
 			}
 		}
@@ -466,14 +469,16 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
 		e.port = 0;
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
@@ -497,8 +502,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 112aff3cda96..f1e7d2c0f685 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -89,10 +89,10 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb,
 		return 0;
 
 	if (skb_mac_header(skb) < skb->head ||
-	     (skb_mac_header(skb) + ETH_HLEN) > skb->data)
+	    (skb_mac_header(skb) + ETH_HLEN) > skb->data)
 		return -EINVAL;
 
-	memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
+	ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
 	if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
 		return -EINVAL;
 	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
@@ -116,7 +116,7 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ret = ip_set_get_extensions(set, tb, &ext);
 	if (ret)
 		return ret;
-	memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+	ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]));
 	if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
 		return -IPSET_ERR_HASH_ELEM;
 
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index e49b1d010d30..3e4bffdc1cc0 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -169,6 +169,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
@@ -176,7 +177,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
 		e.ip = htonl(ip & ip_set_hostmask(e.cidr));
 		ret = adtfn(set, &e, &ext, &ext, flags);
-		return ip_set_enomatch(ret, flags, adt, set) ? -ret:
+		return ip_set_enomatch(ret, flags, adt, set) ? -ret :
 		       ip_set_eexist(ret, flags) ? 0 : ret;
 	}
 
@@ -198,8 +199,8 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 		ret = adtfn(set, &e, &ext, &ext, flags);
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 		ip = last + 1;
 	}
 	return ret;
@@ -339,6 +340,7 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 42c893e08842..43d8c9896fa3 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -143,7 +143,7 @@ static const char *get_physindev_name(const struct sk_buff *skb)
 	return dev ? dev->name : NULL;
 }
 
-static const char *get_phyoutdev_name(const struct sk_buff *skb)
+static const char *get_physoutdev_name(const struct sk_buff *skb)
 {
 	struct net_device *dev = nf_bridge_get_physoutdev(skb);
 
@@ -178,15 +178,16 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 		const char *eiface = SRCDIR ? get_physindev_name(skb) :
-					      get_phyoutdev_name(skb);
+					      get_physoutdev_name(skb);
 
 		if (!eiface)
 			return -EINVAL;
 		STRLCPY(e.iface, eiface);
 		e.physdev = 1;
 #endif
-	} else
+	} else {
 		STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+	}
 
 	if (strlen(e.iface) == 0)
 		return -EINVAL;
@@ -229,6 +230,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_PHYSDEV)
 			e.physdev = 1;
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
@@ -249,8 +251,9 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(ip, ip_to);
 		if (ip + UINT_MAX == ip_to)
 			return -IPSET_ERR_HASH_RANGE;
-	} else
+	} else {
 		ip_set_mask_from_to(ip, ip_to, e.cidr);
+	}
 
 	if (retried)
 		ip = ntohl(h->next.ip);
@@ -261,8 +264,8 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 		ip = last + 1;
 	}
 	return ret;
@@ -385,15 +388,16 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 		const char *eiface = SRCDIR ? get_physindev_name(skb) :
-					      get_phyoutdev_name(skb);
+					      get_physoutdev_name(skb);
+
 		if (!eiface)
 			return -EINVAL;
-
 		STRLCPY(e.iface, eiface);
 		e.physdev = 1;
 #endif
-	} else
+	} else {
 		STRLCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out));
+	}
 
 	if (strlen(e.iface) == 0)
 		return -EINVAL;
@@ -403,7 +407,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
-		   enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+		    enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 };
@@ -440,6 +444,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_PHYSDEV)
 			e.physdev = 1;
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index b5428be1f159..3c862c0a76d1 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -57,8 +57,8 @@ struct hash_netnet4_elem {
 
 static inline bool
 hash_netnet4_data_equal(const struct hash_netnet4_elem *ip1,
-		     const struct hash_netnet4_elem *ip2,
-		     u32 *multi)
+			const struct hash_netnet4_elem *ip2,
+			u32 *multi)
 {
 	return ip1->ipcmp == ip2->ipcmp &&
 	       ip1->ccmp == ip2->ccmp;
@@ -84,7 +84,7 @@ hash_netnet4_data_reset_flags(struct hash_netnet4_elem *elem, u8 *flags)
 
 static inline void
 hash_netnet4_data_reset_elem(struct hash_netnet4_elem *elem,
-			  struct hash_netnet4_elem *orig)
+			     struct hash_netnet4_elem *orig)
 {
 	elem->ip[1] = orig->ip[1];
 }
@@ -103,7 +103,7 @@ hash_netnet4_data_netmask(struct hash_netnet4_elem *elem, u8 cidr, bool inner)
 
 static bool
 hash_netnet4_data_list(struct sk_buff *skb,
-		    const struct hash_netnet4_elem *data)
+		       const struct hash_netnet4_elem *data)
 {
 	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
 
@@ -122,7 +122,7 @@ nla_put_failure:
 
 static inline void
 hash_netnet4_data_next(struct hash_netnet4_elem *next,
-		    const struct hash_netnet4_elem *d)
+		       const struct hash_netnet4_elem *d)
 {
 	next->ipcmp = d->ipcmp;
 }
@@ -133,8 +133,8 @@ hash_netnet4_data_next(struct hash_netnet4_elem *next,
 
 static int
 hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
-	       const struct xt_action_param *par,
-	       enum ipset_adt adt, struct ip_set_adt_opt *opt)
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
 {
 	const struct hash_netnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -156,7 +156,7 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
-	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct hash_netnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -199,6 +199,7 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
@@ -221,8 +222,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(ip, ip_to);
 		if (unlikely(ip + UINT_MAX == ip_to))
 			return -IPSET_ERR_HASH_RANGE;
-	} else
+	} else {
 		ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+	}
 
 	ip2_to = ip2_from;
 	if (tb[IPSET_ATTR_IP2_TO]) {
@@ -233,8 +235,9 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(ip2_from, ip2_to);
 		if (unlikely(ip2_from + UINT_MAX == ip2_to))
 			return -IPSET_ERR_HASH_RANGE;
-	} else
+	} else {
 		ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+	}
 
 	if (retried)
 		ip = ntohl(h->next.ip[0]);
@@ -251,8 +254,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			ret = adtfn(set, &e, &ext, &ext, flags);
 			if (ret && !ip_set_eexist(ret, flags))
 				return ret;
-			else
-				ret = 0;
+
+			ret = 0;
 			ip2 = last2 + 1;
 		}
 		ip = last + 1;
@@ -276,8 +279,8 @@ struct hash_netnet6_elem {
 
 static inline bool
 hash_netnet6_data_equal(const struct hash_netnet6_elem *ip1,
-		     const struct hash_netnet6_elem *ip2,
-		     u32 *multi)
+			const struct hash_netnet6_elem *ip2,
+			u32 *multi)
 {
 	return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
 	       ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
@@ -304,7 +307,7 @@ hash_netnet6_data_reset_flags(struct hash_netnet6_elem *elem, u8 *flags)
 
 static inline void
 hash_netnet6_data_reset_elem(struct hash_netnet6_elem *elem,
-			  struct hash_netnet6_elem *orig)
+			     struct hash_netnet6_elem *orig)
 {
 	elem->ip[1] = orig->ip[1];
 }
@@ -323,7 +326,7 @@ hash_netnet6_data_netmask(struct hash_netnet6_elem *elem, u8 cidr, bool inner)
 
 static bool
 hash_netnet6_data_list(struct sk_buff *skb,
-		    const struct hash_netnet6_elem *data)
+		       const struct hash_netnet6_elem *data)
 {
 	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
 
@@ -342,7 +345,7 @@ nla_put_failure:
 
 static inline void
 hash_netnet6_data_next(struct hash_netnet4_elem *next,
-		    const struct hash_netnet6_elem *d)
+		       const struct hash_netnet6_elem *d)
 {
 }
 
@@ -356,8 +359,8 @@ hash_netnet6_data_next(struct hash_netnet4_elem *next,
 
 static int
 hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
-	       const struct xt_action_param *par,
-	       enum ipset_adt adt, struct ip_set_adt_opt *opt)
+		  const struct xt_action_param *par,
+		  enum ipset_adt adt, struct ip_set_adt_opt *opt)
 {
 	const struct hash_netnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -367,7 +370,7 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 	e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK);
 	e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK);
 	if (adt == IPSET_TEST)
-		e.ccmp = (HOST_MASK << (sizeof(u8)*8)) | HOST_MASK;
+		e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK;
 
 	ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6);
 	ip6addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1].in6);
@@ -379,7 +382,7 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
-	       enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+		  enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netnet6_elem e = { .cidr = { HOST_MASK, HOST_MASK, }, };
@@ -424,6 +427,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 27307d0a8a5d..731813e0f08c 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -198,8 +198,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMP))
 		e.port = 0;
@@ -208,6 +209,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
@@ -233,8 +235,9 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(ip, ip_to);
 		if (ip + UINT_MAX == ip_to)
 			return -IPSET_ERR_HASH_RANGE;
-	} else
+	} else {
 		ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
+	}
 
 	if (retried)
 		ip = ntohl(h->next.ip);
@@ -250,8 +253,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 			if (ret && !ip_set_eexist(ret, flags))
 				return ret;
-			else
-				ret = 0;
+
+			ret = 0;
 		}
 		ip = last + 1;
 	}
@@ -413,14 +416,16 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
 		e.port = 0;
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
@@ -444,8 +449,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 1e0e47ae40a4..0c68734f5cc4 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -62,8 +62,8 @@ struct hash_netportnet4_elem {
 
 static inline bool
 hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1,
-			   const struct hash_netportnet4_elem *ip2,
-			   u32 *multi)
+			    const struct hash_netportnet4_elem *ip2,
+			    u32 *multi)
 {
 	return ip1->ipcmp == ip2->ipcmp &&
 	       ip1->ccmp == ip2->ccmp &&
@@ -91,7 +91,7 @@ hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags)
 
 static inline void
 hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem,
-				struct hash_netportnet4_elem *orig)
+				 struct hash_netportnet4_elem *orig)
 {
 	elem->ip[1] = orig->ip[1];
 }
@@ -111,7 +111,7 @@ hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem,
 
 static bool
 hash_netportnet4_data_list(struct sk_buff *skb,
-			  const struct hash_netportnet4_elem *data)
+			   const struct hash_netportnet4_elem *data)
 {
 	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
 
@@ -132,7 +132,7 @@ nla_put_failure:
 
 static inline void
 hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
-			  const struct hash_netportnet4_elem *d)
+			   const struct hash_netportnet4_elem *d)
 {
 	next->ipcmp = d->ipcmp;
 	next->port = d->port;
@@ -144,8 +144,8 @@ hash_netportnet4_data_next(struct hash_netportnet4_elem *next,
 
 static int
 hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
-		     const struct xt_action_param *par,
-		     enum ipset_adt adt, struct ip_set_adt_opt *opt)
+		      const struct xt_action_param *par,
+		      enum ipset_adt adt, struct ip_set_adt_opt *opt)
 {
 	const struct hash_netportnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -171,7 +171,7 @@ hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
-		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+		      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct hash_netportnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -223,14 +223,16 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMP))
 		e.port = 0;
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
@@ -254,8 +256,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(ip, ip_to);
 		if (unlikely(ip + UINT_MAX == ip_to))
 			return -IPSET_ERR_HASH_RANGE;
-	} else
+	} else {
 		ip_set_mask_from_to(ip, ip_to, e.cidr[0]);
+	}
 
 	port_to = port = ntohs(e.port);
 	if (tb[IPSET_ATTR_PORT_TO]) {
@@ -273,8 +276,9 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(ip2_from, ip2_to);
 		if (unlikely(ip2_from + UINT_MAX == ip2_to))
 			return -IPSET_ERR_HASH_RANGE;
-	} else
+	} else {
 		ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
+	}
 
 	if (retried)
 		ip = ntohl(h->next.ip[0]);
@@ -296,8 +300,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 				ret = adtfn(set, &e, &ext, &ext, flags);
 				if (ret && !ip_set_eexist(ret, flags))
 					return ret;
-				else
-					ret = 0;
+
+				ret = 0;
 				ip2 = ip2_last + 1;
 			}
 		}
@@ -324,8 +328,8 @@ struct hash_netportnet6_elem {
 
 static inline bool
 hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1,
-			   const struct hash_netportnet6_elem *ip2,
-			   u32 *multi)
+			    const struct hash_netportnet6_elem *ip2,
+			    u32 *multi)
 {
 	return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) &&
 	       ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) &&
@@ -354,7 +358,7 @@ hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags)
 
 static inline void
 hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem,
-				struct hash_netportnet6_elem *orig)
+				 struct hash_netportnet6_elem *orig)
 {
 	elem->ip[1] = orig->ip[1];
 }
@@ -374,7 +378,7 @@ hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem,
 
 static bool
 hash_netportnet6_data_list(struct sk_buff *skb,
-			  const struct hash_netportnet6_elem *data)
+			   const struct hash_netportnet6_elem *data)
 {
 	u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
 
@@ -395,7 +399,7 @@ nla_put_failure:
 
 static inline void
 hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
-			  const struct hash_netportnet6_elem *d)
+			   const struct hash_netportnet6_elem *d)
 {
 	next->port = d->port;
 }
@@ -410,8 +414,8 @@ hash_netportnet6_data_next(struct hash_netportnet4_elem *next,
 
 static int
 hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
-		     const struct xt_action_param *par,
-		     enum ipset_adt adt, struct ip_set_adt_opt *opt)
+		      const struct xt_action_param *par,
+		      enum ipset_adt adt, struct ip_set_adt_opt *opt)
 {
 	const struct hash_netportnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -437,7 +441,7 @@ hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
 
 static int
 hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
-		     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
+		      enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
 {
 	const struct hash_netportnet *h = set->data;
 	ipset_adtfn adtfn = set->variant->adt[adt];
@@ -493,14 +497,16 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (e.proto == 0)
 			return -IPSET_ERR_INVALID_PROTO;
-	} else
+	} else {
 		return -IPSET_ERR_MISSING_PROTO;
+	}
 
 	if (!(with_ports || e.proto == IPPROTO_ICMPV6))
 		e.port = 0;
 
 	if (tb[IPSET_ATTR_CADT_FLAGS]) {
 		u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+
 		if (cadt_flags & IPSET_FLAG_NOMATCH)
 			flags |= (IPSET_FLAG_NOMATCH << 16);
 	}
@@ -524,8 +530,8 @@ hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
 
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
-		else
-			ret = 0;
+
+		ret = 0;
 	}
 	return ret;
 }
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 9f624ee9a41e..a1fe5377a2b3 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -206,14 +206,15 @@ list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 			continue;
 		}
 
-		if (d->before == 0)
+		if (d->before == 0) {
 			ret = 1;
-		else if (d->before > 0) {
+		} else if (d->before > 0) {
 			next = list_next_entry(e, list);
 			ret = !list_is_last(&e->list, &map->members) &&
 			      next->id == d->refid;
-		} else
+		} else {
 			ret = prev && prev->id == d->refid;
+		}
 		return ret;
 	}
 	return 0;
@@ -558,7 +559,7 @@ static const struct ip_set_type_variant set_variant = {
 static void
 list_set_gc(unsigned long ul_set)
 {
-	struct ip_set *set = (struct ip_set *) ul_set;
+	struct ip_set *set = (struct ip_set *)ul_set;
 	struct list_set *map = set->data;
 
 	spin_lock_bh(&set->lock);
@@ -575,7 +576,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
 	struct list_set *map = set->data;
 
 	init_timer(&map->gc);
-	map->gc.data = (unsigned long) set;
+	map->gc.data = (unsigned long)set;
 	map->gc.function = gc;
 	map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
 	add_timer(&map->gc);
diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c
index 04d15fdc99ee..1c8a42c1056c 100644
--- a/net/netfilter/ipset/pfxlen.c
+++ b/net/netfilter/ipset/pfxlen.c
@@ -1,9 +1,7 @@
 #include <linux/export.h>
 #include <linux/netfilter/ipset/pfxlen.h>
 
-/*
- * Prefixlen maps for fast conversions, by Jan Engelhardt.
- */
+/* Prefixlen maps for fast conversions, by Jan Engelhardt. */
 
 #define E(a, b, c, d) \
 	{.ip6 = { \
@@ -11,8 +9,7 @@
 		htonl(c), htonl(d), \
 	} }
 
-/*
- * This table works for both IPv4 and IPv6;
+/* This table works for both IPv4 and IPv6;
  * just use prefixlen_netmask_map[prefixlength].ip.
  */
 const union nf_inet_addr ip_set_netmask_map[] = {
@@ -149,13 +146,12 @@ const union nf_inet_addr ip_set_netmask_map[] = {
 EXPORT_SYMBOL_GPL(ip_set_netmask_map);
 
 #undef  E
-#define E(a, b, c, d)						\
-	{.ip6 = { (__force __be32) a, (__force __be32) b,	\
-		  (__force __be32) c, (__force __be32) d,	\
+#define E(a, b, c, d)					\
+	{.ip6 = { (__force __be32)a, (__force __be32)b,	\
+		  (__force __be32)c, (__force __be32)d,	\
 	} }
 
-/*
- * This table works for both IPv4 and IPv6;
+/* This table works for both IPv4 and IPv6;
  * just use prefixlen_hostmask_map[prefixlength].ip.
  */
 const union nf_inet_addr ip_set_hostmask_map[] = {
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index b103e9627716..5669e5b453f4 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -9,7 +9,8 @@
  */
 
 /* Kernel module which implements the set match and SET target
- * for netfilter/iptables. */
+ * for netfilter/iptables.
+ */
 
 #include <linux/module.h>
 #include <linux/skbuff.h>
@@ -53,6 +54,7 @@ static bool
 set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v0 *info = par->matchinfo;
+
 	ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
 		info->match_set.u.compat.flags, 0, UINT_MAX);
 
@@ -69,10 +71,10 @@ compat_flags(struct xt_set_info_v0 *info)
 	info->u.compat.dim = IPSET_DIM_ZERO;
 	if (info->u.flags[0] & IPSET_MATCH_INV)
 		info->u.compat.flags |= IPSET_INV_MATCH;
-	for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) {
+	for (i = 0; i < IPSET_DIM_MAX - 1 && info->u.flags[i]; i++) {
 		info->u.compat.dim++;
 		if (info->u.flags[i] & IPSET_SRC)
-			info->u.compat.flags |= (1<<info->u.compat.dim);
+			info->u.compat.flags |= (1 << info->u.compat.dim);
 	}
 }
 
@@ -89,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
 			info->match_set.index);
 		return -ENOENT;
 	}
-	if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+	if (info->match_set.u.flags[IPSET_DIM_MAX - 1] != 0) {
 		pr_warn("Protocol error: set match dimension is over the limit!\n");
 		ip_set_nfnl_put(par->net, info->match_set.index);
 		return -ERANGE;
@@ -115,6 +117,7 @@ static bool
 set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v1 *info = par->matchinfo;
+
 	ADT_OPT(opt, par->family, info->match_set.dim,
 		info->match_set.flags, 0, UINT_MAX);
 
@@ -179,9 +182,10 @@ static bool
 set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v3 *info = par->matchinfo;
+	int ret;
+
 	ADT_OPT(opt, par->family, info->match_set.dim,
 		info->match_set.flags, info->flags, UINT_MAX);
-	int ret;
 
 	if (info->packets.op != IPSET_COUNTER_NONE ||
 	    info->bytes.op != IPSET_COUNTER_NONE)
@@ -225,9 +229,10 @@ static bool
 set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v4 *info = par->matchinfo;
+	int ret;
+
 	ADT_OPT(opt, par->family, info->match_set.dim,
 		info->match_set.flags, info->flags, UINT_MAX);
-	int ret;
 
 	if (info->packets.op != IPSET_COUNTER_NONE ||
 	    info->bytes.op != IPSET_COUNTER_NONE)
@@ -253,6 +258,7 @@ static unsigned int
 set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_set_info_target_v0 *info = par->targinfo;
+
 	ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
 		info->add_set.u.compat.flags, 0, UINT_MAX);
 	ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
@@ -291,8 +297,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 			return -ENOENT;
 		}
 	}
-	if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 ||
-	    info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
+	if (info->add_set.u.flags[IPSET_DIM_MAX - 1] != 0 ||
+	    info->del_set.u.flags[IPSET_DIM_MAX - 1] != 0) {
 		pr_warn("Protocol error: SET target dimension is over the limit!\n");
 		if (info->add_set.index != IPSET_INVALID_ID)
 			ip_set_nfnl_put(par->net, info->add_set.index);
@@ -325,6 +331,7 @@ static unsigned int
 set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_set_info_target_v1 *info = par->targinfo;
+
 	ADT_OPT(add_opt, par->family, info->add_set.dim,
 		info->add_set.flags, 0, UINT_MAX);
 	ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -393,6 +400,7 @@ static unsigned int
 set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_set_info_target_v2 *info = par->targinfo;
+
 	ADT_OPT(add_opt, par->family, info->add_set.dim,
 		info->add_set.flags, info->flags, info->timeout);
 	ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -400,8 +408,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
 
 	/* Normalize to fit into jiffies */
 	if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
-	    add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
-		add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+	    add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC)
+		add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC;
 	if (info->add_set.index != IPSET_INVALID_ID)
 		ip_set_add(info->add_set.index, skb, par, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)
@@ -419,6 +427,8 @@ static unsigned int
 set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_set_info_target_v3 *info = par->targinfo;
+	int ret;
+
 	ADT_OPT(add_opt, par->family, info->add_set.dim,
 		info->add_set.flags, info->flags, info->timeout);
 	ADT_OPT(del_opt, par->family, info->del_set.dim,
@@ -426,12 +436,10 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
 	ADT_OPT(map_opt, par->family, info->map_set.dim,
 		info->map_set.flags, 0, UINT_MAX);
 
-	int ret;
-
 	/* Normalize to fit into jiffies */
 	if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
-	    add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
-		add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
+	    add_opt.ext.timeout > UINT_MAX / MSEC_PER_SEC)
+		add_opt.ext.timeout = UINT_MAX / MSEC_PER_SEC;
 	if (info->add_set.index != IPSET_INVALID_ID)
 		ip_set_add(info->add_set.index, skb, par, &add_opt);
 	if (info->del_set.index != IPSET_INVALID_ID)
@@ -457,7 +465,6 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-
 static int
 set_target_v3_checkentry(const struct xt_tgchk_param *par)
 {
@@ -497,8 +504,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
 		     !(par->hook_mask & (1 << NF_INET_FORWARD |
 					 1 << NF_INET_LOCAL_OUT |
 					 1 << NF_INET_POST_ROUTING))) {
-			pr_warn("mapping of prio or/and queue is allowed only"
-				"from OUTPUT/FORWARD/POSTROUTING chains\n");
+			pr_warn("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n");
 			return -EINVAL;
 		}
 		index = ip_set_nfnl_get_byindex(par->net,
@@ -519,8 +525,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par)
 	if (info->add_set.dim > IPSET_DIM_MAX ||
 	    info->del_set.dim > IPSET_DIM_MAX ||
 	    info->map_set.dim > IPSET_DIM_MAX) {
-		pr_warn("Protocol error: SET target dimension "
-			"is over the limit!\n");
+		pr_warn("Protocol error: SET target dimension is over the limit!\n");
 		if (info->add_set.index != IPSET_INVALID_ID)
 			ip_set_nfnl_put(par->net, info->add_set.index);
 		if (info->del_set.index != IPSET_INVALID_ID)
@@ -546,7 +551,6 @@ set_target_v3_destroy(const struct xt_tgdtor_param *par)
 		ip_set_nfnl_put(par->net, info->map_set.index);
 }
 
-
 static struct xt_match set_matches[] __read_mostly = {
 	{
 		.name		= "set",
-- 
cgit v1.2.3


From c751ad0dd640f4ce9269acd7a54de5ba8092e99e Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 12 Jun 2015 15:48:06 -0700
Subject: regulator: Add docbook for soft start

The docbook for these members is missing. Add them.

Warning(include/linux/regulator/machine.h:147): No description
found for parameter 'soft_start'
Warning(include/linux/regulator/driver.h:197): No description
found for parameter 'set_soft_start'

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/driver.h  | 1 +
 include/linux/regulator/machine.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index e0635d0894aa..9398d31f9531 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -111,6 +111,7 @@ struct regulator_linear_range {
  *               to stabilise after being set to a new value, in microseconds.
  *               The function provides the from and to voltage selector, the
  *               function should return the worst case.
+ * @set_soft_start: Enable soft start for the regulator.
  *
  * @set_suspend_voltage: Set the voltage for the regulator when the system
  *                       is suspended.
diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h
index 7f7d0a3fe1e1..1258275d3751 100644
--- a/include/linux/regulator/machine.h
+++ b/include/linux/regulator/machine.h
@@ -87,6 +87,7 @@ struct regulator_state {
  *           applied.
  * @apply_uV: Apply the voltage constraint when initialising.
  * @ramp_disable: Disable ramp delay when initialising or when setting voltage.
+ * @soft_start: Enable soft start so that voltage ramps slowly.
  * @pull_down: Enable pull down when regulator is disabled.
  *
  * @input_uV: Input voltage for regulator when supplied by another regulator.
-- 
cgit v1.2.3


From d45337328b1fb86a8f045f6c9938e9e08e6d7134 Mon Sep 17 00:00:00 2001
From: Vincent Wan <vincent.wan@amd.com>
Date: Thu, 11 Jun 2015 20:11:45 +0800
Subject: pci_ids: Add AMD KERNCZ device ID support

The KERNCZ is new AMD SB/FCH generation name, like HUDSON2.
0x790b is the device ID for this generation.

Signed-off-by: Wan ZongShun <Vincent.Wan@amd.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 include/linux/pci_ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 2f7b9a40f627..cb63a7b522ef 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -579,6 +579,7 @@
 #define PCI_DEVICE_ID_AMD_HUDSON2_SATA_IDE	0x7800
 #define PCI_DEVICE_ID_AMD_HUDSON2_SMBUS		0x780b
 #define PCI_DEVICE_ID_AMD_HUDSON2_IDE		0x780c
+#define PCI_DEVICE_ID_AMD_KERNCZ_SMBUS  0x790b
 
 #define PCI_VENDOR_ID_TRIDENT		0x1023
 #define PCI_DEVICE_ID_TRIDENT_4DWAVE_DX	0x2000
-- 
cgit v1.2.3


From 32be6d3e362b896c81aae7c635d44e5a91406ce2 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Fri, 12 Jun 2015 10:58:46 -0400
Subject: crypto: nx - move include/linux/nx842.h into
 drivers/crypto/nx/nx-842.h

Move the contents of the include/linux/nx842.h header file into the
drivers/crypto/nx/nx-842.h header file.  Remove the nx842.h header
file and its entry in the MAINTAINERS file.

The include/linux/nx842.h header originally was there because the
crypto/842.c driver needed it to communicate with the nx-842 hw
driver.  However, that crypto compression driver was moved into
the drivers/crypto/nx/ directory, and now can directly include the
nx-842.h header.  Nothing else needs the public include/linux/nx842.h
header file, as all use of the nx-842 hardware driver will be through
the "842-nx" crypto compression driver, since the direct nx-842 api is
very limited in the buffer alignments and sizes that it will accept,
and the crypto compression interface handles those limitations and
allows any alignment and size buffers.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 MAINTAINERS                       |  1 -
 drivers/crypto/nx/nx-842-crypto.c |  3 ++-
 drivers/crypto/nx/nx-842.h        | 21 ++++++++++++++++++++-
 include/linux/nx842.h             | 24 ------------------------
 4 files changed, 22 insertions(+), 27 deletions(-)
 delete mode 100644 include/linux/nx842.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 912c9d9f741c..fb06e1eba758 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4862,7 +4862,6 @@ IBM Power 842 compression accelerator
 M:	Dan Streetman <ddstreet@us.ibm.com>
 S:	Supported
 F:	drivers/crypto/nx/nx-842*
-F:	include/linux/nx842.h
 F:	include/linux/sw842.h
 F:	crypto/842.c
 F:	lib/842/
diff --git a/drivers/crypto/nx/nx-842-crypto.c b/drivers/crypto/nx/nx-842-crypto.c
index 2ffa1031c91f..95066f336b26 100644
--- a/drivers/crypto/nx/nx-842-crypto.c
+++ b/drivers/crypto/nx/nx-842-crypto.c
@@ -58,10 +58,11 @@
 #include <linux/module.h>
 #include <linux/crypto.h>
 #include <linux/vmalloc.h>
-#include <linux/nx842.h>
 #include <linux/sw842.h>
 #include <linux/ratelimit.h>
 
+#include "nx-842.h"
+
 /* The first 5 bits of this magic are 0x1f, which is an invalid 842 5-bit
  * template (see lib/842/842.h), so this magic number will never appear at
  * the start of a raw 842 compressed buffer.  That is important, as any buffer
diff --git a/drivers/crypto/nx/nx-842.h b/drivers/crypto/nx/nx-842.h
index 1730f4da1cf6..4dbac11c2aa5 100644
--- a/drivers/crypto/nx/nx-842.h
+++ b/drivers/crypto/nx/nx-842.h
@@ -4,7 +4,6 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/nx842.h>
 #include <linux/sw842.h>
 #include <linux/of.h>
 #include <linux/slab.h>
@@ -12,6 +11,12 @@
 #include <linux/mm.h>
 #include <linux/ratelimit.h>
 
+#define __NX842_PSERIES_MEM_COMPRESS	(10240)
+#define __NX842_POWERNV_MEM_COMPRESS	(1024)
+
+#define NX842_MEM_COMPRESS	(max_t(unsigned int,			\
+	__NX842_PSERIES_MEM_COMPRESS, __NX842_POWERNV_MEM_COMPRESS))
+
 /* Restrictions on Data Descriptor List (DDL) and Entry (DDE) buffers
  *
  * From NX P8 workbook, sec 4.9.1 "842 details"
@@ -104,6 +109,13 @@ static inline unsigned long nx842_get_pa(void *addr)
 #define GET_FIELD(v, m)		(((v) & (m)) >> MASK_LSH(m))
 #define SET_FIELD(v, m, val)	(((v) & ~(m)) | (((val) << MASK_LSH(m)) & (m)))
 
+struct nx842_constraints {
+	int alignment;
+	int multiple;
+	int minimum;
+	int maximum;
+};
+
 struct nx842_driver {
 	char *name;
 	struct module *owner;
@@ -124,4 +136,11 @@ void nx842_platform_driver_unset(struct nx842_driver *driver);
 bool nx842_platform_driver_get(void);
 void nx842_platform_driver_put(void);
 
+int nx842_constraints(struct nx842_constraints *constraints);
+
+int nx842_compress(const unsigned char *in, unsigned int in_len,
+		   unsigned char *out, unsigned int *out_len, void *wrkmem);
+int nx842_decompress(const unsigned char *in, unsigned int in_len,
+		     unsigned char *out, unsigned int *out_len, void *wrkmem);
+
 #endif /* __NX_842_H__ */
diff --git a/include/linux/nx842.h b/include/linux/nx842.h
deleted file mode 100644
index 4ddf68d9c0d4..000000000000
--- a/include/linux/nx842.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __NX842_H__
-#define __NX842_H__
-
-#define __NX842_PSERIES_MEM_COMPRESS	(10240)
-#define __NX842_POWERNV_MEM_COMPRESS	(1024)
-
-#define NX842_MEM_COMPRESS	(max_t(unsigned int,			\
-	__NX842_PSERIES_MEM_COMPRESS, __NX842_POWERNV_MEM_COMPRESS))
-
-struct nx842_constraints {
-	int alignment;
-	int multiple;
-	int minimum;
-	int maximum;
-};
-
-int nx842_constraints(struct nx842_constraints *constraints);
-
-int nx842_compress(const unsigned char *in, unsigned int in_len,
-		   unsigned char *out, unsigned int *out_len, void *wrkmem);
-int nx842_decompress(const unsigned char *in, unsigned int in_len,
-		     unsigned char *out, unsigned int *out_len, void *wrkmem);
-
-#endif
-- 
cgit v1.2.3


From e7b707f96820161820a864d2ee81740a14da6b93 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Wed, 20 May 2015 11:31:27 +0200
Subject: mfd: cros_ec: Remove parent field

Parent and device were pointing to the same device structure.

Parent is unused, removed.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Tested-by: Stephen Barber <smbarber@chromium.org>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Tested-by: Gwendal Grignou <gwendal@chromium.org>
Reviewed-by: Puthikorn Voravootivat <puthik@chromium.org>
Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec_i2c.c             | 1 -
 drivers/mfd/cros_ec_spi.c             | 1 -
 drivers/platform/chrome/cros_ec_lpc.c | 1 -
 include/linux/mfd/cros_ec.h           | 2 --
 4 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/cros_ec_i2c.c b/drivers/mfd/cros_ec_i2c.c
index c0c30f4f946f..82b4d6148698 100644
--- a/drivers/mfd/cros_ec_i2c.c
+++ b/drivers/mfd/cros_ec_i2c.c
@@ -145,7 +145,6 @@ static int cros_ec_i2c_probe(struct i2c_client *client,
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_i2c;
 	ec_dev->ec_name = client->name;
 	ec_dev->phys_name = client->adapter->name;
-	ec_dev->parent = &client->dev;
 
 	err = cros_ec_register(ec_dev);
 	if (err) {
diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index bf6e08e8013e..27bd52e5e8b7 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -363,7 +363,6 @@ static int cros_ec_spi_probe(struct spi_device *spi)
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_spi;
 	ec_dev->ec_name = ec_spi->spi->modalias;
 	ec_dev->phys_name = dev_name(&ec_spi->spi->dev);
-	ec_dev->parent = &ec_spi->spi->dev;
 	ec_dev->din_size = EC_MSG_BYTES + EC_MSG_PREAMBLE_COUNT;
 	ec_dev->dout_size = EC_MSG_BYTES;
 
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 8f9ac4d7bbd0..860310513cf0 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -214,7 +214,6 @@ static int cros_ec_lpc_probe(struct platform_device *pdev)
 	ec_dev->dev = dev;
 	ec_dev->ec_name = pdev->name;
 	ec_dev->phys_name = dev_name(dev);
-	ec_dev->parent = dev;
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc;
 	ec_dev->cmd_readmem = cros_ec_lpc_readmem;
 
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 324a34683971..14cf522123dd 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -85,7 +85,6 @@ struct cros_ec_command {
  * to using dword.
  * @din_size: size of din buffer to allocate (zero to use static din)
  * @dout_size: size of dout buffer to allocate (zero to use static dout)
- * @parent: pointer to parent device (e.g. i2c or spi device)
  * @wake_enabled: true if this device can wake the system from sleep
  * @cmd_xfer: send command to EC and get response
  *     Returns the number of bytes received if the communication succeeded, but
@@ -113,7 +112,6 @@ struct cros_ec_device {
 	uint8_t *dout;
 	int din_size;
 	int dout_size;
-	struct device *parent;
 	bool wake_enabled;
 	int (*cmd_xfer)(struct cros_ec_device *ec,
 			struct cros_ec_command *msg);
-- 
cgit v1.2.3


From a841178445bb72a3d566b4e6ab9d19e9b002eb47 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Date: Tue, 9 Jun 2015 13:04:42 +0200
Subject: mfd: cros_ec: Use a zero-length array for command data

Commit 1b84f2a4cd4a ("mfd: cros_ec: Use fixed size arrays to transfer
data with the EC") modified the struct cros_ec_command fields to not
use pointers for the input and output buffers and use fixed length
arrays instead.

This change was made because the cros_ec ioctl API uses that struct
cros_ec_command to allow user-space to send commands to the EC and
to get data from the EC. So using pointers made the API not 64-bit
safe. Unfortunately this approach was not flexible enough for all
the use-cases since there may be a need to send larger commands
on newer versions of the EC command protocol.

So to avoid to choose a constant length that it may be too big for
most commands and thus wasting memory and CPU cycles on copy from
and to user-space or having a size that is too small for some big
commands, use a zero-length array that is both 64-bit safe and
flexible. The same buffer is used for both output and input data
so the maximum of these values should be used to allocate it.

Suggested-by: Gwendal Grignou <gwendal@chromium.org>
Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/i2c/busses/i2c-cros-ec-tunnel.c    |  45 ++++++---
 drivers/input/keyboard/cros_ec_keyb.c      |  29 ++++--
 drivers/mfd/cros_ec.c                      |  28 ++++--
 drivers/mfd/cros_ec_i2c.c                  |   4 +-
 drivers/mfd/cros_ec_spi.c                  |   2 +-
 drivers/platform/chrome/cros_ec_dev.c      |  65 +++++++-----
 drivers/platform/chrome/cros_ec_lightbar.c | 152 +++++++++++++++++++---------
 drivers/platform/chrome/cros_ec_lpc.c      |   8 +-
 drivers/platform/chrome/cros_ec_sysfs.c    | 154 ++++++++++++++++++-----------
 include/linux/mfd/cros_ec.h                |   6 +-
 10 files changed, 318 insertions(+), 175 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/i2c/busses/i2c-cros-ec-tunnel.c b/drivers/i2c/busses/i2c-cros-ec-tunnel.c
index fa8dedd8c3a2..a0d95ff682ae 100644
--- a/drivers/i2c/busses/i2c-cros-ec-tunnel.c
+++ b/drivers/i2c/busses/i2c-cros-ec-tunnel.c
@@ -182,8 +182,9 @@ static int ec_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg i2c_msgs[],
 	const u16 bus_num = bus->remote_bus;
 	int request_len;
 	int response_len;
+	int alloc_size;
 	int result;
-	struct cros_ec_command msg = { };
+	struct cros_ec_command *msg;
 
 	request_len = ec_i2c_count_message(i2c_msgs, num);
 	if (request_len < 0) {
@@ -198,25 +199,39 @@ static int ec_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg i2c_msgs[],
 		return response_len;
 	}
 
-	result = ec_i2c_construct_message(msg.outdata, i2c_msgs, num, bus_num);
-	if (result)
-		return result;
+	alloc_size = max(request_len, response_len);
+	msg = kmalloc(sizeof(*msg) + alloc_size, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
 
-	msg.version = 0;
-	msg.command = EC_CMD_I2C_PASSTHRU;
-	msg.outsize = request_len;
-	msg.insize = response_len;
+	result = ec_i2c_construct_message(msg->data, i2c_msgs, num, bus_num);
+	if (result) {
+		dev_err(dev, "Error constructing EC i2c message %d\n", result);
+		goto exit;
+	}
 
-	result = cros_ec_cmd_xfer(bus->ec, &msg);
-	if (result < 0)
-		return result;
+	msg->version = 0;
+	msg->command = EC_CMD_I2C_PASSTHRU;
+	msg->outsize = request_len;
+	msg->insize = response_len;
 
-	result = ec_i2c_parse_response(msg.indata, i2c_msgs, &num);
-	if (result < 0)
-		return result;
+	result = cros_ec_cmd_xfer(bus->ec, msg);
+	if (result < 0) {
+		dev_err(dev, "Error transferring EC i2c message %d\n", result);
+		goto exit;
+	}
+
+	result = ec_i2c_parse_response(msg->data, i2c_msgs, &num);
+	if (result < 0) {
+		dev_err(dev, "Error parsing EC i2c message %d\n", result);
+		goto exit;
+	}
 
 	/* Indicate success by saying how many messages were sent */
-	return num;
+	result = num;
+exit:
+	kfree(msg);
+	return result;
 }
 
 static u32 ec_i2c_functionality(struct i2c_adapter *adap)
diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c
index b50c5b8b8a4d..974154a74505 100644
--- a/drivers/input/keyboard/cros_ec_keyb.c
+++ b/drivers/input/keyboard/cros_ec_keyb.c
@@ -148,19 +148,28 @@ static void cros_ec_keyb_process(struct cros_ec_keyb *ckdev,
 
 static int cros_ec_keyb_get_state(struct cros_ec_keyb *ckdev, uint8_t *kb_state)
 {
-	int ret;
-	struct cros_ec_command msg = {
-		.command = EC_CMD_MKBP_STATE,
-		.insize = ckdev->cols,
-	};
+	int ret = 0;
+	struct cros_ec_command *msg;
 
-	ret = cros_ec_cmd_xfer(ckdev->ec, &msg);
-	if (ret < 0)
-		return ret;
+	msg = kmalloc(sizeof(*msg) + ckdev->cols, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
 
-	memcpy(kb_state, msg.indata, ckdev->cols);
+	msg->version = 0;
+	msg->command = EC_CMD_MKBP_STATE;
+	msg->insize = ckdev->cols;
+	msg->outsize = 0;
 
-	return 0;
+	ret = cros_ec_cmd_xfer(ckdev->ec, msg);
+	if (ret < 0) {
+		dev_err(ckdev->dev, "Error transferring EC message %d\n", ret);
+		goto exit;
+	}
+
+	memcpy(kb_state, msg->data, ckdev->cols);
+exit:
+	kfree(msg);
+	return ret;
 }
 
 static irqreturn_t cros_ec_keyb_irq(int irq, void *data)
diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c
index 1574a9352a6d..4a0f6dfcd376 100644
--- a/drivers/mfd/cros_ec.c
+++ b/drivers/mfd/cros_ec.c
@@ -41,7 +41,7 @@ int cros_ec_prepare_tx(struct cros_ec_device *ec_dev,
 	out[2] = msg->outsize;
 	csum = out[0] + out[1] + out[2];
 	for (i = 0; i < msg->outsize; i++)
-		csum += out[EC_MSG_TX_HEADER_BYTES + i] = msg->outdata[i];
+		csum += out[EC_MSG_TX_HEADER_BYTES + i] = msg->data[i];
 	out[EC_MSG_TX_HEADER_BYTES + msg->outsize] = (uint8_t)(csum & 0xff);
 
 	return EC_MSG_TX_PROTO_BYTES + msg->outsize;
@@ -75,11 +75,20 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev,
 	ret = ec_dev->cmd_xfer(ec_dev, msg);
 	if (msg->result == EC_RES_IN_PROGRESS) {
 		int i;
-		struct cros_ec_command status_msg = { };
+		struct cros_ec_command *status_msg;
 		struct ec_response_get_comms_status *status;
 
-		status_msg.command = EC_CMD_GET_COMMS_STATUS;
-		status_msg.insize = sizeof(*status);
+		status_msg = kmalloc(sizeof(*status_msg) + sizeof(*status),
+				     GFP_KERNEL);
+		if (!status_msg) {
+			ret = -ENOMEM;
+			goto exit;
+		}
+
+		status_msg->version = 0;
+		status_msg->command = EC_CMD_GET_COMMS_STATUS;
+		status_msg->insize = sizeof(*status);
+		status_msg->outsize = 0;
 
 		/*
 		 * Query the EC's status until it's no longer busy or
@@ -88,20 +97,23 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev,
 		for (i = 0; i < EC_COMMAND_RETRIES; i++) {
 			usleep_range(10000, 11000);
 
-			ret = ec_dev->cmd_xfer(ec_dev, &status_msg);
+			ret = ec_dev->cmd_xfer(ec_dev, status_msg);
 			if (ret < 0)
 				break;
 
-			msg->result = status_msg.result;
-			if (status_msg.result != EC_RES_SUCCESS)
+			msg->result = status_msg->result;
+			if (status_msg->result != EC_RES_SUCCESS)
 				break;
 
 			status = (struct ec_response_get_comms_status *)
-				 status_msg.indata;
+				 status_msg->data;
 			if (!(status->flags & EC_COMMS_STATUS_PROCESSING))
 				break;
 		}
+
+		kfree(status_msg);
 	}
+exit:
 	mutex_unlock(&ec_dev->lock);
 
 	return ret;
diff --git a/drivers/mfd/cros_ec_i2c.c b/drivers/mfd/cros_ec_i2c.c
index 82b4d6148698..fbf7819f5de5 100644
--- a/drivers/mfd/cros_ec_i2c.c
+++ b/drivers/mfd/cros_ec_i2c.c
@@ -76,7 +76,7 @@ static int cros_ec_cmd_xfer_i2c(struct cros_ec_device *ec_dev,
 	/* copy message payload and compute checksum */
 	sum = out_buf[0] + out_buf[1] + out_buf[2];
 	for (i = 0; i < msg->outsize; i++) {
-		out_buf[3 + i] = msg->outdata[i];
+		out_buf[3 + i] = msg->data[i];
 		sum += out_buf[3 + i];
 	}
 	out_buf[3 + msg->outsize] = sum;
@@ -109,7 +109,7 @@ static int cros_ec_cmd_xfer_i2c(struct cros_ec_device *ec_dev,
 	/* copy response packet payload and compute checksum */
 	sum = in_buf[0] + in_buf[1];
 	for (i = 0; i < len; i++) {
-		msg->indata[i] = in_buf[2 + i];
+		msg->data[i] = in_buf[2 + i];
 		sum += in_buf[2 + i];
 	}
 	dev_dbg(ec_dev->dev, "packet: %*ph, sum = %02x\n",
diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index 27bd52e5e8b7..573730fec947 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -299,7 +299,7 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 	for (i = 0; i < len; i++) {
 		sum += ptr[i + 2];
 		if (ec_msg->insize)
-			ec_msg->indata[i] = ptr[i + 2];
+			ec_msg->data[i] = ptr[i + 2];
 	}
 	sum &= 0xff;
 
diff --git a/drivers/platform/chrome/cros_ec_dev.c b/drivers/platform/chrome/cros_ec_dev.c
index 6090d0b2826f..e91ced1cb8ce 100644
--- a/drivers/platform/chrome/cros_ec_dev.c
+++ b/drivers/platform/chrome/cros_ec_dev.c
@@ -20,6 +20,7 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/slab.h>
 #include <linux/uaccess.h>
 
 #include "cros_ec_dev.h"
@@ -36,28 +37,31 @@ static int ec_get_version(struct cros_ec_device *ec, char *str, int maxlen)
 	static const char * const current_image_name[] = {
 		"unknown", "read-only", "read-write", "invalid",
 	};
-	struct cros_ec_command msg = {
-		.version = 0,
-		.command = EC_CMD_GET_VERSION,
-		.outdata = { 0 },
-		.outsize = 0,
-		.indata = { 0 },
-		.insize = sizeof(*resp),
-	};
+	struct cros_ec_command *msg;
 	int ret;
 
-	ret = cros_ec_cmd_xfer(ec, &msg);
+	msg = kmalloc(sizeof(*msg) + sizeof(*resp), GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	msg->version = 0;
+	msg->command = EC_CMD_GET_VERSION;
+	msg->insize = sizeof(*resp);
+	msg->outsize = 0;
+
+	ret = cros_ec_cmd_xfer(ec, msg);
 	if (ret < 0)
-		return ret;
+		goto exit;
 
-	if (msg.result != EC_RES_SUCCESS) {
+	if (msg->result != EC_RES_SUCCESS) {
 		snprintf(str, maxlen,
 			 "%s\nUnknown EC version: EC returned %d\n",
-			 CROS_EC_DEV_VERSION, msg.result);
-		return 0;
+			 CROS_EC_DEV_VERSION, msg->result);
+		ret = -EINVAL;
+		goto exit;
 	}
 
-	resp = (struct ec_response_get_version *)msg.indata;
+	resp = (struct ec_response_get_version *)msg->data;
 	if (resp->current_image >= ARRAY_SIZE(current_image_name))
 		resp->current_image = 3; /* invalid */
 
@@ -65,7 +69,10 @@ static int ec_get_version(struct cros_ec_device *ec, char *str, int maxlen)
 		 resp->version_string_ro, resp->version_string_rw,
 		 current_image_name[resp->current_image]);
 
-	return 0;
+	ret = 0;
+exit:
+	kfree(msg);
+	return ret;
 }
 
 /* Device file ops */
@@ -110,20 +117,32 @@ static ssize_t ec_device_read(struct file *filp, char __user *buffer,
 static long ec_device_ioctl_xcmd(struct cros_ec_device *ec, void __user *arg)
 {
 	long ret;
-	struct cros_ec_command s_cmd = { };
+	struct cros_ec_command u_cmd;
+	struct cros_ec_command *s_cmd;
 
-	if (copy_from_user(&s_cmd, arg, sizeof(s_cmd)))
+	if (copy_from_user(&u_cmd, arg, sizeof(u_cmd)))
 		return -EFAULT;
 
-	ret = cros_ec_cmd_xfer(ec, &s_cmd);
+	s_cmd = kmalloc(sizeof(*s_cmd) + max(u_cmd.outsize, u_cmd.insize),
+			GFP_KERNEL);
+	if (!s_cmd)
+		return -ENOMEM;
+
+	if (copy_from_user(s_cmd, arg, sizeof(*s_cmd) + u_cmd.outsize)) {
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	ret = cros_ec_cmd_xfer(ec, s_cmd);
 	/* Only copy data to userland if data was received. */
 	if (ret < 0)
-		return ret;
+		goto exit;
 
-	if (copy_to_user(arg, &s_cmd, sizeof(s_cmd)))
-		return -EFAULT;
-
-	return 0;
+	if (copy_to_user(arg, s_cmd, sizeof(*s_cmd) + u_cmd.insize))
+		ret = -EFAULT;
+exit:
+	kfree(s_cmd);
+	return ret;
 }
 
 static long ec_device_ioctl_readmem(struct cros_ec_device *ec, void __user *arg)
diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index b4ff47a9069a..560e5d41b7ae 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -31,6 +31,7 @@
 #include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "cros_ec_dev.h"
 
@@ -91,54 +92,79 @@ out:
 	return ret;
 }
 
-#define INIT_MSG(P, R) { \
-		.command = EC_CMD_LIGHTBAR_CMD, \
-		.outsize = sizeof(*P), \
-		.insize = sizeof(*R), \
-	}
+static struct cros_ec_command *alloc_lightbar_cmd_msg(void)
+{
+	struct cros_ec_command *msg;
+	int len;
+
+	len = max(sizeof(struct ec_params_lightbar),
+		  sizeof(struct ec_response_lightbar));
+
+	msg = kmalloc(sizeof(*msg) + len, GFP_KERNEL);
+	if (!msg)
+		return NULL;
+
+	msg->version = 0;
+	msg->command = EC_CMD_LIGHTBAR_CMD;
+	msg->outsize = sizeof(struct ec_params_lightbar);
+	msg->insize = sizeof(struct ec_response_lightbar);
+
+	return msg;
+}
 
 static int get_lightbar_version(struct cros_ec_device *ec,
 				uint32_t *ver_ptr, uint32_t *flg_ptr)
 {
 	struct ec_params_lightbar *param;
 	struct ec_response_lightbar *resp;
-	struct cros_ec_command msg = INIT_MSG(param, resp);
+	struct cros_ec_command *msg;
 	int ret;
 
-	param = (struct ec_params_lightbar *)msg.outdata;
-	param->cmd = LIGHTBAR_CMD_VERSION;
-	ret = cros_ec_cmd_xfer(ec, &msg);
-	if (ret < 0)
+	msg = alloc_lightbar_cmd_msg();
+	if (!msg)
 		return 0;
 
-	switch (msg.result) {
+	param = (struct ec_params_lightbar *)msg->data;
+	param->cmd = LIGHTBAR_CMD_VERSION;
+	ret = cros_ec_cmd_xfer(ec, msg);
+	if (ret < 0) {
+		ret = 0;
+		goto exit;
+	}
+
+	switch (msg->result) {
 	case EC_RES_INVALID_PARAM:
 		/* Pixel had no version command. */
 		if (ver_ptr)
 			*ver_ptr = 0;
 		if (flg_ptr)
 			*flg_ptr = 0;
-		return 1;
+		ret = 1;
+		goto exit;
 
 	case EC_RES_SUCCESS:
-		resp = (struct ec_response_lightbar *)msg.indata;
+		resp = (struct ec_response_lightbar *)msg->data;
 
 		/* Future devices w/lightbars should implement this command */
 		if (ver_ptr)
 			*ver_ptr = resp->version.num;
 		if (flg_ptr)
 			*flg_ptr = resp->version.flags;
-		return 1;
+		ret = 1;
+		goto exit;
 	}
 
 	/* Anything else (ie, EC_RES_INVALID_COMMAND) - no lightbar */
-	return 0;
+	ret = 0;
+exit:
+	kfree(msg);
+	return ret;
 }
 
 static ssize_t version_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
-	uint32_t version, flags;
+	uint32_t version = 0, flags = 0;
 	struct cros_ec_device *ec = dev_get_drvdata(dev);
 	int ret;
 
@@ -158,8 +184,7 @@ static ssize_t brightness_store(struct device *dev,
 				const char *buf, size_t count)
 {
 	struct ec_params_lightbar *param;
-	struct ec_response_lightbar *resp;
-	struct cros_ec_command msg = INIT_MSG(param, resp);
+	struct cros_ec_command *msg;
 	int ret;
 	unsigned int val;
 	struct cros_ec_device *ec = dev_get_drvdata(dev);
@@ -167,21 +192,30 @@ static ssize_t brightness_store(struct device *dev,
 	if (kstrtouint(buf, 0, &val))
 		return -EINVAL;
 
-	param = (struct ec_params_lightbar *)msg.outdata;
+	msg = alloc_lightbar_cmd_msg();
+	if (!msg)
+		return -ENOMEM;
+
+	param = (struct ec_params_lightbar *)msg->data;
 	param->cmd = LIGHTBAR_CMD_BRIGHTNESS;
 	param->brightness.num = val;
 	ret = lb_throttle();
 	if (ret)
-		return ret;
+		goto exit;
 
-	ret = cros_ec_cmd_xfer(ec, &msg);
+	ret = cros_ec_cmd_xfer(ec, msg);
 	if (ret < 0)
-		return ret;
+		goto exit;
 
-	if (msg.result != EC_RES_SUCCESS)
-		return -EINVAL;
+	if (msg->result != EC_RES_SUCCESS) {
+		ret = -EINVAL;
+		goto exit;
+	}
 
-	return count;
+	ret = count;
+exit:
+	kfree(msg);
+	return ret;
 }
 
 
@@ -196,12 +230,15 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
 			     const char *buf, size_t count)
 {
 	struct ec_params_lightbar *param;
-	struct ec_response_lightbar *resp;
-	struct cros_ec_command msg = INIT_MSG(param, resp);
+	struct cros_ec_command *msg;
 	struct cros_ec_device *ec = dev_get_drvdata(dev);
 	unsigned int val[4];
 	int ret, i = 0, j = 0, ok = 0;
 
+	msg = alloc_lightbar_cmd_msg();
+	if (!msg)
+		return -ENOMEM;
+
 	do {
 		/* Skip any whitespace */
 		while (*buf && isspace(*buf))
@@ -215,7 +252,7 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
 			return -EINVAL;
 
 		if (i == 4) {
-			param = (struct ec_params_lightbar *)msg.outdata;
+			param = (struct ec_params_lightbar *)msg->data;
 			param->cmd = LIGHTBAR_CMD_RGB;
 			param->rgb.led = val[0];
 			param->rgb.red = val[1];
@@ -231,12 +268,14 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
 					return ret;
 			}
 
-			ret = cros_ec_cmd_xfer(ec, &msg);
+			ret = cros_ec_cmd_xfer(ec, msg);
 			if (ret < 0)
-				return ret;
+				goto exit;
 
-			if (msg.result != EC_RES_SUCCESS)
-				return -EINVAL;
+			if (msg->result != EC_RES_SUCCESS) {
+				ret = -EINVAL;
+				goto exit;
+			}
 
 			i = 0;
 			ok = 1;
@@ -248,6 +287,8 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
 
 	} while (*buf);
 
+exit:
+	kfree(msg);
 	return (ok && i == 0) ? count : -EINVAL;
 }
 
@@ -261,42 +302,55 @@ static ssize_t sequence_show(struct device *dev,
 {
 	struct ec_params_lightbar *param;
 	struct ec_response_lightbar *resp;
-	struct cros_ec_command msg = INIT_MSG(param, resp);
+	struct cros_ec_command *msg;
 	int ret;
 	struct cros_ec_device *ec = dev_get_drvdata(dev);
 
-	param = (struct ec_params_lightbar *)msg.outdata;
+	msg = alloc_lightbar_cmd_msg();
+	if (!msg)
+		return -ENOMEM;
+
+	param = (struct ec_params_lightbar *)msg->data;
 	param->cmd = LIGHTBAR_CMD_GET_SEQ;
 	ret = lb_throttle();
 	if (ret)
-		return ret;
+		goto exit;
 
-	ret = cros_ec_cmd_xfer(ec, &msg);
+	ret = cros_ec_cmd_xfer(ec, msg);
 	if (ret < 0)
-		return ret;
+		goto exit;
 
-	if (msg.result != EC_RES_SUCCESS)
-		return scnprintf(buf, PAGE_SIZE,
-				 "ERROR: EC returned %d\n", msg.result);
+	if (msg->result != EC_RES_SUCCESS) {
+		ret = scnprintf(buf, PAGE_SIZE,
+				"ERROR: EC returned %d\n", msg->result);
+		goto exit;
+	}
 
-	resp = (struct ec_response_lightbar *)msg.indata;
+	resp = (struct ec_response_lightbar *)msg->data;
 	if (resp->get_seq.num >= ARRAY_SIZE(seqname))
-		return scnprintf(buf, PAGE_SIZE, "%d\n", resp->get_seq.num);
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", resp->get_seq.num);
 	else
-		return scnprintf(buf, PAGE_SIZE, "%s\n",
-				 seqname[resp->get_seq.num]);
+		ret = scnprintf(buf, PAGE_SIZE, "%s\n",
+				seqname[resp->get_seq.num]);
+
+exit:
+	kfree(msg);
+	return ret;
 }
 
 static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
 			      const char *buf, size_t count)
 {
 	struct ec_params_lightbar *param;
-	struct ec_response_lightbar *resp;
-	struct cros_ec_command msg = INIT_MSG(param, resp);
+	struct cros_ec_command *msg;
 	unsigned int num;
 	int ret, len;
 	struct cros_ec_device *ec = dev_get_drvdata(dev);
 
+	msg = alloc_lightbar_cmd_msg();
+	if (!msg)
+		return -ENOMEM;
+
 	for (len = 0; len < count; len++)
 		if (!isalnum(buf[len]))
 			break;
@@ -311,18 +365,18 @@ static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
 			return ret;
 	}
 
-	param = (struct ec_params_lightbar *)msg.outdata;
+	param = (struct ec_params_lightbar *)msg->data;
 	param->cmd = LIGHTBAR_CMD_SEQ;
 	param->seq.num = num;
 	ret = lb_throttle();
 	if (ret)
 		return ret;
 
-	ret = cros_ec_cmd_xfer(ec, &msg);
+	ret = cros_ec_cmd_xfer(ec, msg);
 	if (ret < 0)
 		return ret;
 
-	if (msg.result != EC_RES_SUCCESS)
+	if (msg->result != EC_RES_SUCCESS)
 		return -EINVAL;
 
 	return count;
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 860310513cf0..4c7f0df33bf8 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -73,8 +73,8 @@ static int cros_ec_cmd_xfer_lpc(struct cros_ec_device *ec,
 
 	/* Copy data and update checksum */
 	for (i = 0; i < msg->outsize; i++) {
-		outb(msg->outdata[i], EC_LPC_ADDR_HOST_PARAM + i);
-		csum += msg->outdata[i];
+		outb(msg->data[i], EC_LPC_ADDR_HOST_PARAM + i);
+		csum += msg->data[i];
 	}
 
 	/* Finalize checksum and write args */
@@ -129,8 +129,8 @@ static int cros_ec_cmd_xfer_lpc(struct cros_ec_device *ec,
 
 	/* Read response and update checksum */
 	for (i = 0; i < args.data_size; i++) {
-		msg->indata[i] = inb(EC_LPC_ADDR_HOST_PARAM + i);
-		csum += msg->indata[i];
+		msg->data[i] = inb(EC_LPC_ADDR_HOST_PARAM + i);
+		csum += msg->data[i];
 	}
 
 	/* Verify checksum */
diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c
index fb62ab6cc659..78ba82d670cb 100644
--- a/drivers/platform/chrome/cros_ec_sysfs.c
+++ b/drivers/platform/chrome/cros_ec_sysfs.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/printk.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
@@ -66,14 +67,19 @@ static ssize_t store_ec_reboot(struct device *dev,
 		{"hibernate",    EC_REBOOT_HIBERNATE, 0},
 		{"at-shutdown",  -1, EC_REBOOT_FLAG_ON_AP_SHUTDOWN},
 	};
-	struct cros_ec_command msg = { 0 };
-	struct ec_params_reboot_ec *param =
-		(struct ec_params_reboot_ec *)msg.outdata;
+	struct cros_ec_command *msg;
+	struct ec_params_reboot_ec *param;
 	int got_cmd = 0, offset = 0;
 	int i;
 	int ret;
 	struct cros_ec_device *ec = dev_get_drvdata(dev);
 
+	msg = kmalloc(sizeof(*msg) + sizeof(*param), GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	param = (struct ec_params_reboot_ec *)msg->data;
+
 	param->flags = 0;
 	while (1) {
 		/* Find word to start scanning */
@@ -100,19 +106,26 @@ static ssize_t store_ec_reboot(struct device *dev,
 			offset++;
 	}
 
-	if (!got_cmd)
-		return -EINVAL;
-
-	msg.command = EC_CMD_REBOOT_EC;
-	msg.outsize = sizeof(param);
-	ret = cros_ec_cmd_xfer(ec, &msg);
-	if (ret < 0)
-		return ret;
-	if (msg.result != EC_RES_SUCCESS) {
-		dev_dbg(ec->dev, "EC result %d\n", msg.result);
-		return -EINVAL;
+	if (!got_cmd) {
+		count = -EINVAL;
+		goto exit;
 	}
 
+	msg->version = 0;
+	msg->command = EC_CMD_REBOOT_EC;
+	msg->outsize = sizeof(*param);
+	msg->insize = 0;
+	ret = cros_ec_cmd_xfer(ec, msg);
+	if (ret < 0) {
+		count = ret;
+		goto exit;
+	}
+	if (msg->result != EC_RES_SUCCESS) {
+		dev_dbg(ec->dev, "EC result %d\n", msg->result);
+		count = -EINVAL;
+	}
+exit:
+	kfree(msg);
 	return count;
 }
 
@@ -123,22 +136,32 @@ static ssize_t show_ec_version(struct device *dev,
 	struct ec_response_get_version *r_ver;
 	struct ec_response_get_chip_info *r_chip;
 	struct ec_response_board_version *r_board;
-	struct cros_ec_command msg = { 0 };
+	struct cros_ec_command *msg;
 	int ret;
 	int count = 0;
 	struct cros_ec_device *ec = dev_get_drvdata(dev);
 
+	msg = kmalloc(sizeof(*msg) + EC_HOST_PARAM_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
 	/* Get versions. RW may change. */
-	msg.command = EC_CMD_GET_VERSION;
-	msg.insize = sizeof(*r_ver);
-	ret = cros_ec_cmd_xfer(ec, &msg);
-	if (ret < 0)
-		return ret;
-	if (msg.result != EC_RES_SUCCESS)
-		return scnprintf(buf, PAGE_SIZE,
-				 "ERROR: EC returned %d\n", msg.result);
+	msg->version = 0;
+	msg->command = EC_CMD_GET_VERSION;
+	msg->insize = sizeof(*r_ver);
+	msg->outsize = 0;
+	ret = cros_ec_cmd_xfer(ec, msg);
+	if (ret < 0) {
+		count = ret;
+		goto exit;
+	}
+	if (msg->result != EC_RES_SUCCESS) {
+		count = scnprintf(buf, PAGE_SIZE,
+				  "ERROR: EC returned %d\n", msg->result);
+		goto exit;
+	}
 
-	r_ver = (struct ec_response_get_version *)msg.indata;
+	r_ver = (struct ec_response_get_version *)msg->data;
 	/* Strings should be null-terminated, but let's be sure. */
 	r_ver->version_string_ro[sizeof(r_ver->version_string_ro) - 1] = '\0';
 	r_ver->version_string_rw[sizeof(r_ver->version_string_rw) - 1] = '\0';
@@ -152,33 +175,33 @@ static ssize_t show_ec_version(struct device *dev,
 			    image_names[r_ver->current_image] : "?"));
 
 	/* Get build info. */
-	msg.command = EC_CMD_GET_BUILD_INFO;
-	msg.insize = sizeof(msg.indata);
-	ret = cros_ec_cmd_xfer(ec, &msg);
+	msg->command = EC_CMD_GET_BUILD_INFO;
+	msg->insize = EC_HOST_PARAM_SIZE;
+	ret = cros_ec_cmd_xfer(ec, msg);
 	if (ret < 0)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Build info:    XFER ERROR %d\n", ret);
-	else if (msg.result != EC_RES_SUCCESS)
+	else if (msg->result != EC_RES_SUCCESS)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
-				   "Build info:    EC error %d\n", msg.result);
+				   "Build info:    EC error %d\n", msg->result);
 	else {
-		msg.indata[sizeof(msg.indata) - 1] = '\0';
+		msg->data[sizeof(msg->data) - 1] = '\0';
 		count += scnprintf(buf + count, PAGE_SIZE - count,
-				   "Build info:    %s\n", msg.indata);
+				   "Build info:    %s\n", msg->data);
 	}
 
 	/* Get chip info. */
-	msg.command = EC_CMD_GET_CHIP_INFO;
-	msg.insize = sizeof(*r_chip);
-	ret = cros_ec_cmd_xfer(ec, &msg);
+	msg->command = EC_CMD_GET_CHIP_INFO;
+	msg->insize = sizeof(*r_chip);
+	ret = cros_ec_cmd_xfer(ec, msg);
 	if (ret < 0)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Chip info:     XFER ERROR %d\n", ret);
-	else if (msg.result != EC_RES_SUCCESS)
+	else if (msg->result != EC_RES_SUCCESS)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
-				   "Chip info:     EC error %d\n", msg.result);
+				   "Chip info:     EC error %d\n", msg->result);
 	else {
-		r_chip = (struct ec_response_get_chip_info *)msg.indata;
+		r_chip = (struct ec_response_get_chip_info *)msg->data;
 
 		r_chip->vendor[sizeof(r_chip->vendor) - 1] = '\0';
 		r_chip->name[sizeof(r_chip->name) - 1] = '\0';
@@ -192,23 +215,25 @@ static ssize_t show_ec_version(struct device *dev,
 	}
 
 	/* Get board version */
-	msg.command = EC_CMD_GET_BOARD_VERSION;
-	msg.insize = sizeof(*r_board);
-	ret = cros_ec_cmd_xfer(ec, &msg);
+	msg->command = EC_CMD_GET_BOARD_VERSION;
+	msg->insize = sizeof(*r_board);
+	ret = cros_ec_cmd_xfer(ec, msg);
 	if (ret < 0)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Board version: XFER ERROR %d\n", ret);
-	else if (msg.result != EC_RES_SUCCESS)
+	else if (msg->result != EC_RES_SUCCESS)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
-				   "Board version: EC error %d\n", msg.result);
+				   "Board version: EC error %d\n", msg->result);
 	else {
-		r_board = (struct ec_response_board_version *)msg.indata;
+		r_board = (struct ec_response_board_version *)msg->data;
 
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Board version: %d\n",
 				   r_board->board_version);
 	}
 
+exit:
+	kfree(msg);
 	return count;
 }
 
@@ -216,27 +241,38 @@ static ssize_t show_ec_flashinfo(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
 	struct ec_response_flash_info *resp;
-	struct cros_ec_command msg = { 0 };
+	struct cros_ec_command *msg;
 	int ret;
 	struct cros_ec_device *ec = dev_get_drvdata(dev);
 
+	msg = kmalloc(sizeof(*msg) + sizeof(*resp), GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
 	/* The flash info shouldn't ever change, but ask each time anyway. */
-	msg.command = EC_CMD_FLASH_INFO;
-	msg.insize = sizeof(*resp);
-	ret = cros_ec_cmd_xfer(ec, &msg);
+	msg->version = 0;
+	msg->command = EC_CMD_FLASH_INFO;
+	msg->insize = sizeof(*resp);
+	msg->outsize = 0;
+	ret = cros_ec_cmd_xfer(ec, msg);
 	if (ret < 0)
-		return ret;
-	if (msg.result != EC_RES_SUCCESS)
-		return scnprintf(buf, PAGE_SIZE,
-				 "ERROR: EC returned %d\n", msg.result);
-
-	resp = (struct ec_response_flash_info *)msg.indata;
-
-	return scnprintf(buf, PAGE_SIZE,
-			 "FlashSize %d\nWriteSize %d\n"
-			 "EraseSize %d\nProtectSize %d\n",
-			 resp->flash_size, resp->write_block_size,
-			 resp->erase_block_size, resp->protect_block_size);
+		goto exit;
+	if (msg->result != EC_RES_SUCCESS) {
+		ret = scnprintf(buf, PAGE_SIZE,
+				"ERROR: EC returned %d\n", msg->result);
+		goto exit;
+	}
+
+	resp = (struct ec_response_flash_info *)msg->data;
+
+	ret = scnprintf(buf, PAGE_SIZE,
+			"FlashSize %d\nWriteSize %d\n"
+			"EraseSize %d\nProtectSize %d\n",
+			resp->flash_size, resp->write_block_size,
+			resp->erase_block_size, resp->protect_block_size);
+exit:
+	kfree(msg);
+	return ret;
 }
 
 /* Module initialization */
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 14cf522123dd..7eee38abd02a 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -42,8 +42,7 @@ enum {
  * @outsize: Outgoing length in bytes
  * @insize: Max number of bytes to accept from EC
  * @result: EC's response to the command (separate from communication failure)
- * @outdata: Outgoing data to EC
- * @indata: Where to put the incoming data from EC
+ * @data: Where to put the incoming data from EC and outgoing data to EC
  */
 struct cros_ec_command {
 	uint32_t version;
@@ -51,8 +50,7 @@ struct cros_ec_command {
 	uint32_t outsize;
 	uint32_t insize;
 	uint32_t result;
-	uint8_t outdata[EC_PROTO2_MAX_PARAM_SIZE];
-	uint8_t indata[EC_PROTO2_MAX_PARAM_SIZE];
+	uint8_t data[0];
 };
 
 /**
-- 
cgit v1.2.3


From 256ab950bdaa8797b7bac8fc11a567030d486304 Mon Sep 17 00:00:00 2001
From: Stephen Barber <smbarber@chromium.org>
Date: Tue, 9 Jun 2015 13:04:43 +0200
Subject: mfd: cros_ec: rev cros_ec_commands.h

Update cros_ec_commands.h to the latest version in the EC
firmware sources and add power domain and passthru commands.

Also, update lightbar to use new command names.

Signed-off-by: Stephen Barber <smbarber@chromium.org>
Reviewed-by: Randall Spangler <rspangler@chromium.org>
Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Gwendal Grignou <gwendal@chromium.org>
Tested-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/platform/chrome/cros_ec_lightbar.c |  14 +-
 include/linux/mfd/cros_ec_commands.h       | 277 ++++++++++++++++++++++++++---
 2 files changed, 262 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index 560e5d41b7ae..6e1986a2dce1 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -197,8 +197,8 @@ static ssize_t brightness_store(struct device *dev,
 		return -ENOMEM;
 
 	param = (struct ec_params_lightbar *)msg->data;
-	param->cmd = LIGHTBAR_CMD_BRIGHTNESS;
-	param->brightness.num = val;
+	param->cmd = LIGHTBAR_CMD_SET_BRIGHTNESS;
+	param->set_brightness.num = val;
 	ret = lb_throttle();
 	if (ret)
 		goto exit;
@@ -253,11 +253,11 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
 
 		if (i == 4) {
 			param = (struct ec_params_lightbar *)msg->data;
-			param->cmd = LIGHTBAR_CMD_RGB;
-			param->rgb.led = val[0];
-			param->rgb.red = val[1];
-			param->rgb.green = val[2];
-			param->rgb.blue = val[3];
+			param->cmd = LIGHTBAR_CMD_SET_RGB;
+			param->set_rgb.led = val[0];
+			param->set_rgb.red = val[1];
+			param->set_rgb.green = val[2];
+			param->set_rgb.blue = val[3];
 			/*
 			 * Throttle only the first of every four transactions,
 			 * so that the user can update all four LEDs at once.
diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index a49cd41feea7..13b630c10d4c 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -515,7 +515,7 @@ struct ec_host_response {
 /*
  * Notes on commands:
  *
- * Each command is an 8-byte command value.  Commands which take params or
+ * Each command is an 16-bit command value.  Commands which take params or
  * return response data specify structs for that data.  If no struct is
  * specified, the command does not input or output data, respectively.
  * Parameter/response length is implicit in the structs.  Some underlying
@@ -966,7 +966,7 @@ struct rgb_s {
 /* List of tweakable parameters. NOTE: It's __packed so it can be sent in a
  * host command, but the alignment is the same regardless. Keep it that way.
  */
-struct lightbar_params {
+struct lightbar_params_v0 {
 	/* Timing */
 	int32_t google_ramp_up;
 	int32_t google_ramp_down;
@@ -1000,32 +1000,81 @@ struct lightbar_params {
 	struct rgb_s color[8];			/* 0-3 are Google colors */
 } __packed;
 
+struct lightbar_params_v1 {
+	/* Timing */
+	int32_t google_ramp_up;
+	int32_t google_ramp_down;
+	int32_t s3s0_ramp_up;
+	int32_t s0_tick_delay[2];		/* AC=0/1 */
+	int32_t s0a_tick_delay[2];		/* AC=0/1 */
+	int32_t s0s3_ramp_down;
+	int32_t s3_sleep_for;
+	int32_t s3_ramp_up;
+	int32_t s3_ramp_down;
+	int32_t tap_tick_delay;
+	int32_t tap_display_time;
+
+	/* Tap-for-battery params */
+	uint8_t tap_pct_red;
+	uint8_t tap_pct_green;
+	uint8_t tap_seg_min_on;
+	uint8_t tap_seg_max_on;
+	uint8_t tap_seg_osc;
+	uint8_t tap_idx[3];
+
+	/* Oscillation */
+	uint8_t osc_min[2];			/* AC=0/1 */
+	uint8_t osc_max[2];			/* AC=0/1 */
+	uint8_t w_ofs[2];			/* AC=0/1 */
+
+	/* Brightness limits based on the backlight and AC. */
+	uint8_t bright_bl_off_fixed[2];		/* AC=0/1 */
+	uint8_t bright_bl_on_min[2];		/* AC=0/1 */
+	uint8_t bright_bl_on_max[2];		/* AC=0/1 */
+
+	/* Battery level thresholds */
+	uint8_t battery_threshold[LB_BATTERY_LEVELS - 1];
+
+	/* Map [AC][battery_level] to color index */
+	uint8_t s0_idx[2][LB_BATTERY_LEVELS];	/* AP is running */
+	uint8_t s3_idx[2][LB_BATTERY_LEVELS];	/* AP is sleeping */
+
+	/* Color palette */
+	struct rgb_s color[8];			/* 0-3 are Google colors */
+} __packed;
+
 struct ec_params_lightbar {
 	uint8_t cmd;		      /* Command (see enum lightbar_command) */
 	union {
 		struct {
 			/* no args */
-		} dump, off, on, init, get_seq, get_params, version;
+		} dump, off, on, init, get_seq, get_params_v0, get_params_v1,
+			version, get_brightness, get_demo;
 
-		struct num {
+		struct {
 			uint8_t num;
-		} brightness, seq, demo;
+		} set_brightness, seq, demo;
 
-		struct reg {
+		struct {
 			uint8_t ctrl, reg, value;
 		} reg;
 
-		struct rgb {
+		struct {
 			uint8_t led, red, green, blue;
-		} rgb;
+		} set_rgb;
+
+		struct {
+			uint8_t led;
+		} get_rgb;
 
-		struct lightbar_params set_params;
+		struct lightbar_params_v0 set_params_v0;
+		struct lightbar_params_v1 set_params_v1;
 	};
 } __packed;
 
 struct ec_response_lightbar {
 	union {
-		struct dump {
+		struct {
 			struct {
 				uint8_t reg;
 				uint8_t ic0;
@@ -1033,20 +1082,26 @@ struct ec_response_lightbar {
 			} vals[23];
 		} dump;
 
-		struct get_seq {
+		struct  {
 			uint8_t num;
-		} get_seq;
+		} get_seq, get_brightness, get_demo;
 
-		struct lightbar_params get_params;
+		struct lightbar_params_v0 get_params_v0;
+		struct lightbar_params_v1 get_params_v1;
 
-		struct version {
+		struct {
 			uint32_t num;
 			uint32_t flags;
 		} version;
 
+		struct {
+			uint8_t red, green, blue;
+		} get_rgb;
+
 		struct {
 			/* no return params */
-		} off, on, init, brightness, seq, reg, rgb, demo, set_params;
+		} off, on, init, set_brightness, seq, reg, set_rgb,
+			demo, set_params_v0, set_params_v1;
 	};
 } __packed;
 
@@ -1056,15 +1111,20 @@ enum lightbar_command {
 	LIGHTBAR_CMD_OFF = 1,
 	LIGHTBAR_CMD_ON = 2,
 	LIGHTBAR_CMD_INIT = 3,
-	LIGHTBAR_CMD_BRIGHTNESS = 4,
+	LIGHTBAR_CMD_SET_BRIGHTNESS = 4,
 	LIGHTBAR_CMD_SEQ = 5,
 	LIGHTBAR_CMD_REG = 6,
-	LIGHTBAR_CMD_RGB = 7,
+	LIGHTBAR_CMD_SET_RGB = 7,
 	LIGHTBAR_CMD_GET_SEQ = 8,
 	LIGHTBAR_CMD_DEMO = 9,
-	LIGHTBAR_CMD_GET_PARAMS = 10,
-	LIGHTBAR_CMD_SET_PARAMS = 11,
+	LIGHTBAR_CMD_GET_PARAMS_V0 = 10,
+	LIGHTBAR_CMD_SET_PARAMS_V0 = 11,
 	LIGHTBAR_CMD_VERSION = 12,
+	LIGHTBAR_CMD_GET_BRIGHTNESS = 13,
+	LIGHTBAR_CMD_GET_RGB = 14,
+	LIGHTBAR_CMD_GET_DEMO = 15,
+	LIGHTBAR_CMD_GET_PARAMS_V1 = 16,
+	LIGHTBAR_CMD_SET_PARAMS_V1 = 17,
 	LIGHTBAR_NUM_CMDS
 };
 
@@ -1421,8 +1481,40 @@ struct ec_response_rtc {
 /*****************************************************************************/
 /* Port80 log access */
 
+/* Maximum entries that can be read/written in a single command */
+#define EC_PORT80_SIZE_MAX 32
+
 /* Get last port80 code from previous boot */
 #define EC_CMD_PORT80_LAST_BOOT 0x48
+#define EC_CMD_PORT80_READ 0x48
+
+enum ec_port80_subcmd {
+	EC_PORT80_GET_INFO = 0,
+	EC_PORT80_READ_BUFFER,
+};
+
+struct ec_params_port80_read {
+	uint16_t subcmd;
+	union {
+		struct {
+			uint32_t offset;
+			uint32_t num_entries;
+		} read_buffer;
+	};
+} __packed;
+
+struct ec_response_port80_read {
+	union {
+		struct {
+			uint32_t writes;
+			uint32_t history_size;
+			uint32_t last_boot;
+		} get_info;
+		struct {
+			uint16_t codes[EC_PORT80_SIZE_MAX];
+		} data;
+	};
+} __packed;
 
 struct ec_response_port80_last_boot {
 	uint16_t code;
@@ -1782,6 +1874,7 @@ struct ec_params_gpio_set {
 /* Get GPIO value */
 #define EC_CMD_GPIO_GET 0x93
 
+/* Version 0 of input params and response */
 struct ec_params_gpio_get {
 	char name[32];
 } __packed;
@@ -1789,6 +1882,38 @@ struct ec_response_gpio_get {
 	uint8_t val;
 } __packed;
 
+/* Version 1 of input params and response */
+struct ec_params_gpio_get_v1 {
+	uint8_t subcmd;
+	union {
+		struct {
+			char name[32];
+		} get_value_by_name;
+		struct {
+			uint8_t index;
+		} get_info;
+	};
+} __packed;
+
+struct ec_response_gpio_get_v1 {
+	union {
+		struct {
+			uint8_t val;
+		} get_value_by_name, get_count;
+		struct {
+			uint8_t val;
+			char name[32];
+			uint32_t flags;
+		} get_info;
+	};
+} __packed;
+
+enum gpio_get_subcmd {
+	EC_GPIO_GET_BY_NAME = 0,
+	EC_GPIO_GET_COUNT = 1,
+	EC_GPIO_GET_INFO = 2,
+};
+
 /*****************************************************************************/
 /* I2C commands. Only available when flash write protect is unlocked. */
 
@@ -1857,13 +1982,21 @@ struct ec_params_charge_control {
 /*****************************************************************************/
 
 /*
- * Cut off battery power output if the battery supports.
+ * Cut off battery power immediately or after the host has shut down.
  *
- * For unsupported battery, just don't implement this command and lets EC
- * return EC_RES_INVALID_COMMAND.
+ * return EC_RES_INVALID_COMMAND if unsupported by a board/battery.
+ *	  EC_RES_SUCCESS if the command was successful.
+ *	  EC_RES_ERROR if the cut off command failed.
  */
+
 #define EC_CMD_BATTERY_CUT_OFF 0x99
 
+#define EC_BATTERY_CUTOFF_FLAG_AT_SHUTDOWN	(1 << 0)
+
+struct ec_params_battery_cutoff {
+	uint8_t flags;
+} __packed;
+
 /*****************************************************************************/
 /* USB port mux control. */
 
@@ -2141,6 +2274,32 @@ struct ec_params_sb_wr_block {
 	uint16_t data[32];
 } __packed;
 
+/*****************************************************************************/
+/* Battery vendor parameters
+ *
+ * Get or set vendor-specific parameters in the battery. Implementations may
+ * differ between boards or batteries. On a set operation, the response
+ * contains the actual value set, which may be rounded or clipped from the
+ * requested value.
+ */
+
+#define EC_CMD_BATTERY_VENDOR_PARAM 0xb4
+
+enum ec_battery_vendor_param_mode {
+	BATTERY_VENDOR_PARAM_MODE_GET = 0,
+	BATTERY_VENDOR_PARAM_MODE_SET,
+};
+
+struct ec_params_battery_vendor_param {
+	uint32_t param;
+	uint32_t value;
+	uint8_t mode;
+} __packed;
+
+struct ec_response_battery_vendor_param {
+	uint32_t value;
+} __packed;
+
 /*****************************************************************************/
 /* System commands */
 
@@ -2336,6 +2495,80 @@ struct ec_params_reboot_ec {
 
 #endif  /* !__ACPI__ */
 
+/*****************************************************************************/
+/*
+ * PD commands
+ *
+ * These commands are for PD MCU communication.
+ */
+
+/* EC to PD MCU exchange status command */
+#define EC_CMD_PD_EXCHANGE_STATUS 0x100
+
+/* Status of EC being sent to PD */
+struct ec_params_pd_status {
+	int8_t batt_soc; /* battery state of charge */
+} __packed;
+
+/* Status of PD being sent back to EC */
+struct ec_response_pd_status {
+	int8_t status;        /* PD MCU status */
+	uint32_t curr_lim_ma; /* input current limit */
+} __packed;
+
+/* Set USB type-C port role and muxes */
+#define EC_CMD_USB_PD_CONTROL 0x101
+
+enum usb_pd_control_role {
+	USB_PD_CTRL_ROLE_NO_CHANGE = 0,
+	USB_PD_CTRL_ROLE_TOGGLE_ON = 1, /* == AUTO */
+	USB_PD_CTRL_ROLE_TOGGLE_OFF = 2,
+	USB_PD_CTRL_ROLE_FORCE_SINK = 3,
+	USB_PD_CTRL_ROLE_FORCE_SOURCE = 4,
+};
+
+enum usb_pd_control_mux {
+	USB_PD_CTRL_MUX_NO_CHANGE = 0,
+	USB_PD_CTRL_MUX_NONE = 1,
+	USB_PD_CTRL_MUX_USB = 2,
+	USB_PD_CTRL_MUX_DP = 3,
+	USB_PD_CTRL_MUX_DOCK = 4,
+	USB_PD_CTRL_MUX_AUTO = 5,
+};
+
+struct ec_params_usb_pd_control {
+	uint8_t port;
+	uint8_t role;
+	uint8_t mux;
+} __packed;
+
+/*****************************************************************************/
+/*
+ * Passthru commands
+ *
+ * Some platforms have sub-processors chained to each other.  For example.
+ *
+ *     AP <--> EC <--> PD MCU
+ *
+ * The top 2 bits of the command number are used to indicate which device the
+ * command is intended for.  Device 0 is always the device receiving the
+ * command; other device mapping is board-specific.
+ *
+ * When a device receives a command to be passed to a sub-processor, it passes
+ * it on with the device number set back to 0.  This allows the sub-processor
+ * to remain blissfully unaware of whether the command originated on the next
+ * device up the chain, or was passed through from the AP.
+ *
+ * In the above example, if the AP wants to send command 0x0002 to the PD MCU,
+ *     AP sends command 0x4002 to the EC
+ *     EC sends command 0x0002 to the PD MCU
+ *     EC forwards PD MCU response back to the AP
+ */
+
+/* Offset and max command number for sub-device n */
+#define EC_CMD_PASSTHRU_OFFSET(n) (0x4000 * (n))
+#define EC_CMD_PASSTHRU_MAX(n) (EC_CMD_PASSTHRU_OFFSET(n) + 0x3fff)
+
 /*****************************************************************************/
 /*
  * Deprecated constants. These constants have been renamed for clarity. The
-- 
cgit v1.2.3


From 2c7589af3c4dee844e6a4174f2aa8996cf837604 Mon Sep 17 00:00:00 2001
From: Stephen Barber <smbarber@chromium.org>
Date: Tue, 9 Jun 2015 13:04:45 +0200
Subject: mfd: cros_ec: add proto v3 skeleton

Add support in cros_ec.c to handle EC host command protocol v3.
For v3+, probe for maximum shared protocol version and max
request, response, and passthrough sizes. For now, this will
always fall back to v2, since there is no bus-specific code
for handling proto v3 packets.

Signed-off-by: Stephen Barber <smbarber@chromium.org>
Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Reviewed-by: Gwendal Grignou <gwendal@chromium.org>
Tested-by: Gwendal Grignou <gwendal@chromium.org>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec.c                   |  23 ++-
 drivers/mfd/cros_ec_i2c.c               |   4 +
 drivers/mfd/cros_ec_spi.c               |   7 +-
 drivers/platform/chrome/cros_ec_lpc.c   |   4 +
 drivers/platform/chrome/cros_ec_proto.c | 339 ++++++++++++++++++++++++++++----
 include/linux/mfd/cros_ec.h             |  28 ++-
 6 files changed, 355 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c
index d857f6a2b57b..08d82bfc5268 100644
--- a/drivers/mfd/cros_ec.c
+++ b/drivers/mfd/cros_ec.c
@@ -36,19 +36,22 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 	struct device *dev = ec_dev->dev;
 	int err = 0;
 
-	if (ec_dev->din_size) {
-		ec_dev->din = devm_kzalloc(dev, ec_dev->din_size, GFP_KERNEL);
-		if (!ec_dev->din)
-			return -ENOMEM;
-	}
-	if (ec_dev->dout_size) {
-		ec_dev->dout = devm_kzalloc(dev, ec_dev->dout_size, GFP_KERNEL);
-		if (!ec_dev->dout)
-			return -ENOMEM;
-	}
+	ec_dev->max_request = sizeof(struct ec_params_hello);
+	ec_dev->max_response = sizeof(struct ec_response_get_protocol_info);
+	ec_dev->max_passthru = 0;
+
+	ec_dev->din = devm_kzalloc(dev, ec_dev->din_size, GFP_KERNEL);
+	if (!ec_dev->din)
+		return -ENOMEM;
+
+	ec_dev->dout = devm_kzalloc(dev, ec_dev->dout_size, GFP_KERNEL);
+	if (!ec_dev->dout)
+		return -ENOMEM;
 
 	mutex_init(&ec_dev->lock);
 
+	cros_ec_query_all(ec_dev);
+
 	err = mfd_add_devices(dev, 0, cros_devs,
 			      ARRAY_SIZE(cros_devs),
 			      NULL, ec_dev->irq, NULL);
diff --git a/drivers/mfd/cros_ec_i2c.c b/drivers/mfd/cros_ec_i2c.c
index fbf7819f5de5..b400bfa2772a 100644
--- a/drivers/mfd/cros_ec_i2c.c
+++ b/drivers/mfd/cros_ec_i2c.c
@@ -143,8 +143,12 @@ static int cros_ec_i2c_probe(struct i2c_client *client,
 	ec_dev->priv = client;
 	ec_dev->irq = client->irq;
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_i2c;
+	ec_dev->pkt_xfer = NULL;
 	ec_dev->ec_name = client->name;
 	ec_dev->phys_name = client->adapter->name;
+	ec_dev->din_size = sizeof(struct ec_host_response) +
+			   sizeof(struct ec_response_get_protocol_info);
+	ec_dev->dout_size = sizeof(struct ec_host_request);
 
 	err = cros_ec_register(ec_dev);
 	if (err) {
diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index 573730fec947..04da2f288ef8 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -361,10 +361,13 @@ static int cros_ec_spi_probe(struct spi_device *spi)
 	ec_dev->priv = ec_spi;
 	ec_dev->irq = spi->irq;
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_spi;
+	ec_dev->pkt_xfer = NULL;
 	ec_dev->ec_name = ec_spi->spi->modalias;
 	ec_dev->phys_name = dev_name(&ec_spi->spi->dev);
-	ec_dev->din_size = EC_MSG_BYTES + EC_MSG_PREAMBLE_COUNT;
-	ec_dev->dout_size = EC_MSG_BYTES;
+	ec_dev->din_size = EC_MSG_PREAMBLE_COUNT +
+			   sizeof(struct ec_host_response) +
+			   sizeof(struct ec_response_get_protocol_info);
+	ec_dev->dout_size = sizeof(struct ec_host_request);
 
 	err = cros_ec_register(ec_dev);
 	if (err) {
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 4c7f0df33bf8..05aeb559275f 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -215,7 +215,11 @@ static int cros_ec_lpc_probe(struct platform_device *pdev)
 	ec_dev->ec_name = pdev->name;
 	ec_dev->phys_name = dev_name(dev);
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc;
+	ec_dev->pkt_xfer = NULL;
 	ec_dev->cmd_readmem = cros_ec_lpc_readmem;
+	ec_dev->din_size = sizeof(struct ec_host_response) +
+			   sizeof(struct ec_response_get_protocol_info);
+	ec_dev->dout_size = sizeof(struct ec_host_request);
 
 	ret = cros_ec_register(ec_dev);
 	if (ret) {
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 58e98a24fd08..990308ca384f 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -22,11 +22,100 @@
 
 #define EC_COMMAND_RETRIES	50
 
+static int prepare_packet(struct cros_ec_device *ec_dev,
+			  struct cros_ec_command *msg)
+{
+	struct ec_host_request *request;
+	u8 *out;
+	int i;
+	u8 csum = 0;
+
+	BUG_ON(ec_dev->proto_version != EC_HOST_REQUEST_VERSION);
+	BUG_ON(msg->outsize + sizeof(*request) > ec_dev->dout_size);
+
+	out = ec_dev->dout;
+	request = (struct ec_host_request *)out;
+	request->struct_version = EC_HOST_REQUEST_VERSION;
+	request->checksum = 0;
+	request->command = msg->command;
+	request->command_version = msg->version;
+	request->reserved = 0;
+	request->data_len = msg->outsize;
+
+	for (i = 0; i < sizeof(*request); i++)
+		csum += out[i];
+
+	/* Copy data and update checksum */
+	memcpy(out + sizeof(*request), msg->data, msg->outsize);
+	for (i = 0; i < msg->outsize; i++)
+		csum += msg->data[i];
+
+	request->checksum = -csum;
+
+	return sizeof(*request) + msg->outsize;
+}
+
+static int send_command(struct cros_ec_device *ec_dev,
+			struct cros_ec_command *msg)
+{
+	int ret;
+
+	if (ec_dev->proto_version > 2)
+		ret = ec_dev->pkt_xfer(ec_dev, msg);
+	else
+		ret = ec_dev->cmd_xfer(ec_dev, msg);
+
+	if (msg->result == EC_RES_IN_PROGRESS) {
+		int i;
+		struct cros_ec_command *status_msg;
+		struct ec_response_get_comms_status *status;
+
+		status_msg = kmalloc(sizeof(*status_msg) + sizeof(*status),
+				     GFP_KERNEL);
+		if (!status_msg)
+			return -ENOMEM;
+
+		status_msg->version = 0;
+		status_msg->command = EC_CMD_GET_COMMS_STATUS;
+		status_msg->insize = sizeof(*status);
+		status_msg->outsize = 0;
+
+		/*
+		 * Query the EC's status until it's no longer busy or
+		 * we encounter an error.
+		 */
+		for (i = 0; i < EC_COMMAND_RETRIES; i++) {
+			usleep_range(10000, 11000);
+
+			ret = ec_dev->cmd_xfer(ec_dev, status_msg);
+			if (ret < 0)
+				break;
+
+			msg->result = status_msg->result;
+			if (status_msg->result != EC_RES_SUCCESS)
+				break;
+
+			status = (struct ec_response_get_comms_status *)
+				 status_msg->data;
+			if (!(status->flags & EC_COMMS_STATUS_PROCESSING))
+				break;
+		}
+
+		kfree(status_msg);
+	}
+
+	return ret;
+}
+
 int cros_ec_prepare_tx(struct cros_ec_device *ec_dev,
 		       struct cros_ec_command *msg)
 {
-	uint8_t *out;
-	int csum, i;
+	u8 *out;
+	u8 csum;
+	int i;
+
+	if (ec_dev->proto_version > 2)
+		return prepare_packet(ec_dev, msg);
 
 	BUG_ON(msg->outsize > EC_PROTO2_MAX_PARAM_SIZE);
 	out = ec_dev->dout;
@@ -36,7 +125,7 @@ int cros_ec_prepare_tx(struct cros_ec_device *ec_dev,
 	csum = out[0] + out[1] + out[2];
 	for (i = 0; i < msg->outsize; i++)
 		csum += out[EC_MSG_TX_HEADER_BYTES + i] = msg->data[i];
-	out[EC_MSG_TX_HEADER_BYTES + msg->outsize] = (uint8_t)(csum & 0xff);
+	out[EC_MSG_TX_HEADER_BYTES + msg->outsize] = csum;
 
 	return EC_MSG_TX_PROTO_BYTES + msg->outsize;
 }
@@ -60,54 +149,232 @@ int cros_ec_check_result(struct cros_ec_device *ec_dev,
 }
 EXPORT_SYMBOL(cros_ec_check_result);
 
-int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev,
-		     struct cros_ec_command *msg)
+static int cros_ec_host_command_proto_query(struct cros_ec_device *ec_dev,
+					    int devidx,
+					    struct cros_ec_command *msg)
 {
+	/*
+	 * Try using v3+ to query for supported protocols. If this
+	 * command fails, fall back to v2. Returns the highest protocol
+	 * supported by the EC.
+	 * Also sets the max request/response/passthru size.
+	 */
 	int ret;
 
-	mutex_lock(&ec_dev->lock);
-	ret = ec_dev->cmd_xfer(ec_dev, msg);
-	if (msg->result == EC_RES_IN_PROGRESS) {
-		int i;
-		struct cros_ec_command *status_msg;
-		struct ec_response_get_comms_status *status;
+	if (!ec_dev->pkt_xfer)
+		return -EPROTONOSUPPORT;
 
-		status_msg = kmalloc(sizeof(*status_msg) + sizeof(*status),
-				     GFP_KERNEL);
-		if (!status_msg) {
-			ret = -ENOMEM;
-			goto exit;
-		}
+	memset(msg, 0, sizeof(*msg));
+	msg->command = EC_CMD_PASSTHRU_OFFSET(devidx) | EC_CMD_GET_PROTOCOL_INFO;
+	msg->insize = sizeof(struct ec_response_get_protocol_info);
 
-		status_msg->version = 0;
-		status_msg->command = EC_CMD_GET_COMMS_STATUS;
-		status_msg->insize = sizeof(*status);
-		status_msg->outsize = 0;
+	ret = send_command(ec_dev, msg);
+
+	if (ret < 0) {
+		dev_dbg(ec_dev->dev,
+			"failed to check for EC[%d] protocol version: %d\n",
+			devidx, ret);
+		return ret;
+	}
+
+	if (devidx > 0 && msg->result == EC_RES_INVALID_COMMAND)
+		return -ENODEV;
+	else if (msg->result != EC_RES_SUCCESS)
+		return msg->result;
+
+	return 0;
+}
+
+static int cros_ec_host_command_proto_query_v2(struct cros_ec_device *ec_dev)
+{
+	struct cros_ec_command *msg;
+	struct ec_params_hello *hello_params;
+	struct ec_response_hello *hello_response;
+	int ret;
+	int len = max(sizeof(*hello_params), sizeof(*hello_response));
+
+	msg = kmalloc(sizeof(*msg) + len, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	msg->version = 0;
+	msg->command = EC_CMD_HELLO;
+	hello_params = (struct ec_params_hello *)msg->data;
+	msg->outsize = sizeof(*hello_params);
+	hello_response = (struct ec_response_hello *)msg->data;
+	msg->insize = sizeof(*hello_response);
+
+	hello_params->in_data = 0xa0b0c0d0;
+
+	ret = send_command(ec_dev, msg);
+
+	if (ret < 0) {
+		dev_dbg(ec_dev->dev,
+			"EC failed to respond to v2 hello: %d\n",
+			ret);
+		goto exit;
+	} else if (msg->result != EC_RES_SUCCESS) {
+		dev_err(ec_dev->dev,
+			"EC responded to v2 hello with error: %d\n",
+			msg->result);
+		ret = msg->result;
+		goto exit;
+	} else if (hello_response->out_data != 0xa1b2c3d4) {
+		dev_err(ec_dev->dev,
+			"EC responded to v2 hello with bad result: %u\n",
+			hello_response->out_data);
+		ret = -EBADMSG;
+		goto exit;
+	}
+
+	ret = 0;
+
+ exit:
+	kfree(msg);
+	return ret;
+}
+
+int cros_ec_query_all(struct cros_ec_device *ec_dev)
+{
+	struct device *dev = ec_dev->dev;
+	struct cros_ec_command *proto_msg;
+	struct ec_response_get_protocol_info *proto_info;
+	int ret;
+
+	proto_msg = kzalloc(sizeof(*proto_msg) + sizeof(*proto_info),
+			    GFP_KERNEL);
+	if (!proto_msg)
+		return -ENOMEM;
+
+	/* First try sending with proto v3. */
+	ec_dev->proto_version = 3;
+	ret = cros_ec_host_command_proto_query(ec_dev, 0, proto_msg);
+
+	if (ret == 0) {
+		proto_info = (struct ec_response_get_protocol_info *)
+			proto_msg->data;
+		ec_dev->max_request = proto_info->max_request_packet_size -
+			sizeof(struct ec_host_request);
+		ec_dev->max_response = proto_info->max_response_packet_size -
+			sizeof(struct ec_host_response);
+		ec_dev->proto_version =
+			min(EC_HOST_REQUEST_VERSION,
+					fls(proto_info->protocol_versions) - 1);
+		dev_dbg(ec_dev->dev,
+			"using proto v%u\n",
+			ec_dev->proto_version);
+
+		ec_dev->din_size = ec_dev->max_response +
+			sizeof(struct ec_host_response) +
+			EC_MAX_RESPONSE_OVERHEAD;
+		ec_dev->dout_size = ec_dev->max_request +
+			sizeof(struct ec_host_request) +
+			EC_MAX_REQUEST_OVERHEAD;
 
 		/*
-		 * Query the EC's status until it's no longer busy or
-		 * we encounter an error.
+		 * Check for PD
 		 */
-		for (i = 0; i < EC_COMMAND_RETRIES; i++) {
-			usleep_range(10000, 11000);
+		ret = cros_ec_host_command_proto_query(ec_dev, 1, proto_msg);
 
-			ret = ec_dev->cmd_xfer(ec_dev, status_msg);
-			if (ret < 0)
-				break;
+		if (ret) {
+			dev_dbg(ec_dev->dev, "no PD chip found: %d\n", ret);
+			ec_dev->max_passthru = 0;
+		} else {
+			dev_dbg(ec_dev->dev, "found PD chip\n");
+			ec_dev->max_passthru =
+				proto_info->max_request_packet_size -
+				sizeof(struct ec_host_request);
+		}
+	} else {
+		/* Try querying with a v2 hello message. */
+		ec_dev->proto_version = 2;
+		ret = cros_ec_host_command_proto_query_v2(ec_dev);
 
-			msg->result = status_msg->result;
-			if (status_msg->result != EC_RES_SUCCESS)
-				break;
+		if (ret == 0) {
+			/* V2 hello succeeded. */
+			dev_dbg(ec_dev->dev, "falling back to proto v2\n");
 
-			status = (struct ec_response_get_comms_status *)
-				 status_msg->data;
-			if (!(status->flags & EC_COMMS_STATUS_PROCESSING))
-				break;
+			ec_dev->max_request = EC_PROTO2_MAX_PARAM_SIZE;
+			ec_dev->max_response = EC_PROTO2_MAX_PARAM_SIZE;
+			ec_dev->max_passthru = 0;
+			ec_dev->pkt_xfer = NULL;
+			ec_dev->din_size = EC_MSG_BYTES;
+			ec_dev->dout_size = EC_MSG_BYTES;
+		} else {
+			/*
+			 * It's possible for a test to occur too early when
+			 * the EC isn't listening. If this happens, we'll
+			 * test later when the first command is run.
+			 */
+			ec_dev->proto_version = EC_PROTO_VERSION_UNKNOWN;
+			dev_dbg(ec_dev->dev, "EC query failed: %d\n", ret);
+			goto exit;
 		}
+	}
 
-		kfree(status_msg);
+	devm_kfree(dev, ec_dev->din);
+	devm_kfree(dev, ec_dev->dout);
+
+	ec_dev->din = devm_kzalloc(dev, ec_dev->din_size, GFP_KERNEL);
+	if (!ec_dev->din) {
+		ret = -ENOMEM;
+		goto exit;
 	}
+
+	ec_dev->dout = devm_kzalloc(dev, ec_dev->dout_size, GFP_KERNEL);
+	if (!ec_dev->dout) {
+		devm_kfree(dev, ec_dev->din);
+		ret = -ENOMEM;
+		goto exit;
+	}
+
 exit:
+	kfree(proto_msg);
+	return ret;
+}
+EXPORT_SYMBOL(cros_ec_query_all);
+
+int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev,
+		     struct cros_ec_command *msg)
+{
+	int ret;
+
+	mutex_lock(&ec_dev->lock);
+	if (ec_dev->proto_version == EC_PROTO_VERSION_UNKNOWN) {
+		ret = cros_ec_query_all(ec_dev);
+		if (ret) {
+			dev_err(ec_dev->dev,
+				"EC version unknown and query failed; aborting command\n");
+			mutex_unlock(&ec_dev->lock);
+			return ret;
+		}
+	}
+
+	if (msg->insize > ec_dev->max_response) {
+		dev_dbg(ec_dev->dev, "clamping message receive buffer\n");
+		msg->insize = ec_dev->max_response;
+	}
+
+	if (msg->command < EC_CMD_PASSTHRU_OFFSET(1)) {
+		if (msg->outsize > ec_dev->max_request) {
+			dev_err(ec_dev->dev,
+				"request of size %u is too big (max: %u)\n",
+				msg->outsize,
+				ec_dev->max_request);
+			mutex_unlock(&ec_dev->lock);
+			return -EMSGSIZE;
+		}
+	} else {
+		if (msg->outsize > ec_dev->max_passthru) {
+			dev_err(ec_dev->dev,
+				"passthru rq of size %u is too big (max: %u)\n",
+				msg->outsize,
+				ec_dev->max_passthru);
+			mutex_unlock(&ec_dev->lock);
+			return -EMSGSIZE;
+		}
+	}
+	ret = send_command(ec_dev, msg);
 	mutex_unlock(&ec_dev->lock);
 
 	return ret;
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 7eee38abd02a..59d909434efd 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -21,6 +21,15 @@
 #include <linux/mfd/cros_ec_commands.h>
 #include <linux/mutex.h>
 
+/*
+ * Max bus-specific overhead incurred by request/responses.
+ * I2C requires 1 additional byte for requests.
+ * I2C requires 2 additional bytes for responses.
+ * */
+#define EC_PROTO_VERSION_UNKNOWN	0
+#define EC_MAX_REQUEST_OVERHEAD		1
+#define EC_MAX_RESPONSE_OVERHEAD	2
+
 /*
  * Command interface between EC and AP, for LPC, I2C and SPI interfaces.
  */
@@ -88,6 +97,7 @@ struct cros_ec_command {
  *     Returns the number of bytes received if the communication succeeded, but
  *     that doesn't mean the EC was happy with the command. The caller
  *     should check msg.result for the EC's result code.
+ * @pkt_xfer: send packet to EC and get response
  * @lock: one transaction at a time
  */
 struct cros_ec_device {
@@ -104,15 +114,21 @@ struct cros_ec_device {
 			   unsigned int bytes, void *dest);
 
 	/* These are used to implement the platform-specific interface */
+	u16 max_request;
+	u16 max_response;
+	u16 max_passthru;
+	u16 proto_version;
 	void *priv;
 	int irq;
-	uint8_t *din;
-	uint8_t *dout;
+	u8 *din;
+	u8 *dout;
 	int din_size;
 	int dout_size;
 	bool wake_enabled;
 	int (*cmd_xfer)(struct cros_ec_device *ec,
 			struct cros_ec_command *msg);
+	int (*pkt_xfer)(struct cros_ec_device *ec,
+			struct cros_ec_command *msg);
 	struct mutex lock;
 };
 
@@ -194,4 +210,12 @@ int cros_ec_remove(struct cros_ec_device *ec_dev);
  */
 int cros_ec_register(struct cros_ec_device *ec_dev);
 
+/**
+ * cros_ec_register -  Query the protocol version supported by the ChromeOS EC
+ *
+ * @ec_dev: Device to register
+ * @return 0 if ok, -ve on error
+ */
+int cros_ec_query_all(struct cros_ec_device *ec_dev);
+
 #endif /* __LINUX_MFD_CROS_EC_H */
-- 
cgit v1.2.3


From d365407079d33106f76bd486a863de05eb5ae95d Mon Sep 17 00:00:00 2001
From: Stephen Barber <smbarber@chromium.org>
Date: Tue, 9 Jun 2015 13:04:46 +0200
Subject: mfd: cros_ec: add bus-specific proto v3 code

Add proto v3 support to the SPI, I2C, and LPC.

Signed-off-by: Stephen Barber <smbarber@chromium.org>
Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Gwendal Grignou <gwendal@chromium.org>
Tested-by: Gwendal Grignou <gwendal@chromium.org>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec_i2c.c             | 166 ++++++++++++++-
 drivers/mfd/cros_ec_spi.c             | 382 +++++++++++++++++++++++++++++-----
 drivers/platform/chrome/cros_ec_lpc.c |  73 ++++++-
 include/linux/mfd/cros_ec.h           |   6 +
 4 files changed, 569 insertions(+), 58 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/cros_ec_i2c.c b/drivers/mfd/cros_ec_i2c.c
index b400bfa2772a..22e8a4ae1711 100644
--- a/drivers/mfd/cros_ec_i2c.c
+++ b/drivers/mfd/cros_ec_i2c.c
@@ -13,6 +13,7 @@
  * GNU General Public License for more details.
  */
 
+#include <linux/delay.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/i2c.h>
@@ -22,6 +23,32 @@
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 
+/**
+ * Request format for protocol v3
+ * byte 0	0xda (EC_COMMAND_PROTOCOL_3)
+ * byte 1-8	struct ec_host_request
+ * byte 10-	response data
+ */
+struct ec_host_request_i2c {
+	/* Always 0xda to backward compatible with v2 struct */
+	uint8_t  command_protocol;
+	struct ec_host_request ec_request;
+} __packed;
+
+
+/*
+ * Response format for protocol v3
+ * byte 0	result code
+ * byte 1	packet_length
+ * byte 2-9	struct ec_host_response
+ * byte 10-	response data
+ */
+struct ec_host_response_i2c {
+	uint8_t result;
+	uint8_t packet_length;
+	struct ec_host_response ec_response;
+} __packed;
+
 static inline struct cros_ec_device *to_ec_dev(struct device *dev)
 {
 	struct i2c_client *client = to_i2c_client(dev);
@@ -29,6 +56,134 @@ static inline struct cros_ec_device *to_ec_dev(struct device *dev)
 	return i2c_get_clientdata(client);
 }
 
+static int cros_ec_pkt_xfer_i2c(struct cros_ec_device *ec_dev,
+				struct cros_ec_command *msg)
+{
+	struct i2c_client *client = ec_dev->priv;
+	int ret = -ENOMEM;
+	int i;
+	int packet_len;
+	u8 *out_buf = NULL;
+	u8 *in_buf = NULL;
+	u8 sum;
+	struct i2c_msg i2c_msg[2];
+	struct ec_host_response *ec_response;
+	struct ec_host_request_i2c *ec_request_i2c;
+	struct ec_host_response_i2c *ec_response_i2c;
+	int request_header_size = sizeof(struct ec_host_request_i2c);
+	int response_header_size = sizeof(struct ec_host_response_i2c);
+
+	i2c_msg[0].addr = client->addr;
+	i2c_msg[0].flags = 0;
+	i2c_msg[1].addr = client->addr;
+	i2c_msg[1].flags = I2C_M_RD;
+
+	packet_len = msg->insize + response_header_size;
+	BUG_ON(packet_len > ec_dev->din_size);
+	in_buf = ec_dev->din;
+	i2c_msg[1].len = packet_len;
+	i2c_msg[1].buf = (char *) in_buf;
+
+	packet_len = msg->outsize + request_header_size;
+	BUG_ON(packet_len > ec_dev->dout_size);
+	out_buf = ec_dev->dout;
+	i2c_msg[0].len = packet_len;
+	i2c_msg[0].buf = (char *) out_buf;
+
+	/* create request data */
+	ec_request_i2c = (struct ec_host_request_i2c *) out_buf;
+	ec_request_i2c->command_protocol = EC_COMMAND_PROTOCOL_3;
+
+	ec_dev->dout++;
+	ret = cros_ec_prepare_tx(ec_dev, msg);
+	ec_dev->dout--;
+
+	/* send command to EC and read answer */
+	ret = i2c_transfer(client->adapter, i2c_msg, 2);
+	if (ret < 0) {
+		dev_dbg(ec_dev->dev, "i2c transfer failed: %d\n", ret);
+		goto done;
+	} else if (ret != 2) {
+		dev_err(ec_dev->dev, "failed to get response: %d\n", ret);
+		ret = -EIO;
+		goto done;
+	}
+
+	ec_response_i2c = (struct ec_host_response_i2c *) in_buf;
+	msg->result = ec_response_i2c->result;
+	ec_response = &ec_response_i2c->ec_response;
+
+	switch (msg->result) {
+	case EC_RES_SUCCESS:
+		break;
+	case EC_RES_IN_PROGRESS:
+		ret = -EAGAIN;
+		dev_dbg(ec_dev->dev, "command 0x%02x in progress\n",
+			msg->command);
+		goto done;
+
+	default:
+		dev_dbg(ec_dev->dev, "command 0x%02x returned %d\n",
+			msg->command, msg->result);
+		/*
+		 * When we send v3 request to v2 ec, ec won't recognize the
+		 * 0xda (EC_COMMAND_PROTOCOL_3) and will return with status
+		 * EC_RES_INVALID_COMMAND with zero data length.
+		 *
+		 * In case of invalid command for v3 protocol the data length
+		 * will be at least sizeof(struct ec_host_response)
+		 */
+		if (ec_response_i2c->result == EC_RES_INVALID_COMMAND &&
+		    ec_response_i2c->packet_length == 0) {
+			ret = -EPROTONOSUPPORT;
+			goto done;
+		}
+	}
+
+	if (ec_response_i2c->packet_length < sizeof(struct ec_host_response)) {
+		dev_err(ec_dev->dev,
+			"response of %u bytes too short; not a full header\n",
+			ec_response_i2c->packet_length);
+		ret = -EBADMSG;
+		goto done;
+	}
+
+	if (msg->insize < ec_response->data_len) {
+		dev_err(ec_dev->dev,
+			"response data size is too large: expected %u, got %u\n",
+			msg->insize,
+			ec_response->data_len);
+		ret = -EMSGSIZE;
+		goto done;
+	}
+
+	/* copy response packet payload and compute checksum */
+	sum = 0;
+	for (i = 0; i < sizeof(struct ec_host_response); i++)
+		sum += ((u8 *)ec_response)[i];
+
+	memcpy(msg->data,
+	       in_buf + response_header_size,
+	       ec_response->data_len);
+	for (i = 0; i < ec_response->data_len; i++)
+		sum += msg->data[i];
+
+	/* All bytes should sum to zero */
+	if (sum) {
+		dev_err(ec_dev->dev, "bad packet checksum\n");
+		ret = -EBADMSG;
+		goto done;
+	}
+
+	ret = ec_response->data_len;
+
+done:
+	if (msg->command == EC_CMD_REBOOT_EC)
+		msleep(EC_REBOOT_DELAY_MS);
+
+	return ret;
+}
+
 static int cros_ec_cmd_xfer_i2c(struct cros_ec_device *ec_dev,
 				struct cros_ec_command *msg)
 {
@@ -121,9 +276,12 @@ static int cros_ec_cmd_xfer_i2c(struct cros_ec_device *ec_dev,
 	}
 
 	ret = len;
- done:
+done:
 	kfree(in_buf);
 	kfree(out_buf);
+	if (msg->command == EC_CMD_REBOOT_EC)
+		msleep(EC_REBOOT_DELAY_MS);
+
 	return ret;
 }
 
@@ -143,12 +301,12 @@ static int cros_ec_i2c_probe(struct i2c_client *client,
 	ec_dev->priv = client;
 	ec_dev->irq = client->irq;
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_i2c;
-	ec_dev->pkt_xfer = NULL;
+	ec_dev->pkt_xfer = cros_ec_pkt_xfer_i2c;
 	ec_dev->ec_name = client->name;
 	ec_dev->phys_name = client->adapter->name;
-	ec_dev->din_size = sizeof(struct ec_host_response) +
+	ec_dev->din_size = sizeof(struct ec_host_response_i2c) +
 			   sizeof(struct ec_response_get_protocol_info);
-	ec_dev->dout_size = sizeof(struct ec_host_request);
+	ec_dev->dout_size = sizeof(struct ec_host_request_i2c);
 
 	err = cros_ec_register(ec_dev);
 	if (err) {
diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index 04da2f288ef8..4e6f2f6b1095 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -65,12 +65,6 @@
   */
 #define EC_SPI_RECOVERY_TIME_NS	(200 * 1000)
 
-/*
- * The EC is unresponsive for a time after a reboot command.  Add a
- * simple delay to make sure that the bus stays locked.
- */
-#define EC_REBOOT_DELAY_MS	50
-
 /**
  * struct cros_ec_spi - information about a SPI-connected EC
  *
@@ -87,7 +81,7 @@ struct cros_ec_spi {
 };
 
 static void debug_packet(struct device *dev, const char *name, u8 *ptr,
-			  int len)
+			 int len)
 {
 #ifdef DEBUG
 	int i;
@@ -100,6 +94,172 @@ static void debug_packet(struct device *dev, const char *name, u8 *ptr,
 #endif
 }
 
+static int terminate_request(struct cros_ec_device *ec_dev)
+{
+	struct cros_ec_spi *ec_spi = ec_dev->priv;
+	struct spi_message msg;
+	struct spi_transfer trans;
+	int ret;
+
+	/*
+	 * Turn off CS, possibly adding a delay to ensure the rising edge
+	 * doesn't come too soon after the end of the data.
+	 */
+	spi_message_init(&msg);
+	memset(&trans, 0, sizeof(trans));
+	trans.delay_usecs = ec_spi->end_of_msg_delay;
+	spi_message_add_tail(&trans, &msg);
+
+	ret = spi_sync(ec_spi->spi, &msg);
+
+	/* Reset end-of-response timer */
+	ec_spi->last_transfer_ns = ktime_get_ns();
+	if (ret < 0) {
+		dev_err(ec_dev->dev,
+			"cs-deassert spi transfer failed: %d\n",
+			ret);
+	}
+
+	return ret;
+}
+
+/**
+ * receive_n_bytes - receive n bytes from the EC.
+ *
+ * Assumes buf is a pointer into the ec_dev->din buffer
+ */
+static int receive_n_bytes(struct cros_ec_device *ec_dev, u8 *buf, int n)
+{
+	struct cros_ec_spi *ec_spi = ec_dev->priv;
+	struct spi_transfer trans;
+	struct spi_message msg;
+	int ret;
+
+	BUG_ON(buf - ec_dev->din + n > ec_dev->din_size);
+
+	memset(&trans, 0, sizeof(trans));
+	trans.cs_change = 1;
+	trans.rx_buf = buf;
+	trans.len = n;
+
+	spi_message_init(&msg);
+	spi_message_add_tail(&trans, &msg);
+	ret = spi_sync(ec_spi->spi, &msg);
+	if (ret < 0)
+		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
+
+	return ret;
+}
+
+/**
+ * cros_ec_spi_receive_packet - Receive a packet from the EC.
+ *
+ * This function has two phases: reading the preamble bytes (since if we read
+ * data from the EC before it is ready to send, we just get preamble) and
+ * reading the actual message.
+ *
+ * The received data is placed into ec_dev->din.
+ *
+ * @ec_dev: ChromeOS EC device
+ * @need_len: Number of message bytes we need to read
+ */
+static int cros_ec_spi_receive_packet(struct cros_ec_device *ec_dev,
+				      int need_len)
+{
+	struct ec_host_response *response;
+	u8 *ptr, *end;
+	int ret;
+	unsigned long deadline;
+	int todo;
+
+	BUG_ON(EC_MSG_PREAMBLE_COUNT > ec_dev->din_size);
+
+	/* Receive data until we see the header byte */
+	deadline = jiffies + msecs_to_jiffies(EC_MSG_DEADLINE_MS);
+	while (true) {
+		unsigned long start_jiffies = jiffies;
+
+		ret = receive_n_bytes(ec_dev,
+				      ec_dev->din,
+				      EC_MSG_PREAMBLE_COUNT);
+		if (ret < 0)
+			return ret;
+
+		ptr = ec_dev->din;
+		for (end = ptr + EC_MSG_PREAMBLE_COUNT; ptr != end; ptr++) {
+			if (*ptr == EC_SPI_FRAME_START) {
+				dev_dbg(ec_dev->dev, "msg found at %zd\n",
+					ptr - ec_dev->din);
+				break;
+			}
+		}
+		if (ptr != end)
+			break;
+
+		/*
+		 * Use the time at the start of the loop as a timeout.  This
+		 * gives us one last shot at getting the transfer and is useful
+		 * in case we got context switched out for a while.
+		 */
+		if (time_after(start_jiffies, deadline)) {
+			dev_warn(ec_dev->dev, "EC failed to respond in time\n");
+			return -ETIMEDOUT;
+		}
+	}
+
+	/*
+	 * ptr now points to the header byte. Copy any valid data to the
+	 * start of our buffer
+	 */
+	todo = end - ++ptr;
+	BUG_ON(todo < 0 || todo > ec_dev->din_size);
+	todo = min(todo, need_len);
+	memmove(ec_dev->din, ptr, todo);
+	ptr = ec_dev->din + todo;
+	dev_dbg(ec_dev->dev, "need %d, got %d bytes from preamble\n",
+		need_len, todo);
+	need_len -= todo;
+
+	/* If the entire response struct wasn't read, get the rest of it. */
+	if (todo < sizeof(*response)) {
+		ret = receive_n_bytes(ec_dev, ptr, sizeof(*response) - todo);
+		if (ret < 0)
+			return -EBADMSG;
+		ptr += (sizeof(*response) - todo);
+		todo = sizeof(*response);
+	}
+
+	response = (struct ec_host_response *)ec_dev->din;
+
+	/* Abort if data_len is too large. */
+	if (response->data_len > ec_dev->din_size)
+		return -EMSGSIZE;
+
+	/* Receive data until we have it all */
+	while (need_len > 0) {
+		/*
+		 * We can't support transfers larger than the SPI FIFO size
+		 * unless we have DMA. We don't have DMA on the ISP SPI ports
+		 * for Exynos. We need a way of asking SPI driver for
+		 * maximum-supported transfer size.
+		 */
+		todo = min(need_len, 256);
+		dev_dbg(ec_dev->dev, "loop, todo=%d, need_len=%d, ptr=%zd\n",
+			todo, need_len, ptr - ec_dev->din);
+
+		ret = receive_n_bytes(ec_dev, ptr, todo);
+		if (ret < 0)
+			return ret;
+
+		ptr += todo;
+		need_len -= todo;
+	}
+
+	dev_dbg(ec_dev->dev, "loop done, ptr=%zd\n", ptr - ec_dev->din);
+
+	return 0;
+}
+
 /**
  * cros_ec_spi_receive_response - Receive a response from the EC.
  *
@@ -115,34 +275,27 @@ static void debug_packet(struct device *dev, const char *name, u8 *ptr,
 static int cros_ec_spi_receive_response(struct cros_ec_device *ec_dev,
 					int need_len)
 {
-	struct cros_ec_spi *ec_spi = ec_dev->priv;
-	struct spi_transfer trans;
-	struct spi_message msg;
 	u8 *ptr, *end;
 	int ret;
 	unsigned long deadline;
 	int todo;
 
+	BUG_ON(EC_MSG_PREAMBLE_COUNT > ec_dev->din_size);
+
 	/* Receive data until we see the header byte */
 	deadline = jiffies + msecs_to_jiffies(EC_MSG_DEADLINE_MS);
 	while (true) {
 		unsigned long start_jiffies = jiffies;
 
-		memset(&trans, 0, sizeof(trans));
-		trans.cs_change = 1;
-		trans.rx_buf = ptr = ec_dev->din;
-		trans.len = EC_MSG_PREAMBLE_COUNT;
-
-		spi_message_init(&msg);
-		spi_message_add_tail(&trans, &msg);
-		ret = spi_sync(ec_spi->spi, &msg);
-		if (ret < 0) {
-			dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
+		ret = receive_n_bytes(ec_dev,
+				      ec_dev->din,
+				      EC_MSG_PREAMBLE_COUNT);
+		if (ret < 0)
 			return ret;
-		}
 
+		ptr = ec_dev->din;
 		for (end = ptr + EC_MSG_PREAMBLE_COUNT; ptr != end; ptr++) {
-			if (*ptr == EC_MSG_HEADER) {
+			if (*ptr == EC_SPI_FRAME_START) {
 				dev_dbg(ec_dev->dev, "msg found at %zd\n",
 					ptr - ec_dev->din);
 				break;
@@ -187,21 +340,9 @@ static int cros_ec_spi_receive_response(struct cros_ec_device *ec_dev,
 		dev_dbg(ec_dev->dev, "loop, todo=%d, need_len=%d, ptr=%zd\n",
 			todo, need_len, ptr - ec_dev->din);
 
-		memset(&trans, 0, sizeof(trans));
-		trans.cs_change = 1;
-		trans.rx_buf = ptr;
-		trans.len = todo;
-		spi_message_init(&msg);
-		spi_message_add_tail(&trans, &msg);
-
-		/* send command to EC and read answer */
-		BUG_ON((u8 *)trans.rx_buf - ec_dev->din + todo >
-				ec_dev->din_size);
-		ret = spi_sync(ec_spi->spi, &msg);
-		if (ret < 0) {
-			dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
+		ret = receive_n_bytes(ec_dev, ptr, todo);
+		if (ret < 0)
 			return ret;
-		}
 
 		debug_packet(ec_dev->dev, "interim", ptr, todo);
 		ptr += todo;
@@ -213,6 +354,128 @@ static int cros_ec_spi_receive_response(struct cros_ec_device *ec_dev,
 	return 0;
 }
 
+/**
+ * cros_ec_pkt_xfer_spi - Transfer a packet over SPI and receive the reply
+ *
+ * @ec_dev: ChromeOS EC device
+ * @ec_msg: Message to transfer
+ */
+static int cros_ec_pkt_xfer_spi(struct cros_ec_device *ec_dev,
+				struct cros_ec_command *ec_msg)
+{
+	struct ec_host_request *request;
+	struct ec_host_response *response;
+	struct cros_ec_spi *ec_spi = ec_dev->priv;
+	struct spi_transfer trans;
+	struct spi_message msg;
+	int i, len;
+	u8 *ptr;
+	u8 *rx_buf;
+	u8 sum;
+	int ret = 0, final_ret;
+
+	len = cros_ec_prepare_tx(ec_dev, ec_msg);
+	request = (struct ec_host_request *)ec_dev->dout;
+	dev_dbg(ec_dev->dev, "prepared, len=%d\n", len);
+
+	/* If it's too soon to do another transaction, wait */
+	if (ec_spi->last_transfer_ns) {
+		unsigned long delay;	/* The delay completed so far */
+
+		delay = ktime_get_ns() - ec_spi->last_transfer_ns;
+		if (delay < EC_SPI_RECOVERY_TIME_NS)
+			ndelay(EC_SPI_RECOVERY_TIME_NS - delay);
+	}
+
+	rx_buf = kzalloc(len, GFP_KERNEL);
+	if (!rx_buf) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	/* Transmit phase - send our message */
+	memset(&trans, 0, sizeof(trans));
+	trans.tx_buf = ec_dev->dout;
+	trans.rx_buf = rx_buf;
+	trans.len = len;
+	trans.cs_change = 1;
+	spi_message_init(&msg);
+	spi_message_add_tail(&trans, &msg);
+	ret = spi_sync(ec_spi->spi, &msg);
+
+	/* Get the response */
+	if (!ret) {
+		/* Verify that EC can process command */
+		for (i = 0; i < len; i++) {
+			switch (rx_buf[i]) {
+			case EC_SPI_PAST_END:
+			case EC_SPI_RX_BAD_DATA:
+			case EC_SPI_NOT_READY:
+				ret = -EAGAIN;
+				ec_msg->result = EC_RES_IN_PROGRESS;
+			default:
+				break;
+			}
+			if (ret)
+				break;
+		}
+		if (!ret)
+			ret = cros_ec_spi_receive_packet(ec_dev,
+					ec_msg->insize + sizeof(*response));
+	} else {
+		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
+	}
+
+	final_ret = terminate_request(ec_dev);
+	if (!ret)
+		ret = final_ret;
+	if (ret < 0)
+		goto exit;
+
+	ptr = ec_dev->din;
+
+	/* check response error code */
+	response = (struct ec_host_response *)ptr;
+	ec_msg->result = response->result;
+
+	ret = cros_ec_check_result(ec_dev, ec_msg);
+	if (ret)
+		goto exit;
+
+	len = response->data_len;
+	sum = 0;
+	if (len > ec_msg->insize) {
+		dev_err(ec_dev->dev, "packet too long (%d bytes, expected %d)",
+			len, ec_msg->insize);
+		ret = -EMSGSIZE;
+		goto exit;
+	}
+
+	for (i = 0; i < sizeof(*response); i++)
+		sum += ptr[i];
+
+	/* copy response packet payload and compute checksum */
+	memcpy(ec_msg->data, ptr + sizeof(*response), len);
+	for (i = 0; i < len; i++)
+		sum += ec_msg->data[i];
+
+	if (sum) {
+		dev_err(ec_dev->dev,
+			"bad packet checksum, calculated %x\n",
+			sum);
+		ret = -EBADMSG;
+		goto exit;
+	}
+
+	ret = len;
+exit:
+	kfree(rx_buf);
+	if (ec_msg->command == EC_CMD_REBOOT_EC)
+		msleep(EC_REBOOT_DELAY_MS);
+
+	return ret;
+}
+
 /**
  * cros_ec_cmd_xfer_spi - Transfer a message over SPI and receive the reply
  *
@@ -227,6 +490,7 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 	struct spi_message msg;
 	int i, len;
 	u8 *ptr;
+	u8 *rx_buf;
 	int sum;
 	int ret = 0, final_ret;
 
@@ -242,10 +506,17 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 			ndelay(EC_SPI_RECOVERY_TIME_NS - delay);
 	}
 
+	rx_buf = kzalloc(len, GFP_KERNEL);
+	if (!rx_buf) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
 	/* Transmit phase - send our message */
 	debug_packet(ec_dev->dev, "out", ec_dev->dout, len);
 	memset(&trans, 0, sizeof(trans));
 	trans.tx_buf = ec_dev->dout;
+	trans.rx_buf = rx_buf;
 	trans.len = len;
 	trans.cs_change = 1;
 	spi_message_init(&msg);
@@ -254,29 +525,32 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 
 	/* Get the response */
 	if (!ret) {
-		ret = cros_ec_spi_receive_response(ec_dev,
-				ec_msg->insize + EC_MSG_TX_PROTO_BYTES);
+		/* Verify that EC can process command */
+		for (i = 0; i < len; i++) {
+			switch (rx_buf[i]) {
+			case EC_SPI_PAST_END:
+			case EC_SPI_RX_BAD_DATA:
+			case EC_SPI_NOT_READY:
+				ret = -EAGAIN;
+				ec_msg->result = EC_RES_IN_PROGRESS;
+			default:
+				break;
+			}
+			if (ret)
+				break;
+		}
+		if (!ret)
+			ret = cros_ec_spi_receive_response(ec_dev,
+					ec_msg->insize + EC_MSG_TX_PROTO_BYTES);
 	} else {
 		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
 	}
 
-	/*
-	 * Turn off CS, possibly adding a delay to ensure the rising edge
-	 * doesn't come too soon after the end of the data.
-	 */
-	spi_message_init(&msg);
-	memset(&trans, 0, sizeof(trans));
-	trans.delay_usecs = ec_spi->end_of_msg_delay;
-	spi_message_add_tail(&trans, &msg);
-
-	final_ret = spi_sync(ec_spi->spi, &msg);
-	ec_spi->last_transfer_ns = ktime_get_ns();
+	final_ret = terminate_request(ec_dev);
 	if (!ret)
 		ret = final_ret;
-	if (ret < 0) {
-		dev_err(ec_dev->dev, "spi transfer failed: %d\n", ret);
+	if (ret < 0)
 		goto exit;
-	}
 
 	ptr = ec_dev->din;
 
@@ -315,6 +589,7 @@ static int cros_ec_cmd_xfer_spi(struct cros_ec_device *ec_dev,
 
 	ret = len;
 exit:
+	kfree(rx_buf);
 	if (ec_msg->command == EC_CMD_REBOOT_EC)
 		msleep(EC_REBOOT_DELAY_MS);
 
@@ -361,7 +636,7 @@ static int cros_ec_spi_probe(struct spi_device *spi)
 	ec_dev->priv = ec_spi;
 	ec_dev->irq = spi->irq;
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_spi;
-	ec_dev->pkt_xfer = NULL;
+	ec_dev->pkt_xfer = cros_ec_pkt_xfer_spi;
 	ec_dev->ec_name = ec_spi->spi->modalias;
 	ec_dev->phys_name = dev_name(&ec_spi->spi->dev);
 	ec_dev->din_size = EC_MSG_PREAMBLE_COUNT +
@@ -369,6 +644,7 @@ static int cros_ec_spi_probe(struct spi_device *spi)
 			   sizeof(struct ec_response_get_protocol_info);
 	ec_dev->dout_size = sizeof(struct ec_host_request);
 
+
 	err = cros_ec_register(ec_dev);
 	if (err) {
 		dev_err(dev, "cannot register EC\n");
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 05aeb559275f..cac24d356c91 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -46,6 +46,77 @@ static int ec_response_timed_out(void)
 	return 1;
 }
 
+static int cros_ec_pkt_xfer_lpc(struct cros_ec_device *ec,
+				struct cros_ec_command *msg)
+{
+	struct ec_host_request *request;
+	struct ec_host_response response;
+	u8 sum = 0;
+	int i;
+	int ret = 0;
+	u8 *dout;
+
+	ret = cros_ec_prepare_tx(ec, msg);
+
+	/* Write buffer */
+	for (i = 0; i < ret; i++)
+		outb(ec->dout[i], EC_LPC_ADDR_HOST_PACKET + i);
+
+	request = (struct ec_host_request *)ec->dout;
+
+	/* Here we go */
+	outb(EC_COMMAND_PROTOCOL_3, EC_LPC_ADDR_HOST_CMD);
+
+	if (ec_response_timed_out()) {
+		dev_warn(ec->dev, "EC responsed timed out\n");
+		ret = -EIO;
+		goto done;
+	}
+
+	/* Check result */
+	msg->result = inb(EC_LPC_ADDR_HOST_DATA);
+	ret = cros_ec_check_result(ec, msg);
+	if (ret)
+		goto done;
+
+	/* Read back response */
+	dout = (u8 *)&response;
+	for (i = 0; i < sizeof(response); i++) {
+		dout[i] = inb(EC_LPC_ADDR_HOST_PACKET + i);
+		sum += dout[i];
+	}
+
+	msg->result = response.result;
+
+	if (response.data_len > msg->insize) {
+		dev_err(ec->dev,
+			"packet too long (%d bytes, expected %d)",
+			response.data_len, msg->insize);
+		ret = -EMSGSIZE;
+		goto done;
+	}
+
+	/* Read response and process checksum */
+	for (i = 0; i < response.data_len; i++) {
+		msg->data[i] =
+			inb(EC_LPC_ADDR_HOST_PACKET + sizeof(response) + i);
+		sum += msg->data[i];
+	}
+
+	if (sum) {
+		dev_err(ec->dev,
+			"bad packet checksum %02x\n",
+			response.checksum);
+		ret = -EBADMSG;
+		goto done;
+	}
+
+	/* Return actual amount of data received */
+	ret = response.data_len;
+done:
+	return ret;
+}
+
 static int cros_ec_cmd_xfer_lpc(struct cros_ec_device *ec,
 				struct cros_ec_command *msg)
 {
@@ -215,7 +286,7 @@ static int cros_ec_lpc_probe(struct platform_device *pdev)
 	ec_dev->ec_name = pdev->name;
 	ec_dev->phys_name = dev_name(dev);
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc;
-	ec_dev->pkt_xfer = NULL;
+	ec_dev->pkt_xfer = cros_ec_pkt_xfer_lpc;
 	ec_dev->cmd_readmem = cros_ec_lpc_readmem;
 	ec_dev->din_size = sizeof(struct ec_host_response) +
 			   sizeof(struct ec_response_get_protocol_info);
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 59d909434efd..92e13aaa450c 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -21,6 +21,12 @@
 #include <linux/mfd/cros_ec_commands.h>
 #include <linux/mutex.h>
 
+/*
+ * The EC is unresponsive for a time after a reboot command.  Add a
+ * simple delay to make sure that the bus stays locked.
+ */
+#define EC_REBOOT_DELAY_MS             50
+
 /*
  * Max bus-specific overhead incurred by request/responses.
  * I2C requires 1 additional byte for requests.
-- 
cgit v1.2.3


From 57b33ff077beebb68481a2b6b8e5fe58ca998169 Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Tue, 9 Jun 2015 13:04:47 +0200
Subject: mfd: cros_ec: Support multiple EC in a system

Chromebooks can have more than one Embedded Controller so the
cros_ec device id has to be incremented for each EC registered.

Add a new structure to represent multiple EC as different char
devices (e.g: /dev/cros_ec, /dev/cros_pd). It connects to
cros_ec_device and allows sysfs inferface for cros_pd.

Also reduce number of allocated objects, make chromeos sysfs
class object a static and add refcounting to prevent object
deletion while command is in progress.

Signed-off-by: Gwendal Grignou <gwendal@chromium.org>
Reviewed-by: Dmitry Torokhov <dtor@chromium.org>
Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Tested-by: Heiko Stuebner <heiko@sntech.de>
Acked-by: Lee Jones <lee.jones@linaro.org>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/input/keyboard/cros_ec_keyb.c      |   2 +-
 drivers/mfd/cros_ec.c                      |  54 ++++++++++--
 drivers/mfd/cros_ec_i2c.c                  |   1 -
 drivers/mfd/cros_ec_spi.c                  |   1 -
 drivers/platform/chrome/cros_ec_dev.c      | 130 ++++++++++++++++++++---------
 drivers/platform/chrome/cros_ec_dev.h      |   7 --
 drivers/platform/chrome/cros_ec_lightbar.c |  75 +++++++++--------
 drivers/platform/chrome/cros_ec_lpc.c      |   1 -
 drivers/platform/chrome/cros_ec_sysfs.c    |  48 +++++------
 include/linux/mfd/cros_ec.h                |  44 ++++++++--
 10 files changed, 236 insertions(+), 127 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c
index 974154a74505..b01966dc7eb3 100644
--- a/drivers/input/keyboard/cros_ec_keyb.c
+++ b/drivers/input/keyboard/cros_ec_keyb.c
@@ -275,7 +275,7 @@ static int cros_ec_keyb_probe(struct platform_device *pdev)
 	ckdev->dev = dev;
 	dev_set_drvdata(&pdev->dev, ckdev);
 
-	idev->name = ec->ec_name;
+	idev->name = CROS_EC_DEV_NAME;
 	idev->phys = ec->phys_name;
 	__set_bit(EV_REP, idev->evbit);
 
diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c
index 08d82bfc5268..11b1884bce62 100644
--- a/drivers/mfd/cros_ec.c
+++ b/drivers/mfd/cros_ec.c
@@ -24,11 +24,29 @@
 #include <linux/mfd/core.h>
 #include <linux/mfd/cros_ec.h>
 
-static const struct mfd_cell cros_devs[] = {
-	{
-		.name = "cros-ec-ctl",
-		.id = PLATFORM_DEVID_AUTO,
-	},
+#define CROS_EC_DEV_EC_INDEX 0
+#define CROS_EC_DEV_PD_INDEX 1
+
+struct cros_ec_platform ec_p = {
+	.ec_name = CROS_EC_DEV_NAME,
+	.cmd_offset = EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_EC_INDEX),
+};
+
+struct cros_ec_platform pd_p = {
+	.ec_name = CROS_EC_DEV_PD_NAME,
+	.cmd_offset = EC_CMD_PASSTHRU_OFFSET(CROS_EC_DEV_PD_INDEX),
+};
+
+struct mfd_cell ec_cell = {
+	.name = "cros-ec-ctl",
+	.platform_data = &ec_p,
+	.pdata_size = sizeof(ec_p),
+};
+
+struct mfd_cell ec_pd_cell = {
+	.name = "cros-ec-ctl",
+	.platform_data = &pd_p,
+	.pdata_size = sizeof(pd_p),
 };
 
 int cros_ec_register(struct cros_ec_device *ec_dev)
@@ -52,14 +70,34 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 
 	cros_ec_query_all(ec_dev);
 
-	err = mfd_add_devices(dev, 0, cros_devs,
-			      ARRAY_SIZE(cros_devs),
+	err = mfd_add_devices(ec_dev->dev, PLATFORM_DEVID_AUTO, &ec_cell, 1,
 			      NULL, ec_dev->irq, NULL);
 	if (err) {
-		dev_err(dev, "failed to add mfd devices\n");
+		dev_err(dev,
+			"Failed to register Embedded Controller subdevice %d\n",
+			err);
 		return err;
 	}
 
+	if (ec_dev->max_passthru) {
+		/*
+		 * Register a PD device as well on top of this device.
+		 * We make the following assumptions:
+		 * - behind an EC, we have a pd
+		 * - only one device added.
+		 * - the EC is responsive at init time (it is not true for a
+		 *   sensor hub.
+		 */
+		err = mfd_add_devices(ec_dev->dev, PLATFORM_DEVID_AUTO,
+				      &ec_pd_cell, 1, NULL, ec_dev->irq, NULL);
+		if (err) {
+			dev_err(dev,
+				"Failed to register Power Delivery subdevice %d\n",
+				err);
+			return err;
+		}
+	}
+
 	if (IS_ENABLED(CONFIG_OF) && dev->of_node) {
 		err = of_platform_populate(dev->of_node, NULL, NULL, dev);
 		if (err) {
diff --git a/drivers/mfd/cros_ec_i2c.c b/drivers/mfd/cros_ec_i2c.c
index 22e8a4ae1711..b9a0963ca5c3 100644
--- a/drivers/mfd/cros_ec_i2c.c
+++ b/drivers/mfd/cros_ec_i2c.c
@@ -302,7 +302,6 @@ static int cros_ec_i2c_probe(struct i2c_client *client,
 	ec_dev->irq = client->irq;
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_i2c;
 	ec_dev->pkt_xfer = cros_ec_pkt_xfer_i2c;
-	ec_dev->ec_name = client->name;
 	ec_dev->phys_name = client->adapter->name;
 	ec_dev->din_size = sizeof(struct ec_host_response_i2c) +
 			   sizeof(struct ec_response_get_protocol_info);
diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index 4e6f2f6b1095..faba03e2f1ef 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -637,7 +637,6 @@ static int cros_ec_spi_probe(struct spi_device *spi)
 	ec_dev->irq = spi->irq;
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_spi;
 	ec_dev->pkt_xfer = cros_ec_pkt_xfer_spi;
-	ec_dev->ec_name = ec_spi->spi->modalias;
 	ec_dev->phys_name = dev_name(&ec_spi->spi->dev);
 	ec_dev->din_size = EC_MSG_PREAMBLE_COUNT +
 			   sizeof(struct ec_host_response) +
diff --git a/drivers/platform/chrome/cros_ec_dev.c b/drivers/platform/chrome/cros_ec_dev.c
index e91ced1cb8ce..e8fcdc237029 100644
--- a/drivers/platform/chrome/cros_ec_dev.c
+++ b/drivers/platform/chrome/cros_ec_dev.c
@@ -27,11 +27,22 @@
 
 /* Device variables */
 #define CROS_MAX_DEV 128
-static struct class *cros_class;
 static int ec_major;
 
+static const struct attribute_group *cros_ec_groups[] = {
+	&cros_ec_attr_group,
+	&cros_ec_lightbar_attr_group,
+	NULL,
+};
+
+static struct class cros_class = {
+	.owner          = THIS_MODULE,
+	.name           = "chromeos",
+	.dev_groups     = cros_ec_groups,
+};
+
 /* Basic communication */
-static int ec_get_version(struct cros_ec_device *ec, char *str, int maxlen)
+static int ec_get_version(struct cros_ec_dev *ec, char *str, int maxlen)
 {
 	struct ec_response_get_version *resp;
 	static const char * const current_image_name[] = {
@@ -45,11 +56,11 @@ static int ec_get_version(struct cros_ec_device *ec, char *str, int maxlen)
 		return -ENOMEM;
 
 	msg->version = 0;
-	msg->command = EC_CMD_GET_VERSION;
+	msg->command = EC_CMD_GET_VERSION + ec->cmd_offset;
 	msg->insize = sizeof(*resp);
 	msg->outsize = 0;
 
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0)
 		goto exit;
 
@@ -78,8 +89,10 @@ exit:
 /* Device file ops */
 static int ec_device_open(struct inode *inode, struct file *filp)
 {
-	filp->private_data = container_of(inode->i_cdev,
-					  struct cros_ec_device, cdev);
+	struct cros_ec_dev *ec = container_of(inode->i_cdev,
+					      struct cros_ec_dev, cdev);
+	filp->private_data = ec;
+	nonseekable_open(inode, filp);
 	return 0;
 }
 
@@ -91,7 +104,7 @@ static int ec_device_release(struct inode *inode, struct file *filp)
 static ssize_t ec_device_read(struct file *filp, char __user *buffer,
 			      size_t length, loff_t *offset)
 {
-	struct cros_ec_device *ec = filp->private_data;
+	struct cros_ec_dev *ec = filp->private_data;
 	char msg[sizeof(struct ec_response_get_version) +
 		 sizeof(CROS_EC_DEV_VERSION)];
 	size_t count;
@@ -114,7 +127,7 @@ static ssize_t ec_device_read(struct file *filp, char __user *buffer,
 }
 
 /* Ioctls */
-static long ec_device_ioctl_xcmd(struct cros_ec_device *ec, void __user *arg)
+static long ec_device_ioctl_xcmd(struct cros_ec_dev *ec, void __user *arg)
 {
 	long ret;
 	struct cros_ec_command u_cmd;
@@ -133,7 +146,8 @@ static long ec_device_ioctl_xcmd(struct cros_ec_device *ec, void __user *arg)
 		goto exit;
 	}
 
-	ret = cros_ec_cmd_xfer(ec, s_cmd);
+	s_cmd->command += ec->cmd_offset;
+	ret = cros_ec_cmd_xfer(ec->ec_dev, s_cmd);
 	/* Only copy data to userland if data was received. */
 	if (ret < 0)
 		goto exit;
@@ -145,19 +159,21 @@ exit:
 	return ret;
 }
 
-static long ec_device_ioctl_readmem(struct cros_ec_device *ec, void __user *arg)
+static long ec_device_ioctl_readmem(struct cros_ec_dev *ec, void __user *arg)
 {
+	struct cros_ec_device *ec_dev = ec->ec_dev;
 	struct cros_ec_readmem s_mem = { };
 	long num;
 
 	/* Not every platform supports direct reads */
-	if (!ec->cmd_readmem)
+	if (!ec_dev->cmd_readmem)
 		return -ENOTTY;
 
 	if (copy_from_user(&s_mem, arg, sizeof(s_mem)))
 		return -EFAULT;
 
-	num = ec->cmd_readmem(ec, s_mem.offset, s_mem.bytes, s_mem.buffer);
+	num = ec_dev->cmd_readmem(ec_dev, s_mem.offset, s_mem.bytes,
+				  s_mem.buffer);
 	if (num <= 0)
 		return num;
 
@@ -170,7 +186,7 @@ static long ec_device_ioctl_readmem(struct cros_ec_device *ec, void __user *arg)
 static long ec_device_ioctl(struct file *filp, unsigned int cmd,
 			    unsigned long arg)
 {
-	struct cros_ec_device *ec = filp->private_data;
+	struct cros_ec_dev *ec = filp->private_data;
 
 	if (_IOC_TYPE(cmd) != CROS_EC_DEV_IOC)
 		return -ENOTTY;
@@ -193,45 +209,81 @@ static const struct file_operations fops = {
 	.unlocked_ioctl = ec_device_ioctl,
 };
 
+static void __remove(struct device *dev)
+{
+	struct cros_ec_dev *ec = container_of(dev, struct cros_ec_dev,
+					      class_dev);
+	kfree(ec);
+}
+
 static int ec_device_probe(struct platform_device *pdev)
 {
-	struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent);
-	int retval = -ENOTTY;
-	dev_t devno = MKDEV(ec_major, 0);
+	int retval = -ENOMEM;
+	struct device *dev = &pdev->dev;
+	struct cros_ec_platform *ec_platform = dev_get_platdata(dev);
+	dev_t devno = MKDEV(ec_major, pdev->id);
+	struct cros_ec_dev *ec = kzalloc(sizeof(*ec), GFP_KERNEL);
+
+	if (!ec)
+		return retval;
 
-	/* Instantiate it (and remember the EC) */
+	dev_set_drvdata(dev, ec);
+	ec->ec_dev = dev_get_drvdata(dev->parent);
+	ec->dev = dev;
+	ec->cmd_offset = ec_platform->cmd_offset;
+	device_initialize(&ec->class_dev);
 	cdev_init(&ec->cdev, &fops);
 
+	/*
+	 * Add the character device
+	 * Link cdev to the class device to be sure device is not used
+	 * before unbinding it.
+	 */
+	ec->cdev.kobj.parent = &ec->class_dev.kobj;
 	retval = cdev_add(&ec->cdev, devno, 1);
 	if (retval) {
-		dev_err(&pdev->dev, ": failed to add character device\n");
-		return retval;
+		dev_err(dev, ": failed to add character device\n");
+		goto cdev_add_failed;
 	}
 
-	ec->vdev = device_create(cros_class, NULL, devno, ec,
-				 CROS_EC_DEV_NAME);
-	if (IS_ERR(ec->vdev)) {
-		retval = PTR_ERR(ec->vdev);
-		dev_err(&pdev->dev, ": failed to create device\n");
-		cdev_del(&ec->cdev);
-		return retval;
+	/*
+	 * Add the class device
+	 * Link to the character device for creating the /dev entry
+	 * in devtmpfs.
+	 */
+	ec->class_dev.devt = ec->cdev.dev;
+	ec->class_dev.class = &cros_class;
+	ec->class_dev.parent = dev;
+	ec->class_dev.release = __remove;
+
+	retval = dev_set_name(&ec->class_dev, "%s", ec_platform->ec_name);
+	if (retval) {
+		dev_err(dev, "dev_set_name failed => %d\n", retval);
+		goto set_named_failed;
 	}
 
-	/* Initialize extra interfaces */
-	ec_dev_sysfs_init(ec);
-	ec_dev_lightbar_init(ec);
+	retval = device_add(&ec->class_dev);
+	if (retval) {
+		dev_err(dev, "device_register failed => %d\n", retval);
+		goto dev_reg_failed;
+	}
 
 	return 0;
+
+dev_reg_failed:
+set_named_failed:
+	dev_set_drvdata(dev, NULL);
+	cdev_del(&ec->cdev);
+cdev_add_failed:
+	kfree(ec);
+	return retval;
 }
 
 static int ec_device_remove(struct platform_device *pdev)
 {
-	struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent);
-
-	ec_dev_lightbar_remove(ec);
-	ec_dev_sysfs_remove(ec);
-	device_destroy(cros_class, MKDEV(ec_major, 0));
+	struct cros_ec_dev *ec = dev_get_drvdata(&pdev->dev);
 	cdev_del(&ec->cdev);
+	device_unregister(&ec->class_dev);
 	return 0;
 }
 
@@ -248,10 +300,10 @@ static int __init cros_ec_dev_init(void)
 	int ret;
 	dev_t dev = 0;
 
-	cros_class = class_create(THIS_MODULE, "chromeos");
-	if (IS_ERR(cros_class)) {
+	ret  = class_register(&cros_class);
+	if (ret) {
 		pr_err(CROS_EC_DEV_NAME ": failed to register device class\n");
-		return PTR_ERR(cros_class);
+		return ret;
 	}
 
 	/* Get a range of minor numbers (starting with 0) to work with */
@@ -273,7 +325,7 @@ static int __init cros_ec_dev_init(void)
 failed_devreg:
 	unregister_chrdev_region(MKDEV(ec_major, 0), CROS_MAX_DEV);
 failed_chrdevreg:
-	class_destroy(cros_class);
+	class_unregister(&cros_class);
 	return ret;
 }
 
@@ -281,7 +333,7 @@ static void __exit cros_ec_dev_exit(void)
 {
 	platform_driver_unregister(&cros_ec_dev_driver);
 	unregister_chrdev(ec_major, CROS_EC_DEV_NAME);
-	class_destroy(cros_class);
+	class_unregister(&cros_class);
 }
 
 module_init(cros_ec_dev_init);
diff --git a/drivers/platform/chrome/cros_ec_dev.h b/drivers/platform/chrome/cros_ec_dev.h
index 45d67f7e518c..bfd2c84c3571 100644
--- a/drivers/platform/chrome/cros_ec_dev.h
+++ b/drivers/platform/chrome/cros_ec_dev.h
@@ -24,7 +24,6 @@
 #include <linux/types.h>
 #include <linux/mfd/cros_ec.h>
 
-#define CROS_EC_DEV_NAME "cros_ec"
 #define CROS_EC_DEV_VERSION "1.0.0"
 
 /*
@@ -44,10 +43,4 @@ struct cros_ec_readmem {
 #define CROS_EC_DEV_IOCXCMD   _IOWR(CROS_EC_DEV_IOC, 0, struct cros_ec_command)
 #define CROS_EC_DEV_IOCRDMEM  _IOWR(CROS_EC_DEV_IOC, 1, struct cros_ec_readmem)
 
-void ec_dev_sysfs_init(struct cros_ec_device *);
-void ec_dev_sysfs_remove(struct cros_ec_device *);
-
-void ec_dev_lightbar_init(struct cros_ec_device *);
-void ec_dev_lightbar_remove(struct cros_ec_device *);
-
 #endif /* _CROS_EC_DEV_H_ */
diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index 6e1986a2dce1..144e09df9b84 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -92,7 +92,7 @@ out:
 	return ret;
 }
 
-static struct cros_ec_command *alloc_lightbar_cmd_msg(void)
+static struct cros_ec_command *alloc_lightbar_cmd_msg(struct cros_ec_dev *ec)
 {
 	struct cros_ec_command *msg;
 	int len;
@@ -105,14 +105,14 @@ static struct cros_ec_command *alloc_lightbar_cmd_msg(void)
 		return NULL;
 
 	msg->version = 0;
-	msg->command = EC_CMD_LIGHTBAR_CMD;
+	msg->command = EC_CMD_LIGHTBAR_CMD + ec->cmd_offset;
 	msg->outsize = sizeof(struct ec_params_lightbar);
 	msg->insize = sizeof(struct ec_response_lightbar);
 
 	return msg;
 }
 
-static int get_lightbar_version(struct cros_ec_device *ec,
+static int get_lightbar_version(struct cros_ec_dev *ec,
 				uint32_t *ver_ptr, uint32_t *flg_ptr)
 {
 	struct ec_params_lightbar *param;
@@ -120,13 +120,13 @@ static int get_lightbar_version(struct cros_ec_device *ec,
 	struct cros_ec_command *msg;
 	int ret;
 
-	msg = alloc_lightbar_cmd_msg();
+	msg = alloc_lightbar_cmd_msg(ec);
 	if (!msg)
 		return 0;
 
 	param = (struct ec_params_lightbar *)msg->data;
 	param->cmd = LIGHTBAR_CMD_VERSION;
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0) {
 		ret = 0;
 		goto exit;
@@ -165,7 +165,8 @@ static ssize_t version_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
 	uint32_t version = 0, flags = 0;
-	struct cros_ec_device *ec = dev_get_drvdata(dev);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
 	int ret;
 
 	ret = lb_throttle();
@@ -187,12 +188,13 @@ static ssize_t brightness_store(struct device *dev,
 	struct cros_ec_command *msg;
 	int ret;
 	unsigned int val;
-	struct cros_ec_device *ec = dev_get_drvdata(dev);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
 
 	if (kstrtouint(buf, 0, &val))
 		return -EINVAL;
 
-	msg = alloc_lightbar_cmd_msg();
+	msg = alloc_lightbar_cmd_msg(ec);
 	if (!msg)
 		return -ENOMEM;
 
@@ -203,7 +205,7 @@ static ssize_t brightness_store(struct device *dev,
 	if (ret)
 		goto exit;
 
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0)
 		goto exit;
 
@@ -231,11 +233,12 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
 {
 	struct ec_params_lightbar *param;
 	struct cros_ec_command *msg;
-	struct cros_ec_device *ec = dev_get_drvdata(dev);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
 	unsigned int val[4];
 	int ret, i = 0, j = 0, ok = 0;
 
-	msg = alloc_lightbar_cmd_msg();
+	msg = alloc_lightbar_cmd_msg(ec);
 	if (!msg)
 		return -ENOMEM;
 
@@ -268,7 +271,7 @@ static ssize_t led_rgb_store(struct device *dev, struct device_attribute *attr,
 					return ret;
 			}
 
-			ret = cros_ec_cmd_xfer(ec, msg);
+			ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 			if (ret < 0)
 				goto exit;
 
@@ -304,9 +307,10 @@ static ssize_t sequence_show(struct device *dev,
 	struct ec_response_lightbar *resp;
 	struct cros_ec_command *msg;
 	int ret;
-	struct cros_ec_device *ec = dev_get_drvdata(dev);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
 
-	msg = alloc_lightbar_cmd_msg();
+	msg = alloc_lightbar_cmd_msg(ec);
 	if (!msg)
 		return -ENOMEM;
 
@@ -316,7 +320,7 @@ static ssize_t sequence_show(struct device *dev,
 	if (ret)
 		goto exit;
 
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0)
 		goto exit;
 
@@ -345,9 +349,10 @@ static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
 	struct cros_ec_command *msg;
 	unsigned int num;
 	int ret, len;
-	struct cros_ec_device *ec = dev_get_drvdata(dev);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
 
-	msg = alloc_lightbar_cmd_msg();
+	msg = alloc_lightbar_cmd_msg(ec);
 	if (!msg)
 		return -ENOMEM;
 
@@ -372,7 +377,7 @@ static ssize_t sequence_store(struct device *dev, struct device_attribute *attr,
 	if (ret)
 		return ret;
 
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0)
 		return ret;
 
@@ -397,25 +402,27 @@ static struct attribute *__lb_cmds_attrs[] = {
 	&dev_attr_sequence.attr,
 	NULL,
 };
-static struct attribute_group lb_cmds_attr_group = {
-	.name = "lightbar",
-	.attrs = __lb_cmds_attrs,
-};
 
-void ec_dev_lightbar_init(struct cros_ec_device *ec)
+static umode_t cros_ec_lightbar_attrs_are_visible(struct kobject *kobj,
+						  struct attribute *a, int n)
 {
-	int ret = 0;
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
+	struct platform_device *pdev = container_of(ec->dev,
+						   struct platform_device, dev);
+	if (pdev->id != 0)
+		return 0;
 
 	/* Only instantiate this stuff if the EC has a lightbar */
-	if (!get_lightbar_version(ec, NULL, NULL))
-		return;
-
-	ret = sysfs_create_group(&ec->vdev->kobj, &lb_cmds_attr_group);
-	if (ret)
-		pr_warn("sysfs_create_group() failed: %d\n", ret);
+	if (get_lightbar_version(ec, NULL, NULL))
+		return a->mode;
+	else
+		return 0;
 }
 
-void ec_dev_lightbar_remove(struct cros_ec_device *ec)
-{
-	sysfs_remove_group(&ec->vdev->kobj, &lb_cmds_attr_group);
-}
+struct attribute_group cros_ec_lightbar_attr_group = {
+	.name = "lightbar",
+	.attrs = __lb_cmds_attrs,
+	.is_visible = cros_ec_lightbar_attrs_are_visible,
+};
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index cac24d356c91..bdd77ce45f05 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -283,7 +283,6 @@ static int cros_ec_lpc_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, ec_dev);
 	ec_dev->dev = dev;
-	ec_dev->ec_name = pdev->name;
 	ec_dev->phys_name = dev_name(dev);
 	ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc;
 	ec_dev->pkt_xfer = cros_ec_pkt_xfer_lpc;
diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c
index 78ba82d670cb..f3baf9973989 100644
--- a/drivers/platform/chrome/cros_ec_sysfs.c
+++ b/drivers/platform/chrome/cros_ec_sysfs.c
@@ -72,7 +72,8 @@ static ssize_t store_ec_reboot(struct device *dev,
 	int got_cmd = 0, offset = 0;
 	int i;
 	int ret;
-	struct cros_ec_device *ec = dev_get_drvdata(dev);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
 
 	msg = kmalloc(sizeof(*msg) + sizeof(*param), GFP_KERNEL);
 	if (!msg)
@@ -112,10 +113,10 @@ static ssize_t store_ec_reboot(struct device *dev,
 	}
 
 	msg->version = 0;
-	msg->command = EC_CMD_REBOOT_EC;
+	msg->command = EC_CMD_REBOOT_EC + ec->cmd_offset;
 	msg->outsize = sizeof(*param);
 	msg->insize = 0;
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0) {
 		count = ret;
 		goto exit;
@@ -139,7 +140,8 @@ static ssize_t show_ec_version(struct device *dev,
 	struct cros_ec_command *msg;
 	int ret;
 	int count = 0;
-	struct cros_ec_device *ec = dev_get_drvdata(dev);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
 
 	msg = kmalloc(sizeof(*msg) + EC_HOST_PARAM_SIZE, GFP_KERNEL);
 	if (!msg)
@@ -147,10 +149,10 @@ static ssize_t show_ec_version(struct device *dev,
 
 	/* Get versions. RW may change. */
 	msg->version = 0;
-	msg->command = EC_CMD_GET_VERSION;
+	msg->command = EC_CMD_GET_VERSION + ec->cmd_offset;
 	msg->insize = sizeof(*r_ver);
 	msg->outsize = 0;
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0) {
 		count = ret;
 		goto exit;
@@ -175,9 +177,9 @@ static ssize_t show_ec_version(struct device *dev,
 			    image_names[r_ver->current_image] : "?"));
 
 	/* Get build info. */
-	msg->command = EC_CMD_GET_BUILD_INFO;
+	msg->command = EC_CMD_GET_BUILD_INFO + ec->cmd_offset;
 	msg->insize = EC_HOST_PARAM_SIZE;
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Build info:    XFER ERROR %d\n", ret);
@@ -191,9 +193,9 @@ static ssize_t show_ec_version(struct device *dev,
 	}
 
 	/* Get chip info. */
-	msg->command = EC_CMD_GET_CHIP_INFO;
+	msg->command = EC_CMD_GET_CHIP_INFO + ec->cmd_offset;
 	msg->insize = sizeof(*r_chip);
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Chip info:     XFER ERROR %d\n", ret);
@@ -215,9 +217,9 @@ static ssize_t show_ec_version(struct device *dev,
 	}
 
 	/* Get board version */
-	msg->command = EC_CMD_GET_BOARD_VERSION;
+	msg->command = EC_CMD_GET_BOARD_VERSION + ec->cmd_offset;
 	msg->insize = sizeof(*r_board);
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0)
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Board version: XFER ERROR %d\n", ret);
@@ -243,7 +245,8 @@ static ssize_t show_ec_flashinfo(struct device *dev,
 	struct ec_response_flash_info *resp;
 	struct cros_ec_command *msg;
 	int ret;
-	struct cros_ec_device *ec = dev_get_drvdata(dev);
+	struct cros_ec_dev *ec = container_of(dev,
+					      struct cros_ec_dev, class_dev);
 
 	msg = kmalloc(sizeof(*msg) + sizeof(*resp), GFP_KERNEL);
 	if (!msg)
@@ -251,10 +254,10 @@ static ssize_t show_ec_flashinfo(struct device *dev,
 
 	/* The flash info shouldn't ever change, but ask each time anyway. */
 	msg->version = 0;
-	msg->command = EC_CMD_FLASH_INFO;
+	msg->command = EC_CMD_FLASH_INFO + ec->cmd_offset;
 	msg->insize = sizeof(*resp);
 	msg->outsize = 0;
-	ret = cros_ec_cmd_xfer(ec, msg);
+	ret = cros_ec_cmd_xfer(ec->ec_dev, msg);
 	if (ret < 0)
 		goto exit;
 	if (msg->result != EC_RES_SUCCESS) {
@@ -288,20 +291,7 @@ static struct attribute *__ec_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group ec_attr_group = {
+struct attribute_group cros_ec_attr_group = {
 	.attrs = __ec_attrs,
 };
 
-void ec_dev_sysfs_init(struct cros_ec_device *ec)
-{
-	int error;
-
-	error = sysfs_create_group(&ec->vdev->kobj, &ec_attr_group);
-	if (error)
-		pr_warn("failed to create group: %d\n", error);
-}
-
-void ec_dev_sysfs_remove(struct cros_ec_device *ec)
-{
-	sysfs_remove_group(&ec->vdev->kobj, &ec_attr_group);
-}
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index 92e13aaa450c..da72671a42fa 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -17,10 +17,14 @@
 #define __LINUX_MFD_CROS_EC_H
 
 #include <linux/cdev.h>
+#include <linux/device.h>
 #include <linux/notifier.h>
 #include <linux/mfd/cros_ec_commands.h>
 #include <linux/mutex.h>
 
+#define CROS_EC_DEV_NAME "cros_ec"
+#define CROS_EC_DEV_PD_NAME "cros_pd"
+
 /*
  * The EC is unresponsive for a time after a reboot command.  Add a
  * simple delay to make sure that the bus stays locked.
@@ -71,11 +75,8 @@ struct cros_ec_command {
 /**
  * struct cros_ec_device - Information about a ChromeOS EC device
  *
- * @ec_name: name of EC device (e.g. 'chromeos-ec')
  * @phys_name: name of physical comms layer (e.g. 'i2c-4')
  * @dev: Device pointer for physical comms device
- * @vdev: Device pointer for virtual comms device
- * @cdev: Character device structure for virtual comms device
  * @was_wake_device: true if this device was set to wake the system from
  * sleep at the last suspend
  * @cmd_readmem: direct read of the EC memory-mapped region, if supported
@@ -87,6 +88,7 @@ struct cros_ec_command {
  *
  * @priv: Private data
  * @irq: Interrupt to use
+ * @id: Device id
  * @din: input buffer (for data from EC)
  * @dout: output buffer (for data to EC)
  * \note
@@ -109,11 +111,8 @@ struct cros_ec_command {
 struct cros_ec_device {
 
 	/* These are used by other drivers that want to talk to the EC */
-	const char *ec_name;
 	const char *phys_name;
 	struct device *dev;
-	struct device *vdev;
-	struct cdev cdev;
 	bool was_wake_device;
 	struct class *cros_class;
 	int (*cmd_readmem)(struct cros_ec_device *ec, unsigned int offset,
@@ -138,6 +137,35 @@ struct cros_ec_device {
 	struct mutex lock;
 };
 
+/* struct cros_ec_platform - ChromeOS EC platform information
+ *
+ * @ec_name: name of EC device (e.g. 'cros-ec', 'cros-pd', ...)
+ * used in /dev/ and sysfs.
+ * @cmd_offset: offset to apply for each command. Set when
+ * registering a devicde behind another one.
+ */
+struct cros_ec_platform {
+	const char *ec_name;
+	u16 cmd_offset;
+};
+
+/*
+ * struct cros_ec_dev - ChromeOS EC device entry point
+ *
+ * @class_dev: Device structure used in sysfs
+ * @cdev: Character device structure in /dev
+ * @ec_dev: cros_ec_device structure to talk to the physical device
+ * @dev: pointer to the platform device
+ * @cmd_offset: offset to apply for each command.
+ */
+struct cros_ec_dev {
+	struct device class_dev;
+	struct cdev cdev;
+	struct cros_ec_device *ec_dev;
+	struct device *dev;
+	u16 cmd_offset;
+};
+
 /**
  * cros_ec_suspend - Handle a suspend operation for the ChromeOS EC device
  *
@@ -224,4 +252,8 @@ int cros_ec_register(struct cros_ec_device *ec_dev);
  */
 int cros_ec_query_all(struct cros_ec_device *ec_dev);
 
+/* sysfs stuff */
+extern struct attribute_group cros_ec_attr_group;
+extern struct attribute_group cros_ec_lightbar_attr_group;
+
 #endif /* __LINUX_MFD_CROS_EC_H */
-- 
cgit v1.2.3


From d0562674838c08ff142c0e9a8e12634e133c4361 Mon Sep 17 00:00:00 2001
From: "Suthikulpanit, Suravee" <Suravee.Suthikulpanit@amd.com>
Date: Wed, 10 Jun 2015 11:08:52 -0500
Subject: ACPI / scan: Parse _CCA and setup device coherency

This patch implements support for ACPI _CCA object, which is introduced in
ACPIv5.1, can be used for specifying device DMA coherency attribute.

The parsing logic traverses device namespace to parse coherency
information, and stores it in acpi_device_flags. Then uses it to call
arch_setup_dma_ops() when creating each device enumerated in DSDT
during ACPI scan.

This patch also introduces acpi_dma_is_coherent(), which provides
an interface for device drivers to check the coherency information
similarly to the of_dma_is_coherent().

Signed-off-by: Mark Salter <msalter@redhat.com>
Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Kconfig         |  3 +++
 drivers/acpi/acpi_platform.c |  2 +-
 drivers/acpi/glue.c          |  5 +++++
 drivers/acpi/scan.c          | 35 +++++++++++++++++++++++++++++++++++
 include/acpi/acpi_bus.h      | 37 ++++++++++++++++++++++++++++++++++++-
 include/linux/acpi.h         |  5 +++++
 6 files changed, 85 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index ab2cbb51c6aa..212735f3f074 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -54,6 +54,9 @@ config ACPI_GENERIC_GSI
 config ACPI_SYSTEM_POWER_STATES_SUPPORT
 	bool
 
+config ACPI_CCA_REQUIRED
+	bool
+
 config ACPI_SLEEP
 	bool
 	depends on SUSPEND || HIBERNATION
diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index 4bf75597f732..06a67d5f2846 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -103,7 +103,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
 	pdevinfo.res = resources;
 	pdevinfo.num_res = count;
 	pdevinfo.fwnode = acpi_fwnode_handle(adev);
-	pdevinfo.dma_mask = DMA_BIT_MASK(32);
+	pdevinfo.dma_mask = acpi_check_dma(adev, NULL) ? DMA_BIT_MASK(32) : 0;
 	pdev = platform_device_register_full(&pdevinfo);
 	if (IS_ERR(pdev))
 		dev_err(&adev->dev, "platform device creation failed: %ld\n",
diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c
index 39c485b0c25c..b9657af751d1 100644
--- a/drivers/acpi/glue.c
+++ b/drivers/acpi/glue.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/rwsem.h>
 #include <linux/acpi.h>
+#include <linux/dma-mapping.h>
 
 #include "internal.h"
 
@@ -167,6 +168,7 @@ int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev)
 	struct list_head *physnode_list;
 	unsigned int node_id;
 	int retval = -EINVAL;
+	bool coherent;
 
 	if (has_acpi_companion(dev)) {
 		if (acpi_dev) {
@@ -223,6 +225,9 @@ int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev)
 	if (!has_acpi_companion(dev))
 		ACPI_COMPANION_SET(dev, acpi_dev);
 
+	if (acpi_check_dma(acpi_dev, &coherent))
+		arch_setup_dma_ops(dev, 0, 0, NULL, coherent);
+
 	acpi_physnode_link_name(physical_node_name, node_id);
 	retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj,
 				   physical_node_name);
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 0a099917a006..67509b23172c 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -11,6 +11,7 @@
 #include <linux/kthread.h>
 #include <linux/dmi.h>
 #include <linux/nls.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/pgtable.h>
 
@@ -2111,6 +2112,39 @@ void acpi_free_pnp_ids(struct acpi_device_pnp *pnp)
 	kfree(pnp->unique_id);
 }
 
+static void acpi_init_coherency(struct acpi_device *adev)
+{
+	unsigned long long cca = 0;
+	acpi_status status;
+	struct acpi_device *parent = adev->parent;
+
+	if (parent && parent->flags.cca_seen) {
+		/*
+		 * From ACPI spec, OSPM will ignore _CCA if an ancestor
+		 * already saw one.
+		 */
+		adev->flags.cca_seen = 1;
+		cca = parent->flags.coherent_dma;
+	} else {
+		status = acpi_evaluate_integer(adev->handle, "_CCA",
+					       NULL, &cca);
+		if (ACPI_SUCCESS(status))
+			adev->flags.cca_seen = 1;
+		else if (!IS_ENABLED(CONFIG_ACPI_CCA_REQUIRED))
+			/*
+			 * If architecture does not specify that _CCA is
+			 * required for DMA-able devices (e.g. x86),
+			 * we default to _CCA=1.
+			 */
+			cca = 1;
+		else
+			acpi_handle_debug(adev->handle,
+					  "ACPI device is missing _CCA.\n");
+	}
+
+	adev->flags.coherent_dma = cca;
+}
+
 void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
 			     int type, unsigned long long sta)
 {
@@ -2129,6 +2163,7 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
 	device->flags.visited = false;
 	device_initialize(&device->dev);
 	dev_set_uevent_suppress(&device->dev, true);
+	acpi_init_coherency(device);
 }
 
 void acpi_device_add_finalize(struct acpi_device *device)
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index da079976971f..54df54d00cd2 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -209,7 +209,9 @@ struct acpi_device_flags {
 	u32 hotplug_notify:1;
 	u32 is_dock_station:1;
 	u32 of_compatible_ok:1;
-	u32 reserved:22;
+	u32 coherent_dma:1;
+	u32 cca_seen:1;
+	u32 reserved:20;
 };
 
 /* File System */
@@ -381,6 +383,39 @@ struct acpi_device {
 	void (*remove)(struct acpi_device *);
 };
 
+static inline bool acpi_check_dma(struct acpi_device *adev, bool *coherent)
+{
+	bool ret = false;
+
+	if (!adev)
+		return ret;
+
+	/**
+	 * Currently, we only support _CCA=1 (i.e. coherent_dma=1)
+	 * This should be equivalent to specifyig dma-coherent for
+	 * a device in OF.
+	 *
+	 * For the case when _CCA=0 (i.e. coherent_dma=0 && cca_seen=1),
+	 * There are two cases:
+	 * case 1. Do not support and disable DMA.
+	 * case 2. Support but rely on arch-specific cache maintenance for
+	 *         non-coherence DMA operations.
+	 * Currently, we implement case 1 above.
+	 *
+	 * For the case when _CCA is missing (i.e. cca_seen=0) and
+	 * platform specifies ACPI_CCA_REQUIRED, we do not support DMA,
+	 * and fallback to arch-specific default handling.
+	 *
+	 * See acpi_init_coherency() for more info.
+	 */
+	if (adev->flags.coherent_dma) {
+		ret = true;
+		if (coherent)
+			*coherent = adev->flags.coherent_dma;
+	}
+	return ret;
+}
+
 static inline bool is_acpi_node(struct fwnode_handle *fwnode)
 {
 	return fwnode && fwnode->type == FWNODE_ACPI;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e35e29c..d46a48c21c67 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -569,6 +569,11 @@ static inline int acpi_device_modalias(struct device *dev,
 	return -ENODEV;
 }
 
+static inline bool acpi_check_dma(struct acpi_device *adev, bool *coherent)
+{
+	return false;
+}
+
 #define ACPI_PTR(_ptr)	(NULL)
 
 #endif	/* !CONFIG_ACPI */
-- 
cgit v1.2.3


From 05ca556003b1d6b4df0b8831e4c07fad7f5bdd2c Mon Sep 17 00:00:00 2001
From: "Suthikulpanit, Suravee" <Suravee.Suthikulpanit@amd.com>
Date: Wed, 10 Jun 2015 11:08:54 -0500
Subject: device property: Introduces device_dma_is_coherent()

Currently, device drivers, which support both OF and ACPI,
need to call two separate APIs, of_dma_is_coherent() and
acpi_dma_is_coherent()) to determine device coherency attribute.

This patch simplifies this process by introducing a new device
property API, device_dma_is_coherent(), which calls the appropriate
interface based on the booting architecture.

Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 14 ++++++++++++++
 include/linux/property.h |  2 ++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 1d0b116cae95..e645852396ba 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -14,6 +14,7 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/property.h>
 
 /**
@@ -519,3 +520,16 @@ unsigned int device_get_child_node_count(struct device *dev)
 	return count;
 }
 EXPORT_SYMBOL_GPL(device_get_child_node_count);
+
+bool device_dma_is_coherent(struct device *dev)
+{
+	bool coherent = false;
+
+	if (IS_ENABLED(CONFIG_OF) && dev->of_node)
+		coherent = of_dma_is_coherent(dev->of_node);
+	else
+		acpi_check_dma(ACPI_COMPANION(dev), &coherent);
+
+	return coherent;
+}
+EXPORT_SYMBOL_GPL(device_dma_is_coherent);
diff --git a/include/linux/property.h b/include/linux/property.h
index de8bdf417a35..76ebde9c11d4 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -164,4 +164,6 @@ struct property_set {
 
 void device_add_property_set(struct device *dev, struct property_set *pset);
 
+bool device_dma_is_coherent(struct device *dev);
+
 #endif /* _LINUX_PROPERTY_H_ */
-- 
cgit v1.2.3


From 711bdde6a884354ddae8da2fcb495b2a9364cc90 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Jun 2015 09:57:30 -0700
Subject: netfilter: x_tables: remove XT_TABLE_INFO_SZ and a dereference.

After Florian patches, there is no need for XT_TABLE_INFO_SZ anymore :
Only one copy of table is kept, instead of one copy per cpu.

We also can avoid a dereference if we put table data right after
xt_table_info. It reduces register pressure and helps compiler.

Then, we attempt a kmalloc() if total size is under order-3 allocation,
to reduce TLB pressure, as in many cases, rules fit in 32 KB.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  5 +----
 net/ipv4/netfilter/arp_tables.c    |  4 ++--
 net/ipv4/netfilter/ip_tables.c     |  4 ++--
 net/ipv6/netfilter/ip6_tables.c    |  4 ++--
 net/netfilter/x_tables.c           | 32 ++++++++++++--------------------
 5 files changed, 19 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 9969d79dcde1..95693c4cebdd 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -225,12 +225,9 @@ struct xt_table_info {
 	unsigned int __percpu *stackptr;
 	void ***jumpstack;
 
-	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-	void *entries;
+	unsigned char entries[0] __aligned(8);
 };
 
-#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
-			  + nr_cpu_ids * sizeof(char *))
 int xt_register_target(struct xt_target *target);
 void xt_unregister_target(struct xt_target *target);
 int xt_register_targets(struct xt_target *target, unsigned int n);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index d75c139393fc..95c9b6eece25 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -256,7 +256,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	const struct arphdr *arp;
 	struct arpt_entry *e, *back;
 	const char *indev, *outdev;
-	void *table_base;
+	const void *table_base;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
 	unsigned int addend;
@@ -868,7 +868,7 @@ static int compat_table_info(const struct xt_table_info *info,
 			     struct xt_table_info *newinfo)
 {
 	struct arpt_entry *iter;
-	void *loc_cpu_entry;
+	const void *loc_cpu_entry;
 	int ret;
 
 	if (!newinfo || !info)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 6151500c2df8..6c72fbb7b49e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -938,7 +938,7 @@ copy_entries_to_user(unsigned int total_size,
 	struct xt_counters *counters;
 	const struct xt_table_info *private = table->private;
 	int ret = 0;
-	void *loc_cpu_entry;
+	const void *loc_cpu_entry;
 
 	counters = alloc_counters(table);
 	if (IS_ERR(counters))
@@ -1052,7 +1052,7 @@ static int compat_table_info(const struct xt_table_info *info,
 			     struct xt_table_info *newinfo)
 {
 	struct ipt_entry *iter;
-	void *loc_cpu_entry;
+	const void *loc_cpu_entry;
 	int ret;
 
 	if (!newinfo || !info)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 80a7f0d38aa4..3c35ced39b42 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -951,7 +951,7 @@ copy_entries_to_user(unsigned int total_size,
 	struct xt_counters *counters;
 	const struct xt_table_info *private = table->private;
 	int ret = 0;
-	void *loc_cpu_entry;
+	const void *loc_cpu_entry;
 
 	counters = alloc_counters(table);
 	if (IS_ERR(counters))
@@ -1065,7 +1065,7 @@ static int compat_table_info(const struct xt_table_info *info,
 			     struct xt_table_info *newinfo)
 {
 	struct ip6t_entry *iter;
-	void *loc_cpu_entry;
+	const void *loc_cpu_entry;
 	int ret;
 
 	if (!newinfo || !info)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 6062ce3e862c..d324fe71260c 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -658,29 +658,23 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
 
 struct xt_table_info *xt_alloc_table_info(unsigned int size)
 {
-	struct xt_table_info *newinfo;
+	struct xt_table_info *info = NULL;
+	size_t sz = sizeof(*info) + size;
 
 	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
 	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
 		return NULL;
 
-	newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
-	if (!newinfo)
-		return NULL;
-
-	newinfo->size = size;
-
-	if (size <= PAGE_SIZE)
-		newinfo->entries = kmalloc(size, GFP_KERNEL);
-	else
-		newinfo->entries = vmalloc(size);
-
-	if (newinfo->entries == NULL) {
-		xt_free_table_info(newinfo);
-		return NULL;
+	if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+		info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+	if (!info) {
+		info = vmalloc(sz);
+		if (!info)
+			return NULL;
 	}
-
-	return newinfo;
+	memset(info, 0, sizeof(*info));
+	info->size = size;
+	return info;
 }
 EXPORT_SYMBOL(xt_alloc_table_info);
 
@@ -688,8 +682,6 @@ void xt_free_table_info(struct xt_table_info *info)
 {
 	int cpu;
 
-	kvfree(info->entries);
-
 	if (info->jumpstack != NULL) {
 		for_each_possible_cpu(cpu)
 			kvfree(info->jumpstack[cpu]);
@@ -698,7 +690,7 @@ void xt_free_table_info(struct xt_table_info *info)
 
 	free_percpu(info->stackptr);
 
-	kfree(info);
+	kvfree(info);
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-- 
cgit v1.2.3


From 6f6a6fda294506dfe0e3e0a253bb2d2923f28f0a Mon Sep 17 00:00:00 2001
From: Joseph Qi <joseph.qi@huawei.com>
Date: Mon, 15 Jun 2015 14:36:01 -0400
Subject: jbd2: fix ocfs2 corrupt when updating journal superblock fails

If updating journal superblock fails after journal data has been
flushed, the error is omitted and this will mislead the caller as a
normal case.  In ocfs2, the checkpoint will be treated successfully
and the other node can get the lock to update. Since the sb_start is
still pointing to the old log block, it will rewrite the journal data
during journal recovery by the other node. Thus the new updates will
be overwritten and ocfs2 corrupts.  So in above case we have to return
the error, and ocfs2_commit_cache will take care of the error and
prevent the other node to do update first.  And only after recovering
journal it can do the new updates.

The issue discussion mail can be found at:
https://oss.oracle.com/pipermail/ocfs2-devel/2015-June/010856.html
http://comments.gmane.org/gmane.comp.file-systems.ext4/48841

[ Fixed bug in patch which allowed a non-negative error return from
  jbd2_cleanup_journal_tail() to leak out of jbd2_fjournal_flush(); this
  was causing xfstests ext4/306 to fail. -- Ted ]

Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Tested-by: Yiwen Jiang <jiangyiwen@huawei.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: stable@vger.kernel.org
---
 fs/jbd2/checkpoint.c |  5 ++---
 fs/jbd2/journal.c    | 38 +++++++++++++++++++++++++++++++-------
 include/linux/jbd2.h |  4 ++--
 3 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6b7b73afef81..4227dc4f7437 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	unsigned long	blocknr;
 
 	if (is_journal_aborted(journal))
-		return 1;
+		return -EIO;
 
 	if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
 		return 1;
@@ -407,8 +407,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 
-	__jbd2_update_log_tail(journal, first_tid, blocknr);
-	return 0;
+	return __jbd2_update_log_tail(journal, first_tid, blocknr);
 }
 
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 303ccd953e95..5804466b5785 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -876,9 +876,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
  *
  * Requires j_checkpoint_mutex
  */
-void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 {
 	unsigned long freed;
+	int ret;
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 
@@ -888,7 +889,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 	 * space and if we lose sb update during power failure we'd replay
 	 * old transaction with possibly newly overwritten data.
 	 */
-	jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+	if (ret)
+		goto out;
+
 	write_lock(&journal->j_state_lock);
 	freed = block - journal->j_tail;
 	if (block < journal->j_tail)
@@ -904,6 +908,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 	journal->j_tail_sequence = tid;
 	journal->j_tail = block;
 	write_unlock(&journal->j_state_lock);
+
+out:
+	return ret;
 }
 
 /*
@@ -1322,7 +1329,7 @@ static int journal_reset(journal_t *journal)
 	return jbd2_journal_start_thread(journal);
 }
 
-static void jbd2_write_superblock(journal_t *journal, int write_op)
+static int jbd2_write_superblock(journal_t *journal, int write_op)
 {
 	struct buffer_head *bh = journal->j_sb_buffer;
 	journal_superblock_t *sb = journal->j_superblock;
@@ -1361,7 +1368,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
 		printk(KERN_ERR "JBD2: Error %d detected when updating "
 		       "journal superblock for %s.\n", ret,
 		       journal->j_devname);
+		jbd2_journal_abort(journal, ret);
 	}
+
+	return ret;
 }
 
 /**
@@ -1374,10 +1384,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
  * Update a journal's superblock information about log tail and write it to
  * disk, waiting for the IO to complete.
  */
-void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 				     unsigned long tail_block, int write_op)
 {
 	journal_superblock_t *sb = journal->j_superblock;
+	int ret;
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
@@ -1386,13 +1397,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 	sb->s_sequence = cpu_to_be32(tail_tid);
 	sb->s_start    = cpu_to_be32(tail_block);
 
-	jbd2_write_superblock(journal, write_op);
+	ret = jbd2_write_superblock(journal, write_op);
+	if (ret)
+		goto out;
 
 	/* Log is no longer empty */
 	write_lock(&journal->j_state_lock);
 	WARN_ON(!sb->s_sequence);
 	journal->j_flags &= ~JBD2_FLUSHED;
 	write_unlock(&journal->j_state_lock);
+
+out:
+	return ret;
 }
 
 /**
@@ -1941,7 +1957,14 @@ int jbd2_journal_flush(journal_t *journal)
 		return -EIO;
 
 	mutex_lock(&journal->j_checkpoint_mutex);
-	jbd2_cleanup_journal_tail(journal);
+	if (!err) {
+		err = jbd2_cleanup_journal_tail(journal);
+		if (err < 0) {
+			mutex_unlock(&journal->j_checkpoint_mutex);
+			goto out;
+		}
+		err = 0;
+	}
 
 	/* Finally, mark the journal as really needing no recovery.
 	 * This sets s_start==0 in the underlying superblock, which is
@@ -1957,7 +1980,8 @@ int jbd2_journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	write_unlock(&journal->j_state_lock);
-	return 0;
+out:
+	return err;
 }
 
 /**
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 20e7f78041c8..edb640ae9a94 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1035,7 +1035,7 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal);
 int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
 int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
 			      unsigned long *block);
-void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
+int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 
 /* Commit management */
@@ -1157,7 +1157,7 @@ extern int	   jbd2_journal_recover    (journal_t *journal);
 extern int	   jbd2_journal_wipe       (journal_t *, int);
 extern int	   jbd2_journal_skip_recovery	(journal_t *);
 extern void	   jbd2_journal_update_sb_errno(journal_t *);
-extern void	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
+extern int	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
 				unsigned long, int);
 extern void	   __jbd2_journal_abort_hard	(journal_t *);
 extern void	   jbd2_journal_abort      (journal_t *, int);
-- 
cgit v1.2.3


From ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 12 Jun 2015 19:39:12 -0700
Subject: bpf: introduce current->pid, tgid, uid, gid, comm accessors

eBPF programs attached to kprobes need to filter based on
current->pid, uid and other fields, so introduce helper functions:

u64 bpf_get_current_pid_tgid(void)
Return: current->tgid << 32 | current->pid

u64 bpf_get_current_uid_gid(void)
Return: current_gid << 32 | current_uid

bpf_get_current_comm(char *buf, int size_of_buf)
stores current->comm into buf

They can be used from the programs attached to TC as well to classify packets
based on current task fields.

Update tracex2 example to print histogram of write syscalls for each process
instead of aggregated for all.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h        |  3 +++
 include/uapi/linux/bpf.h   | 19 +++++++++++++
 kernel/bpf/core.c          |  3 +++
 kernel/bpf/helpers.c       | 58 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c   |  6 +++++
 net/core/filter.c          |  6 +++++
 samples/bpf/bpf_helpers.h  |  6 +++++
 samples/bpf/tracex2_kern.c | 24 +++++++++++++----
 samples/bpf/tracex2_user.c | 67 +++++++++++++++++++++++++++++++++++++++-------
 9 files changed, 178 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2235aee8096a..1b9a3f5b27f6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -188,5 +188,8 @@ extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
 extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
+extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
+extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
+extern const struct bpf_func_proto bpf_get_current_comm_proto;
 
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 602f05b7a275..29ef6f99e43d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -230,6 +230,25 @@ enum bpf_func_id {
 	 * Return: 0 on success
 	 */
 	BPF_FUNC_clone_redirect,
+
+	/**
+	 * u64 bpf_get_current_pid_tgid(void)
+	 * Return: current->tgid << 32 | current->pid
+	 */
+	BPF_FUNC_get_current_pid_tgid,
+
+	/**
+	 * u64 bpf_get_current_uid_gid(void)
+	 * Return: current_gid << 32 | current_uid
+	 */
+	BPF_FUNC_get_current_uid_gid,
+
+	/**
+	 * bpf_get_current_comm(char *buf, int size_of_buf)
+	 * stores current->comm into buf
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_get_current_comm,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1e00aa3316dc..1fc45cc83076 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -730,6 +730,9 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
+const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 
 /* Always built-in helper functions. */
 const struct bpf_func_proto bpf_tail_call_proto = {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 7ad5d8842d5b..1447ec09421e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -14,6 +14,8 @@
 #include <linux/random.h>
 #include <linux/smp.h>
 #include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/uidgid.h>
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
  * inside its own verifier_ops->get_func_proto() callback it should return
@@ -124,3 +126,59 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 };
+
+static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+
+	if (!task)
+		return -EINVAL;
+
+	return (u64) task->tgid << 32 | task->pid;
+}
+
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
+	.func		= bpf_get_current_pid_tgid,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
+static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+	kuid_t uid;
+	kgid_t gid;
+
+	if (!task)
+		return -EINVAL;
+
+	current_uid_gid(&uid, &gid);
+	return (u64) from_kgid(&init_user_ns, gid) << 32 |
+		from_kuid(&init_user_ns, uid);
+}
+
+const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
+	.func		= bpf_get_current_uid_gid,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
+static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+{
+	struct task_struct *task = current;
+	char *buf = (char *) (long) r1;
+
+	if (!task)
+		return -EINVAL;
+
+	memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+	return 0;
+}
+
+const struct bpf_func_proto bpf_get_current_comm_proto = {
+	.func		= bpf_get_current_comm,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE,
+};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 50c4015a8ad3..3a17638cdf46 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -162,6 +162,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_ktime_get_ns_proto;
 	case BPF_FUNC_tail_call:
 		return &bpf_tail_call_proto;
+	case BPF_FUNC_get_current_pid_tgid:
+		return &bpf_get_current_pid_tgid_proto;
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_get_current_comm:
+		return &bpf_get_current_comm_proto;
 
 	case BPF_FUNC_trace_printk:
 		/*
diff --git a/net/core/filter.c b/net/core/filter.c
index d271c06bf01f..20aa51ccbf9d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1459,6 +1459,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_l4_csum_replace_proto;
 	case BPF_FUNC_clone_redirect:
 		return &bpf_clone_redirect_proto;
+	case BPF_FUNC_get_current_pid_tgid:
+		return &bpf_get_current_pid_tgid_proto;
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_get_current_comm:
+		return &bpf_get_current_comm_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index f531a0b3282d..bdf1c1607b80 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -25,6 +25,12 @@ static void (*bpf_tail_call)(void *ctx, void *map, int index) =
 	(void *) BPF_FUNC_tail_call;
 static unsigned long long (*bpf_get_smp_processor_id)(void) =
 	(void *) BPF_FUNC_get_smp_processor_id;
+static unsigned long long (*bpf_get_current_pid_tgid)(void) =
+	(void *) BPF_FUNC_get_current_pid_tgid;
+static unsigned long long (*bpf_get_current_uid_gid)(void) =
+	(void *) BPF_FUNC_get_current_uid_gid;
+static int (*bpf_get_current_comm)(void *buf, int buf_size) =
+	(void *) BPF_FUNC_get_current_comm;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
index 19ec1cfc45db..dc50f4f2943f 100644
--- a/samples/bpf/tracex2_kern.c
+++ b/samples/bpf/tracex2_kern.c
@@ -62,11 +62,18 @@ static unsigned int log2l(unsigned long v)
 		return log2(v);
 }
 
+struct hist_key {
+	char comm[16];
+	u64 pid_tgid;
+	u64 uid_gid;
+	u32 index;
+};
+
 struct bpf_map_def SEC("maps") my_hist_map = {
-	.type = BPF_MAP_TYPE_ARRAY,
-	.key_size = sizeof(u32),
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(struct hist_key),
 	.value_size = sizeof(long),
-	.max_entries = 64,
+	.max_entries = 1024,
 };
 
 SEC("kprobe/sys_write")
@@ -75,11 +82,18 @@ int bpf_prog3(struct pt_regs *ctx)
 	long write_size = ctx->dx; /* arg3 */
 	long init_val = 1;
 	long *value;
-	u32 index = log2l(write_size);
+	struct hist_key key = {};
+
+	key.index = log2l(write_size);
+	key.pid_tgid = bpf_get_current_pid_tgid();
+	key.uid_gid = bpf_get_current_uid_gid();
+	bpf_get_current_comm(&key.comm, sizeof(key.comm));
 
-	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	value = bpf_map_lookup_elem(&my_hist_map, &key);
 	if (value)
 		__sync_fetch_and_add(value, 1);
+	else
+		bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY);
 	return 0;
 }
 char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index 91b8d0896fbb..cd0241c1447a 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -3,6 +3,7 @@
 #include <stdlib.h>
 #include <signal.h>
 #include <linux/bpf.h>
+#include <string.h>
 #include "libbpf.h"
 #include "bpf_load.h"
 
@@ -20,23 +21,42 @@ static void stars(char *str, long val, long max, int width)
 	str[i] = '\0';
 }
 
-static void print_hist(int fd)
+struct task {
+	char comm[16];
+	__u64 pid_tgid;
+	__u64 uid_gid;
+};
+
+struct hist_key {
+	struct task t;
+	__u32 index;
+};
+
+#define SIZE sizeof(struct task)
+
+static void print_hist_for_pid(int fd, void *task)
 {
-	int key;
+	struct hist_key key = {}, next_key;
+	char starstr[MAX_STARS];
 	long value;
 	long data[MAX_INDEX] = {};
-	char starstr[MAX_STARS];
-	int i;
 	int max_ind = -1;
 	long max_value = 0;
+	int i, ind;
 
-	for (key = 0; key < MAX_INDEX; key++) {
-		bpf_lookup_elem(fd, &key, &value);
-		data[key] = value;
-		if (value && key > max_ind)
-			max_ind = key;
+	while (bpf_get_next_key(fd, &key, &next_key) == 0) {
+		if (memcmp(&next_key, task, SIZE)) {
+			key = next_key;
+			continue;
+		}
+		bpf_lookup_elem(fd, &next_key, &value);
+		ind = next_key.index;
+		data[ind] = value;
+		if (value && ind > max_ind)
+			max_ind = ind;
 		if (value > max_value)
 			max_value = value;
+		key = next_key;
 	}
 
 	printf("           syscall write() stats\n");
@@ -48,6 +68,35 @@ static void print_hist(int fd)
 		       MAX_STARS, starstr);
 	}
 }
+
+static void print_hist(int fd)
+{
+	struct hist_key key = {}, next_key;
+	static struct task tasks[1024];
+	int task_cnt = 0;
+	int i;
+
+	while (bpf_get_next_key(fd, &key, &next_key) == 0) {
+		int found = 0;
+
+		for (i = 0; i < task_cnt; i++)
+			if (memcmp(&tasks[i], &next_key, SIZE) == 0)
+				found = 1;
+		if (!found)
+			memcpy(&tasks[task_cnt++], &next_key, SIZE);
+		key = next_key;
+	}
+
+	for (i = 0; i < task_cnt; i++) {
+		printf("\npid %d cmd %s uid %d\n",
+		       (__u32) tasks[i].pid_tgid,
+		       tasks[i].comm,
+		       (__u32) tasks[i].uid_gid);
+		print_hist_for_pid(fd, &tasks[i]);
+	}
+
+}
+
 static void int_exit(int sig)
 {
 	print_hist(map_fd[1]);
-- 
cgit v1.2.3


From 0756ea3e85139d23a8148ebaa95411c2f0aa4f11 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 12 Jun 2015 19:39:13 -0700
Subject: bpf: allow networking programs to use bpf_trace_printk() for
 debugging

bpf_trace_printk() is a helper function used to debug eBPF programs.
Let socket and TC programs use it as well.
Note, it's DEBUG ONLY helper. If it's used in the program,
the kernel will print warning banner to make sure users don't use
it in production.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/core.c        |  4 ++++
 kernel/trace/bpf_trace.c | 20 ++++++++++++--------
 net/core/filter.c        |  2 ++
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b9a3f5b27f6..4383476a0d48 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -150,6 +150,7 @@ struct bpf_array {
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 void bpf_prog_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
 #ifdef CONFIG_BPF_SYSCALL
 void bpf_register_prog_type(struct bpf_prog_type_list *tl);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1fc45cc83076..c5bedc82bc1c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -733,6 +733,10 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
+{
+	return NULL;
+}
 
 /* Always built-in helper functions. */
 const struct bpf_func_proto bpf_tail_call_proto = {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3a17638cdf46..4f9b5d41869b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -147,6 +147,17 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
 	.arg2_type	= ARG_CONST_STACK_SIZE,
 };
 
+const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
+{
+	/*
+	 * this program might be calling bpf_trace_printk,
+	 * so allocate per-cpu printk buffers
+	 */
+	trace_printk_init_buffers();
+
+	return &bpf_trace_printk_proto;
+}
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -168,15 +179,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_current_uid_gid_proto;
 	case BPF_FUNC_get_current_comm:
 		return &bpf_get_current_comm_proto;
-
 	case BPF_FUNC_trace_printk:
-		/*
-		 * this program might be calling bpf_trace_printk,
-		 * so allocate per-cpu printk buffers
-		 */
-		trace_printk_init_buffers();
-
-		return &bpf_trace_printk_proto;
+		return bpf_get_trace_printk_proto();
 	default:
 		return NULL;
 	}
diff --git a/net/core/filter.c b/net/core/filter.c
index 20aa51ccbf9d..65ff107d3d29 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1442,6 +1442,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+	case BPF_FUNC_trace_printk:
+		return bpf_get_trace_printk_proto();
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 9464ca650008615d28d9aaf7de99b4edf61f0109 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 12 Jun 2015 19:44:48 -0700
Subject: net: make u64_stats_init() a function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using a function instead of a macro is cleaner and remove
following W=1 warnings (extract)

In file included from net/ipv6/ip6_vti.c:29:0:
net/ipv6/ip6_vti.c: In function ‘vti6_dev_init_gen’:
include/linux/netdevice.h:2029:18: warning: variable ‘stat’ set but not
used [-Wunused-but-set-variable]
    typeof(type) *stat;   \
                  ^
net/ipv6/ip6_vti.c:862:16: note: in expansion of macro
‘netdev_alloc_pcpu_stats’
  dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
                ^
  CC [M]  net/ipv6/sit.o
In file included from net/ipv6/sit.c:30:0:
net/ipv6/sit.c: In function ‘ipip6_tunnel_init’:
include/linux/netdevice.h:2029:18: warning: variable ‘stat’ set but not
used [-Wunused-but-set-variable]
    typeof(type) *stat;   \
                  ^

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/u64_stats_sync.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 4b4439e75f45..df89c9bcba7d 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -68,11 +68,12 @@ struct u64_stats_sync {
 };
 
 
+static inline void u64_stats_init(struct u64_stats_sync *syncp)
+{
 #if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
-# define u64_stats_init(syncp)	seqcount_init(syncp.seq)
-#else
-# define u64_stats_init(syncp)	do { } while (0)
+	seqcount_init(&syncp->seq);
 #endif
+}
 
 static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
 {
-- 
cgit v1.2.3


From 47d8417f5914012c794684f651213ffae1b91619 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Mon, 15 Jun 2015 17:58:58 +0300
Subject: net/mlx4_core: Add sink counter

Reserve the last valid counter index for "sink" counter, when a
new counter cannot be allocated, the driver will use this counter.

In order to avoid allocating this counter on any other flow, fix the
indices bitmap allocation range, and reserve the sink counter index.

Add macro for the sink counter index and replace all appearences of the
index with the macro.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/qp.c                   |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/en_resources.c |  2 +-
 drivers/net/ethernet/mellanox/mlx4/main.c         | 11 +++++++----
 include/linux/mlx4/device.h                       |  1 +
 4 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 02fc91c68027..c7d916fab68a 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1544,7 +1544,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 						dev->counters[qp->port - 1];
 			optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
 		} else
-			context->pri_path.counter_index = 0xff;
+			context->pri_path.counter_index =
+				MLX4_SINK_COUNTER_INDEX(dev->dev);
 
 		if (qp->flags & MLX4_IB_QP_NETIF) {
 			mlx4_ib_steer_qp_reg(dev, qp, 1);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
index 34f2fdf4fe5d..97bcb9135f8d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_resources.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
@@ -66,7 +66,7 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
 		context->pri_path.sched_queue |= user_prio << 3;
 		context->pri_path.feup = MLX4_FEUP_FORCE_ETH_UP;
 	}
-	context->pri_path.counter_index = 0xff;
+	context->pri_path.counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
 	context->cqn_send = cpu_to_be32(cqn);
 	context->cqn_recv = cpu_to_be32(cqn);
 	context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 1072dc1054dd..2bf54687f9fb 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -479,7 +479,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		}
 	}
 
-	dev->caps.max_counters = 1 << ilog2(dev_cap->max_counters);
+	dev->caps.max_counters = dev_cap->max_counters;
 
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps;
 	dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] =
@@ -2193,13 +2193,16 @@ err_free_icm:
 static int mlx4_init_counters_table(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
-	int nent;
+	int nent_pow2;
 
 	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
 		return -ENOENT;
 
-	nent = dev->caps.max_counters;
-	return mlx4_bitmap_init(&priv->counters_bitmap, nent, nent - 1, 0, 0);
+	nent_pow2 = roundup_pow_of_two(dev->caps.max_counters);
+	/* reserve last counter index for sink counter */
+	return mlx4_bitmap_init(&priv->counters_bitmap, nent_pow2,
+				nent_pow2 - 1, 0,
+				nent_pow2 - dev->caps.max_counters + 1);
 }
 
 static void mlx4_cleanup_counters_table(struct mlx4_dev *dev)
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index ad31e476873f..312c50420dde 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -957,6 +957,7 @@ struct mlx4_mad_ifc {
 			((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
 
 #define MLX4_INVALID_SLAVE_ID	0xFF
+#define MLX4_SINK_COUNTER_INDEX(dev)	(dev->caps.max_counters - 1)
 
 void handle_port_mgmt_change_event(struct work_struct *work);
 
-- 
cgit v1.2.3


From 6de5f7f6a1fa2288552d46b3effbb6d5571413e5 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Mon, 15 Jun 2015 17:59:02 +0300
Subject: net/mlx4_core: Allocate default counter per port

Default counter per port will be allocated at the mlx4 core driver load.

Every QP opened by the Ethernet driver will be attached to the port's default
counter.  This is an infrastructure step to collect VF statistics from the PF.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c    |  5 ++
 drivers/net/ethernet/mellanox/mlx4/en_resources.c |  2 +-
 drivers/net/ethernet/mellanox/mlx4/main.c         | 71 ++++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h         |  1 +
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h      |  1 +
 include/linux/mlx4/device.h                       |  1 +
 6 files changed, 77 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 98efb5842fca..048fca0ca9a4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -1597,6 +1597,9 @@ int mlx4_en_start_port(struct net_device *dev)
 	}
 	mdev->mac_removed[priv->port] = 0;
 
+	priv->counter_index =
+			mlx4_get_default_counter_index(mdev->dev, priv->port);
+
 	err = mlx4_en_config_rss_steer(priv);
 	if (err) {
 		en_err(priv, "Failed configuring rss steering\n");
@@ -1755,6 +1758,7 @@ void mlx4_en_stop_port(struct net_device *dev, int detach)
 
 	/* Set port as not active */
 	priv->port_up = false;
+	priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
 
 	/* Promsicuous mode */
 	if (mdev->dev->caps.steering_mode ==
@@ -2778,6 +2782,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 
 	priv = netdev_priv(dev);
 	memset(priv, 0, sizeof(struct mlx4_en_priv));
+	priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
 	spin_lock_init(&priv->stats_lock);
 	INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode);
 	INIT_WORK(&priv->watchdog_task, mlx4_en_restart);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
index 97bcb9135f8d..e482fa1bb741 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_resources.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c
@@ -66,7 +66,7 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
 		context->pri_path.sched_queue |= user_prio << 3;
 		context->pri_path.feup = MLX4_FEUP_FORCE_ETH_UP;
 	}
-	context->pri_path.counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev);
+	context->pri_path.counter_index = priv->counter_index;
 	context->cqn_send = cpu_to_be32(cqn);
 	context->cqn_recv = cpu_to_be32(cqn);
 	context->db_rec_addr = cpu_to_be64(priv->res.db.dma << 2);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 8b27bfcf809d..4e69cf52a579 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -2219,6 +2219,47 @@ static void mlx4_cleanup_counters_table(struct mlx4_dev *dev)
 	mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap);
 }
 
+static void mlx4_cleanup_default_counters(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port;
+
+	for (port = 0; port < dev->caps.num_ports; port++)
+		if (priv->def_counter[port] != -1)
+			mlx4_counter_free(dev,  priv->def_counter[port]);
+}
+
+static int mlx4_allocate_default_counters(struct mlx4_dev *dev)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	int port, err = 0;
+	u32 idx;
+
+	for (port = 0; port < dev->caps.num_ports; port++)
+		priv->def_counter[port] = -1;
+
+	for (port = 0; port < dev->caps.num_ports; port++) {
+		err = mlx4_counter_alloc(dev, &idx);
+
+		if (!err || err == -ENOSPC) {
+			priv->def_counter[port] = idx;
+		} else if (err == -ENOENT) {
+			err = 0;
+			continue;
+		} else {
+			mlx4_err(dev, "%s: failed to allocate default counter port %d err %d\n",
+				 __func__, port + 1, err);
+			mlx4_cleanup_default_counters(dev);
+			return err;
+		}
+
+		mlx4_dbg(dev, "%s: default counter index %d for port %d\n",
+			 __func__, priv->def_counter[port], port + 1);
+	}
+
+	return err;
+}
+
 int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -2227,8 +2268,10 @@ int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx)
 		return -ENOENT;
 
 	*idx = mlx4_bitmap_alloc(&priv->counters_bitmap);
-	if (*idx == -1)
-		return -ENOMEM;
+	if (*idx == -1) {
+		*idx = MLX4_SINK_COUNTER_INDEX(dev);
+		return -ENOSPC;
+	}
 
 	return 0;
 }
@@ -2275,6 +2318,9 @@ void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
 	if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_COUNTERS))
 		return;
 
+	if (idx == MLX4_SINK_COUNTER_INDEX(dev))
+		return;
+
 	__mlx4_clear_if_stat(dev, idx);
 
 	mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx, MLX4_USE_RR);
@@ -2296,6 +2342,14 @@ void mlx4_counter_free(struct mlx4_dev *dev, u32 idx)
 }
 EXPORT_SYMBOL_GPL(mlx4_counter_free);
 
+int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return priv->def_counter[port - 1];
+}
+EXPORT_SYMBOL_GPL(mlx4_get_default_counter_index);
+
 void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, int port)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -2439,6 +2493,12 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 		}
 	}
 
+	err = mlx4_allocate_default_counters(dev);
+	if (err) {
+		mlx4_err(dev, "Failed to allocate default counters, aborting\n");
+		goto err_counters_table_free;
+	}
+
 	if (!mlx4_is_slave(dev)) {
 		for (port = 1; port <= dev->caps.num_ports; port++) {
 			ib_port_default_caps = 0;
@@ -2470,13 +2530,16 @@ static int mlx4_setup_hca(struct mlx4_dev *dev)
 			if (err) {
 				mlx4_err(dev, "Failed to set port %d, aborting\n",
 					 port);
-				goto err_counters_table_free;
+				goto err_default_countes_free;
 			}
 		}
 	}
 
 	return 0;
 
+err_default_countes_free:
+	mlx4_cleanup_default_counters(dev);
+
 err_counters_table_free:
 	if (!mlx4_is_slave(dev))
 		mlx4_cleanup_counters_table(dev);
@@ -3212,6 +3275,7 @@ err_port:
 	for (--port; port >= 1; --port)
 		mlx4_cleanup_port_info(&priv->port[port]);
 
+	mlx4_cleanup_default_counters(dev);
 	if (!mlx4_is_slave(dev))
 		mlx4_cleanup_counters_table(dev);
 	mlx4_cleanup_qp_table(dev);
@@ -3511,6 +3575,7 @@ static void mlx4_unload_one(struct pci_dev *pdev)
 		mlx4_free_resource_tracker(dev,
 					   RES_TR_FREE_SLAVES_ONLY);
 
+	mlx4_cleanup_default_counters(dev);
 	if (!mlx4_is_slave(dev))
 		mlx4_cleanup_counters_table(dev);
 	mlx4_cleanup_qp_table(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 8d2d3594bf8f..e9e94bc0e75d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -876,6 +876,7 @@ struct mlx4_priv {
 	struct mlx4_qp_table	qp_table;
 	struct mlx4_mcg_table	mcg_table;
 	struct mlx4_bitmap	counters_bitmap;
+	int			def_counter[MLX4_MAX_PORTS];
 
 	struct mlx4_catas_err	catas_err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index edd8fd69ec9a..f6c3e128cc0c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -582,6 +582,7 @@ struct mlx4_en_priv {
 	int base_tx_qpn;
 	struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE];
 	struct hwtstamp_config hwtstamp_config;
+	u32 counter_index;
 
 #ifdef CONFIG_MLX4_EN_DCB
 	struct ieee_ets ets;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 312c50420dde..4820080ac394 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1348,6 +1348,7 @@ int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port);
 
 int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
 void mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port);
 
 void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry,
 			 int port);
-- 
cgit v1.2.3


From 9616982f3fcc9e6577d7f41009c4ef2df19a71ec Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Mon, 15 Jun 2015 17:59:05 +0300
Subject: net/mlx4_core: Add helper to query counters

This is an infrastructure step for querying VF and PF counters.

This code was in the IB driver, move it to the mlx4 core driver
so it will be accessible for more use cases.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/mad.c         | 32 +++++-------------
 drivers/net/ethernet/mellanox/mlx4/cmd.c | 57 ++++++++++++++++++++++++++++++++
 include/linux/mlx4/cmd.h                 |  3 ++
 include/linux/mlx4/device.h              |  8 +++++
 4 files changed, 76 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index bc698b14683f..bc09b4e1f57c 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -64,14 +64,6 @@ enum {
 #define GUID_TBL_BLK_NUM_ENTRIES 8
 #define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES)
 
-/* Counters should be saturate once they reach their maximum value */
-#define ASSIGN_32BIT_COUNTER(counter, value) do {\
-	if ((value) > U32_MAX)			 \
-		counter = cpu_to_be32(U32_MAX); \
-	else					 \
-		counter = cpu_to_be32(value);	 \
-} while (0)
-
 struct mlx4_mad_rcv_buf {
 	struct ib_grh grh;
 	u8 payload[256];
@@ -828,31 +820,25 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			struct ib_wc *in_wc, struct ib_grh *in_grh,
 			struct ib_mad *in_mad, struct ib_mad *out_mad)
 {
-	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_counter counter_stats;
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
 	int err;
-	u32 inmod = dev->counters[port_num - 1].index & 0xffff;
-	u8 mode;
 
 	if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
 		return -EINVAL;
 
-	mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
-	if (IS_ERR(mailbox))
-		return IB_MAD_RESULT_FAILURE;
-
-	err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0,
-			   MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C,
-			   MLX4_CMD_WRAPPED);
+	memset(&counter_stats, 0, sizeof(counter_stats));
+	err = mlx4_get_counter_stats(dev->dev,
+				     dev->counters[port_num - 1].index,
+				     &counter_stats, 0);
 	if (err)
 		err = IB_MAD_RESULT_FAILURE;
 	else {
 		memset(out_mad->data, 0, sizeof out_mad->data);
-		mode = ((struct mlx4_counter *)mailbox->buf)->counter_mode;
-		switch (mode & 0xf) {
+		switch (counter_stats.counter_mode & 0xf) {
 		case 0:
-			edit_counter(mailbox->buf,
-						(void *)(out_mad->data + 40));
+			edit_counter(&counter_stats,
+				     (void *)(out_mad->data + 40));
 			err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
 			break;
 		default:
@@ -860,8 +846,6 @@ static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 		}
 	}
 
-	mlx4_free_cmd_mailbox(dev->dev, mailbox);
-
 	return err;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 68ae765873a9..44b8f7715ade 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -49,6 +49,7 @@
 #include "mlx4.h"
 #include "fw.h"
 #include "fw_qos.h"
+#include "mlx4_stats.h"
 
 #define CMD_POLL_TOKEN 0xffff
 #define INBOX_MASK	0xffffffffffffff00ULL
@@ -3166,6 +3167,62 @@ int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_stat
 }
 EXPORT_SYMBOL_GPL(mlx4_set_vf_link_state);
 
+int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index,
+			   struct mlx4_counter *counter_stats, int reset)
+{
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+	struct mlx4_counter *tmp_counter;
+	int err;
+	u32 if_stat_in_mod;
+
+	if (!counter_stats)
+		return -EINVAL;
+
+	if (counter_index == MLX4_SINK_COUNTER_INDEX(dev))
+		return 0;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	memset(mailbox->buf, 0, sizeof(struct mlx4_counter));
+	if_stat_in_mod = counter_index;
+	if (reset)
+		if_stat_in_mod |= MLX4_QUERY_IF_STAT_RESET;
+	err = mlx4_cmd_box(dev, 0, mailbox->dma,
+			   if_stat_in_mod, 0,
+			   MLX4_CMD_QUERY_IF_STAT,
+			   MLX4_CMD_TIME_CLASS_C,
+			   MLX4_CMD_NATIVE);
+	if (err) {
+		mlx4_dbg(dev, "%s: failed to read statistics for counter index %d\n",
+			 __func__, counter_index);
+		goto if_stat_out;
+	}
+	tmp_counter = (struct mlx4_counter *)mailbox->buf;
+	counter_stats->counter_mode = tmp_counter->counter_mode;
+	if (counter_stats->counter_mode == 0) {
+		counter_stats->rx_frames =
+			cpu_to_be64(be64_to_cpu(counter_stats->rx_frames) +
+				    be64_to_cpu(tmp_counter->rx_frames));
+		counter_stats->tx_frames =
+			cpu_to_be64(be64_to_cpu(counter_stats->tx_frames) +
+				    be64_to_cpu(tmp_counter->tx_frames));
+		counter_stats->rx_bytes =
+			cpu_to_be64(be64_to_cpu(counter_stats->rx_bytes) +
+				    be64_to_cpu(tmp_counter->rx_bytes));
+		counter_stats->tx_bytes =
+			cpu_to_be64(be64_to_cpu(counter_stats->tx_bytes) +
+				    be64_to_cpu(tmp_counter->tx_bytes));
+	}
+
+if_stat_out:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_counter_stats);
+
 int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index f62e7cf227c6..5dffc869988b 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -35,6 +35,7 @@
 
 #include <linux/dma-mapping.h>
 #include <linux/if_link.h>
+#include <linux/mlx4/device.h>
 
 enum {
 	/* initialization and general commands */
@@ -300,6 +301,8 @@ static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_para
 struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev);
 void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox);
 
+int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index,
+			   struct mlx4_counter *counter_stats, int reset);
 u32 mlx4_comm_get_version(void);
 int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac);
 int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 4820080ac394..efe80c754b2f 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -771,6 +771,14 @@ union mlx4_ext_av {
 	struct mlx4_eth_av	eth;
 };
 
+/* Counters should be saturate once they reach their maximum value */
+#define ASSIGN_32BIT_COUNTER(counter, value) do {	\
+	if ((value) > U32_MAX)				\
+		counter = cpu_to_be32(U32_MAX);		\
+	else						\
+		counter = cpu_to_be32(value);		\
+} while (0)
+
 struct mlx4_counter {
 	u8	reserved1[3];
 	u8	counter_mode;
-- 
cgit v1.2.3


From 3b766cd832328fcb87db3507e7b98cf42f21689d Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Mon, 15 Jun 2015 17:59:07 +0300
Subject: net/core: Add reading VF statistics through the PF netdevice

Add ndo_get_vf_stats where the PF retrieves and fills the VFs traffic
statistics. We encode the VF stats in a nested manner to allow for
future extensions.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_link.h      |  9 ++++++++
 include/linux/netdevice.h    |  4 ++++
 include/uapi/linux/if_link.h | 13 +++++++++++
 net/core/rtnetlink.c         | 51 ++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index da4929927f69..ae5d0d22955d 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -5,6 +5,15 @@
 
 
 /* We don't want this structure exposed to user space */
+struct ifla_vf_stats {
+	__u64 rx_packets;
+	__u64 tx_packets;
+	__u64 rx_bytes;
+	__u64 tx_bytes;
+	__u64 broadcast;
+	__u64 multicast;
+};
+
 struct ifla_vf_info {
 	__u32 vf;
 	__u8 mac[32];
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6f5f71ff5169..e20979dfd6a9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1100,6 +1100,10 @@ struct net_device_ops {
 						     struct ifla_vf_info *ivf);
 	int			(*ndo_set_vf_link_state)(struct net_device *dev,
 							 int vf, int link_state);
+	int			(*ndo_get_vf_stats)(struct net_device *dev,
+						    int vf,
+						    struct ifla_vf_stats
+						    *vf_stats);
 	int			(*ndo_set_vf_port)(struct net_device *dev,
 						   int vf,
 						   struct nlattr *port[]);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 510efb360580..2c7e8e3d3981 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -484,6 +484,7 @@ enum {
 	IFLA_VF_RSS_QUERY_EN,	/* RSS Redirection Table and Hash Key query
 				 * on/off switch
 				 */
+	IFLA_VF_STATS,		/* network device statistics */
 	__IFLA_VF_MAX,
 };
 
@@ -533,6 +534,18 @@ struct ifla_vf_rss_query_en {
 	__u32 setting;
 };
 
+enum {
+	IFLA_VF_STATS_RX_PACKETS,
+	IFLA_VF_STATS_TX_PACKETS,
+	IFLA_VF_STATS_RX_BYTES,
+	IFLA_VF_STATS_TX_BYTES,
+	IFLA_VF_STATS_BROADCAST,
+	IFLA_VF_STATS_MULTICAST,
+	__IFLA_VF_STATS_MAX,
+};
+
+#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1)
+
 /* VF ports management section
  *
  *	Nested layout of set/get msg is:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 077b6d280371..2d102ce1474f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -819,7 +819,19 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
 			 nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
 			 nla_total_size(sizeof(struct ifla_vf_rate)) +
 			 nla_total_size(sizeof(struct ifla_vf_link_state)) +
-			 nla_total_size(sizeof(struct ifla_vf_rss_query_en)));
+			 nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
+			 /* IFLA_VF_STATS_RX_PACKETS */
+			 nla_total_size(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_TX_PACKETS */
+			 nla_total_size(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_RX_BYTES */
+			 nla_total_size(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_TX_BYTES */
+			 nla_total_size(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_BROADCAST */
+			 nla_total_size(sizeof(__u64)) +
+			 /* IFLA_VF_STATS_MULTICAST */
+			 nla_total_size(sizeof(__u64)));
 		return size;
 	} else
 		return 0;
@@ -1123,7 +1135,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	    && (ext_filter_mask & RTEXT_FILTER_VF)) {
 		int i;
 
-		struct nlattr *vfinfo, *vf;
+		struct nlattr *vfinfo, *vf, *vfstats;
 		int num_vfs = dev_num_vf(dev->dev.parent);
 
 		vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
@@ -1138,6 +1150,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			struct ifla_vf_spoofchk vf_spoofchk;
 			struct ifla_vf_link_state vf_linkstate;
 			struct ifla_vf_rss_query_en vf_rss_query_en;
+			struct ifla_vf_stats vf_stats;
 
 			/*
 			 * Not all SR-IOV capable drivers support the
@@ -1190,6 +1203,30 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 				    sizeof(vf_rss_query_en),
 				    &vf_rss_query_en))
 				goto nla_put_failure;
+			memset(&vf_stats, 0, sizeof(vf_stats));
+			if (dev->netdev_ops->ndo_get_vf_stats)
+				dev->netdev_ops->ndo_get_vf_stats(dev, i,
+								  &vf_stats);
+			vfstats = nla_nest_start(skb, IFLA_VF_STATS);
+			if (!vfstats) {
+				nla_nest_cancel(skb, vf);
+				nla_nest_cancel(skb, vfinfo);
+				goto nla_put_failure;
+			}
+			if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS,
+					vf_stats.rx_packets) ||
+			    nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS,
+					vf_stats.tx_packets) ||
+			    nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES,
+					vf_stats.rx_bytes) ||
+			    nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES,
+					vf_stats.tx_bytes) ||
+			    nla_put_u64(skb, IFLA_VF_STATS_BROADCAST,
+					vf_stats.broadcast) ||
+			    nla_put_u64(skb, IFLA_VF_STATS_MULTICAST,
+					vf_stats.multicast))
+				goto nla_put_failure;
+			nla_nest_end(skb, vfstats);
 			nla_nest_end(skb, vf);
 		}
 		nla_nest_end(skb, vfinfo);
@@ -1303,6 +1340,16 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
 	[IFLA_VF_RATE]		= { .len = sizeof(struct ifla_vf_rate) },
 	[IFLA_VF_LINK_STATE]	= { .len = sizeof(struct ifla_vf_link_state) },
 	[IFLA_VF_RSS_QUERY_EN]	= { .len = sizeof(struct ifla_vf_rss_query_en) },
+	[IFLA_VF_STATS]		= { .type = NLA_NESTED },
+};
+
+static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
+	[IFLA_VF_STATS_RX_PACKETS]	= { .type = NLA_U64 },
+	[IFLA_VF_STATS_TX_PACKETS]	= { .type = NLA_U64 },
+	[IFLA_VF_STATS_RX_BYTES]	= { .type = NLA_U64 },
+	[IFLA_VF_STATS_TX_BYTES]	= { .type = NLA_U64 },
+	[IFLA_VF_STATS_BROADCAST]	= { .type = NLA_U64 },
+	[IFLA_VF_STATS_MULTICAST]	= { .type = NLA_U64 },
 };
 
 static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
-- 
cgit v1.2.3


From 62a890557f57e6cbebe9cc6c32aef045405d4fa2 Mon Sep 17 00:00:00 2001
From: Eran Ben Elisha <eranbe@mellanox.com>
Date: Mon, 15 Jun 2015 17:59:08 +0300
Subject: net/mlx4_en: Support ndo_get_vf_stats

Implement the ndo to gather VF statistics through the PF.

All counters related to this VF are stored in a per slave
list, run over the slave's list and collect all statistics.

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c           | 30 ++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     | 10 +++++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |  2 +
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  | 48 ++++++++++++++++++++++
 include/linux/mlx4/cmd.h                           |  3 ++
 5 files changed, 93 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 44b8f7715ade..82040137d7d9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -3223,6 +3223,36 @@ if_stat_out:
 }
 EXPORT_SYMBOL_GPL(mlx4_get_counter_stats);
 
+int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx,
+		      struct ifla_vf_stats *vf_stats)
+{
+	struct mlx4_counter tmp_vf_stats;
+	int slave;
+	int err = 0;
+
+	if (!vf_stats)
+		return -EINVAL;
+
+	if (!mlx4_is_master(dev))
+		return -EPROTONOSUPPORT;
+
+	slave = mlx4_get_slave_indx(dev, vf_idx);
+	if (slave < 0)
+		return -EINVAL;
+
+	port = mlx4_slaves_closest_port(dev, slave, port);
+	err = mlx4_calc_vf_counters(dev, slave, port, &tmp_vf_stats);
+	if (!err && tmp_vf_stats.counter_mode == 0) {
+		vf_stats->rx_packets = be64_to_cpu(tmp_vf_stats.rx_frames);
+		vf_stats->tx_packets = be64_to_cpu(tmp_vf_stats.tx_frames);
+		vf_stats->rx_bytes = be64_to_cpu(tmp_vf_stats.rx_bytes);
+		vf_stats->tx_bytes = be64_to_cpu(tmp_vf_stats.tx_bytes);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_vf_stats);
+
 int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index f9142f22d630..77179d7ae4cc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2292,6 +2292,15 @@ static int mlx4_en_set_vf_link_state(struct net_device *dev, int vf, int link_st
 	return mlx4_set_vf_link_state(mdev->dev, en_priv->port, vf, link_state);
 }
 
+static int mlx4_en_get_vf_stats(struct net_device *dev, int vf,
+				struct ifla_vf_stats *vf_stats)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	return mlx4_get_vf_stats(mdev->dev, en_priv->port, vf, vf_stats);
+}
+
 #define PORT_ID_BYTE_LEN 8
 static int mlx4_en_get_phys_port_id(struct net_device *dev,
 				    struct netdev_phys_item_id *ppid)
@@ -2489,6 +2498,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
 	.ndo_set_vf_rate	= mlx4_en_set_vf_rate,
 	.ndo_set_vf_spoofchk	= mlx4_en_set_vf_spoofchk,
 	.ndo_set_vf_link_state	= mlx4_en_set_vf_link_state,
+	.ndo_get_vf_stats       = mlx4_en_get_vf_stats,
 	.ndo_get_vf_config	= mlx4_en_get_vf_config,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= mlx4_en_netpoll,
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index e9e94bc0e75d..a092c5c34d43 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -1010,6 +1010,8 @@ int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 		     int start_index, int npages, u64 *page_list);
 int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx);
 void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx);
+int mlx4_calc_vf_counters(struct mlx4_dev *dev, int slave, int port,
+			  struct mlx4_counter *data);
 int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn);
 void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 73db584bc240..731423ca575d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -46,6 +46,7 @@
 
 #include "mlx4.h"
 #include "fw.h"
+#include "mlx4_stats.h"
 
 #define MLX4_MAC_VALID		(1ull << 63)
 #define MLX4_PF_COUNTERS_PER_PORT	2
@@ -1147,6 +1148,53 @@ static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave,
 	return ret;
 }
 
+int mlx4_calc_vf_counters(struct mlx4_dev *dev, int slave, int port,
+			  struct mlx4_counter *data)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
+	struct res_common *tmp;
+	struct res_counter *counter;
+	int *counters_arr;
+	int i = 0, err = 0;
+
+	memset(data, 0, sizeof(*data));
+
+	counters_arr = kmalloc_array(dev->caps.max_counters,
+				     sizeof(*counters_arr), GFP_KERNEL);
+	if (!counters_arr)
+		return -ENOMEM;
+
+	spin_lock_irq(mlx4_tlock(dev));
+	list_for_each_entry(tmp,
+			    &tracker->slave_list[slave].res_list[RES_COUNTER],
+			    list) {
+		counter = container_of(tmp, struct res_counter, com);
+		if (counter->port == port) {
+			counters_arr[i] = (int)tmp->res_id;
+			i++;
+		}
+	}
+	spin_unlock_irq(mlx4_tlock(dev));
+	counters_arr[i] = -1;
+
+	i = 0;
+
+	while (counters_arr[i] != -1) {
+		err = mlx4_get_counter_stats(dev, counters_arr[i], data,
+					     0);
+		if (err) {
+			memset(data, 0, sizeof(*data));
+			goto table_changed;
+		}
+		i++;
+	}
+
+table_changed:
+	kfree(counters_arr);
+	return 0;
+}
+
 static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
 			 enum mlx4_resource type, int extra)
 {
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index 5dffc869988b..58391f2e0414 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -36,6 +36,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/if_link.h>
 #include <linux/mlx4/device.h>
+#include <linux/netdevice.h>
 
 enum {
 	/* initialization and general commands */
@@ -303,6 +304,8 @@ void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbo
 
 int mlx4_get_counter_stats(struct mlx4_dev *dev, int counter_index,
 			   struct mlx4_counter *counter_stats, int reset);
+int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx,
+		      struct ifla_vf_stats *vf_stats);
 u32 mlx4_comm_get_version(void);
 int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac);
 int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos);
-- 
cgit v1.2.3


From eb4cb008529ca08e0d8c0fa54e8f739520197a65 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Mon, 15 Jun 2015 11:26:18 -0400
Subject: sock_diag: define destruction multicast groups

These groups will contain socket-destruction events for
AF_INET/AF_INET6, IPPROTO_TCP/IPPROTO_UDP.

Near the end of socket destruction, a check for listeners is
performed.  In the presence of a listener, rather than completely
cleanup the socket, a unit of work will be added to a private
work queue which will first broadcast information about the socket
and then finish the cleanup operation.

Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sock_diag.h      | 42 +++++++++++++++++++++
 include/net/sock.h             |  1 +
 include/uapi/linux/sock_diag.h | 10 +++++
 net/core/sock.c                | 11 +++++-
 net/core/sock_diag.c           | 85 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 148 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 083ac388098e..fddebc617469 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -1,7 +1,10 @@
 #ifndef __SOCK_DIAG_H__
 #define __SOCK_DIAG_H__
 
+#include <linux/netlink.h>
 #include <linux/user_namespace.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
 #include <uapi/linux/sock_diag.h>
 
 struct sk_buff;
@@ -11,6 +14,7 @@ struct sock;
 struct sock_diag_handler {
 	__u8 family;
 	int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh);
+	int (*get_info)(struct sk_buff *skb, struct sock *sk);
 };
 
 int sock_diag_register(const struct sock_diag_handler *h);
@@ -26,4 +30,42 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr);
 int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
 			     struct sk_buff *skb, int attrtype);
 
+static inline
+enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		switch (sk->sk_protocol) {
+		case IPPROTO_TCP:
+			return SKNLGRP_INET_TCP_DESTROY;
+		case IPPROTO_UDP:
+			return SKNLGRP_INET_UDP_DESTROY;
+		default:
+			return SKNLGRP_NONE;
+		}
+	case AF_INET6:
+		switch (sk->sk_protocol) {
+		case IPPROTO_TCP:
+			return SKNLGRP_INET6_TCP_DESTROY;
+		case IPPROTO_UDP:
+			return SKNLGRP_INET6_UDP_DESTROY;
+		default:
+			return SKNLGRP_NONE;
+		}
+	default:
+		return SKNLGRP_NONE;
+	}
+}
+
+static inline
+bool sock_diag_has_destroy_listeners(const struct sock *sk)
+{
+	const struct net *n = sock_net(sk);
+	const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+
+	return group != SKNLGRP_NONE && n->diag_nlsk &&
+		netlink_has_listeners(n->diag_nlsk, group);
+}
+void sock_diag_broadcast_destroy(struct sock *sk);
+
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 26c1c3171e00..3e8258699270 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1518,6 +1518,7 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow)
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		      struct proto *prot, int kern);
 void sk_free(struct sock *sk);
+void sk_destruct(struct sock *sk);
 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
 
 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h
index b00e29efb161..49230d36f9ce 100644
--- a/include/uapi/linux/sock_diag.h
+++ b/include/uapi/linux/sock_diag.h
@@ -23,4 +23,14 @@ enum {
 	SK_MEMINFO_VARS,
 };
 
+enum sknetlink_groups {
+	SKNLGRP_NONE,
+	SKNLGRP_INET_TCP_DESTROY,
+	SKNLGRP_INET_UDP_DESTROY,
+	SKNLGRP_INET6_TCP_DESTROY,
+	SKNLGRP_INET6_UDP_DESTROY,
+	__SKNLGRP_MAX,
+};
+#define SKNLGRP_MAX	(__SKNLGRP_MAX - 1)
+
 #endif /* _UAPI__SOCK_DIAG_H__ */
diff --git a/net/core/sock.c b/net/core/sock.c
index 7063c329c1b6..1e1fe9a68d83 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -131,6 +131,7 @@
 #include <linux/ipsec.h>
 #include <net/cls_cgroup.h>
 #include <net/netprio_cgroup.h>
+#include <linux/sock_diag.h>
 
 #include <linux/filter.h>
 
@@ -1423,7 +1424,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 }
 EXPORT_SYMBOL(sk_alloc);
 
-static void __sk_free(struct sock *sk)
+void sk_destruct(struct sock *sk)
 {
 	struct sk_filter *filter;
 
@@ -1451,6 +1452,14 @@ static void __sk_free(struct sock *sk)
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
+static void __sk_free(struct sock *sk)
+{
+	if (unlikely(sock_diag_has_destroy_listeners(sk)))
+		sock_diag_broadcast_destroy(sk);
+	else
+		sk_destruct(sk);
+}
+
 void sk_free(struct sock *sk)
 {
 	/*
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 74dddf84adcd..d79866c5f8bc 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -5,6 +5,9 @@
 #include <net/net_namespace.h>
 #include <linux/module.h>
 #include <net/sock.h>
+#include <linux/kernel.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
 
 #include <linux/inet_diag.h>
 #include <linux/sock_diag.h>
@@ -12,6 +15,7 @@
 static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
 static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
 static DEFINE_MUTEX(sock_diag_table_mutex);
+static struct workqueue_struct *broadcast_wq;
 
 static u64 sock_gen_cookie(struct sock *sk)
 {
@@ -101,6 +105,62 @@ out:
 }
 EXPORT_SYMBOL(sock_diag_put_filterinfo);
 
+struct broadcast_sk {
+	struct sock *sk;
+	struct work_struct work;
+};
+
+static size_t sock_diag_nlmsg_size(void)
+{
+	return NLMSG_ALIGN(sizeof(struct inet_diag_msg)
+	       + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */
+	       + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
+}
+
+static void sock_diag_broadcast_destroy_work(struct work_struct *work)
+{
+	struct broadcast_sk *bsk =
+		container_of(work, struct broadcast_sk, work);
+	struct sock *sk = bsk->sk;
+	const struct sock_diag_handler *hndl;
+	struct sk_buff *skb;
+	const enum sknetlink_groups group = sock_diag_destroy_group(sk);
+	int err = -1;
+
+	WARN_ON(group == SKNLGRP_NONE);
+
+	skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL);
+	if (!skb)
+		goto out;
+
+	mutex_lock(&sock_diag_table_mutex);
+	hndl = sock_diag_handlers[sk->sk_family];
+	if (hndl && hndl->get_info)
+		err = hndl->get_info(skb, sk);
+	mutex_unlock(&sock_diag_table_mutex);
+
+	if (!err)
+		nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
+				GFP_KERNEL);
+	else
+		kfree_skb(skb);
+out:
+	sk_destruct(sk);
+	kfree(bsk);
+}
+
+void sock_diag_broadcast_destroy(struct sock *sk)
+{
+	/* Note, this function is often called from an interrupt context. */
+	struct broadcast_sk *bsk =
+		kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC);
+	if (!bsk)
+		return sk_destruct(sk);
+	bsk->sk = sk;
+	INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work);
+	queue_work(broadcast_wq, &bsk->work);
+}
+
 void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
 {
 	mutex_lock(&sock_diag_table_mutex);
@@ -211,10 +271,32 @@ static void sock_diag_rcv(struct sk_buff *skb)
 	mutex_unlock(&sock_diag_mutex);
 }
 
+static int sock_diag_bind(struct net *net, int group)
+{
+	switch (group) {
+	case SKNLGRP_INET_TCP_DESTROY:
+	case SKNLGRP_INET_UDP_DESTROY:
+		if (!sock_diag_handlers[AF_INET])
+			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+				       NETLINK_SOCK_DIAG, AF_INET);
+		break;
+	case SKNLGRP_INET6_TCP_DESTROY:
+	case SKNLGRP_INET6_UDP_DESTROY:
+		if (!sock_diag_handlers[AF_INET6])
+			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+				       NETLINK_SOCK_DIAG, AF_INET);
+		break;
+	}
+	return 0;
+}
+
 static int __net_init diag_net_init(struct net *net)
 {
 	struct netlink_kernel_cfg cfg = {
+		.groups	= SKNLGRP_MAX,
 		.input	= sock_diag_rcv,
+		.bind	= sock_diag_bind,
+		.flags	= NL_CFG_F_NONROOT_RECV,
 	};
 
 	net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
@@ -234,12 +316,15 @@ static struct pernet_operations diag_net_ops = {
 
 static int __init sock_diag_init(void)
 {
+	broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0);
+	BUG_ON(!broadcast_wq);
 	return register_pernet_subsys(&diag_net_ops);
 }
 
 static void __exit sock_diag_exit(void)
 {
 	unregister_pernet_subsys(&diag_net_ops);
+	destroy_workqueue(broadcast_wq);
 }
 
 module_init(sock_diag_init);
-- 
cgit v1.2.3


From 3fd22af808f4d7455ba91596d334438c7ee0f889 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Mon, 15 Jun 2015 11:26:19 -0400
Subject: sock_diag: specify info_size per inet protocol

Previously, there was no clear distinction between the inet protocols
that used struct tcp_info to report information and those that didn't.
This change adds a specific size attribute to the inet_diag_handler
struct which defines these interfaces.  This will make dispatching
sock_diag get_info requests identical for all inet protocols in a
following patch.

Tested: ss -au
Tested: ss -at
Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inet_diag.h | 1 +
 net/dccp/diag.c           | 1 +
 net/ipv4/inet_diag.c      | 4 ++--
 net/ipv4/tcp_diag.c       | 1 +
 net/ipv4/udp_diag.c       | 2 ++
 5 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index ac48b10c9395..0e707f0c1a3e 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -24,6 +24,7 @@ struct inet_diag_handler {
 					  struct inet_diag_msg *r,
 					  void *info);
 	__u16		idiag_type;
+	__u16		idiag_info_size;
 };
 
 struct inet_connection_sock;
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index 5a45f8de5d99..2d84303ea6bf 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -66,6 +66,7 @@ static const struct inet_diag_handler dccp_diag_handler = {
 	.dump_one	 = dccp_diag_dump_one,
 	.idiag_get_info	 = dccp_diag_get_info,
 	.idiag_type	 = IPPROTO_DCCP,
+	.idiag_info_size = sizeof(struct tcp_info),
 };
 
 static int __init dccp_diag_init(void)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4d32262c7502..b1f01174bf32 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -200,9 +200,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	}
 #undef EXPIRES_IN_MS
 
-	if (ext & (1 << (INET_DIAG_INFO - 1))) {
+	if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
 		attr = nla_reserve(skb, INET_DIAG_INFO,
-				   sizeof(struct tcp_info));
+				   handler->idiag_info_size);
 		if (!attr)
 			goto errout;
 
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 79b34a0f4a4a..423e3881a40b 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -50,6 +50,7 @@ static const struct inet_diag_handler tcp_diag_handler = {
 	.dump_one	 = tcp_diag_dump_one,
 	.idiag_get_info	 = tcp_diag_get_info,
 	.idiag_type	 = IPPROTO_TCP,
+	.idiag_info_size = sizeof(struct tcp_info),
 };
 
 static int __init tcp_diag_init(void)
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index b763c39ae1d7..6116604bf6e8 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -170,6 +170,7 @@ static const struct inet_diag_handler udp_diag_handler = {
 	.dump_one	 = udp_diag_dump_one,
 	.idiag_get_info  = udp_diag_get_info,
 	.idiag_type	 = IPPROTO_UDP,
+	.idiag_info_size = 0,
 };
 
 static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
@@ -190,6 +191,7 @@ static const struct inet_diag_handler udplite_diag_handler = {
 	.dump_one	 = udplite_diag_dump_one,
 	.idiag_get_info  = udp_diag_get_info,
 	.idiag_type	 = IPPROTO_UDPLITE,
+	.idiag_info_size = 0,
 };
 
 static int __init udp_diag_init(void)
-- 
cgit v1.2.3


From d37e296979ed1652aec6850e2d736bd0ebf0cdb1 Mon Sep 17 00:00:00 2001
From: Tadeusz Struk <tadeusz.struk@intel.com>
Date: Mon, 15 Jun 2015 13:18:36 -0700
Subject: MPILIB: add mpi_read_buf() and mpi_get_size() helpers

Added a mpi_read_buf() helper function to export MPI to a buf provided by
the user, and a mpi_get_size() helper, that tells the user how big the buf is.
Changed mpi_free to use kzfree instead of kfree because it is used to free
crypto keys.

Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/mpi.h | 15 +++++++++
 lib/mpi/mpicoder.c  | 87 +++++++++++++++++++++++++++++++++++++++++------------
 lib/mpi/mpiutil.c   |  6 ++--
 3 files changed, 86 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mpi.h b/include/linux/mpi.h
index 5af1b81def49..641b7d6fd096 100644
--- a/include/linux/mpi.h
+++ b/include/linux/mpi.h
@@ -81,6 +81,8 @@ MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread);
 int mpi_fromstr(MPI val, const char *str);
 u32 mpi_get_keyid(MPI a, u32 *keyid);
 void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign);
+int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
+		    int *sign);
 void *mpi_get_secure_buffer(MPI a, unsigned *nbytes, int *sign);
 int mpi_set_buffer(MPI a, const void *buffer, unsigned nbytes, int sign);
 
@@ -142,4 +144,17 @@ int mpi_rshift(MPI x, MPI a, unsigned n);
 /*-- mpi-inv.c --*/
 int mpi_invm(MPI x, MPI u, MPI v);
 
+/* inline functions */
+
+/**
+ * mpi_get_size() - returns max size required to store the number
+ *
+ * @a:	A multi precision integer for which we want to allocate a bufer
+ *
+ * Return: size required to store the number
+ */
+static inline unsigned int mpi_get_size(MPI a)
+{
+	return a->nlimbs * BYTES_PER_MPI_LIMB;
+}
 #endif /*G10_MPI_H */
diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c
index 4cc6442733f4..bc0a1da8afba 100644
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -128,28 +128,36 @@ leave:
 }
 EXPORT_SYMBOL_GPL(mpi_read_from_buffer);
 
-/****************
- * Return an allocated buffer with the MPI (msb first).
- * NBYTES receives the length of this buffer. Caller must free the
- * return string (This function does return a 0 byte buffer with NBYTES
- * set to zero if the value of A is zero. If sign is not NULL, it will
- * be set to the sign of the A.
+/**
+ * mpi_read_buffer() - read MPI to a bufer provided by user (msb first)
+ *
+ * @a:		a multi precision integer
+ * @buf:	bufer to which the output will be written to. Needs to be at
+ *		leaset mpi_get_size(a) long.
+ * @buf_len:	size of the buf.
+ * @nbytes:	receives the actual length of the data written.
+ * @sign:	if not NULL, it will be set to the sign of a.
+ *
+ * Return:	0 on success or error code in case of error
  */
-void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign)
+int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
+		    int *sign)
 {
-	uint8_t *p, *buffer;
+	uint8_t *p;
 	mpi_limb_t alimb;
+	unsigned int n = mpi_get_size(a);
 	int i;
-	unsigned int n;
+
+	if (buf_len < n || !buf)
+		return -EINVAL;
 
 	if (sign)
 		*sign = a->sign;
-	*nbytes = n = a->nlimbs * BYTES_PER_MPI_LIMB;
-	if (!n)
-		n++;		/* avoid zero length allocation */
-	p = buffer = kmalloc(n, GFP_KERNEL);
-	if (!p)
-		return NULL;
+
+	if (nbytes)
+		*nbytes = n;
+
+	p = buf;
 
 	for (i = a->nlimbs - 1; i >= 0; i--) {
 		alimb = a->d[i];
@@ -171,15 +179,56 @@ void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign)
 #error please implement for this limb size.
 #endif
 	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_read_buffer);
+
+/*
+ * mpi_get_buffer() - Returns an allocated buffer with the MPI (msb first).
+ * Caller must free the return string.
+ * This function does return a 0 byte buffer with nbytes set to zero if the
+ * value of A is zero.
+ *
+ * @a:		a multi precision integer.
+ * @nbytes:	receives the length of this buffer.
+ * @sign:	if not NULL, it will be set to the sign of the a.
+ *
+ * Return:	Pointer to MPI buffer or NULL on error
+ */
+void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign)
+{
+	uint8_t *buf, *p;
+	unsigned int n;
+	int ret;
+
+	if (!nbytes)
+		return NULL;
+
+	n = mpi_get_size(a);
+
+	if (!n)
+		n++;
+
+	buf = kmalloc(n, GFP_KERNEL);
+
+	if (!buf)
+		return NULL;
+
+	ret = mpi_read_buffer(a, buf, n, nbytes, sign);
+
+	if (ret) {
+		kfree(buf);
+		return NULL;
+	}
 
 	/* this is sub-optimal but we need to do the shift operation
 	 * because the caller has to free the returned buffer */
-	for (p = buffer; !*p && *nbytes; p++, --*nbytes)
+	for (p = buf; !*p && *nbytes; p++, --*nbytes)
 		;
-	if (p != buffer)
-		memmove(buffer, p, *nbytes);
+	if (p != buf)
+		memmove(buf, p, *nbytes);
 
-	return buffer;
+	return buf;
 }
 EXPORT_SYMBOL_GPL(mpi_get_buffer);
 
diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c
index bf076d281d40..314f4dfa603e 100644
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -69,7 +69,7 @@ void mpi_free_limb_space(mpi_ptr_t a)
 	if (!a)
 		return;
 
-	kfree(a);
+	kzfree(a);
 }
 
 void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs)
@@ -95,7 +95,7 @@ int mpi_resize(MPI a, unsigned nlimbs)
 		if (!p)
 			return -ENOMEM;
 		memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t));
-		kfree(a->d);
+		kzfree(a->d);
 		a->d = p;
 	} else {
 		a->d = kzalloc(nlimbs * sizeof(mpi_limb_t), GFP_KERNEL);
@@ -112,7 +112,7 @@ void mpi_free(MPI a)
 		return;
 
 	if (a->flags & 4)
-		kfree(a->d);
+		kzfree(a->d);
 	else
 		mpi_free_limb_space(a->d);
 
-- 
cgit v1.2.3


From c7cfc94096db28d3072b402c224eb50349926e24 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:10 +0800
Subject: genirq: Enhance irq_data_to_desc() to support hierarchy irqdomain

For irq associated with hierarchy irqdomains, there will be multiple
irq_datas for one irq_desc. So enhance irq_data_to_desc() to support
hierarchy irqdomain. Also export irq_data_to_desc() as an inline
function for later reuse.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1433145945-789-2-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdesc.h | 9 +++++++++
 kernel/irq/internals.h  | 2 --
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index dd1109fb241e..a113a8dc7438 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -93,6 +93,15 @@ struct irq_desc {
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
 
+static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
+{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+	return irq_to_desc(data->irq);
+#else
+	return container_of(data, struct irq_desc, irq_data);
+#endif
+}
+
 static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc)
 {
 	return &desc->irq_data;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index df553b0af936..b93d434e70bd 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,8 +59,6 @@ enum {
 #include "debug.h"
 #include "settings.h"
 
-#define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
-
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
 extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
-- 
cgit v1.2.3


From 4158c2eca3c77ed3cccdcaeab153aad4e433369c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Fri, 12 Jun 2015 10:14:02 +0200
Subject: iommu/vt-d: Detect pre enabled translation

Add code to detect whether translation is already enabled in
the IOMMU. Save this state in a flags field added to
struct intel_iommu.

Tested-by: ZhenHua Li <zhen-hual@hp.com>
Tested-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel-iommu.c | 19 +++++++++++++++++++
 include/linux/intel-iommu.h |  4 ++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index bf3e450b5b97..39b90621a1a1 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -443,6 +443,20 @@ static LIST_HEAD(device_domain_list);
 
 static const struct iommu_ops intel_iommu_ops;
 
+static bool translation_pre_enabled(struct intel_iommu *iommu)
+{
+	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
+}
+
+static void init_translation_status(struct intel_iommu *iommu)
+{
+	u32 gsts;
+
+	gsts = readl(iommu->reg + DMAR_GSTS_REG);
+	if (gsts & DMA_GSTS_TES)
+		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
+}
+
 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 {
@@ -2809,6 +2823,11 @@ static int __init init_dmars(void)
 		if (ret)
 			goto free_iommu;
 
+		init_translation_status(iommu);
+
+		if (translation_pre_enabled(iommu))
+			pr_info("Translation already enabled - trying to copy translation structures\n");
+
 		/*
 		 * TBD:
 		 * we could share the same root & context tables
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index a240e61a7700..b85b81ad5eba 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -320,6 +320,9 @@ enum {
 	MAX_SR_DMAR_REGS
 };
 
+#define VTD_FLAG_TRANS_PRE_ENABLED	(1 << 0)
+#define VTD_FLAG_IRQ_REMAP_PRE_ENABLED	(1 << 1)
+
 struct intel_iommu {
 	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
 	u64 		reg_phys; /* physical address of hw register set */
@@ -351,6 +354,7 @@ struct intel_iommu {
 #endif
 	struct device	*iommu_dev; /* IOMMU-sysfs device */
 	int		node;
+	u32		flags;      /* Software defined flags */
 };
 
 static inline void __iommu_flush_cache(
-- 
cgit v1.2.3


From af3b358e48115588d905cc07a47b3f356e0d01d1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <jroedel@suse.de>
Date: Fri, 12 Jun 2015 15:00:21 +0200
Subject: iommu/vt-d: Copy IR table from old kernel when in kdump mode

When we are booting into a kdump kernel and find IR enabled,
copy over the contents of the previous IR table so that
spurious interrupts will not be target aborted.

Tested-by: ZhenHua Li <zhen-hual@hp.com>
Tested-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel_irq_remapping.c | 70 +++++++++++++++++++++++++++++++++++++
 include/linux/intel-iommu.h         |  1 +
 2 files changed, 71 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index 84970281b754..2a901219f953 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -11,6 +11,7 @@
 #include <linux/irq.h>
 #include <linux/intel-iommu.h>
 #include <linux/acpi.h>
+#include <linux/crash_dump.h>
 #include <asm/io_apic.h>
 #include <asm/smp.h>
 #include <asm/cpu.h>
@@ -54,8 +55,28 @@ static struct hpet_scope ir_hpet[MAX_HPET_TBS];
  */
 static DEFINE_RAW_SPINLOCK(irq_2_ir_lock);
 
+static void iommu_disable_irq_remapping(struct intel_iommu *iommu);
 static int __init parse_ioapics_under_ir(void);
 
+static bool ir_pre_enabled(struct intel_iommu *iommu)
+{
+	return (iommu->flags & VTD_FLAG_IRQ_REMAP_PRE_ENABLED);
+}
+
+static void clear_ir_pre_enabled(struct intel_iommu *iommu)
+{
+	iommu->flags &= ~VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
+}
+
+static void init_ir_status(struct intel_iommu *iommu)
+{
+	u32 gsts;
+
+	gsts = readl(iommu->reg + DMAR_GSTS_REG);
+	if (gsts & DMA_GSTS_IRES)
+		iommu->flags |= VTD_FLAG_IRQ_REMAP_PRE_ENABLED;
+}
+
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
 	struct irq_cfg *cfg = irq_cfg(irq);
@@ -426,6 +447,44 @@ static int set_msi_sid(struct irte *irte, struct pci_dev *dev)
 	return 0;
 }
 
+static int iommu_load_old_irte(struct intel_iommu *iommu)
+{
+	struct irte *old_ir_table;
+	phys_addr_t irt_phys;
+	size_t size;
+	u64 irta;
+
+	if (!is_kdump_kernel()) {
+		pr_warn("IRQ remapping was enabled on %s but we are not in kdump mode\n",
+			iommu->name);
+		clear_ir_pre_enabled(iommu);
+		iommu_disable_irq_remapping(iommu);
+		return -EINVAL;
+	}
+
+	/* Check whether the old ir-table has the same size as ours */
+	irta = dmar_readq(iommu->reg + DMAR_IRTA_REG);
+	if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK)
+	     != INTR_REMAP_TABLE_REG_SIZE)
+		return -EINVAL;
+
+	irt_phys = irta & VTD_PAGE_MASK;
+	size     = INTR_REMAP_TABLE_ENTRIES*sizeof(struct irte);
+
+	/* Map the old IR table */
+	old_ir_table = ioremap_cache(irt_phys, size);
+	if (!old_ir_table)
+		return -ENOMEM;
+
+	/* Copy data over */
+	memcpy(iommu->ir_table->base, old_ir_table, size);
+
+	__iommu_flush_cache(iommu, iommu->ir_table->base, size);
+
+	return 0;
+}
+
+
 static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode)
 {
 	unsigned long flags;
@@ -531,6 +590,17 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
 		}
 	}
 
+	init_ir_status(iommu);
+
+	if (ir_pre_enabled(iommu)) {
+		if (iommu_load_old_irte(iommu))
+			pr_err("Failed to copy IR table for %s from previous kernel\n",
+			       iommu->name);
+		else
+			pr_info("Copied IR table for %s from previous kernel\n",
+				iommu->name);
+	}
+
 	iommu_set_irq_remapping(iommu, eim_mode);
 
 	return 0;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index b85b81ad5eba..9e14edcf7f6e 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -296,6 +296,7 @@ struct q_inval {
 /* 1MB - maximum possible interrupt remapping table size */
 #define INTR_REMAP_PAGE_ORDER	8
 #define INTR_REMAP_TABLE_REG_SIZE	0xf
+#define INTR_REMAP_TABLE_REG_SIZE_MASK  0xf
 
 #define INTR_REMAP_TABLE_ENTRIES	65536
 
-- 
cgit v1.2.3


From 764ad8ba8cd4c6f836fca9378f8c5121aece0842 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Tue, 9 Jun 2015 19:43:56 -0400
Subject: nfs: increase size of EXCHANGE_ID name string buffer

The current buffer is much too small if you have a relatively long
hostname. Bring it up to the size of the one that SETCLIENTID has.

Cc: <stable@vger.kernel.org>
Reported-by: Michael Skralivetsky <michael.skralivetsky@primarydata.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/nfs_xdr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 93ab6071bbe9..e9e9a8dcfb47 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1142,7 +1142,7 @@ struct nfs41_state_protection {
 	struct nfs4_op_map allow;
 };
 
-#define NFS4_EXCHANGE_ID_LEN	(48)
+#define NFS4_EXCHANGE_ID_LEN	(127)
 struct nfs41_exchange_id_args {
 	struct nfs_client		*client;
 	nfs4_verifier			*verifier;
-- 
cgit v1.2.3


From 3a6bb738792500e8b4534c0350c13a132bac0492 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Tue, 9 Jun 2015 19:43:57 -0400
Subject: nfs: convert setclientid and exchange_id encoders to use
 clp->cl_owner_id

...instead of buffers that are part of their arg structs. We already
hold a reference to the client, so we might as well use the allocated
buffer. In the event that we can't allocate the clp->cl_owner_id, then
just return -ENOMEM.

Note too that we switch from a GFP_KERNEL allocation here to GFP_NOFS.
It's possible we could end up trying to do a SETCLIENTID or EXCHANGE_ID
in order to reclaim some memory, and the GFP_KERNEL allocations in the
existing code could cause recursion back into NFS reclaim.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 21 ++++++++++++++++-----
 fs/nfs/nfs4xdr.c        |  8 +++++---
 include/linux/nfs_xdr.h |  2 +-
 3 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d689ea37be84..b2afe9556147 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5046,7 +5046,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	struct nfs4_setclientid setclientid = {
 		.sc_verifier = &sc_verifier,
 		.sc_prog = program,
-		.sc_cb_ident = clp->cl_cb_ident,
+		.sc_clnt = clp,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -5076,6 +5076,12 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 				nfs4_init_nonuniform_client_string(clp,
 						setclientid.sc_name,
 						sizeof(setclientid.sc_name));
+
+	if (!clp->cl_owner_id) {
+		status = -ENOMEM;
+		goto out;
+	}
+
 	/* cb_client4 */
 	setclientid.sc_netid_len =
 				nfs4_init_callback_netid(clp,
@@ -5085,9 +5091,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 				sizeof(setclientid.sc_uaddr), "%s.%u.%u",
 				clp->cl_ipaddr, port >> 8, port & 255);
 
-	dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
+	dprintk("NFS call  setclientid auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		setclientid.sc_name_len, setclientid.sc_name);
+		clp->cl_owner_id);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task)) {
 		status = PTR_ERR(task);
@@ -6850,9 +6856,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	nfs4_init_boot_verifier(clp, &verifier);
 	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
 							sizeof(args.id));
-	dprintk("NFS call  exchange_id auth=%s, '%.*s'\n",
+	if (!clp->cl_owner_id) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	dprintk("NFS call  exchange_id auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		args.id_len, args.id);
+		clp->cl_owner_id);
 
 	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
 					GFP_NOFS);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0aea97841d30..f58e8a979397 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1667,13 +1667,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
 	encode_nfs4_verifier(xdr, setclientid->sc_verifier);
 
-	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
+	encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
+			setclientid->sc_clnt->cl_owner_id);
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	p = reserve_space(xdr, 4);
-	*p = cpu_to_be32(setclientid->sc_cb_ident);
+	*p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
 }
 
 static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
@@ -1747,7 +1748,8 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 	encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
 	encode_nfs4_verifier(xdr, args->verifier);
 
-	encode_string(xdr, args->id_len, args->id);
+	encode_string(xdr, strlen(args->client->cl_owner_id),
+			args->client->cl_owner_id);
 
 	encode_uint32(xdr, args->flags);
 	encode_uint32(xdr, args->state_protect.how);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e9e9a8dcfb47..6c0b423d781b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -994,7 +994,7 @@ struct nfs4_setclientid {
 	char				sc_netid[RPCBIND_MAXNETIDLEN + 1];
 	unsigned int			sc_uaddr_len;
 	char				sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
-	u32				sc_cb_ident;
+	struct nfs_client		*sc_clnt;
 	struct rpc_cred			*sc_cred;
 };
 
-- 
cgit v1.2.3


From 873e385116b2cc5c7daca8f51881371fadb90970 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Tue, 9 Jun 2015 19:44:00 -0400
Subject: nfs: make nfs4_init_uniform_client_string use a dynamically allocated
 buffer

Change the uniform client string generator to dynamically allocate the
NFSv4 client name string buffer. With this patch, we can eliminate the
buffers that are embedded within the "args" structs and simply use the
name string that is hanging off the client.

This uniform string case is a little simpler than the nonuniform since
we don't need to deal with RCU, but we do have two different cases,
depending on whether there is a uniquifier or not.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 112 +++++++++++++++++++++++++++++++++---------------
 include/linux/nfs_xdr.h |   6 ---
 2 files changed, 77 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c20322636a68..d181090124db 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5007,28 +5007,78 @@ retry:
 	return 0;
 }
 
-static unsigned int
-nfs4_init_uniform_client_string(struct nfs_client *clp,
-				char *buf, size_t len)
+static int
+nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 {
-	const char *nodename = clp->cl_rpcclient->cl_nodename;
-	unsigned int result;
+	int result;
+	size_t len;
+	char *str;
+
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(nfs4_client_id_uniquifier) + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			nfs4_client_id_uniquifier,
+			clp->cl_rpcclient->cl_nodename);
+	if (result >= len) {
+		kfree(str);
+		return -EINVAL;
+	}
+	clp->cl_owner_id = str;
+	return 0;
+}
+
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
+{
+	int result;
+	size_t len;
+	char *str;
 
 	if (clp->cl_owner_id != NULL)
-		return strlcpy(buf, clp->cl_owner_id, len);
+		return 0;
 
 	if (nfs4_client_id_uniquifier[0] != '\0')
-		result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
-				clp->rpc_ops->version,
-				clp->cl_minorversion,
-				nfs4_client_id_uniquifier,
-				nodename);
-	else
-		result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
-				clp->rpc_ops->version, clp->cl_minorversion,
-				nodename);
-	clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-	return result;
+		return nfs4_init_uniquifier_client_string(clp);
+
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			clp->cl_rpcclient->cl_nodename);
+	if (result >= len) {
+		kfree(str);
+		return -EINVAL;
+	}
+	clp->cl_owner_id = str;
+	return 0;
 }
 
 /*
@@ -5095,20 +5145,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 
 	/* nfs_client_id4 */
 	nfs4_init_boot_verifier(clp, &sc_verifier);
-	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) {
-		setclientid.sc_name_len =
-				nfs4_init_uniform_client_string(clp,
-						setclientid.sc_name,
-						sizeof(setclientid.sc_name));
-		if (!clp->cl_owner_id) {
-			status = -ENOMEM;
-			goto out;
-		}
-	} else {
+
+	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
+		status = nfs4_init_uniform_client_string(clp);
+	else
 		status = nfs4_init_nonuniform_client_string(clp);
-		if (status)
-			goto out;
-	}
+
+	if (status)
+		goto out;
 
 	/* cb_client4 */
 	setclientid.sc_netid_len =
@@ -6882,12 +6926,10 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	};
 
 	nfs4_init_boot_verifier(clp, &verifier);
-	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
-							sizeof(args.id));
-	if (!clp->cl_owner_id) {
-		status = -ENOMEM;
+
+	status = nfs4_init_uniform_client_string(clp);
+	if (status)
 		goto out;
-	}
 
 	dprintk("NFS call  exchange_id auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6c0b423d781b..c959e7eb5bd4 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -984,11 +984,8 @@ struct nfs4_readlink_res {
 	struct nfs4_sequence_res	seq_res;
 };
 
-#define NFS4_SETCLIENTID_NAMELEN	(127)
 struct nfs4_setclientid {
 	const nfs4_verifier *		sc_verifier;
-	unsigned int			sc_name_len;
-	char				sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
 	u32				sc_prog;
 	unsigned int			sc_netid_len;
 	char				sc_netid[RPCBIND_MAXNETIDLEN + 1];
@@ -1142,12 +1139,9 @@ struct nfs41_state_protection {
 	struct nfs4_op_map allow;
 };
 
-#define NFS4_EXCHANGE_ID_LEN	(127)
 struct nfs41_exchange_id_args {
 	struct nfs_client		*client;
 	nfs4_verifier			*verifier;
-	unsigned int 			id_len;
-	char 				id[NFS4_EXCHANGE_ID_LEN];
 	u32				flags;
 	struct nfs41_state_protection	state_protect;
 };
-- 
cgit v1.2.3


From 22c1587adfed1977d26a57ac2831d03e37f8f805 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 27 Apr 2015 17:37:53 -0400
Subject: init: delete the __cpuinit related stubs

The __cpuinit support was removed several releases ago in 3.11-rc1 with
commit 22f0a27367742f65130c0fb25ef00f7297e032c1 ("init.h: remove __cpuinit
sections from the kernel")

People have had a chance to update their out of tree code, so now we remove
the no-op stubs to ensure no more new use cases can creep back in.

Also delete the mention of __cpuinitdata from the tag script.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 include/linux/init.h | 11 -----------
 scripts/tags.sh      |  2 +-
 2 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 21b6d768edd7..7c68c36d3fd8 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -91,14 +91,6 @@
 
 #define __exit          __section(.exit.text) __exitused __cold notrace
 
-/* temporary, until all users are removed */
-#define __cpuinit
-#define __cpuinitdata
-#define __cpuinitconst
-#define __cpuexit
-#define __cpuexitdata
-#define __cpuexitconst
-
 /* Used for MEMORY_HOTPLUG */
 #define __meminit        __section(.meminit.text) __cold notrace
 #define __meminitdata    __section(.meminit.data)
@@ -116,9 +108,6 @@
 #define __INITRODATA	.section	".init.rodata","a",%progbits
 #define __FINITDATA	.previous
 
-/* temporary, until all users are removed */
-#define __CPUINIT
-
 #define __MEMINIT        .section	".meminit.text", "ax"
 #define __MEMINITDATA    .section	".meminit.data", "aw"
 #define __MEMINITRODATA  .section	".meminit.rodata", "a"
diff --git a/scripts/tags.sh b/scripts/tags.sh
index cdb491d84503..c0a932dff329 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -154,7 +154,7 @@ exuberant()
 {
 	all_target_sources | xargs $1 -a                        \
 	-I __initdata,__exitdata,__initconst,			\
-	-I __cpuinitdata,__initdata_memblock			\
+	-I __initdata_memblock					\
 	-I __refdata,__attribute,__maybe_unused,__always_unused \
 	-I __acquires,__releases,__deprecated			\
 	-I __read_mostly,__aligned,____cacheline_aligned        \
-- 
cgit v1.2.3


From f309d4443130bf814e991f836e919dca22df37ae Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 1 May 2015 20:10:57 -0400
Subject: platform_device: better support builtin boilerplate avoidance

We have macros that help reduce the boilerplate for modules
that register with no extra init/exit complexity other than the
most standard use case.  However we see an increasing number of
non-modular drivers using these modular_driver() type register
functions.

There are several downsides to this:
1) The code can appear modular to a reader of the code, and they
   won't know if the code really is modular without checking the
   Makefile and Kconfig to see if compilation is governed by a
   bool or tristate.
2) Coders of drivers may be tempted to code up an __exit function
   that is never used, just in order to satisfy the required three
   args of the modular registration function.
3) Non-modular code ends up including the <module.h> which increases
   CPP overhead that they don't need.
4) It hinders us from performing better separation of the module
   init code and the generic init code.

Here we introduce similar macros, with the mapping from module_driver
to builtin_driver and similar, so that simple changes of:

  module_platform_driver()       --->  builtin_platform_driver()
  module_platform_driver_probe() --->  builtin_platform_driver_probe().

can help us avoid #3 above, without having to code up the same
__init functions and device_initcall() boilerplate.

For non modular code, module_init becomes __initcall.  But direct use
of __initcall is discouraged, vs. one of the priority categorized
subgroups.  As __initcall gets mapped onto device_initcall, our
use of device_initcall directly in this change means that the
runtime impact is zero -- drivers will remain at level 6 in the
initcall ordering.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 include/linux/device.h          | 22 ++++++++++++++++++++++
 include/linux/platform_device.h | 23 +++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 6558af90c8fe..c2d6167cb4a3 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1269,4 +1269,26 @@ static void __exit __driver##_exit(void) \
 } \
 module_exit(__driver##_exit);
 
+/**
+ * builtin_driver() - Helper macro for drivers that don't do anything
+ * special in init and have no exit. This eliminates some boilerplate.
+ * Each driver may only use this macro once, and calling it replaces
+ * device_initcall (or in some cases, the legacy __initcall).  This is
+ * meant to be a direct parallel of module_driver() above but without
+ * the __exit stuff that is not used for builtin cases.
+ *
+ * @__driver: driver name
+ * @__register: register function for this driver type
+ * @...: Additional arguments to be passed to __register
+ *
+ * Use this macro to construct bus specific macros for registering
+ * drivers, and do not use it on its own.
+ */
+#define builtin_driver(__driver, __register, ...) \
+static int __init __driver##_init(void) \
+{ \
+	return __register(&(__driver) , ##__VA_ARGS__); \
+} \
+device_initcall(__driver##_init);
+
 #endif /* _DEVICE_H_ */
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 58f1e75ba105..bba08f44cc97 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -222,6 +222,15 @@ static inline void platform_set_drvdata(struct platform_device *pdev,
 	module_driver(__platform_driver, platform_driver_register, \
 			platform_driver_unregister)
 
+/* builtin_platform_driver() - Helper macro for builtin drivers that
+ * don't do anything special in driver init.  This eliminates some
+ * boilerplate.  Each driver may only use this macro once, and
+ * calling it replaces device_initcall().  Note this is meant to be
+ * a parallel of module_platform_driver() above, but w/o _exit stuff.
+ */
+#define builtin_platform_driver(__platform_driver) \
+	builtin_driver(__platform_driver, platform_driver_register)
+
 /* module_platform_driver_probe() - Helper macro for drivers that don't do
  * anything special in module init/exit.  This eliminates a lot of
  * boilerplate.  Each module may only use this macro once, and
@@ -240,6 +249,20 @@ static void __exit __platform_driver##_exit(void) \
 } \
 module_exit(__platform_driver##_exit);
 
+/* builtin_platform_driver_probe() - Helper macro for drivers that don't do
+ * anything special in device init.  This eliminates some boilerplate.  Each
+ * driver may only use this macro once, and using it replaces device_initcall.
+ * This is meant to be a parallel of module_platform_driver_probe above, but
+ * without the __exit parts.
+ */
+#define builtin_platform_driver_probe(__platform_driver, __platform_probe) \
+static int __init __platform_driver##_init(void) \
+{ \
+	return platform_driver_probe(&(__platform_driver), \
+				     __platform_probe);    \
+} \
+device_initcall(__platform_driver##_init); \
+
 #define platform_create_bundle(driver, probe, res, n_res, data, size) \
 	__platform_create_bundle(driver, probe, res, n_res, data, size, THIS_MODULE)
 extern struct platform_device *__platform_create_bundle(
-- 
cgit v1.2.3


From fec47d863587c272d6fbf4e50066209c953d5e60 Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Fri, 22 May 2015 15:45:27 -0500
Subject: remoteproc: introduce rproc_get_by_phandle API

Allow users of remoteproc the ability to get a handle to an rproc by
passing a phandle supplied in the user's device tree node. This is
useful in situations that require manual booting of the rproc.

This patch uses the code removed by commit 40e575b1d0b3 ("remoteproc:
remove the get_by_name/put API") for the ref counting but is modified
to use a simple list and locking mechanism and has rproc_get_by_name
replaced with an rproc_get_by_phandle API.

Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Suman Anna <s-anna@ti.com>
[fix order of Signed-off-by tags]
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 Documentation/remoteproc.txt         |  6 +++++
 drivers/remoteproc/remoteproc_core.c | 50 ++++++++++++++++++++++++++++++++++++
 include/linux/remoteproc.h           |  7 ++---
 3 files changed, 60 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/remoteproc.txt b/Documentation/remoteproc.txt
index e6469fdcf89a..ef0219fa4bb4 100644
--- a/Documentation/remoteproc.txt
+++ b/Documentation/remoteproc.txt
@@ -51,6 +51,12 @@ cost.
         rproc_shutdown() returns, and users can still use it with a subsequent
         rproc_boot(), if needed.
 
+  struct rproc *rproc_get_by_phandle(phandle phandle)
+    - Find an rproc handle using a device tree phandle. Returns the rproc
+      handle on success, and NULL on failure. This function increments
+      the remote processor's refcount, so always use rproc_put() to
+      decrement it back once rproc isn't needed anymore.
+
 3. Typical usage
 
 #include <linux/remoteproc.h>
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index e991512e04ed..e1a6d693b8bb 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -44,6 +44,9 @@
 
 #include "remoteproc_internal.h"
 
+static DEFINE_MUTEX(rproc_list_mutex);
+static LIST_HEAD(rproc_list);
+
 typedef int (*rproc_handle_resources_t)(struct rproc *rproc,
 				struct resource_table *table, int len);
 typedef int (*rproc_handle_resource_t)(struct rproc *rproc,
@@ -1144,6 +1147,43 @@ out:
 }
 EXPORT_SYMBOL(rproc_shutdown);
 
+/**
+ * rproc_get_by_phandle() - find a remote processor by phandle
+ * @phandle: phandle to the rproc
+ *
+ * Finds an rproc handle using the remote processor's phandle, and then
+ * return a handle to the rproc.
+ *
+ * This function increments the remote processor's refcount, so always
+ * use rproc_put() to decrement it back once rproc isn't needed anymore.
+ *
+ * Returns the rproc handle on success, and NULL on failure.
+ */
+struct rproc *rproc_get_by_phandle(phandle phandle)
+{
+	struct rproc *rproc = NULL, *r;
+	struct device_node *np;
+
+	np = of_find_node_by_phandle(phandle);
+	if (!np)
+		return NULL;
+
+	mutex_lock(&rproc_list_mutex);
+	list_for_each_entry(r, &rproc_list, node) {
+		if (r->dev.parent && r->dev.parent->of_node == np) {
+			rproc = r;
+			get_device(&rproc->dev);
+			break;
+		}
+	}
+	mutex_unlock(&rproc_list_mutex);
+
+	of_node_put(np);
+
+	return rproc;
+}
+EXPORT_SYMBOL(rproc_get_by_phandle);
+
 /**
  * rproc_add() - register a remote processor
  * @rproc: the remote processor handle to register
@@ -1173,6 +1213,11 @@ int rproc_add(struct rproc *rproc)
 	if (ret < 0)
 		return ret;
 
+	/* expose to rproc_get_by_phandle users */
+	mutex_lock(&rproc_list_mutex);
+	list_add(&rproc->node, &rproc_list);
+	mutex_unlock(&rproc_list_mutex);
+
 	dev_info(dev, "%s is available\n", rproc->name);
 
 	dev_info(dev, "Note: remoteproc is still under development and considered experimental.\n");
@@ -1360,6 +1405,11 @@ int rproc_del(struct rproc *rproc)
 	/* Free the copy of the resource table */
 	kfree(rproc->cached_table);
 
+	/* the rproc is downref'ed as soon as it's removed from the klist */
+	mutex_lock(&rproc_list_mutex);
+	list_del(&rproc->node);
+	mutex_unlock(&rproc_list_mutex);
+
 	device_del(&rproc->dev);
 
 	return 0;
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 78b8a9b9d40a..56739e5df1e6 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -36,11 +36,11 @@
 #define REMOTEPROC_H
 
 #include <linux/types.h>
-#include <linux/klist.h>
 #include <linux/mutex.h>
 #include <linux/virtio.h>
 #include <linux/completion.h>
 #include <linux/idr.h>
+#include <linux/of.h>
 
 /**
  * struct resource_table - firmware resource table header
@@ -375,7 +375,7 @@ enum rproc_crash_type {
 
 /**
  * struct rproc - represents a physical remote processor device
- * @node: klist node of this rproc object
+ * @node: list node of this rproc object
  * @domain: iommu domain
  * @name: human readable name of the rproc
  * @firmware: name of firmware file to be loaded
@@ -407,7 +407,7 @@ enum rproc_crash_type {
  * @has_iommu: flag to indicate if remote processor is behind an MMU
  */
 struct rproc {
-	struct klist_node node;
+	struct list_head node;
 	struct iommu_domain *domain;
 	const char *name;
 	const char *firmware;
@@ -481,6 +481,7 @@ struct rproc_vdev {
 	u32 rsc_offset;
 };
 
+struct rproc *rproc_get_by_phandle(phandle phandle);
 struct rproc *rproc_alloc(struct device *dev, const char *name,
 				const struct rproc_ops *ops,
 				const char *firmware, int len);
-- 
cgit v1.2.3


From 9f3520c3115b451ac1301779fc3c769d94907a70 Mon Sep 17 00:00:00 2001
From: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Date: Fri, 8 May 2015 18:19:05 +1000
Subject: wait: introduce wait_event_exclusive_cmd

It's just a variant of wait_event_cmd(), with exclusive flag being set.

For cases like RAID5, which puts many processes to sleep until 1/4
resources are free, a wake_up wakes up all processes to run, but
there is one process being able to get the resource as it's protected
by a spin lock. That ends up introducing heavy lock contentions, and
hurts performance badly.

Here introduce wait_event_exclusive_cmd to relieve the lock contention
naturally by letting wake_up just wake up one process.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
v2: its assumed that wait*() and __wait*() have the same arguments - peterz

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/wait.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 2db83349865b..db78c7204947 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -358,6 +358,19 @@ do {									\
 	__ret;								\
 })
 
+#define __wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)		\
+	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0,	\
+			    cmd1; schedule(); cmd2)
+/*
+ * Just like wait_event_cmd(), except it sets exclusive flag
+ */
+#define wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)		\
+do {									\
+	if (condition)							\
+		break;							\
+	__wait_event_exclusive_cmd(wq, condition, cmd1, cmd2);		\
+} while (0)
+
 #define __wait_event_cmd(wq, condition, cmd1, cmd2)			\
 	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
 			    cmd1; schedule(); cmd2)
-- 
cgit v1.2.3


From a01f7cd657c95941079d548ef7fbf0cc370c1259 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Fri, 22 May 2015 15:45:28 -0500
Subject: remoteproc: add a rproc ops for performing address translation

The rproc_da_to_va API is currently used to perform any device to
kernel address translations to meet the different needs of the remoteproc
core/drivers (eg: loading). The functionality is achieved within the
remoteproc core, and is limited only for carveouts allocated within the
core.

A new rproc ops, da_to_va, is added to provide flexibility to platform
implementations to perform the address translation themselves when the
above conditions cannot be met by the implementations. The rproc_da_to_va()
API is extended to invoke this ops if present, and fallback to regular
processing if the platform implementation cannot provide the translation.
This will allow any remoteproc implementations to translate addresses for
dedicated memories like internal memories.

While at this, also update the rproc_da_to_va() documentation since it
is an exported function.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 drivers/remoteproc/remoteproc_core.c | 31 +++++++++++++++++++++++++------
 include/linux/remoteproc.h           |  2 ++
 2 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index e1a6d693b8bb..b4ab38015cf2 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -137,28 +137,46 @@ static void rproc_disable_iommu(struct rproc *rproc)
 	iommu_domain_free(domain);
 }
 
-/*
+/**
+ * rproc_da_to_va() - lookup the kernel virtual address for a remoteproc address
+ * @rproc: handle of a remote processor
+ * @da: remoteproc device address to translate
+ * @len: length of the memory region @da is pointing to
+ *
  * Some remote processors will ask us to allocate them physically contiguous
  * memory regions (which we call "carveouts"), and map them to specific
- * device addresses (which are hardcoded in the firmware).
+ * device addresses (which are hardcoded in the firmware). They may also have
+ * dedicated memory regions internal to the processors, and use them either
+ * exclusively or alongside carveouts.
  *
  * They may then ask us to copy objects into specific device addresses (e.g.
  * code/data sections) or expose us certain symbols in other device address
  * (e.g. their trace buffer).
  *
- * This function is an internal helper with which we can go over the allocated
- * carveouts and translate specific device address to kernel virtual addresses
- * so we can access the referenced memory.
+ * This function is a helper function with which we can go over the allocated
+ * carveouts and translate specific device addresses to kernel virtual addresses
+ * so we can access the referenced memory. This function also allows to perform
+ * translations on the internal remoteproc memory regions through a platform
+ * implementation specific da_to_va ops, if present.
+ *
+ * The function returns a valid kernel address on success or NULL on failure.
  *
  * Note: phys_to_virt(iommu_iova_to_phys(rproc->domain, da)) will work too,
  * but only on kernel direct mapped RAM memory. Instead, we're just using
- * here the output of the DMA API, which should be more correct.
+ * here the output of the DMA API for the carveouts, which should be more
+ * correct.
  */
 void *rproc_da_to_va(struct rproc *rproc, u64 da, int len)
 {
 	struct rproc_mem_entry *carveout;
 	void *ptr = NULL;
 
+	if (rproc->ops->da_to_va) {
+		ptr = rproc->ops->da_to_va(rproc, da, len);
+		if (ptr)
+			goto out;
+	}
+
 	list_for_each_entry(carveout, &rproc->carveouts, node) {
 		int offset = da - carveout->da;
 
@@ -175,6 +193,7 @@ void *rproc_da_to_va(struct rproc *rproc, u64 da, int len)
 		break;
 	}
 
+out:
 	return ptr;
 }
 EXPORT_SYMBOL(rproc_da_to_va);
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 56739e5df1e6..9c4e1384f636 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -330,11 +330,13 @@ struct rproc;
  * @start:	power on the device and boot it
  * @stop:	power off the device
  * @kick:	kick a virtqueue (virtqueue id given as a parameter)
+ * @da_to_va:	optional platform hook to perform address translations
  */
 struct rproc_ops {
 	int (*start)(struct rproc *rproc);
 	int (*stop)(struct rproc *rproc);
 	void (*kick)(struct rproc *rproc, int vqid);
+	void * (*da_to_va)(struct rproc *rproc, u64 da, int len);
 };
 
 /**
-- 
cgit v1.2.3


From a01bc0d5f557bd8becd0ba75d09c39192004697e Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Fri, 22 May 2015 15:45:30 -0500
Subject: remoteproc/wkup_m3: add a remoteproc driver for TI Wakeup M3

Add a remoteproc driver to load the firmware and boot a small
Wakeup M3 processor present on TI AM33xx and AM43xx SoCs. This
Wakeup M3 remote processor is an integrated Cortex M3 that allows
the SoC to enter the lowest possible power state by taking control
from the MPU after it has gone into its own low power state and
shutting off any additional peripherals.

The Wakeup M3 processor has two internal memory regions - 16 kB of
unified instruction memory called UMEM used to store executable
code, and 8 kB of data memory called DMEM used for all data sections.
The Wakeup M3 processor executes its code entirely from within the
UMEM and uses the DMEM for any data. It does not use any external
memory or any other external resources. The device address view has
the UMEM at address 0x0 and DMEM at address 0x80000, and these are
computed automatically within the driver based on relative address
calculation from the corresponding device tree IOMEM resources.
These device addresses are used to aid the core remoteproc ELF
loader code to properly translate and load the firmware segments
through the .rproc_da_to_va ops.

Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 drivers/remoteproc/Kconfig            |  13 ++
 drivers/remoteproc/Makefile           |   1 +
 drivers/remoteproc/wkup_m3_rproc.c    | 257 ++++++++++++++++++++++++++++++++++
 include/linux/platform_data/wkup_m3.h |  30 ++++
 4 files changed, 301 insertions(+)
 create mode 100644 drivers/remoteproc/wkup_m3_rproc.c
 create mode 100644 include/linux/platform_data/wkup_m3.h

(limited to 'include/linux')

diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig
index 5e343bab9458..28c711f0ac6b 100644
--- a/drivers/remoteproc/Kconfig
+++ b/drivers/remoteproc/Kconfig
@@ -41,6 +41,19 @@ config STE_MODEM_RPROC
 	  This can be either built-in or a loadable module.
 	  If unsure say N.
 
+config WKUP_M3_RPROC
+	tristate "AMx3xx Wakeup M3 remoteproc support"
+	depends on SOC_AM33XX || SOC_AM43XX
+	select REMOTEPROC
+	help
+	  Say y here to support Wakeup M3 remote processor on TI AM33xx
+	  and AM43xx family of SoCs.
+
+	  Required for Suspend-to-RAM on AM33xx and AM43xx SoCs. Also needed
+	  for deep CPUIdle states on AM33xx SoCs. Allows for loading of the
+	  firmware onto these remote processors.
+	  If unsure say N.
+
 config DA8XX_REMOTEPROC
 	tristate "DA8xx/OMAP-L13x remoteproc support"
 	depends on ARCH_DAVINCI_DA8XX
diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile
index ac2ff75686d2..81b04d1e2e58 100644
--- a/drivers/remoteproc/Makefile
+++ b/drivers/remoteproc/Makefile
@@ -9,4 +9,5 @@ remoteproc-y				+= remoteproc_virtio.o
 remoteproc-y				+= remoteproc_elf_loader.o
 obj-$(CONFIG_OMAP_REMOTEPROC)		+= omap_remoteproc.o
 obj-$(CONFIG_STE_MODEM_RPROC)	 	+= ste_modem_rproc.o
+obj-$(CONFIG_WKUP_M3_RPROC)		+= wkup_m3_rproc.o
 obj-$(CONFIG_DA8XX_REMOTEPROC)		+= da8xx_remoteproc.o
diff --git a/drivers/remoteproc/wkup_m3_rproc.c b/drivers/remoteproc/wkup_m3_rproc.c
new file mode 100644
index 000000000000..edf81819cce1
--- /dev/null
+++ b/drivers/remoteproc/wkup_m3_rproc.c
@@ -0,0 +1,257 @@
+/*
+ * TI AMx3 Wakeup M3 Remote Processor driver
+ *
+ * Copyright (C) 2014-2015 Texas Instruments, Inc.
+ *
+ * Dave Gerlach <d-gerlach@ti.com>
+ * Suman Anna <s-anna@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/remoteproc.h>
+
+#include <linux/platform_data/wkup_m3.h>
+
+#include "remoteproc_internal.h"
+
+#define WKUPM3_MEM_MAX	2
+
+/**
+ * struct wkup_m3_mem - WkupM3 internal memory structure
+ * @cpu_addr: MPU virtual address of the memory region
+ * @bus_addr: Bus address used to access the memory region
+ * @dev_addr: Device address from Wakeup M3 view
+ * @size: Size of the memory region
+ */
+struct wkup_m3_mem {
+	void __iomem *cpu_addr;
+	phys_addr_t bus_addr;
+	u32 dev_addr;
+	size_t size;
+};
+
+/**
+ * struct wkup_m3_rproc - WkupM3 remote processor state
+ * @rproc: rproc handle
+ * @pdev: pointer to platform device
+ * @mem: WkupM3 memory information
+ */
+struct wkup_m3_rproc {
+	struct rproc *rproc;
+	struct platform_device *pdev;
+	struct wkup_m3_mem mem[WKUPM3_MEM_MAX];
+};
+
+static int wkup_m3_rproc_start(struct rproc *rproc)
+{
+	struct wkup_m3_rproc *wkupm3 = rproc->priv;
+	struct platform_device *pdev = wkupm3->pdev;
+	struct device *dev = &pdev->dev;
+	struct wkup_m3_platform_data *pdata = dev_get_platdata(dev);
+
+	if (pdata->deassert_reset(pdev, pdata->reset_name)) {
+		dev_err(dev, "Unable to reset wkup_m3!\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static int wkup_m3_rproc_stop(struct rproc *rproc)
+{
+	struct wkup_m3_rproc *wkupm3 = rproc->priv;
+	struct platform_device *pdev = wkupm3->pdev;
+	struct device *dev = &pdev->dev;
+	struct wkup_m3_platform_data *pdata = dev_get_platdata(dev);
+
+	if (pdata->assert_reset(pdev, pdata->reset_name)) {
+		dev_err(dev, "Unable to assert reset of wkup_m3!\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static void *wkup_m3_rproc_da_to_va(struct rproc *rproc, u64 da, int len)
+{
+	struct wkup_m3_rproc *wkupm3 = rproc->priv;
+	void *va = NULL;
+	int i;
+	u32 offset;
+
+	if (len <= 0)
+		return NULL;
+
+	for (i = 0; i < WKUPM3_MEM_MAX; i++) {
+		if (da >= wkupm3->mem[i].dev_addr && da + len <=
+		    wkupm3->mem[i].dev_addr +  wkupm3->mem[i].size) {
+			offset = da -  wkupm3->mem[i].dev_addr;
+			/* __force to make sparse happy with type conversion */
+			va = (__force void *)(wkupm3->mem[i].cpu_addr + offset);
+			break;
+		}
+	}
+
+	return va;
+}
+
+static struct rproc_ops wkup_m3_rproc_ops = {
+	.start		= wkup_m3_rproc_start,
+	.stop		= wkup_m3_rproc_stop,
+	.da_to_va	= wkup_m3_rproc_da_to_va,
+};
+
+static const struct of_device_id wkup_m3_rproc_of_match[] = {
+	{ .compatible = "ti,am3352-wkup-m3", },
+	{ .compatible = "ti,am4372-wkup-m3", },
+	{},
+};
+
+static int wkup_m3_rproc_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct wkup_m3_platform_data *pdata = dev->platform_data;
+	/* umem always needs to be processed first */
+	const char *mem_names[WKUPM3_MEM_MAX] = { "umem", "dmem" };
+	struct wkup_m3_rproc *wkupm3;
+	const char *fw_name;
+	struct rproc *rproc;
+	struct resource *res;
+	const __be32 *addrp;
+	u32 l4_offset = 0;
+	u64 size;
+	int ret;
+	int i;
+
+	if (!(pdata && pdata->deassert_reset && pdata->assert_reset &&
+	      pdata->reset_name)) {
+		dev_err(dev, "Platform data missing!\n");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_string(dev->of_node, "ti,pm-firmware",
+				      &fw_name);
+	if (ret) {
+		dev_err(dev, "No firmware filename given\n");
+		return -ENODEV;
+	}
+
+	pm_runtime_enable(&pdev->dev);
+	ret = pm_runtime_get_sync(&pdev->dev);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "pm_runtime_get_sync() failed\n");
+		goto err;
+	}
+
+	rproc = rproc_alloc(dev, "wkup_m3", &wkup_m3_rproc_ops,
+			    fw_name, sizeof(*wkupm3));
+	if (!rproc) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	wkupm3 = rproc->priv;
+	wkupm3->rproc = rproc;
+	wkupm3->pdev = pdev;
+
+	for (i = 0; i < ARRAY_SIZE(mem_names); i++) {
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						   mem_names[i]);
+		wkupm3->mem[i].cpu_addr = devm_ioremap_resource(dev, res);
+		if (IS_ERR(wkupm3->mem[i].cpu_addr)) {
+			dev_err(&pdev->dev, "devm_ioremap_resource failed for resource %d\n",
+				i);
+			ret = PTR_ERR(wkupm3->mem[i].cpu_addr);
+			goto err;
+		}
+		wkupm3->mem[i].bus_addr = res->start;
+		wkupm3->mem[i].size = resource_size(res);
+		addrp = of_get_address(dev->of_node, i, &size, NULL);
+		/*
+		 * The wkupm3 has umem at address 0 in its view, so the device
+		 * addresses for each memory region is computed as a relative
+		 * offset of the bus address for umem, and therefore needs to be
+		 * processed first.
+		 */
+		if (!strcmp(mem_names[i], "umem"))
+			l4_offset = be32_to_cpu(*addrp);
+		wkupm3->mem[i].dev_addr = be32_to_cpu(*addrp) - l4_offset;
+	}
+
+	dev_set_drvdata(dev, rproc);
+
+	ret = rproc_add(rproc);
+	if (ret) {
+		dev_err(dev, "rproc_add failed\n");
+		goto err_put_rproc;
+	}
+
+	return 0;
+
+err_put_rproc:
+	rproc_put(rproc);
+err:
+	pm_runtime_put_noidle(dev);
+	pm_runtime_disable(dev);
+	return ret;
+}
+
+static int wkup_m3_rproc_remove(struct platform_device *pdev)
+{
+	struct rproc *rproc = platform_get_drvdata(pdev);
+
+	rproc_del(rproc);
+	rproc_put(rproc);
+	pm_runtime_put_sync(&pdev->dev);
+	pm_runtime_disable(&pdev->dev);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int wkup_m3_rpm_suspend(struct device *dev)
+{
+	return -EBUSY;
+}
+
+static int wkup_m3_rpm_resume(struct device *dev)
+{
+	return 0;
+}
+#endif
+
+static const struct dev_pm_ops wkup_m3_rproc_pm_ops = {
+	SET_RUNTIME_PM_OPS(wkup_m3_rpm_suspend, wkup_m3_rpm_resume, NULL)
+};
+
+static struct platform_driver wkup_m3_rproc_driver = {
+	.probe = wkup_m3_rproc_probe,
+	.remove = wkup_m3_rproc_remove,
+	.driver = {
+		.name = "wkup_m3_rproc",
+		.of_match_table = wkup_m3_rproc_of_match,
+		.pm = &wkup_m3_rproc_pm_ops,
+	},
+};
+
+module_platform_driver(wkup_m3_rproc_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("TI Wakeup M3 remote processor control driver");
+MODULE_AUTHOR("Dave Gerlach <d-gerlach@ti.com>");
diff --git a/include/linux/platform_data/wkup_m3.h b/include/linux/platform_data/wkup_m3.h
new file mode 100644
index 000000000000..3f1d77effd71
--- /dev/null
+++ b/include/linux/platform_data/wkup_m3.h
@@ -0,0 +1,30 @@
+/*
+ * TI Wakeup M3 remote processor platform data
+ *
+ * Copyright (C) 2014-2015 Texas Instruments, Inc.
+ *
+ * Dave Gerlach <d-gerlach@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_PLATFORM_DATA_WKUP_M3_H
+#define _LINUX_PLATFORM_DATA_WKUP_M3_H
+
+struct platform_device;
+
+struct wkup_m3_platform_data {
+	const char *reset_name;
+
+	int (*assert_reset)(struct platform_device *pdev, const char *name);
+	int (*deassert_reset)(struct platform_device *pdev, const char *name);
+};
+
+#endif /* _LINUX_PLATFORM_DATA_WKUP_M3_H */
-- 
cgit v1.2.3


From 3c339ab83fc09d9d91fb7e8b4a60e8ddc91de417 Mon Sep 17 00:00:00 2001
From: Tadeusz Struk <tadeusz.struk@intel.com>
Date: Tue, 16 Jun 2015 10:30:55 -0700
Subject: crypto: akcipher - add PKE API

Add Public Key Encryption API.

Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>

Made CRYPTO_AKCIPHER invisible like other type config options.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig                     |   9 +
 crypto/Makefile                    |   1 +
 crypto/akcipher.c                  | 117 +++++++++++++
 crypto/crypto_user.c               |  22 +++
 include/crypto/akcipher.h          | 340 +++++++++++++++++++++++++++++++++++++
 include/crypto/internal/akcipher.h |  60 +++++++
 include/linux/crypto.h             |   1 +
 include/linux/cryptouser.h         |   5 +
 8 files changed, 555 insertions(+)
 create mode 100644 crypto/akcipher.c
 create mode 100644 include/crypto/akcipher.h
 create mode 100644 include/crypto/internal/akcipher.h

(limited to 'include/linux')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index f6fc054eb2d1..eb0aca45d430 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -91,6 +91,15 @@ config CRYPTO_PCOMP2
 	tristate
 	select CRYPTO_ALGAPI2
 
+config CRYPTO_AKCIPHER2
+	tristate
+	select CRYPTO_ALGAPI2
+
+config CRYPTO_AKCIPHER
+	tristate
+	select CRYPTO_AKCIPHER2
+	select CRYPTO_ALGAPI
+
 config CRYPTO_MANAGER
 	tristate "Cryptographic algorithm manager"
 	select CRYPTO_MANAGER2
diff --git a/crypto/Makefile b/crypto/Makefile
index c84203572477..1ed382df7db9 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -28,6 +28,7 @@ crypto_hash-y += shash.o
 obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o
 
 obj-$(CONFIG_CRYPTO_PCOMP2) += pcompress.o
+obj-$(CONFIG_CRYPTO_AKCIPHER2) += akcipher.o
 
 cryptomgr-y := algboss.o testmgr.o
 
diff --git a/crypto/akcipher.c b/crypto/akcipher.c
new file mode 100644
index 000000000000..d7986414814e
--- /dev/null
+++ b/crypto/akcipher.c
@@ -0,0 +1,117 @@
+/*
+ * Public Key Encryption
+ *
+ * Copyright (c) 2015, Intel Corporation
+ * Authors: Tadeusz Struk <tadeusz.struk@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <linux/cryptouser.h>
+#include <net/netlink.h>
+#include <crypto/akcipher.h>
+#include <crypto/public_key.h>
+#include "internal.h"
+
+#ifdef CONFIG_NET
+static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct crypto_report_akcipher rakcipher;
+
+	strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
+
+	if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
+		    sizeof(struct crypto_report_akcipher), &rakcipher))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+#else
+static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
+{
+	return -ENOSYS;
+}
+#endif
+
+static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
+	__attribute__ ((unused));
+
+static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
+{
+	seq_puts(m, "type         : akcipher\n");
+}
+
+static void crypto_akcipher_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_akcipher *akcipher = __crypto_akcipher_tfm(tfm);
+	struct akcipher_alg *alg = crypto_akcipher_alg(akcipher);
+
+	alg->exit(akcipher);
+}
+
+static int crypto_akcipher_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_akcipher *akcipher = __crypto_akcipher_tfm(tfm);
+	struct akcipher_alg *alg = crypto_akcipher_alg(akcipher);
+
+	if (alg->exit)
+		akcipher->base.exit = crypto_akcipher_exit_tfm;
+
+	if (alg->init)
+		return alg->init(akcipher);
+
+	return 0;
+}
+
+static const struct crypto_type crypto_akcipher_type = {
+	.extsize = crypto_alg_extsize,
+	.init_tfm = crypto_akcipher_init_tfm,
+#ifdef CONFIG_PROC_FS
+	.show = crypto_akcipher_show,
+#endif
+	.report = crypto_akcipher_report,
+	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
+	.maskset = CRYPTO_ALG_TYPE_MASK,
+	.type = CRYPTO_ALG_TYPE_AKCIPHER,
+	.tfmsize = offsetof(struct crypto_akcipher, base),
+};
+
+struct crypto_akcipher *crypto_alloc_akcipher(const char *alg_name, u32 type,
+					      u32 mask)
+{
+	return crypto_alloc_tfm(alg_name, &crypto_akcipher_type, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_alloc_akcipher);
+
+int crypto_register_akcipher(struct akcipher_alg *alg)
+{
+	struct crypto_alg *base = &alg->base;
+
+	base->cra_type = &crypto_akcipher_type;
+	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
+	base->cra_flags |= CRYPTO_ALG_TYPE_AKCIPHER;
+	return crypto_register_alg(base);
+}
+EXPORT_SYMBOL_GPL(crypto_register_akcipher);
+
+void crypto_unregister_akcipher(struct akcipher_alg *alg)
+{
+	crypto_unregister_alg(&alg->base);
+}
+EXPORT_SYMBOL_GPL(crypto_unregister_akcipher);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Generic public key cihper type");
diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c
index 41dfe762b7fb..11dbd5a81c72 100644
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
@@ -27,6 +27,7 @@
 #include <net/net_namespace.h>
 #include <crypto/internal/aead.h>
 #include <crypto/internal/skcipher.h>
+#include <crypto/akcipher.h>
 
 #include "internal.h"
 
@@ -110,6 +111,21 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int crypto_report_akcipher(struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct crypto_report_akcipher rakcipher;
+
+	strncpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
+
+	if (nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER,
+		    sizeof(struct crypto_report_akcipher), &rakcipher))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
 static int crypto_report_one(struct crypto_alg *alg,
 			     struct crypto_user_alg *ualg, struct sk_buff *skb)
 {
@@ -154,6 +170,12 @@ static int crypto_report_one(struct crypto_alg *alg,
 			goto nla_put_failure;
 
 		break;
+
+	case CRYPTO_ALG_TYPE_AKCIPHER:
+		if (crypto_report_akcipher(skb, alg))
+			goto nla_put_failure;
+
+		break;
 	}
 
 out:
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
new file mode 100644
index 000000000000..69d163e39101
--- /dev/null
+++ b/include/crypto/akcipher.h
@@ -0,0 +1,340 @@
+/*
+ * Public Key Encryption
+ *
+ * Copyright (c) 2015, Intel Corporation
+ * Authors: Tadeusz Struk <tadeusz.struk@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#ifndef _CRYPTO_AKCIPHER_H
+#define _CRYPTO_AKCIPHER_H
+#include <linux/crypto.h>
+
+/**
+ * struct akcipher_request - public key request
+ *
+ * @base:	Common attributes for async crypto requests
+ * @src:	Pointer to memory containing the input parameters
+ *		The format of the parameter(s) is expeted to be Octet String
+ * @dst:	Pointer to memory whare the result will be stored
+ * @src_len:	Size of the input parameter
+ * @dst_len:	Size of the output buffer. It needs to be at leaset
+ *		as big as the expected result depending	on the operation
+ *		After operation it will be updated with the acctual size of the
+ *		result. In case of error, where the dst_len was insufficient,
+ *		it will be updated to the size required for the operation.
+ * @__ctx:	Start of private context data
+ */
+struct akcipher_request {
+	struct crypto_async_request base;
+	void *src;
+	void *dst;
+	unsigned int src_len;
+	unsigned int dst_len;
+	void *__ctx[] CRYPTO_MINALIGN_ATTR;
+};
+
+/**
+ * struct crypto_akcipher - user-instantiated objects which encapsulate
+ * algorithms and core processing logic
+ *
+ * @base:	Common crypto API algorithm data structure
+ */
+struct crypto_akcipher {
+	struct crypto_tfm base;
+};
+
+/**
+ * struct akcipher_alg - generic public key algorithm
+ *
+ * @sign:	Function performs a sign operation as defined by public key
+ *		algorithm. In case of error, where the dst_len was insufficient,
+ *		the req->dst_len will be updated to the size required for the
+ *		operation
+ * @verify:	Function performs a sign operation as defined by public key
+ *		algorithm. In case of error, where the dst_len was insufficient,
+ *		the req->dst_len will be updated to the size required for the
+ *		operation
+ * @encrypt:	Function performs an encrytp operation as defined by public key
+ *		algorithm. In case of error, where the dst_len was insufficient,
+ *		the req->dst_len will be updated to the size required for the
+ *		operation
+ * @decrypt:	Function performs a decrypt operation as defined by public key
+ *		algorithm. In case of error, where the dst_len was insufficient,
+ *		the req->dst_len will be updated to the size required for the
+ *		operation
+ * @setkey:	Function invokes the algorithm specific set key function, which
+ *		knows how to decode and interpret the BER encoded key
+ * @init:	Initialize the cryptographic transformation object.
+ *		This function is used to initialize the cryptographic
+ *		transformation object. This function is called only once at
+ *		the instantiation time, right after the transformation context
+ *		was allocated. In case the cryptographic hardware has some
+ *		special requirements which need to be handled by software, this
+ *		function shall check for the precise requirement of the
+ *		transformation and put any software fallbacks in place.
+ * @exit:	Deinitialize the cryptographic transformation object. This is a
+ *		counterpart to @init, used to remove various changes set in
+ *		@init.
+ *
+ * @reqsize:	Request context size required by algorithm implementation
+ * @base:	Common crypto API algorithm data structure
+ */
+struct akcipher_alg {
+	int (*sign)(struct akcipher_request *req);
+	int (*verify)(struct akcipher_request *req);
+	int (*encrypt)(struct akcipher_request *req);
+	int (*decrypt)(struct akcipher_request *req);
+	int (*setkey)(struct crypto_akcipher *tfm, const void *key,
+		      unsigned int keylen);
+	int (*init)(struct crypto_akcipher *tfm);
+	void (*exit)(struct crypto_akcipher *tfm);
+
+	unsigned int reqsize;
+	struct crypto_alg base;
+};
+
+/**
+ * DOC: Generic Public Key API
+ *
+ * The Public Key API is used with the algorithms of type
+ * CRYPTO_ALG_TYPE_AKCIPHER (listed as type "akcipher" in /proc/crypto)
+ */
+
+/**
+ * crypto_alloc_akcipher() -- allocate AKCIPHER tfm handle
+ * @alg_name: is the cra_name / name or cra_driver_name / driver name of the
+ *	      public key algorithm e.g. "rsa"
+ * @type: specifies the type of the algorithm
+ * @mask: specifies the mask for the algorithm
+ *
+ * Allocate a handle for public key algorithm. The returned struct
+ * crypto_akcipher is the handle that is required for any subsequent
+ * API invocation for the public key operations.
+ *
+ * Return: allocated handle in case of success; IS_ERR() is true in case
+ *	   of an error, PTR_ERR() returns the error code.
+ */
+struct crypto_akcipher *crypto_alloc_akcipher(const char *alg_name, u32 type,
+					      u32 mask);
+
+static inline struct crypto_tfm *crypto_akcipher_tfm(
+	struct crypto_akcipher *tfm)
+{
+	return &tfm->base;
+}
+
+static inline struct akcipher_alg *__crypto_akcipher_alg(struct crypto_alg *alg)
+{
+	return container_of(alg, struct akcipher_alg, base);
+}
+
+static inline struct crypto_akcipher *__crypto_akcipher_tfm(
+	struct crypto_tfm *tfm)
+{
+	return container_of(tfm, struct crypto_akcipher, base);
+}
+
+static inline struct akcipher_alg *crypto_akcipher_alg(
+	struct crypto_akcipher *tfm)
+{
+	return __crypto_akcipher_alg(crypto_akcipher_tfm(tfm)->__crt_alg);
+}
+
+static inline unsigned int crypto_akcipher_reqsize(struct crypto_akcipher *tfm)
+{
+	return crypto_akcipher_alg(tfm)->reqsize;
+}
+
+static inline void akcipher_request_set_tfm(struct akcipher_request *req,
+					    struct crypto_akcipher *tfm)
+{
+	req->base.tfm = crypto_akcipher_tfm(tfm);
+}
+
+static inline struct crypto_akcipher *crypto_akcipher_reqtfm(
+	struct akcipher_request *req)
+{
+	return __crypto_akcipher_tfm(req->base.tfm);
+}
+
+/**
+ * crypto_free_akcipher() -- free AKCIPHER tfm handle
+ *
+ * @tfm: AKCIPHER tfm handle allocated with crypto_alloc_akcipher()
+ */
+static inline void crypto_free_akcipher(struct crypto_akcipher *tfm)
+{
+	crypto_destroy_tfm(tfm, crypto_akcipher_tfm(tfm));
+}
+
+/**
+ * akcipher_request_alloc() -- allocates public key request
+ *
+ * @tfm:	AKCIPHER tfm handle allocated with crypto_alloc_akcipher()
+ * @gfp:	allocation flags
+ *
+ * Return: allocated handle in case of success or NULL in case of an error.
+ */
+static inline struct akcipher_request *akcipher_request_alloc(
+	struct crypto_akcipher *tfm, gfp_t gfp)
+{
+	struct akcipher_request *req;
+
+	req = kmalloc(sizeof(*req) + crypto_akcipher_reqsize(tfm), gfp);
+	if (likely(req))
+		akcipher_request_set_tfm(req, tfm);
+
+	return req;
+}
+
+/**
+ * akcipher_request_free() -- zeroize and free public key request
+ *
+ * @req:	request to free
+ */
+static inline void akcipher_request_free(struct akcipher_request *req)
+{
+	kzfree(req);
+}
+
+/**
+ * akcipher_request_set_callback() -- Sets an asynchronous callback.
+ *
+ * Callback will be called when an asynchronous operation on a given
+ * request is finished.
+ *
+ * @req:	request that the callback will be set for
+ * @flgs:	specify for instance if the operation may backlog
+ * @cmlp:	callback which will be called
+ * @data:	private data used by the caller
+ */
+static inline void akcipher_request_set_callback(struct akcipher_request *req,
+						 u32 flgs,
+						 crypto_completion_t cmpl,
+						 void *data)
+{
+	req->base.complete = cmpl;
+	req->base.data = data;
+	req->base.flags = flgs;
+}
+
+/**
+ * akcipher_request_set_crypt() -- Sets reqest parameters
+ *
+ * Sets parameters required by crypto operation
+ *
+ * @req:	public key request
+ * @src:	ptr to input parameter
+ * @dst:	ptr of output parameter
+ * @src_len:	size of the input buffer
+ * @dst_len:	size of the output buffer. It will be updated by the
+ *		implementation to reflect the acctual size of the result
+ */
+static inline void akcipher_request_set_crypt(struct akcipher_request *req,
+					      void *src, void *dst,
+					      unsigned int src_len,
+					      unsigned int dst_len)
+{
+	req->src = src;
+	req->dst = dst;
+	req->src_len = src_len;
+	req->dst_len = dst_len;
+}
+
+/**
+ * crypto_akcipher_encrypt() -- Invoke public key encrypt operation
+ *
+ * Function invokes the specific public key encrypt operation for a given
+ * public key algorithm
+ *
+ * @req:	asymmetric key request
+ *
+ * Return: zero on success; error code in case of error
+ */
+static inline int crypto_akcipher_encrypt(struct akcipher_request *req)
+{
+	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
+	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+
+	return alg->encrypt(req);
+}
+
+/**
+ * crypto_akcipher_decrypt() -- Invoke public key decrypt operation
+ *
+ * Function invokes the specific public key decrypt operation for a given
+ * public key algorithm
+ *
+ * @req:	asymmetric key request
+ *
+ * Return: zero on success; error code in case of error
+ */
+static inline int crypto_akcipher_decrypt(struct akcipher_request *req)
+{
+	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
+	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+
+	return alg->decrypt(req);
+}
+
+/**
+ * crypto_akcipher_sign() -- Invoke public key sign operation
+ *
+ * Function invokes the specific public key sign operation for a given
+ * public key algorithm
+ *
+ * @req:	asymmetric key request
+ *
+ * Return: zero on success; error code in case of error
+ */
+static inline int crypto_akcipher_sign(struct akcipher_request *req)
+{
+	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
+	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+
+	return alg->sign(req);
+}
+
+/**
+ * crypto_akcipher_verify() -- Invoke public key verify operation
+ *
+ * Function invokes the specific public key verify operation for a given
+ * public key algorithm
+ *
+ * @req:	asymmetric key request
+ *
+ * Return: zero on success; error code in case of error
+ */
+static inline int crypto_akcipher_verify(struct akcipher_request *req)
+{
+	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
+	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+
+	return alg->verify(req);
+}
+
+/**
+ * crypto_akcipher_setkey() -- Invoke public key setkey operation
+ *
+ * Function invokes the algorithm specific set key function, which knows
+ * how to decode and interpret the encoded key
+ *
+ * @tfm:	tfm handle
+ * @key:	BER encoded private or public key
+ * @keylen:	length of the key
+ *
+ * Return: zero on success; error code in case of error
+ */
+static inline int crypto_akcipher_setkey(struct crypto_akcipher *tfm, void *key,
+					 unsigned int keylen)
+{
+	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
+
+	return alg->setkey(tfm, key, keylen);
+}
+#endif
diff --git a/include/crypto/internal/akcipher.h b/include/crypto/internal/akcipher.h
new file mode 100644
index 000000000000..9a2bda15e454
--- /dev/null
+++ b/include/crypto/internal/akcipher.h
@@ -0,0 +1,60 @@
+/*
+ * Public Key Encryption
+ *
+ * Copyright (c) 2015, Intel Corporation
+ * Authors: Tadeusz Struk <tadeusz.struk@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#ifndef _CRYPTO_AKCIPHER_INT_H
+#define _CRYPTO_AKCIPHER_INT_H
+#include <crypto/akcipher.h>
+
+/*
+ * Transform internal helpers.
+ */
+static inline void *akcipher_request_ctx(struct akcipher_request *req)
+{
+	return req->__ctx;
+}
+
+static inline void *akcipher_tfm_ctx(struct crypto_akcipher *tfm)
+{
+	return tfm->base.__crt_ctx;
+}
+
+static inline void akcipher_request_complete(struct akcipher_request *req,
+					     int err)
+{
+	req->base.complete(&req->base, err);
+}
+
+static inline const char *akcipher_alg_name(struct crypto_akcipher *tfm)
+{
+	return crypto_akcipher_tfm(tfm)->__crt_alg->cra_name;
+}
+
+/**
+ * crypto_register_akcipher() -- Register public key algorithm
+ *
+ * Function registers an implementation of a public key verify algorithm
+ *
+ * @alg:	algorithm definition
+ *
+ * Return: zero on success; error code in case of error
+ */
+int crypto_register_akcipher(struct akcipher_alg *alg);
+
+/**
+ * crypto_unregister_akcipher() -- Unregister public key algorithm
+ *
+ * Function unregisters an implementation of a public key verify algorithm
+ *
+ * @alg:	algorithm definition
+ */
+void crypto_unregister_akcipher(struct akcipher_alg *alg);
+#endif
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 25a4b71d6d1f..0e3f71a73e3b 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -53,6 +53,7 @@
 #define CRYPTO_ALG_TYPE_SHASH		0x00000009
 #define CRYPTO_ALG_TYPE_AHASH		0x0000000a
 #define CRYPTO_ALG_TYPE_RNG		0x0000000c
+#define CRYPTO_ALG_TYPE_AKCIPHER	0x0000000d
 #define CRYPTO_ALG_TYPE_PCOMPRESS	0x0000000f
 
 #define CRYPTO_ALG_TYPE_HASH_MASK	0x0000000e
diff --git a/include/linux/cryptouser.h b/include/linux/cryptouser.h
index 4abf2ea6a887..36efbbbf2f83 100644
--- a/include/linux/cryptouser.h
+++ b/include/linux/cryptouser.h
@@ -43,6 +43,7 @@ enum crypto_attr_type_t {
 	CRYPTOCFGA_REPORT_COMPRESS,	/* struct crypto_report_comp */
 	CRYPTOCFGA_REPORT_RNG,		/* struct crypto_report_rng */
 	CRYPTOCFGA_REPORT_CIPHER,	/* struct crypto_report_cipher */
+	CRYPTOCFGA_REPORT_AKCIPHER,	/* struct crypto_report_akcipher */
 	__CRYPTOCFGA_MAX
 
 #define CRYPTOCFGA_MAX (__CRYPTOCFGA_MAX - 1)
@@ -101,5 +102,9 @@ struct crypto_report_rng {
 	unsigned int seedsize;
 };
 
+struct crypto_report_akcipher {
+	char type[CRYPTO_MAX_NAME];
+};
+
 #define CRYPTO_REPORT_MAXSIZE (sizeof(struct crypto_user_alg) + \
 			       sizeof(struct crypto_report_blkcipher))
-- 
cgit v1.2.3


From 46b15caa7cb19b0f6e3bc8ebaee5bc1bb2e35110 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 16 Jun 2015 18:48:31 -0400
Subject: vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB

FS_CGROUP_WRITEBACK indicates whether a file_system_type supports
cgroup writeback; however, different super_blocks of the same
file_system_type may or may not support cgroup writeback depending on
filesystem options.  This patch replaces FS_CGROUP_WRITEBACK with a
per-super_block flag.

super_block->s_flags carries some internal flags in the high bits but
it's exposd to userland through uapi header and running out of space
anyway.  This patch adds a new field super_block->s_iflags to carry
kernel-internal flags.  It is currently only used by the new
SB_I_CGROUPWB flag whose concatenated and abbreviated name is for
consistency with other super_block flags.

ext2_fill_super() is updated to set SB_I_CGROUPWB.

v2: Added super_block->s_iflags instead of stealing another high bit
    from sb->s_flags as suggested by Christoph and Jan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Jan Kara <jack@suse.cz>
Cc: linux-ext4@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/ext2/super.c             | 3 ++-
 include/linux/backing-dev.h | 2 +-
 include/linux/fs.h          | 4 +++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 549219d44f1c..900e19cf9ef6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 		((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
 		 MS_POSIXACL : 0);
+	sb->s_iflags |= SB_I_CGROUPWB;
 
 	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
 	    (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -1543,7 +1544,7 @@ static struct file_system_type ext2_fs_type = {
 	.name		= "ext2",
 	.mount		= ext2_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV | FS_CGROUP_WRITEBACK,
+	.fs_flags	= FS_REQUIRES_DEV,
 };
 MODULE_ALIAS_FS("ext2");
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index dfce80869145..a13181a42b9a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -260,7 +260,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 
 	return bdi_cap_account_dirty(bdi) &&
 		(bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
-		(inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
+		(inode->i_sb->s_iflags & SB_I_CGROUPWB);
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b5e1dcfbc5e3..2c5e33a5b2af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1241,6 +1241,8 @@ struct mm_struct;
 #define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
 #define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */
 
+/* sb->s_iflags */
+#define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
 
 /* Possible states of 'frozen' field */
 enum {
@@ -1279,6 +1281,7 @@ struct super_block {
 	const struct quotactl_ops	*s_qcop;
 	const struct export_operations *s_export_op;
 	unsigned long		s_flags;
+	unsigned long		s_iflags;	/* internal SB_I_* flags */
 	unsigned long		s_magic;
 	struct dentry		*s_root;
 	struct rw_semaphore	s_umount;
@@ -1912,7 +1915,6 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
-#define FS_CGROUP_WRITEBACK	32	/* Supports cgroup-aware writeback */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
-- 
cgit v1.2.3


From a9bd32a8b4c4c2670f9ed8cae63f9378b6df3ded Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 13 Mar 2015 11:09:45 -0700
Subject: msm: msm_fb: Remove dead code

This code is no longer used now that mach-msm has been removed.
Delete it.

Cc: Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
Cc: Tomi Valkeinen <tomi.valkeinen@ti.com>
Cc: David Brown <davidb@codeaurora.org>
Cc: Bryan Huntsman <bryanh@codeaurora.org>
Cc: Daniel Walker <dwalker@fifo99.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Acked-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
---
 drivers/video/fbdev/Kconfig                   |   7 -
 drivers/video/fbdev/Makefile                  |   1 -
 drivers/video/fbdev/msm/Makefile              |  19 -
 drivers/video/fbdev/msm/mddi.c                | 821 --------------------------
 drivers/video/fbdev/msm/mddi_client_dummy.c   |  85 ---
 drivers/video/fbdev/msm/mddi_client_nt35399.c | 252 --------
 drivers/video/fbdev/msm/mddi_client_toshiba.c | 280 ---------
 drivers/video/fbdev/msm/mddi_hw.h             | 305 ----------
 drivers/video/fbdev/msm/mdp.c                 | 520 ----------------
 drivers/video/fbdev/msm/mdp_csc_table.h       | 582 ------------------
 drivers/video/fbdev/msm/mdp_hw.h              | 627 --------------------
 drivers/video/fbdev/msm/mdp_ppp.c             | 731 -----------------------
 drivers/video/fbdev/msm/mdp_scale_tables.c    | 766 ------------------------
 drivers/video/fbdev/msm/mdp_scale_tables.h    |  38 --
 drivers/video/fbdev/msm/msm_fb.c              | 659 ---------------------
 include/linux/platform_data/video-msm_fb.h    | 146 -----
 16 files changed, 5839 deletions(-)
 delete mode 100644 drivers/video/fbdev/msm/Makefile
 delete mode 100644 drivers/video/fbdev/msm/mddi.c
 delete mode 100644 drivers/video/fbdev/msm/mddi_client_dummy.c
 delete mode 100644 drivers/video/fbdev/msm/mddi_client_nt35399.c
 delete mode 100644 drivers/video/fbdev/msm/mddi_client_toshiba.c
 delete mode 100644 drivers/video/fbdev/msm/mddi_hw.h
 delete mode 100644 drivers/video/fbdev/msm/mdp.c
 delete mode 100644 drivers/video/fbdev/msm/mdp_csc_table.h
 delete mode 100644 drivers/video/fbdev/msm/mdp_hw.h
 delete mode 100644 drivers/video/fbdev/msm/mdp_ppp.c
 delete mode 100644 drivers/video/fbdev/msm/mdp_scale_tables.c
 delete mode 100644 drivers/video/fbdev/msm/mdp_scale_tables.h
 delete mode 100644 drivers/video/fbdev/msm/msm_fb.c
 delete mode 100644 include/linux/platform_data/video-msm_fb.h

(limited to 'include/linux')

diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index 54fb8f86b68d..2d98de535e0f 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -2326,13 +2326,6 @@ config FB_PRE_INIT_FB
 	  Select this option if display contents should be inherited as set by
 	  the bootloader.
 
-config FB_MSM
-	tristate "MSM Framebuffer support"
-	depends on FB && ARCH_MSM
-	select FB_CFB_FILLRECT
-	select FB_CFB_COPYAREA
-	select FB_CFB_IMAGEBLIT
-
 config FB_MX3
 	tristate "MX3 Framebuffer support"
 	depends on FB && MX3_IPU
diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile
index 1979afffccfe..cecea5063a80 100644
--- a/drivers/video/fbdev/Makefile
+++ b/drivers/video/fbdev/Makefile
@@ -126,7 +126,6 @@ obj-y                             += omap2/
 obj-$(CONFIG_XEN_FBDEV_FRONTEND)  += xen-fbfront.o
 obj-$(CONFIG_FB_CARMINE)          += carminefb.o
 obj-$(CONFIG_FB_MB862XX)	  += mb862xx/
-obj-$(CONFIG_FB_MSM)              += msm/
 obj-$(CONFIG_FB_NUC900)           += nuc900fb.o
 obj-$(CONFIG_FB_JZ4740)		  += jz4740_fb.o
 obj-$(CONFIG_FB_PUV3_UNIGFX)      += fb-puv3.o
diff --git a/drivers/video/fbdev/msm/Makefile b/drivers/video/fbdev/msm/Makefile
deleted file mode 100644
index 802d6ae523fb..000000000000
--- a/drivers/video/fbdev/msm/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# core framebuffer
-#
-obj-y := msm_fb.o
-
-# MDP DMA/PPP engine
-#
-obj-y += mdp.o mdp_scale_tables.o mdp_ppp.o
-
-# MDDI interface
-#
-obj-y += mddi.o
-
-# MDDI client/panel drivers
-#
-obj-y += mddi_client_dummy.o
-obj-y += mddi_client_toshiba.o
-obj-y += mddi_client_nt35399.o
-
diff --git a/drivers/video/fbdev/msm/mddi.c b/drivers/video/fbdev/msm/mddi.c
deleted file mode 100644
index e0f8011a3c4b..000000000000
--- a/drivers/video/fbdev/msm/mddi.c
+++ /dev/null
@@ -1,821 +0,0 @@
-/*
- * MSM MDDI Transport
- *
- * Copyright (C) 2007 Google Incorporated
- * Copyright (C) 2007 QUALCOMM Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
- * GNU General Public License for more details.
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-#include <linux/interrupt.h>
-#include <linux/platform_device.h>
-#include <linux/delay.h>
-#include <linux/gfp.h>
-#include <linux/spinlock.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/sched.h>
-#include <linux/platform_data/video-msm_fb.h>
-#include "mddi_hw.h"
-
-#define FLAG_DISABLE_HIBERNATION 0x0001
-#define FLAG_HAVE_CAPS		 0x0002
-#define FLAG_HAS_VSYNC_IRQ	 0x0004
-#define FLAG_HAVE_STATUS	 0x0008
-
-#define CMD_GET_CLIENT_CAP     0x0601
-#define CMD_GET_CLIENT_STATUS  0x0602
-
-union mddi_rev {
-	unsigned char raw[MDDI_REV_BUFFER_SIZE];
-	struct mddi_rev_packet hdr;
-	struct mddi_client_status status;
-	struct mddi_client_caps caps;
-	struct mddi_register_access reg;
-};
-
-struct reg_read_info {
-	struct completion done;
-	uint32_t reg;
-	uint32_t status;
-	uint32_t result;
-};
-
-struct mddi_info {
-	uint16_t flags;
-	uint16_t version;
-	char __iomem *base;
-	int irq;
-	struct clk *clk;
-	struct msm_mddi_client_data client_data;
-
-	/* buffer for rev encap packets */
-	void *rev_data;
-	dma_addr_t rev_addr;
-	struct mddi_llentry *reg_write_data;
-	dma_addr_t reg_write_addr;
-	struct mddi_llentry *reg_read_data;
-	dma_addr_t reg_read_addr;
-	size_t rev_data_curr;
-
-	spinlock_t int_lock;
-	uint32_t int_enable;
-	uint32_t got_int;
-	wait_queue_head_t int_wait;
-
-	struct mutex reg_write_lock;
-	struct mutex reg_read_lock;
-	struct reg_read_info *reg_read;
-
-	struct mddi_client_caps caps;
-	struct mddi_client_status status;
-
-	void (*power_client)(struct msm_mddi_client_data *, int);
-
-	/* client device published to bind us to the
-	 * appropriate mddi_client driver
-	 */
-	char client_name[20];
-
-	struct platform_device client_pdev;
-};
-
-static void mddi_init_rev_encap(struct mddi_info *mddi);
-
-#define mddi_readl(r) readl(mddi->base + (MDDI_##r))
-#define mddi_writel(v, r) writel((v), mddi->base + (MDDI_##r))
-
-void mddi_activate_link(struct msm_mddi_client_data *cdata)
-{
-	struct mddi_info *mddi = container_of(cdata, struct mddi_info,
-					      client_data);
-
-	mddi_writel(MDDI_CMD_LINK_ACTIVE, CMD);
-}
-
-static void mddi_handle_link_list_done(struct mddi_info *mddi)
-{
-}
-
-static void mddi_reset_rev_encap_ptr(struct mddi_info *mddi)
-{
-	printk(KERN_INFO "mddi: resetting rev ptr\n");
-	mddi->rev_data_curr = 0;
-	mddi_writel(mddi->rev_addr, REV_PTR);
-	mddi_writel(mddi->rev_addr, REV_PTR);
-	mddi_writel(MDDI_CMD_FORCE_NEW_REV_PTR, CMD);
-}
-
-static void mddi_handle_rev_data(struct mddi_info *mddi, union mddi_rev *rev)
-{
-	int i;
-	struct reg_read_info *ri;
-
-	if ((rev->hdr.length <= MDDI_REV_BUFFER_SIZE - 2) &&
-	   (rev->hdr.length >= sizeof(struct mddi_rev_packet) - 2)) {
-
-		switch (rev->hdr.type) {
-		case TYPE_CLIENT_CAPS:
-			memcpy(&mddi->caps, &rev->caps,
-			       sizeof(struct mddi_client_caps));
-			mddi->flags |= FLAG_HAVE_CAPS;
-			wake_up(&mddi->int_wait);
-			break;
-		case TYPE_CLIENT_STATUS:
-			memcpy(&mddi->status, &rev->status,
-			       sizeof(struct mddi_client_status));
-			mddi->flags |= FLAG_HAVE_STATUS;
-			wake_up(&mddi->int_wait);
-			break;
-		case TYPE_REGISTER_ACCESS:
-			ri = mddi->reg_read;
-			if (ri == 0) {
-				printk(KERN_INFO "rev: got reg %x = %x without "
-						 " pending read\n",
-				       rev->reg.register_address,
-				       rev->reg.register_data_list);
-				break;
-			}
-			if (ri->reg != rev->reg.register_address) {
-				printk(KERN_INFO "rev: got reg %x = %x for "
-						 "wrong register, expected "
-						 "%x\n",
-				       rev->reg.register_address,
-				       rev->reg.register_data_list, ri->reg);
-				break;
-			}
-			mddi->reg_read = NULL;
-			ri->status = 0;
-			ri->result = rev->reg.register_data_list;
-			complete(&ri->done);
-			break;
-		default:
-			printk(KERN_INFO "rev: unknown reverse packet: "
-					 "len=%04x type=%04x CURR_REV_PTR=%x\n",
-			       rev->hdr.length, rev->hdr.type,
-			       mddi_readl(CURR_REV_PTR));
-			for (i = 0; i < rev->hdr.length + 2; i++) {
-				if ((i % 16) == 0)
-					printk(KERN_INFO "\n");
-				printk(KERN_INFO " %02x", rev->raw[i]);
-			}
-			printk(KERN_INFO "\n");
-			mddi_reset_rev_encap_ptr(mddi);
-		}
-	} else {
-		printk(KERN_INFO "bad rev length, %d, CURR_REV_PTR %x\n",
-		       rev->hdr.length, mddi_readl(CURR_REV_PTR));
-		mddi_reset_rev_encap_ptr(mddi);
-	}
-}
-
-static void mddi_wait_interrupt(struct mddi_info *mddi, uint32_t intmask);
-
-static void mddi_handle_rev_data_avail(struct mddi_info *mddi)
-{
-	uint32_t rev_data_count;
-	uint32_t rev_crc_err_count;
-	struct reg_read_info *ri;
-	size_t prev_offset;
-	uint16_t length;
-
-	union mddi_rev *crev = mddi->rev_data + mddi->rev_data_curr;
-
-	/* clear the interrupt */
-	mddi_writel(MDDI_INT_REV_DATA_AVAIL, INT);
-	rev_data_count = mddi_readl(REV_PKT_CNT);
-	rev_crc_err_count = mddi_readl(REV_CRC_ERR);
-	if (rev_data_count > 1)
-		printk(KERN_INFO "rev_data_count %d\n", rev_data_count);
-
-	if (rev_crc_err_count) {
-		printk(KERN_INFO "rev_crc_err_count %d, INT %x\n",
-		       rev_crc_err_count,  mddi_readl(INT));
-		ri = mddi->reg_read;
-		if (ri == 0) {
-			printk(KERN_INFO "rev: got crc error without pending "
-			       "read\n");
-		} else {
-			mddi->reg_read = NULL;
-			ri->status = -EIO;
-			ri->result = -1;
-			complete(&ri->done);
-		}
-	}
-
-	if (rev_data_count == 0)
-		return;
-
-	prev_offset = mddi->rev_data_curr;
-
-	length = *((uint8_t *)mddi->rev_data + mddi->rev_data_curr);
-	mddi->rev_data_curr++;
-	if (mddi->rev_data_curr == MDDI_REV_BUFFER_SIZE)
-		mddi->rev_data_curr = 0;
-	length += *((uint8_t *)mddi->rev_data + mddi->rev_data_curr) << 8;
-	mddi->rev_data_curr += 1 + length;
-	if (mddi->rev_data_curr >= MDDI_REV_BUFFER_SIZE)
-		mddi->rev_data_curr =
-			mddi->rev_data_curr % MDDI_REV_BUFFER_SIZE;
-
-	if (length > MDDI_REV_BUFFER_SIZE - 2) {
-		printk(KERN_INFO "mddi: rev data length greater than buffer"
-			"size\n");
-		mddi_reset_rev_encap_ptr(mddi);
-		return;
-	}
-
-	if (prev_offset + 2 + length >= MDDI_REV_BUFFER_SIZE) {
-		union mddi_rev tmprev;
-		size_t rem = MDDI_REV_BUFFER_SIZE - prev_offset;
-		memcpy(&tmprev.raw[0], mddi->rev_data + prev_offset, rem);
-		memcpy(&tmprev.raw[rem], mddi->rev_data, 2 + length - rem);
-		mddi_handle_rev_data(mddi, &tmprev);
-	} else {
-		mddi_handle_rev_data(mddi, crev);
-	}
-
-	if (prev_offset < MDDI_REV_BUFFER_SIZE / 2 &&
-	    mddi->rev_data_curr >= MDDI_REV_BUFFER_SIZE / 2) {
-		mddi_writel(mddi->rev_addr, REV_PTR);
-	}
-}
-
-static irqreturn_t mddi_isr(int irq, void *data)
-{
-	struct msm_mddi_client_data *cdata = data;
-	struct mddi_info *mddi = container_of(cdata, struct mddi_info,
-					      client_data);
-	uint32_t active, status;
-
-	spin_lock(&mddi->int_lock);
-
-	active = mddi_readl(INT);
-	status = mddi_readl(STAT);
-
-	mddi_writel(active, INT);
-
-	/* ignore any interrupts we have disabled */
-	active &= mddi->int_enable;
-
-	mddi->got_int |= active;
-	wake_up(&mddi->int_wait);
-
-	if (active & MDDI_INT_PRI_LINK_LIST_DONE) {
-		mddi->int_enable &= (~MDDI_INT_PRI_LINK_LIST_DONE);
-		mddi_handle_link_list_done(mddi);
-	}
-	if (active & MDDI_INT_REV_DATA_AVAIL)
-		mddi_handle_rev_data_avail(mddi);
-
-	if (active & ~MDDI_INT_NEED_CLEAR)
-		mddi->int_enable &= ~(active & ~MDDI_INT_NEED_CLEAR);
-
-	if (active & MDDI_INT_LINK_ACTIVE) {
-		mddi->int_enable &= (~MDDI_INT_LINK_ACTIVE);
-		mddi->int_enable |= MDDI_INT_IN_HIBERNATION;
-	}
-
-	if (active & MDDI_INT_IN_HIBERNATION) {
-		mddi->int_enable &= (~MDDI_INT_IN_HIBERNATION);
-		mddi->int_enable |= MDDI_INT_LINK_ACTIVE;
-	}
-
-	mddi_writel(mddi->int_enable, INTEN);
-	spin_unlock(&mddi->int_lock);
-
-	return IRQ_HANDLED;
-}
-
-static long mddi_wait_interrupt_timeout(struct mddi_info *mddi,
-					uint32_t intmask, int timeout)
-{
-	unsigned long irq_flags;
-
-	spin_lock_irqsave(&mddi->int_lock, irq_flags);
-	mddi->got_int &= ~intmask;
-	mddi->int_enable |= intmask;
-	mddi_writel(mddi->int_enable, INTEN);
-	spin_unlock_irqrestore(&mddi->int_lock, irq_flags);
-	return wait_event_timeout(mddi->int_wait, mddi->got_int & intmask,
-				  timeout);
-}
-
-static void mddi_wait_interrupt(struct mddi_info *mddi, uint32_t intmask)
-{
-	if (mddi_wait_interrupt_timeout(mddi, intmask, HZ/10) == 0)
-		printk(KERN_INFO "mddi_wait_interrupt %d, timeout "
-		       "waiting for %x, INT = %x, STAT = %x gotint = %x\n",
-		       current->pid, intmask, mddi_readl(INT), mddi_readl(STAT),
-		       mddi->got_int);
-}
-
-static void mddi_init_rev_encap(struct mddi_info *mddi)
-{
-	memset(mddi->rev_data, 0xee, MDDI_REV_BUFFER_SIZE);
-	mddi_writel(mddi->rev_addr, REV_PTR);
-	mddi_writel(MDDI_CMD_FORCE_NEW_REV_PTR, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-}
-
-void mddi_set_auto_hibernate(struct msm_mddi_client_data *cdata, int on)
-{
-	struct mddi_info *mddi = container_of(cdata, struct mddi_info,
-					      client_data);
-	mddi_writel(MDDI_CMD_POWERDOWN, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_IN_HIBERNATION);
-	mddi_writel(MDDI_CMD_HIBERNATE | !!on, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-}
-
-
-static uint16_t mddi_init_registers(struct mddi_info *mddi)
-{
-	mddi_writel(0x0001, VERSION);
-	mddi_writel(MDDI_HOST_BYTES_PER_SUBFRAME, BPS);
-	mddi_writel(0x0003, SPM); /* subframes per media */
-	mddi_writel(0x0005, TA1_LEN);
-	mddi_writel(MDDI_HOST_TA2_LEN, TA2_LEN);
-	mddi_writel(0x0096, DRIVE_HI);
-	/* 0x32 normal, 0x50 for Toshiba display */
-	mddi_writel(0x0050, DRIVE_LO);
-	mddi_writel(0x003C, DISP_WAKE); /* wakeup counter */
-	mddi_writel(MDDI_HOST_REV_RATE_DIV, REV_RATE_DIV);
-
-	mddi_writel(MDDI_REV_BUFFER_SIZE, REV_SIZE);
-	mddi_writel(MDDI_MAX_REV_PKT_SIZE, REV_ENCAP_SZ);
-
-	/* disable periodic rev encap */
-	mddi_writel(MDDI_CMD_PERIODIC_REV_ENCAP, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-
-	if (mddi_readl(PAD_CTL) == 0) {
-		/* If we are turning on band gap, need to wait 5us before
-		 * turning on the rest of the PAD */
-		mddi_writel(0x08000, PAD_CTL);
-		udelay(5);
-	}
-
-	/* Recommendation from PAD hw team */
-	mddi_writel(0xa850f, PAD_CTL);
-
-
-	/* Need an even number for counts */
-	mddi_writel(0x60006, DRIVER_START_CNT);
-
-	mddi_set_auto_hibernate(&mddi->client_data, 0);
-
-	mddi_writel(MDDI_CMD_DISP_IGNORE, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-
-	mddi_init_rev_encap(mddi);
-	return mddi_readl(CORE_VER) & 0xffff;
-}
-
-static void mddi_suspend(struct msm_mddi_client_data *cdata)
-{
-	struct mddi_info *mddi = container_of(cdata, struct mddi_info,
-					      client_data);
-	/* turn off the client */
-	if (mddi->power_client)
-		mddi->power_client(&mddi->client_data, 0);
-	/* turn off the link */
-	mddi_writel(MDDI_CMD_RESET, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-	/* turn off the clock */
-	clk_disable(mddi->clk);
-}
-
-static void mddi_resume(struct msm_mddi_client_data *cdata)
-{
-	struct mddi_info *mddi = container_of(cdata, struct mddi_info,
-					      client_data);
-	mddi_set_auto_hibernate(&mddi->client_data, 0);
-	/* turn on the client */
-	if (mddi->power_client)
-		mddi->power_client(&mddi->client_data, 1);
-	/* turn on the clock */
-	clk_enable(mddi->clk);
-	/* set up the local registers */
-	mddi->rev_data_curr = 0;
-	mddi_init_registers(mddi);
-	mddi_writel(mddi->int_enable, INTEN);
-	mddi_writel(MDDI_CMD_LINK_ACTIVE, CMD);
-	mddi_writel(MDDI_CMD_SEND_RTD, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-	mddi_set_auto_hibernate(&mddi->client_data, 1);
-}
-
-static int mddi_get_client_caps(struct mddi_info *mddi)
-{
-	int i, j;
-
-	/* clear any stale interrupts */
-	mddi_writel(0xffffffff, INT);
-
-	mddi->int_enable = MDDI_INT_LINK_ACTIVE |
-			   MDDI_INT_IN_HIBERNATION |
-			   MDDI_INT_PRI_LINK_LIST_DONE |
-			   MDDI_INT_REV_DATA_AVAIL |
-			   MDDI_INT_REV_OVERFLOW |
-			   MDDI_INT_REV_OVERWRITE |
-			   MDDI_INT_RTD_FAILURE;
-	mddi_writel(mddi->int_enable, INTEN);
-
-	mddi_writel(MDDI_CMD_LINK_ACTIVE, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-
-	for (j = 0; j < 3; j++) {
-		/* the toshiba vga panel does not respond to get
-		 * caps unless you SEND_RTD, but the first SEND_RTD
-		 * will fail...
-		 */
-		for (i = 0; i < 4; i++) {
-			uint32_t stat;
-
-			mddi_writel(MDDI_CMD_SEND_RTD, CMD);
-			mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-			stat = mddi_readl(STAT);
-			printk(KERN_INFO "mddi cmd send rtd: int %x, stat %x, "
-					"rtd val %x\n", mddi_readl(INT), stat,
-					mddi_readl(RTD_VAL));
-			if ((stat & MDDI_STAT_RTD_MEAS_FAIL) == 0)
-				break;
-			msleep(1);
-		}
-
-		mddi_writel(CMD_GET_CLIENT_CAP, CMD);
-		mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-		wait_event_timeout(mddi->int_wait, mddi->flags & FLAG_HAVE_CAPS,
-				   HZ / 100);
-
-		if (mddi->flags & FLAG_HAVE_CAPS)
-			break;
-		printk(KERN_INFO "mddi_init, timeout waiting for caps\n");
-	}
-	return mddi->flags & FLAG_HAVE_CAPS;
-}
-
-/* link must be active when this is called */
-int mddi_check_status(struct mddi_info *mddi)
-{
-	int ret = -1, retry = 3;
-	mutex_lock(&mddi->reg_read_lock);
-	mddi_writel(MDDI_CMD_PERIODIC_REV_ENCAP | 1, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-
-	do {
-		mddi->flags &= ~FLAG_HAVE_STATUS;
-		mddi_writel(CMD_GET_CLIENT_STATUS, CMD);
-		mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-		wait_event_timeout(mddi->int_wait,
-				   mddi->flags & FLAG_HAVE_STATUS,
-				   HZ / 100);
-
-		if (mddi->flags & FLAG_HAVE_STATUS) {
-			if (mddi->status.crc_error_count)
-				printk(KERN_INFO "mddi status: crc_error "
-					"count: %d\n",
-					mddi->status.crc_error_count);
-			else
-				ret = 0;
-			break;
-		} else
-			printk(KERN_INFO "mddi status: failed to get client "
-				"status\n");
-		mddi_writel(MDDI_CMD_SEND_RTD, CMD);
-		mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-	} while (--retry);
-
-	mddi_writel(MDDI_CMD_PERIODIC_REV_ENCAP | 0, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-	mutex_unlock(&mddi->reg_read_lock);
-	return ret;
-}
-
-
-void mddi_remote_write(struct msm_mddi_client_data *cdata, uint32_t val,
-		       uint32_t reg)
-{
-	struct mddi_info *mddi = container_of(cdata, struct mddi_info,
-					      client_data);
-	struct mddi_llentry *ll;
-	struct mddi_register_access *ra;
-
-	mutex_lock(&mddi->reg_write_lock);
-
-	ll = mddi->reg_write_data;
-
-	ra = &(ll->u.r);
-	ra->length = 14 + 4;
-	ra->type = TYPE_REGISTER_ACCESS;
-	ra->client_id = 0;
-	ra->read_write_info = MDDI_WRITE | 1;
-	ra->crc16 = 0;
-
-	ra->register_address = reg;
-	ra->register_data_list = val;
-
-	ll->flags = 1;
-	ll->header_count = 14;
-	ll->data_count = 4;
-	ll->data = mddi->reg_write_addr + offsetof(struct mddi_llentry,
-						   u.r.register_data_list);
-	ll->next = 0;
-	ll->reserved = 0;
-
-	mddi_writel(mddi->reg_write_addr, PRI_PTR);
-
-	mddi_wait_interrupt(mddi, MDDI_INT_PRI_LINK_LIST_DONE);
-	mutex_unlock(&mddi->reg_write_lock);
-}
-
-uint32_t mddi_remote_read(struct msm_mddi_client_data *cdata, uint32_t reg)
-{
-	struct mddi_info *mddi = container_of(cdata, struct mddi_info,
-					      client_data);
-	struct mddi_llentry *ll;
-	struct mddi_register_access *ra;
-	struct reg_read_info ri;
-	unsigned s;
-	int retry_count = 2;
-	unsigned long irq_flags;
-
-	mutex_lock(&mddi->reg_read_lock);
-
-	ll = mddi->reg_read_data;
-
-	ra = &(ll->u.r);
-	ra->length = 14;
-	ra->type = TYPE_REGISTER_ACCESS;
-	ra->client_id = 0;
-	ra->read_write_info = MDDI_READ | 1;
-	ra->crc16 = 0;
-
-	ra->register_address = reg;
-
-	ll->flags = 0x11;
-	ll->header_count = 14;
-	ll->data_count = 0;
-	ll->data = 0;
-	ll->next = 0;
-	ll->reserved = 0;
-
-	s = mddi_readl(STAT);
-
-	ri.reg = reg;
-	ri.status = -1;
-
-	do {
-		init_completion(&ri.done);
-		mddi->reg_read = &ri;
-		mddi_writel(mddi->reg_read_addr, PRI_PTR);
-
-		mddi_wait_interrupt(mddi, MDDI_INT_PRI_LINK_LIST_DONE);
-
-		/* Enable Periodic Reverse Encapsulation. */
-		mddi_writel(MDDI_CMD_PERIODIC_REV_ENCAP | 1, CMD);
-		mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-		if (wait_for_completion_timeout(&ri.done, HZ/10) == 0 &&
-		    !ri.done.done) {
-			printk(KERN_INFO "mddi_remote_read(%x) timeout "
-					 "(%d %d %d)\n",
-			       reg, ri.status, ri.result, ri.done.done);
-			spin_lock_irqsave(&mddi->int_lock, irq_flags);
-			mddi->reg_read = NULL;
-			spin_unlock_irqrestore(&mddi->int_lock, irq_flags);
-			ri.status = -1;
-			ri.result = -1;
-		}
-		if (ri.status == 0)
-			break;
-
-		mddi_writel(MDDI_CMD_SEND_RTD, CMD);
-		mddi_writel(MDDI_CMD_LINK_ACTIVE, CMD);
-		mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-		printk(KERN_INFO "mddi_remote_read: failed, sent "
-		       "MDDI_CMD_SEND_RTD: int %x, stat %x, rtd val %x "
-		       "curr_rev_ptr %x\n", mddi_readl(INT), mddi_readl(STAT),
-		       mddi_readl(RTD_VAL), mddi_readl(CURR_REV_PTR));
-	} while (retry_count-- > 0);
-	/* Disable Periodic Reverse Encapsulation. */
-	mddi_writel(MDDI_CMD_PERIODIC_REV_ENCAP | 0, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-	mddi->reg_read = NULL;
-	mutex_unlock(&mddi->reg_read_lock);
-	return ri.result;
-}
-
-static struct mddi_info mddi_info[2];
-
-static int mddi_clk_setup(struct platform_device *pdev, struct mddi_info *mddi,
-			  unsigned long clk_rate)
-{
-	int ret;
-
-	/* set up the clocks */
-	mddi->clk = clk_get(&pdev->dev, "mddi_clk");
-	if (IS_ERR(mddi->clk)) {
-		printk(KERN_INFO "mddi: failed to get clock\n");
-		return PTR_ERR(mddi->clk);
-	}
-	ret =  clk_enable(mddi->clk);
-	if (ret)
-		goto fail;
-	ret = clk_set_rate(mddi->clk, clk_rate);
-	if (ret)
-		goto fail;
-	return 0;
-
-fail:
-	clk_put(mddi->clk);
-	return ret;
-}
-
-static int __init mddi_rev_data_setup(struct mddi_info *mddi)
-{
-	void *dma;
-	dma_addr_t dma_addr;
-
-	/* set up dma buffer */
-	dma = dma_alloc_coherent(NULL, 0x1000, &dma_addr, GFP_KERNEL);
-	if (dma == 0)
-		return -ENOMEM;
-	mddi->rev_data = dma;
-	mddi->rev_data_curr = 0;
-	mddi->rev_addr = dma_addr;
-	mddi->reg_write_data = dma + MDDI_REV_BUFFER_SIZE;
-	mddi->reg_write_addr = dma_addr + MDDI_REV_BUFFER_SIZE;
-	mddi->reg_read_data = mddi->reg_write_data + 1;
-	mddi->reg_read_addr = mddi->reg_write_addr +
-			      sizeof(*mddi->reg_write_data);
-	return 0;
-}
-
-static int mddi_probe(struct platform_device *pdev)
-{
-	struct msm_mddi_platform_data *pdata = pdev->dev.platform_data;
-	struct mddi_info *mddi = &mddi_info[pdev->id];
-	struct resource *resource;
-	int ret, i;
-
-	resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!resource) {
-		printk(KERN_ERR "mddi: no associated mem resource!\n");
-		return -ENOMEM;
-	}
-	mddi->base = ioremap(resource->start, resource_size(resource));
-	if (!mddi->base) {
-		printk(KERN_ERR "mddi: failed to remap base!\n");
-		ret = -EINVAL;
-		goto error_ioremap;
-	}
-	resource = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-	if (!resource) {
-		printk(KERN_ERR "mddi: no associated irq resource!\n");
-		ret = -EINVAL;
-		goto error_get_irq_resource;
-	}
-	mddi->irq = resource->start;
-	printk(KERN_INFO "mddi: init() base=0x%p irq=%d\n", mddi->base,
-	       mddi->irq);
-	mddi->power_client = pdata->power_client;
-
-	mutex_init(&mddi->reg_write_lock);
-	mutex_init(&mddi->reg_read_lock);
-	spin_lock_init(&mddi->int_lock);
-	init_waitqueue_head(&mddi->int_wait);
-
-	ret = mddi_clk_setup(pdev, mddi, pdata->clk_rate);
-	if (ret) {
-		printk(KERN_ERR "mddi: failed to setup clock!\n");
-		goto error_clk_setup;
-	}
-
-	ret = mddi_rev_data_setup(mddi);
-	if (ret) {
-		printk(KERN_ERR "mddi: failed to setup rev data!\n");
-		goto error_rev_data;
-	}
-
-	mddi->int_enable = 0;
-	mddi_writel(mddi->int_enable, INTEN);
-	ret = request_irq(mddi->irq, mddi_isr, 0, "mddi",
-			  &mddi->client_data);
-	if (ret) {
-		printk(KERN_ERR "mddi: failed to request enable irq!\n");
-		goto error_request_irq;
-	}
-
-	/* turn on the mddi client bridge chip */
-	if (mddi->power_client)
-		mddi->power_client(&mddi->client_data, 1);
-
-	/* initialize the mddi registers */
-	mddi_set_auto_hibernate(&mddi->client_data, 0);
-	mddi_writel(MDDI_CMD_RESET, CMD);
-	mddi_wait_interrupt(mddi, MDDI_INT_NO_CMD_PKTS_PEND);
-	mddi->version = mddi_init_registers(mddi);
-	if (mddi->version < 0x20) {
-		printk(KERN_ERR "mddi: unsupported version 0x%x\n",
-		       mddi->version);
-		ret = -ENODEV;
-		goto error_mddi_version;
-	}
-
-	/* read the capabilities off the client */
-	if (!mddi_get_client_caps(mddi)) {
-		printk(KERN_INFO "mddi: no client found\n");
-		/* power down the panel */
-		mddi_writel(MDDI_CMD_POWERDOWN, CMD);
-		printk(KERN_INFO "mddi powerdown: stat %x\n", mddi_readl(STAT));
-		msleep(100);
-		printk(KERN_INFO "mddi powerdown: stat %x\n", mddi_readl(STAT));
-		return 0;
-	}
-	mddi_set_auto_hibernate(&mddi->client_data, 1);
-
-	if (mddi->caps.Mfr_Name == 0 && mddi->caps.Product_Code == 0)
-		pdata->fixup(&mddi->caps.Mfr_Name, &mddi->caps.Product_Code);
-
-	mddi->client_pdev.id = 0;
-	for (i = 0; i < pdata->num_clients; i++) {
-		if (pdata->client_platform_data[i].product_id ==
-		    (mddi->caps.Mfr_Name << 16 | mddi->caps.Product_Code)) {
-			mddi->client_data.private_client_data =
-				pdata->client_platform_data[i].client_data;
-			mddi->client_pdev.name =
-				pdata->client_platform_data[i].name;
-			mddi->client_pdev.id =
-				pdata->client_platform_data[i].id;
-			/* XXX: possibly set clock */
-			break;
-		}
-	}
-
-	if (i >= pdata->num_clients)
-		mddi->client_pdev.name = "mddi_c_dummy";
-	printk(KERN_INFO "mddi: registering panel %s\n",
-		mddi->client_pdev.name);
-
-	mddi->client_data.suspend = mddi_suspend;
-	mddi->client_data.resume = mddi_resume;
-	mddi->client_data.activate_link = mddi_activate_link;
-	mddi->client_data.remote_write = mddi_remote_write;
-	mddi->client_data.remote_read = mddi_remote_read;
-	mddi->client_data.auto_hibernate = mddi_set_auto_hibernate;
-	mddi->client_data.fb_resource = pdata->fb_resource;
-	if (pdev->id == 0)
-		mddi->client_data.interface_type = MSM_MDDI_PMDH_INTERFACE;
-	else if (pdev->id == 1)
-		mddi->client_data.interface_type = MSM_MDDI_EMDH_INTERFACE;
-	else {
-		printk(KERN_ERR "mddi: can not determine interface %d!\n",
-		       pdev->id);
-		ret = -EINVAL;
-		goto error_mddi_interface;
-	}
-
-	mddi->client_pdev.dev.platform_data = &mddi->client_data;
-	printk(KERN_INFO "mddi: publish: %s\n", mddi->client_name);
-	platform_device_register(&mddi->client_pdev);
-	return 0;
-
-error_mddi_interface:
-error_mddi_version:
-	free_irq(mddi->irq, 0);
-error_request_irq:
-	dma_free_coherent(NULL, 0x1000, mddi->rev_data, mddi->rev_addr);
-error_rev_data:
-error_clk_setup:
-error_get_irq_resource:
-	iounmap(mddi->base);
-error_ioremap:
-
-	printk(KERN_INFO "mddi: mddi_init() failed (%d)\n", ret);
-	return ret;
-}
-
-
-static struct platform_driver mddi_driver = {
-	.probe = mddi_probe,
-	.driver = { .name = "msm_mddi" },
-};
-
-static int __init _mddi_init(void)
-{
-	return platform_driver_register(&mddi_driver);
-}
-
-module_init(_mddi_init);
diff --git a/drivers/video/fbdev/msm/mddi_client_dummy.c b/drivers/video/fbdev/msm/mddi_client_dummy.c
deleted file mode 100644
index cdb8f69a5d88..000000000000
--- a/drivers/video/fbdev/msm/mddi_client_dummy.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* drivers/video/msm_fb/mddi_client_dummy.c
- *
- * Support for "dummy" mddi client devices which require no
- * special initialization code.
- *
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/device.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-
-#include <linux/platform_data/video-msm_fb.h>
-
-struct panel_info {
-	struct platform_device pdev;
-	struct msm_panel_data panel_data;
-};
-
-static int mddi_dummy_suspend(struct msm_panel_data *panel_data)
-{
-	return 0;
-}
-
-static int mddi_dummy_resume(struct msm_panel_data *panel_data)
-{
-	return 0;
-}
-
-static int mddi_dummy_blank(struct msm_panel_data *panel_data)
-{
-	return 0;
-}
-
-static int mddi_dummy_unblank(struct msm_panel_data *panel_data)
-{
-	return 0;
-}
-
-static int mddi_dummy_probe(struct platform_device *pdev)
-{
-	struct msm_mddi_client_data *client_data = pdev->dev.platform_data;
-	struct panel_info *panel =
-		devm_kzalloc(&pdev->dev, sizeof(struct panel_info), GFP_KERNEL);
-	if (!panel)
-		return -ENOMEM;
-	platform_set_drvdata(pdev, panel);
-	panel->panel_data.suspend = mddi_dummy_suspend;
-	panel->panel_data.resume = mddi_dummy_resume;
-	panel->panel_data.blank = mddi_dummy_blank;
-	panel->panel_data.unblank = mddi_dummy_unblank;
-	panel->panel_data.caps = MSMFB_CAP_PARTIAL_UPDATES;
-	panel->pdev.name = "msm_panel";
-	panel->pdev.id = pdev->id;
-	platform_device_add_resources(&panel->pdev,
-				      client_data->fb_resource, 1);
-	panel->panel_data.fb_data = client_data->private_client_data;
-	panel->pdev.dev.platform_data = &panel->panel_data;
-	return platform_device_register(&panel->pdev);
-}
-
-static struct platform_driver mddi_client_dummy = {
-	.probe = mddi_dummy_probe,
-	.driver = { .name = "mddi_c_dummy" },
-};
-
-static int __init mddi_client_dummy_init(void)
-{
-	platform_driver_register(&mddi_client_dummy);
-	return 0;
-}
-
-module_init(mddi_client_dummy_init);
-
diff --git a/drivers/video/fbdev/msm/mddi_client_nt35399.c b/drivers/video/fbdev/msm/mddi_client_nt35399.c
deleted file mode 100644
index f96df32e5509..000000000000
--- a/drivers/video/fbdev/msm/mddi_client_nt35399.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/* drivers/video/msm_fb/mddi_client_nt35399.c
- *
- * Support for Novatek NT35399 MDDI client of Sapphire
- *
- * Copyright (C) 2008 HTC Incorporated
- * Author: Solomon Chiu (solomon_chiu@htc.com)
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/gpio.h>
-#include <linux/slab.h>
-#include <linux/platform_data/video-msm_fb.h>
-
-static DECLARE_WAIT_QUEUE_HEAD(nt35399_vsync_wait);
-
-struct panel_info {
-	struct msm_mddi_client_data *client_data;
-	struct platform_device pdev;
-	struct msm_panel_data panel_data;
-	struct msmfb_callback *fb_callback;
-	struct work_struct panel_work;
-	struct workqueue_struct *fb_wq;
-	int nt35399_got_int;
-};
-
-static void
-nt35399_request_vsync(struct msm_panel_data *panel_data,
-		      struct msmfb_callback *callback)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	panel->fb_callback = callback;
-	if (panel->nt35399_got_int) {
-		panel->nt35399_got_int = 0;
-		client_data->activate_link(client_data); /* clears interrupt */
-	}
-}
-
-static void nt35399_wait_vsync(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	if (panel->nt35399_got_int) {
-		panel->nt35399_got_int = 0;
-		client_data->activate_link(client_data); /* clears interrupt */
-	}
-
-	if (wait_event_timeout(nt35399_vsync_wait, panel->nt35399_got_int,
-				HZ/2) == 0)
-		printk(KERN_ERR "timeout waiting for VSYNC\n");
-
-	panel->nt35399_got_int = 0;
-	/* interrupt clears when screen dma starts */
-}
-
-static int nt35399_suspend(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-	int ret;
-
-	ret = bridge_data->uninit(bridge_data, client_data);
-	if (ret) {
-		printk(KERN_INFO "mddi nt35399 client: non zero return from "
-			"uninit\n");
-		return ret;
-	}
-	client_data->suspend(client_data);
-	return 0;
-}
-
-static int nt35399_resume(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-	int ret;
-
-	client_data->resume(client_data);
-	ret = bridge_data->init(bridge_data, client_data);
-	if (ret)
-		return ret;
-	return 0;
-}
-
-static int nt35399_blank(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-
-	return bridge_data->blank(bridge_data, client_data);
-}
-
-static int nt35399_unblank(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-
-	return bridge_data->unblank(bridge_data, client_data);
-}
-
-irqreturn_t nt35399_vsync_interrupt(int irq, void *data)
-{
-	struct panel_info *panel = data;
-
-	panel->nt35399_got_int = 1;
-
-	if (panel->fb_callback) {
-		panel->fb_callback->func(panel->fb_callback);
-		panel->fb_callback = NULL;
-	}
-
-	wake_up(&nt35399_vsync_wait);
-
-	return IRQ_HANDLED;
-}
-
-static int setup_vsync(struct panel_info *panel, int init)
-{
-	int ret;
-	int gpio = 97;
-	unsigned int irq;
-
-	if (!init) {
-		ret = 0;
-		goto uninit;
-	}
-	ret = gpio_request_one(gpio, GPIOF_IN, "vsync");
-	if (ret)
-		goto err_request_gpio_failed;
-
-	ret = irq = gpio_to_irq(gpio);
-	if (ret < 0)
-		goto err_get_irq_num_failed;
-
-	ret = request_irq(irq, nt35399_vsync_interrupt, IRQF_TRIGGER_RISING,
-			  "vsync", panel);
-	if (ret)
-		goto err_request_irq_failed;
-
-	printk(KERN_INFO "vsync on gpio %d now %d\n",
-	       gpio, gpio_get_value(gpio));
-	return 0;
-
-uninit:
-	free_irq(gpio_to_irq(gpio), panel->client_data);
-err_request_irq_failed:
-err_get_irq_num_failed:
-	gpio_free(gpio);
-err_request_gpio_failed:
-	return ret;
-}
-
-static int mddi_nt35399_probe(struct platform_device *pdev)
-{
-	struct msm_mddi_client_data *client_data = pdev->dev.platform_data;
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-
-	int ret;
-
-	struct panel_info *panel = devm_kzalloc(&pdev->dev,
-						sizeof(struct panel_info),
-						GFP_KERNEL);
-
-	printk(KERN_DEBUG "%s: enter.\n", __func__);
-
-	if (!panel)
-		return -ENOMEM;
-	platform_set_drvdata(pdev, panel);
-
-	ret = setup_vsync(panel, 1);
-	if (ret) {
-		dev_err(&pdev->dev, "mddi_nt35399_setup_vsync failed\n");
-		return ret;
-	}
-
-	panel->client_data = client_data;
-	panel->panel_data.suspend = nt35399_suspend;
-	panel->panel_data.resume = nt35399_resume;
-	panel->panel_data.wait_vsync = nt35399_wait_vsync;
-	panel->panel_data.request_vsync = nt35399_request_vsync;
-	panel->panel_data.blank = nt35399_blank;
-	panel->panel_data.unblank = nt35399_unblank;
-	panel->panel_data.fb_data = &bridge_data->fb_data;
-	panel->panel_data.caps = 0;
-
-	panel->pdev.name = "msm_panel";
-	panel->pdev.id = pdev->id;
-	panel->pdev.resource = client_data->fb_resource;
-	panel->pdev.num_resources = 1;
-	panel->pdev.dev.platform_data = &panel->panel_data;
-
-	if (bridge_data->init)
-		bridge_data->init(bridge_data, client_data);
-
-	platform_device_register(&panel->pdev);
-
-	return 0;
-}
-
-static int mddi_nt35399_remove(struct platform_device *pdev)
-{
-	struct panel_info *panel = platform_get_drvdata(pdev);
-
-	setup_vsync(panel, 0);
-	return 0;
-}
-
-static struct platform_driver mddi_client_0bda_8a47 = {
-	.probe = mddi_nt35399_probe,
-	.remove = mddi_nt35399_remove,
-	.driver = { .name = "mddi_c_0bda_8a47" },
-};
-
-static int __init mddi_client_nt35399_init(void)
-{
-	return platform_driver_register(&mddi_client_0bda_8a47);
-}
-
-module_init(mddi_client_nt35399_init);
-
diff --git a/drivers/video/fbdev/msm/mddi_client_toshiba.c b/drivers/video/fbdev/msm/mddi_client_toshiba.c
deleted file mode 100644
index 061d7dfebbf3..000000000000
--- a/drivers/video/fbdev/msm/mddi_client_toshiba.c
+++ /dev/null
@@ -1,280 +0,0 @@
-/* drivers/video/msm_fb/mddi_client_toshiba.c
- *
- * Support for Toshiba TC358720XBG mddi client devices which require no
- * special initialization code.
- *
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include <linux/gpio.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/platform_data/video-msm_fb.h>
-
-
-#define LCD_CONTROL_BLOCK_BASE 0x110000
-#define CMN         (LCD_CONTROL_BLOCK_BASE|0x10)
-#define INTFLG      (LCD_CONTROL_BLOCK_BASE|0x18)
-#define HCYCLE      (LCD_CONTROL_BLOCK_BASE|0x34)
-#define HDE_START   (LCD_CONTROL_BLOCK_BASE|0x3C)
-#define VPOS        (LCD_CONTROL_BLOCK_BASE|0xC0)
-#define MPLFBUF     (LCD_CONTROL_BLOCK_BASE|0x20)
-#define WAKEUP      (LCD_CONTROL_BLOCK_BASE|0x54)
-#define WSYN_DLY    (LCD_CONTROL_BLOCK_BASE|0x58)
-#define REGENB      (LCD_CONTROL_BLOCK_BASE|0x5C)
-
-#define BASE5 0x150000
-#define BASE6 0x160000
-#define BASE7 0x170000
-
-#define GPIOIEV     (BASE5 + 0x10)
-#define GPIOIE      (BASE5 + 0x14)
-#define GPIORIS     (BASE5 + 0x18)
-#define GPIOMIS     (BASE5 + 0x1C)
-#define GPIOIC      (BASE5 + 0x20)
-
-#define INTMASK     (BASE6 + 0x0C)
-#define INTMASK_VWAKEOUT (1U << 0)
-#define INTMASK_VWAKEOUT_ACTIVE_LOW (1U << 8)
-#define GPIOSEL     (BASE7 + 0x00)
-#define GPIOSEL_VWAKEINT (1U << 0)
-
-static DECLARE_WAIT_QUEUE_HEAD(toshiba_vsync_wait);
-
-struct panel_info {
-	struct msm_mddi_client_data *client_data;
-	struct platform_device pdev;
-	struct msm_panel_data panel_data;
-	struct msmfb_callback *toshiba_callback;
-	int toshiba_got_int;
-};
-
-
-static void toshiba_request_vsync(struct msm_panel_data *panel_data,
-				  struct msmfb_callback *callback)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	panel->toshiba_callback = callback;
-	if (panel->toshiba_got_int) {
-		panel->toshiba_got_int = 0;
-		client_data->activate_link(client_data);
-	}
-}
-
-static void toshiba_clear_vsync(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	client_data->activate_link(client_data);
-}
-
-static void toshiba_wait_vsync(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	if (panel->toshiba_got_int) {
-		panel->toshiba_got_int = 0;
-		client_data->activate_link(client_data); /* clears interrupt */
-	}
-	if (wait_event_timeout(toshiba_vsync_wait, panel->toshiba_got_int,
-				HZ/2) == 0)
-		printk(KERN_ERR "timeout waiting for VSYNC\n");
-	panel->toshiba_got_int = 0;
-	/* interrupt clears when screen dma starts */
-}
-
-static int toshiba_suspend(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-	int ret;
-
-	ret = bridge_data->uninit(bridge_data, client_data);
-	if (ret) {
-		printk(KERN_INFO "mddi toshiba client: non zero return from "
-			"uninit\n");
-		return ret;
-	}
-	client_data->suspend(client_data);
-	return 0;
-}
-
-static int toshiba_resume(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-	int ret;
-
-	client_data->resume(client_data);
-	ret = bridge_data->init(bridge_data, client_data);
-	if (ret)
-		return ret;
-	return 0;
-}
-
-static int toshiba_blank(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-
-	return bridge_data->blank(bridge_data, client_data);
-}
-
-static int toshiba_unblank(struct msm_panel_data *panel_data)
-{
-	struct panel_info *panel = container_of(panel_data, struct panel_info,
-						panel_data);
-	struct msm_mddi_client_data *client_data = panel->client_data;
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-
-	return bridge_data->unblank(bridge_data, client_data);
-}
-
-irqreturn_t toshiba_vsync_interrupt(int irq, void *data)
-{
-	struct panel_info *panel = data;
-
-	panel->toshiba_got_int = 1;
-	if (panel->toshiba_callback) {
-		panel->toshiba_callback->func(panel->toshiba_callback);
-		panel->toshiba_callback = 0;
-	}
-	wake_up(&toshiba_vsync_wait);
-	return IRQ_HANDLED;
-}
-
-static int setup_vsync(struct panel_info *panel,
-		       int init)
-{
-	int ret;
-	int gpio = 97;
-	unsigned int irq;
-
-	if (!init) {
-		ret = 0;
-		goto uninit;
-	}
-	ret = gpio_request_one(gpio, GPIOF_IN, "vsync");
-	if (ret)
-		goto err_request_gpio_failed;
-
-	ret = irq = gpio_to_irq(gpio);
-	if (ret < 0)
-		goto err_get_irq_num_failed;
-
-	ret = request_irq(irq, toshiba_vsync_interrupt, IRQF_TRIGGER_RISING,
-			  "vsync", panel);
-	if (ret)
-		goto err_request_irq_failed;
-	printk(KERN_INFO "vsync on gpio %d now %d\n",
-	       gpio, gpio_get_value(gpio));
-	return 0;
-
-uninit:
-	free_irq(gpio_to_irq(gpio), panel);
-err_request_irq_failed:
-err_get_irq_num_failed:
-	gpio_free(gpio);
-err_request_gpio_failed:
-	return ret;
-}
-
-static int mddi_toshiba_probe(struct platform_device *pdev)
-{
-	int ret;
-	struct msm_mddi_client_data *client_data = pdev->dev.platform_data;
-	struct msm_mddi_bridge_platform_data *bridge_data =
-		client_data->private_client_data;
-	struct panel_info *panel =
-		kzalloc(sizeof(struct panel_info), GFP_KERNEL);
-	if (!panel)
-		return -ENOMEM;
-	platform_set_drvdata(pdev, panel);
-
-	/* mddi_remote_write(mddi, 0, WAKEUP); */
-	client_data->remote_write(client_data, GPIOSEL_VWAKEINT, GPIOSEL);
-	client_data->remote_write(client_data, INTMASK_VWAKEOUT, INTMASK);
-
-	ret = setup_vsync(panel, 1);
-	if (ret) {
-		dev_err(&pdev->dev, "mddi_bridge_setup_vsync failed\n");
-		return ret;
-	}
-
-	panel->client_data = client_data;
-	panel->panel_data.suspend = toshiba_suspend;
-	panel->panel_data.resume = toshiba_resume;
-	panel->panel_data.wait_vsync = toshiba_wait_vsync;
-	panel->panel_data.request_vsync = toshiba_request_vsync;
-	panel->panel_data.clear_vsync = toshiba_clear_vsync;
-	panel->panel_data.blank = toshiba_blank;
-	panel->panel_data.unblank = toshiba_unblank;
-	panel->panel_data.fb_data =  &bridge_data->fb_data;
-	panel->panel_data.caps = MSMFB_CAP_PARTIAL_UPDATES;
-
-	panel->pdev.name = "msm_panel";
-	panel->pdev.id = pdev->id;
-	panel->pdev.resource = client_data->fb_resource;
-	panel->pdev.num_resources = 1;
-	panel->pdev.dev.platform_data = &panel->panel_data;
-	bridge_data->init(bridge_data, client_data);
-	platform_device_register(&panel->pdev);
-
-	return 0;
-}
-
-static int mddi_toshiba_remove(struct platform_device *pdev)
-{
-	struct panel_info *panel = platform_get_drvdata(pdev);
-
-	setup_vsync(panel, 0);
-	kfree(panel);
-	return 0;
-}
-
-static struct platform_driver mddi_client_d263_0000 = {
-	.probe = mddi_toshiba_probe,
-	.remove = mddi_toshiba_remove,
-	.driver = { .name = "mddi_c_d263_0000" },
-};
-
-static int __init mddi_client_toshiba_init(void)
-{
-	platform_driver_register(&mddi_client_d263_0000);
-	return 0;
-}
-
-module_init(mddi_client_toshiba_init);
-
diff --git a/drivers/video/fbdev/msm/mddi_hw.h b/drivers/video/fbdev/msm/mddi_hw.h
deleted file mode 100644
index 45cc01fc1e7f..000000000000
--- a/drivers/video/fbdev/msm/mddi_hw.h
+++ /dev/null
@@ -1,305 +0,0 @@
-/* drivers/video/msm_fb/mddi_hw.h
- *
- * MSM MDDI Hardware Registers and Structures
- *
- * Copyright (C) 2007 QUALCOMM Incorporated
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#ifndef _MDDI_HW_H_
-#define _MDDI_HW_H_
-
-#include <linux/types.h>
-
-#define MDDI_CMD                0x0000
-#define MDDI_VERSION            0x0004
-#define MDDI_PRI_PTR            0x0008
-#define MDDI_SEC_PTR            0x000c
-#define MDDI_BPS                0x0010
-#define MDDI_SPM                0x0014
-#define MDDI_INT                0x0018
-#define MDDI_INTEN              0x001c
-#define MDDI_REV_PTR            0x0020
-#define MDDI_REV_SIZE           0x0024
-#define MDDI_STAT               0x0028
-#define MDDI_REV_RATE_DIV       0x002c
-#define MDDI_REV_CRC_ERR        0x0030
-#define MDDI_TA1_LEN            0x0034
-#define MDDI_TA2_LEN            0x0038
-#define MDDI_TEST_BUS           0x003c
-#define MDDI_TEST               0x0040
-#define MDDI_REV_PKT_CNT        0x0044
-#define MDDI_DRIVE_HI           0x0048
-#define MDDI_DRIVE_LO           0x004c
-#define MDDI_DISP_WAKE          0x0050
-#define MDDI_REV_ENCAP_SZ       0x0054
-#define MDDI_RTD_VAL            0x0058
-#define MDDI_PAD_CTL            0x0068
-#define MDDI_DRIVER_START_CNT   0x006c
-#define MDDI_NEXT_PRI_PTR       0x0070
-#define MDDI_NEXT_SEC_PTR       0x0074
-#define MDDI_MISR_CTL           0x0078
-#define MDDI_MISR_DATA          0x007c
-#define MDDI_SF_CNT             0x0080
-#define MDDI_MF_CNT             0x0084
-#define MDDI_CURR_REV_PTR       0x0088
-#define MDDI_CORE_VER           0x008c
-
-#define MDDI_INT_PRI_PTR_READ       0x0001
-#define MDDI_INT_SEC_PTR_READ       0x0002
-#define MDDI_INT_REV_DATA_AVAIL     0x0004
-#define MDDI_INT_DISP_REQ           0x0008
-#define MDDI_INT_PRI_UNDERFLOW      0x0010
-#define MDDI_INT_SEC_UNDERFLOW      0x0020
-#define MDDI_INT_REV_OVERFLOW       0x0040
-#define MDDI_INT_CRC_ERROR          0x0080
-#define MDDI_INT_MDDI_IN            0x0100
-#define MDDI_INT_PRI_OVERWRITE      0x0200
-#define MDDI_INT_SEC_OVERWRITE      0x0400
-#define MDDI_INT_REV_OVERWRITE      0x0800
-#define MDDI_INT_DMA_FAILURE        0x1000
-#define MDDI_INT_LINK_ACTIVE        0x2000
-#define MDDI_INT_IN_HIBERNATION     0x4000
-#define MDDI_INT_PRI_LINK_LIST_DONE 0x8000
-#define MDDI_INT_SEC_LINK_LIST_DONE 0x10000
-#define MDDI_INT_NO_CMD_PKTS_PEND   0x20000
-#define MDDI_INT_RTD_FAILURE        0x40000
-#define MDDI_INT_REV_PKT_RECEIVED   0x80000
-#define MDDI_INT_REV_PKTS_AVAIL     0x100000
-
-#define MDDI_INT_NEED_CLEAR ( \
-	MDDI_INT_REV_DATA_AVAIL | \
-	MDDI_INT_PRI_UNDERFLOW | \
-	MDDI_INT_SEC_UNDERFLOW | \
-	MDDI_INT_REV_OVERFLOW | \
-	MDDI_INT_CRC_ERROR | \
-	MDDI_INT_REV_PKT_RECEIVED)
-
-
-#define MDDI_STAT_LINK_ACTIVE        0x0001
-#define MDDI_STAT_NEW_REV_PTR        0x0002
-#define MDDI_STAT_NEW_PRI_PTR        0x0004
-#define MDDI_STAT_NEW_SEC_PTR        0x0008
-#define MDDI_STAT_IN_HIBERNATION     0x0010
-#define MDDI_STAT_PRI_LINK_LIST_DONE 0x0020
-#define MDDI_STAT_SEC_LINK_LIST_DONE 0x0040
-#define MDDI_STAT_PENDING_TIMING_PKT 0x0080
-#define MDDI_STAT_PENDING_REV_ENCAP  0x0100
-#define MDDI_STAT_PENDING_POWERDOWN  0x0200
-#define MDDI_STAT_RTD_MEAS_FAIL      0x0800
-#define MDDI_STAT_CLIENT_WAKEUP_REQ  0x1000
-
-
-#define MDDI_CMD_POWERDOWN           0x0100
-#define MDDI_CMD_POWERUP             0x0200
-#define MDDI_CMD_HIBERNATE           0x0300
-#define MDDI_CMD_RESET               0x0400
-#define MDDI_CMD_DISP_IGNORE         0x0501
-#define MDDI_CMD_DISP_LISTEN         0x0500
-#define MDDI_CMD_SEND_REV_ENCAP      0x0600
-#define MDDI_CMD_GET_CLIENT_CAP      0x0601
-#define MDDI_CMD_GET_CLIENT_STATUS   0x0602
-#define MDDI_CMD_SEND_RTD            0x0700
-#define MDDI_CMD_LINK_ACTIVE         0x0900
-#define MDDI_CMD_PERIODIC_REV_ENCAP  0x0A00
-#define MDDI_CMD_FORCE_NEW_REV_PTR   0x0C00
-
-
-
-#define MDDI_VIDEO_REV_PKT_SIZE              0x40
-#define MDDI_CLIENT_CAPABILITY_REV_PKT_SIZE  0x60
-#define MDDI_MAX_REV_PKT_SIZE                0x60
-
-/* #define MDDI_REV_BUFFER_SIZE 128 */
-#define MDDI_REV_BUFFER_SIZE (MDDI_MAX_REV_PKT_SIZE * 4)
-
-/* MDP sends 256 pixel packets, so lower value hibernates more without
- * significantly increasing latency of waiting for next subframe */
-#define MDDI_HOST_BYTES_PER_SUBFRAME  0x3C00
-#define MDDI_HOST_TA2_LEN       0x000c
-#define MDDI_HOST_REV_RATE_DIV  0x0002
-
-
-struct __attribute__((packed)) mddi_rev_packet {
-	uint16_t length;
-	uint16_t type;
-	uint16_t client_id;
-};
-
-struct __attribute__((packed)) mddi_client_status {
-	uint16_t length;
-	uint16_t type;
-	uint16_t client_id;
-	uint16_t reverse_link_request;  /* bytes needed in rev encap message */
-	uint8_t  crc_error_count;
-	uint8_t  capability_change;
-	uint16_t graphics_busy_flags;
-	uint16_t crc16;
-};
-
-struct __attribute__((packed)) mddi_client_caps {
-	uint16_t length; /* length, exclusive of this field */
-	uint16_t type; /* 66 */
-	uint16_t client_id;
-
-	uint16_t Protocol_Version;
-	uint16_t Minimum_Protocol_Version;
-	uint16_t Data_Rate_Capability;
-	uint8_t  Interface_Type_Capability;
-	uint8_t  Number_of_Alt_Displays;
-	uint16_t PostCal_Data_Rate;
-	uint16_t Bitmap_Width;
-	uint16_t Bitmap_Height;
-	uint16_t Display_Window_Width;
-	uint16_t Display_Window_Height;
-	uint32_t Color_Map_Size;
-	uint16_t Color_Map_RGB_Width;
-	uint16_t RGB_Capability;
-	uint8_t  Monochrome_Capability;
-	uint8_t  Reserved_1;
-	uint16_t Y_Cb_Cr_Capability;
-	uint16_t Bayer_Capability;
-	uint16_t Alpha_Cursor_Image_Planes;
-	uint32_t Client_Feature_Capability_Indicators;
-	uint8_t  Maximum_Video_Frame_Rate_Capability;
-	uint8_t  Minimum_Video_Frame_Rate_Capability;
-	uint16_t Minimum_Sub_frame_Rate;
-	uint16_t Audio_Buffer_Depth;
-	uint16_t Audio_Channel_Capability;
-	uint16_t Audio_Sample_Rate_Capability;
-	uint8_t  Audio_Sample_Resolution;
-	uint8_t  Mic_Audio_Sample_Resolution;
-	uint16_t Mic_Sample_Rate_Capability;
-	uint8_t  Keyboard_Data_Format;
-	uint8_t  pointing_device_data_format;
-	uint16_t content_protection_type;
-	uint16_t Mfr_Name;
-	uint16_t Product_Code;
-	uint16_t Reserved_3;
-	uint32_t Serial_Number;
-	uint8_t  Week_of_Manufacture;
-	uint8_t  Year_of_Manufacture;
-
-	uint16_t crc16;
-} mddi_client_capability_type;
-
-
-struct __attribute__((packed)) mddi_video_stream {
-	uint16_t length;
-	uint16_t type; /* 16 */
-	uint16_t client_id; /* 0 */
-
-	uint16_t video_data_format_descriptor;
-/* format of each pixel in the Pixel Data in the present stream in the
- * present packet.
- * If bits [15:13] = 000 monochrome
- * If bits [15:13] = 001 color pixels (palette).
- * If bits [15:13] = 010 color pixels in raw RGB
- * If bits [15:13] = 011 data in 4:2:2 Y Cb Cr format
- * If bits [15:13] = 100 Bayer pixels
- */
-
-	uint16_t pixel_data_attributes;
-/* interpreted as follows:
- * Bits [1:0] = 11  pixel data is displayed to both eyes
- * Bits [1:0] = 10  pixel data is routed to the left eye only.
- * Bits [1:0] = 01  pixel data is routed to the right eye only.
- * Bits [1:0] = 00  pixel data is routed to the alternate display.
- * Bit 2 is 0  Pixel Data is in the standard progressive format.
- * Bit 2 is 1  Pixel Data is in interlace format.
- * Bit 3 is 0  Pixel Data is in the standard progressive format.
- * Bit 3 is 1  Pixel Data is in alternate pixel format.
- * Bit 4 is 0  Pixel Data is to or from the display frame buffer.
- * Bit 4 is 1  Pixel Data is to or from the camera.
- * Bit 5 is 0  pixel data contains the next consecutive row of pixels.
- * Bit 5 is 1  X Left Edge, Y Top Edge, X Right Edge, Y Bottom Edge,
- *             X Start, and Y Start parameters are not defined and
- *             shall be ignored by the client.
- * Bits [7:6] = 01  Pixel data is written to the offline image buffer.
- * Bits [7:6] = 00  Pixel data is written to the buffer to refresh display.
- * Bits [7:6] = 11  Pixel data is written to all image buffers.
- * Bits [7:6] = 10  Invalid. Reserved for future use.
- * Bits 8 through 11 alternate display number.
- * Bits 12 through 14 are reserved for future use and shall be set to zero.
- * Bit 15 is 1 the row of pixels is the last row of pixels in a frame.
- */
-
-	uint16_t x_left_edge;
-	uint16_t y_top_edge;
-	/* X,Y coordinate of the top left edge of the screen window */
-
-	uint16_t x_right_edge;
-	uint16_t y_bottom_edge;
-	/* X,Y coordinate of the bottom right edge of the window being
-	 * updated. */
-
-	uint16_t x_start;
-	uint16_t y_start;
-	/* (X Start, Y Start) is the first pixel in the Pixel Data field
-	 * below. */
-
-	uint16_t pixel_count;
-	/* number of pixels in the Pixel Data field below. */
-
-	uint16_t parameter_CRC;
-	/* 16-bit CRC of all bytes from the Packet Length to the Pixel Count. */
-
-	uint16_t reserved;
-	/* 16-bit variable to make structure align on 4 byte boundary */
-};
-
-#define TYPE_VIDEO_STREAM      16
-#define TYPE_CLIENT_CAPS       66
-#define TYPE_REGISTER_ACCESS   146
-#define TYPE_CLIENT_STATUS     70
-
-struct __attribute__((packed)) mddi_register_access {
-	uint16_t length;
-	uint16_t type; /* 146 */
-	uint16_t client_id;
-
-	uint16_t read_write_info;
-	/* Bits 13:0  a 14-bit unsigned integer that specifies the number of
-	 *            32-bit Register Data List items to be transferred in the
-	 *            Register Data List field.
-	 * Bits[15:14] = 00  Write to register(s);
-	 * Bits[15:14] = 10  Read from register(s);
-	 * Bits[15:14] = 11  Response to a Read.
-	 * Bits[15:14] = 01  this value is reserved for future use. */
-#define MDDI_WRITE     (0 << 14)
-#define MDDI_READ      (2 << 14)
-#define MDDI_READ_RESP (3 << 14)
-
-	uint32_t register_address;
-	/* the register address that is to be written to or read from. */
-
-	uint16_t crc16;
-
-	uint32_t register_data_list;
-	/* list of 4-byte register data values for/from client registers */
-};
-
-struct __attribute__((packed)) mddi_llentry {
-	uint16_t flags;
-	uint16_t header_count;
-	uint16_t data_count;
-	dma_addr_t data; /* 32 bit */
-	struct mddi_llentry *next;
-	uint16_t reserved;
-	union {
-		struct mddi_video_stream v;
-		struct mddi_register_access r;
-		uint32_t _[12];
-	} u;
-};
-
-#endif
diff --git a/drivers/video/fbdev/msm/mdp.c b/drivers/video/fbdev/msm/mdp.c
deleted file mode 100644
index 113c7876c855..000000000000
--- a/drivers/video/fbdev/msm/mdp.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/* drivers/video/msm_fb/mdp.c
- *
- * MSM MDP Interface (used by framebuffer core)
- *
- * Copyright (C) 2007 QUALCOMM Incorporated
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/fb.h>
-#include <linux/msm_mdp.h>
-#include <linux/interrupt.h>
-#include <linux/wait.h>
-#include <linux/clk.h>
-#include <linux/file.h>
-#include <linux/major.h>
-#include <linux/slab.h>
-
-#include <linux/platform_data/video-msm_fb.h>
-#include <linux/platform_device.h>
-#include <linux/export.h>
-
-#include "mdp_hw.h"
-
-struct class *mdp_class;
-
-#define MDP_CMD_DEBUG_ACCESS_BASE (0x10000)
-
-static uint16_t mdp_default_ccs[] = {
-	0x254, 0x000, 0x331, 0x254, 0xF38, 0xE61, 0x254, 0x409, 0x000,
-	0x010, 0x080, 0x080
-};
-
-static DECLARE_WAIT_QUEUE_HEAD(mdp_dma2_waitqueue);
-static DECLARE_WAIT_QUEUE_HEAD(mdp_ppp_waitqueue);
-static struct msmfb_callback *dma_callback;
-static struct clk *clk;
-static unsigned int mdp_irq_mask;
-static DEFINE_SPINLOCK(mdp_lock);
-DEFINE_MUTEX(mdp_mutex);
-
-static int enable_mdp_irq(struct mdp_info *mdp, uint32_t mask)
-{
-	unsigned long irq_flags;
-	int ret = 0;
-
-	BUG_ON(!mask);
-
-	spin_lock_irqsave(&mdp_lock, irq_flags);
-	/* if the mask bits are already set return an error, this interrupt
-	 * is already enabled */
-	if (mdp_irq_mask & mask) {
-		printk(KERN_ERR "mdp irq already on already on %x %x\n",
-		       mdp_irq_mask, mask);
-		ret = -1;
-	}
-	/* if the mdp irq is not already enabled enable it */
-	if (!mdp_irq_mask) {
-		if (clk)
-			clk_enable(clk);
-		enable_irq(mdp->irq);
-	}
-
-	/* update the irq mask to reflect the fact that the interrupt is
-	 * enabled */
-	mdp_irq_mask |= mask;
-	spin_unlock_irqrestore(&mdp_lock, irq_flags);
-	return ret;
-}
-
-static int locked_disable_mdp_irq(struct mdp_info *mdp, uint32_t mask)
-{
-	/* this interrupt is already disabled! */
-	if (!(mdp_irq_mask & mask)) {
-		printk(KERN_ERR "mdp irq already off %x %x\n",
-		       mdp_irq_mask, mask);
-		return -1;
-	}
-	/* update the irq mask to reflect the fact that the interrupt is
-	 * disabled */
-	mdp_irq_mask &= ~(mask);
-	/* if no one is waiting on the interrupt, disable it */
-	if (!mdp_irq_mask) {
-		disable_irq_nosync(mdp->irq);
-		if (clk)
-			clk_disable(clk);
-	}
-	return 0;
-}
-
-static int disable_mdp_irq(struct mdp_info *mdp, uint32_t mask)
-{
-	unsigned long irq_flags;
-	int ret;
-
-	spin_lock_irqsave(&mdp_lock, irq_flags);
-	ret = locked_disable_mdp_irq(mdp, mask);
-	spin_unlock_irqrestore(&mdp_lock, irq_flags);
-	return ret;
-}
-
-static irqreturn_t mdp_isr(int irq, void *data)
-{
-	uint32_t status;
-	unsigned long irq_flags;
-	struct mdp_info *mdp = data;
-
-	spin_lock_irqsave(&mdp_lock, irq_flags);
-
-	status = mdp_readl(mdp, MDP_INTR_STATUS);
-	mdp_writel(mdp, status, MDP_INTR_CLEAR);
-
-	status &= mdp_irq_mask;
-	if (status & DL0_DMA2_TERM_DONE) {
-		if (dma_callback) {
-			dma_callback->func(dma_callback);
-			dma_callback = NULL;
-		}
-		wake_up(&mdp_dma2_waitqueue);
-	}
-
-	if (status & DL0_ROI_DONE)
-		wake_up(&mdp_ppp_waitqueue);
-
-	if (status)
-		locked_disable_mdp_irq(mdp, status);
-
-	spin_unlock_irqrestore(&mdp_lock, irq_flags);
-	return IRQ_HANDLED;
-}
-
-static uint32_t mdp_check_mask(uint32_t mask)
-{
-	uint32_t ret;
-	unsigned long irq_flags;
-
-	spin_lock_irqsave(&mdp_lock, irq_flags);
-	ret = mdp_irq_mask & mask;
-	spin_unlock_irqrestore(&mdp_lock, irq_flags);
-	return ret;
-}
-
-static int mdp_wait(struct mdp_info *mdp, uint32_t mask, wait_queue_head_t *wq)
-{
-	int ret = 0;
-	unsigned long irq_flags;
-
-	wait_event_timeout(*wq, !mdp_check_mask(mask), HZ);
-
-	spin_lock_irqsave(&mdp_lock, irq_flags);
-	if (mdp_irq_mask & mask) {
-		locked_disable_mdp_irq(mdp, mask);
-		printk(KERN_WARNING "timeout waiting for mdp to complete %x\n",
-		       mask);
-		ret = -ETIMEDOUT;
-	}
-	spin_unlock_irqrestore(&mdp_lock, irq_flags);
-
-	return ret;
-}
-
-void mdp_dma_wait(struct mdp_device *mdp_dev)
-{
-#define MDP_MAX_TIMEOUTS 20
-	static int timeout_count;
-	struct mdp_info *mdp = container_of(mdp_dev, struct mdp_info, mdp_dev);
-
-	if (mdp_wait(mdp, DL0_DMA2_TERM_DONE, &mdp_dma2_waitqueue) == -ETIMEDOUT)
-		timeout_count++;
-	else
-		timeout_count = 0;
-
-	if (timeout_count > MDP_MAX_TIMEOUTS) {
-		printk(KERN_ERR "mdp: dma failed %d times, somethings wrong!\n",
-		       MDP_MAX_TIMEOUTS);
-		BUG();
-	}
-}
-
-static int mdp_ppp_wait(struct mdp_info *mdp)
-{
-	return mdp_wait(mdp, DL0_ROI_DONE, &mdp_ppp_waitqueue);
-}
-
-void mdp_dma_to_mddi(struct mdp_info *mdp, uint32_t addr, uint32_t stride,
-		     uint32_t width, uint32_t height, uint32_t x, uint32_t y,
-		     struct msmfb_callback *callback)
-{
-	uint32_t dma2_cfg;
-	uint16_t ld_param = 0; /* 0=PRIM, 1=SECD, 2=EXT */
-
-	if (enable_mdp_irq(mdp, DL0_DMA2_TERM_DONE)) {
-		printk(KERN_ERR "mdp_dma_to_mddi: busy\n");
-		return;
-	}
-
-	dma_callback = callback;
-
-	dma2_cfg = DMA_PACK_TIGHT |
-		DMA_PACK_ALIGN_LSB |
-		DMA_PACK_PATTERN_RGB |
-		DMA_OUT_SEL_AHB |
-		DMA_IBUF_NONCONTIGUOUS;
-
-	dma2_cfg |= DMA_IBUF_FORMAT_RGB565;
-
-	dma2_cfg |= DMA_OUT_SEL_MDDI;
-
-	dma2_cfg |= DMA_MDDI_DMAOUT_LCD_SEL_PRIMARY;
-
-	dma2_cfg |= DMA_DITHER_EN;
-
-	/* setup size, address, and stride */
-	mdp_writel(mdp, (height << 16) | (width),
-		   MDP_CMD_DEBUG_ACCESS_BASE + 0x0184);
-	mdp_writel(mdp, addr, MDP_CMD_DEBUG_ACCESS_BASE + 0x0188);
-	mdp_writel(mdp, stride, MDP_CMD_DEBUG_ACCESS_BASE + 0x018C);
-
-	/* 666 18BPP */
-	dma2_cfg |= DMA_DSTC0G_6BITS | DMA_DSTC1B_6BITS | DMA_DSTC2R_6BITS;
-
-	/* set y & x offset and MDDI transaction parameters */
-	mdp_writel(mdp, (y << 16) | (x), MDP_CMD_DEBUG_ACCESS_BASE + 0x0194);
-	mdp_writel(mdp, ld_param, MDP_CMD_DEBUG_ACCESS_BASE + 0x01a0);
-	mdp_writel(mdp, (MDDI_VDO_PACKET_DESC << 16) | MDDI_VDO_PACKET_PRIM,
-		   MDP_CMD_DEBUG_ACCESS_BASE + 0x01a4);
-
-	mdp_writel(mdp, dma2_cfg, MDP_CMD_DEBUG_ACCESS_BASE + 0x0180);
-
-	/* start DMA2 */
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0044);
-}
-
-void mdp_dma(struct mdp_device *mdp_dev, uint32_t addr, uint32_t stride,
-	     uint32_t width, uint32_t height, uint32_t x, uint32_t y,
-	     struct msmfb_callback *callback, int interface)
-{
-	struct mdp_info *mdp = container_of(mdp_dev, struct mdp_info, mdp_dev);
-
-	if (interface == MSM_MDDI_PMDH_INTERFACE) {
-		mdp_dma_to_mddi(mdp, addr, stride, width, height, x, y,
-				callback);
-	}
-}
-
-int get_img(struct mdp_img *img, struct fb_info *info,
-	    unsigned long *start, unsigned long *len,
-	    struct file **filep)
-{
-	int ret = 0;
-	struct fd f = fdget(img->memory_id);
-	if (f.file == NULL)
-		return -1;
-
-	if (MAJOR(file_inode(f.file)->i_rdev) == FB_MAJOR) {
-		*start = info->fix.smem_start;
-		*len = info->fix.smem_len;
-	} else
-		ret = -1;
-	fdput(f);
-
-	return ret;
-}
-
-void put_img(struct file *src_file, struct file *dst_file)
-{
-}
-
-int mdp_blit(struct mdp_device *mdp_dev, struct fb_info *fb,
-	     struct mdp_blit_req *req)
-{
-	int ret;
-	unsigned long src_start = 0, src_len = 0, dst_start = 0, dst_len = 0;
-	struct mdp_info *mdp = container_of(mdp_dev, struct mdp_info, mdp_dev);
-	struct file *src_file = 0, *dst_file = 0;
-
-	/* WORKAROUND FOR HARDWARE BUG IN BG TILE FETCH */
-	if (unlikely(req->src_rect.h == 0 ||
-		     req->src_rect.w == 0)) {
-		printk(KERN_ERR "mpd_ppp: src img of zero size!\n");
-		return -EINVAL;
-	}
-	if (unlikely(req->dst_rect.h == 0 ||
-		     req->dst_rect.w == 0))
-		return -EINVAL;
-
-	/* do this first so that if this fails, the caller can always
-	 * safely call put_img */
-	if (unlikely(get_img(&req->src, fb, &src_start, &src_len, &src_file))) {
-		printk(KERN_ERR "mpd_ppp: could not retrieve src image from "
-				"memory\n");
-		return -EINVAL;
-	}
-
-	if (unlikely(get_img(&req->dst, fb, &dst_start, &dst_len, &dst_file))) {
-		printk(KERN_ERR "mpd_ppp: could not retrieve dst image from "
-				"memory\n");
-		return -EINVAL;
-	}
-	mutex_lock(&mdp_mutex);
-
-	/* transp_masking unimplemented */
-	req->transp_mask = MDP_TRANSP_NOP;
-	if (unlikely((req->transp_mask != MDP_TRANSP_NOP ||
-		      req->alpha != MDP_ALPHA_NOP ||
-		      HAS_ALPHA(req->src.format)) &&
-		     (req->flags & MDP_ROT_90 &&
-		      req->dst_rect.w <= 16 && req->dst_rect.h >= 16))) {
-		int i;
-		unsigned int tiles = req->dst_rect.h / 16;
-		unsigned int remainder = req->dst_rect.h % 16;
-		req->src_rect.w = 16*req->src_rect.w / req->dst_rect.h;
-		req->dst_rect.h = 16;
-		for (i = 0; i < tiles; i++) {
-			enable_mdp_irq(mdp, DL0_ROI_DONE);
-			ret = mdp_ppp_blit(mdp, req, src_file, src_start,
-					   src_len, dst_file, dst_start,
-					   dst_len);
-			if (ret)
-				goto err_bad_blit;
-			ret = mdp_ppp_wait(mdp);
-			if (ret)
-				goto err_wait_failed;
-			req->dst_rect.y += 16;
-			req->src_rect.x += req->src_rect.w;
-		}
-		if (!remainder)
-			goto end;
-		req->src_rect.w = remainder*req->src_rect.w / req->dst_rect.h;
-		req->dst_rect.h = remainder;
-	}
-	enable_mdp_irq(mdp, DL0_ROI_DONE);
-	ret = mdp_ppp_blit(mdp, req, src_file, src_start, src_len, dst_file,
-			   dst_start,
-			   dst_len);
-	if (ret)
-		goto err_bad_blit;
-	ret = mdp_ppp_wait(mdp);
-	if (ret)
-		goto err_wait_failed;
-end:
-	put_img(src_file, dst_file);
-	mutex_unlock(&mdp_mutex);
-	return 0;
-err_bad_blit:
-	disable_mdp_irq(mdp, DL0_ROI_DONE);
-err_wait_failed:
-	put_img(src_file, dst_file);
-	mutex_unlock(&mdp_mutex);
-	return ret;
-}
-
-void mdp_set_grp_disp(struct mdp_device *mdp_dev, unsigned disp_id)
-{
-	struct mdp_info *mdp = container_of(mdp_dev, struct mdp_info, mdp_dev);
-
-	disp_id &= 0xf;
-	mdp_writel(mdp, disp_id, MDP_FULL_BYPASS_WORD43);
-}
-
-int register_mdp_client(struct class_interface *cint)
-{
-	if (!mdp_class) {
-		pr_err("mdp: no mdp_class when registering mdp client\n");
-		return -ENODEV;
-	}
-	cint->class = mdp_class;
-	return class_interface_register(cint);
-}
-
-#include "mdp_csc_table.h"
-#include "mdp_scale_tables.h"
-
-int mdp_probe(struct platform_device *pdev)
-{
-	struct resource *resource;
-	int ret;
-	int n;
-	struct mdp_info *mdp;
-
-	resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!resource) {
-		pr_err("mdp: can not get mdp mem resource!\n");
-		return -ENOMEM;
-	}
-
-	mdp = kzalloc(sizeof(struct mdp_info), GFP_KERNEL);
-	if (!mdp)
-		return -ENOMEM;
-
-	mdp->irq = platform_get_irq(pdev, 0);
-	if (mdp->irq < 0) {
-		pr_err("mdp: can not get mdp irq\n");
-		ret = mdp->irq;
-		goto error_get_irq;
-	}
-
-	mdp->base = ioremap(resource->start, resource_size(resource));
-	if (mdp->base == 0) {
-		printk(KERN_ERR "msmfb: cannot allocate mdp regs!\n");
-		ret = -ENOMEM;
-		goto error_ioremap;
-	}
-
-	mdp->mdp_dev.dma = mdp_dma;
-	mdp->mdp_dev.dma_wait = mdp_dma_wait;
-	mdp->mdp_dev.blit = mdp_blit;
-	mdp->mdp_dev.set_grp_disp = mdp_set_grp_disp;
-
-	clk = clk_get(&pdev->dev, "mdp_clk");
-	if (IS_ERR(clk)) {
-		printk(KERN_INFO "mdp: failed to get mdp clk");
-		ret = PTR_ERR(clk);
-		goto error_get_clk;
-	}
-
-	ret = request_irq(mdp->irq, mdp_isr, 0, "msm_mdp", mdp);
-	if (ret)
-		goto error_request_irq;
-	disable_irq(mdp->irq);
-	mdp_irq_mask = 0;
-
-	/* debug interface write access */
-	mdp_writel(mdp, 1, 0x60);
-
-	mdp_writel(mdp, MDP_ANY_INTR_MASK, MDP_INTR_ENABLE);
-	mdp_writel(mdp, 1, MDP_EBI2_PORTMAP_MODE);
-
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x01f8);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x01fc);
-
-	for (n = 0; n < ARRAY_SIZE(csc_table); n++)
-		mdp_writel(mdp, csc_table[n].val, csc_table[n].reg);
-
-	/* clear up unused fg/main registers */
-	/* comp.plane 2&3 ystride */
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0120);
-
-	/* unpacked pattern */
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x012c);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0130);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0134);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0158);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x015c);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0160);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0170);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0174);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x017c);
-
-	/* comp.plane 2 & 3 */
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0114);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x0118);
-
-	/* clear unused bg registers */
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x01c8);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x01d0);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x01dc);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x01e0);
-	mdp_writel(mdp, 0, MDP_CMD_DEBUG_ACCESS_BASE + 0x01e4);
-
-	for (n = 0; n < ARRAY_SIZE(mdp_upscale_table); n++)
-		mdp_writel(mdp, mdp_upscale_table[n].val,
-		       mdp_upscale_table[n].reg);
-
-	for (n = 0; n < 9; n++)
-		mdp_writel(mdp, mdp_default_ccs[n], 0x40440 + 4 * n);
-	mdp_writel(mdp, mdp_default_ccs[9], 0x40500 + 4 * 0);
-	mdp_writel(mdp, mdp_default_ccs[10], 0x40500 + 4 * 0);
-	mdp_writel(mdp, mdp_default_ccs[11], 0x40500 + 4 * 0);
-
-	/* register mdp device */
-	mdp->mdp_dev.dev.parent = &pdev->dev;
-	mdp->mdp_dev.dev.class = mdp_class;
-	dev_set_name(&mdp->mdp_dev.dev, "mdp%d", pdev->id);
-
-	/* if you can remove the platform device you'd have to implement
-	 * this:
-	mdp_dev.release = mdp_class; */
-
-	ret = device_register(&mdp->mdp_dev.dev);
-	if (ret)
-		goto error_device_register;
-	return 0;
-
-error_device_register:
-	free_irq(mdp->irq, mdp);
-error_request_irq:
-error_get_clk:
-	iounmap(mdp->base);
-error_get_irq:
-error_ioremap:
-	kfree(mdp);
-	return ret;
-}
-
-static struct platform_driver msm_mdp_driver = {
-	.probe = mdp_probe,
-	.driver = {.name = "msm_mdp"},
-};
-
-static int __init mdp_init(void)
-{
-	mdp_class = class_create(THIS_MODULE, "msm_mdp");
-	if (IS_ERR(mdp_class)) {
-		printk(KERN_ERR "Error creating mdp class\n");
-		return PTR_ERR(mdp_class);
-	}
-	return platform_driver_register(&msm_mdp_driver);
-}
-
-subsys_initcall(mdp_init);
diff --git a/drivers/video/fbdev/msm/mdp_csc_table.h b/drivers/video/fbdev/msm/mdp_csc_table.h
deleted file mode 100644
index d1cde30ead52..000000000000
--- a/drivers/video/fbdev/msm/mdp_csc_table.h
+++ /dev/null
@@ -1,582 +0,0 @@
-/* drivers/video/msm_fb/mdp_csc_table.h
- *
- * Copyright (C) 2007 QUALCOMM Incorporated
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-static struct {
-	uint32_t reg;
-	uint32_t val;
-} csc_table[] = {
-	{ 0x40400, 0x83 },
-	{ 0x40404, 0x102 },
-	{ 0x40408, 0x32 },
-	{ 0x4040c, 0xffffffb5 },
-	{ 0x40410, 0xffffff6c },
-	{ 0x40414, 0xe1 },
-	{ 0x40418, 0xe1 },
-	{ 0x4041c, 0xffffff45 },
-	{ 0x40420, 0xffffffdc },
-	{ 0x40440, 0x254 },
-	{ 0x40444, 0x0 },
-	{ 0x40448, 0x331 },
-	{ 0x4044c, 0x254 },
-	{ 0x40450, 0xffffff38 },
-	{ 0x40454, 0xfffffe61 },
-	{ 0x40458, 0x254 },
-	{ 0x4045c, 0x409 },
-	{ 0x40460, 0x0 },
-	{ 0x40480, 0x5d },
-	{ 0x40484, 0x13a },
-	{ 0x40488, 0x20 },
-	{ 0x4048c, 0xffffffcd },
-	{ 0x40490, 0xffffff54 },
-	{ 0x40494, 0xe1 },
-	{ 0x40498, 0xe1 },
-	{ 0x4049c, 0xffffff35 },
-	{ 0x404a0, 0xffffffec },
-	{ 0x404c0, 0x254 },
-	{ 0x404c4, 0x0 },
-	{ 0x404c8, 0x396 },
-	{ 0x404cc, 0x254 },
-	{ 0x404d0, 0xffffff94 },
-	{ 0x404d4, 0xfffffef0 },
-	{ 0x404d8, 0x254 },
-	{ 0x404dc, 0x43a },
-	{ 0x404e0, 0x0 },
-	{ 0x40500, 0x10 },
-	{ 0x40504, 0x80 },
-	{ 0x40508, 0x80 },
-	{ 0x40540, 0x10 },
-	{ 0x40544, 0x80 },
-	{ 0x40548, 0x80 },
-	{ 0x40580, 0x10 },
-	{ 0x40584, 0xeb },
-	{ 0x40588, 0x10 },
-	{ 0x4058c, 0xf0 },
-	{ 0x405c0, 0x10 },
-	{ 0x405c4, 0xeb },
-	{ 0x405c8, 0x10 },
-	{ 0x405cc, 0xf0 },
-	{ 0x40800, 0x0 },
-	{ 0x40804, 0x151515 },
-	{ 0x40808, 0x1d1d1d },
-	{ 0x4080c, 0x232323 },
-	{ 0x40810, 0x272727 },
-	{ 0x40814, 0x2b2b2b },
-	{ 0x40818, 0x2f2f2f },
-	{ 0x4081c, 0x333333 },
-	{ 0x40820, 0x363636 },
-	{ 0x40824, 0x393939 },
-	{ 0x40828, 0x3b3b3b },
-	{ 0x4082c, 0x3e3e3e },
-	{ 0x40830, 0x404040 },
-	{ 0x40834, 0x434343 },
-	{ 0x40838, 0x454545 },
-	{ 0x4083c, 0x474747 },
-	{ 0x40840, 0x494949 },
-	{ 0x40844, 0x4b4b4b },
-	{ 0x40848, 0x4d4d4d },
-	{ 0x4084c, 0x4f4f4f },
-	{ 0x40850, 0x515151 },
-	{ 0x40854, 0x535353 },
-	{ 0x40858, 0x555555 },
-	{ 0x4085c, 0x565656 },
-	{ 0x40860, 0x585858 },
-	{ 0x40864, 0x5a5a5a },
-	{ 0x40868, 0x5b5b5b },
-	{ 0x4086c, 0x5d5d5d },
-	{ 0x40870, 0x5e5e5e },
-	{ 0x40874, 0x606060 },
-	{ 0x40878, 0x616161 },
-	{ 0x4087c, 0x636363 },
-	{ 0x40880, 0x646464 },
-	{ 0x40884, 0x666666 },
-	{ 0x40888, 0x676767 },
-	{ 0x4088c, 0x686868 },
-	{ 0x40890, 0x6a6a6a },
-	{ 0x40894, 0x6b6b6b },
-	{ 0x40898, 0x6c6c6c },
-	{ 0x4089c, 0x6e6e6e },
-	{ 0x408a0, 0x6f6f6f },
-	{ 0x408a4, 0x707070 },
-	{ 0x408a8, 0x717171 },
-	{ 0x408ac, 0x727272 },
-	{ 0x408b0, 0x747474 },
-	{ 0x408b4, 0x757575 },
-	{ 0x408b8, 0x767676 },
-	{ 0x408bc, 0x777777 },
-	{ 0x408c0, 0x787878 },
-	{ 0x408c4, 0x797979 },
-	{ 0x408c8, 0x7a7a7a },
-	{ 0x408cc, 0x7c7c7c },
-	{ 0x408d0, 0x7d7d7d },
-	{ 0x408d4, 0x7e7e7e },
-	{ 0x408d8, 0x7f7f7f },
-	{ 0x408dc, 0x808080 },
-	{ 0x408e0, 0x818181 },
-	{ 0x408e4, 0x828282 },
-	{ 0x408e8, 0x838383 },
-	{ 0x408ec, 0x848484 },
-	{ 0x408f0, 0x858585 },
-	{ 0x408f4, 0x868686 },
-	{ 0x408f8, 0x878787 },
-	{ 0x408fc, 0x888888 },
-	{ 0x40900, 0x898989 },
-	{ 0x40904, 0x8a8a8a },
-	{ 0x40908, 0x8b8b8b },
-	{ 0x4090c, 0x8c8c8c },
-	{ 0x40910, 0x8d8d8d },
-	{ 0x40914, 0x8e8e8e },
-	{ 0x40918, 0x8f8f8f },
-	{ 0x4091c, 0x8f8f8f },
-	{ 0x40920, 0x909090 },
-	{ 0x40924, 0x919191 },
-	{ 0x40928, 0x929292 },
-	{ 0x4092c, 0x939393 },
-	{ 0x40930, 0x949494 },
-	{ 0x40934, 0x959595 },
-	{ 0x40938, 0x969696 },
-	{ 0x4093c, 0x969696 },
-	{ 0x40940, 0x979797 },
-	{ 0x40944, 0x989898 },
-	{ 0x40948, 0x999999 },
-	{ 0x4094c, 0x9a9a9a },
-	{ 0x40950, 0x9b9b9b },
-	{ 0x40954, 0x9c9c9c },
-	{ 0x40958, 0x9c9c9c },
-	{ 0x4095c, 0x9d9d9d },
-	{ 0x40960, 0x9e9e9e },
-	{ 0x40964, 0x9f9f9f },
-	{ 0x40968, 0xa0a0a0 },
-	{ 0x4096c, 0xa0a0a0 },
-	{ 0x40970, 0xa1a1a1 },
-	{ 0x40974, 0xa2a2a2 },
-	{ 0x40978, 0xa3a3a3 },
-	{ 0x4097c, 0xa4a4a4 },
-	{ 0x40980, 0xa4a4a4 },
-	{ 0x40984, 0xa5a5a5 },
-	{ 0x40988, 0xa6a6a6 },
-	{ 0x4098c, 0xa7a7a7 },
-	{ 0x40990, 0xa7a7a7 },
-	{ 0x40994, 0xa8a8a8 },
-	{ 0x40998, 0xa9a9a9 },
-	{ 0x4099c, 0xaaaaaa },
-	{ 0x409a0, 0xaaaaaa },
-	{ 0x409a4, 0xababab },
-	{ 0x409a8, 0xacacac },
-	{ 0x409ac, 0xadadad },
-	{ 0x409b0, 0xadadad },
-	{ 0x409b4, 0xaeaeae },
-	{ 0x409b8, 0xafafaf },
-	{ 0x409bc, 0xafafaf },
-	{ 0x409c0, 0xb0b0b0 },
-	{ 0x409c4, 0xb1b1b1 },
-	{ 0x409c8, 0xb2b2b2 },
-	{ 0x409cc, 0xb2b2b2 },
-	{ 0x409d0, 0xb3b3b3 },
-	{ 0x409d4, 0xb4b4b4 },
-	{ 0x409d8, 0xb4b4b4 },
-	{ 0x409dc, 0xb5b5b5 },
-	{ 0x409e0, 0xb6b6b6 },
-	{ 0x409e4, 0xb6b6b6 },
-	{ 0x409e8, 0xb7b7b7 },
-	{ 0x409ec, 0xb8b8b8 },
-	{ 0x409f0, 0xb8b8b8 },
-	{ 0x409f4, 0xb9b9b9 },
-	{ 0x409f8, 0xbababa },
-	{ 0x409fc, 0xbababa },
-	{ 0x40a00, 0xbbbbbb },
-	{ 0x40a04, 0xbcbcbc },
-	{ 0x40a08, 0xbcbcbc },
-	{ 0x40a0c, 0xbdbdbd },
-	{ 0x40a10, 0xbebebe },
-	{ 0x40a14, 0xbebebe },
-	{ 0x40a18, 0xbfbfbf },
-	{ 0x40a1c, 0xc0c0c0 },
-	{ 0x40a20, 0xc0c0c0 },
-	{ 0x40a24, 0xc1c1c1 },
-	{ 0x40a28, 0xc1c1c1 },
-	{ 0x40a2c, 0xc2c2c2 },
-	{ 0x40a30, 0xc3c3c3 },
-	{ 0x40a34, 0xc3c3c3 },
-	{ 0x40a38, 0xc4c4c4 },
-	{ 0x40a3c, 0xc5c5c5 },
-	{ 0x40a40, 0xc5c5c5 },
-	{ 0x40a44, 0xc6c6c6 },
-	{ 0x40a48, 0xc6c6c6 },
-	{ 0x40a4c, 0xc7c7c7 },
-	{ 0x40a50, 0xc8c8c8 },
-	{ 0x40a54, 0xc8c8c8 },
-	{ 0x40a58, 0xc9c9c9 },
-	{ 0x40a5c, 0xc9c9c9 },
-	{ 0x40a60, 0xcacaca },
-	{ 0x40a64, 0xcbcbcb },
-	{ 0x40a68, 0xcbcbcb },
-	{ 0x40a6c, 0xcccccc },
-	{ 0x40a70, 0xcccccc },
-	{ 0x40a74, 0xcdcdcd },
-	{ 0x40a78, 0xcecece },
-	{ 0x40a7c, 0xcecece },
-	{ 0x40a80, 0xcfcfcf },
-	{ 0x40a84, 0xcfcfcf },
-	{ 0x40a88, 0xd0d0d0 },
-	{ 0x40a8c, 0xd0d0d0 },
-	{ 0x40a90, 0xd1d1d1 },
-	{ 0x40a94, 0xd2d2d2 },
-	{ 0x40a98, 0xd2d2d2 },
-	{ 0x40a9c, 0xd3d3d3 },
-	{ 0x40aa0, 0xd3d3d3 },
-	{ 0x40aa4, 0xd4d4d4 },
-	{ 0x40aa8, 0xd4d4d4 },
-	{ 0x40aac, 0xd5d5d5 },
-	{ 0x40ab0, 0xd6d6d6 },
-	{ 0x40ab4, 0xd6d6d6 },
-	{ 0x40ab8, 0xd7d7d7 },
-	{ 0x40abc, 0xd7d7d7 },
-	{ 0x40ac0, 0xd8d8d8 },
-	{ 0x40ac4, 0xd8d8d8 },
-	{ 0x40ac8, 0xd9d9d9 },
-	{ 0x40acc, 0xd9d9d9 },
-	{ 0x40ad0, 0xdadada },
-	{ 0x40ad4, 0xdbdbdb },
-	{ 0x40ad8, 0xdbdbdb },
-	{ 0x40adc, 0xdcdcdc },
-	{ 0x40ae0, 0xdcdcdc },
-	{ 0x40ae4, 0xdddddd },
-	{ 0x40ae8, 0xdddddd },
-	{ 0x40aec, 0xdedede },
-	{ 0x40af0, 0xdedede },
-	{ 0x40af4, 0xdfdfdf },
-	{ 0x40af8, 0xdfdfdf },
-	{ 0x40afc, 0xe0e0e0 },
-	{ 0x40b00, 0xe0e0e0 },
-	{ 0x40b04, 0xe1e1e1 },
-	{ 0x40b08, 0xe1e1e1 },
-	{ 0x40b0c, 0xe2e2e2 },
-	{ 0x40b10, 0xe3e3e3 },
-	{ 0x40b14, 0xe3e3e3 },
-	{ 0x40b18, 0xe4e4e4 },
-	{ 0x40b1c, 0xe4e4e4 },
-	{ 0x40b20, 0xe5e5e5 },
-	{ 0x40b24, 0xe5e5e5 },
-	{ 0x40b28, 0xe6e6e6 },
-	{ 0x40b2c, 0xe6e6e6 },
-	{ 0x40b30, 0xe7e7e7 },
-	{ 0x40b34, 0xe7e7e7 },
-	{ 0x40b38, 0xe8e8e8 },
-	{ 0x40b3c, 0xe8e8e8 },
-	{ 0x40b40, 0xe9e9e9 },
-	{ 0x40b44, 0xe9e9e9 },
-	{ 0x40b48, 0xeaeaea },
-	{ 0x40b4c, 0xeaeaea },
-	{ 0x40b50, 0xebebeb },
-	{ 0x40b54, 0xebebeb },
-	{ 0x40b58, 0xececec },
-	{ 0x40b5c, 0xececec },
-	{ 0x40b60, 0xededed },
-	{ 0x40b64, 0xededed },
-	{ 0x40b68, 0xeeeeee },
-	{ 0x40b6c, 0xeeeeee },
-	{ 0x40b70, 0xefefef },
-	{ 0x40b74, 0xefefef },
-	{ 0x40b78, 0xf0f0f0 },
-	{ 0x40b7c, 0xf0f0f0 },
-	{ 0x40b80, 0xf1f1f1 },
-	{ 0x40b84, 0xf1f1f1 },
-	{ 0x40b88, 0xf2f2f2 },
-	{ 0x40b8c, 0xf2f2f2 },
-	{ 0x40b90, 0xf2f2f2 },
-	{ 0x40b94, 0xf3f3f3 },
-	{ 0x40b98, 0xf3f3f3 },
-	{ 0x40b9c, 0xf4f4f4 },
-	{ 0x40ba0, 0xf4f4f4 },
-	{ 0x40ba4, 0xf5f5f5 },
-	{ 0x40ba8, 0xf5f5f5 },
-	{ 0x40bac, 0xf6f6f6 },
-	{ 0x40bb0, 0xf6f6f6 },
-	{ 0x40bb4, 0xf7f7f7 },
-	{ 0x40bb8, 0xf7f7f7 },
-	{ 0x40bbc, 0xf8f8f8 },
-	{ 0x40bc0, 0xf8f8f8 },
-	{ 0x40bc4, 0xf9f9f9 },
-	{ 0x40bc8, 0xf9f9f9 },
-	{ 0x40bcc, 0xfafafa },
-	{ 0x40bd0, 0xfafafa },
-	{ 0x40bd4, 0xfafafa },
-	{ 0x40bd8, 0xfbfbfb },
-	{ 0x40bdc, 0xfbfbfb },
-	{ 0x40be0, 0xfcfcfc },
-	{ 0x40be4, 0xfcfcfc },
-	{ 0x40be8, 0xfdfdfd },
-	{ 0x40bec, 0xfdfdfd },
-	{ 0x40bf0, 0xfefefe },
-	{ 0x40bf4, 0xfefefe },
-	{ 0x40bf8, 0xffffff },
-	{ 0x40bfc, 0xffffff },
-	{ 0x40c00, 0x0 },
-	{ 0x40c04, 0x0 },
-	{ 0x40c08, 0x0 },
-	{ 0x40c0c, 0x0 },
-	{ 0x40c10, 0x0 },
-	{ 0x40c14, 0x0 },
-	{ 0x40c18, 0x0 },
-	{ 0x40c1c, 0x0 },
-	{ 0x40c20, 0x0 },
-	{ 0x40c24, 0x0 },
-	{ 0x40c28, 0x0 },
-	{ 0x40c2c, 0x0 },
-	{ 0x40c30, 0x0 },
-	{ 0x40c34, 0x0 },
-	{ 0x40c38, 0x0 },
-	{ 0x40c3c, 0x0 },
-	{ 0x40c40, 0x10101 },
-	{ 0x40c44, 0x10101 },
-	{ 0x40c48, 0x10101 },
-	{ 0x40c4c, 0x10101 },
-	{ 0x40c50, 0x10101 },
-	{ 0x40c54, 0x10101 },
-	{ 0x40c58, 0x10101 },
-	{ 0x40c5c, 0x10101 },
-	{ 0x40c60, 0x10101 },
-	{ 0x40c64, 0x10101 },
-	{ 0x40c68, 0x20202 },
-	{ 0x40c6c, 0x20202 },
-	{ 0x40c70, 0x20202 },
-	{ 0x40c74, 0x20202 },
-	{ 0x40c78, 0x20202 },
-	{ 0x40c7c, 0x20202 },
-	{ 0x40c80, 0x30303 },
-	{ 0x40c84, 0x30303 },
-	{ 0x40c88, 0x30303 },
-	{ 0x40c8c, 0x30303 },
-	{ 0x40c90, 0x30303 },
-	{ 0x40c94, 0x40404 },
-	{ 0x40c98, 0x40404 },
-	{ 0x40c9c, 0x40404 },
-	{ 0x40ca0, 0x40404 },
-	{ 0x40ca4, 0x40404 },
-	{ 0x40ca8, 0x50505 },
-	{ 0x40cac, 0x50505 },
-	{ 0x40cb0, 0x50505 },
-	{ 0x40cb4, 0x50505 },
-	{ 0x40cb8, 0x60606 },
-	{ 0x40cbc, 0x60606 },
-	{ 0x40cc0, 0x60606 },
-	{ 0x40cc4, 0x70707 },
-	{ 0x40cc8, 0x70707 },
-	{ 0x40ccc, 0x70707 },
-	{ 0x40cd0, 0x70707 },
-	{ 0x40cd4, 0x80808 },
-	{ 0x40cd8, 0x80808 },
-	{ 0x40cdc, 0x80808 },
-	{ 0x40ce0, 0x90909 },
-	{ 0x40ce4, 0x90909 },
-	{ 0x40ce8, 0xa0a0a },
-	{ 0x40cec, 0xa0a0a },
-	{ 0x40cf0, 0xa0a0a },
-	{ 0x40cf4, 0xb0b0b },
-	{ 0x40cf8, 0xb0b0b },
-	{ 0x40cfc, 0xb0b0b },
-	{ 0x40d00, 0xc0c0c },
-	{ 0x40d04, 0xc0c0c },
-	{ 0x40d08, 0xd0d0d },
-	{ 0x40d0c, 0xd0d0d },
-	{ 0x40d10, 0xe0e0e },
-	{ 0x40d14, 0xe0e0e },
-	{ 0x40d18, 0xe0e0e },
-	{ 0x40d1c, 0xf0f0f },
-	{ 0x40d20, 0xf0f0f },
-	{ 0x40d24, 0x101010 },
-	{ 0x40d28, 0x101010 },
-	{ 0x40d2c, 0x111111 },
-	{ 0x40d30, 0x111111 },
-	{ 0x40d34, 0x121212 },
-	{ 0x40d38, 0x121212 },
-	{ 0x40d3c, 0x131313 },
-	{ 0x40d40, 0x131313 },
-	{ 0x40d44, 0x141414 },
-	{ 0x40d48, 0x151515 },
-	{ 0x40d4c, 0x151515 },
-	{ 0x40d50, 0x161616 },
-	{ 0x40d54, 0x161616 },
-	{ 0x40d58, 0x171717 },
-	{ 0x40d5c, 0x171717 },
-	{ 0x40d60, 0x181818 },
-	{ 0x40d64, 0x191919 },
-	{ 0x40d68, 0x191919 },
-	{ 0x40d6c, 0x1a1a1a },
-	{ 0x40d70, 0x1b1b1b },
-	{ 0x40d74, 0x1b1b1b },
-	{ 0x40d78, 0x1c1c1c },
-	{ 0x40d7c, 0x1c1c1c },
-	{ 0x40d80, 0x1d1d1d },
-	{ 0x40d84, 0x1e1e1e },
-	{ 0x40d88, 0x1f1f1f },
-	{ 0x40d8c, 0x1f1f1f },
-	{ 0x40d90, 0x202020 },
-	{ 0x40d94, 0x212121 },
-	{ 0x40d98, 0x212121 },
-	{ 0x40d9c, 0x222222 },
-	{ 0x40da0, 0x232323 },
-	{ 0x40da4, 0x242424 },
-	{ 0x40da8, 0x242424 },
-	{ 0x40dac, 0x252525 },
-	{ 0x40db0, 0x262626 },
-	{ 0x40db4, 0x272727 },
-	{ 0x40db8, 0x272727 },
-	{ 0x40dbc, 0x282828 },
-	{ 0x40dc0, 0x292929 },
-	{ 0x40dc4, 0x2a2a2a },
-	{ 0x40dc8, 0x2b2b2b },
-	{ 0x40dcc, 0x2c2c2c },
-	{ 0x40dd0, 0x2c2c2c },
-	{ 0x40dd4, 0x2d2d2d },
-	{ 0x40dd8, 0x2e2e2e },
-	{ 0x40ddc, 0x2f2f2f },
-	{ 0x40de0, 0x303030 },
-	{ 0x40de4, 0x313131 },
-	{ 0x40de8, 0x323232 },
-	{ 0x40dec, 0x333333 },
-	{ 0x40df0, 0x333333 },
-	{ 0x40df4, 0x343434 },
-	{ 0x40df8, 0x353535 },
-	{ 0x40dfc, 0x363636 },
-	{ 0x40e00, 0x373737 },
-	{ 0x40e04, 0x383838 },
-	{ 0x40e08, 0x393939 },
-	{ 0x40e0c, 0x3a3a3a },
-	{ 0x40e10, 0x3b3b3b },
-	{ 0x40e14, 0x3c3c3c },
-	{ 0x40e18, 0x3d3d3d },
-	{ 0x40e1c, 0x3e3e3e },
-	{ 0x40e20, 0x3f3f3f },
-	{ 0x40e24, 0x404040 },
-	{ 0x40e28, 0x414141 },
-	{ 0x40e2c, 0x424242 },
-	{ 0x40e30, 0x434343 },
-	{ 0x40e34, 0x444444 },
-	{ 0x40e38, 0x464646 },
-	{ 0x40e3c, 0x474747 },
-	{ 0x40e40, 0x484848 },
-	{ 0x40e44, 0x494949 },
-	{ 0x40e48, 0x4a4a4a },
-	{ 0x40e4c, 0x4b4b4b },
-	{ 0x40e50, 0x4c4c4c },
-	{ 0x40e54, 0x4d4d4d },
-	{ 0x40e58, 0x4f4f4f },
-	{ 0x40e5c, 0x505050 },
-	{ 0x40e60, 0x515151 },
-	{ 0x40e64, 0x525252 },
-	{ 0x40e68, 0x535353 },
-	{ 0x40e6c, 0x545454 },
-	{ 0x40e70, 0x565656 },
-	{ 0x40e74, 0x575757 },
-	{ 0x40e78, 0x585858 },
-	{ 0x40e7c, 0x595959 },
-	{ 0x40e80, 0x5b5b5b },
-	{ 0x40e84, 0x5c5c5c },
-	{ 0x40e88, 0x5d5d5d },
-	{ 0x40e8c, 0x5e5e5e },
-	{ 0x40e90, 0x606060 },
-	{ 0x40e94, 0x616161 },
-	{ 0x40e98, 0x626262 },
-	{ 0x40e9c, 0x646464 },
-	{ 0x40ea0, 0x656565 },
-	{ 0x40ea4, 0x666666 },
-	{ 0x40ea8, 0x686868 },
-	{ 0x40eac, 0x696969 },
-	{ 0x40eb0, 0x6a6a6a },
-	{ 0x40eb4, 0x6c6c6c },
-	{ 0x40eb8, 0x6d6d6d },
-	{ 0x40ebc, 0x6f6f6f },
-	{ 0x40ec0, 0x707070 },
-	{ 0x40ec4, 0x717171 },
-	{ 0x40ec8, 0x737373 },
-	{ 0x40ecc, 0x747474 },
-	{ 0x40ed0, 0x767676 },
-	{ 0x40ed4, 0x777777 },
-	{ 0x40ed8, 0x797979 },
-	{ 0x40edc, 0x7a7a7a },
-	{ 0x40ee0, 0x7c7c7c },
-	{ 0x40ee4, 0x7d7d7d },
-	{ 0x40ee8, 0x7f7f7f },
-	{ 0x40eec, 0x808080 },
-	{ 0x40ef0, 0x828282 },
-	{ 0x40ef4, 0x838383 },
-	{ 0x40ef8, 0x858585 },
-	{ 0x40efc, 0x868686 },
-	{ 0x40f00, 0x888888 },
-	{ 0x40f04, 0x898989 },
-	{ 0x40f08, 0x8b8b8b },
-	{ 0x40f0c, 0x8d8d8d },
-	{ 0x40f10, 0x8e8e8e },
-	{ 0x40f14, 0x909090 },
-	{ 0x40f18, 0x919191 },
-	{ 0x40f1c, 0x939393 },
-	{ 0x40f20, 0x959595 },
-	{ 0x40f24, 0x969696 },
-	{ 0x40f28, 0x989898 },
-	{ 0x40f2c, 0x9a9a9a },
-	{ 0x40f30, 0x9b9b9b },
-	{ 0x40f34, 0x9d9d9d },
-	{ 0x40f38, 0x9f9f9f },
-	{ 0x40f3c, 0xa1a1a1 },
-	{ 0x40f40, 0xa2a2a2 },
-	{ 0x40f44, 0xa4a4a4 },
-	{ 0x40f48, 0xa6a6a6 },
-	{ 0x40f4c, 0xa7a7a7 },
-	{ 0x40f50, 0xa9a9a9 },
-	{ 0x40f54, 0xababab },
-	{ 0x40f58, 0xadadad },
-	{ 0x40f5c, 0xafafaf },
-	{ 0x40f60, 0xb0b0b0 },
-	{ 0x40f64, 0xb2b2b2 },
-	{ 0x40f68, 0xb4b4b4 },
-	{ 0x40f6c, 0xb6b6b6 },
-	{ 0x40f70, 0xb8b8b8 },
-	{ 0x40f74, 0xbababa },
-	{ 0x40f78, 0xbbbbbb },
-	{ 0x40f7c, 0xbdbdbd },
-	{ 0x40f80, 0xbfbfbf },
-	{ 0x40f84, 0xc1c1c1 },
-	{ 0x40f88, 0xc3c3c3 },
-	{ 0x40f8c, 0xc5c5c5 },
-	{ 0x40f90, 0xc7c7c7 },
-	{ 0x40f94, 0xc9c9c9 },
-	{ 0x40f98, 0xcbcbcb },
-	{ 0x40f9c, 0xcdcdcd },
-	{ 0x40fa0, 0xcfcfcf },
-	{ 0x40fa4, 0xd1d1d1 },
-	{ 0x40fa8, 0xd3d3d3 },
-	{ 0x40fac, 0xd5d5d5 },
-	{ 0x40fb0, 0xd7d7d7 },
-	{ 0x40fb4, 0xd9d9d9 },
-	{ 0x40fb8, 0xdbdbdb },
-	{ 0x40fbc, 0xdddddd },
-	{ 0x40fc0, 0xdfdfdf },
-	{ 0x40fc4, 0xe1e1e1 },
-	{ 0x40fc8, 0xe3e3e3 },
-	{ 0x40fcc, 0xe5e5e5 },
-	{ 0x40fd0, 0xe7e7e7 },
-	{ 0x40fd4, 0xe9e9e9 },
-	{ 0x40fd8, 0xebebeb },
-	{ 0x40fdc, 0xeeeeee },
-	{ 0x40fe0, 0xf0f0f0 },
-	{ 0x40fe4, 0xf2f2f2 },
-	{ 0x40fe8, 0xf4f4f4 },
-	{ 0x40fec, 0xf6f6f6 },
-	{ 0x40ff0, 0xf8f8f8 },
-	{ 0x40ff4, 0xfbfbfb },
-	{ 0x40ff8, 0xfdfdfd },
-	{ 0x40ffc, 0xffffff },
-};
diff --git a/drivers/video/fbdev/msm/mdp_hw.h b/drivers/video/fbdev/msm/mdp_hw.h
deleted file mode 100644
index 35848d741001..000000000000
--- a/drivers/video/fbdev/msm/mdp_hw.h
+++ /dev/null
@@ -1,627 +0,0 @@
-/* drivers/video/msm_fb/mdp_hw.h
- *
- * Copyright (C) 2007 QUALCOMM Incorporated
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-#ifndef _MDP_HW_H_
-#define _MDP_HW_H_
-
-#include <linux/platform_data/video-msm_fb.h>
-
-struct mdp_info {
-	struct mdp_device mdp_dev;
-	char * __iomem base;
-	int irq;
-};
-struct mdp_blit_req;
-struct mdp_device;
-int mdp_ppp_blit(const struct mdp_info *mdp, struct mdp_blit_req *req,
-		 struct file *src_file, unsigned long src_start,
-		 unsigned long src_len, struct file *dst_file,
-		 unsigned long dst_start, unsigned long dst_len);
-#define mdp_writel(mdp, value, offset) writel(value, mdp->base + offset)
-#define mdp_readl(mdp, offset) readl(mdp->base + offset)
-
-#define MDP_SYNC_CONFIG_0                (0x00000)
-#define MDP_SYNC_CONFIG_1                (0x00004)
-#define MDP_SYNC_CONFIG_2                (0x00008)
-#define MDP_SYNC_STATUS_0                (0x0000c)
-#define MDP_SYNC_STATUS_1                (0x00010)
-#define MDP_SYNC_STATUS_2                (0x00014)
-#define MDP_SYNC_THRESH_0                (0x00018)
-#define MDP_SYNC_THRESH_1                (0x0001c)
-#define MDP_INTR_ENABLE                  (0x00020)
-#define MDP_INTR_STATUS                  (0x00024)
-#define MDP_INTR_CLEAR                   (0x00028)
-#define MDP_DISPLAY0_START               (0x00030)
-#define MDP_DISPLAY1_START               (0x00034)
-#define MDP_DISPLAY_STATUS               (0x00038)
-#define MDP_EBI2_LCD0                    (0x0003c)
-#define MDP_EBI2_LCD1                    (0x00040)
-#define MDP_DISPLAY0_ADDR                (0x00054)
-#define MDP_DISPLAY1_ADDR                (0x00058)
-#define MDP_EBI2_PORTMAP_MODE            (0x0005c)
-#define MDP_MODE                         (0x00060)
-#define MDP_TV_OUT_STATUS                (0x00064)
-#define MDP_HW_VERSION                   (0x00070)
-#define MDP_SW_RESET                     (0x00074)
-#define MDP_AXI_ERROR_MASTER_STOP        (0x00078)
-#define MDP_SEL_CLK_OR_HCLK_TEST_BUS     (0x0007c)
-#define MDP_PRIMARY_VSYNC_OUT_CTRL       (0x00080)
-#define MDP_SECONDARY_VSYNC_OUT_CTRL     (0x00084)
-#define MDP_EXTERNAL_VSYNC_OUT_CTRL      (0x00088)
-#define MDP_VSYNC_CTRL                   (0x0008c)
-#define MDP_CGC_EN                       (0x00100)
-#define MDP_CMD_STATUS                   (0x10008)
-#define MDP_PROFILE_EN                   (0x10010)
-#define MDP_PROFILE_COUNT                (0x10014)
-#define MDP_DMA_START                    (0x10044)
-#define MDP_FULL_BYPASS_WORD0            (0x10100)
-#define MDP_FULL_BYPASS_WORD1            (0x10104)
-#define MDP_COMMAND_CONFIG               (0x10104)
-#define MDP_FULL_BYPASS_WORD2            (0x10108)
-#define MDP_FULL_BYPASS_WORD3            (0x1010c)
-#define MDP_FULL_BYPASS_WORD4            (0x10110)
-#define MDP_FULL_BYPASS_WORD6            (0x10118)
-#define MDP_FULL_BYPASS_WORD7            (0x1011c)
-#define MDP_FULL_BYPASS_WORD8            (0x10120)
-#define MDP_FULL_BYPASS_WORD9            (0x10124)
-#define MDP_PPP_SOURCE_CONFIG            (0x10124)
-#define MDP_FULL_BYPASS_WORD10           (0x10128)
-#define MDP_FULL_BYPASS_WORD11           (0x1012c)
-#define MDP_FULL_BYPASS_WORD12           (0x10130)
-#define MDP_FULL_BYPASS_WORD13           (0x10134)
-#define MDP_FULL_BYPASS_WORD14           (0x10138)
-#define MDP_PPP_OPERATION_CONFIG         (0x10138)
-#define MDP_FULL_BYPASS_WORD15           (0x1013c)
-#define MDP_FULL_BYPASS_WORD16           (0x10140)
-#define MDP_FULL_BYPASS_WORD17           (0x10144)
-#define MDP_FULL_BYPASS_WORD18           (0x10148)
-#define MDP_FULL_BYPASS_WORD19           (0x1014c)
-#define MDP_FULL_BYPASS_WORD20           (0x10150)
-#define MDP_PPP_DESTINATION_CONFIG       (0x10150)
-#define MDP_FULL_BYPASS_WORD21           (0x10154)
-#define MDP_FULL_BYPASS_WORD22           (0x10158)
-#define MDP_FULL_BYPASS_WORD23           (0x1015c)
-#define MDP_FULL_BYPASS_WORD24           (0x10160)
-#define MDP_FULL_BYPASS_WORD25           (0x10164)
-#define MDP_FULL_BYPASS_WORD26           (0x10168)
-#define MDP_FULL_BYPASS_WORD27           (0x1016c)
-#define MDP_FULL_BYPASS_WORD29           (0x10174)
-#define MDP_FULL_BYPASS_WORD30           (0x10178)
-#define MDP_FULL_BYPASS_WORD31           (0x1017c)
-#define MDP_FULL_BYPASS_WORD32           (0x10180)
-#define MDP_DMA_CONFIG                   (0x10180)
-#define MDP_FULL_BYPASS_WORD33           (0x10184)
-#define MDP_FULL_BYPASS_WORD34           (0x10188)
-#define MDP_FULL_BYPASS_WORD35           (0x1018c)
-#define MDP_FULL_BYPASS_WORD37           (0x10194)
-#define MDP_FULL_BYPASS_WORD39           (0x1019c)
-#define MDP_FULL_BYPASS_WORD40           (0x101a0)
-#define MDP_FULL_BYPASS_WORD41           (0x101a4)
-#define MDP_FULL_BYPASS_WORD43           (0x101ac)
-#define MDP_FULL_BYPASS_WORD46           (0x101b8)
-#define MDP_FULL_BYPASS_WORD47           (0x101bc)
-#define MDP_FULL_BYPASS_WORD48           (0x101c0)
-#define MDP_FULL_BYPASS_WORD49           (0x101c4)
-#define MDP_FULL_BYPASS_WORD50           (0x101c8)
-#define MDP_FULL_BYPASS_WORD51           (0x101cc)
-#define MDP_FULL_BYPASS_WORD52           (0x101d0)
-#define MDP_FULL_BYPASS_WORD53           (0x101d4)
-#define MDP_FULL_BYPASS_WORD54           (0x101d8)
-#define MDP_FULL_BYPASS_WORD55           (0x101dc)
-#define MDP_FULL_BYPASS_WORD56           (0x101e0)
-#define MDP_FULL_BYPASS_WORD57           (0x101e4)
-#define MDP_FULL_BYPASS_WORD58           (0x101e8)
-#define MDP_FULL_BYPASS_WORD59           (0x101ec)
-#define MDP_FULL_BYPASS_WORD60           (0x101f0)
-#define MDP_VSYNC_THRESHOLD              (0x101f0)
-#define MDP_FULL_BYPASS_WORD61           (0x101f4)
-#define MDP_FULL_BYPASS_WORD62           (0x101f8)
-#define MDP_FULL_BYPASS_WORD63           (0x101fc)
-#define MDP_TFETCH_TEST_MODE             (0x20004)
-#define MDP_TFETCH_STATUS                (0x20008)
-#define MDP_TFETCH_TILE_COUNT            (0x20010)
-#define MDP_TFETCH_FETCH_COUNT           (0x20014)
-#define MDP_TFETCH_CONSTANT_COLOR        (0x20040)
-#define MDP_CSC_BYPASS                   (0x40004)
-#define MDP_SCALE_COEFF_LSB              (0x5fffc)
-#define MDP_TV_OUT_CTL                   (0xc0000)
-#define MDP_TV_OUT_FIR_COEFF             (0xc0004)
-#define MDP_TV_OUT_BUF_ADDR              (0xc0008)
-#define MDP_TV_OUT_CC_DATA               (0xc000c)
-#define MDP_TV_OUT_SOBEL                 (0xc0010)
-#define MDP_TV_OUT_Y_CLAMP               (0xc0018)
-#define MDP_TV_OUT_CB_CLAMP              (0xc001c)
-#define MDP_TV_OUT_CR_CLAMP              (0xc0020)
-#define MDP_TEST_MODE_CLK                (0xd0000)
-#define MDP_TEST_MISR_RESET_CLK          (0xd0004)
-#define MDP_TEST_EXPORT_MISR_CLK         (0xd0008)
-#define MDP_TEST_MISR_CURR_VAL_CLK       (0xd000c)
-#define MDP_TEST_MODE_HCLK               (0xd0100)
-#define MDP_TEST_MISR_RESET_HCLK         (0xd0104)
-#define MDP_TEST_EXPORT_MISR_HCLK        (0xd0108)
-#define MDP_TEST_MISR_CURR_VAL_HCLK      (0xd010c)
-#define MDP_TEST_MODE_DCLK               (0xd0200)
-#define MDP_TEST_MISR_RESET_DCLK         (0xd0204)
-#define MDP_TEST_EXPORT_MISR_DCLK        (0xd0208)
-#define MDP_TEST_MISR_CURR_VAL_DCLK      (0xd020c)
-#define MDP_TEST_CAPTURED_DCLK           (0xd0210)
-#define MDP_TEST_MISR_CAPT_VAL_DCLK      (0xd0214)
-#define MDP_LCDC_CTL                     (0xe0000)
-#define MDP_LCDC_HSYNC_CTL               (0xe0004)
-#define MDP_LCDC_VSYNC_CTL               (0xe0008)
-#define MDP_LCDC_ACTIVE_HCTL             (0xe000c)
-#define MDP_LCDC_ACTIVE_VCTL             (0xe0010)
-#define MDP_LCDC_BORDER_CLR              (0xe0014)
-#define MDP_LCDC_H_BLANK                 (0xe0018)
-#define MDP_LCDC_V_BLANK                 (0xe001c)
-#define MDP_LCDC_UNDERFLOW_CLR           (0xe0020)
-#define MDP_LCDC_HSYNC_SKEW              (0xe0024)
-#define MDP_LCDC_TEST_CTL                (0xe0028)
-#define MDP_LCDC_LINE_IRQ                (0xe002c)
-#define MDP_LCDC_CTL_POLARITY            (0xe0030)
-#define MDP_LCDC_DMA_CONFIG              (0xe1000)
-#define MDP_LCDC_DMA_SIZE                (0xe1004)
-#define MDP_LCDC_DMA_IBUF_ADDR           (0xe1008)
-#define MDP_LCDC_DMA_IBUF_Y_STRIDE       (0xe100c)
-
-
-#define MDP_DMA2_TERM 0x1
-#define MDP_DMA3_TERM 0x2
-#define MDP_PPP_TERM 0x3
-
-/* MDP_INTR_ENABLE */
-#define DL0_ROI_DONE           (1<<0)
-#define DL1_ROI_DONE           (1<<1)
-#define DL0_DMA2_TERM_DONE     (1<<2)
-#define DL1_DMA2_TERM_DONE     (1<<3)
-#define DL0_PPP_TERM_DONE      (1<<4)
-#define DL1_PPP_TERM_DONE      (1<<5)
-#define TV_OUT_DMA3_DONE       (1<<6)
-#define TV_ENC_UNDERRUN        (1<<7)
-#define DL0_FETCH_DONE         (1<<11)
-#define DL1_FETCH_DONE         (1<<12)
-
-#define MDP_PPP_BUSY_STATUS (DL0_ROI_DONE| \
-			   DL1_ROI_DONE| \
-			   DL0_PPP_TERM_DONE| \
-			   DL1_PPP_TERM_DONE)
-
-#define MDP_ANY_INTR_MASK (DL0_ROI_DONE| \
-			   DL1_ROI_DONE| \
-			   DL0_DMA2_TERM_DONE| \
-			   DL1_DMA2_TERM_DONE| \
-			   DL0_PPP_TERM_DONE| \
-			   DL1_PPP_TERM_DONE| \
-			   DL0_FETCH_DONE| \
-			   DL1_FETCH_DONE| \
-			   TV_ENC_UNDERRUN)
-
-#define MDP_TOP_LUMA       16
-#define MDP_TOP_CHROMA     0
-#define MDP_BOTTOM_LUMA    19
-#define MDP_BOTTOM_CHROMA  3
-#define MDP_LEFT_LUMA      22
-#define MDP_LEFT_CHROMA    6
-#define MDP_RIGHT_LUMA     25
-#define MDP_RIGHT_CHROMA   9
-
-#define CLR_G 0x0
-#define CLR_B 0x1
-#define CLR_R 0x2
-#define CLR_ALPHA 0x3
-
-#define CLR_Y  CLR_G
-#define CLR_CB CLR_B
-#define CLR_CR CLR_R
-
-/* from lsb to msb */
-#define MDP_GET_PACK_PATTERN(a, x, y, z, bit) \
-	(((a)<<(bit*3))|((x)<<(bit*2))|((y)<<bit)|(z))
-
-/* MDP_SYNC_CONFIG_0/1/2 */
-#define MDP_SYNCFG_HGT_LOC 22
-#define MDP_SYNCFG_VSYNC_EXT_EN (1<<21)
-#define MDP_SYNCFG_VSYNC_INT_EN (1<<20)
-
-/* MDP_SYNC_THRESH_0 */
-#define MDP_PRIM_BELOW_LOC 0
-#define MDP_PRIM_ABOVE_LOC 8
-
-/* MDP_{PRIMARY,SECONDARY,EXTERNAL}_VSYNC_OUT_CRL */
-#define VSYNC_PULSE_EN (1<<31)
-#define VSYNC_PULSE_INV (1<<30)
-
-/* MDP_VSYNC_CTRL */
-#define DISP0_VSYNC_MAP_VSYNC0 0
-#define DISP0_VSYNC_MAP_VSYNC1 (1<<0)
-#define DISP0_VSYNC_MAP_VSYNC2 ((1<<0)|(1<<1))
-
-#define DISP1_VSYNC_MAP_VSYNC0 0
-#define DISP1_VSYNC_MAP_VSYNC1 (1<<2)
-#define DISP1_VSYNC_MAP_VSYNC2 ((1<<2)|(1<<3))
-
-#define PRIMARY_LCD_SYNC_EN (1<<4)
-#define PRIMARY_LCD_SYNC_DISABLE 0
-
-#define SECONDARY_LCD_SYNC_EN (1<<5)
-#define SECONDARY_LCD_SYNC_DISABLE 0
-
-#define EXTERNAL_LCD_SYNC_EN (1<<6)
-#define EXTERNAL_LCD_SYNC_DISABLE 0
-
-/* MDP_VSYNC_THRESHOLD / MDP_FULL_BYPASS_WORD60 */
-#define VSYNC_THRESHOLD_ABOVE_LOC 0
-#define VSYNC_THRESHOLD_BELOW_LOC 16
-#define VSYNC_ANTI_TEAR_EN (1<<31)
-
-/* MDP_COMMAND_CONFIG / MDP_FULL_BYPASS_WORD1 */
-#define MDP_CMD_DBGBUS_EN (1<<0)
-
-/* MDP_PPP_SOURCE_CONFIG / MDP_FULL_BYPASS_WORD9&53 */
-#define PPP_SRC_C0G_8BIT ((1<<1)|(1<<0))
-#define PPP_SRC_C1B_8BIT ((1<<3)|(1<<2))
-#define PPP_SRC_C2R_8BIT ((1<<5)|(1<<4))
-#define PPP_SRC_C3A_8BIT ((1<<7)|(1<<6))
-
-#define PPP_SRC_C0G_6BIT (1<<1)
-#define PPP_SRC_C1B_6BIT (1<<3)
-#define PPP_SRC_C2R_6BIT (1<<5)
-
-#define PPP_SRC_C0G_5BIT (1<<0)
-#define PPP_SRC_C1B_5BIT (1<<2)
-#define PPP_SRC_C2R_5BIT (1<<4)
-
-#define PPP_SRC_C3ALPHA_EN (1<<8)
-
-#define PPP_SRC_BPP_1BYTES 0
-#define PPP_SRC_BPP_2BYTES (1<<9)
-#define PPP_SRC_BPP_3BYTES (1<<10)
-#define PPP_SRC_BPP_4BYTES ((1<<10)|(1<<9))
-
-#define PPP_SRC_BPP_ROI_ODD_X (1<<11)
-#define PPP_SRC_BPP_ROI_ODD_Y (1<<12)
-#define PPP_SRC_INTERLVD_2COMPONENTS (1<<13)
-#define PPP_SRC_INTERLVD_3COMPONENTS (1<<14)
-#define PPP_SRC_INTERLVD_4COMPONENTS ((1<<14)|(1<<13))
-
-
-/* RGB666 unpack format
-** TIGHT means R6+G6+B6 together
-** LOOSE means R6+2 +G6+2+ B6+2 (with MSB)
-**          or 2+R6 +2+G6 +2+B6 (with LSB)
-*/
-#define PPP_SRC_PACK_TIGHT (1<<17)
-#define PPP_SRC_PACK_LOOSE 0
-#define PPP_SRC_PACK_ALIGN_LSB 0
-#define PPP_SRC_PACK_ALIGN_MSB (1<<18)
-
-#define PPP_SRC_PLANE_INTERLVD 0
-#define PPP_SRC_PLANE_PSEUDOPLNR (1<<20)
-
-#define PPP_SRC_WMV9_MODE (1<<21)
-
-/* MDP_PPP_OPERATION_CONFIG / MDP_FULL_BYPASS_WORD14 */
-#define PPP_OP_SCALE_X_ON (1<<0)
-#define PPP_OP_SCALE_Y_ON (1<<1)
-
-#define PPP_OP_CONVERT_RGB2YCBCR 0
-#define PPP_OP_CONVERT_YCBCR2RGB (1<<2)
-#define PPP_OP_CONVERT_ON (1<<3)
-
-#define PPP_OP_CONVERT_MATRIX_PRIMARY 0
-#define PPP_OP_CONVERT_MATRIX_SECONDARY (1<<4)
-
-#define PPP_OP_LUT_C0_ON (1<<5)
-#define PPP_OP_LUT_C1_ON (1<<6)
-#define PPP_OP_LUT_C2_ON (1<<7)
-
-/* rotate or blend enable */
-#define PPP_OP_ROT_ON (1<<8)
-
-#define PPP_OP_ROT_90 (1<<9)
-#define PPP_OP_FLIP_LR (1<<10)
-#define PPP_OP_FLIP_UD (1<<11)
-
-#define PPP_OP_BLEND_ON (1<<12)
-
-#define PPP_OP_BLEND_SRCPIXEL_ALPHA 0
-#define PPP_OP_BLEND_DSTPIXEL_ALPHA (1<<13)
-#define PPP_OP_BLEND_CONSTANT_ALPHA (1<<14)
-#define PPP_OP_BLEND_SRCPIXEL_TRANSP ((1<<13)|(1<<14))
-
-#define PPP_OP_BLEND_ALPHA_BLEND_NORMAL 0
-#define PPP_OP_BLEND_ALPHA_BLEND_REVERSE (1<<15)
-
-#define PPP_OP_DITHER_EN (1<<16)
-
-#define PPP_OP_COLOR_SPACE_RGB 0
-#define PPP_OP_COLOR_SPACE_YCBCR (1<<17)
-
-#define PPP_OP_SRC_CHROMA_RGB 0
-#define PPP_OP_SRC_CHROMA_H2V1 (1<<18)
-#define PPP_OP_SRC_CHROMA_H1V2 (1<<19)
-#define PPP_OP_SRC_CHROMA_420 ((1<<18)|(1<<19))
-#define PPP_OP_SRC_CHROMA_COSITE 0
-#define PPP_OP_SRC_CHROMA_OFFSITE (1<<20)
-
-#define PPP_OP_DST_CHROMA_RGB 0
-#define PPP_OP_DST_CHROMA_H2V1 (1<<21)
-#define PPP_OP_DST_CHROMA_H1V2 (1<<22)
-#define PPP_OP_DST_CHROMA_420 ((1<<21)|(1<<22))
-#define PPP_OP_DST_CHROMA_COSITE 0
-#define PPP_OP_DST_CHROMA_OFFSITE (1<<23)
-
-#define PPP_BLEND_ALPHA_TRANSP (1<<24)
-
-#define PPP_OP_BG_CHROMA_RGB 0
-#define PPP_OP_BG_CHROMA_H2V1 (1<<25)
-#define PPP_OP_BG_CHROMA_H1V2 (1<<26)
-#define PPP_OP_BG_CHROMA_420 ((1<<25)|(1<<26))
-#define PPP_OP_BG_CHROMA_SITE_COSITE 0
-#define PPP_OP_BG_CHROMA_SITE_OFFSITE (1<<27)
-
-/* MDP_PPP_DESTINATION_CONFIG / MDP_FULL_BYPASS_WORD20 */
-#define PPP_DST_C0G_8BIT ((1<<0)|(1<<1))
-#define PPP_DST_C1B_8BIT ((1<<3)|(1<<2))
-#define PPP_DST_C2R_8BIT ((1<<5)|(1<<4))
-#define PPP_DST_C3A_8BIT ((1<<7)|(1<<6))
-
-#define PPP_DST_C0G_6BIT (1<<1)
-#define PPP_DST_C1B_6BIT (1<<3)
-#define PPP_DST_C2R_6BIT (1<<5)
-
-#define PPP_DST_C0G_5BIT (1<<0)
-#define PPP_DST_C1B_5BIT (1<<2)
-#define PPP_DST_C2R_5BIT (1<<4)
-
-#define PPP_DST_C3A_8BIT ((1<<7)|(1<<6))
-#define PPP_DST_C3ALPHA_EN (1<<8)
-
-#define PPP_DST_INTERLVD_2COMPONENTS (1<<9)
-#define PPP_DST_INTERLVD_3COMPONENTS (1<<10)
-#define PPP_DST_INTERLVD_4COMPONENTS ((1<<10)|(1<<9))
-#define PPP_DST_INTERLVD_6COMPONENTS ((1<<11)|(1<<9))
-
-#define PPP_DST_PACK_LOOSE 0
-#define PPP_DST_PACK_TIGHT (1<<13)
-#define PPP_DST_PACK_ALIGN_LSB 0
-#define PPP_DST_PACK_ALIGN_MSB (1<<14)
-
-#define PPP_DST_OUT_SEL_AXI 0
-#define PPP_DST_OUT_SEL_MDDI (1<<15)
-
-#define PPP_DST_BPP_2BYTES (1<<16)
-#define PPP_DST_BPP_3BYTES (1<<17)
-#define PPP_DST_BPP_4BYTES ((1<<17)|(1<<16))
-
-#define PPP_DST_PLANE_INTERLVD 0
-#define PPP_DST_PLANE_PLANAR (1<<18)
-#define PPP_DST_PLANE_PSEUDOPLNR (1<<19)
-
-#define PPP_DST_TO_TV (1<<20)
-
-#define PPP_DST_MDDI_PRIMARY 0
-#define PPP_DST_MDDI_SECONDARY (1<<21)
-#define PPP_DST_MDDI_EXTERNAL (1<<22)
-
-/* image configurations by image type */
-#define PPP_CFG_MDP_RGB_565(dir)       (PPP_##dir##_C2R_5BIT | \
-					PPP_##dir##_C0G_6BIT | \
-					PPP_##dir##_C1B_5BIT | \
-					PPP_##dir##_BPP_2BYTES | \
-					PPP_##dir##_INTERLVD_3COMPONENTS | \
-					PPP_##dir##_PACK_TIGHT | \
-					PPP_##dir##_PACK_ALIGN_LSB | \
-					PPP_##dir##_PLANE_INTERLVD)
-
-#define PPP_CFG_MDP_RGB_888(dir)       (PPP_##dir##_C2R_8BIT | \
-					PPP_##dir##_C0G_8BIT | \
-					PPP_##dir##_C1B_8BIT | \
-					PPP_##dir##_BPP_3BYTES | \
-					PPP_##dir##_INTERLVD_3COMPONENTS | \
-					PPP_##dir##_PACK_TIGHT | \
-					PPP_##dir##_PACK_ALIGN_LSB | \
-					PPP_##dir##_PLANE_INTERLVD)
-
-#define PPP_CFG_MDP_ARGB_8888(dir)     (PPP_##dir##_C2R_8BIT | \
-					PPP_##dir##_C0G_8BIT | \
-					PPP_##dir##_C1B_8BIT | \
-					PPP_##dir##_C3A_8BIT | \
-					PPP_##dir##_C3ALPHA_EN | \
-					PPP_##dir##_BPP_4BYTES | \
-					PPP_##dir##_INTERLVD_4COMPONENTS | \
-					PPP_##dir##_PACK_TIGHT | \
-					PPP_##dir##_PACK_ALIGN_LSB | \
-					PPP_##dir##_PLANE_INTERLVD)
-
-#define PPP_CFG_MDP_XRGB_8888(dir) PPP_CFG_MDP_ARGB_8888(dir)
-#define PPP_CFG_MDP_RGBA_8888(dir) PPP_CFG_MDP_ARGB_8888(dir)
-#define PPP_CFG_MDP_BGRA_8888(dir) PPP_CFG_MDP_ARGB_8888(dir)
-#define PPP_CFG_MDP_RGBX_8888(dir) PPP_CFG_MDP_ARGB_8888(dir)
-
-#define PPP_CFG_MDP_Y_CBCR_H2V2(dir)   (PPP_##dir##_C2R_8BIT | \
-					PPP_##dir##_C0G_8BIT | \
-					PPP_##dir##_C1B_8BIT | \
-					PPP_##dir##_C3A_8BIT | \
-					PPP_##dir##_BPP_2BYTES | \
-					PPP_##dir##_INTERLVD_2COMPONENTS | \
-					PPP_##dir##_PACK_TIGHT | \
-					PPP_##dir##_PACK_ALIGN_LSB | \
-					PPP_##dir##_PLANE_PSEUDOPLNR)
-
-#define PPP_CFG_MDP_Y_CRCB_H2V2(dir)	PPP_CFG_MDP_Y_CBCR_H2V2(dir)
-
-#define PPP_CFG_MDP_YCRYCB_H2V1(dir)   (PPP_##dir##_C2R_8BIT | \
-					PPP_##dir##_C0G_8BIT | \
-					PPP_##dir##_C1B_8BIT | \
-					PPP_##dir##_C3A_8BIT | \
-					PPP_##dir##_BPP_2BYTES | \
-					PPP_##dir##_INTERLVD_4COMPONENTS | \
-					PPP_##dir##_PACK_TIGHT | \
-					PPP_##dir##_PACK_ALIGN_LSB |\
-					PPP_##dir##_PLANE_INTERLVD)
-
-#define PPP_CFG_MDP_Y_CBCR_H2V1(dir)   (PPP_##dir##_C2R_8BIT | \
-					PPP_##dir##_C0G_8BIT | \
-					PPP_##dir##_C1B_8BIT | \
-					PPP_##dir##_C3A_8BIT | \
-					PPP_##dir##_BPP_2BYTES |   \
-					PPP_##dir##_INTERLVD_2COMPONENTS |  \
-					PPP_##dir##_PACK_TIGHT | \
-					PPP_##dir##_PACK_ALIGN_LSB | \
-					PPP_##dir##_PLANE_PSEUDOPLNR)
-
-#define PPP_CFG_MDP_Y_CRCB_H2V1(dir)	PPP_CFG_MDP_Y_CBCR_H2V1(dir)
-
-#define PPP_PACK_PATTERN_MDP_RGB_565 \
-	MDP_GET_PACK_PATTERN(0, CLR_R, CLR_G, CLR_B, 8)
-#define PPP_PACK_PATTERN_MDP_RGB_888 PPP_PACK_PATTERN_MDP_RGB_565
-#define PPP_PACK_PATTERN_MDP_XRGB_8888 \
-	MDP_GET_PACK_PATTERN(CLR_B, CLR_G, CLR_R, CLR_ALPHA, 8)
-#define PPP_PACK_PATTERN_MDP_ARGB_8888 PPP_PACK_PATTERN_MDP_XRGB_8888
-#define PPP_PACK_PATTERN_MDP_RGBA_8888 \
-	MDP_GET_PACK_PATTERN(CLR_ALPHA, CLR_B, CLR_G, CLR_R, 8)
-#define PPP_PACK_PATTERN_MDP_BGRA_8888 \
-	MDP_GET_PACK_PATTERN(CLR_ALPHA, CLR_R, CLR_G, CLR_B, 8)
-#define PPP_PACK_PATTERN_MDP_RGBX_8888 \
-	MDP_GET_PACK_PATTERN(CLR_ALPHA, CLR_B, CLR_G, CLR_R, 8)
-#define PPP_PACK_PATTERN_MDP_Y_CBCR_H2V1 \
-	MDP_GET_PACK_PATTERN(0, 0, CLR_CB, CLR_CR, 8)
-#define PPP_PACK_PATTERN_MDP_Y_CBCR_H2V2 PPP_PACK_PATTERN_MDP_Y_CBCR_H2V1
-#define PPP_PACK_PATTERN_MDP_Y_CRCB_H2V1 \
-	MDP_GET_PACK_PATTERN(0, 0, CLR_CR, CLR_CB, 8)
-#define PPP_PACK_PATTERN_MDP_Y_CRCB_H2V2 PPP_PACK_PATTERN_MDP_Y_CRCB_H2V1
-#define PPP_PACK_PATTERN_MDP_YCRYCB_H2V1 \
-	MDP_GET_PACK_PATTERN(CLR_Y, CLR_R, CLR_Y, CLR_B, 8)
-
-#define PPP_CHROMA_SAMP_MDP_RGB_565(dir) PPP_OP_##dir##_CHROMA_RGB
-#define PPP_CHROMA_SAMP_MDP_RGB_888(dir) PPP_OP_##dir##_CHROMA_RGB
-#define PPP_CHROMA_SAMP_MDP_XRGB_8888(dir) PPP_OP_##dir##_CHROMA_RGB
-#define PPP_CHROMA_SAMP_MDP_ARGB_8888(dir) PPP_OP_##dir##_CHROMA_RGB
-#define PPP_CHROMA_SAMP_MDP_RGBA_8888(dir) PPP_OP_##dir##_CHROMA_RGB
-#define PPP_CHROMA_SAMP_MDP_BGRA_8888(dir) PPP_OP_##dir##_CHROMA_RGB
-#define PPP_CHROMA_SAMP_MDP_RGBX_8888(dir) PPP_OP_##dir##_CHROMA_RGB
-#define PPP_CHROMA_SAMP_MDP_Y_CBCR_H2V1(dir) PPP_OP_##dir##_CHROMA_H2V1
-#define PPP_CHROMA_SAMP_MDP_Y_CBCR_H2V2(dir) PPP_OP_##dir##_CHROMA_420
-#define PPP_CHROMA_SAMP_MDP_Y_CRCB_H2V1(dir) PPP_OP_##dir##_CHROMA_H2V1
-#define PPP_CHROMA_SAMP_MDP_Y_CRCB_H2V2(dir) PPP_OP_##dir##_CHROMA_420
-#define PPP_CHROMA_SAMP_MDP_YCRYCB_H2V1(dir) PPP_OP_##dir##_CHROMA_H2V1
-
-/* Helpful array generation macros */
-#define PPP_ARRAY0(name) \
-	[MDP_RGB_565] = PPP_##name##_MDP_RGB_565,\
-	[MDP_RGB_888] = PPP_##name##_MDP_RGB_888,\
-	[MDP_XRGB_8888] = PPP_##name##_MDP_XRGB_8888,\
-	[MDP_ARGB_8888] = PPP_##name##_MDP_ARGB_8888,\
-	[MDP_RGBA_8888] = PPP_##name##_MDP_RGBA_8888,\
-	[MDP_BGRA_8888] = PPP_##name##_MDP_BGRA_8888,\
-	[MDP_RGBX_8888] = PPP_##name##_MDP_RGBX_8888,\
-	[MDP_Y_CBCR_H2V1] = PPP_##name##_MDP_Y_CBCR_H2V1,\
-	[MDP_Y_CBCR_H2V2] = PPP_##name##_MDP_Y_CBCR_H2V2,\
-	[MDP_Y_CRCB_H2V1] = PPP_##name##_MDP_Y_CRCB_H2V1,\
-	[MDP_Y_CRCB_H2V2] = PPP_##name##_MDP_Y_CRCB_H2V2,\
-	[MDP_YCRYCB_H2V1] = PPP_##name##_MDP_YCRYCB_H2V1
-
-#define PPP_ARRAY1(name, dir) \
-	[MDP_RGB_565] = PPP_##name##_MDP_RGB_565(dir),\
-	[MDP_RGB_888] = PPP_##name##_MDP_RGB_888(dir),\
-	[MDP_XRGB_8888] = PPP_##name##_MDP_XRGB_8888(dir),\
-	[MDP_ARGB_8888] = PPP_##name##_MDP_ARGB_8888(dir),\
-	[MDP_RGBA_8888] = PPP_##name##_MDP_RGBA_8888(dir),\
-	[MDP_BGRA_8888] = PPP_##name##_MDP_BGRA_8888(dir),\
-	[MDP_RGBX_8888] = PPP_##name##_MDP_RGBX_8888(dir),\
-	[MDP_Y_CBCR_H2V1] = PPP_##name##_MDP_Y_CBCR_H2V1(dir),\
-	[MDP_Y_CBCR_H2V2] = PPP_##name##_MDP_Y_CBCR_H2V2(dir),\
-	[MDP_Y_CRCB_H2V1] = PPP_##name##_MDP_Y_CRCB_H2V1(dir),\
-	[MDP_Y_CRCB_H2V2] = PPP_##name##_MDP_Y_CRCB_H2V2(dir),\
-	[MDP_YCRYCB_H2V1] = PPP_##name##_MDP_YCRYCB_H2V1(dir)
-
-#define IS_YCRCB(img) ((img == MDP_Y_CRCB_H2V2) | (img == MDP_Y_CBCR_H2V2) | \
-		       (img == MDP_Y_CRCB_H2V1) | (img == MDP_Y_CBCR_H2V1) | \
-		       (img == MDP_YCRYCB_H2V1))
-#define IS_RGB(img) ((img == MDP_RGB_565) | (img == MDP_RGB_888) | \
-		     (img == MDP_ARGB_8888) | (img == MDP_RGBA_8888) | \
-		     (img == MDP_XRGB_8888) | (img == MDP_BGRA_8888) | \
-		     (img == MDP_RGBX_8888))
-#define HAS_ALPHA(img) ((img == MDP_ARGB_8888) | (img == MDP_RGBA_8888) | \
-			(img == MDP_BGRA_8888))
-
-#define IS_PSEUDOPLNR(img) ((img == MDP_Y_CRCB_H2V2) | \
-			    (img == MDP_Y_CBCR_H2V2) | \
-			    (img == MDP_Y_CRCB_H2V1) | \
-			    (img == MDP_Y_CBCR_H2V1))
-
-/* Mappings from addr to purpose */
-#define PPP_ADDR_SRC_ROI		MDP_FULL_BYPASS_WORD2
-#define PPP_ADDR_SRC0			MDP_FULL_BYPASS_WORD3
-#define PPP_ADDR_SRC1			MDP_FULL_BYPASS_WORD4
-#define PPP_ADDR_SRC_YSTRIDE		MDP_FULL_BYPASS_WORD7
-#define PPP_ADDR_SRC_CFG		MDP_FULL_BYPASS_WORD9
-#define PPP_ADDR_SRC_PACK_PATTERN	MDP_FULL_BYPASS_WORD10
-#define PPP_ADDR_OPERATION		MDP_FULL_BYPASS_WORD14
-#define PPP_ADDR_PHASEX_INIT		MDP_FULL_BYPASS_WORD15
-#define PPP_ADDR_PHASEY_INIT		MDP_FULL_BYPASS_WORD16
-#define PPP_ADDR_PHASEX_STEP		MDP_FULL_BYPASS_WORD17
-#define PPP_ADDR_PHASEY_STEP		MDP_FULL_BYPASS_WORD18
-#define PPP_ADDR_ALPHA_TRANSP		MDP_FULL_BYPASS_WORD19
-#define PPP_ADDR_DST_CFG		MDP_FULL_BYPASS_WORD20
-#define PPP_ADDR_DST_PACK_PATTERN	MDP_FULL_BYPASS_WORD21
-#define PPP_ADDR_DST_ROI		MDP_FULL_BYPASS_WORD25
-#define PPP_ADDR_DST0			MDP_FULL_BYPASS_WORD26
-#define PPP_ADDR_DST1			MDP_FULL_BYPASS_WORD27
-#define PPP_ADDR_DST_YSTRIDE		MDP_FULL_BYPASS_WORD30
-#define PPP_ADDR_EDGE			MDP_FULL_BYPASS_WORD46
-#define PPP_ADDR_BG0			MDP_FULL_BYPASS_WORD48
-#define PPP_ADDR_BG1			MDP_FULL_BYPASS_WORD49
-#define PPP_ADDR_BG_YSTRIDE		MDP_FULL_BYPASS_WORD51
-#define PPP_ADDR_BG_CFG			MDP_FULL_BYPASS_WORD53
-#define PPP_ADDR_BG_PACK_PATTERN	MDP_FULL_BYPASS_WORD54
-
-/* MDP_DMA_CONFIG / MDP_FULL_BYPASS_WORD32 */
-#define DMA_DSTC0G_6BITS (1<<1)
-#define DMA_DSTC1B_6BITS (1<<3)
-#define DMA_DSTC2R_6BITS (1<<5)
-#define DMA_DSTC0G_5BITS (1<<0)
-#define DMA_DSTC1B_5BITS (1<<2)
-#define DMA_DSTC2R_5BITS (1<<4)
-
-#define DMA_PACK_TIGHT (1<<6)
-#define DMA_PACK_LOOSE 0
-#define DMA_PACK_ALIGN_LSB 0
-#define DMA_PACK_ALIGN_MSB (1<<7)
-#define DMA_PACK_PATTERN_RGB \
-	(MDP_GET_PACK_PATTERN(0, CLR_R, CLR_G, CLR_B, 2)<<8)
-
-#define DMA_OUT_SEL_AHB  0
-#define DMA_OUT_SEL_MDDI (1<<14)
-#define DMA_AHBM_LCD_SEL_PRIMARY 0
-#define DMA_AHBM_LCD_SEL_SECONDARY (1<<15)
-#define DMA_IBUF_C3ALPHA_EN (1<<16)
-#define DMA_DITHER_EN (1<<17)
-
-#define DMA_MDDI_DMAOUT_LCD_SEL_PRIMARY 0
-#define DMA_MDDI_DMAOUT_LCD_SEL_SECONDARY (1<<18)
-#define DMA_MDDI_DMAOUT_LCD_SEL_EXTERNAL (1<<19)
-
-#define DMA_IBUF_FORMAT_RGB565 (1<<20)
-#define DMA_IBUF_FORMAT_RGB888_OR_ARGB8888 0
-
-#define DMA_IBUF_NONCONTIGUOUS (1<<21)
-
-/* MDDI REGISTER ? */
-#define MDDI_VDO_PACKET_DESC  0x5666
-#define MDDI_VDO_PACKET_PRIM  0xC3
-#define MDDI_VDO_PACKET_SECD  0xC0
-
-#endif
diff --git a/drivers/video/fbdev/msm/mdp_ppp.c b/drivers/video/fbdev/msm/mdp_ppp.c
deleted file mode 100644
index be6079cdfbb6..000000000000
--- a/drivers/video/fbdev/msm/mdp_ppp.c
+++ /dev/null
@@ -1,731 +0,0 @@
-/* drivers/video/msm/mdp_ppp.c
- *
- * Copyright (C) 2007 QUALCOMM Incorporated
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-#include <linux/fb.h>
-#include <linux/file.h>
-#include <linux/delay.h>
-#include <linux/msm_mdp.h>
-#include <linux/platform_data/video-msm_fb.h>
-
-#include "mdp_hw.h"
-#include "mdp_scale_tables.h"
-
-#define DLOG(x...) do {} while (0)
-
-#define MDP_DOWNSCALE_BLUR (MDP_DOWNSCALE_MAX + 1)
-static int downscale_y_table = MDP_DOWNSCALE_MAX;
-static int downscale_x_table = MDP_DOWNSCALE_MAX;
-
-struct mdp_regs {
-	uint32_t src0;
-	uint32_t src1;
-	uint32_t dst0;
-	uint32_t dst1;
-	uint32_t src_cfg;
-	uint32_t dst_cfg;
-	uint32_t src_pack;
-	uint32_t dst_pack;
-	uint32_t src_rect;
-	uint32_t dst_rect;
-	uint32_t src_ystride;
-	uint32_t dst_ystride;
-	uint32_t op;
-	uint32_t src_bpp;
-	uint32_t dst_bpp;
-	uint32_t edge;
-	uint32_t phasex_init;
-	uint32_t phasey_init;
-	uint32_t phasex_step;
-	uint32_t phasey_step;
-};
-
-static uint32_t pack_pattern[] = {
-	PPP_ARRAY0(PACK_PATTERN)
-};
-
-static uint32_t src_img_cfg[] = {
-	PPP_ARRAY1(CFG, SRC)
-};
-
-static uint32_t dst_img_cfg[] = {
-	PPP_ARRAY1(CFG, DST)
-};
-
-static uint32_t bytes_per_pixel[] = {
-	[MDP_RGB_565] = 2,
-	[MDP_RGB_888] = 3,
-	[MDP_XRGB_8888] = 4,
-	[MDP_ARGB_8888] = 4,
-	[MDP_RGBA_8888] = 4,
-	[MDP_BGRA_8888] = 4,
-	[MDP_RGBX_8888] = 4,
-	[MDP_Y_CBCR_H2V1] = 1,
-	[MDP_Y_CBCR_H2V2] = 1,
-	[MDP_Y_CRCB_H2V1] = 1,
-	[MDP_Y_CRCB_H2V2] = 1,
-	[MDP_YCRYCB_H2V1] = 2
-};
-
-static uint32_t dst_op_chroma[] = {
-	PPP_ARRAY1(CHROMA_SAMP, DST)
-};
-
-static uint32_t src_op_chroma[] = {
-	PPP_ARRAY1(CHROMA_SAMP, SRC)
-};
-
-static uint32_t bg_op_chroma[] = {
-	PPP_ARRAY1(CHROMA_SAMP, BG)
-};
-
-static void rotate_dst_addr_x(struct mdp_blit_req *req, struct mdp_regs *regs)
-{
-	regs->dst0 += (req->dst_rect.w -
-		       min((uint32_t)16, req->dst_rect.w)) * regs->dst_bpp;
-	regs->dst1 += (req->dst_rect.w -
-		       min((uint32_t)16, req->dst_rect.w)) * regs->dst_bpp;
-}
-
-static void rotate_dst_addr_y(struct mdp_blit_req *req, struct mdp_regs *regs)
-{
-	regs->dst0 += (req->dst_rect.h -
-		       min((uint32_t)16, req->dst_rect.h)) *
-		       regs->dst_ystride;
-	regs->dst1 += (req->dst_rect.h -
-		       min((uint32_t)16, req->dst_rect.h)) *
-		       regs->dst_ystride;
-}
-
-static void blit_rotate(struct mdp_blit_req *req,
-			struct mdp_regs *regs)
-{
-	if (req->flags == MDP_ROT_NOP)
-		return;
-
-	regs->op |= PPP_OP_ROT_ON;
-	if ((req->flags & MDP_ROT_90 || req->flags & MDP_FLIP_LR) &&
-	    !(req->flags & MDP_ROT_90 && req->flags & MDP_FLIP_LR))
-		rotate_dst_addr_x(req, regs);
-	if (req->flags & MDP_ROT_90)
-		regs->op |= PPP_OP_ROT_90;
-	if (req->flags & MDP_FLIP_UD) {
-		regs->op |= PPP_OP_FLIP_UD;
-		rotate_dst_addr_y(req, regs);
-	}
-	if (req->flags & MDP_FLIP_LR)
-		regs->op |= PPP_OP_FLIP_LR;
-}
-
-static void blit_convert(struct mdp_blit_req *req, struct mdp_regs *regs)
-{
-	if (req->src.format == req->dst.format)
-		return;
-	if (IS_RGB(req->src.format) && IS_YCRCB(req->dst.format)) {
-		regs->op |= PPP_OP_CONVERT_RGB2YCBCR | PPP_OP_CONVERT_ON;
-	} else if (IS_YCRCB(req->src.format) && IS_RGB(req->dst.format)) {
-		regs->op |= PPP_OP_CONVERT_YCBCR2RGB | PPP_OP_CONVERT_ON;
-		if (req->dst.format == MDP_RGB_565)
-			regs->op |= PPP_OP_CONVERT_MATRIX_SECONDARY;
-	}
-}
-
-#define GET_BIT_RANGE(value, high, low) \
-	(((1 << (high - low + 1)) - 1) & (value >> low))
-static uint32_t transp_convert(struct mdp_blit_req *req)
-{
-	uint32_t transp = 0;
-	if (req->src.format == MDP_RGB_565) {
-		/* pad each value to 8 bits by copying the high bits into the
-		 * low end, convert RGB to RBG by switching low 2 components */
-		transp |= ((GET_BIT_RANGE(req->transp_mask, 15, 11) << 3) |
-			   (GET_BIT_RANGE(req->transp_mask, 15, 13))) << 16;
-
-		transp |= ((GET_BIT_RANGE(req->transp_mask, 4, 0) << 3) |
-			   (GET_BIT_RANGE(req->transp_mask, 4, 2))) << 8;
-
-		transp |= (GET_BIT_RANGE(req->transp_mask, 10, 5) << 2) |
-			  (GET_BIT_RANGE(req->transp_mask, 10, 9));
-	} else {
-		/* convert RGB to RBG */
-		transp |= (GET_BIT_RANGE(req->transp_mask, 15, 8)) |
-			  (GET_BIT_RANGE(req->transp_mask, 23, 16) << 16) |
-			  (GET_BIT_RANGE(req->transp_mask, 7, 0) << 8);
-	}
-	return transp;
-}
-#undef GET_BIT_RANGE
-
-static void blit_blend(struct mdp_blit_req *req, struct mdp_regs *regs)
-{
-	/* TRANSP BLEND */
-	if (req->transp_mask != MDP_TRANSP_NOP) {
-		req->transp_mask = transp_convert(req);
-		if (req->alpha != MDP_ALPHA_NOP) {
-			/* use blended transparancy mode
-			 * pixel = (src == transp) ? dst : blend
-			 * blend is combo of blend_eq_sel and
-			 * blend_alpha_sel */
-			regs->op |= PPP_OP_ROT_ON | PPP_OP_BLEND_ON |
-				PPP_OP_BLEND_ALPHA_BLEND_NORMAL |
-				PPP_OP_BLEND_CONSTANT_ALPHA |
-				PPP_BLEND_ALPHA_TRANSP;
-		} else {
-			/* simple transparancy mode
-			 * pixel = (src == transp) ? dst : src */
-			regs->op |= PPP_OP_ROT_ON | PPP_OP_BLEND_ON |
-				PPP_OP_BLEND_SRCPIXEL_TRANSP;
-		}
-	}
-
-	req->alpha &= 0xff;
-	/* ALPHA BLEND */
-	if (HAS_ALPHA(req->src.format)) {
-		regs->op |= PPP_OP_ROT_ON | PPP_OP_BLEND_ON |
-			PPP_OP_BLEND_SRCPIXEL_ALPHA;
-	} else if (req->alpha < MDP_ALPHA_NOP) {
-		/* just blend by alpha */
-		regs->op |= PPP_OP_ROT_ON | PPP_OP_BLEND_ON |
-			PPP_OP_BLEND_ALPHA_BLEND_NORMAL |
-			PPP_OP_BLEND_CONSTANT_ALPHA;
-	}
-
-	regs->op |= bg_op_chroma[req->dst.format];
-}
-
-#define ONE_HALF	(1LL << 32)
-#define ONE		(1LL << 33)
-#define TWO		(2LL << 33)
-#define THREE		(3LL << 33)
-#define FRAC_MASK (ONE - 1)
-#define INT_MASK (~FRAC_MASK)
-
-static int scale_params(uint32_t dim_in, uint32_t dim_out, uint32_t origin,
-			uint32_t *phase_init, uint32_t *phase_step)
-{
-	/* to improve precicsion calculations are done in U31.33 and converted
-	 * to U3.29 at the end */
-	int64_t k1, k2, k3, k4, tmp;
-	uint64_t n, d, os, os_p, od, od_p, oreq;
-	unsigned rpa = 0;
-	int64_t ip64, delta;
-
-	if (dim_out % 3 == 0)
-		rpa = !(dim_in % (dim_out / 3));
-
-	n = ((uint64_t)dim_out) << 34;
-	d = dim_in;
-	if (!d)
-		return -1;
-	do_div(n, d);
-	k3 = (n + 1) >> 1;
-	if ((k3 >> 4) < (1LL << 27) || (k3 >> 4) > (1LL << 31)) {
-		DLOG("crap bad scale\n");
-		return -1;
-	}
-	n = ((uint64_t)dim_in) << 34;
-	d = (uint64_t)dim_out;
-	if (!d)
-		return -1;
-	do_div(n, d);
-	k1 = (n + 1) >> 1;
-	k2 = (k1 - ONE) >> 1;
-
-	*phase_init = (int)(k2 >> 4);
-	k4 = (k3 - ONE) >> 1;
-
-	if (rpa) {
-		os = ((uint64_t)origin << 33) - ONE_HALF;
-		tmp = (dim_out * os) + ONE_HALF;
-		if (!dim_in)
-			return -1;
-		do_div(tmp, dim_in);
-		od = tmp - ONE_HALF;
-	} else {
-		os = ((uint64_t)origin << 1) - 1;
-		od = (((k3 * os) >> 1) + k4);
-	}
-
-	od_p = od & INT_MASK;
-	if (od_p != od)
-		od_p += ONE;
-
-	if (rpa) {
-		tmp = (dim_in * od_p) + ONE_HALF;
-		if (!dim_in)
-			return -1;
-		do_div(tmp, dim_in);
-		os_p = tmp - ONE_HALF;
-	} else {
-		os_p = ((k1 * (od_p >> 33)) + k2);
-	}
-
-	oreq = (os_p & INT_MASK) - ONE;
-
-	ip64 = os_p - oreq;
-	delta = ((int64_t)(origin) << 33) - oreq;
-	ip64 -= delta;
-	/* limit to valid range before the left shift */
-	delta = (ip64 & (1LL << 63)) ? 4 : -4;
-	delta <<= 33;
-	while (abs((int)(ip64 >> 33)) > 4)
-		ip64 += delta;
-	*phase_init = (int)(ip64 >> 4);
-	*phase_step = (uint32_t)(k1 >> 4);
-	return 0;
-}
-
-static void load_scale_table(const struct mdp_info *mdp,
-			     struct mdp_table_entry *table, int len)
-{
-	int i;
-	for (i = 0; i < len; i++)
-		mdp_writel(mdp, table[i].val, table[i].reg);
-}
-
-enum {
-IMG_LEFT,
-IMG_RIGHT,
-IMG_TOP,
-IMG_BOTTOM,
-};
-
-static void get_edge_info(uint32_t src, uint32_t src_coord, uint32_t dst,
-			  uint32_t *interp1, uint32_t *interp2,
-			  uint32_t *repeat1, uint32_t *repeat2) {
-	if (src > 3 * dst) {
-		*interp1 = 0;
-		*interp2 = src - 1;
-		*repeat1 = 0;
-		*repeat2 = 0;
-	} else if (src == 3 * dst) {
-		*interp1 = 0;
-		*interp2 = src;
-		*repeat1 = 0;
-		*repeat2 = 1;
-	} else if (src > dst && src < 3 * dst) {
-		*interp1 = -1;
-		*interp2 = src;
-		*repeat1 = 1;
-		*repeat2 = 1;
-	} else if (src == dst) {
-		*interp1 = -1;
-		*interp2 = src + 1;
-		*repeat1 = 1;
-		*repeat2 = 2;
-	} else {
-		*interp1 = -2;
-		*interp2 = src + 1;
-		*repeat1 = 2;
-		*repeat2 = 2;
-	}
-	*interp1 += src_coord;
-	*interp2 += src_coord;
-}
-
-static int get_edge_cond(struct mdp_blit_req *req, struct mdp_regs *regs)
-{
-	int32_t luma_interp[4];
-	int32_t luma_repeat[4];
-	int32_t chroma_interp[4];
-	int32_t chroma_bound[4];
-	int32_t chroma_repeat[4];
-	uint32_t dst_w, dst_h;
-
-	memset(&luma_interp, 0, sizeof(int32_t) * 4);
-	memset(&luma_repeat, 0, sizeof(int32_t) * 4);
-	memset(&chroma_interp, 0, sizeof(int32_t) * 4);
-	memset(&chroma_bound, 0, sizeof(int32_t) * 4);
-	memset(&chroma_repeat, 0, sizeof(int32_t) * 4);
-	regs->edge = 0;
-
-	if (req->flags & MDP_ROT_90) {
-		dst_w = req->dst_rect.h;
-		dst_h = req->dst_rect.w;
-	} else {
-		dst_w = req->dst_rect.w;
-		dst_h = req->dst_rect.h;
-	}
-
-	if (regs->op & (PPP_OP_SCALE_Y_ON | PPP_OP_SCALE_X_ON)) {
-		get_edge_info(req->src_rect.h, req->src_rect.y, dst_h,
-			      &luma_interp[IMG_TOP], &luma_interp[IMG_BOTTOM],
-			      &luma_repeat[IMG_TOP], &luma_repeat[IMG_BOTTOM]);
-		get_edge_info(req->src_rect.w, req->src_rect.x, dst_w,
-			      &luma_interp[IMG_LEFT], &luma_interp[IMG_RIGHT],
-			      &luma_repeat[IMG_LEFT], &luma_repeat[IMG_RIGHT]);
-	} else {
-		luma_interp[IMG_LEFT] = req->src_rect.x;
-		luma_interp[IMG_RIGHT] = req->src_rect.x + req->src_rect.w - 1;
-		luma_interp[IMG_TOP] = req->src_rect.y;
-		luma_interp[IMG_BOTTOM] = req->src_rect.y + req->src_rect.h - 1;
-		luma_repeat[IMG_LEFT] = 0;
-		luma_repeat[IMG_TOP] = 0;
-		luma_repeat[IMG_RIGHT] = 0;
-		luma_repeat[IMG_BOTTOM] = 0;
-	}
-
-	chroma_interp[IMG_LEFT] = luma_interp[IMG_LEFT];
-	chroma_interp[IMG_RIGHT] = luma_interp[IMG_RIGHT];
-	chroma_interp[IMG_TOP] = luma_interp[IMG_TOP];
-	chroma_interp[IMG_BOTTOM] = luma_interp[IMG_BOTTOM];
-
-	chroma_bound[IMG_LEFT] = req->src_rect.x;
-	chroma_bound[IMG_RIGHT] = req->src_rect.x + req->src_rect.w - 1;
-	chroma_bound[IMG_TOP] = req->src_rect.y;
-	chroma_bound[IMG_BOTTOM] = req->src_rect.y + req->src_rect.h - 1;
-
-	if (IS_YCRCB(req->src.format)) {
-		chroma_interp[IMG_LEFT] = chroma_interp[IMG_LEFT] >> 1;
-		chroma_interp[IMG_RIGHT] = (chroma_interp[IMG_RIGHT] + 1) >> 1;
-
-		chroma_bound[IMG_LEFT] = chroma_bound[IMG_LEFT] >> 1;
-		chroma_bound[IMG_RIGHT] = chroma_bound[IMG_RIGHT] >> 1;
-	}
-
-	if (req->src.format == MDP_Y_CBCR_H2V2 ||
-	    req->src.format == MDP_Y_CRCB_H2V2) {
-		chroma_interp[IMG_TOP] = (chroma_interp[IMG_TOP] - 1) >> 1;
-		chroma_interp[IMG_BOTTOM] = (chroma_interp[IMG_BOTTOM] + 1)
-					    >> 1;
-		chroma_bound[IMG_TOP] = (chroma_bound[IMG_TOP] + 1) >> 1;
-		chroma_bound[IMG_BOTTOM] = chroma_bound[IMG_BOTTOM] >> 1;
-	}
-
-	chroma_repeat[IMG_LEFT] = chroma_bound[IMG_LEFT] -
-				  chroma_interp[IMG_LEFT];
-	chroma_repeat[IMG_RIGHT] = chroma_interp[IMG_RIGHT] -
-				  chroma_bound[IMG_RIGHT];
-	chroma_repeat[IMG_TOP] = chroma_bound[IMG_TOP] -
-				  chroma_interp[IMG_TOP];
-	chroma_repeat[IMG_BOTTOM] = chroma_interp[IMG_BOTTOM] -
-				  chroma_bound[IMG_BOTTOM];
-
-	if (chroma_repeat[IMG_LEFT] < 0 || chroma_repeat[IMG_LEFT] > 3 ||
-	    chroma_repeat[IMG_RIGHT] < 0 || chroma_repeat[IMG_RIGHT] > 3 ||
-	    chroma_repeat[IMG_TOP] < 0 || chroma_repeat[IMG_TOP] > 3 ||
-	    chroma_repeat[IMG_BOTTOM] < 0 || chroma_repeat[IMG_BOTTOM] > 3 ||
-	    luma_repeat[IMG_LEFT] < 0 || luma_repeat[IMG_LEFT] > 3 ||
-	    luma_repeat[IMG_RIGHT] < 0 || luma_repeat[IMG_RIGHT] > 3 ||
-	    luma_repeat[IMG_TOP] < 0 || luma_repeat[IMG_TOP] > 3 ||
-	    luma_repeat[IMG_BOTTOM] < 0 || luma_repeat[IMG_BOTTOM] > 3)
-		return -1;
-
-	regs->edge |= (chroma_repeat[IMG_LEFT] & 3) << MDP_LEFT_CHROMA;
-	regs->edge |= (chroma_repeat[IMG_RIGHT] & 3) << MDP_RIGHT_CHROMA;
-	regs->edge |= (chroma_repeat[IMG_TOP] & 3) << MDP_TOP_CHROMA;
-	regs->edge |= (chroma_repeat[IMG_BOTTOM] & 3) << MDP_BOTTOM_CHROMA;
-	regs->edge |= (luma_repeat[IMG_LEFT] & 3) << MDP_LEFT_LUMA;
-	regs->edge |= (luma_repeat[IMG_RIGHT] & 3) << MDP_RIGHT_LUMA;
-	regs->edge |= (luma_repeat[IMG_TOP] & 3) << MDP_TOP_LUMA;
-	regs->edge |= (luma_repeat[IMG_BOTTOM] & 3) << MDP_BOTTOM_LUMA;
-	return 0;
-}
-
-static int blit_scale(const struct mdp_info *mdp, struct mdp_blit_req *req,
-		      struct mdp_regs *regs)
-{
-	uint32_t phase_init_x, phase_init_y, phase_step_x, phase_step_y;
-	uint32_t scale_factor_x, scale_factor_y;
-	uint32_t downscale;
-	uint32_t dst_w, dst_h;
-
-	if (req->flags & MDP_ROT_90) {
-		dst_w = req->dst_rect.h;
-		dst_h = req->dst_rect.w;
-	} else {
-		dst_w = req->dst_rect.w;
-		dst_h = req->dst_rect.h;
-	}
-	if ((req->src_rect.w == dst_w)  && (req->src_rect.h == dst_h) &&
-	    !(req->flags & MDP_BLUR)) {
-		regs->phasex_init = 0;
-		regs->phasey_init = 0;
-		regs->phasex_step = 0;
-		regs->phasey_step = 0;
-		return 0;
-	}
-
-	if (scale_params(req->src_rect.w, dst_w, 1, &phase_init_x,
-			 &phase_step_x) ||
-	    scale_params(req->src_rect.h, dst_h, 1, &phase_init_y,
-			 &phase_step_y))
-		return -1;
-
-	scale_factor_x = (dst_w * 10) / req->src_rect.w;
-	scale_factor_y = (dst_h * 10) / req->src_rect.h;
-
-	if (scale_factor_x > 8)
-		downscale = MDP_DOWNSCALE_PT8TO1;
-	else if (scale_factor_x > 6)
-		downscale = MDP_DOWNSCALE_PT6TOPT8;
-	else if (scale_factor_x > 4)
-		downscale = MDP_DOWNSCALE_PT4TOPT6;
-	else
-		downscale = MDP_DOWNSCALE_PT2TOPT4;
-	if (downscale != downscale_x_table) {
-		load_scale_table(mdp, mdp_downscale_x_table[downscale], 64);
-		downscale_x_table = downscale;
-	}
-
-	if (scale_factor_y > 8)
-		downscale = MDP_DOWNSCALE_PT8TO1;
-	else if (scale_factor_y > 6)
-		downscale = MDP_DOWNSCALE_PT6TOPT8;
-	else if (scale_factor_y > 4)
-		downscale = MDP_DOWNSCALE_PT4TOPT6;
-	else
-		downscale = MDP_DOWNSCALE_PT2TOPT4;
-	if (downscale != downscale_y_table) {
-		load_scale_table(mdp, mdp_downscale_y_table[downscale], 64);
-		downscale_y_table = downscale;
-	}
-
-	regs->phasex_init = phase_init_x;
-	regs->phasey_init = phase_init_y;
-	regs->phasex_step = phase_step_x;
-	regs->phasey_step = phase_step_y;
-	regs->op |= (PPP_OP_SCALE_Y_ON | PPP_OP_SCALE_X_ON);
-	return 0;
-
-}
-
-static void blit_blur(const struct mdp_info *mdp, struct mdp_blit_req *req,
-		      struct mdp_regs *regs)
-{
-	if (!(req->flags & MDP_BLUR))
-		return;
-
-	if (!(downscale_x_table == MDP_DOWNSCALE_BLUR &&
-	      downscale_y_table == MDP_DOWNSCALE_BLUR)) {
-		load_scale_table(mdp, mdp_gaussian_blur_table, 128);
-		downscale_x_table = MDP_DOWNSCALE_BLUR;
-		downscale_y_table = MDP_DOWNSCALE_BLUR;
-	}
-
-	regs->op |= (PPP_OP_SCALE_Y_ON | PPP_OP_SCALE_X_ON);
-}
-
-
-#define IMG_LEN(rect_h, w, rect_w, bpp) (((rect_h) * w) * bpp)
-
-#define Y_TO_CRCB_RATIO(format) \
-	((format == MDP_Y_CBCR_H2V2 || format == MDP_Y_CRCB_H2V2) ?  2 :\
-	 (format == MDP_Y_CBCR_H2V1 || format == MDP_Y_CRCB_H2V1) ?  1 : 1)
-
-static void get_len(struct mdp_img *img, struct mdp_rect *rect, uint32_t bpp,
-		    uint32_t *len0, uint32_t *len1)
-{
-	*len0 = IMG_LEN(rect->h, img->width, rect->w, bpp);
-	if (IS_PSEUDOPLNR(img->format))
-		*len1 = *len0/Y_TO_CRCB_RATIO(img->format);
-	else
-		*len1 = 0;
-}
-
-static int valid_src_dst(unsigned long src_start, unsigned long src_len,
-			 unsigned long dst_start, unsigned long dst_len,
-			 struct mdp_blit_req *req, struct mdp_regs *regs)
-{
-	unsigned long src_min_ok = src_start;
-	unsigned long src_max_ok = src_start + src_len;
-	unsigned long dst_min_ok = dst_start;
-	unsigned long dst_max_ok = dst_start + dst_len;
-	uint32_t src0_len, src1_len, dst0_len, dst1_len;
-	get_len(&req->src, &req->src_rect, regs->src_bpp, &src0_len,
-		 &src1_len);
-	get_len(&req->dst, &req->dst_rect, regs->dst_bpp, &dst0_len,
-		 &dst1_len);
-
-	if (regs->src0 < src_min_ok || regs->src0 > src_max_ok ||
-	    regs->src0 + src0_len > src_max_ok) {
-		DLOG("invalid_src %x %x %lx %lx\n", regs->src0,
-		      src0_len, src_min_ok, src_max_ok);
-		return 0;
-	}
-	if (regs->src_cfg & PPP_SRC_PLANE_PSEUDOPLNR) {
-		if (regs->src1 < src_min_ok || regs->src1 > src_max_ok ||
-		    regs->src1 + src1_len > src_max_ok) {
-			DLOG("invalid_src1");
-			return 0;
-		}
-	}
-	if (regs->dst0 < dst_min_ok || regs->dst0 > dst_max_ok ||
-	    regs->dst0 + dst0_len > dst_max_ok) {
-		DLOG("invalid_dst");
-		return 0;
-	}
-	if (regs->dst_cfg & PPP_SRC_PLANE_PSEUDOPLNR) {
-		if (regs->dst1 < dst_min_ok || regs->dst1 > dst_max_ok ||
-		    regs->dst1 + dst1_len > dst_max_ok) {
-			DLOG("invalid_dst1");
-			return 0;
-		}
-	}
-	return 1;
-}
-
-
-static void flush_imgs(struct mdp_blit_req *req, struct mdp_regs *regs,
-		       struct file *src_file, struct file *dst_file)
-{
-}
-
-static void get_chroma_addr(struct mdp_img *img, struct mdp_rect *rect,
-			    uint32_t base, uint32_t bpp, uint32_t cfg,
-			    uint32_t *addr, uint32_t *ystride)
-{
-	uint32_t compress_v = Y_TO_CRCB_RATIO(img->format);
-	uint32_t compress_h = 2;
-	uint32_t  offset;
-
-	if (IS_PSEUDOPLNR(img->format)) {
-		offset = (rect->x / compress_h) * compress_h;
-		offset += rect->y == 0 ? 0 :
-			  ((rect->y + 1) / compress_v) * img->width;
-		*addr = base + (img->width * img->height * bpp);
-		*addr += offset * bpp;
-		*ystride |= *ystride << 16;
-	} else {
-		*addr = 0;
-	}
-}
-
-static int send_blit(const struct mdp_info *mdp, struct mdp_blit_req *req,
-		     struct mdp_regs *regs, struct file *src_file,
-		     struct file *dst_file)
-{
-	mdp_writel(mdp, 1, 0x060);
-	mdp_writel(mdp, regs->src_rect, PPP_ADDR_SRC_ROI);
-	mdp_writel(mdp, regs->src0, PPP_ADDR_SRC0);
-	mdp_writel(mdp, regs->src1, PPP_ADDR_SRC1);
-	mdp_writel(mdp, regs->src_ystride, PPP_ADDR_SRC_YSTRIDE);
-	mdp_writel(mdp, regs->src_cfg, PPP_ADDR_SRC_CFG);
-	mdp_writel(mdp, regs->src_pack, PPP_ADDR_SRC_PACK_PATTERN);
-
-	mdp_writel(mdp, regs->op, PPP_ADDR_OPERATION);
-	mdp_writel(mdp, regs->phasex_init, PPP_ADDR_PHASEX_INIT);
-	mdp_writel(mdp, regs->phasey_init, PPP_ADDR_PHASEY_INIT);
-	mdp_writel(mdp, regs->phasex_step, PPP_ADDR_PHASEX_STEP);
-	mdp_writel(mdp, regs->phasey_step, PPP_ADDR_PHASEY_STEP);
-
-	mdp_writel(mdp, (req->alpha << 24) | (req->transp_mask & 0xffffff),
-	       PPP_ADDR_ALPHA_TRANSP);
-
-	mdp_writel(mdp, regs->dst_cfg, PPP_ADDR_DST_CFG);
-	mdp_writel(mdp, regs->dst_pack, PPP_ADDR_DST_PACK_PATTERN);
-	mdp_writel(mdp, regs->dst_rect, PPP_ADDR_DST_ROI);
-	mdp_writel(mdp, regs->dst0, PPP_ADDR_DST0);
-	mdp_writel(mdp, regs->dst1, PPP_ADDR_DST1);
-	mdp_writel(mdp, regs->dst_ystride, PPP_ADDR_DST_YSTRIDE);
-
-	mdp_writel(mdp, regs->edge, PPP_ADDR_EDGE);
-	if (regs->op & PPP_OP_BLEND_ON) {
-		mdp_writel(mdp, regs->dst0, PPP_ADDR_BG0);
-		mdp_writel(mdp, regs->dst1, PPP_ADDR_BG1);
-		mdp_writel(mdp, regs->dst_ystride, PPP_ADDR_BG_YSTRIDE);
-		mdp_writel(mdp, src_img_cfg[req->dst.format], PPP_ADDR_BG_CFG);
-		mdp_writel(mdp, pack_pattern[req->dst.format],
-			   PPP_ADDR_BG_PACK_PATTERN);
-	}
-	flush_imgs(req, regs, src_file, dst_file);
-	mdp_writel(mdp, 0x1000, MDP_DISPLAY0_START);
-	return 0;
-}
-
-int mdp_ppp_blit(const struct mdp_info *mdp, struct mdp_blit_req *req,
-		 struct file *src_file, unsigned long src_start, unsigned long src_len,
-		 struct file *dst_file, unsigned long dst_start, unsigned long dst_len)
-{
-	struct mdp_regs regs = {0};
-
-	if (unlikely(req->src.format >= MDP_IMGTYPE_LIMIT ||
-		     req->dst.format >= MDP_IMGTYPE_LIMIT)) {
-		printk(KERN_ERR "mpd_ppp: img is of wrong format\n");
-		return -EINVAL;
-	}
-
-	if (unlikely(req->src_rect.x > req->src.width ||
-		     req->src_rect.y > req->src.height ||
-		     req->dst_rect.x > req->dst.width ||
-		     req->dst_rect.y > req->dst.height)) {
-		printk(KERN_ERR "mpd_ppp: img rect is outside of img!\n");
-		return -EINVAL;
-	}
-
-	/* set the src image configuration */
-	regs.src_cfg = src_img_cfg[req->src.format];
-	regs.src_cfg |= (req->src_rect.x & 0x1) ? PPP_SRC_BPP_ROI_ODD_X : 0;
-	regs.src_cfg |= (req->src_rect.y & 0x1) ? PPP_SRC_BPP_ROI_ODD_Y : 0;
-	regs.src_rect = (req->src_rect.h << 16) | req->src_rect.w;
-	regs.src_pack = pack_pattern[req->src.format];
-
-	/* set the dest image configuration */
-	regs.dst_cfg = dst_img_cfg[req->dst.format] | PPP_DST_OUT_SEL_AXI;
-	regs.dst_rect = (req->dst_rect.h << 16) | req->dst_rect.w;
-	regs.dst_pack = pack_pattern[req->dst.format];
-
-	/* set src, bpp, start pixel and ystride */
-	regs.src_bpp = bytes_per_pixel[req->src.format];
-	regs.src0 = src_start + req->src.offset;
-	regs.src_ystride = req->src.width * regs.src_bpp;
-	get_chroma_addr(&req->src, &req->src_rect, regs.src0, regs.src_bpp,
-			regs.src_cfg, &regs.src1, &regs.src_ystride);
-	regs.src0 += (req->src_rect.x + (req->src_rect.y * req->src.width)) *
-		      regs.src_bpp;
-
-	/* set dst, bpp, start pixel and ystride */
-	regs.dst_bpp = bytes_per_pixel[req->dst.format];
-	regs.dst0 = dst_start + req->dst.offset;
-	regs.dst_ystride = req->dst.width * regs.dst_bpp;
-	get_chroma_addr(&req->dst, &req->dst_rect, regs.dst0, regs.dst_bpp,
-			regs.dst_cfg, &regs.dst1, &regs.dst_ystride);
-	regs.dst0 += (req->dst_rect.x + (req->dst_rect.y * req->dst.width)) *
-		      regs.dst_bpp;
-
-	if (!valid_src_dst(src_start, src_len, dst_start, dst_len, req,
-			   &regs)) {
-		printk(KERN_ERR "mpd_ppp: final src or dst location is "
-			"invalid, are you trying to make an image too large "
-			"or to place it outside the screen?\n");
-		return -EINVAL;
-	}
-
-	/* set up operation register */
-	regs.op = 0;
-	blit_rotate(req, &regs);
-	blit_convert(req, &regs);
-	if (req->flags & MDP_DITHER)
-		regs.op |= PPP_OP_DITHER_EN;
-	blit_blend(req, &regs);
-	if (blit_scale(mdp, req, &regs)) {
-		printk(KERN_ERR "mpd_ppp: error computing scale for img.\n");
-		return -EINVAL;
-	}
-	blit_blur(mdp, req, &regs);
-	regs.op |= dst_op_chroma[req->dst.format] |
-		   src_op_chroma[req->src.format];
-
-	/* if the image is YCRYCB, the x and w must be even */
-	if (unlikely(req->src.format == MDP_YCRYCB_H2V1)) {
-		req->src_rect.x = req->src_rect.x & (~0x1);
-		req->src_rect.w = req->src_rect.w & (~0x1);
-		req->dst_rect.x = req->dst_rect.x & (~0x1);
-		req->dst_rect.w = req->dst_rect.w & (~0x1);
-	}
-	if (get_edge_cond(req, &regs))
-		return -EINVAL;
-
-	send_blit(mdp, req, &regs, src_file, dst_file);
-	return 0;
-}
diff --git a/drivers/video/fbdev/msm/mdp_scale_tables.c b/drivers/video/fbdev/msm/mdp_scale_tables.c
deleted file mode 100644
index 604783b2e17c..000000000000
--- a/drivers/video/fbdev/msm/mdp_scale_tables.c
+++ /dev/null
@@ -1,766 +0,0 @@
-/* drivers/video/msm_fb/mdp_scale_tables.c
- *
- * Copyright (C) 2007 QUALCOMM Incorporated
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include "mdp_scale_tables.h"
-#include "mdp_hw.h"
-
-struct mdp_table_entry mdp_upscale_table[] = {
-	{ 0x5fffc, 0x0 },
-	{ 0x50200, 0x7fc00000 },
-	{ 0x5fffc, 0xff80000d },
-	{ 0x50204, 0x7ec003f9 },
-	{ 0x5fffc, 0xfec0001c },
-	{ 0x50208, 0x7d4003f3 },
-	{ 0x5fffc, 0xfe40002b },
-	{ 0x5020c, 0x7b8003ed },
-	{ 0x5fffc, 0xfd80003c },
-	{ 0x50210, 0x794003e8 },
-	{ 0x5fffc, 0xfcc0004d },
-	{ 0x50214, 0x76c003e4 },
-	{ 0x5fffc, 0xfc40005f },
-	{ 0x50218, 0x73c003e0 },
-	{ 0x5fffc, 0xfb800071 },
-	{ 0x5021c, 0x708003de },
-	{ 0x5fffc, 0xfac00085 },
-	{ 0x50220, 0x6d0003db },
-	{ 0x5fffc, 0xfa000098 },
-	{ 0x50224, 0x698003d9 },
-	{ 0x5fffc, 0xf98000ac },
-	{ 0x50228, 0x654003d8 },
-	{ 0x5fffc, 0xf8c000c1 },
-	{ 0x5022c, 0x610003d7 },
-	{ 0x5fffc, 0xf84000d5 },
-	{ 0x50230, 0x5c8003d7 },
-	{ 0x5fffc, 0xf7c000e9 },
-	{ 0x50234, 0x580003d7 },
-	{ 0x5fffc, 0xf74000fd },
-	{ 0x50238, 0x534003d8 },
-	{ 0x5fffc, 0xf6c00112 },
-	{ 0x5023c, 0x4e8003d8 },
-	{ 0x5fffc, 0xf6800126 },
-	{ 0x50240, 0x494003da },
-	{ 0x5fffc, 0xf600013a },
-	{ 0x50244, 0x448003db },
-	{ 0x5fffc, 0xf600014d },
-	{ 0x50248, 0x3f4003dd },
-	{ 0x5fffc, 0xf5c00160 },
-	{ 0x5024c, 0x3a4003df },
-	{ 0x5fffc, 0xf5c00172 },
-	{ 0x50250, 0x354003e1 },
-	{ 0x5fffc, 0xf5c00184 },
-	{ 0x50254, 0x304003e3 },
-	{ 0x5fffc, 0xf6000195 },
-	{ 0x50258, 0x2b0003e6 },
-	{ 0x5fffc, 0xf64001a6 },
-	{ 0x5025c, 0x260003e8 },
-	{ 0x5fffc, 0xf6c001b4 },
-	{ 0x50260, 0x214003eb },
-	{ 0x5fffc, 0xf78001c2 },
-	{ 0x50264, 0x1c4003ee },
-	{ 0x5fffc, 0xf80001cf },
-	{ 0x50268, 0x17c003f1 },
-	{ 0x5fffc, 0xf90001db },
-	{ 0x5026c, 0x134003f3 },
-	{ 0x5fffc, 0xfa0001e5 },
-	{ 0x50270, 0xf0003f6 },
-	{ 0x5fffc, 0xfb4001ee },
-	{ 0x50274, 0xac003f9 },
-	{ 0x5fffc, 0xfcc001f5 },
-	{ 0x50278, 0x70003fb },
-	{ 0x5fffc, 0xfe4001fb },
-	{ 0x5027c, 0x34003fe },
-};
-
-static struct mdp_table_entry mdp_downscale_x_table_PT2TOPT4[] = {
-	{ 0x5fffc, 0x740008c },
-	{ 0x50280, 0x33800088 },
-	{ 0x5fffc, 0x800008e },
-	{ 0x50284, 0x33400084 },
-	{ 0x5fffc, 0x8400092 },
-	{ 0x50288, 0x33000080 },
-	{ 0x5fffc, 0x9000094 },
-	{ 0x5028c, 0x3300007b },
-	{ 0x5fffc, 0x9c00098 },
-	{ 0x50290, 0x32400077 },
-	{ 0x5fffc, 0xa40009b },
-	{ 0x50294, 0x32000073 },
-	{ 0x5fffc, 0xb00009d },
-	{ 0x50298,  0x31c0006f },
-	{ 0x5fffc,  0xbc000a0 },
-	{ 0x5029c,  0x3140006b },
-	{ 0x5fffc,  0xc8000a2 },
-	{ 0x502a0,  0x31000067 },
-	{ 0x5fffc,  0xd8000a5 },
-	{ 0x502a4,  0x30800062 },
-	{ 0x5fffc,  0xe4000a8 },
-	{ 0x502a8,  0x2fc0005f },
-	{ 0x5fffc,  0xec000aa },
-	{ 0x502ac,  0x2fc0005b },
-	{ 0x5fffc,  0xf8000ad },
-	{ 0x502b0,  0x2f400057 },
-	{ 0x5fffc,  0x108000b0 },
-	{ 0x502b4,  0x2e400054 },
-	{ 0x5fffc,  0x114000b2 },
-	{ 0x502b8,  0x2e000050 },
-	{ 0x5fffc,  0x124000b4 },
-	{ 0x502bc,  0x2d80004c },
-	{ 0x5fffc,  0x130000b6 },
-	{ 0x502c0,  0x2d000049 },
-	{ 0x5fffc,  0x140000b8 },
-	{ 0x502c4,  0x2c800045 },
-	{ 0x5fffc,  0x150000b9 },
-	{ 0x502c8,  0x2c000042 },
-	{ 0x5fffc,  0x15c000bd },
-	{ 0x502cc,  0x2b40003e },
-	{ 0x5fffc,  0x16c000bf },
-	{ 0x502d0,  0x2a80003b },
-	{ 0x5fffc,  0x17c000bf },
-	{ 0x502d4,  0x2a000039 },
-	{ 0x5fffc,  0x188000c2 },
-	{ 0x502d8,  0x29400036 },
-	{ 0x5fffc,  0x19c000c4 },
-	{ 0x502dc,  0x28800032 },
-	{ 0x5fffc,  0x1ac000c5 },
-	{ 0x502e0,  0x2800002f },
-	{ 0x5fffc,  0x1bc000c7 },
-	{ 0x502e4,  0x2740002c },
-	{ 0x5fffc,  0x1cc000c8 },
-	{ 0x502e8,  0x26c00029 },
-	{ 0x5fffc,  0x1dc000c9 },
-	{ 0x502ec,  0x26000027 },
-	{ 0x5fffc,  0x1ec000cc },
-	{ 0x502f0,  0x25000024 },
-	{ 0x5fffc,  0x200000cc },
-	{ 0x502f4,  0x24800021 },
-	{ 0x5fffc,  0x210000cd },
-	{ 0x502f8,  0x23800020 },
-	{ 0x5fffc,  0x220000ce },
-	{ 0x502fc,  0x2300001d },
-};
-
-static struct mdp_table_entry mdp_downscale_x_table_PT4TOPT6[] = {
-	{ 0x5fffc,  0x740008c },
-	{ 0x50280,  0x33800088 },
-	{ 0x5fffc,  0x800008e },
-	{ 0x50284,  0x33400084 },
-	{ 0x5fffc,  0x8400092 },
-	{ 0x50288,  0x33000080 },
-	{ 0x5fffc,  0x9000094 },
-	{ 0x5028c,  0x3300007b },
-	{ 0x5fffc,  0x9c00098 },
-	{ 0x50290,  0x32400077 },
-	{ 0x5fffc,  0xa40009b },
-	{ 0x50294,  0x32000073 },
-	{ 0x5fffc,  0xb00009d },
-	{ 0x50298,  0x31c0006f },
-	{ 0x5fffc,  0xbc000a0 },
-	{ 0x5029c,  0x3140006b },
-	{ 0x5fffc,  0xc8000a2 },
-	{ 0x502a0,  0x31000067 },
-	{ 0x5fffc,  0xd8000a5 },
-	{ 0x502a4,  0x30800062 },
-	{ 0x5fffc,  0xe4000a8 },
-	{ 0x502a8,  0x2fc0005f },
-	{ 0x5fffc,  0xec000aa },
-	{ 0x502ac,  0x2fc0005b },
-	{ 0x5fffc,  0xf8000ad },
-	{ 0x502b0,  0x2f400057 },
-	{ 0x5fffc,  0x108000b0 },
-	{ 0x502b4,  0x2e400054 },
-	{ 0x5fffc,  0x114000b2 },
-	{ 0x502b8,  0x2e000050 },
-	{ 0x5fffc,  0x124000b4 },
-	{ 0x502bc,  0x2d80004c },
-	{ 0x5fffc,  0x130000b6 },
-	{ 0x502c0,  0x2d000049 },
-	{ 0x5fffc,  0x140000b8 },
-	{ 0x502c4,  0x2c800045 },
-	{ 0x5fffc,  0x150000b9 },
-	{ 0x502c8,  0x2c000042 },
-	{ 0x5fffc,  0x15c000bd },
-	{ 0x502cc,  0x2b40003e },
-	{ 0x5fffc,  0x16c000bf },
-	{ 0x502d0,  0x2a80003b },
-	{ 0x5fffc,  0x17c000bf },
-	{ 0x502d4,  0x2a000039 },
-	{ 0x5fffc,  0x188000c2 },
-	{ 0x502d8,  0x29400036 },
-	{ 0x5fffc,  0x19c000c4 },
-	{ 0x502dc,  0x28800032 },
-	{ 0x5fffc,  0x1ac000c5 },
-	{ 0x502e0,  0x2800002f },
-	{ 0x5fffc,  0x1bc000c7 },
-	{ 0x502e4,  0x2740002c },
-	{ 0x5fffc,  0x1cc000c8 },
-	{ 0x502e8,  0x26c00029 },
-	{ 0x5fffc,  0x1dc000c9 },
-	{ 0x502ec,  0x26000027 },
-	{ 0x5fffc,  0x1ec000cc },
-	{ 0x502f0,  0x25000024 },
-	{ 0x5fffc,  0x200000cc },
-	{ 0x502f4,  0x24800021 },
-	{ 0x5fffc,  0x210000cd },
-	{ 0x502f8,  0x23800020 },
-	{ 0x5fffc,  0x220000ce },
-	{ 0x502fc,  0x2300001d },
-};
-
-static struct mdp_table_entry mdp_downscale_x_table_PT6TOPT8[] = {
-	{ 0x5fffc,  0xfe000070 },
-	{ 0x50280,  0x4bc00068 },
-	{ 0x5fffc,  0xfe000078 },
-	{ 0x50284,  0x4bc00060 },
-	{ 0x5fffc,  0xfe000080 },
-	{ 0x50288,  0x4b800059 },
-	{ 0x5fffc,  0xfe000089 },
-	{ 0x5028c,  0x4b000052 },
-	{ 0x5fffc,  0xfe400091 },
-	{ 0x50290,  0x4a80004b },
-	{ 0x5fffc,  0xfe40009a },
-	{ 0x50294,  0x4a000044 },
-	{ 0x5fffc,  0xfe8000a3 },
-	{ 0x50298,  0x4940003d },
-	{ 0x5fffc,  0xfec000ac },
-	{ 0x5029c,  0x48400037 },
-	{ 0x5fffc,  0xff0000b4 },
-	{ 0x502a0,  0x47800031 },
-	{ 0x5fffc,  0xff8000bd },
-	{ 0x502a4,  0x4640002b },
-	{ 0x5fffc,  0xc5 },
-	{ 0x502a8,  0x45000026 },
-	{ 0x5fffc,  0x8000ce },
-	{ 0x502ac,  0x43800021 },
-	{ 0x5fffc,  0x10000d6 },
-	{ 0x502b0,  0x4240001c },
-	{ 0x5fffc,  0x18000df },
-	{ 0x502b4,  0x40800018 },
-	{ 0x5fffc,  0x24000e6 },
-	{ 0x502b8,  0x3f000014 },
-	{ 0x5fffc,  0x30000ee },
-	{ 0x502bc,  0x3d400010 },
-	{ 0x5fffc,  0x40000f5 },
-	{ 0x502c0,  0x3b80000c },
-	{ 0x5fffc,  0x50000fc },
-	{ 0x502c4,  0x39800009 },
-	{ 0x5fffc,  0x6000102 },
-	{ 0x502c8,  0x37c00006 },
-	{ 0x5fffc,  0x7000109 },
-	{ 0x502cc,  0x35800004 },
-	{ 0x5fffc,  0x840010e },
-	{ 0x502d0,  0x33800002 },
-	{ 0x5fffc,  0x9800114 },
-	{ 0x502d4,  0x31400000 },
-	{ 0x5fffc,  0xac00119 },
-	{ 0x502d8,  0x2f4003fe },
-	{ 0x5fffc,  0xc40011e },
-	{ 0x502dc,  0x2d0003fc },
-	{ 0x5fffc,  0xdc00121 },
-	{ 0x502e0,  0x2b0003fb },
-	{ 0x5fffc,  0xf400125 },
-	{ 0x502e4,  0x28c003fa },
-	{ 0x5fffc,  0x11000128 },
-	{ 0x502e8,  0x268003f9 },
-	{ 0x5fffc,  0x12c0012a },
-	{ 0x502ec,  0x244003f9 },
-	{ 0x5fffc,  0x1480012c },
-	{ 0x502f0,  0x224003f8 },
-	{ 0x5fffc,  0x1640012e },
-	{ 0x502f4,  0x200003f8 },
-	{ 0x5fffc,  0x1800012f },
-	{ 0x502f8,  0x1e0003f8 },
-	{ 0x5fffc,  0x1a00012f },
-	{ 0x502fc,  0x1c0003f8 },
-};
-
-static struct mdp_table_entry mdp_downscale_x_table_PT8TO1[] = {
-	{ 0x5fffc,  0x0 },
-	{ 0x50280,  0x7fc00000 },
-	{ 0x5fffc,  0xff80000d },
-	{ 0x50284,  0x7ec003f9 },
-	{ 0x5fffc,  0xfec0001c },
-	{ 0x50288,  0x7d4003f3 },
-	{ 0x5fffc,  0xfe40002b },
-	{ 0x5028c,  0x7b8003ed },
-	{ 0x5fffc,  0xfd80003c },
-	{ 0x50290,  0x794003e8 },
-	{ 0x5fffc,  0xfcc0004d },
-	{ 0x50294,  0x76c003e4 },
-	{ 0x5fffc,  0xfc40005f },
-	{ 0x50298,  0x73c003e0 },
-	{ 0x5fffc,  0xfb800071 },
-	{ 0x5029c,  0x708003de },
-	{ 0x5fffc,  0xfac00085 },
-	{ 0x502a0,  0x6d0003db },
-	{ 0x5fffc,  0xfa000098 },
-	{ 0x502a4,  0x698003d9 },
-	{ 0x5fffc,  0xf98000ac },
-	{ 0x502a8,  0x654003d8 },
-	{ 0x5fffc,  0xf8c000c1 },
-	{ 0x502ac,  0x610003d7 },
-	{ 0x5fffc,  0xf84000d5 },
-	{ 0x502b0,  0x5c8003d7 },
-	{ 0x5fffc,  0xf7c000e9 },
-	{ 0x502b4,  0x580003d7 },
-	{ 0x5fffc,  0xf74000fd },
-	{ 0x502b8,  0x534003d8 },
-	{ 0x5fffc,  0xf6c00112 },
-	{ 0x502bc,  0x4e8003d8 },
-	{ 0x5fffc,  0xf6800126 },
-	{ 0x502c0,  0x494003da },
-	{ 0x5fffc,  0xf600013a },
-	{ 0x502c4,  0x448003db },
-	{ 0x5fffc,  0xf600014d },
-	{ 0x502c8,  0x3f4003dd },
-	{ 0x5fffc,  0xf5c00160 },
-	{ 0x502cc,  0x3a4003df },
-	{ 0x5fffc,  0xf5c00172 },
-	{ 0x502d0,  0x354003e1 },
-	{ 0x5fffc,  0xf5c00184 },
-	{ 0x502d4,  0x304003e3 },
-	{ 0x5fffc,  0xf6000195 },
-	{ 0x502d8,  0x2b0003e6 },
-	{ 0x5fffc,  0xf64001a6 },
-	{ 0x502dc,  0x260003e8 },
-	{ 0x5fffc,  0xf6c001b4 },
-	{ 0x502e0,  0x214003eb },
-	{ 0x5fffc,  0xf78001c2 },
-	{ 0x502e4,  0x1c4003ee },
-	{ 0x5fffc,  0xf80001cf },
-	{ 0x502e8,  0x17c003f1 },
-	{ 0x5fffc,  0xf90001db },
-	{ 0x502ec,  0x134003f3 },
-	{ 0x5fffc,  0xfa0001e5 },
-	{ 0x502f0,  0xf0003f6 },
-	{ 0x5fffc,  0xfb4001ee },
-	{ 0x502f4,  0xac003f9 },
-	{ 0x5fffc,  0xfcc001f5 },
-	{ 0x502f8,  0x70003fb },
-	{ 0x5fffc,  0xfe4001fb },
-	{ 0x502fc,  0x34003fe },
-};
-
-struct mdp_table_entry *mdp_downscale_x_table[MDP_DOWNSCALE_MAX] = {
-	[MDP_DOWNSCALE_PT2TOPT4] = mdp_downscale_x_table_PT2TOPT4,
-	[MDP_DOWNSCALE_PT4TOPT6] = mdp_downscale_x_table_PT4TOPT6,
-	[MDP_DOWNSCALE_PT6TOPT8] = mdp_downscale_x_table_PT6TOPT8,
-	[MDP_DOWNSCALE_PT8TO1]  = mdp_downscale_x_table_PT8TO1,
-};
-
-static struct mdp_table_entry mdp_downscale_y_table_PT2TOPT4[] = {
-	{ 0x5fffc,  0x740008c },
-	{ 0x50300,  0x33800088 },
-	{ 0x5fffc,  0x800008e },
-	{ 0x50304,  0x33400084 },
-	{ 0x5fffc,  0x8400092 },
-	{ 0x50308,  0x33000080 },
-	{ 0x5fffc,  0x9000094 },
-	{ 0x5030c,  0x3300007b },
-	{ 0x5fffc,  0x9c00098 },
-	{ 0x50310,  0x32400077 },
-	{ 0x5fffc,  0xa40009b },
-	{ 0x50314,  0x32000073 },
-	{ 0x5fffc,  0xb00009d },
-	{ 0x50318,  0x31c0006f },
-	{ 0x5fffc,  0xbc000a0 },
-	{ 0x5031c,  0x3140006b },
-	{ 0x5fffc,  0xc8000a2 },
-	{ 0x50320,  0x31000067 },
-	{ 0x5fffc,  0xd8000a5 },
-	{ 0x50324,  0x30800062 },
-	{ 0x5fffc,  0xe4000a8 },
-	{ 0x50328,  0x2fc0005f },
-	{ 0x5fffc,  0xec000aa },
-	{ 0x5032c,  0x2fc0005b },
-	{ 0x5fffc,  0xf8000ad },
-	{ 0x50330,  0x2f400057 },
-	{ 0x5fffc,  0x108000b0 },
-	{ 0x50334,  0x2e400054 },
-	{ 0x5fffc,  0x114000b2 },
-	{ 0x50338,  0x2e000050 },
-	{ 0x5fffc,  0x124000b4 },
-	{ 0x5033c,  0x2d80004c },
-	{ 0x5fffc,  0x130000b6 },
-	{ 0x50340,  0x2d000049 },
-	{ 0x5fffc,  0x140000b8 },
-	{ 0x50344,  0x2c800045 },
-	{ 0x5fffc,  0x150000b9 },
-	{ 0x50348,  0x2c000042 },
-	{ 0x5fffc,  0x15c000bd },
-	{ 0x5034c,  0x2b40003e },
-	{ 0x5fffc,  0x16c000bf },
-	{ 0x50350,  0x2a80003b },
-	{ 0x5fffc,  0x17c000bf },
-	{ 0x50354,  0x2a000039 },
-	{ 0x5fffc,  0x188000c2 },
-	{ 0x50358,  0x29400036 },
-	{ 0x5fffc,  0x19c000c4 },
-	{ 0x5035c,  0x28800032 },
-	{ 0x5fffc,  0x1ac000c5 },
-	{ 0x50360,  0x2800002f },
-	{ 0x5fffc,  0x1bc000c7 },
-	{ 0x50364,  0x2740002c },
-	{ 0x5fffc,  0x1cc000c8 },
-	{ 0x50368,  0x26c00029 },
-	{ 0x5fffc,  0x1dc000c9 },
-	{ 0x5036c,  0x26000027 },
-	{ 0x5fffc,  0x1ec000cc },
-	{ 0x50370,  0x25000024 },
-	{ 0x5fffc,  0x200000cc },
-	{ 0x50374,  0x24800021 },
-	{ 0x5fffc,  0x210000cd },
-	{ 0x50378,  0x23800020 },
-	{ 0x5fffc,  0x220000ce },
-	{ 0x5037c,  0x2300001d },
-};
-
-static struct mdp_table_entry mdp_downscale_y_table_PT4TOPT6[] = {
-	{ 0x5fffc,  0x740008c },
-	{ 0x50300,  0x33800088 },
-	{ 0x5fffc,  0x800008e },
-	{ 0x50304,  0x33400084 },
-	{ 0x5fffc,  0x8400092 },
-	{ 0x50308,  0x33000080 },
-	{ 0x5fffc,  0x9000094 },
-	{ 0x5030c,  0x3300007b },
-	{ 0x5fffc,  0x9c00098 },
-	{ 0x50310,  0x32400077 },
-	{ 0x5fffc,  0xa40009b },
-	{ 0x50314,  0x32000073 },
-	{ 0x5fffc,  0xb00009d },
-	{ 0x50318,  0x31c0006f },
-	{ 0x5fffc,  0xbc000a0 },
-	{ 0x5031c,  0x3140006b },
-	{ 0x5fffc,  0xc8000a2 },
-	{ 0x50320,  0x31000067 },
-	{ 0x5fffc,  0xd8000a5 },
-	{ 0x50324,  0x30800062 },
-	{ 0x5fffc,  0xe4000a8 },
-	{ 0x50328,  0x2fc0005f },
-	{ 0x5fffc,  0xec000aa },
-	{ 0x5032c,  0x2fc0005b },
-	{ 0x5fffc,  0xf8000ad },
-	{ 0x50330,  0x2f400057 },
-	{ 0x5fffc,  0x108000b0 },
-	{ 0x50334,  0x2e400054 },
-	{ 0x5fffc,  0x114000b2 },
-	{ 0x50338,  0x2e000050 },
-	{ 0x5fffc,  0x124000b4 },
-	{ 0x5033c,  0x2d80004c },
-	{ 0x5fffc,  0x130000b6 },
-	{ 0x50340,  0x2d000049 },
-	{ 0x5fffc,  0x140000b8 },
-	{ 0x50344,  0x2c800045 },
-	{ 0x5fffc,  0x150000b9 },
-	{ 0x50348,  0x2c000042 },
-	{ 0x5fffc,  0x15c000bd },
-	{ 0x5034c,  0x2b40003e },
-	{ 0x5fffc,  0x16c000bf },
-	{ 0x50350,  0x2a80003b },
-	{ 0x5fffc,  0x17c000bf },
-	{ 0x50354,  0x2a000039 },
-	{ 0x5fffc,  0x188000c2 },
-	{ 0x50358,  0x29400036 },
-	{ 0x5fffc,  0x19c000c4 },
-	{ 0x5035c,  0x28800032 },
-	{ 0x5fffc,  0x1ac000c5 },
-	{ 0x50360,  0x2800002f },
-	{ 0x5fffc,  0x1bc000c7 },
-	{ 0x50364,  0x2740002c },
-	{ 0x5fffc,  0x1cc000c8 },
-	{ 0x50368,  0x26c00029 },
-	{ 0x5fffc,  0x1dc000c9 },
-	{ 0x5036c,  0x26000027 },
-	{ 0x5fffc,  0x1ec000cc },
-	{ 0x50370,  0x25000024 },
-	{ 0x5fffc,  0x200000cc },
-	{ 0x50374,  0x24800021 },
-	{ 0x5fffc,  0x210000cd },
-	{ 0x50378,  0x23800020 },
-	{ 0x5fffc,  0x220000ce },
-	{ 0x5037c,  0x2300001d },
-};
-
-static struct mdp_table_entry mdp_downscale_y_table_PT6TOPT8[] = {
-	{ 0x5fffc,  0xfe000070 },
-	{ 0x50300,  0x4bc00068 },
-	{ 0x5fffc,  0xfe000078 },
-	{ 0x50304,  0x4bc00060 },
-	{ 0x5fffc,  0xfe000080 },
-	{ 0x50308,  0x4b800059 },
-	{ 0x5fffc,  0xfe000089 },
-	{ 0x5030c,  0x4b000052 },
-	{ 0x5fffc,  0xfe400091 },
-	{ 0x50310,  0x4a80004b },
-	{ 0x5fffc,  0xfe40009a },
-	{ 0x50314,  0x4a000044 },
-	{ 0x5fffc,  0xfe8000a3 },
-	{ 0x50318,  0x4940003d },
-	{ 0x5fffc,  0xfec000ac },
-	{ 0x5031c,  0x48400037 },
-	{ 0x5fffc,  0xff0000b4 },
-	{ 0x50320,  0x47800031 },
-	{ 0x5fffc,  0xff8000bd },
-	{ 0x50324,  0x4640002b },
-	{ 0x5fffc,  0xc5 },
-	{ 0x50328,  0x45000026 },
-	{ 0x5fffc,  0x8000ce },
-	{ 0x5032c,  0x43800021 },
-	{ 0x5fffc,  0x10000d6 },
-	{ 0x50330,  0x4240001c },
-	{ 0x5fffc,  0x18000df },
-	{ 0x50334,  0x40800018 },
-	{ 0x5fffc,  0x24000e6 },
-	{ 0x50338,  0x3f000014 },
-	{ 0x5fffc,  0x30000ee },
-	{ 0x5033c,  0x3d400010 },
-	{ 0x5fffc,  0x40000f5 },
-	{ 0x50340,  0x3b80000c },
-	{ 0x5fffc,  0x50000fc },
-	{ 0x50344,  0x39800009 },
-	{ 0x5fffc,  0x6000102 },
-	{ 0x50348,  0x37c00006 },
-	{ 0x5fffc,  0x7000109 },
-	{ 0x5034c,  0x35800004 },
-	{ 0x5fffc,  0x840010e },
-	{ 0x50350,  0x33800002 },
-	{ 0x5fffc,  0x9800114 },
-	{ 0x50354,  0x31400000 },
-	{ 0x5fffc,  0xac00119 },
-	{ 0x50358,  0x2f4003fe },
-	{ 0x5fffc,  0xc40011e },
-	{ 0x5035c,  0x2d0003fc },
-	{ 0x5fffc,  0xdc00121 },
-	{ 0x50360,  0x2b0003fb },
-	{ 0x5fffc,  0xf400125 },
-	{ 0x50364,  0x28c003fa },
-	{ 0x5fffc,  0x11000128 },
-	{ 0x50368,  0x268003f9 },
-	{ 0x5fffc,  0x12c0012a },
-	{ 0x5036c,  0x244003f9 },
-	{ 0x5fffc,  0x1480012c },
-	{ 0x50370,  0x224003f8 },
-	{ 0x5fffc,  0x1640012e },
-	{ 0x50374,  0x200003f8 },
-	{ 0x5fffc,  0x1800012f },
-	{ 0x50378,  0x1e0003f8 },
-	{ 0x5fffc,  0x1a00012f },
-	{ 0x5037c,  0x1c0003f8 },
-};
-
-static struct mdp_table_entry mdp_downscale_y_table_PT8TO1[] = {
-	{ 0x5fffc,  0x0 },
-	{ 0x50300,  0x7fc00000 },
-	{ 0x5fffc,  0xff80000d },
-	{ 0x50304,  0x7ec003f9 },
-	{ 0x5fffc,  0xfec0001c },
-	{ 0x50308,  0x7d4003f3 },
-	{ 0x5fffc,  0xfe40002b },
-	{ 0x5030c,  0x7b8003ed },
-	{ 0x5fffc,  0xfd80003c },
-	{ 0x50310,  0x794003e8 },
-	{ 0x5fffc,  0xfcc0004d },
-	{ 0x50314,  0x76c003e4 },
-	{ 0x5fffc,  0xfc40005f },
-	{ 0x50318,  0x73c003e0 },
-	{ 0x5fffc,  0xfb800071 },
-	{ 0x5031c,  0x708003de },
-	{ 0x5fffc,  0xfac00085 },
-	{ 0x50320,  0x6d0003db },
-	{ 0x5fffc,  0xfa000098 },
-	{ 0x50324,  0x698003d9 },
-	{ 0x5fffc,  0xf98000ac },
-	{ 0x50328,  0x654003d8 },
-	{ 0x5fffc,  0xf8c000c1 },
-	{ 0x5032c,  0x610003d7 },
-	{ 0x5fffc,  0xf84000d5 },
-	{ 0x50330,  0x5c8003d7 },
-	{ 0x5fffc,  0xf7c000e9 },
-	{ 0x50334,  0x580003d7 },
-	{ 0x5fffc,  0xf74000fd },
-	{ 0x50338,  0x534003d8 },
-	{ 0x5fffc,  0xf6c00112 },
-	{ 0x5033c,  0x4e8003d8 },
-	{ 0x5fffc,  0xf6800126 },
-	{ 0x50340,  0x494003da },
-	{ 0x5fffc,  0xf600013a },
-	{ 0x50344,  0x448003db },
-	{ 0x5fffc,  0xf600014d },
-	{ 0x50348,  0x3f4003dd },
-	{ 0x5fffc,  0xf5c00160 },
-	{ 0x5034c,  0x3a4003df },
-	{ 0x5fffc,  0xf5c00172 },
-	{ 0x50350,  0x354003e1 },
-	{ 0x5fffc,  0xf5c00184 },
-	{ 0x50354,  0x304003e3 },
-	{ 0x5fffc,  0xf6000195 },
-	{ 0x50358,  0x2b0003e6 },
-	{ 0x5fffc,  0xf64001a6 },
-	{ 0x5035c,  0x260003e8 },
-	{ 0x5fffc,  0xf6c001b4 },
-	{ 0x50360,  0x214003eb },
-	{ 0x5fffc,  0xf78001c2 },
-	{ 0x50364,  0x1c4003ee },
-	{ 0x5fffc,  0xf80001cf },
-	{ 0x50368,  0x17c003f1 },
-	{ 0x5fffc,  0xf90001db },
-	{ 0x5036c,  0x134003f3 },
-	{ 0x5fffc,  0xfa0001e5 },
-	{ 0x50370,  0xf0003f6 },
-	{ 0x5fffc,  0xfb4001ee },
-	{ 0x50374,  0xac003f9 },
-	{ 0x5fffc,  0xfcc001f5 },
-	{ 0x50378,  0x70003fb },
-	{ 0x5fffc,  0xfe4001fb },
-	{ 0x5037c,  0x34003fe },
-};
-
-struct mdp_table_entry *mdp_downscale_y_table[MDP_DOWNSCALE_MAX] = {
-	[MDP_DOWNSCALE_PT2TOPT4] = mdp_downscale_y_table_PT2TOPT4,
-	[MDP_DOWNSCALE_PT4TOPT6] = mdp_downscale_y_table_PT4TOPT6,
-	[MDP_DOWNSCALE_PT6TOPT8] = mdp_downscale_y_table_PT6TOPT8,
-	[MDP_DOWNSCALE_PT8TO1]  = mdp_downscale_y_table_PT8TO1,
-};
-
-struct mdp_table_entry mdp_gaussian_blur_table[] = {
-	/* max variance */
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50280, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50284, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50288, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5028c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50290, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50294, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50298, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5029c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502a0, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502a4, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502a8, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502ac, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502b0, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502b4, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502b8, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502bc, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502c0, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502c4, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502c8, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502cc, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502d0, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502d4, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502d8, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502dc, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502e0, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502e4, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502e8, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502ec, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502f0, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502f4, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502f8, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x502fc, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50300, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50304, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50308, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5030c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50310, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50314, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50318, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5031c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50320, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50324, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50328, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5032c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50330, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50334, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50338, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5033c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50340, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50344, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50348, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5034c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50350, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50354, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50358, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5035c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50360, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50364, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50368, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5036c, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50370, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50374, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x50378, 0x20000080 },
-	{ 0x5fffc, 0x20000080 },
-	{ 0x5037c, 0x20000080 },
-};
diff --git a/drivers/video/fbdev/msm/mdp_scale_tables.h b/drivers/video/fbdev/msm/mdp_scale_tables.h
deleted file mode 100644
index 34077b1af603..000000000000
--- a/drivers/video/fbdev/msm/mdp_scale_tables.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* drivers/video/msm_fb/mdp_scale_tables.h
- *
- * Copyright (C) 2007 QUALCOMM Incorporated
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-#ifndef _MDP_SCALE_TABLES_H_
-#define _MDP_SCALE_TABLES_H_
-
-#include <linux/types.h>
-struct mdp_table_entry {
-	uint32_t reg;
-	uint32_t val;
-};
-
-extern struct mdp_table_entry mdp_upscale_table[64];
-
-enum {
-	MDP_DOWNSCALE_PT2TOPT4,
-	MDP_DOWNSCALE_PT4TOPT6,
-	MDP_DOWNSCALE_PT6TOPT8,
-	MDP_DOWNSCALE_PT8TO1,
-	MDP_DOWNSCALE_MAX,
-};
-
-extern struct mdp_table_entry *mdp_downscale_x_table[MDP_DOWNSCALE_MAX];
-extern struct mdp_table_entry *mdp_downscale_y_table[MDP_DOWNSCALE_MAX];
-extern struct mdp_table_entry mdp_gaussian_blur_table[];
-
-#endif
diff --git a/drivers/video/fbdev/msm/msm_fb.c b/drivers/video/fbdev/msm/msm_fb.c
deleted file mode 100644
index 2979d7e72126..000000000000
--- a/drivers/video/fbdev/msm/msm_fb.c
+++ /dev/null
@@ -1,659 +0,0 @@
-/* drivers/video/msm/msm_fb.c
- *
- * Core MSM framebuffer driver.
- *
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/platform_device.h>
-#include <linux/module.h>
-#include <linux/fb.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-
-#include <linux/freezer.h>
-#include <linux/wait.h>
-#include <linux/msm_mdp.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-#include <linux/platform_data/video-msm_fb.h>
-#include <linux/workqueue.h>
-#include <linux/clk.h>
-#include <linux/debugfs.h>
-#include <linux/dma-mapping.h>
-
-#define PRINT_FPS 0
-#define PRINT_BLIT_TIME 0
-
-#define SLEEPING 0x4
-#define UPDATING 0x3
-#define FULL_UPDATE_DONE 0x2
-#define WAKING 0x1
-#define AWAKE 0x0
-
-#define NONE 0
-#define SUSPEND_RESUME 0x1
-#define FPS 0x2
-#define BLIT_TIME 0x4
-#define SHOW_UPDATES 0x8
-
-#define DLOG(mask, fmt, args...) \
-do { \
-	if (msmfb_debug_mask & mask) \
-		printk(KERN_INFO "msmfb: "fmt, ##args); \
-} while (0)
-
-static int msmfb_debug_mask;
-module_param_named(msmfb_debug_mask, msmfb_debug_mask, int,
-		   S_IRUGO | S_IWUSR | S_IWGRP);
-
-struct mdp_device *mdp;
-
-struct msmfb_info {
-	struct fb_info *fb;
-	struct msm_panel_data *panel;
-	int xres;
-	int yres;
-	unsigned output_format;
-	unsigned yoffset;
-	unsigned frame_requested;
-	unsigned frame_done;
-	int sleeping;
-	unsigned update_frame;
-	struct {
-		int left;
-		int top;
-		int eright; /* exclusive */
-		int ebottom; /* exclusive */
-	} update_info;
-	char *black;
-
-	spinlock_t update_lock;
-	struct mutex panel_init_lock;
-	wait_queue_head_t frame_wq;
-	struct work_struct resume_work;
-	struct msmfb_callback dma_callback;
-	struct msmfb_callback vsync_callback;
-	struct hrtimer fake_vsync;
-	ktime_t vsync_request_time;
-};
-
-static int msmfb_open(struct fb_info *info, int user)
-{
-	return 0;
-}
-
-static int msmfb_release(struct fb_info *info, int user)
-{
-	return 0;
-}
-
-/* Called from dma interrupt handler, must not sleep */
-static void msmfb_handle_dma_interrupt(struct msmfb_callback *callback)
-{
-	unsigned long irq_flags;
-	struct msmfb_info *msmfb  = container_of(callback, struct msmfb_info,
-					       dma_callback);
-
-	spin_lock_irqsave(&msmfb->update_lock, irq_flags);
-	msmfb->frame_done = msmfb->frame_requested;
-	if (msmfb->sleeping == UPDATING &&
-	    msmfb->frame_done == msmfb->update_frame) {
-		DLOG(SUSPEND_RESUME, "full update completed\n");
-		schedule_work(&msmfb->resume_work);
-	}
-	spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-	wake_up(&msmfb->frame_wq);
-}
-
-static int msmfb_start_dma(struct msmfb_info *msmfb)
-{
-	uint32_t x, y, w, h;
-	unsigned addr;
-	unsigned long irq_flags;
-	uint32_t yoffset;
-	s64 time_since_request;
-	struct msm_panel_data *panel = msmfb->panel;
-
-	spin_lock_irqsave(&msmfb->update_lock, irq_flags);
-	time_since_request = ktime_to_ns(ktime_sub(ktime_get(),
-			     msmfb->vsync_request_time));
-	if (time_since_request > 20 * NSEC_PER_MSEC) {
-		uint32_t us;
-		us = do_div(time_since_request, NSEC_PER_MSEC) / NSEC_PER_USEC;
-		printk(KERN_WARNING "msmfb_start_dma %lld.%03u ms after vsync "
-			"request\n", time_since_request, us);
-	}
-	if (msmfb->frame_done == msmfb->frame_requested) {
-		spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-		return -1;
-	}
-	if (msmfb->sleeping == SLEEPING) {
-		DLOG(SUSPEND_RESUME, "tried to start dma while asleep\n");
-		spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-		return -1;
-	}
-	x = msmfb->update_info.left;
-	y = msmfb->update_info.top;
-	w = msmfb->update_info.eright - x;
-	h = msmfb->update_info.ebottom - y;
-	yoffset = msmfb->yoffset;
-	msmfb->update_info.left = msmfb->xres + 1;
-	msmfb->update_info.top = msmfb->yres + 1;
-	msmfb->update_info.eright = 0;
-	msmfb->update_info.ebottom = 0;
-	if (unlikely(w > msmfb->xres || h > msmfb->yres ||
-		     w == 0 || h == 0)) {
-		printk(KERN_INFO "invalid update: %d %d %d "
-				"%d\n", x, y, w, h);
-		msmfb->frame_done = msmfb->frame_requested;
-		goto error;
-	}
-	spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-
-	addr = ((msmfb->xres * (yoffset + y) + x) * 2);
-	mdp->dma(mdp, addr + msmfb->fb->fix.smem_start,
-		 msmfb->xres * 2, w, h, x, y, &msmfb->dma_callback,
-		 panel->interface_type);
-	return 0;
-error:
-	spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-	/* some clients need to clear their vsync interrupt */
-	if (panel->clear_vsync)
-		panel->clear_vsync(panel);
-	wake_up(&msmfb->frame_wq);
-	return 0;
-}
-
-/* Called from esync interrupt handler, must not sleep */
-static void msmfb_handle_vsync_interrupt(struct msmfb_callback *callback)
-{
-	struct msmfb_info *msmfb = container_of(callback, struct msmfb_info,
-					       vsync_callback);
-	msmfb_start_dma(msmfb);
-}
-
-static enum hrtimer_restart msmfb_fake_vsync(struct hrtimer *timer)
-{
-	struct msmfb_info *msmfb  = container_of(timer, struct msmfb_info,
-					       fake_vsync);
-	msmfb_start_dma(msmfb);
-	return HRTIMER_NORESTART;
-}
-
-static void msmfb_pan_update(struct fb_info *info, uint32_t left, uint32_t top,
-			     uint32_t eright, uint32_t ebottom,
-			     uint32_t yoffset, int pan_display)
-{
-	struct msmfb_info *msmfb = info->par;
-	struct msm_panel_data *panel = msmfb->panel;
-	unsigned long irq_flags;
-	int sleeping;
-	int retry = 1;
-
-	DLOG(SHOW_UPDATES, "update %d %d %d %d %d %d\n",
-		left, top, eright, ebottom, yoffset, pan_display);
-restart:
-	spin_lock_irqsave(&msmfb->update_lock, irq_flags);
-
-	/* if we are sleeping, on a pan_display wait 10ms (to throttle back
-	 * drawing otherwise return */
-	if (msmfb->sleeping == SLEEPING) {
-		DLOG(SUSPEND_RESUME, "drawing while asleep\n");
-		spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-		if (pan_display)
-			wait_event_interruptible_timeout(msmfb->frame_wq,
-				msmfb->sleeping != SLEEPING, HZ/10);
-		return;
-	}
-
-	sleeping = msmfb->sleeping;
-	/* on a full update, if the last frame has not completed, wait for it */
-	if ((pan_display && msmfb->frame_requested != msmfb->frame_done) ||
-			    sleeping == UPDATING) {
-		int ret;
-		spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-		ret = wait_event_interruptible_timeout(msmfb->frame_wq,
-			msmfb->frame_done == msmfb->frame_requested &&
-			msmfb->sleeping != UPDATING, 5 * HZ);
-		if (ret <= 0 && (msmfb->frame_requested != msmfb->frame_done ||
-				 msmfb->sleeping == UPDATING)) {
-			if (retry && panel->request_vsync &&
-			    (sleeping == AWAKE)) {
-				panel->request_vsync(panel,
-					&msmfb->vsync_callback);
-				retry = 0;
-				printk(KERN_WARNING "msmfb_pan_display timeout "
-					"rerequest vsync\n");
-			} else {
-				printk(KERN_WARNING "msmfb_pan_display timeout "
-					"waiting for frame start, %d %d\n",
-					msmfb->frame_requested,
-					msmfb->frame_done);
-				return;
-			}
-		}
-		goto restart;
-	}
-
-
-	msmfb->frame_requested++;
-	/* if necessary, update the y offset, if this is the
-	 * first full update on resume, set the sleeping state */
-	if (pan_display) {
-		msmfb->yoffset = yoffset;
-		if (left == 0 && top == 0 && eright == info->var.xres &&
-		    ebottom == info->var.yres) {
-			if (sleeping == WAKING) {
-				msmfb->update_frame = msmfb->frame_requested;
-				DLOG(SUSPEND_RESUME, "full update starting\n");
-				msmfb->sleeping = UPDATING;
-			}
-		}
-	}
-
-	/* set the update request */
-	if (left < msmfb->update_info.left)
-		msmfb->update_info.left = left;
-	if (top < msmfb->update_info.top)
-		msmfb->update_info.top = top;
-	if (eright > msmfb->update_info.eright)
-		msmfb->update_info.eright = eright;
-	if (ebottom > msmfb->update_info.ebottom)
-		msmfb->update_info.ebottom = ebottom;
-	DLOG(SHOW_UPDATES, "update queued %d %d %d %d %d\n",
-		msmfb->update_info.left, msmfb->update_info.top,
-		msmfb->update_info.eright, msmfb->update_info.ebottom,
-		msmfb->yoffset);
-	spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-
-	/* if the panel is all the way on wait for vsync, otherwise sleep
-	 * for 16 ms (long enough for the dma to panel) and then begin dma */
-	msmfb->vsync_request_time = ktime_get();
-	if (panel->request_vsync && (sleeping == AWAKE)) {
-		panel->request_vsync(panel, &msmfb->vsync_callback);
-	} else {
-		if (!hrtimer_active(&msmfb->fake_vsync)) {
-			hrtimer_start(&msmfb->fake_vsync,
-				      ktime_set(0, NSEC_PER_SEC/60),
-				      HRTIMER_MODE_REL);
-		}
-	}
-}
-
-static void msmfb_update(struct fb_info *info, uint32_t left, uint32_t top,
-			 uint32_t eright, uint32_t ebottom)
-{
-	msmfb_pan_update(info, left, top, eright, ebottom, 0, 0);
-}
-
-static void power_on_panel(struct work_struct *work)
-{
-	struct msmfb_info *msmfb =
-		container_of(work, struct msmfb_info, resume_work);
-	struct msm_panel_data *panel = msmfb->panel;
-	unsigned long irq_flags;
-
-	mutex_lock(&msmfb->panel_init_lock);
-	DLOG(SUSPEND_RESUME, "turning on panel\n");
-	if (msmfb->sleeping == UPDATING) {
-		if (panel->unblank(panel)) {
-			printk(KERN_INFO "msmfb: panel unblank failed,"
-			       "not starting drawing\n");
-			goto error;
-		}
-		spin_lock_irqsave(&msmfb->update_lock, irq_flags);
-		msmfb->sleeping = AWAKE;
-		wake_up(&msmfb->frame_wq);
-		spin_unlock_irqrestore(&msmfb->update_lock, irq_flags);
-	}
-error:
-	mutex_unlock(&msmfb->panel_init_lock);
-}
-
-
-static int msmfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
-{
-	if ((var->xres != info->var.xres) ||
-	    (var->yres != info->var.yres) ||
-	    (var->xres_virtual != info->var.xres_virtual) ||
-	    (var->yres_virtual != info->var.yres_virtual) ||
-	    (var->xoffset != info->var.xoffset) ||
-	    (var->bits_per_pixel != info->var.bits_per_pixel) ||
-	    (var->grayscale != info->var.grayscale))
-		 return -EINVAL;
-	return 0;
-}
-
-int msmfb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info)
-{
-	struct msmfb_info *msmfb = info->par;
-	struct msm_panel_data *panel = msmfb->panel;
-
-	/* "UPDT" */
-	if ((panel->caps & MSMFB_CAP_PARTIAL_UPDATES) &&
-	    (var->reserved[0] == 0x54445055)) {
-		msmfb_pan_update(info, var->reserved[1] & 0xffff,
-				 var->reserved[1] >> 16,
-				 var->reserved[2] & 0xffff,
-				 var->reserved[2] >> 16, var->yoffset, 1);
-	} else {
-		msmfb_pan_update(info, 0, 0, info->var.xres, info->var.yres,
-				 var->yoffset, 1);
-	}
-	return 0;
-}
-
-static void msmfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
-{
-	cfb_fillrect(p, rect);
-	msmfb_update(p, rect->dx, rect->dy, rect->dx + rect->width,
-		     rect->dy + rect->height);
-}
-
-static void msmfb_copyarea(struct fb_info *p, const struct fb_copyarea *area)
-{
-	cfb_copyarea(p, area);
-	msmfb_update(p, area->dx, area->dy, area->dx + area->width,
-		     area->dy + area->height);
-}
-
-static void msmfb_imageblit(struct fb_info *p, const struct fb_image *image)
-{
-	cfb_imageblit(p, image);
-	msmfb_update(p, image->dx, image->dy, image->dx + image->width,
-		     image->dy + image->height);
-}
-
-
-static int msmfb_blit(struct fb_info *info,
-		      void __user *p)
-{
-	struct mdp_blit_req req;
-	struct mdp_blit_req_list req_list;
-	int i;
-	int ret;
-
-	if (copy_from_user(&req_list, p, sizeof(req_list)))
-		return -EFAULT;
-
-	for (i = 0; i < req_list.count; i++) {
-		struct mdp_blit_req_list *list =
-			(struct mdp_blit_req_list *)p;
-		if (copy_from_user(&req, &list->req[i], sizeof(req)))
-			return -EFAULT;
-		ret = mdp->blit(mdp, info, &req);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-
-DEFINE_MUTEX(mdp_ppp_lock);
-
-static int msmfb_ioctl(struct fb_info *p, unsigned int cmd, unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	int ret;
-
-	switch (cmd) {
-	case MSMFB_GRP_DISP:
-		mdp->set_grp_disp(mdp, arg);
-		break;
-	case MSMFB_BLIT:
-		ret = msmfb_blit(p, argp);
-		if (ret)
-			return ret;
-		break;
-	default:
-			printk(KERN_INFO "msmfb unknown ioctl: %d\n", cmd);
-			return -EINVAL;
-	}
-	return 0;
-}
-
-static struct fb_ops msmfb_ops = {
-	.owner = THIS_MODULE,
-	.fb_open = msmfb_open,
-	.fb_release = msmfb_release,
-	.fb_check_var = msmfb_check_var,
-	.fb_pan_display = msmfb_pan_display,
-	.fb_fillrect = msmfb_fillrect,
-	.fb_copyarea = msmfb_copyarea,
-	.fb_imageblit = msmfb_imageblit,
-	.fb_ioctl = msmfb_ioctl,
-};
-
-static unsigned PP[16];
-
-
-
-#define BITS_PER_PIXEL 16
-
-static void setup_fb_info(struct msmfb_info *msmfb)
-{
-	struct fb_info *fb_info = msmfb->fb;
-	int r;
-
-	/* finish setting up the fb_info struct */
-	strncpy(fb_info->fix.id, "msmfb", 16);
-	fb_info->fix.ypanstep = 1;
-
-	fb_info->fbops = &msmfb_ops;
-	fb_info->flags = FBINFO_DEFAULT;
-
-	fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
-	fb_info->fix.visual = FB_VISUAL_TRUECOLOR;
-	fb_info->fix.line_length = msmfb->xres * 2;
-
-	fb_info->var.xres = msmfb->xres;
-	fb_info->var.yres = msmfb->yres;
-	fb_info->var.width = msmfb->panel->fb_data->width;
-	fb_info->var.height = msmfb->panel->fb_data->height;
-	fb_info->var.xres_virtual = msmfb->xres;
-	fb_info->var.yres_virtual = msmfb->yres * 2;
-	fb_info->var.bits_per_pixel = BITS_PER_PIXEL;
-	fb_info->var.accel_flags = 0;
-
-	fb_info->var.yoffset = 0;
-
-	if (msmfb->panel->caps & MSMFB_CAP_PARTIAL_UPDATES) {
-		/*
-		 * Set the param in the fixed screen, so userspace can't
-		 * change it. This will be used to check for the
-		 * capability.
-		 */
-		fb_info->fix.reserved[0] = 0x5444;
-		fb_info->fix.reserved[1] = 0x5055;
-
-		/*
-		 * This preloads the value so that if userspace doesn't
-		 * change it, it will be a full update
-		 */
-		fb_info->var.reserved[0] = 0x54445055;
-		fb_info->var.reserved[1] = 0;
-		fb_info->var.reserved[2] = (uint16_t)msmfb->xres |
-					   ((uint32_t)msmfb->yres << 16);
-	}
-
-	fb_info->var.red.offset = 11;
-	fb_info->var.red.length = 5;
-	fb_info->var.red.msb_right = 0;
-	fb_info->var.green.offset = 5;
-	fb_info->var.green.length = 6;
-	fb_info->var.green.msb_right = 0;
-	fb_info->var.blue.offset = 0;
-	fb_info->var.blue.length = 5;
-	fb_info->var.blue.msb_right = 0;
-
-	r = fb_alloc_cmap(&fb_info->cmap, 16, 0);
-	fb_info->pseudo_palette = PP;
-
-	PP[0] = 0;
-	for (r = 1; r < 16; r++)
-		PP[r] = 0xffffffff;
-}
-
-static int setup_fbmem(struct msmfb_info *msmfb, struct platform_device *pdev)
-{
-	struct fb_info *fb = msmfb->fb;
-	struct resource *resource;
-	unsigned long size = msmfb->xres * msmfb->yres *
-			     (BITS_PER_PIXEL >> 3) * 2;
-	unsigned char *fbram;
-
-	/* board file might have attached a resource describing an fb */
-	resource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!resource)
-		return -EINVAL;
-
-	/* check the resource is large enough to fit the fb */
-	if (resource->end - resource->start < size) {
-		printk(KERN_ERR "allocated resource is too small for "
-				"fb\n");
-		return -ENOMEM;
-	}
-	fb->fix.smem_start = resource->start;
-	fb->fix.smem_len = resource_size(resource);
-	fbram = ioremap(resource->start, resource_size(resource));
-	if (fbram == NULL) {
-		printk(KERN_ERR "msmfb: cannot allocate fbram!\n");
-		return -ENOMEM;
-	}
-	fb->screen_base = fbram;
-	return 0;
-}
-
-static int msmfb_probe(struct platform_device *pdev)
-{
-	struct fb_info *fb;
-	struct msmfb_info *msmfb;
-	struct msm_panel_data *panel = pdev->dev.platform_data;
-	int ret;
-
-	if (!panel) {
-		pr_err("msmfb_probe: no platform data\n");
-		return -EINVAL;
-	}
-	if (!panel->fb_data) {
-		pr_err("msmfb_probe: no fb_data\n");
-		return -EINVAL;
-	}
-
-	fb = framebuffer_alloc(sizeof(struct msmfb_info), &pdev->dev);
-	if (!fb)
-		return -ENOMEM;
-	msmfb = fb->par;
-	msmfb->fb = fb;
-	msmfb->panel = panel;
-	msmfb->xres = panel->fb_data->xres;
-	msmfb->yres = panel->fb_data->yres;
-
-	ret = setup_fbmem(msmfb, pdev);
-	if (ret)
-		goto error_setup_fbmem;
-
-	setup_fb_info(msmfb);
-
-	spin_lock_init(&msmfb->update_lock);
-	mutex_init(&msmfb->panel_init_lock);
-	init_waitqueue_head(&msmfb->frame_wq);
-	INIT_WORK(&msmfb->resume_work, power_on_panel);
-	msmfb->black = devm_kzalloc(&pdev->dev,
-				    msmfb->fb->var.bits_per_pixel*msmfb->xres,
-				    GFP_KERNEL);
-	if (!msmfb->black) {
-		ret = -ENOMEM;
-		goto error_register_framebuffer;
-	}
-
-	printk(KERN_INFO "msmfb_probe() installing %d x %d panel\n",
-	       msmfb->xres, msmfb->yres);
-
-	msmfb->dma_callback.func = msmfb_handle_dma_interrupt;
-	msmfb->vsync_callback.func = msmfb_handle_vsync_interrupt;
-	hrtimer_init(&msmfb->fake_vsync, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_REL);
-
-
-	msmfb->fake_vsync.function = msmfb_fake_vsync;
-
-	ret = register_framebuffer(fb);
-	if (ret)
-		goto error_register_framebuffer;
-
-	msmfb->sleeping = WAKING;
-
-	platform_set_drvdata(pdev, msmfb);
-
-	return 0;
-
-error_register_framebuffer:
-	iounmap(fb->screen_base);
-error_setup_fbmem:
-	framebuffer_release(msmfb->fb);
-	return ret;
-}
-
-static int msmfb_remove(struct platform_device *pdev)
-{
-	struct msmfb_info *msmfb;
-
-	msmfb = platform_get_drvdata(pdev);
-
-	unregister_framebuffer(msmfb->fb);
-	iounmap(msmfb->fb->screen_base);
-	framebuffer_release(msmfb->fb);
-
-	return 0;
-}
-
-static struct platform_driver msm_panel_driver = {
-	/* need to write remove */
-	.probe = msmfb_probe,
-	.remove = msmfb_remove,
-	.driver = {.name = "msm_panel"},
-};
-
-
-static int msmfb_add_mdp_device(struct device *dev,
-				struct class_interface *class_intf)
-{
-	/* might need locking if mulitple mdp devices */
-	if (mdp)
-		return 0;
-	mdp = container_of(dev, struct mdp_device, dev);
-	return platform_driver_register(&msm_panel_driver);
-}
-
-static void msmfb_remove_mdp_device(struct device *dev,
-				struct class_interface *class_intf)
-{
-	/* might need locking if mulitple mdp devices */
-	if (dev != &mdp->dev)
-		return;
-	platform_driver_unregister(&msm_panel_driver);
-	mdp = NULL;
-}
-
-static struct class_interface msm_fb_interface = {
-	.add_dev = &msmfb_add_mdp_device,
-	.remove_dev = &msmfb_remove_mdp_device,
-};
-
-static int __init msmfb_init(void)
-{
-	return register_mdp_client(&msm_fb_interface);
-}
-
-module_init(msmfb_init);
diff --git a/include/linux/platform_data/video-msm_fb.h b/include/linux/platform_data/video-msm_fb.h
deleted file mode 100644
index 31449be3eadb..000000000000
--- a/include/linux/platform_data/video-msm_fb.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Internal shared definitions for various MSM framebuffer parts.
- *
- * Copyright (C) 2007 Google Incorporated
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#ifndef _MSM_FB_H_
-#define _MSM_FB_H_
-
-#include <linux/device.h>
-
-struct mddi_info;
-
-struct msm_fb_data {
-	int xres;	/* x resolution in pixels */
-	int yres;	/* y resolution in pixels */
-	int width;	/* disply width in mm */
-	int height;	/* display height in mm */
-	unsigned output_format;
-};
-
-struct msmfb_callback {
-	void (*func)(struct msmfb_callback *);
-};
-
-enum {
-	MSM_MDDI_PMDH_INTERFACE,
-	MSM_MDDI_EMDH_INTERFACE,
-	MSM_EBI2_INTERFACE,
-};
-
-#define MSMFB_CAP_PARTIAL_UPDATES	(1 << 0)
-
-struct msm_panel_data {
-	/* turns off the fb memory */
-	int (*suspend)(struct msm_panel_data *);
-	/* turns on the fb memory */
-	int (*resume)(struct msm_panel_data *);
-	/* turns off the panel */
-	int (*blank)(struct msm_panel_data *);
-	/* turns on the panel */
-	int (*unblank)(struct msm_panel_data *);
-	void (*wait_vsync)(struct msm_panel_data *);
-	void (*request_vsync)(struct msm_panel_data *, struct msmfb_callback *);
-	void (*clear_vsync)(struct msm_panel_data *);
-	/* from the enum above */
-	unsigned interface_type;
-	/* data to be passed to the fb driver */
-	struct msm_fb_data *fb_data;
-
-	/* capabilities supported by the panel */
-	uint32_t caps;
-};
-
-struct msm_mddi_client_data {
-	void (*suspend)(struct msm_mddi_client_data *);
-	void (*resume)(struct msm_mddi_client_data *);
-	void (*activate_link)(struct msm_mddi_client_data *);
-	void (*remote_write)(struct msm_mddi_client_data *, uint32_t val,
-			     uint32_t reg);
-	uint32_t (*remote_read)(struct msm_mddi_client_data *, uint32_t reg);
-	void (*auto_hibernate)(struct msm_mddi_client_data *, int);
-	/* custom data that needs to be passed from the board file to a 
-	 * particular client */
-	void *private_client_data;
-	struct resource *fb_resource;
-	/* from the list above */
-	unsigned interface_type;
-};
-
-struct msm_mddi_platform_data {
-	unsigned int clk_rate;
-	void (*power_client)(struct msm_mddi_client_data *, int on);
-
-	/* fixup the mfr name, product id */
-	void (*fixup)(uint16_t *mfr_name, uint16_t *product_id);
-
-	struct resource *fb_resource; /*optional*/
-	/* number of clients in the list that follows */
-	int num_clients;
-	/* array of client information of clients */
-	struct {
-		unsigned product_id; /* mfr id in top 16 bits, product id
-				      * in lower 16 bits
-				      */
-		char *name;	/* the device name will be the platform
-				 * device name registered for the client,
-				 * it should match the name of the associated
-				 * driver
-				 */
-		unsigned id;	/* id for mddi client device node, will also
-				 * be used as device id of panel devices, if
-				 * the client device will have multiple panels
-				 * space must be left here for them
-				 */
-		void *client_data;	/* required private client data */
-		unsigned int clk_rate;	/* optional: if the client requires a
-					* different mddi clk rate
-					*/
-	} client_platform_data[];
-};
-
-struct mdp_blit_req;
-struct fb_info;
-struct mdp_device {
-	struct device dev;
-	void (*dma)(struct mdp_device *mpd, uint32_t addr,
-		    uint32_t stride, uint32_t w, uint32_t h, uint32_t x,
-		    uint32_t y, struct msmfb_callback *callback, int interface);
-	void (*dma_wait)(struct mdp_device *mdp);
-	int (*blit)(struct mdp_device *mdp, struct fb_info *fb,
-		    struct mdp_blit_req *req);
-	void (*set_grp_disp)(struct mdp_device *mdp, uint32_t disp_id);
-};
-
-struct class_interface;
-int register_mdp_client(struct class_interface *class_intf);
-
-/**** private client data structs go below this line ***/
-
-struct msm_mddi_bridge_platform_data {
-	/* from board file */
-	int (*init)(struct msm_mddi_bridge_platform_data *,
-		    struct msm_mddi_client_data *);
-	int (*uninit)(struct msm_mddi_bridge_platform_data *,
-		      struct msm_mddi_client_data *);
-	/* passed to panel for use by the fb driver */
-	int (*blank)(struct msm_mddi_bridge_platform_data *,
-		     struct msm_mddi_client_data *);
-	int (*unblank)(struct msm_mddi_bridge_platform_data *,
-		       struct msm_mddi_client_data *);
-	struct msm_fb_data fb_data;
-};
-
-
-
-#endif
-- 
cgit v1.2.3


From 208489032bdd8d4a7de50f3057c175058f271956 Mon Sep 17 00:00:00 2001
From: Chaotian Jing <chaotian.jing@mediatek.com>
Date: Mon, 15 Jun 2015 19:20:48 +0800
Subject: mmc: mediatek: Add Mediatek MMC driver

Add Mediatek MMC driver code
Support eMMC/SD/SDIO

Signed-off-by: Chaotian Jing <chaotian.jing@mediatek.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/Kconfig  |    8 +
 drivers/mmc/host/Makefile |    1 +
 drivers/mmc/host/mtk-sd.c | 1379 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mmc/core.h  |    1 +
 4 files changed, 1389 insertions(+)
 create mode 100644 drivers/mmc/host/mtk-sd.c

(limited to 'include/linux')

diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index d5e107b78d60..fd9a58e216a5 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -776,3 +776,11 @@ config MMC_TOSHIBA_PCI
 	tristate "Toshiba Type A SD/MMC Card Interface Driver"
 	depends on PCI
 	help
+
+config MMC_MTK
+	tristate "MediaTek SD/MMC Card Interface support"
+	help
+	  This selects the MediaTek(R) Secure digital and Multimedia card Interface.
+	  If you have a machine with a integrated SD/MMC card reader, say Y or M here.
+	  This is needed if support for any SD/SDIO/MMC devices is required.
+	  If unsure, say N.
diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile
index e3ab5b968651..e928d61c5f4b 100644
--- a/drivers/mmc/host/Makefile
+++ b/drivers/mmc/host/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_MMC_SDHCI_F_SDH30)	+= sdhci_f_sdh30.o
 obj-$(CONFIG_MMC_SDHCI_SPEAR)	+= sdhci-spear.o
 obj-$(CONFIG_MMC_WBSD)		+= wbsd.o
 obj-$(CONFIG_MMC_AU1X)		+= au1xmmc.o
+obj-$(CONFIG_MMC_MTK)		+= mtk-sd.o
 obj-$(CONFIG_MMC_OMAP)		+= omap.o
 obj-$(CONFIG_MMC_OMAP_HS)	+= omap_hsmmc.o
 obj-$(CONFIG_MMC_ATMELMCI)	+= atmel-mci.o
diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c
new file mode 100644
index 000000000000..fd965cb81133
--- /dev/null
+++ b/drivers/mmc/host/mtk-sd.c
@@ -0,0 +1,1379 @@
+/*
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ * Author: Chaotian.Jing <chaotian.jing@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/ioport.h>
+#include <linux/irq.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/of_gpio.h>
+#include <linux/pinctrl/consumer.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/spinlock.h>
+
+#include <linux/mmc/card.h>
+#include <linux/mmc/core.h>
+#include <linux/mmc/host.h>
+#include <linux/mmc/mmc.h>
+#include <linux/mmc/sd.h>
+#include <linux/mmc/sdio.h>
+
+#define MAX_BD_NUM          1024
+
+/*--------------------------------------------------------------------------*/
+/* Common Definition                                                        */
+/*--------------------------------------------------------------------------*/
+#define MSDC_BUS_1BITS          0x0
+#define MSDC_BUS_4BITS          0x1
+#define MSDC_BUS_8BITS          0x2
+
+#define MSDC_BURST_64B          0x6
+
+/*--------------------------------------------------------------------------*/
+/* Register Offset                                                          */
+/*--------------------------------------------------------------------------*/
+#define MSDC_CFG         0x0
+#define MSDC_IOCON       0x04
+#define MSDC_PS          0x08
+#define MSDC_INT         0x0c
+#define MSDC_INTEN       0x10
+#define MSDC_FIFOCS      0x14
+#define SDC_CFG          0x30
+#define SDC_CMD          0x34
+#define SDC_ARG          0x38
+#define SDC_STS          0x3c
+#define SDC_RESP0        0x40
+#define SDC_RESP1        0x44
+#define SDC_RESP2        0x48
+#define SDC_RESP3        0x4c
+#define SDC_BLK_NUM      0x50
+#define SDC_ACMD_RESP    0x80
+#define MSDC_DMA_SA      0x90
+#define MSDC_DMA_CTRL    0x98
+#define MSDC_DMA_CFG     0x9c
+#define MSDC_PATCH_BIT   0xb0
+#define MSDC_PATCH_BIT1  0xb4
+#define MSDC_PAD_TUNE    0xec
+
+/*--------------------------------------------------------------------------*/
+/* Register Mask                                                            */
+/*--------------------------------------------------------------------------*/
+
+/* MSDC_CFG mask */
+#define MSDC_CFG_MODE           (0x1 << 0)	/* RW */
+#define MSDC_CFG_CKPDN          (0x1 << 1)	/* RW */
+#define MSDC_CFG_RST            (0x1 << 2)	/* RW */
+#define MSDC_CFG_PIO            (0x1 << 3)	/* RW */
+#define MSDC_CFG_CKDRVEN        (0x1 << 4)	/* RW */
+#define MSDC_CFG_BV18SDT        (0x1 << 5)	/* RW */
+#define MSDC_CFG_BV18PSS        (0x1 << 6)	/* R  */
+#define MSDC_CFG_CKSTB          (0x1 << 7)	/* R  */
+#define MSDC_CFG_CKDIV          (0xff << 8)	/* RW */
+#define MSDC_CFG_CKMOD          (0x3 << 16)	/* RW */
+
+/* MSDC_IOCON mask */
+#define MSDC_IOCON_SDR104CKS    (0x1 << 0)	/* RW */
+#define MSDC_IOCON_RSPL         (0x1 << 1)	/* RW */
+#define MSDC_IOCON_DSPL         (0x1 << 2)	/* RW */
+#define MSDC_IOCON_DDLSEL       (0x1 << 3)	/* RW */
+#define MSDC_IOCON_DDR50CKD     (0x1 << 4)	/* RW */
+#define MSDC_IOCON_DSPLSEL      (0x1 << 5)	/* RW */
+#define MSDC_IOCON_W_DSPL       (0x1 << 8)	/* RW */
+#define MSDC_IOCON_D0SPL        (0x1 << 16)	/* RW */
+#define MSDC_IOCON_D1SPL        (0x1 << 17)	/* RW */
+#define MSDC_IOCON_D2SPL        (0x1 << 18)	/* RW */
+#define MSDC_IOCON_D3SPL        (0x1 << 19)	/* RW */
+#define MSDC_IOCON_D4SPL        (0x1 << 20)	/* RW */
+#define MSDC_IOCON_D5SPL        (0x1 << 21)	/* RW */
+#define MSDC_IOCON_D6SPL        (0x1 << 22)	/* RW */
+#define MSDC_IOCON_D7SPL        (0x1 << 23)	/* RW */
+#define MSDC_IOCON_RISCSZ       (0x3 << 24)	/* RW */
+
+/* MSDC_PS mask */
+#define MSDC_PS_CDEN            (0x1 << 0)	/* RW */
+#define MSDC_PS_CDSTS           (0x1 << 1)	/* R  */
+#define MSDC_PS_CDDEBOUNCE      (0xf << 12)	/* RW */
+#define MSDC_PS_DAT             (0xff << 16)	/* R  */
+#define MSDC_PS_CMD             (0x1 << 24)	/* R  */
+#define MSDC_PS_WP              (0x1 << 31)	/* R  */
+
+/* MSDC_INT mask */
+#define MSDC_INT_MMCIRQ         (0x1 << 0)	/* W1C */
+#define MSDC_INT_CDSC           (0x1 << 1)	/* W1C */
+#define MSDC_INT_ACMDRDY        (0x1 << 3)	/* W1C */
+#define MSDC_INT_ACMDTMO        (0x1 << 4)	/* W1C */
+#define MSDC_INT_ACMDCRCERR     (0x1 << 5)	/* W1C */
+#define MSDC_INT_DMAQ_EMPTY     (0x1 << 6)	/* W1C */
+#define MSDC_INT_SDIOIRQ        (0x1 << 7)	/* W1C */
+#define MSDC_INT_CMDRDY         (0x1 << 8)	/* W1C */
+#define MSDC_INT_CMDTMO         (0x1 << 9)	/* W1C */
+#define MSDC_INT_RSPCRCERR      (0x1 << 10)	/* W1C */
+#define MSDC_INT_CSTA           (0x1 << 11)	/* R */
+#define MSDC_INT_XFER_COMPL     (0x1 << 12)	/* W1C */
+#define MSDC_INT_DXFER_DONE     (0x1 << 13)	/* W1C */
+#define MSDC_INT_DATTMO         (0x1 << 14)	/* W1C */
+#define MSDC_INT_DATCRCERR      (0x1 << 15)	/* W1C */
+#define MSDC_INT_ACMD19_DONE    (0x1 << 16)	/* W1C */
+#define MSDC_INT_DMA_BDCSERR    (0x1 << 17)	/* W1C */
+#define MSDC_INT_DMA_GPDCSERR   (0x1 << 18)	/* W1C */
+#define MSDC_INT_DMA_PROTECT    (0x1 << 19)	/* W1C */
+
+/* MSDC_INTEN mask */
+#define MSDC_INTEN_MMCIRQ       (0x1 << 0)	/* RW */
+#define MSDC_INTEN_CDSC         (0x1 << 1)	/* RW */
+#define MSDC_INTEN_ACMDRDY      (0x1 << 3)	/* RW */
+#define MSDC_INTEN_ACMDTMO      (0x1 << 4)	/* RW */
+#define MSDC_INTEN_ACMDCRCERR   (0x1 << 5)	/* RW */
+#define MSDC_INTEN_DMAQ_EMPTY   (0x1 << 6)	/* RW */
+#define MSDC_INTEN_SDIOIRQ      (0x1 << 7)	/* RW */
+#define MSDC_INTEN_CMDRDY       (0x1 << 8)	/* RW */
+#define MSDC_INTEN_CMDTMO       (0x1 << 9)	/* RW */
+#define MSDC_INTEN_RSPCRCERR    (0x1 << 10)	/* RW */
+#define MSDC_INTEN_CSTA         (0x1 << 11)	/* RW */
+#define MSDC_INTEN_XFER_COMPL   (0x1 << 12)	/* RW */
+#define MSDC_INTEN_DXFER_DONE   (0x1 << 13)	/* RW */
+#define MSDC_INTEN_DATTMO       (0x1 << 14)	/* RW */
+#define MSDC_INTEN_DATCRCERR    (0x1 << 15)	/* RW */
+#define MSDC_INTEN_ACMD19_DONE  (0x1 << 16)	/* RW */
+#define MSDC_INTEN_DMA_BDCSERR  (0x1 << 17)	/* RW */
+#define MSDC_INTEN_DMA_GPDCSERR (0x1 << 18)	/* RW */
+#define MSDC_INTEN_DMA_PROTECT  (0x1 << 19)	/* RW */
+
+/* MSDC_FIFOCS mask */
+#define MSDC_FIFOCS_RXCNT       (0xff << 0)	/* R */
+#define MSDC_FIFOCS_TXCNT       (0xff << 16)	/* R */
+#define MSDC_FIFOCS_CLR         (0x1 << 31)	/* RW */
+
+/* SDC_CFG mask */
+#define SDC_CFG_SDIOINTWKUP     (0x1 << 0)	/* RW */
+#define SDC_CFG_INSWKUP         (0x1 << 1)	/* RW */
+#define SDC_CFG_BUSWIDTH        (0x3 << 16)	/* RW */
+#define SDC_CFG_SDIO            (0x1 << 19)	/* RW */
+#define SDC_CFG_SDIOIDE         (0x1 << 20)	/* RW */
+#define SDC_CFG_INTATGAP        (0x1 << 21)	/* RW */
+#define SDC_CFG_DTOC            (0xff << 24)	/* RW */
+
+/* SDC_STS mask */
+#define SDC_STS_SDCBUSY         (0x1 << 0)	/* RW */
+#define SDC_STS_CMDBUSY         (0x1 << 1)	/* RW */
+#define SDC_STS_SWR_COMPL       (0x1 << 31)	/* RW */
+
+/* MSDC_DMA_CTRL mask */
+#define MSDC_DMA_CTRL_START     (0x1 << 0)	/* W */
+#define MSDC_DMA_CTRL_STOP      (0x1 << 1)	/* W */
+#define MSDC_DMA_CTRL_RESUME    (0x1 << 2)	/* W */
+#define MSDC_DMA_CTRL_MODE      (0x1 << 8)	/* RW */
+#define MSDC_DMA_CTRL_LASTBUF   (0x1 << 10)	/* RW */
+#define MSDC_DMA_CTRL_BRUSTSZ   (0x7 << 12)	/* RW */
+
+/* MSDC_DMA_CFG mask */
+#define MSDC_DMA_CFG_STS        (0x1 << 0)	/* R */
+#define MSDC_DMA_CFG_DECSEN     (0x1 << 1)	/* RW */
+#define MSDC_DMA_CFG_AHBHPROT2  (0x2 << 8)	/* RW */
+#define MSDC_DMA_CFG_ACTIVEEN   (0x2 << 12)	/* RW */
+#define MSDC_DMA_CFG_CS12B16B   (0x1 << 16)	/* RW */
+
+/* MSDC_PATCH_BIT mask */
+#define MSDC_PATCH_BIT_ODDSUPP    (0x1 <<  1)	/* RW */
+#define MSDC_INT_DAT_LATCH_CK_SEL (0x7 <<  7)
+#define MSDC_CKGEN_MSDC_DLY_SEL   (0x1f << 10)
+#define MSDC_PATCH_BIT_IODSSEL    (0x1 << 16)	/* RW */
+#define MSDC_PATCH_BIT_IOINTSEL   (0x1 << 17)	/* RW */
+#define MSDC_PATCH_BIT_BUSYDLY    (0xf << 18)	/* RW */
+#define MSDC_PATCH_BIT_WDOD       (0xf << 22)	/* RW */
+#define MSDC_PATCH_BIT_IDRTSEL    (0x1 << 26)	/* RW */
+#define MSDC_PATCH_BIT_CMDFSEL    (0x1 << 27)	/* RW */
+#define MSDC_PATCH_BIT_INTDLSEL   (0x1 << 28)	/* RW */
+#define MSDC_PATCH_BIT_SPCPUSH    (0x1 << 29)	/* RW */
+#define MSDC_PATCH_BIT_DECRCTMO   (0x1 << 30)	/* RW */
+
+#define REQ_CMD_EIO  (0x1 << 0)
+#define REQ_CMD_TMO  (0x1 << 1)
+#define REQ_DAT_ERR  (0x1 << 2)
+#define REQ_STOP_EIO (0x1 << 3)
+#define REQ_STOP_TMO (0x1 << 4)
+#define REQ_CMD_BUSY (0x1 << 5)
+
+#define MSDC_PREPARE_FLAG (0x1 << 0)
+#define MSDC_ASYNC_FLAG (0x1 << 1)
+#define MSDC_MMAP_FLAG (0x1 << 2)
+
+#define CMD_TIMEOUT         (HZ/10 * 5)	/* 100ms x5 */
+#define DAT_TIMEOUT         (HZ    * 5)	/* 1000ms x5 */
+
+/*--------------------------------------------------------------------------*/
+/* Descriptor Structure                                                     */
+/*--------------------------------------------------------------------------*/
+struct mt_gpdma_desc {
+	u32 gpd_info;
+#define GPDMA_DESC_HWO		(0x1 << 0)
+#define GPDMA_DESC_BDP		(0x1 << 1)
+#define GPDMA_DESC_CHECKSUM	(0xff << 8) /* bit8 ~ bit15 */
+#define GPDMA_DESC_INT		(0x1 << 16)
+	u32 next;
+	u32 ptr;
+	u32 gpd_data_len;
+#define GPDMA_DESC_BUFLEN	(0xffff) /* bit0 ~ bit15 */
+#define GPDMA_DESC_EXTLEN	(0xff << 16) /* bit16 ~ bit23 */
+	u32 arg;
+	u32 blknum;
+	u32 cmd;
+};
+
+struct mt_bdma_desc {
+	u32 bd_info;
+#define BDMA_DESC_EOL		(0x1 << 0)
+#define BDMA_DESC_CHECKSUM	(0xff << 8) /* bit8 ~ bit15 */
+#define BDMA_DESC_BLKPAD	(0x1 << 17)
+#define BDMA_DESC_DWPAD		(0x1 << 18)
+	u32 next;
+	u32 ptr;
+	u32 bd_data_len;
+#define BDMA_DESC_BUFLEN	(0xffff) /* bit0 ~ bit15 */
+};
+
+struct msdc_dma {
+	struct scatterlist *sg;	/* I/O scatter list */
+	struct mt_gpdma_desc *gpd;		/* pointer to gpd array */
+	struct mt_bdma_desc *bd;		/* pointer to bd array */
+	dma_addr_t gpd_addr;	/* the physical address of gpd array */
+	dma_addr_t bd_addr;	/* the physical address of bd array */
+};
+
+struct msdc_host {
+	struct device *dev;
+	struct mmc_host *mmc;	/* mmc structure */
+	int cmd_rsp;
+
+	spinlock_t lock;
+	struct mmc_request *mrq;
+	struct mmc_command *cmd;
+	struct mmc_data *data;
+	int error;
+
+	void __iomem *base;		/* host base address */
+
+	struct msdc_dma dma;	/* dma channel */
+	u64 dma_mask;
+
+	u32 timeout_ns;		/* data timeout ns */
+	u32 timeout_clks;	/* data timeout clks */
+
+	struct pinctrl *pinctrl;
+	struct pinctrl_state *pins_default;
+	struct pinctrl_state *pins_uhs;
+	struct delayed_work req_timeout;
+	int irq;		/* host interrupt */
+
+	struct clk *src_clk;	/* msdc source clock */
+	struct clk *h_clk;      /* msdc h_clk */
+	u32 mclk;		/* mmc subsystem clock frequency */
+	u32 src_clk_freq;	/* source clock frequency */
+	u32 sclk;		/* SD/MS bus clock frequency */
+	bool ddr;
+	bool vqmmc_enabled;
+};
+
+static void sdr_set_bits(void __iomem *reg, u32 bs)
+{
+	u32 val = readl(reg);
+
+	val |= bs;
+	writel(val, reg);
+}
+
+static void sdr_clr_bits(void __iomem *reg, u32 bs)
+{
+	u32 val = readl(reg);
+
+	val &= ~bs;
+	writel(val, reg);
+}
+
+static void sdr_set_field(void __iomem *reg, u32 field, u32 val)
+{
+	unsigned int tv = readl(reg);
+
+	tv &= ~field;
+	tv |= ((val) << (ffs((unsigned int)field) - 1));
+	writel(tv, reg);
+}
+
+static void sdr_get_field(void __iomem *reg, u32 field, u32 *val)
+{
+	unsigned int tv = readl(reg);
+
+	*val = ((tv & field) >> (ffs((unsigned int)field) - 1));
+}
+
+static void msdc_reset_hw(struct msdc_host *host)
+{
+	u32 val;
+
+	sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_RST);
+	while (readl(host->base + MSDC_CFG) & MSDC_CFG_RST)
+		cpu_relax();
+
+	sdr_set_bits(host->base + MSDC_FIFOCS, MSDC_FIFOCS_CLR);
+	while (readl(host->base + MSDC_FIFOCS) & MSDC_FIFOCS_CLR)
+		cpu_relax();
+
+	val = readl(host->base + MSDC_INT);
+	writel(val, host->base + MSDC_INT);
+}
+
+static void msdc_cmd_next(struct msdc_host *host,
+		struct mmc_request *mrq, struct mmc_command *cmd);
+
+static u32 data_ints_mask = MSDC_INTEN_XFER_COMPL | MSDC_INTEN_DATTMO |
+			MSDC_INTEN_DATCRCERR | MSDC_INTEN_DMA_BDCSERR |
+			MSDC_INTEN_DMA_GPDCSERR | MSDC_INTEN_DMA_PROTECT;
+
+static u8 msdc_dma_calcs(u8 *buf, u32 len)
+{
+	u32 i, sum = 0;
+
+	for (i = 0; i < len; i++)
+		sum += buf[i];
+	return 0xff - (u8) sum;
+}
+
+static inline void msdc_dma_setup(struct msdc_host *host, struct msdc_dma *dma,
+		struct mmc_data *data)
+{
+	unsigned int j, dma_len;
+	dma_addr_t dma_address;
+	u32 dma_ctrl;
+	struct scatterlist *sg;
+	struct mt_gpdma_desc *gpd;
+	struct mt_bdma_desc *bd;
+
+	sg = data->sg;
+
+	gpd = dma->gpd;
+	bd = dma->bd;
+
+	/* modify gpd */
+	gpd->gpd_info |= GPDMA_DESC_HWO;
+	gpd->gpd_info |= GPDMA_DESC_BDP;
+	/* need to clear first. use these bits to calc checksum */
+	gpd->gpd_info &= ~GPDMA_DESC_CHECKSUM;
+	gpd->gpd_info |= msdc_dma_calcs((u8 *) gpd, 16) << 8;
+
+	/* modify bd */
+	for_each_sg(data->sg, sg, data->sg_count, j) {
+		dma_address = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+
+		/* init bd */
+		bd[j].bd_info &= ~BDMA_DESC_BLKPAD;
+		bd[j].bd_info &= ~BDMA_DESC_DWPAD;
+		bd[j].ptr = (u32)dma_address;
+		bd[j].bd_data_len &= ~BDMA_DESC_BUFLEN;
+		bd[j].bd_data_len |= (dma_len & BDMA_DESC_BUFLEN);
+
+		if (j == data->sg_count - 1) /* the last bd */
+			bd[j].bd_info |= BDMA_DESC_EOL;
+		else
+			bd[j].bd_info &= ~BDMA_DESC_EOL;
+
+		/* checksume need to clear first */
+		bd[j].bd_info &= ~BDMA_DESC_CHECKSUM;
+		bd[j].bd_info |= msdc_dma_calcs((u8 *)(&bd[j]), 16) << 8;
+	}
+
+	sdr_set_field(host->base + MSDC_DMA_CFG, MSDC_DMA_CFG_DECSEN, 1);
+	dma_ctrl = readl_relaxed(host->base + MSDC_DMA_CTRL);
+	dma_ctrl &= ~(MSDC_DMA_CTRL_BRUSTSZ | MSDC_DMA_CTRL_MODE);
+	dma_ctrl |= (MSDC_BURST_64B << 12 | 1 << 8);
+	writel_relaxed(dma_ctrl, host->base + MSDC_DMA_CTRL);
+	writel((u32)dma->gpd_addr, host->base + MSDC_DMA_SA);
+}
+
+static void msdc_prepare_data(struct msdc_host *host, struct mmc_request *mrq)
+{
+	struct mmc_data *data = mrq->data;
+
+	if (!(data->host_cookie & MSDC_PREPARE_FLAG)) {
+		bool read = (data->flags & MMC_DATA_READ) != 0;
+
+		data->host_cookie |= MSDC_PREPARE_FLAG;
+		data->sg_count = dma_map_sg(host->dev, data->sg, data->sg_len,
+					   read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	}
+}
+
+static void msdc_unprepare_data(struct msdc_host *host, struct mmc_request *mrq)
+{
+	struct mmc_data *data = mrq->data;
+
+	if (data->host_cookie & MSDC_ASYNC_FLAG)
+		return;
+
+	if (data->host_cookie & MSDC_PREPARE_FLAG) {
+		bool read = (data->flags & MMC_DATA_READ) != 0;
+
+		dma_unmap_sg(host->dev, data->sg, data->sg_len,
+			     read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+		data->host_cookie &= ~MSDC_PREPARE_FLAG;
+	}
+}
+
+/* clock control primitives */
+static void msdc_set_timeout(struct msdc_host *host, u32 ns, u32 clks)
+{
+	u32 timeout, clk_ns;
+	u32 mode = 0;
+
+	host->timeout_ns = ns;
+	host->timeout_clks = clks;
+	if (host->sclk == 0) {
+		timeout = 0;
+	} else {
+		clk_ns  = 1000000000UL / host->sclk;
+		timeout = (ns + clk_ns - 1) / clk_ns + clks;
+		/* in 1048576 sclk cycle unit */
+		timeout = (timeout + (0x1 << 20) - 1) >> 20;
+		sdr_get_field(host->base + MSDC_CFG, MSDC_CFG_CKMOD, &mode);
+		/*DDR mode will double the clk cycles for data timeout */
+		timeout = mode >= 2 ? timeout * 2 : timeout;
+		timeout = timeout > 1 ? timeout - 1 : 0;
+		timeout = timeout > 255 ? 255 : timeout;
+	}
+	sdr_set_field(host->base + SDC_CFG, SDC_CFG_DTOC, timeout);
+}
+
+static void msdc_gate_clock(struct msdc_host *host)
+{
+	clk_disable_unprepare(host->src_clk);
+	clk_disable_unprepare(host->h_clk);
+}
+
+static void msdc_ungate_clock(struct msdc_host *host)
+{
+	clk_prepare_enable(host->h_clk);
+	clk_prepare_enable(host->src_clk);
+	while (!(readl(host->base + MSDC_CFG) & MSDC_CFG_CKSTB))
+		cpu_relax();
+}
+
+static void msdc_set_mclk(struct msdc_host *host, int ddr, u32 hz)
+{
+	u32 mode;
+	u32 flags;
+	u32 div;
+	u32 sclk;
+
+	if (!hz) {
+		dev_dbg(host->dev, "set mclk to 0\n");
+		host->mclk = 0;
+		sdr_clr_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN);
+		return;
+	}
+
+	flags = readl(host->base + MSDC_INTEN);
+	sdr_clr_bits(host->base + MSDC_INTEN, flags);
+	if (ddr) { /* may need to modify later */
+		mode = 0x2; /* ddr mode and use divisor */
+		if (hz >= (host->src_clk_freq >> 2)) {
+			div = 0; /* mean div = 1/4 */
+			sclk = host->src_clk_freq >> 2; /* sclk = clk / 4 */
+		} else {
+			div = (host->src_clk_freq + ((hz << 2) - 1)) / (hz << 2);
+			sclk = (host->src_clk_freq >> 2) / div;
+			div = (div >> 1);
+		}
+	} else if (hz >= host->src_clk_freq) {
+		mode = 0x1; /* no divisor */
+		div = 0;
+		sclk = host->src_clk_freq;
+	} else {
+		mode = 0x0; /* use divisor */
+		if (hz >= (host->src_clk_freq >> 1)) {
+			div = 0; /* mean div = 1/2 */
+			sclk = host->src_clk_freq >> 1; /* sclk = clk / 2 */
+		} else {
+			div = (host->src_clk_freq + ((hz << 2) - 1)) / (hz << 2);
+			sclk = (host->src_clk_freq >> 2) / div;
+		}
+	}
+	sdr_set_field(host->base + MSDC_CFG, MSDC_CFG_CKMOD | MSDC_CFG_CKDIV,
+			(mode << 8) | (div % 0xff));
+	sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN);
+	while (!(readl(host->base + MSDC_CFG) & MSDC_CFG_CKSTB))
+		cpu_relax();
+	host->sclk = sclk;
+	host->mclk = hz;
+	host->ddr = ddr;
+	/* need because clk changed. */
+	msdc_set_timeout(host, host->timeout_ns, host->timeout_clks);
+	sdr_set_bits(host->base + MSDC_INTEN, flags);
+
+	dev_dbg(host->dev, "sclk: %d, ddr: %d\n", host->sclk, ddr);
+}
+
+static inline u32 msdc_cmd_find_resp(struct msdc_host *host,
+		struct mmc_request *mrq, struct mmc_command *cmd)
+{
+	u32 resp;
+
+	switch (mmc_resp_type(cmd)) {
+		/* Actually, R1, R5, R6, R7 are the same */
+	case MMC_RSP_R1:
+		resp = 0x1;
+		break;
+	case MMC_RSP_R1B:
+		resp = 0x7;
+		break;
+	case MMC_RSP_R2:
+		resp = 0x2;
+		break;
+	case MMC_RSP_R3:
+		resp = 0x3;
+		break;
+	case MMC_RSP_NONE:
+	default:
+		resp = 0x0;
+		break;
+	}
+
+	return resp;
+}
+
+static inline u32 msdc_cmd_prepare_raw_cmd(struct msdc_host *host,
+		struct mmc_request *mrq, struct mmc_command *cmd)
+{
+	/* rawcmd :
+	 * vol_swt << 30 | auto_cmd << 28 | blklen << 16 | go_irq << 15 |
+	 * stop << 14 | rw << 13 | dtype << 11 | rsptyp << 7 | brk << 6 | opcode
+	 */
+	u32 opcode = cmd->opcode;
+	u32 resp = msdc_cmd_find_resp(host, mrq, cmd);
+	u32 rawcmd = (opcode & 0x3f) | ((resp & 0x7) << 7);
+
+	host->cmd_rsp = resp;
+
+	if ((opcode == SD_IO_RW_DIRECT && cmd->flags == (unsigned int) -1) ||
+	    opcode == MMC_STOP_TRANSMISSION)
+		rawcmd |= (0x1 << 14);
+	else if (opcode == SD_SWITCH_VOLTAGE)
+		rawcmd |= (0x1 << 30);
+	else if (opcode == SD_APP_SEND_SCR ||
+		 opcode == SD_APP_SEND_NUM_WR_BLKS ||
+		 (opcode == SD_SWITCH && mmc_cmd_type(cmd) == MMC_CMD_ADTC) ||
+		 (opcode == SD_APP_SD_STATUS && mmc_cmd_type(cmd) == MMC_CMD_ADTC) ||
+		 (opcode == MMC_SEND_EXT_CSD && mmc_cmd_type(cmd) == MMC_CMD_ADTC))
+		rawcmd |= (0x1 << 11);
+
+	if (cmd->data) {
+		struct mmc_data *data = cmd->data;
+
+		if (mmc_op_multi(opcode)) {
+			if (mmc_card_mmc(host->mmc->card) && mrq->sbc &&
+			    !(mrq->sbc->arg & 0xFFFF0000))
+				rawcmd |= 0x2 << 28; /* AutoCMD23 */
+		}
+
+		rawcmd |= ((data->blksz & 0xFFF) << 16);
+		if (data->flags & MMC_DATA_WRITE)
+			rawcmd |= (0x1 << 13);
+		if (data->blocks > 1)
+			rawcmd |= (0x2 << 11);
+		else
+			rawcmd |= (0x1 << 11);
+		/* Always use dma mode */
+		sdr_clr_bits(host->base + MSDC_CFG, MSDC_CFG_PIO);
+
+		if (host->timeout_ns != data->timeout_ns ||
+		    host->timeout_clks != data->timeout_clks)
+			msdc_set_timeout(host, data->timeout_ns,
+					data->timeout_clks);
+
+		writel(data->blocks, host->base + SDC_BLK_NUM);
+	}
+	return rawcmd;
+}
+
+static void msdc_start_data(struct msdc_host *host, struct mmc_request *mrq,
+			    struct mmc_command *cmd, struct mmc_data *data)
+{
+	bool read;
+
+	WARN_ON(host->data);
+	host->data = data;
+	read = data->flags & MMC_DATA_READ;
+
+	mod_delayed_work(system_wq, &host->req_timeout, DAT_TIMEOUT);
+	msdc_dma_setup(host, &host->dma, data);
+	sdr_set_bits(host->base + MSDC_INTEN, data_ints_mask);
+	sdr_set_field(host->base + MSDC_DMA_CTRL, MSDC_DMA_CTRL_START, 1);
+	dev_dbg(host->dev, "DMA start\n");
+	dev_dbg(host->dev, "%s: cmd=%d DMA data: %d blocks; read=%d\n",
+			__func__, cmd->opcode, data->blocks, read);
+}
+
+static int msdc_auto_cmd_done(struct msdc_host *host, int events,
+		struct mmc_command *cmd)
+{
+	u32 *rsp = cmd->resp;
+
+	rsp[0] = readl(host->base + SDC_ACMD_RESP);
+
+	if (events & MSDC_INT_ACMDRDY) {
+		cmd->error = 0;
+	} else {
+		msdc_reset_hw(host);
+		if (events & MSDC_INT_ACMDCRCERR) {
+			cmd->error = -EILSEQ;
+			host->error |= REQ_STOP_EIO;
+		} else if (events & MSDC_INT_ACMDTMO) {
+			cmd->error = -ETIMEDOUT;
+			host->error |= REQ_STOP_TMO;
+		}
+		dev_err(host->dev,
+			"%s: AUTO_CMD%d arg=%08X; rsp %08X; cmd_error=%d\n",
+			__func__, cmd->opcode, cmd->arg, rsp[0], cmd->error);
+	}
+	return cmd->error;
+}
+
+static void msdc_track_cmd_data(struct msdc_host *host,
+				struct mmc_command *cmd, struct mmc_data *data)
+{
+	if (host->error)
+		dev_dbg(host->dev, "%s: cmd=%d arg=%08X; host->error=0x%08X\n",
+			__func__, cmd->opcode, cmd->arg, host->error);
+}
+
+static void msdc_request_done(struct msdc_host *host, struct mmc_request *mrq)
+{
+	unsigned long flags;
+	bool ret;
+
+	ret = cancel_delayed_work(&host->req_timeout);
+	if (!ret) {
+		/* delay work already running */
+		return;
+	}
+	spin_lock_irqsave(&host->lock, flags);
+	host->mrq = NULL;
+	spin_unlock_irqrestore(&host->lock, flags);
+
+	msdc_track_cmd_data(host, mrq->cmd, mrq->data);
+	if (mrq->data)
+		msdc_unprepare_data(host, mrq);
+	mmc_request_done(host->mmc, mrq);
+}
+
+/* returns true if command is fully handled; returns false otherwise */
+static bool msdc_cmd_done(struct msdc_host *host, int events,
+			  struct mmc_request *mrq, struct mmc_command *cmd)
+{
+	bool done = false;
+	bool sbc_error;
+	unsigned long flags;
+	u32 *rsp = cmd->resp;
+
+	if (mrq->sbc && cmd == mrq->cmd &&
+	    (events & (MSDC_INT_ACMDRDY | MSDC_INT_ACMDCRCERR
+				   | MSDC_INT_ACMDTMO)))
+		msdc_auto_cmd_done(host, events, mrq->sbc);
+
+	sbc_error = mrq->sbc && mrq->sbc->error;
+
+	if (!sbc_error && !(events & (MSDC_INT_CMDRDY
+					| MSDC_INT_RSPCRCERR
+					| MSDC_INT_CMDTMO)))
+		return done;
+
+	spin_lock_irqsave(&host->lock, flags);
+	done = !host->cmd;
+	host->cmd = NULL;
+	spin_unlock_irqrestore(&host->lock, flags);
+
+	if (done)
+		return true;
+
+	sdr_clr_bits(host->base + MSDC_INTEN, MSDC_INTEN_CMDRDY |
+			MSDC_INTEN_RSPCRCERR | MSDC_INTEN_CMDTMO |
+			MSDC_INTEN_ACMDRDY | MSDC_INTEN_ACMDCRCERR |
+			MSDC_INTEN_ACMDTMO);
+	writel(cmd->arg, host->base + SDC_ARG);
+
+	if (cmd->flags & MMC_RSP_PRESENT) {
+		if (cmd->flags & MMC_RSP_136) {
+			rsp[0] = readl(host->base + SDC_RESP3);
+			rsp[1] = readl(host->base + SDC_RESP2);
+			rsp[2] = readl(host->base + SDC_RESP1);
+			rsp[3] = readl(host->base + SDC_RESP0);
+		} else {
+			rsp[0] = readl(host->base + SDC_RESP0);
+		}
+	}
+
+	if (!sbc_error && !(events & MSDC_INT_CMDRDY)) {
+		msdc_reset_hw(host);
+		if (events & MSDC_INT_RSPCRCERR) {
+			cmd->error = -EILSEQ;
+			host->error |= REQ_CMD_EIO;
+		} else if (events & MSDC_INT_CMDTMO) {
+			cmd->error = -ETIMEDOUT;
+			host->error |= REQ_CMD_TMO;
+		}
+	}
+	if (cmd->error)
+		dev_dbg(host->dev,
+				"%s: cmd=%d arg=%08X; rsp %08X; cmd_error=%d\n",
+				__func__, cmd->opcode, cmd->arg, rsp[0],
+				cmd->error);
+
+	msdc_cmd_next(host, mrq, cmd);
+	return true;
+}
+
+/* It is the core layer's responsibility to ensure card status
+ * is correct before issue a request. but host design do below
+ * checks recommended.
+ */
+static inline bool msdc_cmd_is_ready(struct msdc_host *host,
+		struct mmc_request *mrq, struct mmc_command *cmd)
+{
+	/* The max busy time we can endure is 20ms */
+	unsigned long tmo = jiffies + msecs_to_jiffies(20);
+
+	while ((readl(host->base + SDC_STS) & SDC_STS_CMDBUSY) &&
+			time_before(jiffies, tmo))
+		cpu_relax();
+	if (readl(host->base + SDC_STS) & SDC_STS_CMDBUSY) {
+		dev_err(host->dev, "CMD bus busy detected\n");
+		host->error |= REQ_CMD_BUSY;
+		msdc_cmd_done(host, MSDC_INT_CMDTMO, mrq, cmd);
+		return false;
+	}
+
+	if (mmc_resp_type(cmd) == MMC_RSP_R1B || cmd->data) {
+		tmo = jiffies + msecs_to_jiffies(20);
+		/* R1B or with data, should check SDCBUSY */
+		while ((readl(host->base + SDC_STS) & SDC_STS_SDCBUSY) &&
+				time_before(jiffies, tmo))
+			cpu_relax();
+		if (readl(host->base + SDC_STS) & SDC_STS_SDCBUSY) {
+			dev_err(host->dev, "Controller busy detected\n");
+			host->error |= REQ_CMD_BUSY;
+			msdc_cmd_done(host, MSDC_INT_CMDTMO, mrq, cmd);
+			return false;
+		}
+	}
+	return true;
+}
+
+static void msdc_start_command(struct msdc_host *host,
+		struct mmc_request *mrq, struct mmc_command *cmd)
+{
+	u32 rawcmd;
+
+	WARN_ON(host->cmd);
+	host->cmd = cmd;
+
+	if (!msdc_cmd_is_ready(host, mrq, cmd))
+		return;
+
+	if ((readl(host->base + MSDC_FIFOCS) & MSDC_FIFOCS_TXCNT) >> 16 ||
+	    readl(host->base + MSDC_FIFOCS) & MSDC_FIFOCS_RXCNT) {
+		dev_err(host->dev, "TX/RX FIFO non-empty before start of IO. Reset\n");
+		msdc_reset_hw(host);
+	}
+
+	cmd->error = 0;
+	rawcmd = msdc_cmd_prepare_raw_cmd(host, mrq, cmd);
+	mod_delayed_work(system_wq, &host->req_timeout, DAT_TIMEOUT);
+
+	sdr_set_bits(host->base + MSDC_INTEN, MSDC_INTEN_CMDRDY |
+			MSDC_INTEN_RSPCRCERR | MSDC_INTEN_CMDTMO |
+			MSDC_INTEN_ACMDRDY | MSDC_INTEN_ACMDCRCERR |
+			MSDC_INTEN_ACMDTMO);
+	writel(cmd->arg, host->base + SDC_ARG);
+	writel(rawcmd, host->base + SDC_CMD);
+}
+
+static void msdc_cmd_next(struct msdc_host *host,
+		struct mmc_request *mrq, struct mmc_command *cmd)
+{
+	if (cmd->error || (mrq->sbc && mrq->sbc->error))
+		msdc_request_done(host, mrq);
+	else if (cmd == mrq->sbc)
+		msdc_start_command(host, mrq, mrq->cmd);
+	else if (!cmd->data)
+		msdc_request_done(host, mrq);
+	else
+		msdc_start_data(host, mrq, cmd, cmd->data);
+}
+
+static void msdc_ops_request(struct mmc_host *mmc, struct mmc_request *mrq)
+{
+	struct msdc_host *host = mmc_priv(mmc);
+
+	host->error = 0;
+	WARN_ON(host->mrq);
+	host->mrq = mrq;
+
+	if (mrq->data)
+		msdc_prepare_data(host, mrq);
+
+	/* if SBC is required, we have HW option and SW option.
+	 * if HW option is enabled, and SBC does not have "special" flags,
+	 * use HW option,  otherwise use SW option
+	 */
+	if (mrq->sbc && (!mmc_card_mmc(mmc->card) ||
+	    (mrq->sbc->arg & 0xFFFF0000)))
+		msdc_start_command(host, mrq, mrq->sbc);
+	else
+		msdc_start_command(host, mrq, mrq->cmd);
+}
+
+static void msdc_pre_req(struct mmc_host *mmc, struct mmc_request *mrq,
+		bool is_first_req)
+{
+	struct msdc_host *host = mmc_priv(mmc);
+	struct mmc_data *data = mrq->data;
+
+	if (!data)
+		return;
+
+	msdc_prepare_data(host, mrq);
+	data->host_cookie |= MSDC_ASYNC_FLAG;
+}
+
+static void msdc_post_req(struct mmc_host *mmc, struct mmc_request *mrq,
+		int err)
+{
+	struct msdc_host *host = mmc_priv(mmc);
+	struct mmc_data *data;
+
+	data = mrq->data;
+	if (!data)
+		return;
+	if (data->host_cookie) {
+		data->host_cookie &= ~MSDC_ASYNC_FLAG;
+		msdc_unprepare_data(host, mrq);
+	}
+}
+
+static void msdc_data_xfer_next(struct msdc_host *host,
+				struct mmc_request *mrq, struct mmc_data *data)
+{
+	if (mmc_op_multi(mrq->cmd->opcode) && mrq->stop && !mrq->stop->error &&
+	    (!data->bytes_xfered || !mrq->sbc))
+		msdc_start_command(host, mrq, mrq->stop);
+	else
+		msdc_request_done(host, mrq);
+}
+
+static bool msdc_data_xfer_done(struct msdc_host *host, u32 events,
+				struct mmc_request *mrq, struct mmc_data *data)
+{
+	struct mmc_command *stop = data->stop;
+	unsigned long flags;
+	bool done;
+	unsigned int check_data = events &
+	    (MSDC_INT_XFER_COMPL | MSDC_INT_DATCRCERR | MSDC_INT_DATTMO
+	     | MSDC_INT_DMA_BDCSERR | MSDC_INT_DMA_GPDCSERR
+	     | MSDC_INT_DMA_PROTECT);
+
+	spin_lock_irqsave(&host->lock, flags);
+	done = !host->data;
+	if (check_data)
+		host->data = NULL;
+	spin_unlock_irqrestore(&host->lock, flags);
+
+	if (done)
+		return true;
+
+	if (check_data || (stop && stop->error)) {
+		dev_dbg(host->dev, "DMA status: 0x%8X\n",
+				readl(host->base + MSDC_DMA_CFG));
+		sdr_set_field(host->base + MSDC_DMA_CTRL, MSDC_DMA_CTRL_STOP,
+				1);
+		while (readl(host->base + MSDC_DMA_CFG) & MSDC_DMA_CFG_STS)
+			cpu_relax();
+		sdr_clr_bits(host->base + MSDC_INTEN, data_ints_mask);
+		dev_dbg(host->dev, "DMA stop\n");
+
+		if ((events & MSDC_INT_XFER_COMPL) && (!stop || !stop->error)) {
+			data->bytes_xfered = data->blocks * data->blksz;
+		} else {
+			dev_err(host->dev, "interrupt events: %x\n", events);
+			msdc_reset_hw(host);
+			host->error |= REQ_DAT_ERR;
+			data->bytes_xfered = 0;
+
+			if (events & MSDC_INT_DATTMO)
+				data->error = -ETIMEDOUT;
+
+			dev_err(host->dev, "%s: cmd=%d; blocks=%d",
+				__func__, mrq->cmd->opcode, data->blocks);
+			dev_err(host->dev, "data_error=%d xfer_size=%d\n",
+					(int)data->error, data->bytes_xfered);
+		}
+
+		msdc_data_xfer_next(host, mrq, data);
+		done = true;
+	}
+	return done;
+}
+
+static void msdc_set_buswidth(struct msdc_host *host, u32 width)
+{
+	u32 val = readl(host->base + SDC_CFG);
+
+	val &= ~SDC_CFG_BUSWIDTH;
+
+	switch (width) {
+	default:
+	case MMC_BUS_WIDTH_1:
+		val |= (MSDC_BUS_1BITS << 16);
+		break;
+	case MMC_BUS_WIDTH_4:
+		val |= (MSDC_BUS_4BITS << 16);
+		break;
+	case MMC_BUS_WIDTH_8:
+		val |= (MSDC_BUS_8BITS << 16);
+		break;
+	}
+
+	writel(val, host->base + SDC_CFG);
+	dev_dbg(host->dev, "Bus Width = %d", width);
+}
+
+static int msdc_ops_switch_volt(struct mmc_host *mmc, struct mmc_ios *ios)
+{
+	struct msdc_host *host = mmc_priv(mmc);
+	int min_uv, max_uv;
+	int ret = 0;
+
+	if (!IS_ERR(mmc->supply.vqmmc)) {
+		if (ios->signal_voltage == MMC_SIGNAL_VOLTAGE_330) {
+			min_uv = 3300000;
+			max_uv = 3300000;
+		} else if (ios->signal_voltage == MMC_SIGNAL_VOLTAGE_180) {
+			min_uv = 1800000;
+			max_uv = 1800000;
+		} else {
+			dev_err(host->dev, "Unsupported signal voltage!\n");
+			return -EINVAL;
+		}
+
+		ret = regulator_set_voltage(mmc->supply.vqmmc, min_uv, max_uv);
+		if (ret) {
+			dev_err(host->dev,
+					"Regulator set error %d: %d - %d\n",
+					ret, min_uv, max_uv);
+		} else {
+			/* Apply different pinctrl settings for different signal voltage */
+			if (ios->signal_voltage == MMC_SIGNAL_VOLTAGE_180)
+				pinctrl_select_state(host->pinctrl, host->pins_uhs);
+			else
+				pinctrl_select_state(host->pinctrl, host->pins_default);
+		}
+	}
+	return ret;
+}
+
+static int msdc_card_busy(struct mmc_host *mmc)
+{
+	struct msdc_host *host = mmc_priv(mmc);
+	u32 status = readl(host->base + MSDC_PS);
+
+	/* check if any pin between dat[0:3] is low */
+	if (((status >> 16) & 0xf) != 0xf)
+		return 1;
+
+	return 0;
+}
+
+static void msdc_request_timeout(struct work_struct *work)
+{
+	struct msdc_host *host = container_of(work, struct msdc_host,
+			req_timeout.work);
+
+	/* simulate HW timeout status */
+	dev_err(host->dev, "%s: aborting cmd/data/mrq\n", __func__);
+	if (host->mrq) {
+		dev_err(host->dev, "%s: aborting mrq=%p cmd=%d\n", __func__,
+				host->mrq, host->mrq->cmd->opcode);
+		if (host->cmd) {
+			dev_err(host->dev, "%s: aborting cmd=%d\n",
+					__func__, host->cmd->opcode);
+			msdc_cmd_done(host, MSDC_INT_CMDTMO, host->mrq,
+					host->cmd);
+		} else if (host->data) {
+			dev_err(host->dev, "%s: abort data: cmd%d; %d blocks\n",
+					__func__, host->mrq->cmd->opcode,
+					host->data->blocks);
+			msdc_data_xfer_done(host, MSDC_INT_DATTMO, host->mrq,
+					host->data);
+		}
+	}
+}
+
+static irqreturn_t msdc_irq(int irq, void *dev_id)
+{
+	struct msdc_host *host = (struct msdc_host *) dev_id;
+
+	while (true) {
+		unsigned long flags;
+		struct mmc_request *mrq;
+		struct mmc_command *cmd;
+		struct mmc_data *data;
+		u32 events, event_mask;
+
+		spin_lock_irqsave(&host->lock, flags);
+		events = readl(host->base + MSDC_INT);
+		event_mask = readl(host->base + MSDC_INTEN);
+		/* clear interrupts */
+		writel(events & event_mask, host->base + MSDC_INT);
+
+		mrq = host->mrq;
+		cmd = host->cmd;
+		data = host->data;
+		spin_unlock_irqrestore(&host->lock, flags);
+
+		if (!(events & event_mask))
+			break;
+
+		if (!mrq) {
+			dev_err(host->dev,
+				"%s: MRQ=NULL; events=%08X; event_mask=%08X\n",
+				__func__, events, event_mask);
+			WARN_ON(1);
+			break;
+		}
+
+		dev_dbg(host->dev, "%s: events=%08X\n", __func__, events);
+
+		if (cmd)
+			msdc_cmd_done(host, events, mrq, cmd);
+		else if (data)
+			msdc_data_xfer_done(host, events, mrq, data);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void msdc_init_hw(struct msdc_host *host)
+{
+	u32 val;
+
+	/* Configure to MMC/SD mode, clock free running */
+	sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_MODE | MSDC_CFG_CKPDN);
+
+	/* Reset */
+	msdc_reset_hw(host);
+
+	/* Disable card detection */
+	sdr_clr_bits(host->base + MSDC_PS, MSDC_PS_CDEN);
+
+	/* Disable and clear all interrupts */
+	writel(0, host->base + MSDC_INTEN);
+	val = readl(host->base + MSDC_INT);
+	writel(val, host->base + MSDC_INT);
+
+	writel(0, host->base + MSDC_PAD_TUNE);
+	writel(0, host->base + MSDC_IOCON);
+	sdr_set_field(host->base + MSDC_IOCON, MSDC_IOCON_DDLSEL, 1);
+	writel(0x403c004f, host->base + MSDC_PATCH_BIT);
+	sdr_set_field(host->base + MSDC_PATCH_BIT, MSDC_CKGEN_MSDC_DLY_SEL, 1);
+	writel(0xffff0089, host->base + MSDC_PATCH_BIT1);
+	/* Configure to enable SDIO mode.
+	 * it's must otherwise sdio cmd5 failed
+	 */
+	sdr_set_bits(host->base + SDC_CFG, SDC_CFG_SDIO);
+
+	/* disable detect SDIO device interrupt function */
+	sdr_clr_bits(host->base + SDC_CFG, SDC_CFG_SDIOIDE);
+
+	/* Configure to default data timeout */
+	sdr_set_field(host->base + SDC_CFG, SDC_CFG_DTOC, 3);
+
+	dev_dbg(host->dev, "init hardware done!");
+}
+
+static void msdc_deinit_hw(struct msdc_host *host)
+{
+	u32 val;
+	/* Disable and clear all interrupts */
+	writel(0, host->base + MSDC_INTEN);
+
+	val = readl(host->base + MSDC_INT);
+	writel(val, host->base + MSDC_INT);
+}
+
+/* init gpd and bd list in msdc_drv_probe */
+static void msdc_init_gpd_bd(struct msdc_host *host, struct msdc_dma *dma)
+{
+	struct mt_gpdma_desc *gpd = dma->gpd;
+	struct mt_bdma_desc *bd = dma->bd;
+	int i;
+
+	memset(gpd, 0, sizeof(struct mt_gpdma_desc));
+
+	gpd->gpd_info = GPDMA_DESC_BDP; /* hwo, cs, bd pointer */
+	gpd->ptr = (u32)dma->bd_addr; /* physical address */
+
+	memset(bd, 0, sizeof(struct mt_bdma_desc) * MAX_BD_NUM);
+	for (i = 0; i < (MAX_BD_NUM - 1); i++)
+		bd[i].next = (u32)dma->bd_addr + sizeof(*bd) * (i + 1);
+}
+
+static void msdc_ops_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
+{
+	struct msdc_host *host = mmc_priv(mmc);
+	int ret;
+	u32 ddr = 0;
+
+	if (ios->timing == MMC_TIMING_UHS_DDR50 ||
+	    ios->timing == MMC_TIMING_MMC_DDR52)
+		ddr = 1;
+
+	msdc_set_buswidth(host, ios->bus_width);
+
+	/* Suspend/Resume will do power off/on */
+	switch (ios->power_mode) {
+	case MMC_POWER_UP:
+		if (!IS_ERR(mmc->supply.vmmc)) {
+			ret = mmc_regulator_set_ocr(mmc, mmc->supply.vmmc,
+					ios->vdd);
+			if (ret) {
+				dev_err(host->dev, "Failed to set vmmc power!\n");
+				return;
+			}
+		}
+		break;
+	case MMC_POWER_ON:
+		if (!IS_ERR(mmc->supply.vqmmc) && !host->vqmmc_enabled) {
+			ret = regulator_enable(mmc->supply.vqmmc);
+			if (ret)
+				dev_err(host->dev, "Failed to set vqmmc power!\n");
+			else
+				host->vqmmc_enabled = true;
+		}
+		break;
+	case MMC_POWER_OFF:
+		if (!IS_ERR(mmc->supply.vmmc))
+			mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0);
+
+		if (!IS_ERR(mmc->supply.vqmmc) && host->vqmmc_enabled) {
+			regulator_disable(mmc->supply.vqmmc);
+			host->vqmmc_enabled = false;
+		}
+		break;
+	default:
+		break;
+	}
+
+	if (host->mclk != ios->clock || host->ddr != ddr)
+		msdc_set_mclk(host, ddr, ios->clock);
+}
+
+static struct mmc_host_ops mt_msdc_ops = {
+	.post_req = msdc_post_req,
+	.pre_req = msdc_pre_req,
+	.request = msdc_ops_request,
+	.set_ios = msdc_ops_set_ios,
+	.start_signal_voltage_switch = msdc_ops_switch_volt,
+	.card_busy = msdc_card_busy,
+};
+
+static int msdc_drv_probe(struct platform_device *pdev)
+{
+	struct mmc_host *mmc;
+	struct msdc_host *host;
+	struct resource *res;
+	int ret;
+
+	if (!pdev->dev.of_node) {
+		dev_err(&pdev->dev, "No DT found\n");
+		return -EINVAL;
+	}
+	/* Allocate MMC host for this device */
+	mmc = mmc_alloc_host(sizeof(struct msdc_host), &pdev->dev);
+	if (!mmc)
+		return -ENOMEM;
+
+	host = mmc_priv(mmc);
+	ret = mmc_of_parse(mmc);
+	if (ret)
+		goto host_free;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	host->base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(host->base)) {
+		ret = PTR_ERR(host->base);
+		goto host_free;
+	}
+
+	ret = mmc_regulator_get_supply(mmc);
+	if (ret == -EPROBE_DEFER)
+		goto host_free;
+
+	host->src_clk = devm_clk_get(&pdev->dev, "source");
+	if (IS_ERR(host->src_clk)) {
+		ret = PTR_ERR(host->src_clk);
+		goto host_free;
+	}
+
+	host->h_clk = devm_clk_get(&pdev->dev, "hclk");
+	if (IS_ERR(host->h_clk)) {
+		ret = PTR_ERR(host->h_clk);
+		goto host_free;
+	}
+
+	host->irq = platform_get_irq(pdev, 0);
+	if (host->irq < 0) {
+		ret = -EINVAL;
+		goto host_free;
+	}
+
+	host->pinctrl = devm_pinctrl_get(&pdev->dev);
+	if (IS_ERR(host->pinctrl)) {
+		ret = PTR_ERR(host->pinctrl);
+		dev_err(&pdev->dev, "Cannot find pinctrl!\n");
+		goto host_free;
+	}
+
+	host->pins_default = pinctrl_lookup_state(host->pinctrl, "default");
+	if (IS_ERR(host->pins_default)) {
+		ret = PTR_ERR(host->pins_default);
+		dev_err(&pdev->dev, "Cannot find pinctrl default!\n");
+		goto host_free;
+	}
+
+	host->pins_uhs = pinctrl_lookup_state(host->pinctrl, "state_uhs");
+	if (IS_ERR(host->pins_uhs)) {
+		ret = PTR_ERR(host->pins_uhs);
+		dev_err(&pdev->dev, "Cannot find pinctrl uhs!\n");
+		goto host_free;
+	}
+
+	host->dev = &pdev->dev;
+	host->mmc = mmc;
+	host->src_clk_freq = clk_get_rate(host->src_clk);
+	/* Set host parameters to mmc */
+	mmc->ops = &mt_msdc_ops;
+	mmc->f_min = host->src_clk_freq / (4 * 255);
+
+	mmc->caps |= MMC_CAP_ERASE | MMC_CAP_CMD23;
+	/* MMC core transfer sizes tunable parameters */
+	mmc->max_segs = MAX_BD_NUM;
+	mmc->max_seg_size = BDMA_DESC_BUFLEN;
+	mmc->max_blk_size = 2048;
+	mmc->max_req_size = 512 * 1024;
+	mmc->max_blk_count = mmc->max_req_size / 512;
+	host->dma_mask = DMA_BIT_MASK(32);
+	mmc_dev(mmc)->dma_mask = &host->dma_mask;
+
+	host->timeout_clks = 3 * 1048576;
+	host->dma.gpd = dma_alloc_coherent(&pdev->dev,
+				sizeof(struct mt_gpdma_desc),
+				&host->dma.gpd_addr, GFP_KERNEL);
+	host->dma.bd = dma_alloc_coherent(&pdev->dev,
+				MAX_BD_NUM * sizeof(struct mt_bdma_desc),
+				&host->dma.bd_addr, GFP_KERNEL);
+	if (!host->dma.gpd || !host->dma.bd) {
+		ret = -ENOMEM;
+		goto release_mem;
+	}
+	msdc_init_gpd_bd(host, &host->dma);
+	INIT_DELAYED_WORK(&host->req_timeout, msdc_request_timeout);
+	spin_lock_init(&host->lock);
+
+	platform_set_drvdata(pdev, mmc);
+	msdc_ungate_clock(host);
+	msdc_init_hw(host);
+
+	ret = devm_request_irq(&pdev->dev, host->irq, msdc_irq,
+		IRQF_TRIGGER_LOW | IRQF_ONESHOT, pdev->name, host);
+	if (ret)
+		goto release;
+
+	ret = mmc_add_host(mmc);
+	if (ret)
+		goto release;
+
+	return 0;
+
+release:
+	platform_set_drvdata(pdev, NULL);
+	msdc_deinit_hw(host);
+	msdc_gate_clock(host);
+release_mem:
+	if (host->dma.gpd)
+		dma_free_coherent(&pdev->dev,
+			sizeof(struct mt_gpdma_desc),
+			host->dma.gpd, host->dma.gpd_addr);
+	if (host->dma.bd)
+		dma_free_coherent(&pdev->dev,
+			MAX_BD_NUM * sizeof(struct mt_bdma_desc),
+			host->dma.bd, host->dma.bd_addr);
+host_free:
+	mmc_free_host(mmc);
+
+	return ret;
+}
+
+static int msdc_drv_remove(struct platform_device *pdev)
+{
+	struct mmc_host *mmc;
+	struct msdc_host *host;
+
+	mmc = platform_get_drvdata(pdev);
+	host = mmc_priv(mmc);
+
+	platform_set_drvdata(pdev, NULL);
+	mmc_remove_host(host->mmc);
+	msdc_deinit_hw(host);
+	msdc_gate_clock(host);
+
+	dma_free_coherent(&pdev->dev,
+			sizeof(struct mt_gpdma_desc),
+			host->dma.gpd, host->dma.gpd_addr);
+	dma_free_coherent(&pdev->dev, MAX_BD_NUM * sizeof(struct mt_bdma_desc),
+			host->dma.bd, host->dma.bd_addr);
+
+	mmc_free_host(host->mmc);
+
+	return 0;
+}
+
+static const struct of_device_id msdc_of_ids[] = {
+	{   .compatible = "mediatek,mt8135-mmc", },
+	{}
+};
+
+static struct platform_driver mt_msdc_driver = {
+	.probe = msdc_drv_probe,
+	.remove = msdc_drv_remove,
+	.driver = {
+		.name = "mtk-msdc",
+		.of_match_table = msdc_of_ids,
+	},
+};
+
+module_platform_driver(mt_msdc_driver);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("MediaTek SD/MMC Card Driver");
diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index de722d4e9d61..258daf914c6d 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -121,6 +121,7 @@ struct mmc_data {
 	struct mmc_request	*mrq;		/* associated request */
 
 	unsigned int		sg_len;		/* size of scatter list */
+	int			sg_count;	/* mapped sg entries */
 	struct scatterlist	*sg;		/* I/O scatter list */
 	s32			host_cookie;	/* host private data */
 };
-- 
cgit v1.2.3


From a1a56aaa0735c7821e85c37268a7c12e132415cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Jun 2015 18:10:13 -0700
Subject: netfilter: x_tables: align per cpu xt_counter

Let's force a 16 bytes alignment on xt_counter percpu allocations,
so that bytes and packets sit in same cache line.

xt_counter being exported to user space, we cannot add __align(16) on
the structure itself.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 95693c4cebdd..1c97a2204379 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -356,7 +356,8 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
  * so nothing needs to be done there.
  *
  * xt_percpu_counter_alloc returns the address of the percpu
- * counter, or 0 on !SMP.
+ * counter, or 0 on !SMP. We force an alignment of 16 bytes
+ * so that bytes/packets share a common cache line.
  *
  * Hence caller must use IS_ERR_VALUE to check for error, this
  * allows us to return 0 for single core systems without forcing
@@ -365,7 +366,8 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 static inline u64 xt_percpu_counter_alloc(void)
 {
 	if (nr_cpu_ids > 1) {
-		void __percpu *res = alloc_percpu(struct xt_counters);
+		void __percpu *res = __alloc_percpu(sizeof(struct xt_counters),
+						    sizeof(struct xt_counters));
 
 		if (res == NULL)
 			return (u64) -ENOMEM;
-- 
cgit v1.2.3


From 3b0f95be143bea1aa47beb20134ef82e4e4068dc Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@arm.linux.org.uk>
Date: Tue, 16 Jun 2015 23:06:20 +0100
Subject: irq: Add irq_set_chained_handler_and_data()

Driver authors seem to get the ordering of irq_set_chained_handler()
and irq_set_handler_data() wrong - ordering the former before the
latter.  This opens a race window where, if there is an interrupt
pending, the handler will be called between these two calls,
potentially resulting in an oops.

Provide a single interface to set both of these together, especially
as that's commonly what is required.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Alexandre Courbot <gnurou@gmail.com>
Cc: Hans Ulli Kroll <ulli.kroll@googlemail.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Lee Jones <lee.jones@linaro.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/E1Z4yzs-0002Rw-4B@rmk-PC.arm.linux.org.uk
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h |  9 +++++++++
 kernel/irq/chip.c   | 45 ++++++++++++++++++++++++++++++++++-----------
 2 files changed, 43 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index de3213d271ff..42861d28fc2a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -525,6 +525,15 @@ irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle)
 	__irq_set_handler(irq, handle, 1, NULL);
 }
 
+/*
+ * Set a highlevel chained flow handler and its data for a given IRQ.
+ * (a chained handler is automatically enabled and set to
+ *  IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD)
+ */
+void
+irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+				 void *data);
+
 void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set);
 
 static inline void irq_set_status_flags(unsigned int irq, unsigned long set)
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 330fc797e632..27f4332c7f84 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -719,15 +719,9 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
 }
 
 void
-__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
-		  const char *name)
+__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
+		     int is_chained, const char *name)
 {
-	unsigned long flags;
-	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
-	if (!desc)
-		return;
-
 	if (!handle) {
 		handle = handle_bad_irq;
 	} else {
@@ -749,13 +743,13 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 			 * right away.
 			 */
 			if (WARN_ON(is_chained))
-				goto out;
+				return;
 			/* Try the parent */
 			irq_data = irq_data->parent_data;
 		}
 #endif
 		if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
-			goto out;
+			return;
 	}
 
 	/* Uninstall? */
@@ -774,11 +768,40 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		irq_settings_set_nothread(desc);
 		irq_startup(desc, true);
 	}
-out:
+}
+
+void
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+		  const char *name)
+{
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+	if (!desc)
+		return;
+
+	__irq_do_set_handler(desc, handle, is_chained, name);
 	irq_put_desc_busunlock(desc, flags);
 }
 EXPORT_SYMBOL_GPL(__irq_set_handler);
 
+void
+irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+				 void *data)
+{
+	unsigned long flags;
+	struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+
+	if (!desc)
+		return;
+
+	__irq_do_set_handler(desc, handle, 1, NULL);
+	desc->irq_data.handler_data = data;
+
+	irq_put_desc_busunlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
+
 void
 irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
 			      irq_flow_handler_t handle, const char *name)
-- 
cgit v1.2.3


From 0f1b414d190724617eb1cdd615592fa8cd9d0b50 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 18 Jun 2015 18:32:02 +0200
Subject: ACPI / PNP: Avoid conflicting resource reservations

Commit b9a5e5e18fbf "ACPI / init: Fix the ordering of
acpi_reserve_resources()" overlooked the fact that the memory
and/or I/O regions reserved by acpi_reserve_resources() may
conflict with those reserved by the PNP "system" driver.

If that conflict actually takes place, it causes the reservations
made by the "system" driver to fail while before commit b9a5e5e18fbf
all reservations made by it and by acpi_reserve_resources() would be
successful.  In turn, that allows the resources that haven't been
reserved by the "system" driver to be used by others (e.g. PCI) which
sometimes leads to functional problems (up to and including boot
failures).

To fix that issue, introduce a common resource reservation routine,
acpi_reserve_region(), to be used by both acpi_reserve_resources()
and the "system" driver, that will track all resources reserved by
it and avoid making conflicting requests.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=99831
Link: http://marc.info/?t=143389402600001&r=1&w=2
Fixes: b9a5e5e18fbf "ACPI / init: Fix the ordering of acpi_reserve_resources()"
Reported-by: Roland Dreier <roland@purestorage.com>
Cc: All applicable <stable@vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/osl.c      |   6 +-
 drivers/acpi/resource.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/pnp/system.c    |  35 ++++++++---
 include/linux/acpi.h    |  10 +++
 4 files changed, 197 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 7ccba395c9dd..5226a8b921ae 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -175,11 +175,7 @@ static void __init acpi_request_region (struct acpi_generic_address *gas,
 	if (!addr || !length)
 		return;
 
-	/* Resources are never freed */
-	if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_IO)
-		request_region(addr, length, desc);
-	else if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
-		request_mem_region(addr, length, desc);
+	acpi_reserve_region(addr, length, gas->space_id, 0, desc);
 }
 
 static void __init acpi_reserve_resources(void)
diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index 8244f013f210..fcb7807ea8b7 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -26,6 +26,7 @@
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/ioport.h>
+#include <linux/list.h>
 #include <linux/slab.h>
 
 #ifdef CONFIG_X86
@@ -621,3 +622,162 @@ int acpi_dev_filter_resource_type(struct acpi_resource *ares,
 	return (type & types) ? 0 : 1;
 }
 EXPORT_SYMBOL_GPL(acpi_dev_filter_resource_type);
+
+struct reserved_region {
+	struct list_head node;
+	u64 start;
+	u64 end;
+};
+
+static LIST_HEAD(reserved_io_regions);
+static LIST_HEAD(reserved_mem_regions);
+
+static int request_range(u64 start, u64 end, u8 space_id, unsigned long flags,
+			 char *desc)
+{
+	unsigned int length = end - start + 1;
+	struct resource *res;
+
+	res = space_id == ACPI_ADR_SPACE_SYSTEM_IO ?
+		request_region(start, length, desc) :
+		request_mem_region(start, length, desc);
+	if (!res)
+		return -EIO;
+
+	res->flags &= ~flags;
+	return 0;
+}
+
+static int add_region_before(u64 start, u64 end, u8 space_id,
+			     unsigned long flags, char *desc,
+			     struct list_head *head)
+{
+	struct reserved_region *reg;
+	int error;
+
+	reg = kmalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg)
+		return -ENOMEM;
+
+	error = request_range(start, end, space_id, flags, desc);
+	if (error)
+		return error;
+
+	reg->start = start;
+	reg->end = end;
+	list_add_tail(&reg->node, head);
+	return 0;
+}
+
+/**
+ * acpi_reserve_region - Reserve an I/O or memory region as a system resource.
+ * @start: Starting address of the region.
+ * @length: Length of the region.
+ * @space_id: Identifier of address space to reserve the region from.
+ * @flags: Resource flags to clear for the region after requesting it.
+ * @desc: Region description (for messages).
+ *
+ * Reserve an I/O or memory region as a system resource to prevent others from
+ * using it.  If the new region overlaps with one of the regions (in the given
+ * address space) already reserved by this routine, only the non-overlapping
+ * parts of it will be reserved.
+ *
+ * Returned is either 0 (success) or a negative error code indicating a resource
+ * reservation problem.  It is the code of the first encountered error, but the
+ * routine doesn't abort until it has attempted to request all of the parts of
+ * the new region that don't overlap with other regions reserved previously.
+ *
+ * The resources requested by this routine are never released.
+ */
+int acpi_reserve_region(u64 start, unsigned int length, u8 space_id,
+			unsigned long flags, char *desc)
+{
+	struct list_head *regions;
+	struct reserved_region *reg;
+	u64 end = start + length - 1;
+	int ret = 0, error = 0;
+
+	if (space_id == ACPI_ADR_SPACE_SYSTEM_IO)
+		regions = &reserved_io_regions;
+	else if (space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
+		regions = &reserved_mem_regions;
+	else
+		return -EINVAL;
+
+	if (list_empty(regions))
+		return add_region_before(start, end, space_id, flags, desc, regions);
+
+	list_for_each_entry(reg, regions, node)
+		if (reg->start == end + 1) {
+			/* The new region can be prepended to this one. */
+			ret = request_range(start, end, space_id, flags, desc);
+			if (!ret)
+				reg->start = start;
+
+			return ret;
+		} else if (reg->start > end) {
+			/* No overlap.  Add the new region here and get out. */
+			return add_region_before(start, end, space_id, flags,
+						 desc, &reg->node);
+		} else if (reg->end == start - 1) {
+			goto combine;
+		} else if (reg->end >= start) {
+			goto overlap;
+		}
+
+	/* The new region goes after the last existing one. */
+	return add_region_before(start, end, space_id, flags, desc, regions);
+
+ overlap:
+	/*
+	 * The new region overlaps an existing one.
+	 *
+	 * The head part of the new region immediately preceding the existing
+	 * overlapping one can be combined with it right away.
+	 */
+	if (reg->start > start) {
+		error = request_range(start, reg->start - 1, space_id, flags, desc);
+		if (error)
+			ret = error;
+		else
+			reg->start = start;
+	}
+
+ combine:
+	/*
+	 * The new region is adjacent to an existing one.  If it extends beyond
+	 * that region all the way to the next one, it is possible to combine
+	 * all three of them.
+	 */
+	while (reg->end < end) {
+		struct reserved_region *next = NULL;
+		u64 a = reg->end + 1, b = end;
+
+		if (!list_is_last(&reg->node, regions)) {
+			next = list_next_entry(reg, node);
+			if (next->start <= end)
+				b = next->start - 1;
+		}
+		error = request_range(a, b, space_id, flags, desc);
+		if (!error) {
+			if (next && next->start == b + 1) {
+				reg->end = next->end;
+				list_del(&next->node);
+				kfree(next);
+			} else {
+				reg->end = end;
+				break;
+			}
+		} else if (next) {
+			if (!ret)
+				ret = error;
+
+			reg = next;
+		} else {
+			break;
+		}
+	}
+
+	return ret ? ret : error;
+}
+EXPORT_SYMBOL_GPL(acpi_reserve_region);
diff --git a/drivers/pnp/system.c b/drivers/pnp/system.c
index 49c1720df59a..515f33882ab8 100644
--- a/drivers/pnp/system.c
+++ b/drivers/pnp/system.c
@@ -7,6 +7,7 @@
  *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  */
 
+#include <linux/acpi.h>
 #include <linux/pnp.h>
 #include <linux/device.h>
 #include <linux/init.h>
@@ -22,25 +23,41 @@ static const struct pnp_device_id pnp_dev_table[] = {
 	{"", 0}
 };
 
+#ifdef CONFIG_ACPI
+static bool __reserve_range(u64 start, unsigned int length, bool io, char *desc)
+{
+	u8 space_id = io ? ACPI_ADR_SPACE_SYSTEM_IO : ACPI_ADR_SPACE_SYSTEM_MEMORY;
+	return !acpi_reserve_region(start, length, space_id, IORESOURCE_BUSY, desc);
+}
+#else
+static bool __reserve_range(u64 start, unsigned int length, bool io, char *desc)
+{
+	struct resource *res;
+
+	res = io ? request_region(start, length, desc) :
+		request_mem_region(start, length, desc);
+	if (res) {
+		res->flags &= ~IORESOURCE_BUSY;
+		return true;
+	}
+	return false;
+}
+#endif
+
 static void reserve_range(struct pnp_dev *dev, struct resource *r, int port)
 {
 	char *regionid;
 	const char *pnpid = dev_name(&dev->dev);
 	resource_size_t start = r->start, end = r->end;
-	struct resource *res;
+	bool reserved;
 
 	regionid = kmalloc(16, GFP_KERNEL);
 	if (!regionid)
 		return;
 
 	snprintf(regionid, 16, "pnp %s", pnpid);
-	if (port)
-		res = request_region(start, end - start + 1, regionid);
-	else
-		res = request_mem_region(start, end - start + 1, regionid);
-	if (res)
-		res->flags &= ~IORESOURCE_BUSY;
-	else
+	reserved = __reserve_range(start, end - start + 1, !!port, regionid);
+	if (!reserved)
 		kfree(regionid);
 
 	/*
@@ -49,7 +66,7 @@ static void reserve_range(struct pnp_dev *dev, struct resource *r, int port)
 	 * have double reservations.
 	 */
 	dev_info(&dev->dev, "%pR %s reserved\n", r,
-		 res ? "has been" : "could not be");
+		 reserved ? "has been" : "could not be");
 }
 
 static void reserve_resources_of_dev(struct pnp_dev *dev)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e35e29c..299763df1b27 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -332,6 +332,9 @@ int acpi_check_region(resource_size_t start, resource_size_t n,
 
 int acpi_resources_are_enforced(void);
 
+int acpi_reserve_region(u64 start, unsigned int length, u8 space_id,
+			unsigned long flags, char *desc);
+
 #ifdef CONFIG_HIBERNATION
 void __init acpi_no_s4_hw_signature(void);
 #endif
@@ -525,6 +528,13 @@ static inline int acpi_check_region(resource_size_t start, resource_size_t n,
 	return 0;
 }
 
+static inline int acpi_reserve_region(u64 start, unsigned int length,
+				      u8 space_id, unsigned long flags,
+				      char *desc)
+{
+	return -ENXIO;
+}
+
 struct acpi_table_header;
 static inline int acpi_table_parse(char *id,
 				int (*handler)(struct acpi_table_header *))
-- 
cgit v1.2.3


From a263653ed798216c0069922d7b5237ca49436007 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 17 Jun 2015 10:28:27 -0500
Subject: netfilter: don't pull include/linux/netfilter.h from netns headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This pulls the full hook netfilter definitions from all those that include
net_namespace.h.

Instead let's just include the bare minimum required in the new
linux/netfilter_defs.h file, and use it from the netfilter netns header files.

I also needed to include in.h and in6.h from linux/netfilter.h otherwise we hit
this compilation error:

In file included from include/linux/netfilter_defs.h:4:0,
                 from include/net/netns/netfilter.h:4,
                 from include/net/net_namespace.h:22,
                 from include/linux/netdevice.h:43,
                 from net/netfilter/nfnetlink_queue_core.c:23:
include/uapi/linux/netfilter.h:76:17: error: field ‘in’ has incomplete type struct in_addr in;

And also explicit include linux/netfilter.h in several spots.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/netfilter.h      | 6 ++----
 include/linux/netfilter_defs.h | 9 +++++++++
 include/net/netns/netfilter.h  | 2 +-
 include/net/netns/x_tables.h   | 2 +-
 include/uapi/linux/netfilter.h | 3 ++-
 net/ipv6/output_core.c         | 1 +
 6 files changed, 16 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/netfilter_defs.h

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index f5ff5d156da8..00050dfd9f23 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -10,7 +10,8 @@
 #include <linux/wait.h>
 #include <linux/list.h>
 #include <linux/static_key.h>
-#include <uapi/linux/netfilter.h>
+#include <linux/netfilter_defs.h>
+
 #ifdef CONFIG_NETFILTER
 static inline int NF_DROP_GETERR(int verdict)
 {
@@ -38,9 +39,6 @@ static inline void nf_inet_addr_mask(const union nf_inet_addr *a1,
 
 int netfilter_init(void);
 
-/* Largest hook number + 1 */
-#define NF_MAX_HOOKS 8
-
 struct sk_buff;
 
 struct nf_hook_ops;
diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h
new file mode 100644
index 000000000000..d3a7f8597e82
--- /dev/null
+++ b/include/linux/netfilter_defs.h
@@ -0,0 +1,9 @@
+#ifndef __LINUX_NETFILTER_CORE_H_
+#define __LINUX_NETFILTER_CORE_H_
+
+#include <uapi/linux/netfilter.h>
+
+/* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */
+#define NF_MAX_HOOKS 8
+
+#endif
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index cf25b5e35f3c..532e4ba64f49 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -1,7 +1,7 @@
 #ifndef __NETNS_NETFILTER_H
 #define __NETNS_NETFILTER_H
 
-#include <linux/netfilter.h>
+#include <linux/netfilter_defs.h>
 
 struct proc_dir_entry;
 struct nf_logger;
diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h
index 4d6597ad6067..c8a7681efa6a 100644
--- a/include/net/netns/x_tables.h
+++ b/include/net/netns/x_tables.h
@@ -2,7 +2,7 @@
 #define __NETNS_X_TABLES_H
 
 #include <linux/list.h>
-#include <linux/netfilter.h>
+#include <linux/netfilter_defs.h>
 
 struct ebt_table;
 
diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index 177027cce6b3..d93f949d1d9a 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -4,7 +4,8 @@
 #include <linux/types.h>
 #include <linux/compiler.h>
 #include <linux/sysctl.h>
-
+#include <linux/in.h>
+#include <linux/in6.h>
 
 /* Responses from hook functions. */
 #define NF_DROP 0
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 21678acd4521..928a0fb0b744 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -8,6 +8,7 @@
 #include <net/ip6_fib.h>
 #include <net/addrconf.h>
 #include <net/secure_seq.h>
+#include <linux/netfilter.h>
 
 static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
 			       const struct in6_addr *dst,
-- 
cgit v1.2.3


From dcb8f5c8139ef945cdfd55900fae265c4dbefc02 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 17 Jun 2015 23:58:28 +0200
Subject: netfilter: xtables: fix warnings on 32bit platforms

On 32bit archs gcc complains due to cast from void* to u64.
Add intermediate casts to long to silence these warnings.

include/linux/netfilter/x_tables.h:376:10: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
include/linux/netfilter/x_tables.h:384:15: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
include/linux/netfilter/x_tables.h:391:23: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
include/linux/netfilter/x_tables.h:400:22: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]

Fixes: 71ae0dff02d756e ("netfilter: xtables: use percpu rule counters")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 1c97a2204379..286098a5667f 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -372,7 +372,7 @@ static inline u64 xt_percpu_counter_alloc(void)
 		if (res == NULL)
 			return (u64) -ENOMEM;
 
-		return (__force u64) res;
+		return (u64) (__force unsigned long) res;
 	}
 
 	return 0;
@@ -380,14 +380,14 @@ static inline u64 xt_percpu_counter_alloc(void)
 static inline void xt_percpu_counter_free(u64 pcnt)
 {
 	if (nr_cpu_ids > 1)
-		free_percpu((void __percpu *) pcnt);
+		free_percpu((void __percpu *) (unsigned long) pcnt);
 }
 
 static inline struct xt_counters *
 xt_get_this_cpu_counter(struct xt_counters *cnt)
 {
 	if (nr_cpu_ids > 1)
-		return this_cpu_ptr((void __percpu *) cnt->pcnt);
+		return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt);
 
 	return cnt;
 }
@@ -396,7 +396,7 @@ static inline struct xt_counters *
 xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
 {
 	if (nr_cpu_ids > 1)
-		return per_cpu_ptr((void __percpu *) cnt->pcnt, cpu);
+		return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu);
 
 	return cnt;
 }
-- 
cgit v1.2.3


From fb02915f47181e824339d91f8e385fd4bd746d6a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 18 Jun 2015 16:54:28 -0400
Subject: kernfs: make kernfs_get_inode() public

Move kernfs_get_inode() prototype from fs/kernfs/kernfs-internal.h to
include/linux/kernfs.h.  It obtains the matching inode for a
kernfs_node.

It will be used by cgroup for inode based permission checks for now
but is generally useful.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/kernfs-internal.h | 1 -
 include/linux/kernfs.h      | 5 +++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index af9fa7499919..6762bfbd8207 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache;
 /*
  * inode.c
  */
-struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
 void kernfs_evict_inode(struct inode *inode);
 int kernfs_iop_permission(struct inode *inode, int mask);
 int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 71ecdab1671b..e6b2f7db9c0c 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -277,6 +277,7 @@ void kernfs_put(struct kernfs_node *kn);
 
 struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
 struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
 
 struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
 				       unsigned int flags, void *priv);
@@ -352,6 +353,10 @@ static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
 static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
 { return NULL; }
 
+static inline struct inode *
+kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
+{ return NULL; }
+
 static inline struct kernfs_root *
 kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
 		   void *priv)
-- 
cgit v1.2.3


From 187fe84067bd377047cfcb7f2bbc7c9dc12d290c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 18 Jun 2015 16:54:28 -0400
Subject: cgroup: require write perm on common ancestor when moving processes
 on the default hierarchy

On traditional hierarchies, if a task has write access to "tasks" or
"cgroup.procs" file of a cgroup and its euid agrees with the target,
it can move the target to the cgroup; however, consider the following
scenario.  The owner of each cgroup is in the parentheses.

 R (root) - 0 (root) - 00 (user1) - 000 (user1)
          |                       \ 001 (user1)
          \ 1 (root) - 10 (user1)

The subtrees of 00 and 10 are delegated to user1; however, while both
subtrees may belong to the same user, it is clear that the two
subtrees are to be isolated - they're under completely separate
resource limits imposed by 0 and 1, respectively.  Note that 0 and 1
aren't strictly necessary but added to ease illustrating the issue.

If user1 is allowed to move processes between the two subtrees, the
intention of the hierarchy - keeping a given group of processes under
a subtree with certain resource restrictions while delegating
management of the subtree - can be circumvented by user1.

This happens because migration permission check doesn't consider the
hierarchical nature of cgroups.  To fix the issue, this patch adds an
extra permission requirement when userland tries to migrate a process
in the default hierarchy - the issuing task must have write access to
the common ancestor of "cgroup.procs" file of the ancestor in addition
to the destination's.

Conceptually, the issuer must be able to move the target process from
the source cgroup to the common ancestor of source and destination
cgroups and then to the destination.  As long as delegation is done in
a proper top-down way, this guarantees that a delegatee can't smuggle
processes across disjoint delegation domains.

The next patch will add documentation on the delegation model on the
default hierarchy.

v2: Fixed missing !ret test.  Spotted by Li Zefan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup-defs.h |  1 +
 kernel/cgroup.c             | 30 +++++++++++++++++++++++++++---
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c5588c438448..93755a629299 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -220,6 +220,7 @@ struct cgroup {
 	int populated_cnt;
 
 	struct kernfs_node *kn;		/* cgroup kernfs entry */
+	struct kernfs_node *procs_kn;	/* kn for "cgroup.procs" */
 	struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
 
 	/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4504d64f91e1..9ef9fc8a774b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2392,7 +2392,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	return ret;
 }
 
-static int cgroup_procs_write_permission(struct task_struct *task)
+static int cgroup_procs_write_permission(struct task_struct *task,
+					 struct cgroup *dst_cgrp,
+					 struct kernfs_open_file *of)
 {
 	const struct cred *cred = current_cred();
 	const struct cred *tcred = get_task_cred(task);
@@ -2407,6 +2409,26 @@ static int cgroup_procs_write_permission(struct task_struct *task)
 	    !uid_eq(cred->euid, tcred->suid))
 		ret = -EACCES;
 
+	if (!ret && cgroup_on_dfl(dst_cgrp)) {
+		struct super_block *sb = of->file->f_path.dentry->d_sb;
+		struct cgroup *cgrp;
+		struct inode *inode;
+
+		down_read(&css_set_rwsem);
+		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+		up_read(&css_set_rwsem);
+
+		while (!cgroup_is_descendant(dst_cgrp, cgrp))
+			cgrp = cgroup_parent(cgrp);
+
+		ret = -ENOMEM;
+		inode = kernfs_get_inode(sb, cgrp->procs_kn);
+		if (inode) {
+			ret = inode_permission(inode, MAY_WRITE);
+			iput(inode);
+		}
+	}
+
 	put_cred(tcred);
 	return ret;
 }
@@ -2459,7 +2481,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	ret = cgroup_procs_write_permission(tsk);
+	ret = cgroup_procs_write_permission(tsk, cgrp, of);
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
@@ -3087,7 +3109,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 		return ret;
 	}
 
-	if (cft->seq_show == cgroup_populated_show)
+	if (cft->write == cgroup_procs_write)
+		cgrp->procs_kn = kn;
+	else if (cft->seq_show == cgroup_populated_show)
 		cgrp->populated_kn = kn;
 	return 0;
 }
-- 
cgit v1.2.3


From c04dca02bc73096435a5c36efd5ccb2171edcbe1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 11 Jun 2015 14:46:44 +0200
Subject: hrtimer: Remove HRTIMER_STATE_MIGRATE

I do not understand HRTIMER_STATE_MIGRATE. Unless I am totally
confused it looks buggy and simply unneeded.

migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but this
is not enough: this can fool, say, hrtimer_is_queued() in
dequeue_signal().

Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED?
This fixes the race and we can kill STATE_MIGRATE.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.072387650@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 6 +-----
 kernel/time/hrtimer.c   | 7 ++-----
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 3f82a7edc03d..2f9e57d3d126 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
  * the handling of the timer.
  *
  * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
 
 /**
  * struct hrtimer - the basic hrtimer structure
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 278d4b36fd94..b1b795e5e0b1 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1508,11 +1508,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		debug_deactivate(timer);
 
 		/*
-		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1523,9 +1523,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 
-- 
cgit v1.2.3


From a7c6f571ff51cc77d90dd54968f7c5c938c43998 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:46 +0200
Subject: seqcount: Rename write_seqcount_barrier()

I'll shortly be introducing another seqcount primitive that's useful
to provide ordering semantics and would like to use the
write_seqcount_barrier() name for that.

Seeing how there's only one user of the current primitive, lets rename
it to invalidate, as that appears what its doing.

While there, employ lockdep_assert_held() instead of
assert_spin_locked() to not generate debug code for regular kernels.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: wanpeng.li@linux.intel.com
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.279926217@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/dcache.c             | 16 ++++++++--------
 include/linux/seqlock.h |  6 +++---
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 656ce522a218..b43a1694d2ca 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -322,17 +322,17 @@ static void dentry_free(struct dentry *dentry)
 }
 
 /**
- * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
  * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
  */
-static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
 {
-	assert_spin_locked(&dentry->d_lock);
-	/* Go through a barrier */
-	write_seqcount_barrier(&dentry->d_seq);
+	lockdep_assert_held(&dentry->d_lock);
+	/* Go through am invalidation barrier */
+	write_seqcount_invalidate(&dentry->d_seq);
 }
 
 /*
@@ -372,7 +372,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
 	struct inode *inode = dentry->d_inode;
 	__d_clear_type_and_inode(dentry);
 	hlist_del_init(&dentry->d_u.d_alias);
-	dentry_rcuwalk_barrier(dentry);
+	dentry_rcuwalk_invalidate(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&inode->i_lock);
 	if (!inode->i_nlink)
@@ -494,7 +494,7 @@ void __d_drop(struct dentry *dentry)
 		__hlist_bl_del(&dentry->d_hash);
 		dentry->d_hash.pprev = NULL;
 		hlist_bl_unlock(b);
-		dentry_rcuwalk_barrier(dentry);
+		dentry_rcuwalk_invalidate(dentry);
 	}
 }
 EXPORT_SYMBOL(__d_drop);
@@ -1752,7 +1752,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	if (inode)
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
 	__d_set_inode_and_type(dentry, inode, add_flags);
-	dentry_rcuwalk_barrier(dentry);
+	dentry_rcuwalk_invalidate(dentry);
 	spin_unlock(&dentry->d_lock);
 	fsnotify_d_instantiate(dentry, inode);
 }
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5f68d0a391ce..c07e3a536099 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -266,13 +266,13 @@ static inline void write_seqcount_end(seqcount_t *s)
 }
 
 /**
- * write_seqcount_barrier - invalidate in-progress read-side seq operations
+ * write_seqcount_invalidate - invalidate in-progress read-side seq operations
  * @s: pointer to seqcount_t
  *
- * After write_seqcount_barrier, no read-side seq operations will complete
+ * After write_seqcount_invalidate, no read-side seq operations will complete
  * successfully and see data older than this.
  */
-static inline void write_seqcount_barrier(seqcount_t *s)
+static inline void write_seqcount_invalidate(seqcount_t *s)
 {
 	smp_wmb();
 	s->sequence+=2;
-- 
cgit v1.2.3


From c4bfa3f5f906aee2e084c5b1fb15caf876338ef8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 17 Jun 2015 14:29:24 +0200
Subject: seqcount: Introduce raw_write_seqcount_barrier()

Introduce raw_write_seqcount_barrier(), a new construct that can be
used to provide write barrier semantics in seqcount read loops instead
of the usual consistency guarantee.

raw_write_seqcount_barier() is equivalent to:

	raw_write_seqcount_begin();
	raw_write_seqcount_end();

But avoids issueing two back-to-back smp_wmb() instructions.

This construct works because the read side will 'stall' when observing
odd values. This means that -- referring to the example in the comment
below -- even though there is no (matching) read barrier between the
loads of X and Y, we cannot observe !x && !y, because:

 - if we observe Y == false we must observe the first sequence
   increment, which makes us loop, until

 - we observe !(seq & 1) -- the second sequence increment -- at which
   time we must also observe T == true.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: umgwanakikbuti@gmail.com
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/20150617122924.GP3644@twins.programming.kicks-ass.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/seqlock.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index c07e3a536099..486e685a226a 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -233,6 +233,47 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
 	s->sequence++;
 }
 
+/**
+ * raw_write_seqcount_barrier - do a seq write barrier
+ * @s: pointer to seqcount_t
+ *
+ * This can be used to provide an ordering guarantee instead of the
+ * usual consistency guarantee. It is one wmb cheaper, because we can
+ * collapse the two back-to-back wmb()s.
+ *
+ *      seqcount_t seq;
+ *      bool X = true, Y = false;
+ *
+ *      void read(void)
+ *      {
+ *              bool x, y;
+ *
+ *              do {
+ *                      int s = read_seqcount_begin(&seq);
+ *
+ *                      x = X; y = Y;
+ *
+ *              } while (read_seqcount_retry(&seq, s));
+ *
+ *              BUG_ON(!x && !y);
+ *      }
+ *
+ *      void write(void)
+ *      {
+ *              Y = true;
+ *
+ *              raw_write_seqcount_barrier(seq);
+ *
+ *              X = false;
+ *      }
+ */
+static inline void raw_write_seqcount_barrier(seqcount_t *s)
+{
+	s->sequence++;
+	smp_wmb();
+	s->sequence++;
+}
+
 /*
  * raw_write_seqcount_latch - redirect readers to even/odd copy
  * @s: pointer to seqcount_t
-- 
cgit v1.2.3


From 887d9dc989eb0154492e41e7c07492edbb088ba1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:48 +0200
Subject: hrtimer: Allow hrtimer::function() to free the timer

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.

Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_clock_base::running.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.471563047@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  41 +++++++----------
 kernel/time/hrtimer.c   | 114 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 107 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2f9e57d3d126..5db055821ef3 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,30 +53,25 @@ enum hrtimer_restart {
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
- * 0x02		callback function running
- * 0x04		timer is migrated to another cpu
  *
- * Special cases:
- * 0x03		callback function running and enqueued
- *		(was requeued on another CPU)
- * 0x05		timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
-#define HRTIMER_STATE_CALLBACK	0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -163,6 +158,8 @@ enum  hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -184,6 +181,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-	return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->state & HRTIMER_STATE_CALLBACK;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1604157374d7..f026413de4d6 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -110,6 +111,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  */
 #ifdef CONFIG_SMP
 
+/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base	migration_cpu_base.clock_base[0]
+
 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
@@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -194,8 +207,8 @@ again:
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -838,11 +851,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -907,14 +916,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
-		if (!restart) {
-			/*
-			 * We must preserve the CALLBACK state flag here,
-			 * otherwise we could move the timer base in
-			 * switch_hrtimer_base.
-			 */
-			state &= HRTIMER_STATE_CALLBACK;
-		}
+		if (!restart)
+			state = HRTIMER_STATE_INACTIVE;
+
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -1115,6 +1119,51 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ *
+ * It is important for this function to not return a false negative.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+
+	do {
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seq = raw_read_seqcount_begin(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			return true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ *  - queued:	the timer is queued
+ *  - callback:	the timer is being ran
+ *  - post:	the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consequtive
+ * __run_hrtimer() invocations.
+ */
+
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer_clock_base *base,
 			  struct hrtimer *timer, ktime_t *now)
@@ -1122,10 +1171,21 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	cpu_base->running = timer;
+
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1141,7 +1201,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 *
@@ -1153,9 +1213,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
 
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
-- 
cgit v1.2.3


From a24fc60d63da2b0b31bf7c876d12a51ed4b778bd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 Jun 2015 14:46:53 +0200
Subject: lockdep: Implement lock pinning

Add a lockdep annotation that WARNs if you 'accidentially' unlock a
lock.

This is especially helpful for code with callbacks, where the upper
layer assumes a lock remains taken but a lower layer thinks it maybe
can drop and reacquire the lock.

By unwittingly breaking up the lock, races can be introduced.

Lock pinning is a lockdep annotation that helps with this, when you
lockdep_pin_lock() a held lock, any unlock without a
lockdep_unpin_lock() will produce a WARN. Think of this as a relative
of lockdep_assert_held(), except you don't only assert its held now,
but ensure it stays held until you release your assertion.

RFC: a possible alternative API would be something like:

  int cookie = lockdep_pin_lock(&foo);
  ...
  lockdep_unpin_lock(&foo, cookie);

Where we pick a random number for the pin_count; this makes it
impossible to sneak a lock break in without also passing the right
cookie along.

I've not done this because it ends up generating code for !LOCKDEP,
esp. if you need to pass the cookie around for some reason.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.906731065@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/lockdep.h  | 10 ++++++
 kernel/locking/lockdep.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 066ba4157541..c5b6b5830acf 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -255,6 +255,7 @@ struct held_lock {
 	unsigned int check:1;       /* see lock_acquire() comment */
 	unsigned int hardirqs_off:1;
 	unsigned int references:12;					/* 32 bits */
+	unsigned int pin_count;
 };
 
 /*
@@ -354,6 +355,9 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
 extern void lockdep_clear_current_reclaim_state(void);
 extern void lockdep_trace_alloc(gfp_t mask);
 
+extern void lock_pin_lock(struct lockdep_map *lock);
+extern void lock_unpin_lock(struct lockdep_map *lock);
+
 # define INIT_LOCKDEP				.lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
 
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
@@ -368,6 +372,9 @@ extern void lockdep_trace_alloc(gfp_t mask);
 
 #define lockdep_recursing(tsk)	((tsk)->lockdep_recursion)
 
+#define lockdep_pin_lock(l)		lock_pin_lock(&(l)->dep_map)
+#define lockdep_unpin_lock(l)	lock_unpin_lock(&(l)->dep_map)
+
 #else /* !CONFIG_LOCKDEP */
 
 static inline void lockdep_off(void)
@@ -420,6 +427,9 @@ struct lock_class_key { };
 
 #define lockdep_recursing(tsk)			(0)
 
+#define lockdep_pin_lock(l)				do { (void)(l); } while (0)
+#define lockdep_unpin_lock(l)			do { (void)(l); } while (0)
+
 #endif /* !LOCKDEP */
 
 #ifdef CONFIG_LOCK_STAT
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a266d5165b63..18f9f434d17e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = lockstat_clock();
 #endif
+	hlock->pin_count = 0;
 
 	if (check && !mark_irqflags(curr, hlock))
 		return 0;
@@ -3403,6 +3404,8 @@ found_it:
 	if (hlock->instance == lock)
 		lock_release_holdtime(hlock);
 
+	WARN(hlock->pin_count, "releasing a pinned lock\n");
+
 	if (hlock->references) {
 		hlock->references--;
 		if (hlock->references) {
@@ -3459,6 +3462,49 @@ static int __lock_is_held(struct lockdep_map *lock)
 	return 0;
 }
 
+static void __lock_pin_lock(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			hlock->pin_count++;
+			return;
+		}
+	}
+
+	WARN(1, "pinning an unheld lock\n");
+}
+
+static void __lock_unpin_lock(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+				return;
+
+			hlock->pin_count--;
+			return;
+		}
+	}
+
+	WARN(1, "unpinning an unheld lock\n");
+}
+
 /*
  * Check whether we follow the irq-flags state precisely:
  */
@@ -3582,6 +3628,40 @@ int lock_is_held(struct lockdep_map *lock)
 }
 EXPORT_SYMBOL_GPL(lock_is_held);
 
+void lock_pin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_pin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_unpin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
 void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
 {
 	current->lockdep_reclaim_gfp = gfp_mask;
-- 
cgit v1.2.3


From 7f3b62cf945dc3e57fbd693022a5651206ce85b0 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 16 Jun 2015 16:27:43 +0200
Subject: acpi-video-detect: Remove the unused acpi_video_dmi_demote_vendor()
 function

Remove the now unused acpi_video_dmi_demote_vendor() function, this was
never a proper counter part of acpi_video_dmi_promote_vendor() since
the calls to acpi_video_dmi_promote_vendor() are not counted.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/video_detect.c | 9 ---------
 include/linux/acpi.h        | 5 -----
 2 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index be825802b228..bb78c9e16113 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -258,15 +258,6 @@ void acpi_video_dmi_promote_vendor(void)
 }
 EXPORT_SYMBOL(acpi_video_dmi_promote_vendor);
 
-/* To be called when a driver who previously promoted the vendor
- * interface */
-void acpi_video_dmi_demote_vendor(void)
-{
-	acpi_video_caps_check();
-	acpi_video_support &= ~ACPI_VIDEO_BACKLIGHT_DMI_VENDOR;
-}
-EXPORT_SYMBOL(acpi_video_dmi_demote_vendor);
-
 /* Returns true if video.ko can do backlight switching */
 int acpi_video_backlight_support(void)
 {
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e35e29c..01bffd30c6c7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -248,7 +248,6 @@ extern bool wmi_has_guid(const char *guid);
 extern long acpi_video_get_capabilities(acpi_handle graphics_dev_handle);
 extern long acpi_is_video_device(acpi_handle handle);
 extern void acpi_video_dmi_promote_vendor(void);
-extern void acpi_video_dmi_demote_vendor(void);
 extern int acpi_video_backlight_support(void);
 extern int acpi_video_display_switch_support(void);
 
@@ -268,10 +267,6 @@ static inline void acpi_video_dmi_promote_vendor(void)
 {
 }
 
-static inline void acpi_video_dmi_demote_vendor(void)
-{
-}
-
 static inline int acpi_video_backlight_support(void)
 {
 	return 0;
-- 
cgit v1.2.3


From fb105d964226ce4834b45d7e3d9f339aa716ed70 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 16 Jun 2015 16:27:44 +0200
Subject: acpi-video-detect: Make acpi_video_get_capabilities a private
 function

acpi_video_get_capabilities() is only used inside video_detect.c so make
it static. While at it also remove the prototype for the non existent
acpi_video_display_switch_support function from acpi.h

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/video_detect.c |  3 +--
 include/linux/acpi.h        | 12 ------------
 2 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index bb78c9e16113..3af18beaea0a 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -184,7 +184,7 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
  * all graphics capabilities of physically present devices are
  * summarized and returned. This is cached and done only once.
  */
-long acpi_video_get_capabilities(acpi_handle graphics_handle)
+static long acpi_video_get_capabilities(acpi_handle graphics_handle)
 {
 	long caps = 0;
 	struct acpi_device *tmp_dev;
@@ -227,7 +227,6 @@ long acpi_video_get_capabilities(acpi_handle graphics_handle)
 			  graphics_handle ? acpi_device_bid(tmp_dev) : ""));
 	return caps;
 }
-EXPORT_SYMBOL(acpi_video_get_capabilities);
 
 static void acpi_video_caps_check(void)
 {
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 01bffd30c6c7..88c92a03a77e 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -245,19 +245,12 @@ extern bool wmi_has_guid(const char *guid);
 
 #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE)
 
-extern long acpi_video_get_capabilities(acpi_handle graphics_dev_handle);
 extern long acpi_is_video_device(acpi_handle handle);
 extern void acpi_video_dmi_promote_vendor(void);
 extern int acpi_video_backlight_support(void);
-extern int acpi_video_display_switch_support(void);
 
 #else
 
-static inline long acpi_video_get_capabilities(acpi_handle graphics_dev_handle)
-{
-	return 0;
-}
-
 static inline long acpi_is_video_device(acpi_handle handle)
 {
 	return 0;
@@ -272,11 +265,6 @@ static inline int acpi_video_backlight_support(void)
 	return 0;
 }
 
-static inline int acpi_video_display_switch_support(void)
-{
-	return 0;
-}
-
 #endif /* defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) */
 
 extern int acpi_blacklisted(void);
-- 
cgit v1.2.3


From adc8bb8e0fe005ed29366e6c4621652481878214 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 16 Jun 2015 16:27:45 +0200
Subject: acpi-video-detect: Move acpi_is_video_device() to acpi/scan.c

This allows video_detect.c to be build as a module, this is a preparation
patch for the backlight interface selection logic cleanup.

Note this commit also causes acpi_is_video_device() to always be build
indepedent of CONFIG_ACPI_VIDEO, as there is no reason to make its
building depend on CONFIG_ACPI_VIDEO.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/scan.c         | 56 ++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/video_detect.c | 60 ---------------------------------------------
 include/linux/acpi.h        |  8 ++----
 3 files changed, 58 insertions(+), 66 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 0a099917a006..aa997c66d697 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1949,6 +1949,62 @@ bool acpi_dock_match(acpi_handle handle)
 	return acpi_has_method(handle, "_DCK");
 }
 
+static acpi_status
+acpi_backlight_cap_match(acpi_handle handle, u32 level, void *context,
+			  void **return_value)
+{
+	long *cap = context;
+
+	if (acpi_has_method(handle, "_BCM") &&
+	    acpi_has_method(handle, "_BCL")) {
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found generic backlight "
+				  "support\n"));
+		*cap |= ACPI_VIDEO_BACKLIGHT;
+		if (!acpi_has_method(handle, "_BQC"))
+			printk(KERN_WARNING FW_BUG PREFIX "No _BQC method, "
+				"cannot determine initial brightness\n");
+		/* We have backlight support, no need to scan further */
+		return AE_CTRL_TERMINATE;
+	}
+	return 0;
+}
+
+/* Returns true if the ACPI object is a video device which can be
+ * handled by video.ko.
+ * The device will get a Linux specific CID added in scan.c to
+ * identify the device as an ACPI graphics device
+ * Be aware that the graphics device may not be physically present
+ * Use acpi_video_get_capabilities() to detect general ACPI video
+ * capabilities of present cards
+ */
+long acpi_is_video_device(acpi_handle handle)
+{
+	long video_caps = 0;
+
+	/* Is this device able to support video switching ? */
+	if (acpi_has_method(handle, "_DOD") || acpi_has_method(handle, "_DOS"))
+		video_caps |= ACPI_VIDEO_OUTPUT_SWITCHING;
+
+	/* Is this device able to retrieve a video ROM ? */
+	if (acpi_has_method(handle, "_ROM"))
+		video_caps |= ACPI_VIDEO_ROM_AVAILABLE;
+
+	/* Is this device able to configure which video head to be POSTed ? */
+	if (acpi_has_method(handle, "_VPO") &&
+	    acpi_has_method(handle, "_GPD") &&
+	    acpi_has_method(handle, "_SPD"))
+		video_caps |= ACPI_VIDEO_DEVICE_POSTING;
+
+	/* Only check for backlight functionality if one of the above hit. */
+	if (video_caps)
+		acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
+				    ACPI_UINT32_MAX, acpi_backlight_cap_match, NULL,
+				    &video_caps, NULL);
+
+	return video_caps;
+}
+EXPORT_SYMBOL(acpi_is_video_device);
+
 const char *acpi_device_hid(struct acpi_device *device)
 {
 	struct acpi_hardware_id *hid;
diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index 3af18beaea0a..5076138156fd 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -5,10 +5,6 @@
  *  May be copied or modified under the terms of the GNU General Public License
  *
  * video_detect.c:
- * Provides acpi_is_video_device() for early scanning of ACPI devices in scan.c
- * There a Linux specific (Spec does not provide a HID for video devices) is
- * assigned
- *
  * After PCI devices are glued with ACPI devices
  * acpi_get_pci_dev() can be called to identify ACPI graphics
  * devices for which a real graphics card is plugged in
@@ -46,62 +42,6 @@ ACPI_MODULE_NAME("video");
 static long acpi_video_support;
 static bool acpi_video_caps_checked;
 
-static acpi_status
-acpi_backlight_cap_match(acpi_handle handle, u32 level, void *context,
-			  void **return_value)
-{
-	long *cap = context;
-
-	if (acpi_has_method(handle, "_BCM") &&
-	    acpi_has_method(handle, "_BCL")) {
-		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found generic backlight "
-				  "support\n"));
-		*cap |= ACPI_VIDEO_BACKLIGHT;
-		if (!acpi_has_method(handle, "_BQC"))
-			printk(KERN_WARNING FW_BUG PREFIX "No _BQC method, "
-				"cannot determine initial brightness\n");
-		/* We have backlight support, no need to scan further */
-		return AE_CTRL_TERMINATE;
-	}
-	return 0;
-}
-
-/* Returns true if the ACPI object is a video device which can be
- * handled by video.ko.
- * The device will get a Linux specific CID added in scan.c to
- * identify the device as an ACPI graphics device
- * Be aware that the graphics device may not be physically present
- * Use acpi_video_get_capabilities() to detect general ACPI video
- * capabilities of present cards
- */
-long acpi_is_video_device(acpi_handle handle)
-{
-	long video_caps = 0;
-
-	/* Is this device able to support video switching ? */
-	if (acpi_has_method(handle, "_DOD") || acpi_has_method(handle, "_DOS"))
-		video_caps |= ACPI_VIDEO_OUTPUT_SWITCHING;
-
-	/* Is this device able to retrieve a video ROM ? */
-	if (acpi_has_method(handle, "_ROM"))
-		video_caps |= ACPI_VIDEO_ROM_AVAILABLE;
-
-	/* Is this device able to configure which video head to be POSTed ? */
-	if (acpi_has_method(handle, "_VPO") &&
-	    acpi_has_method(handle, "_GPD") &&
-	    acpi_has_method(handle, "_SPD"))
-		video_caps |= ACPI_VIDEO_DEVICE_POSTING;
-
-	/* Only check for backlight functionality if one of the above hit. */
-	if (video_caps)
-		acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
-				    ACPI_UINT32_MAX, acpi_backlight_cap_match, NULL,
-				    &video_caps, NULL);
-
-	return video_caps;
-}
-EXPORT_SYMBOL(acpi_is_video_device);
-
 static acpi_status
 find_video(acpi_handle handle, u32 lvl, void *context, void **rv)
 {
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 88c92a03a77e..7cb3b0bc4a7e 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -243,19 +243,15 @@ extern bool wmi_has_guid(const char *guid);
 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR		0x0400
 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO		0x0800
 
+extern long acpi_is_video_device(acpi_handle handle);
+
 #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE)
 
-extern long acpi_is_video_device(acpi_handle handle);
 extern void acpi_video_dmi_promote_vendor(void);
 extern int acpi_video_backlight_support(void);
 
 #else
 
-static inline long acpi_is_video_device(acpi_handle handle)
-{
-	return 0;
-}
-
 static inline void acpi_video_dmi_promote_vendor(void)
 {
 }
-- 
cgit v1.2.3


From a87878bafa1f82c20eddaf2d23780b194c35ccf5 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 16 Jun 2015 16:27:46 +0200
Subject: acpi-video-detect: Move acpi_osi_is_win8 to osl.c

acpi_osi_is_win8 needs access to acpi_gbl_osi_data which is not exported,
so move it to osl.c. Alternatively we could export acpi_gbl_osi_data but
that seems undesirable.

This allows video_detect.c to be build as a module, besides that
acpi_osi_is_win8() is something which does not really belong in
video_detect.c in the first place.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/internal.h     | 7 -------
 drivers/acpi/osl.c          | 6 ++++++
 drivers/acpi/video_detect.c | 6 ------
 include/linux/acpi.h        | 1 +
 4 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index d93628c65661..a637497217f0 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -181,13 +181,6 @@ static inline int suspend_nvs_save(void) { return 0; }
 static inline void suspend_nvs_restore(void) {}
 #endif
 
-/*--------------------------------------------------------------------------
-					Video
-  -------------------------------------------------------------------------- */
-#if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE)
-bool acpi_osi_is_win8(void);
-#endif
-
 /*--------------------------------------------------------------------------
 				Device properties
   -------------------------------------------------------------------------- */
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 7ccba395c9dd..b906deb10df5 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -1684,6 +1684,12 @@ int acpi_resources_are_enforced(void)
 }
 EXPORT_SYMBOL(acpi_resources_are_enforced);
 
+bool acpi_osi_is_win8(void)
+{
+	return acpi_gbl_osi_data >= ACPI_OSI_WIN_8;
+}
+EXPORT_SYMBOL(acpi_osi_is_win8);
+
 /*
  * Deallocate the memory for a spinlock.
  */
diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index 5076138156fd..b2270ca21538 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -178,12 +178,6 @@ static void acpi_video_caps_check(void)
 		acpi_video_get_capabilities(NULL);
 }
 
-bool acpi_osi_is_win8(void)
-{
-	return acpi_gbl_osi_data >= ACPI_OSI_WIN_8;
-}
-EXPORT_SYMBOL(acpi_osi_is_win8);
-
 /* Promote the vendor interface instead of the generic video module.
  * This function allow DMI blacklists to be implemented by externals
  * platform drivers instead of putting a big blacklist in video_detect.c
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 7cb3b0bc4a7e..913a1c19818a 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -266,6 +266,7 @@ static inline int acpi_video_backlight_support(void)
 extern int acpi_blacklisted(void);
 extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d);
 extern void acpi_osi_setup(char *str);
+extern bool acpi_osi_is_win8(void);
 
 #ifdef CONFIG_ACPI_NUMA
 int acpi_get_node(acpi_handle handle);
-- 
cgit v1.2.3


From 14ca7a47d0ab2a7a35faab130e6d9682f8ff1a46 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 16 Jun 2015 16:27:47 +0200
Subject: acpi-video-detect: video: Make video_detect code part of the video
 module

This is a preparation patch for the backlight interface selection logic
cleanup, there are 2 reasons to not always build the video_detect code
into the kernel:

1) In order for the video_detect.c to also deal with / select native
backlight interfaces on win8 systems, instead of doing this in video.c
where it does not belong, video_detect.c needs to call into the backlight
class code. Which cannot be done if it is builtin and the blacklight class
is not.

2) Currently all the platform/x86 drivers which have quirks to prefer
the vendor driver over acpi-video call acpi_video_unregister_backlight()
to remove the acpi-video backlight interface, this logic really belongs
in video_detect.c, which will cause video_detect.c to depend on symbols of
video.c and video.c already depends on video_detect.c symbols, so they
really need to be a single module.

Note that this commits make 2 changes so as to maintain 100% kernel
commandline compatibility:

1) The __setup call for the acpi_backlight= handling is moved to
   acpi/util.c as __setup may only be used by code which is alwasy builtin
2) video.c is renamed to acpi_video.c so that it can be combined with
   video_detect.c into video.ko

This commit also makes changes to drivers/platform/x86/Kconfig to ensure
that drivers which use acpi_video_backlight_support() from video_detect.c,
will not be built-in when acpi_video is not built in. This also changes
some "select" uses to "depends on" to avoid dependency loops.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Makefile        |    5 +-
 drivers/acpi/acpi_video.c    | 2275 ++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/utils.c         |   15 +
 drivers/acpi/video.c         | 2275 ------------------------------------------
 drivers/acpi/video_detect.c  |   35 +-
 drivers/platform/x86/Kconfig |   21 +-
 include/linux/acpi.h         |    1 +
 7 files changed, 2321 insertions(+), 2306 deletions(-)
 create mode 100644 drivers/acpi/acpi_video.c
 delete mode 100644 drivers/acpi/video.c

(limited to 'include/linux')

diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 8a063e276530..73d840bef455 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -52,9 +52,6 @@ acpi-$(CONFIG_X86)		+= acpi_cmos_rtc.o
 acpi-$(CONFIG_DEBUG_FS)		+= debugfs.o
 acpi-$(CONFIG_ACPI_NUMA)	+= numa.o
 acpi-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o
-ifdef CONFIG_ACPI_VIDEO
-acpi-y				+= video_detect.o
-endif
 acpi-y				+= acpi_lpat.o
 acpi-$(CONFIG_ACPI_GENERIC_GSI) += gsi.o
 
@@ -95,3 +92,5 @@ obj-$(CONFIG_ACPI_EXTLOG)	+= acpi_extlog.o
 obj-$(CONFIG_PMIC_OPREGION)	+= pmic/intel_pmic.o
 obj-$(CONFIG_CRC_PMIC_OPREGION) += pmic/intel_pmic_crc.o
 obj-$(CONFIG_XPOWER_PMIC_OPREGION) += pmic/intel_pmic_xpower.o
+
+video-objs			+= acpi_video.o video_detect.o
diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c
new file mode 100644
index 000000000000..db4f353e1e5c
--- /dev/null
+++ b/drivers/acpi/acpi_video.c
@@ -0,0 +1,2275 @@
+/*
+ *  video.c - ACPI Video Driver
+ *
+ *  Copyright (C) 2004 Luming Yu <luming.yu@intel.com>
+ *  Copyright (C) 2004 Bruno Ducrot <ducrot@poupinou.org>
+ *  Copyright (C) 2006 Thomas Tuttle <linux-kernel@ttuttle.net>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/input.h>
+#include <linux/backlight.h>
+#include <linux/thermal.h>
+#include <linux/sort.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/slab.h>
+#include <linux/dmi.h>
+#include <linux/suspend.h>
+#include <linux/acpi.h>
+#include <acpi/video.h>
+#include <asm/uaccess.h>
+
+#define PREFIX "ACPI: "
+
+#define ACPI_VIDEO_BUS_NAME		"Video Bus"
+#define ACPI_VIDEO_DEVICE_NAME		"Video Device"
+#define ACPI_VIDEO_NOTIFY_SWITCH	0x80
+#define ACPI_VIDEO_NOTIFY_PROBE		0x81
+#define ACPI_VIDEO_NOTIFY_CYCLE		0x82
+#define ACPI_VIDEO_NOTIFY_NEXT_OUTPUT	0x83
+#define ACPI_VIDEO_NOTIFY_PREV_OUTPUT	0x84
+
+#define ACPI_VIDEO_NOTIFY_CYCLE_BRIGHTNESS	0x85
+#define	ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS	0x86
+#define ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS	0x87
+#define ACPI_VIDEO_NOTIFY_ZERO_BRIGHTNESS	0x88
+#define ACPI_VIDEO_NOTIFY_DISPLAY_OFF		0x89
+
+#define MAX_NAME_LEN	20
+
+#define _COMPONENT		ACPI_VIDEO_COMPONENT
+ACPI_MODULE_NAME("video");
+
+MODULE_AUTHOR("Bruno Ducrot");
+MODULE_DESCRIPTION("ACPI Video Driver");
+MODULE_LICENSE("GPL");
+
+static bool brightness_switch_enabled = 1;
+module_param(brightness_switch_enabled, bool, 0644);
+
+/*
+ * By default, we don't allow duplicate ACPI video bus devices
+ * under the same VGA controller
+ */
+static bool allow_duplicates;
+module_param(allow_duplicates, bool, 0644);
+
+/*
+ * For Windows 8 systems: used to decide if video module
+ * should skip registering backlight interface of its own.
+ */
+enum {
+	NATIVE_BACKLIGHT_NOT_SET = -1,
+	NATIVE_BACKLIGHT_OFF,
+	NATIVE_BACKLIGHT_ON,
+};
+
+static int use_native_backlight_param = NATIVE_BACKLIGHT_NOT_SET;
+module_param_named(use_native_backlight, use_native_backlight_param, int, 0444);
+static int use_native_backlight_dmi = NATIVE_BACKLIGHT_NOT_SET;
+
+static int disable_backlight_sysfs_if = -1;
+module_param(disable_backlight_sysfs_if, int, 0444);
+
+static int register_count;
+static struct mutex video_list_lock;
+static struct list_head video_bus_head;
+static int acpi_video_bus_add(struct acpi_device *device);
+static int acpi_video_bus_remove(struct acpi_device *device);
+static void acpi_video_bus_notify(struct acpi_device *device, u32 event);
+
+static const struct acpi_device_id video_device_ids[] = {
+	{ACPI_VIDEO_HID, 0},
+	{"", 0},
+};
+MODULE_DEVICE_TABLE(acpi, video_device_ids);
+
+static struct acpi_driver acpi_video_bus = {
+	.name = "video",
+	.class = ACPI_VIDEO_CLASS,
+	.ids = video_device_ids,
+	.ops = {
+		.add = acpi_video_bus_add,
+		.remove = acpi_video_bus_remove,
+		.notify = acpi_video_bus_notify,
+		},
+};
+
+struct acpi_video_bus_flags {
+	u8 multihead:1;		/* can switch video heads */
+	u8 rom:1;		/* can retrieve a video rom */
+	u8 post:1;		/* can configure the head to */
+	u8 reserved:5;
+};
+
+struct acpi_video_bus_cap {
+	u8 _DOS:1;		/* Enable/Disable output switching */
+	u8 _DOD:1;		/* Enumerate all devices attached to display adapter */
+	u8 _ROM:1;		/* Get ROM Data */
+	u8 _GPD:1;		/* Get POST Device */
+	u8 _SPD:1;		/* Set POST Device */
+	u8 _VPO:1;		/* Video POST Options */
+	u8 reserved:2;
+};
+
+struct acpi_video_device_attrib {
+	u32 display_index:4;	/* A zero-based instance of the Display */
+	u32 display_port_attachment:4;	/* This field differentiates the display type */
+	u32 display_type:4;	/* Describe the specific type in use */
+	u32 vendor_specific:4;	/* Chipset Vendor Specific */
+	u32 bios_can_detect:1;	/* BIOS can detect the device */
+	u32 depend_on_vga:1;	/* Non-VGA output device whose power is related to
+				   the VGA device. */
+	u32 pipe_id:3;		/* For VGA multiple-head devices. */
+	u32 reserved:10;	/* Must be 0 */
+	u32 device_id_scheme:1;	/* Device ID Scheme */
+};
+
+struct acpi_video_enumerated_device {
+	union {
+		u32 int_val;
+		struct acpi_video_device_attrib attrib;
+	} value;
+	struct acpi_video_device *bind_info;
+};
+
+struct acpi_video_bus {
+	struct acpi_device *device;
+	bool backlight_registered;
+	bool backlight_notifier_registered;
+	u8 dos_setting;
+	struct acpi_video_enumerated_device *attached_array;
+	u8 attached_count;
+	u8 child_count;
+	struct acpi_video_bus_cap cap;
+	struct acpi_video_bus_flags flags;
+	struct list_head video_device_list;
+	struct mutex device_list_lock;	/* protects video_device_list */
+	struct list_head entry;
+	struct input_dev *input;
+	char phys[32];	/* for input device */
+	struct notifier_block pm_nb;
+	struct notifier_block backlight_nb;
+};
+
+struct acpi_video_device_flags {
+	u8 crt:1;
+	u8 lcd:1;
+	u8 tvout:1;
+	u8 dvi:1;
+	u8 bios:1;
+	u8 unknown:1;
+	u8 notify:1;
+	u8 reserved:1;
+};
+
+struct acpi_video_device_cap {
+	u8 _ADR:1;		/* Return the unique ID */
+	u8 _BCL:1;		/* Query list of brightness control levels supported */
+	u8 _BCM:1;		/* Set the brightness level */
+	u8 _BQC:1;		/* Get current brightness level */
+	u8 _BCQ:1;		/* Some buggy BIOS uses _BCQ instead of _BQC */
+	u8 _DDC:1;		/* Return the EDID for this device */
+};
+
+struct acpi_video_brightness_flags {
+	u8 _BCL_no_ac_battery_levels:1;	/* no AC/Battery levels in _BCL */
+	u8 _BCL_reversed:1;		/* _BCL package is in a reversed order */
+	u8 _BQC_use_index:1;		/* _BQC returns an index value */
+};
+
+struct acpi_video_device_brightness {
+	int curr;
+	int count;
+	int *levels;
+	struct acpi_video_brightness_flags flags;
+};
+
+struct acpi_video_device {
+	unsigned long device_id;
+	struct acpi_video_device_flags flags;
+	struct acpi_video_device_cap cap;
+	struct list_head entry;
+	struct delayed_work switch_brightness_work;
+	int switch_brightness_event;
+	struct acpi_video_bus *video;
+	struct acpi_device *dev;
+	struct acpi_video_device_brightness *brightness;
+	struct backlight_device *backlight;
+	struct thermal_cooling_device *cooling_dev;
+};
+
+static const char device_decode[][30] = {
+	"motherboard VGA device",
+	"PCI VGA device",
+	"AGP VGA device",
+	"UNKNOWN",
+};
+
+static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data);
+static void acpi_video_device_rebind(struct acpi_video_bus *video);
+static void acpi_video_device_bind(struct acpi_video_bus *video,
+				   struct acpi_video_device *device);
+static int acpi_video_device_enumerate(struct acpi_video_bus *video);
+static int acpi_video_device_lcd_set_level(struct acpi_video_device *device,
+			int level);
+static int acpi_video_device_lcd_get_level_current(
+			struct acpi_video_device *device,
+			unsigned long long *level, bool raw);
+static int acpi_video_get_next_level(struct acpi_video_device *device,
+				     u32 level_current, u32 event);
+static void acpi_video_switch_brightness(struct work_struct *work);
+
+static bool acpi_video_use_native_backlight(void)
+{
+	if (use_native_backlight_param != NATIVE_BACKLIGHT_NOT_SET)
+		return use_native_backlight_param;
+	else if (use_native_backlight_dmi != NATIVE_BACKLIGHT_NOT_SET)
+		return use_native_backlight_dmi;
+	return acpi_osi_is_win8();
+}
+
+bool acpi_video_verify_backlight_support(void)
+{
+	if (acpi_video_use_native_backlight() &&
+	    backlight_device_registered(BACKLIGHT_RAW))
+		return false;
+	return acpi_video_backlight_support();
+}
+EXPORT_SYMBOL_GPL(acpi_video_verify_backlight_support);
+
+/* backlight device sysfs support */
+static int acpi_video_get_brightness(struct backlight_device *bd)
+{
+	unsigned long long cur_level;
+	int i;
+	struct acpi_video_device *vd = bl_get_data(bd);
+
+	if (acpi_video_device_lcd_get_level_current(vd, &cur_level, false))
+		return -EINVAL;
+	for (i = 2; i < vd->brightness->count; i++) {
+		if (vd->brightness->levels[i] == cur_level)
+			/*
+			 * The first two entries are special - see page 575
+			 * of the ACPI spec 3.0
+			 */
+			return i - 2;
+	}
+	return 0;
+}
+
+static int acpi_video_set_brightness(struct backlight_device *bd)
+{
+	int request_level = bd->props.brightness + 2;
+	struct acpi_video_device *vd = bl_get_data(bd);
+
+	cancel_delayed_work(&vd->switch_brightness_work);
+	return acpi_video_device_lcd_set_level(vd,
+				vd->brightness->levels[request_level]);
+}
+
+static const struct backlight_ops acpi_backlight_ops = {
+	.get_brightness = acpi_video_get_brightness,
+	.update_status  = acpi_video_set_brightness,
+};
+
+/* thermal cooling device callbacks */
+static int video_get_max_state(struct thermal_cooling_device *cooling_dev, unsigned
+			       long *state)
+{
+	struct acpi_device *device = cooling_dev->devdata;
+	struct acpi_video_device *video = acpi_driver_data(device);
+
+	*state = video->brightness->count - 3;
+	return 0;
+}
+
+static int video_get_cur_state(struct thermal_cooling_device *cooling_dev, unsigned
+			       long *state)
+{
+	struct acpi_device *device = cooling_dev->devdata;
+	struct acpi_video_device *video = acpi_driver_data(device);
+	unsigned long long level;
+	int offset;
+
+	if (acpi_video_device_lcd_get_level_current(video, &level, false))
+		return -EINVAL;
+	for (offset = 2; offset < video->brightness->count; offset++)
+		if (level == video->brightness->levels[offset]) {
+			*state = video->brightness->count - offset - 1;
+			return 0;
+		}
+
+	return -EINVAL;
+}
+
+static int
+video_set_cur_state(struct thermal_cooling_device *cooling_dev, unsigned long state)
+{
+	struct acpi_device *device = cooling_dev->devdata;
+	struct acpi_video_device *video = acpi_driver_data(device);
+	int level;
+
+	if (state >= video->brightness->count - 2)
+		return -EINVAL;
+
+	state = video->brightness->count - state;
+	level = video->brightness->levels[state - 1];
+	return acpi_video_device_lcd_set_level(video, level);
+}
+
+static const struct thermal_cooling_device_ops video_cooling_ops = {
+	.get_max_state = video_get_max_state,
+	.get_cur_state = video_get_cur_state,
+	.set_cur_state = video_set_cur_state,
+};
+
+/*
+ * --------------------------------------------------------------------------
+ *                             Video Management
+ * --------------------------------------------------------------------------
+ */
+
+static int
+acpi_video_device_lcd_query_levels(struct acpi_video_device *device,
+				   union acpi_object **levels)
+{
+	int status;
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *obj;
+
+
+	*levels = NULL;
+
+	status = acpi_evaluate_object(device->dev->handle, "_BCL", NULL, &buffer);
+	if (!ACPI_SUCCESS(status))
+		return status;
+	obj = (union acpi_object *)buffer.pointer;
+	if (!obj || (obj->type != ACPI_TYPE_PACKAGE)) {
+		printk(KERN_ERR PREFIX "Invalid _BCL data\n");
+		status = -EFAULT;
+		goto err;
+	}
+
+	*levels = obj;
+
+	return 0;
+
+err:
+	kfree(buffer.pointer);
+
+	return status;
+}
+
+static int
+acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level)
+{
+	int status;
+	int state;
+
+	status = acpi_execute_simple_method(device->dev->handle,
+					    "_BCM", level);
+	if (ACPI_FAILURE(status)) {
+		ACPI_ERROR((AE_INFO, "Evaluating _BCM failed"));
+		return -EIO;
+	}
+
+	device->brightness->curr = level;
+	for (state = 2; state < device->brightness->count; state++)
+		if (level == device->brightness->levels[state]) {
+			if (device->backlight)
+				device->backlight->props.brightness = state - 2;
+			return 0;
+		}
+
+	ACPI_ERROR((AE_INFO, "Current brightness invalid"));
+	return -EINVAL;
+}
+
+/*
+ * For some buggy _BQC methods, we need to add a constant value to
+ * the _BQC return value to get the actual current brightness level
+ */
+
+static int bqc_offset_aml_bug_workaround;
+static int __init video_set_bqc_offset(const struct dmi_system_id *d)
+{
+	bqc_offset_aml_bug_workaround = 9;
+	return 0;
+}
+
+static int __init video_disable_native_backlight(const struct dmi_system_id *d)
+{
+	use_native_backlight_dmi = NATIVE_BACKLIGHT_OFF;
+	return 0;
+}
+
+static int __init video_enable_native_backlight(const struct dmi_system_id *d)
+{
+	use_native_backlight_dmi = NATIVE_BACKLIGHT_ON;
+	return 0;
+}
+
+static int __init video_disable_backlight_sysfs_if(
+	const struct dmi_system_id *d)
+{
+	if (disable_backlight_sysfs_if == -1)
+		disable_backlight_sysfs_if = 1;
+	return 0;
+}
+
+static struct dmi_system_id video_dmi_table[] __initdata = {
+	/*
+	 * Broken _BQC workaround http://bugzilla.kernel.org/show_bug.cgi?id=13121
+	 */
+	{
+	 .callback = video_set_bqc_offset,
+	 .ident = "Acer Aspire 5720",
+	 .matches = {
+		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5720"),
+		},
+	},
+	{
+	 .callback = video_set_bqc_offset,
+	 .ident = "Acer Aspire 5710Z",
+	 .matches = {
+		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5710Z"),
+		},
+	},
+	{
+	 .callback = video_set_bqc_offset,
+	 .ident = "eMachines E510",
+	 .matches = {
+		DMI_MATCH(DMI_BOARD_VENDOR, "EMACHINES"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "eMachines E510"),
+		},
+	},
+	{
+	 .callback = video_set_bqc_offset,
+	 .ident = "Acer Aspire 5315",
+	 .matches = {
+		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5315"),
+		},
+	},
+	{
+	 .callback = video_set_bqc_offset,
+	 .ident = "Acer Aspire 7720",
+	 .matches = {
+		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 7720"),
+		},
+	},
+
+	/*
+	 * These models have a working acpi_video backlight control, and using
+	 * native backlight causes a regression where backlight does not work
+	 * when userspace is not handling brightness key events. Disable
+	 * native_backlight on these to fix this:
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=81691
+	 */
+	{
+	 .callback = video_disable_native_backlight,
+	 .ident = "ThinkPad T420",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T420"),
+		},
+	},
+	{
+	 .callback = video_disable_native_backlight,
+	 .ident = "ThinkPad T520",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T520"),
+		},
+	},
+	{
+	 .callback = video_disable_native_backlight,
+	 .ident = "ThinkPad X201s",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X201s"),
+		},
+	},
+
+	/* The native backlight controls do not work on some older machines */
+	{
+	 /* https://bugs.freedesktop.org/show_bug.cgi?id=81515 */
+	 .callback = video_disable_native_backlight,
+	 .ident = "HP ENVY 15 Notebook",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "HP ENVY 15 Notebook PC"),
+		},
+	},
+
+	{
+	 .callback = video_disable_native_backlight,
+	 .ident = "SAMSUNG 870Z5E/880Z5E/680Z5E",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "870Z5E/880Z5E/680Z5E"),
+		},
+	},
+	{
+	 .callback = video_disable_native_backlight,
+	 .ident = "SAMSUNG 370R4E/370R4V/370R5E/3570RE/370R5V",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "370R4E/370R4V/370R5E/3570RE/370R5V"),
+		},
+	},
+	{
+	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1186097 */
+	 .callback = video_disable_native_backlight,
+	 .ident = "SAMSUNG 3570R/370R/470R/450R/510R/4450RV",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "3570R/370R/470R/450R/510R/4450RV"),
+		},
+	},
+	{
+	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1094948 */
+	 .callback = video_disable_native_backlight,
+	 .ident = "SAMSUNG 730U3E/740U3E",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "730U3E/740U3E"),
+		},
+	},
+	{
+	 /* https://bugs.freedesktop.org/show_bug.cgi?id=87286 */
+	 .callback = video_disable_native_backlight,
+	 .ident = "SAMSUNG 900X3C/900X3D/900X3E/900X4C/900X4D",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "900X3C/900X3D/900X3E/900X4C/900X4D"),
+		},
+	},
+
+	{
+	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1163574 */
+	 .callback = video_disable_native_backlight,
+	 .ident = "Dell XPS15 L521X",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "XPS L521X"),
+		},
+	},
+
+	/* Non win8 machines which need native backlight nevertheless */
+	{
+	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1187004 */
+	 .callback = video_enable_native_backlight,
+	 .ident = "Lenovo Ideapad Z570",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "102434U"),
+		},
+	},
+	{
+	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1217249 */
+	 .callback = video_enable_native_backlight,
+	 .ident = "Apple MacBook Pro 12,1",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro12,1"),
+		},
+	},
+
+	/*
+	 * Some machines have a broken acpi-video interface for brightness
+	 * control, but still need an acpi_video_device_lcd_set_level() call
+	 * on resume to turn the backlight power on.  We Enable backlight
+	 * control on these systems, but do not register a backlight sysfs
+	 * as brightness control does not work.
+	 */
+	{
+	 /* https://bugs.freedesktop.org/show_bug.cgi?id=82634 */
+	 .callback = video_disable_backlight_sysfs_if,
+	 .ident = "Toshiba Portege R830",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "PORTEGE R830"),
+		},
+	},
+	{}
+};
+
+static unsigned long long
+acpi_video_bqc_value_to_level(struct acpi_video_device *device,
+			      unsigned long long bqc_value)
+{
+	unsigned long long level;
+
+	if (device->brightness->flags._BQC_use_index) {
+		/*
+		 * _BQC returns an index that doesn't account for
+		 * the first 2 items with special meaning, so we need
+		 * to compensate for that by offsetting ourselves
+		 */
+		if (device->brightness->flags._BCL_reversed)
+			bqc_value = device->brightness->count - 3 - bqc_value;
+
+		level = device->brightness->levels[bqc_value + 2];
+	} else {
+		level = bqc_value;
+	}
+
+	level += bqc_offset_aml_bug_workaround;
+
+	return level;
+}
+
+static int
+acpi_video_device_lcd_get_level_current(struct acpi_video_device *device,
+					unsigned long long *level, bool raw)
+{
+	acpi_status status = AE_OK;
+	int i;
+
+	if (device->cap._BQC || device->cap._BCQ) {
+		char *buf = device->cap._BQC ? "_BQC" : "_BCQ";
+
+		status = acpi_evaluate_integer(device->dev->handle, buf,
+						NULL, level);
+		if (ACPI_SUCCESS(status)) {
+			if (raw) {
+				/*
+				 * Caller has indicated he wants the raw
+				 * value returned by _BQC, so don't furtherly
+				 * mess with the value.
+				 */
+				return 0;
+			}
+
+			*level = acpi_video_bqc_value_to_level(device, *level);
+
+			for (i = 2; i < device->brightness->count; i++)
+				if (device->brightness->levels[i] == *level) {
+					device->brightness->curr = *level;
+					return 0;
+				}
+			/*
+			 * BQC returned an invalid level.
+			 * Stop using it.
+			 */
+			ACPI_WARNING((AE_INFO,
+				      "%s returned an invalid level",
+				      buf));
+			device->cap._BQC = device->cap._BCQ = 0;
+		} else {
+			/*
+			 * Fixme:
+			 * should we return an error or ignore this failure?
+			 * dev->brightness->curr is a cached value which stores
+			 * the correct current backlight level in most cases.
+			 * ACPI video backlight still works w/ buggy _BQC.
+			 * http://bugzilla.kernel.org/show_bug.cgi?id=12233
+			 */
+			ACPI_WARNING((AE_INFO, "Evaluating %s failed", buf));
+			device->cap._BQC = device->cap._BCQ = 0;
+		}
+	}
+
+	*level = device->brightness->curr;
+	return 0;
+}
+
+static int
+acpi_video_device_EDID(struct acpi_video_device *device,
+		       union acpi_object **edid, ssize_t length)
+{
+	int status;
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *obj;
+	union acpi_object arg0 = { ACPI_TYPE_INTEGER };
+	struct acpi_object_list args = { 1, &arg0 };
+
+
+	*edid = NULL;
+
+	if (!device)
+		return -ENODEV;
+	if (length == 128)
+		arg0.integer.value = 1;
+	else if (length == 256)
+		arg0.integer.value = 2;
+	else
+		return -EINVAL;
+
+	status = acpi_evaluate_object(device->dev->handle, "_DDC", &args, &buffer);
+	if (ACPI_FAILURE(status))
+		return -ENODEV;
+
+	obj = buffer.pointer;
+
+	if (obj && obj->type == ACPI_TYPE_BUFFER)
+		*edid = obj;
+	else {
+		printk(KERN_ERR PREFIX "Invalid _DDC data\n");
+		status = -EFAULT;
+		kfree(obj);
+	}
+
+	return status;
+}
+
+/* bus */
+
+/*
+ *  Arg:
+ *	video		: video bus device pointer
+ *	bios_flag	:
+ *		0.	The system BIOS should NOT automatically switch(toggle)
+ *			the active display output.
+ *		1.	The system BIOS should automatically switch (toggle) the
+ *			active display output. No switch event.
+ *		2.	The _DGS value should be locked.
+ *		3.	The system BIOS should not automatically switch (toggle) the
+ *			active display output, but instead generate the display switch
+ *			event notify code.
+ *	lcd_flag	:
+ *		0.	The system BIOS should automatically control the brightness level
+ *			of the LCD when the power changes from AC to DC
+ *		1.	The system BIOS should NOT automatically control the brightness
+ *			level of the LCD when the power changes from AC to DC.
+ *  Return Value:
+ *		-EINVAL	wrong arg.
+ */
+
+static int
+acpi_video_bus_DOS(struct acpi_video_bus *video, int bios_flag, int lcd_flag)
+{
+	acpi_status status;
+
+	if (!video->cap._DOS)
+		return 0;
+
+	if (bios_flag < 0 || bios_flag > 3 || lcd_flag < 0 || lcd_flag > 1)
+		return -EINVAL;
+	video->dos_setting = (lcd_flag << 2) | bios_flag;
+	status = acpi_execute_simple_method(video->device->handle, "_DOS",
+					    (lcd_flag << 2) | bios_flag);
+	if (ACPI_FAILURE(status))
+		return -EIO;
+
+	return 0;
+}
+
+/*
+ * Simple comparison function used to sort backlight levels.
+ */
+
+static int
+acpi_video_cmp_level(const void *a, const void *b)
+{
+	return *(int *)a - *(int *)b;
+}
+
+/*
+ * Decides if _BQC/_BCQ for this system is usable
+ *
+ * We do this by changing the level first and then read out the current
+ * brightness level, if the value does not match, find out if it is using
+ * index. If not, clear the _BQC/_BCQ capability.
+ */
+static int acpi_video_bqc_quirk(struct acpi_video_device *device,
+				int max_level, int current_level)
+{
+	struct acpi_video_device_brightness *br = device->brightness;
+	int result;
+	unsigned long long level;
+	int test_level;
+
+	/* don't mess with existing known broken systems */
+	if (bqc_offset_aml_bug_workaround)
+		return 0;
+
+	/*
+	 * Some systems always report current brightness level as maximum
+	 * through _BQC, we need to test another value for them.
+	 */
+	test_level = current_level == max_level ? br->levels[3] : max_level;
+
+	result = acpi_video_device_lcd_set_level(device, test_level);
+	if (result)
+		return result;
+
+	result = acpi_video_device_lcd_get_level_current(device, &level, true);
+	if (result)
+		return result;
+
+	if (level != test_level) {
+		/* buggy _BQC found, need to find out if it uses index */
+		if (level < br->count) {
+			if (br->flags._BCL_reversed)
+				level = br->count - 3 - level;
+			if (br->levels[level + 2] == test_level)
+				br->flags._BQC_use_index = 1;
+		}
+
+		if (!br->flags._BQC_use_index)
+			device->cap._BQC = device->cap._BCQ = 0;
+	}
+
+	return 0;
+}
+
+
+/*
+ *  Arg:
+ *	device	: video output device (LCD, CRT, ..)
+ *
+ *  Return Value:
+ *	Maximum brightness level
+ *
+ *  Allocate and initialize device->brightness.
+ */
+
+static int
+acpi_video_init_brightness(struct acpi_video_device *device)
+{
+	union acpi_object *obj = NULL;
+	int i, max_level = 0, count = 0, level_ac_battery = 0;
+	unsigned long long level, level_old;
+	union acpi_object *o;
+	struct acpi_video_device_brightness *br = NULL;
+	int result = -EINVAL;
+	u32 value;
+
+	if (!ACPI_SUCCESS(acpi_video_device_lcd_query_levels(device, &obj))) {
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Could not query available "
+						"LCD brightness level\n"));
+		goto out;
+	}
+
+	if (obj->package.count < 2)
+		goto out;
+
+	br = kzalloc(sizeof(*br), GFP_KERNEL);
+	if (!br) {
+		printk(KERN_ERR "can't allocate memory\n");
+		result = -ENOMEM;
+		goto out;
+	}
+
+	br->levels = kmalloc((obj->package.count + 2) * sizeof *(br->levels),
+				GFP_KERNEL);
+	if (!br->levels) {
+		result = -ENOMEM;
+		goto out_free;
+	}
+
+	for (i = 0; i < obj->package.count; i++) {
+		o = (union acpi_object *)&obj->package.elements[i];
+		if (o->type != ACPI_TYPE_INTEGER) {
+			printk(KERN_ERR PREFIX "Invalid data\n");
+			continue;
+		}
+		value = (u32) o->integer.value;
+		/* Skip duplicate entries */
+		if (count > 2 && br->levels[count - 1] == value)
+			continue;
+
+		br->levels[count] = value;
+
+		if (br->levels[count] > max_level)
+			max_level = br->levels[count];
+		count++;
+	}
+
+	/*
+	 * some buggy BIOS don't export the levels
+	 * when machine is on AC/Battery in _BCL package.
+	 * In this case, the first two elements in _BCL packages
+	 * are also supported brightness levels that OS should take care of.
+	 */
+	for (i = 2; i < count; i++) {
+		if (br->levels[i] == br->levels[0])
+			level_ac_battery++;
+		if (br->levels[i] == br->levels[1])
+			level_ac_battery++;
+	}
+
+	if (level_ac_battery < 2) {
+		level_ac_battery = 2 - level_ac_battery;
+		br->flags._BCL_no_ac_battery_levels = 1;
+		for (i = (count - 1 + level_ac_battery); i >= 2; i--)
+			br->levels[i] = br->levels[i - level_ac_battery];
+		count += level_ac_battery;
+	} else if (level_ac_battery > 2)
+		ACPI_ERROR((AE_INFO, "Too many duplicates in _BCL package"));
+
+	/* Check if the _BCL package is in a reversed order */
+	if (max_level == br->levels[2]) {
+		br->flags._BCL_reversed = 1;
+		sort(&br->levels[2], count - 2, sizeof(br->levels[2]),
+			acpi_video_cmp_level, NULL);
+	} else if (max_level != br->levels[count - 1])
+		ACPI_ERROR((AE_INFO,
+			    "Found unordered _BCL package"));
+
+	br->count = count;
+	device->brightness = br;
+
+	/* _BQC uses INDEX while _BCL uses VALUE in some laptops */
+	br->curr = level = max_level;
+
+	if (!device->cap._BQC)
+		goto set_level;
+
+	result = acpi_video_device_lcd_get_level_current(device,
+							 &level_old, true);
+	if (result)
+		goto out_free_levels;
+
+	result = acpi_video_bqc_quirk(device, max_level, level_old);
+	if (result)
+		goto out_free_levels;
+	/*
+	 * cap._BQC may get cleared due to _BQC is found to be broken
+	 * in acpi_video_bqc_quirk, so check again here.
+	 */
+	if (!device->cap._BQC)
+		goto set_level;
+
+	level = acpi_video_bqc_value_to_level(device, level_old);
+	/*
+	 * On some buggy laptops, _BQC returns an uninitialized
+	 * value when invoked for the first time, i.e.
+	 * level_old is invalid (no matter whether it's a level
+	 * or an index). Set the backlight to max_level in this case.
+	 */
+	for (i = 2; i < br->count; i++)
+		if (level == br->levels[i])
+			break;
+	if (i == br->count || !level)
+		level = max_level;
+
+set_level:
+	result = acpi_video_device_lcd_set_level(device, level);
+	if (result)
+		goto out_free_levels;
+
+	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			  "found %d brightness levels\n", count - 2));
+	kfree(obj);
+	return result;
+
+out_free_levels:
+	kfree(br->levels);
+out_free:
+	kfree(br);
+out:
+	device->brightness = NULL;
+	kfree(obj);
+	return result;
+}
+
+/*
+ *  Arg:
+ *	device	: video output device (LCD, CRT, ..)
+ *
+ *  Return Value:
+ *	None
+ *
+ *  Find out all required AML methods defined under the output
+ *  device.
+ */
+
+static void acpi_video_device_find_cap(struct acpi_video_device *device)
+{
+	if (acpi_has_method(device->dev->handle, "_ADR"))
+		device->cap._ADR = 1;
+	if (acpi_has_method(device->dev->handle, "_BCL"))
+		device->cap._BCL = 1;
+	if (acpi_has_method(device->dev->handle, "_BCM"))
+		device->cap._BCM = 1;
+	if (acpi_has_method(device->dev->handle, "_BQC")) {
+		device->cap._BQC = 1;
+	} else if (acpi_has_method(device->dev->handle, "_BCQ")) {
+		printk(KERN_WARNING FW_BUG "_BCQ is used instead of _BQC\n");
+		device->cap._BCQ = 1;
+	}
+
+	if (acpi_has_method(device->dev->handle, "_DDC"))
+		device->cap._DDC = 1;
+}
+
+/*
+ *  Arg:
+ *	device	: video output device (VGA)
+ *
+ *  Return Value:
+ *	None
+ *
+ *  Find out all required AML methods defined under the video bus device.
+ */
+
+static void acpi_video_bus_find_cap(struct acpi_video_bus *video)
+{
+	if (acpi_has_method(video->device->handle, "_DOS"))
+		video->cap._DOS = 1;
+	if (acpi_has_method(video->device->handle, "_DOD"))
+		video->cap._DOD = 1;
+	if (acpi_has_method(video->device->handle, "_ROM"))
+		video->cap._ROM = 1;
+	if (acpi_has_method(video->device->handle, "_GPD"))
+		video->cap._GPD = 1;
+	if (acpi_has_method(video->device->handle, "_SPD"))
+		video->cap._SPD = 1;
+	if (acpi_has_method(video->device->handle, "_VPO"))
+		video->cap._VPO = 1;
+}
+
+/*
+ * Check whether the video bus device has required AML method to
+ * support the desired features
+ */
+
+static int acpi_video_bus_check(struct acpi_video_bus *video)
+{
+	acpi_status status = -ENOENT;
+	struct pci_dev *dev;
+
+	if (!video)
+		return -EINVAL;
+
+	dev = acpi_get_pci_dev(video->device->handle);
+	if (!dev)
+		return -ENODEV;
+	pci_dev_put(dev);
+
+	/*
+	 * Since there is no HID, CID and so on for VGA driver, we have
+	 * to check well known required nodes.
+	 */
+
+	/* Does this device support video switching? */
+	if (video->cap._DOS || video->cap._DOD) {
+		if (!video->cap._DOS) {
+			printk(KERN_WARNING FW_BUG
+				"ACPI(%s) defines _DOD but not _DOS\n",
+				acpi_device_bid(video->device));
+		}
+		video->flags.multihead = 1;
+		status = 0;
+	}
+
+	/* Does this device support retrieving a video ROM? */
+	if (video->cap._ROM) {
+		video->flags.rom = 1;
+		status = 0;
+	}
+
+	/* Does this device support configuring which video device to POST? */
+	if (video->cap._GPD && video->cap._SPD && video->cap._VPO) {
+		video->flags.post = 1;
+		status = 0;
+	}
+
+	return status;
+}
+
+/*
+ * --------------------------------------------------------------------------
+ *                               Driver Interface
+ * --------------------------------------------------------------------------
+ */
+
+/* device interface */
+static struct acpi_video_device_attrib *
+acpi_video_get_device_attr(struct acpi_video_bus *video, unsigned long device_id)
+{
+	struct acpi_video_enumerated_device *ids;
+	int i;
+
+	for (i = 0; i < video->attached_count; i++) {
+		ids = &video->attached_array[i];
+		if ((ids->value.int_val & 0xffff) == device_id)
+			return &ids->value.attrib;
+	}
+
+	return NULL;
+}
+
+static int
+acpi_video_get_device_type(struct acpi_video_bus *video,
+			   unsigned long device_id)
+{
+	struct acpi_video_enumerated_device *ids;
+	int i;
+
+	for (i = 0; i < video->attached_count; i++) {
+		ids = &video->attached_array[i];
+		if ((ids->value.int_val & 0xffff) == device_id)
+			return ids->value.int_val;
+	}
+
+	return 0;
+}
+
+static int
+acpi_video_bus_get_one_device(struct acpi_device *device,
+			      struct acpi_video_bus *video)
+{
+	unsigned long long device_id;
+	int status, device_type;
+	struct acpi_video_device *data;
+	struct acpi_video_device_attrib *attribute;
+
+	status =
+	    acpi_evaluate_integer(device->handle, "_ADR", NULL, &device_id);
+	/* Some device omits _ADR, we skip them instead of fail */
+	if (ACPI_FAILURE(status))
+		return 0;
+
+	data = kzalloc(sizeof(struct acpi_video_device), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	strcpy(acpi_device_name(device), ACPI_VIDEO_DEVICE_NAME);
+	strcpy(acpi_device_class(device), ACPI_VIDEO_CLASS);
+	device->driver_data = data;
+
+	data->device_id = device_id;
+	data->video = video;
+	data->dev = device;
+	INIT_DELAYED_WORK(&data->switch_brightness_work,
+			  acpi_video_switch_brightness);
+
+	attribute = acpi_video_get_device_attr(video, device_id);
+
+	if (attribute && attribute->device_id_scheme) {
+		switch (attribute->display_type) {
+		case ACPI_VIDEO_DISPLAY_CRT:
+			data->flags.crt = 1;
+			break;
+		case ACPI_VIDEO_DISPLAY_TV:
+			data->flags.tvout = 1;
+			break;
+		case ACPI_VIDEO_DISPLAY_DVI:
+			data->flags.dvi = 1;
+			break;
+		case ACPI_VIDEO_DISPLAY_LCD:
+			data->flags.lcd = 1;
+			break;
+		default:
+			data->flags.unknown = 1;
+			break;
+		}
+		if (attribute->bios_can_detect)
+			data->flags.bios = 1;
+	} else {
+		/* Check for legacy IDs */
+		device_type = acpi_video_get_device_type(video, device_id);
+		/* Ignore bits 16 and 18-20 */
+		switch (device_type & 0xffe2ffff) {
+		case ACPI_VIDEO_DISPLAY_LEGACY_MONITOR:
+			data->flags.crt = 1;
+			break;
+		case ACPI_VIDEO_DISPLAY_LEGACY_PANEL:
+			data->flags.lcd = 1;
+			break;
+		case ACPI_VIDEO_DISPLAY_LEGACY_TV:
+			data->flags.tvout = 1;
+			break;
+		default:
+			data->flags.unknown = 1;
+		}
+	}
+
+	acpi_video_device_bind(video, data);
+	acpi_video_device_find_cap(data);
+
+	mutex_lock(&video->device_list_lock);
+	list_add_tail(&data->entry, &video->video_device_list);
+	mutex_unlock(&video->device_list_lock);
+
+	return status;
+}
+
+/*
+ *  Arg:
+ *	video	: video bus device
+ *
+ *  Return:
+ *	none
+ *
+ *  Enumerate the video device list of the video bus,
+ *  bind the ids with the corresponding video devices
+ *  under the video bus.
+ */
+
+static void acpi_video_device_rebind(struct acpi_video_bus *video)
+{
+	struct acpi_video_device *dev;
+
+	mutex_lock(&video->device_list_lock);
+
+	list_for_each_entry(dev, &video->video_device_list, entry)
+		acpi_video_device_bind(video, dev);
+
+	mutex_unlock(&video->device_list_lock);
+}
+
+/*
+ *  Arg:
+ *	video	: video bus device
+ *	device	: video output device under the video
+ *		bus
+ *
+ *  Return:
+ *	none
+ *
+ *  Bind the ids with the corresponding video devices
+ *  under the video bus.
+ */
+
+static void
+acpi_video_device_bind(struct acpi_video_bus *video,
+		       struct acpi_video_device *device)
+{
+	struct acpi_video_enumerated_device *ids;
+	int i;
+
+	for (i = 0; i < video->attached_count; i++) {
+		ids = &video->attached_array[i];
+		if (device->device_id == (ids->value.int_val & 0xffff)) {
+			ids->bind_info = device;
+			ACPI_DEBUG_PRINT((ACPI_DB_INFO, "device_bind %d\n", i));
+		}
+	}
+}
+
+static bool acpi_video_device_in_dod(struct acpi_video_device *device)
+{
+	struct acpi_video_bus *video = device->video;
+	int i;
+
+	/*
+	 * If we have a broken _DOD or we have more than 8 output devices
+	 * under the graphics controller node that we can't proper deal with
+	 * in the operation region code currently, no need to test.
+	 */
+	if (!video->attached_count || video->child_count > 8)
+		return true;
+
+	for (i = 0; i < video->attached_count; i++) {
+		if ((video->attached_array[i].value.int_val & 0xfff) ==
+		    (device->device_id & 0xfff))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ *  Arg:
+ *	video	: video bus device
+ *
+ *  Return:
+ *	< 0	: error
+ *
+ *  Call _DOD to enumerate all devices attached to display adapter
+ *
+ */
+
+static int acpi_video_device_enumerate(struct acpi_video_bus *video)
+{
+	int status;
+	int count;
+	int i;
+	struct acpi_video_enumerated_device *active_list;
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	union acpi_object *dod = NULL;
+	union acpi_object *obj;
+
+	status = acpi_evaluate_object(video->device->handle, "_DOD", NULL, &buffer);
+	if (!ACPI_SUCCESS(status)) {
+		ACPI_EXCEPTION((AE_INFO, status, "Evaluating _DOD"));
+		return status;
+	}
+
+	dod = buffer.pointer;
+	if (!dod || (dod->type != ACPI_TYPE_PACKAGE)) {
+		ACPI_EXCEPTION((AE_INFO, status, "Invalid _DOD data"));
+		status = -EFAULT;
+		goto out;
+	}
+
+	ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d video heads in _DOD\n",
+			  dod->package.count));
+
+	active_list = kcalloc(1 + dod->package.count,
+			      sizeof(struct acpi_video_enumerated_device),
+			      GFP_KERNEL);
+	if (!active_list) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	count = 0;
+	for (i = 0; i < dod->package.count; i++) {
+		obj = &dod->package.elements[i];
+
+		if (obj->type != ACPI_TYPE_INTEGER) {
+			printk(KERN_ERR PREFIX
+				"Invalid _DOD data in element %d\n", i);
+			continue;
+		}
+
+		active_list[count].value.int_val = obj->integer.value;
+		active_list[count].bind_info = NULL;
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "dod element[%d] = %d\n", i,
+				  (int)obj->integer.value));
+		count++;
+	}
+
+	kfree(video->attached_array);
+
+	video->attached_array = active_list;
+	video->attached_count = count;
+
+out:
+	kfree(buffer.pointer);
+	return status;
+}
+
+static int
+acpi_video_get_next_level(struct acpi_video_device *device,
+			  u32 level_current, u32 event)
+{
+	int min, max, min_above, max_below, i, l, delta = 255;
+	max = max_below = 0;
+	min = min_above = 255;
+	/* Find closest level to level_current */
+	for (i = 2; i < device->brightness->count; i++) {
+		l = device->brightness->levels[i];
+		if (abs(l - level_current) < abs(delta)) {
+			delta = l - level_current;
+			if (!delta)
+				break;
+		}
+	}
+	/* Ajust level_current to closest available level */
+	level_current += delta;
+	for (i = 2; i < device->brightness->count; i++) {
+		l = device->brightness->levels[i];
+		if (l < min)
+			min = l;
+		if (l > max)
+			max = l;
+		if (l < min_above && l > level_current)
+			min_above = l;
+		if (l > max_below && l < level_current)
+			max_below = l;
+	}
+
+	switch (event) {
+	case ACPI_VIDEO_NOTIFY_CYCLE_BRIGHTNESS:
+		return (level_current < max) ? min_above : min;
+	case ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS:
+		return (level_current < max) ? min_above : max;
+	case ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS:
+		return (level_current > min) ? max_below : min;
+	case ACPI_VIDEO_NOTIFY_ZERO_BRIGHTNESS:
+	case ACPI_VIDEO_NOTIFY_DISPLAY_OFF:
+		return 0;
+	default:
+		return level_current;
+	}
+}
+
+static void
+acpi_video_switch_brightness(struct work_struct *work)
+{
+	struct acpi_video_device *device = container_of(to_delayed_work(work),
+			     struct acpi_video_device, switch_brightness_work);
+	unsigned long long level_current, level_next;
+	int event = device->switch_brightness_event;
+	int result = -EINVAL;
+
+	/* no warning message if acpi_backlight=vendor or a quirk is used */
+	if (!device->backlight)
+		return;
+
+	if (!device->brightness)
+		goto out;
+
+	result = acpi_video_device_lcd_get_level_current(device,
+							 &level_current,
+							 false);
+	if (result)
+		goto out;
+
+	level_next = acpi_video_get_next_level(device, level_current, event);
+
+	result = acpi_video_device_lcd_set_level(device, level_next);
+
+	if (!result)
+		backlight_force_update(device->backlight,
+				       BACKLIGHT_UPDATE_HOTKEY);
+
+out:
+	if (result)
+		printk(KERN_ERR PREFIX "Failed to switch the brightness\n");
+}
+
+int acpi_video_get_edid(struct acpi_device *device, int type, int device_id,
+			void **edid)
+{
+	struct acpi_video_bus *video;
+	struct acpi_video_device *video_device;
+	union acpi_object *buffer = NULL;
+	acpi_status status;
+	int i, length;
+
+	if (!device || !acpi_driver_data(device))
+		return -EINVAL;
+
+	video = acpi_driver_data(device);
+
+	for (i = 0; i < video->attached_count; i++) {
+		video_device = video->attached_array[i].bind_info;
+		length = 256;
+
+		if (!video_device)
+			continue;
+
+		if (!video_device->cap._DDC)
+			continue;
+
+		if (type) {
+			switch (type) {
+			case ACPI_VIDEO_DISPLAY_CRT:
+				if (!video_device->flags.crt)
+					continue;
+				break;
+			case ACPI_VIDEO_DISPLAY_TV:
+				if (!video_device->flags.tvout)
+					continue;
+				break;
+			case ACPI_VIDEO_DISPLAY_DVI:
+				if (!video_device->flags.dvi)
+					continue;
+				break;
+			case ACPI_VIDEO_DISPLAY_LCD:
+				if (!video_device->flags.lcd)
+					continue;
+				break;
+			}
+		} else if (video_device->device_id != device_id) {
+			continue;
+		}
+
+		status = acpi_video_device_EDID(video_device, &buffer, length);
+
+		if (ACPI_FAILURE(status) || !buffer ||
+		    buffer->type != ACPI_TYPE_BUFFER) {
+			length = 128;
+			status = acpi_video_device_EDID(video_device, &buffer,
+							length);
+			if (ACPI_FAILURE(status) || !buffer ||
+			    buffer->type != ACPI_TYPE_BUFFER) {
+				continue;
+			}
+		}
+
+		*edid = buffer->buffer.pointer;
+		return length;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL(acpi_video_get_edid);
+
+static int
+acpi_video_bus_get_devices(struct acpi_video_bus *video,
+			   struct acpi_device *device)
+{
+	int status = 0;
+	struct acpi_device *dev;
+
+	/*
+	 * There are systems where video module known to work fine regardless
+	 * of broken _DOD and ignoring returned value here doesn't cause
+	 * any issues later.
+	 */
+	acpi_video_device_enumerate(video);
+
+	list_for_each_entry(dev, &device->children, node) {
+
+		status = acpi_video_bus_get_one_device(dev, video);
+		if (status) {
+			dev_err(&dev->dev, "Can't attach device\n");
+			break;
+		}
+		video->child_count++;
+	}
+	return status;
+}
+
+/* acpi_video interface */
+
+/*
+ * Win8 requires setting bit2 of _DOS to let firmware know it shouldn't
+ * preform any automatic brightness change on receiving a notification.
+ */
+static int acpi_video_bus_start_devices(struct acpi_video_bus *video)
+{
+	return acpi_video_bus_DOS(video, 0,
+				  acpi_osi_is_win8() ? 1 : 0);
+}
+
+static int acpi_video_bus_stop_devices(struct acpi_video_bus *video)
+{
+	return acpi_video_bus_DOS(video, 0,
+				  acpi_osi_is_win8() ? 0 : 1);
+}
+
+static void acpi_video_bus_notify(struct acpi_device *device, u32 event)
+{
+	struct acpi_video_bus *video = acpi_driver_data(device);
+	struct input_dev *input;
+	int keycode = 0;
+
+	if (!video || !video->input)
+		return;
+
+	input = video->input;
+
+	switch (event) {
+	case ACPI_VIDEO_NOTIFY_SWITCH:	/* User requested a switch,
+					 * most likely via hotkey. */
+		keycode = KEY_SWITCHVIDEOMODE;
+		break;
+
+	case ACPI_VIDEO_NOTIFY_PROBE:	/* User plugged in or removed a video
+					 * connector. */
+		acpi_video_device_enumerate(video);
+		acpi_video_device_rebind(video);
+		keycode = KEY_SWITCHVIDEOMODE;
+		break;
+
+	case ACPI_VIDEO_NOTIFY_CYCLE:	/* Cycle Display output hotkey pressed. */
+		keycode = KEY_SWITCHVIDEOMODE;
+		break;
+	case ACPI_VIDEO_NOTIFY_NEXT_OUTPUT:	/* Next Display output hotkey pressed. */
+		keycode = KEY_VIDEO_NEXT;
+		break;
+	case ACPI_VIDEO_NOTIFY_PREV_OUTPUT:	/* previous Display output hotkey pressed. */
+		keycode = KEY_VIDEO_PREV;
+		break;
+
+	default:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				  "Unsupported event [0x%x]\n", event));
+		break;
+	}
+
+	if (acpi_notifier_call_chain(device, event, 0))
+		/* Something vetoed the keypress. */
+		keycode = 0;
+
+	if (keycode) {
+		input_report_key(input, keycode, 1);
+		input_sync(input);
+		input_report_key(input, keycode, 0);
+		input_sync(input);
+	}
+
+	return;
+}
+
+static void brightness_switch_event(struct acpi_video_device *video_device,
+				    u32 event)
+{
+	if (!brightness_switch_enabled)
+		return;
+
+	video_device->switch_brightness_event = event;
+	schedule_delayed_work(&video_device->switch_brightness_work, HZ / 10);
+}
+
+static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data)
+{
+	struct acpi_video_device *video_device = data;
+	struct acpi_device *device = NULL;
+	struct acpi_video_bus *bus;
+	struct input_dev *input;
+	int keycode = 0;
+
+	if (!video_device)
+		return;
+
+	device = video_device->dev;
+	bus = video_device->video;
+	input = bus->input;
+
+	switch (event) {
+	case ACPI_VIDEO_NOTIFY_CYCLE_BRIGHTNESS:	/* Cycle brightness */
+		brightness_switch_event(video_device, event);
+		keycode = KEY_BRIGHTNESS_CYCLE;
+		break;
+	case ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS:	/* Increase brightness */
+		brightness_switch_event(video_device, event);
+		keycode = KEY_BRIGHTNESSUP;
+		break;
+	case ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS:	/* Decrease brightness */
+		brightness_switch_event(video_device, event);
+		keycode = KEY_BRIGHTNESSDOWN;
+		break;
+	case ACPI_VIDEO_NOTIFY_ZERO_BRIGHTNESS:	/* zero brightness */
+		brightness_switch_event(video_device, event);
+		keycode = KEY_BRIGHTNESS_ZERO;
+		break;
+	case ACPI_VIDEO_NOTIFY_DISPLAY_OFF:	/* display device off */
+		brightness_switch_event(video_device, event);
+		keycode = KEY_DISPLAY_OFF;
+		break;
+	default:
+		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+				  "Unsupported event [0x%x]\n", event));
+		break;
+	}
+
+	acpi_notifier_call_chain(device, event, 0);
+
+	if (keycode) {
+		input_report_key(input, keycode, 1);
+		input_sync(input);
+		input_report_key(input, keycode, 0);
+		input_sync(input);
+	}
+
+	return;
+}
+
+static int acpi_video_resume(struct notifier_block *nb,
+				unsigned long val, void *ign)
+{
+	struct acpi_video_bus *video;
+	struct acpi_video_device *video_device;
+	int i;
+
+	switch (val) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+	case PM_RESTORE_PREPARE:
+		return NOTIFY_DONE;
+	}
+
+	video = container_of(nb, struct acpi_video_bus, pm_nb);
+
+	dev_info(&video->device->dev, "Restoring backlight state\n");
+
+	for (i = 0; i < video->attached_count; i++) {
+		video_device = video->attached_array[i].bind_info;
+		if (video_device && video_device->brightness)
+			acpi_video_device_lcd_set_level(video_device,
+					video_device->brightness->curr);
+	}
+
+	return NOTIFY_OK;
+}
+
+static acpi_status
+acpi_video_bus_match(acpi_handle handle, u32 level, void *context,
+			void **return_value)
+{
+	struct acpi_device *device = context;
+	struct acpi_device *sibling;
+	int result;
+
+	if (handle == device->handle)
+		return AE_CTRL_TERMINATE;
+
+	result = acpi_bus_get_device(handle, &sibling);
+	if (result)
+		return AE_OK;
+
+	if (!strcmp(acpi_device_name(sibling), ACPI_VIDEO_BUS_NAME))
+			return AE_ALREADY_EXISTS;
+
+	return AE_OK;
+}
+
+static void acpi_video_dev_register_backlight(struct acpi_video_device *device)
+{
+	struct backlight_properties props;
+	struct pci_dev *pdev;
+	acpi_handle acpi_parent;
+	struct device *parent = NULL;
+	int result;
+	static int count;
+	char *name;
+
+	/*
+	 * Do not create backlight device for video output
+	 * device that is not in the enumerated list.
+	 */
+	if (!acpi_video_device_in_dod(device)) {
+		dev_dbg(&device->dev->dev, "not in _DOD list, ignore\n");
+		return;
+	}
+
+	result = acpi_video_init_brightness(device);
+	if (result)
+		return;
+
+	if (disable_backlight_sysfs_if > 0)
+		return;
+
+	name = kasprintf(GFP_KERNEL, "acpi_video%d", count);
+	if (!name)
+		return;
+	count++;
+
+	acpi_get_parent(device->dev->handle, &acpi_parent);
+
+	pdev = acpi_get_pci_dev(acpi_parent);
+	if (pdev) {
+		parent = &pdev->dev;
+		pci_dev_put(pdev);
+	}
+
+	memset(&props, 0, sizeof(struct backlight_properties));
+	props.type = BACKLIGHT_FIRMWARE;
+	props.max_brightness = device->brightness->count - 3;
+	device->backlight = backlight_device_register(name,
+						      parent,
+						      device,
+						      &acpi_backlight_ops,
+						      &props);
+	kfree(name);
+	if (IS_ERR(device->backlight)) {
+		device->backlight = NULL;
+		return;
+	}
+
+	/*
+	 * Save current brightness level in case we have to restore it
+	 * before acpi_video_device_lcd_set_level() is called next time.
+	 */
+	device->backlight->props.brightness =
+			acpi_video_get_brightness(device->backlight);
+
+	device->cooling_dev = thermal_cooling_device_register("LCD",
+				device->dev, &video_cooling_ops);
+	if (IS_ERR(device->cooling_dev)) {
+		/*
+		 * Set cooling_dev to NULL so we don't crash trying to free it.
+		 * Also, why the hell we are returning early and not attempt to
+		 * register video output if cooling device registration failed?
+		 * -- dtor
+		 */
+		device->cooling_dev = NULL;
+		return;
+	}
+
+	dev_info(&device->dev->dev, "registered as cooling_device%d\n",
+		 device->cooling_dev->id);
+	result = sysfs_create_link(&device->dev->dev.kobj,
+			&device->cooling_dev->device.kobj,
+			"thermal_cooling");
+	if (result)
+		printk(KERN_ERR PREFIX "Create sysfs link\n");
+	result = sysfs_create_link(&device->cooling_dev->device.kobj,
+			&device->dev->dev.kobj, "device");
+	if (result)
+		printk(KERN_ERR PREFIX "Create sysfs link\n");
+}
+
+static void acpi_video_run_bcl_for_osi(struct acpi_video_bus *video)
+{
+	struct acpi_video_device *dev;
+	union acpi_object *levels;
+
+	mutex_lock(&video->device_list_lock);
+	list_for_each_entry(dev, &video->video_device_list, entry) {
+		if (!acpi_video_device_lcd_query_levels(dev, &levels))
+			kfree(levels);
+	}
+	mutex_unlock(&video->device_list_lock);
+}
+
+static int acpi_video_bus_register_backlight(struct acpi_video_bus *video)
+{
+	struct acpi_video_device *dev;
+
+	if (video->backlight_registered)
+		return 0;
+
+	acpi_video_run_bcl_for_osi(video);
+
+	if (!acpi_video_verify_backlight_support())
+		return 0;
+
+	mutex_lock(&video->device_list_lock);
+	list_for_each_entry(dev, &video->video_device_list, entry)
+		acpi_video_dev_register_backlight(dev);
+	mutex_unlock(&video->device_list_lock);
+
+	video->backlight_registered = true;
+
+	video->pm_nb.notifier_call = acpi_video_resume;
+	video->pm_nb.priority = 0;
+	return register_pm_notifier(&video->pm_nb);
+}
+
+static void acpi_video_dev_unregister_backlight(struct acpi_video_device *device)
+{
+	if (device->backlight) {
+		backlight_device_unregister(device->backlight);
+		device->backlight = NULL;
+	}
+	if (device->brightness) {
+		kfree(device->brightness->levels);
+		kfree(device->brightness);
+		device->brightness = NULL;
+	}
+	if (device->cooling_dev) {
+		sysfs_remove_link(&device->dev->dev.kobj, "thermal_cooling");
+		sysfs_remove_link(&device->cooling_dev->device.kobj, "device");
+		thermal_cooling_device_unregister(device->cooling_dev);
+		device->cooling_dev = NULL;
+	}
+}
+
+static int acpi_video_bus_unregister_backlight(struct acpi_video_bus *video)
+{
+	struct acpi_video_device *dev;
+	int error;
+
+	if (!video->backlight_registered)
+		return 0;
+
+	error = unregister_pm_notifier(&video->pm_nb);
+
+	mutex_lock(&video->device_list_lock);
+	list_for_each_entry(dev, &video->video_device_list, entry)
+		acpi_video_dev_unregister_backlight(dev);
+	mutex_unlock(&video->device_list_lock);
+
+	video->backlight_registered = false;
+
+	return error;
+}
+
+static void acpi_video_dev_add_notify_handler(struct acpi_video_device *device)
+{
+	acpi_status status;
+	struct acpi_device *adev = device->dev;
+
+	status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
+					     acpi_video_device_notify, device);
+	if (ACPI_FAILURE(status))
+		dev_err(&adev->dev, "Error installing notify handler\n");
+	else
+		device->flags.notify = 1;
+}
+
+static int acpi_video_bus_add_notify_handler(struct acpi_video_bus *video)
+{
+	struct input_dev *input;
+	struct acpi_video_device *dev;
+	int error;
+
+	video->input = input = input_allocate_device();
+	if (!input) {
+		error = -ENOMEM;
+		goto out;
+	}
+
+	error = acpi_video_bus_start_devices(video);
+	if (error)
+		goto err_free_input;
+
+	snprintf(video->phys, sizeof(video->phys),
+			"%s/video/input0", acpi_device_hid(video->device));
+
+	input->name = acpi_device_name(video->device);
+	input->phys = video->phys;
+	input->id.bustype = BUS_HOST;
+	input->id.product = 0x06;
+	input->dev.parent = &video->device->dev;
+	input->evbit[0] = BIT(EV_KEY);
+	set_bit(KEY_SWITCHVIDEOMODE, input->keybit);
+	set_bit(KEY_VIDEO_NEXT, input->keybit);
+	set_bit(KEY_VIDEO_PREV, input->keybit);
+	set_bit(KEY_BRIGHTNESS_CYCLE, input->keybit);
+	set_bit(KEY_BRIGHTNESSUP, input->keybit);
+	set_bit(KEY_BRIGHTNESSDOWN, input->keybit);
+	set_bit(KEY_BRIGHTNESS_ZERO, input->keybit);
+	set_bit(KEY_DISPLAY_OFF, input->keybit);
+
+	error = input_register_device(input);
+	if (error)
+		goto err_stop_dev;
+
+	mutex_lock(&video->device_list_lock);
+	list_for_each_entry(dev, &video->video_device_list, entry)
+		acpi_video_dev_add_notify_handler(dev);
+	mutex_unlock(&video->device_list_lock);
+
+	return 0;
+
+err_stop_dev:
+	acpi_video_bus_stop_devices(video);
+err_free_input:
+	input_free_device(input);
+	video->input = NULL;
+out:
+	return error;
+}
+
+static void acpi_video_dev_remove_notify_handler(struct acpi_video_device *dev)
+{
+	if (dev->flags.notify) {
+		acpi_remove_notify_handler(dev->dev->handle, ACPI_DEVICE_NOTIFY,
+					   acpi_video_device_notify);
+		dev->flags.notify = 0;
+	}
+}
+
+static void acpi_video_bus_remove_notify_handler(struct acpi_video_bus *video)
+{
+	struct acpi_video_device *dev;
+
+	mutex_lock(&video->device_list_lock);
+	list_for_each_entry(dev, &video->video_device_list, entry)
+		acpi_video_dev_remove_notify_handler(dev);
+	mutex_unlock(&video->device_list_lock);
+
+	acpi_video_bus_stop_devices(video);
+	input_unregister_device(video->input);
+	video->input = NULL;
+}
+
+static int acpi_video_backlight_notify(struct notifier_block *nb,
+					unsigned long val, void *bd)
+{
+	struct backlight_device *backlight = bd;
+	struct acpi_video_bus *video;
+
+	/* acpi_video_verify_backlight_support only cares about raw devices */
+	if (backlight->props.type != BACKLIGHT_RAW)
+		return NOTIFY_DONE;
+
+	video = container_of(nb, struct acpi_video_bus, backlight_nb);
+
+	switch (val) {
+	case BACKLIGHT_REGISTERED:
+		if (!acpi_video_verify_backlight_support())
+			acpi_video_bus_unregister_backlight(video);
+		break;
+	case BACKLIGHT_UNREGISTERED:
+		acpi_video_bus_register_backlight(video);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static int acpi_video_bus_add_backlight_notify_handler(
+						struct acpi_video_bus *video)
+{
+	int error;
+
+	video->backlight_nb.notifier_call = acpi_video_backlight_notify;
+	video->backlight_nb.priority = 0;
+	error = backlight_register_notifier(&video->backlight_nb);
+	if (error == 0)
+		video->backlight_notifier_registered = true;
+
+	return error;
+}
+
+static int acpi_video_bus_remove_backlight_notify_handler(
+						struct acpi_video_bus *video)
+{
+	if (!video->backlight_notifier_registered)
+		return 0;
+
+	video->backlight_notifier_registered = false;
+
+	return backlight_unregister_notifier(&video->backlight_nb);
+}
+
+static int acpi_video_bus_put_devices(struct acpi_video_bus *video)
+{
+	struct acpi_video_device *dev, *next;
+
+	mutex_lock(&video->device_list_lock);
+	list_for_each_entry_safe(dev, next, &video->video_device_list, entry) {
+		list_del(&dev->entry);
+		kfree(dev);
+	}
+	mutex_unlock(&video->device_list_lock);
+
+	return 0;
+}
+
+static int instance;
+
+static int acpi_video_bus_add(struct acpi_device *device)
+{
+	struct acpi_video_bus *video;
+	int error;
+	acpi_status status;
+
+	status = acpi_walk_namespace(ACPI_TYPE_DEVICE,
+				device->parent->handle, 1,
+				acpi_video_bus_match, NULL,
+				device, NULL);
+	if (status == AE_ALREADY_EXISTS) {
+		printk(KERN_WARNING FW_BUG
+			"Duplicate ACPI video bus devices for the"
+			" same VGA controller, please try module "
+			"parameter \"video.allow_duplicates=1\""
+			"if the current driver doesn't work.\n");
+		if (!allow_duplicates)
+			return -ENODEV;
+	}
+
+	video = kzalloc(sizeof(struct acpi_video_bus), GFP_KERNEL);
+	if (!video)
+		return -ENOMEM;
+
+	/* a hack to fix the duplicate name "VID" problem on T61 */
+	if (!strcmp(device->pnp.bus_id, "VID")) {
+		if (instance)
+			device->pnp.bus_id[3] = '0' + instance;
+		instance++;
+	}
+	/* a hack to fix the duplicate name "VGA" problem on Pa 3553 */
+	if (!strcmp(device->pnp.bus_id, "VGA")) {
+		if (instance)
+			device->pnp.bus_id[3] = '0' + instance;
+		instance++;
+	}
+
+	video->device = device;
+	strcpy(acpi_device_name(device), ACPI_VIDEO_BUS_NAME);
+	strcpy(acpi_device_class(device), ACPI_VIDEO_CLASS);
+	device->driver_data = video;
+
+	acpi_video_bus_find_cap(video);
+	error = acpi_video_bus_check(video);
+	if (error)
+		goto err_free_video;
+
+	mutex_init(&video->device_list_lock);
+	INIT_LIST_HEAD(&video->video_device_list);
+
+	error = acpi_video_bus_get_devices(video, device);
+	if (error)
+		goto err_put_video;
+
+	printk(KERN_INFO PREFIX "%s [%s] (multi-head: %s  rom: %s  post: %s)\n",
+	       ACPI_VIDEO_DEVICE_NAME, acpi_device_bid(device),
+	       video->flags.multihead ? "yes" : "no",
+	       video->flags.rom ? "yes" : "no",
+	       video->flags.post ? "yes" : "no");
+	mutex_lock(&video_list_lock);
+	list_add_tail(&video->entry, &video_bus_head);
+	mutex_unlock(&video_list_lock);
+
+	acpi_video_bus_register_backlight(video);
+	acpi_video_bus_add_notify_handler(video);
+	acpi_video_bus_add_backlight_notify_handler(video);
+
+	return 0;
+
+err_put_video:
+	acpi_video_bus_put_devices(video);
+	kfree(video->attached_array);
+err_free_video:
+	kfree(video);
+	device->driver_data = NULL;
+
+	return error;
+}
+
+static int acpi_video_bus_remove(struct acpi_device *device)
+{
+	struct acpi_video_bus *video = NULL;
+
+
+	if (!device || !acpi_driver_data(device))
+		return -EINVAL;
+
+	video = acpi_driver_data(device);
+
+	acpi_video_bus_remove_backlight_notify_handler(video);
+	acpi_video_bus_remove_notify_handler(video);
+	acpi_video_bus_unregister_backlight(video);
+	acpi_video_bus_put_devices(video);
+
+	mutex_lock(&video_list_lock);
+	list_del(&video->entry);
+	mutex_unlock(&video_list_lock);
+
+	kfree(video->attached_array);
+	kfree(video);
+
+	return 0;
+}
+
+static int __init is_i740(struct pci_dev *dev)
+{
+	if (dev->device == 0x00D1)
+		return 1;
+	if (dev->device == 0x7000)
+		return 1;
+	return 0;
+}
+
+static int __init intel_opregion_present(void)
+{
+	int opregion = 0;
+	struct pci_dev *dev = NULL;
+	u32 address;
+
+	for_each_pci_dev(dev) {
+		if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
+			continue;
+		if (dev->vendor != PCI_VENDOR_ID_INTEL)
+			continue;
+		/* We don't want to poke around undefined i740 registers */
+		if (is_i740(dev))
+			continue;
+		pci_read_config_dword(dev, 0xfc, &address);
+		if (!address)
+			continue;
+		opregion = 1;
+	}
+	return opregion;
+}
+
+int acpi_video_register(void)
+{
+	int ret;
+
+	if (register_count) {
+		/*
+		 * if the function of acpi_video_register is already called,
+		 * don't register the acpi_vide_bus again and return no error.
+		 */
+		return 0;
+	}
+
+	mutex_init(&video_list_lock);
+	INIT_LIST_HEAD(&video_bus_head);
+
+	ret = acpi_bus_register_driver(&acpi_video_bus);
+	if (ret)
+		return ret;
+
+	/*
+	 * When the acpi_video_bus is loaded successfully, increase
+	 * the counter reference.
+	 */
+	register_count = 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(acpi_video_register);
+
+void acpi_video_unregister(void)
+{
+	if (!register_count) {
+		/*
+		 * If the acpi video bus is already unloaded, don't
+		 * unload it again and return directly.
+		 */
+		return;
+	}
+	acpi_bus_unregister_driver(&acpi_video_bus);
+
+	register_count = 0;
+
+	return;
+}
+EXPORT_SYMBOL(acpi_video_unregister);
+
+void acpi_video_unregister_backlight(void)
+{
+	struct acpi_video_bus *video;
+
+	if (!register_count)
+		return;
+
+	mutex_lock(&video_list_lock);
+	list_for_each_entry(video, &video_bus_head, entry)
+		acpi_video_bus_unregister_backlight(video);
+	mutex_unlock(&video_list_lock);
+}
+EXPORT_SYMBOL(acpi_video_unregister_backlight);
+
+/*
+ * This is kind of nasty. Hardware using Intel chipsets may require
+ * the video opregion code to be run first in order to initialise
+ * state before any ACPI video calls are made. To handle this we defer
+ * registration of the video class until the opregion code has run.
+ */
+
+static int __init acpi_video_init(void)
+{
+	/*
+	 * Let the module load even if ACPI is disabled (e.g. due to
+	 * a broken BIOS) so that i915.ko can still be loaded on such
+	 * old systems without an AcpiOpRegion.
+	 *
+	 * acpi_video_register() will report -ENODEV later as well due
+	 * to acpi_disabled when i915.ko tries to register itself afterwards.
+	 */
+	if (acpi_disabled)
+		return 0;
+
+	dmi_check_system(video_dmi_table);
+
+	if (intel_opregion_present())
+		return 0;
+
+	return acpi_video_register();
+}
+
+static void __exit acpi_video_exit(void)
+{
+	acpi_video_unregister();
+
+	return;
+}
+
+module_init(acpi_video_init);
+module_exit(acpi_video_exit);
diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index cd49a3982b6a..67c548ad3764 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -712,3 +712,18 @@ bool acpi_check_dsm(acpi_handle handle, const u8 *uuid, int rev, u64 funcs)
 	return false;
 }
 EXPORT_SYMBOL(acpi_check_dsm);
+
+/*
+ * acpi_backlight= handling, this is done here rather then in video_detect.c
+ * because __setup cannot be used in modules.
+ */
+char acpi_video_backlight_string[16];
+EXPORT_SYMBOL(acpi_video_backlight_string);
+
+static int __init acpi_backlight(char *str)
+{
+	strlcpy(acpi_video_backlight_string, str,
+		sizeof(acpi_video_backlight_string));
+	return 1;
+}
+__setup("acpi_backlight=", acpi_backlight);
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
deleted file mode 100644
index 3bc4c68d8bd0..000000000000
--- a/drivers/acpi/video.c
+++ /dev/null
@@ -1,2275 +0,0 @@
-/*
- *  video.c - ACPI Video Driver
- *
- *  Copyright (C) 2004 Luming Yu <luming.yu@intel.com>
- *  Copyright (C) 2004 Bruno Ducrot <ducrot@poupinou.org>
- *  Copyright (C) 2006 Thomas Tuttle <linux-kernel@ttuttle.net>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or (at
- *  your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/input.h>
-#include <linux/backlight.h>
-#include <linux/thermal.h>
-#include <linux/sort.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
-#include <linux/slab.h>
-#include <linux/dmi.h>
-#include <linux/suspend.h>
-#include <linux/acpi.h>
-#include <acpi/video.h>
-#include <asm/uaccess.h>
-
-#include "internal.h"
-
-#define ACPI_VIDEO_BUS_NAME		"Video Bus"
-#define ACPI_VIDEO_DEVICE_NAME		"Video Device"
-#define ACPI_VIDEO_NOTIFY_SWITCH	0x80
-#define ACPI_VIDEO_NOTIFY_PROBE		0x81
-#define ACPI_VIDEO_NOTIFY_CYCLE		0x82
-#define ACPI_VIDEO_NOTIFY_NEXT_OUTPUT	0x83
-#define ACPI_VIDEO_NOTIFY_PREV_OUTPUT	0x84
-
-#define ACPI_VIDEO_NOTIFY_CYCLE_BRIGHTNESS	0x85
-#define	ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS	0x86
-#define ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS	0x87
-#define ACPI_VIDEO_NOTIFY_ZERO_BRIGHTNESS	0x88
-#define ACPI_VIDEO_NOTIFY_DISPLAY_OFF		0x89
-
-#define MAX_NAME_LEN	20
-
-#define _COMPONENT		ACPI_VIDEO_COMPONENT
-ACPI_MODULE_NAME("video");
-
-MODULE_AUTHOR("Bruno Ducrot");
-MODULE_DESCRIPTION("ACPI Video Driver");
-MODULE_LICENSE("GPL");
-
-static bool brightness_switch_enabled = 1;
-module_param(brightness_switch_enabled, bool, 0644);
-
-/*
- * By default, we don't allow duplicate ACPI video bus devices
- * under the same VGA controller
- */
-static bool allow_duplicates;
-module_param(allow_duplicates, bool, 0644);
-
-/*
- * For Windows 8 systems: used to decide if video module
- * should skip registering backlight interface of its own.
- */
-enum {
-	NATIVE_BACKLIGHT_NOT_SET = -1,
-	NATIVE_BACKLIGHT_OFF,
-	NATIVE_BACKLIGHT_ON,
-};
-
-static int use_native_backlight_param = NATIVE_BACKLIGHT_NOT_SET;
-module_param_named(use_native_backlight, use_native_backlight_param, int, 0444);
-static int use_native_backlight_dmi = NATIVE_BACKLIGHT_NOT_SET;
-
-static int disable_backlight_sysfs_if = -1;
-module_param(disable_backlight_sysfs_if, int, 0444);
-
-static int register_count;
-static struct mutex video_list_lock;
-static struct list_head video_bus_head;
-static int acpi_video_bus_add(struct acpi_device *device);
-static int acpi_video_bus_remove(struct acpi_device *device);
-static void acpi_video_bus_notify(struct acpi_device *device, u32 event);
-
-static const struct acpi_device_id video_device_ids[] = {
-	{ACPI_VIDEO_HID, 0},
-	{"", 0},
-};
-MODULE_DEVICE_TABLE(acpi, video_device_ids);
-
-static struct acpi_driver acpi_video_bus = {
-	.name = "video",
-	.class = ACPI_VIDEO_CLASS,
-	.ids = video_device_ids,
-	.ops = {
-		.add = acpi_video_bus_add,
-		.remove = acpi_video_bus_remove,
-		.notify = acpi_video_bus_notify,
-		},
-};
-
-struct acpi_video_bus_flags {
-	u8 multihead:1;		/* can switch video heads */
-	u8 rom:1;		/* can retrieve a video rom */
-	u8 post:1;		/* can configure the head to */
-	u8 reserved:5;
-};
-
-struct acpi_video_bus_cap {
-	u8 _DOS:1;		/* Enable/Disable output switching */
-	u8 _DOD:1;		/* Enumerate all devices attached to display adapter */
-	u8 _ROM:1;		/* Get ROM Data */
-	u8 _GPD:1;		/* Get POST Device */
-	u8 _SPD:1;		/* Set POST Device */
-	u8 _VPO:1;		/* Video POST Options */
-	u8 reserved:2;
-};
-
-struct acpi_video_device_attrib {
-	u32 display_index:4;	/* A zero-based instance of the Display */
-	u32 display_port_attachment:4;	/* This field differentiates the display type */
-	u32 display_type:4;	/* Describe the specific type in use */
-	u32 vendor_specific:4;	/* Chipset Vendor Specific */
-	u32 bios_can_detect:1;	/* BIOS can detect the device */
-	u32 depend_on_vga:1;	/* Non-VGA output device whose power is related to
-				   the VGA device. */
-	u32 pipe_id:3;		/* For VGA multiple-head devices. */
-	u32 reserved:10;	/* Must be 0 */
-	u32 device_id_scheme:1;	/* Device ID Scheme */
-};
-
-struct acpi_video_enumerated_device {
-	union {
-		u32 int_val;
-		struct acpi_video_device_attrib attrib;
-	} value;
-	struct acpi_video_device *bind_info;
-};
-
-struct acpi_video_bus {
-	struct acpi_device *device;
-	bool backlight_registered;
-	bool backlight_notifier_registered;
-	u8 dos_setting;
-	struct acpi_video_enumerated_device *attached_array;
-	u8 attached_count;
-	u8 child_count;
-	struct acpi_video_bus_cap cap;
-	struct acpi_video_bus_flags flags;
-	struct list_head video_device_list;
-	struct mutex device_list_lock;	/* protects video_device_list */
-	struct list_head entry;
-	struct input_dev *input;
-	char phys[32];	/* for input device */
-	struct notifier_block pm_nb;
-	struct notifier_block backlight_nb;
-};
-
-struct acpi_video_device_flags {
-	u8 crt:1;
-	u8 lcd:1;
-	u8 tvout:1;
-	u8 dvi:1;
-	u8 bios:1;
-	u8 unknown:1;
-	u8 notify:1;
-	u8 reserved:1;
-};
-
-struct acpi_video_device_cap {
-	u8 _ADR:1;		/* Return the unique ID */
-	u8 _BCL:1;		/* Query list of brightness control levels supported */
-	u8 _BCM:1;		/* Set the brightness level */
-	u8 _BQC:1;		/* Get current brightness level */
-	u8 _BCQ:1;		/* Some buggy BIOS uses _BCQ instead of _BQC */
-	u8 _DDC:1;		/* Return the EDID for this device */
-};
-
-struct acpi_video_brightness_flags {
-	u8 _BCL_no_ac_battery_levels:1;	/* no AC/Battery levels in _BCL */
-	u8 _BCL_reversed:1;		/* _BCL package is in a reversed order */
-	u8 _BQC_use_index:1;		/* _BQC returns an index value */
-};
-
-struct acpi_video_device_brightness {
-	int curr;
-	int count;
-	int *levels;
-	struct acpi_video_brightness_flags flags;
-};
-
-struct acpi_video_device {
-	unsigned long device_id;
-	struct acpi_video_device_flags flags;
-	struct acpi_video_device_cap cap;
-	struct list_head entry;
-	struct delayed_work switch_brightness_work;
-	int switch_brightness_event;
-	struct acpi_video_bus *video;
-	struct acpi_device *dev;
-	struct acpi_video_device_brightness *brightness;
-	struct backlight_device *backlight;
-	struct thermal_cooling_device *cooling_dev;
-};
-
-static const char device_decode[][30] = {
-	"motherboard VGA device",
-	"PCI VGA device",
-	"AGP VGA device",
-	"UNKNOWN",
-};
-
-static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data);
-static void acpi_video_device_rebind(struct acpi_video_bus *video);
-static void acpi_video_device_bind(struct acpi_video_bus *video,
-				   struct acpi_video_device *device);
-static int acpi_video_device_enumerate(struct acpi_video_bus *video);
-static int acpi_video_device_lcd_set_level(struct acpi_video_device *device,
-			int level);
-static int acpi_video_device_lcd_get_level_current(
-			struct acpi_video_device *device,
-			unsigned long long *level, bool raw);
-static int acpi_video_get_next_level(struct acpi_video_device *device,
-				     u32 level_current, u32 event);
-static void acpi_video_switch_brightness(struct work_struct *work);
-
-static bool acpi_video_use_native_backlight(void)
-{
-	if (use_native_backlight_param != NATIVE_BACKLIGHT_NOT_SET)
-		return use_native_backlight_param;
-	else if (use_native_backlight_dmi != NATIVE_BACKLIGHT_NOT_SET)
-		return use_native_backlight_dmi;
-	return acpi_osi_is_win8();
-}
-
-bool acpi_video_verify_backlight_support(void)
-{
-	if (acpi_video_use_native_backlight() &&
-	    backlight_device_registered(BACKLIGHT_RAW))
-		return false;
-	return acpi_video_backlight_support();
-}
-EXPORT_SYMBOL_GPL(acpi_video_verify_backlight_support);
-
-/* backlight device sysfs support */
-static int acpi_video_get_brightness(struct backlight_device *bd)
-{
-	unsigned long long cur_level;
-	int i;
-	struct acpi_video_device *vd = bl_get_data(bd);
-
-	if (acpi_video_device_lcd_get_level_current(vd, &cur_level, false))
-		return -EINVAL;
-	for (i = 2; i < vd->brightness->count; i++) {
-		if (vd->brightness->levels[i] == cur_level)
-			/*
-			 * The first two entries are special - see page 575
-			 * of the ACPI spec 3.0
-			 */
-			return i - 2;
-	}
-	return 0;
-}
-
-static int acpi_video_set_brightness(struct backlight_device *bd)
-{
-	int request_level = bd->props.brightness + 2;
-	struct acpi_video_device *vd = bl_get_data(bd);
-
-	cancel_delayed_work(&vd->switch_brightness_work);
-	return acpi_video_device_lcd_set_level(vd,
-				vd->brightness->levels[request_level]);
-}
-
-static const struct backlight_ops acpi_backlight_ops = {
-	.get_brightness = acpi_video_get_brightness,
-	.update_status  = acpi_video_set_brightness,
-};
-
-/* thermal cooling device callbacks */
-static int video_get_max_state(struct thermal_cooling_device *cooling_dev, unsigned
-			       long *state)
-{
-	struct acpi_device *device = cooling_dev->devdata;
-	struct acpi_video_device *video = acpi_driver_data(device);
-
-	*state = video->brightness->count - 3;
-	return 0;
-}
-
-static int video_get_cur_state(struct thermal_cooling_device *cooling_dev, unsigned
-			       long *state)
-{
-	struct acpi_device *device = cooling_dev->devdata;
-	struct acpi_video_device *video = acpi_driver_data(device);
-	unsigned long long level;
-	int offset;
-
-	if (acpi_video_device_lcd_get_level_current(video, &level, false))
-		return -EINVAL;
-	for (offset = 2; offset < video->brightness->count; offset++)
-		if (level == video->brightness->levels[offset]) {
-			*state = video->brightness->count - offset - 1;
-			return 0;
-		}
-
-	return -EINVAL;
-}
-
-static int
-video_set_cur_state(struct thermal_cooling_device *cooling_dev, unsigned long state)
-{
-	struct acpi_device *device = cooling_dev->devdata;
-	struct acpi_video_device *video = acpi_driver_data(device);
-	int level;
-
-	if (state >= video->brightness->count - 2)
-		return -EINVAL;
-
-	state = video->brightness->count - state;
-	level = video->brightness->levels[state - 1];
-	return acpi_video_device_lcd_set_level(video, level);
-}
-
-static const struct thermal_cooling_device_ops video_cooling_ops = {
-	.get_max_state = video_get_max_state,
-	.get_cur_state = video_get_cur_state,
-	.set_cur_state = video_set_cur_state,
-};
-
-/*
- * --------------------------------------------------------------------------
- *                             Video Management
- * --------------------------------------------------------------------------
- */
-
-static int
-acpi_video_device_lcd_query_levels(struct acpi_video_device *device,
-				   union acpi_object **levels)
-{
-	int status;
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	union acpi_object *obj;
-
-
-	*levels = NULL;
-
-	status = acpi_evaluate_object(device->dev->handle, "_BCL", NULL, &buffer);
-	if (!ACPI_SUCCESS(status))
-		return status;
-	obj = (union acpi_object *)buffer.pointer;
-	if (!obj || (obj->type != ACPI_TYPE_PACKAGE)) {
-		printk(KERN_ERR PREFIX "Invalid _BCL data\n");
-		status = -EFAULT;
-		goto err;
-	}
-
-	*levels = obj;
-
-	return 0;
-
-err:
-	kfree(buffer.pointer);
-
-	return status;
-}
-
-static int
-acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level)
-{
-	int status;
-	int state;
-
-	status = acpi_execute_simple_method(device->dev->handle,
-					    "_BCM", level);
-	if (ACPI_FAILURE(status)) {
-		ACPI_ERROR((AE_INFO, "Evaluating _BCM failed"));
-		return -EIO;
-	}
-
-	device->brightness->curr = level;
-	for (state = 2; state < device->brightness->count; state++)
-		if (level == device->brightness->levels[state]) {
-			if (device->backlight)
-				device->backlight->props.brightness = state - 2;
-			return 0;
-		}
-
-	ACPI_ERROR((AE_INFO, "Current brightness invalid"));
-	return -EINVAL;
-}
-
-/*
- * For some buggy _BQC methods, we need to add a constant value to
- * the _BQC return value to get the actual current brightness level
- */
-
-static int bqc_offset_aml_bug_workaround;
-static int __init video_set_bqc_offset(const struct dmi_system_id *d)
-{
-	bqc_offset_aml_bug_workaround = 9;
-	return 0;
-}
-
-static int __init video_disable_native_backlight(const struct dmi_system_id *d)
-{
-	use_native_backlight_dmi = NATIVE_BACKLIGHT_OFF;
-	return 0;
-}
-
-static int __init video_enable_native_backlight(const struct dmi_system_id *d)
-{
-	use_native_backlight_dmi = NATIVE_BACKLIGHT_ON;
-	return 0;
-}
-
-static int __init video_disable_backlight_sysfs_if(
-	const struct dmi_system_id *d)
-{
-	if (disable_backlight_sysfs_if == -1)
-		disable_backlight_sysfs_if = 1;
-	return 0;
-}
-
-static struct dmi_system_id video_dmi_table[] __initdata = {
-	/*
-	 * Broken _BQC workaround http://bugzilla.kernel.org/show_bug.cgi?id=13121
-	 */
-	{
-	 .callback = video_set_bqc_offset,
-	 .ident = "Acer Aspire 5720",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5720"),
-		},
-	},
-	{
-	 .callback = video_set_bqc_offset,
-	 .ident = "Acer Aspire 5710Z",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5710Z"),
-		},
-	},
-	{
-	 .callback = video_set_bqc_offset,
-	 .ident = "eMachines E510",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "EMACHINES"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "eMachines E510"),
-		},
-	},
-	{
-	 .callback = video_set_bqc_offset,
-	 .ident = "Acer Aspire 5315",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5315"),
-		},
-	},
-	{
-	 .callback = video_set_bqc_offset,
-	 .ident = "Acer Aspire 7720",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "Acer"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 7720"),
-		},
-	},
-
-	/*
-	 * These models have a working acpi_video backlight control, and using
-	 * native backlight causes a regression where backlight does not work
-	 * when userspace is not handling brightness key events. Disable
-	 * native_backlight on these to fix this:
-	 * https://bugzilla.kernel.org/show_bug.cgi?id=81691
-	 */
-	{
-	 .callback = video_disable_native_backlight,
-	 .ident = "ThinkPad T420",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T420"),
-		},
-	},
-	{
-	 .callback = video_disable_native_backlight,
-	 .ident = "ThinkPad T520",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T520"),
-		},
-	},
-	{
-	 .callback = video_disable_native_backlight,
-	 .ident = "ThinkPad X201s",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X201s"),
-		},
-	},
-
-	/* The native backlight controls do not work on some older machines */
-	{
-	 /* https://bugs.freedesktop.org/show_bug.cgi?id=81515 */
-	 .callback = video_disable_native_backlight,
-	 .ident = "HP ENVY 15 Notebook",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP ENVY 15 Notebook PC"),
-		},
-	},
-
-	{
-	 .callback = video_disable_native_backlight,
-	 .ident = "SAMSUNG 870Z5E/880Z5E/680Z5E",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "870Z5E/880Z5E/680Z5E"),
-		},
-	},
-	{
-	 .callback = video_disable_native_backlight,
-	 .ident = "SAMSUNG 370R4E/370R4V/370R5E/3570RE/370R5V",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "370R4E/370R4V/370R5E/3570RE/370R5V"),
-		},
-	},
-	{
-	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1186097 */
-	 .callback = video_disable_native_backlight,
-	 .ident = "SAMSUNG 3570R/370R/470R/450R/510R/4450RV",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "3570R/370R/470R/450R/510R/4450RV"),
-		},
-	},
-	{
-	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1094948 */
-	 .callback = video_disable_native_backlight,
-	 .ident = "SAMSUNG 730U3E/740U3E",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "730U3E/740U3E"),
-		},
-	},
-	{
-	 /* https://bugs.freedesktop.org/show_bug.cgi?id=87286 */
-	 .callback = video_disable_native_backlight,
-	 .ident = "SAMSUNG 900X3C/900X3D/900X3E/900X4C/900X4D",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "SAMSUNG ELECTRONICS CO., LTD."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "900X3C/900X3D/900X3E/900X4C/900X4D"),
-		},
-	},
-
-	{
-	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1163574 */
-	 .callback = video_disable_native_backlight,
-	 .ident = "Dell XPS15 L521X",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "XPS L521X"),
-		},
-	},
-
-	/* Non win8 machines which need native backlight nevertheless */
-	{
-	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1187004 */
-	 .callback = video_enable_native_backlight,
-	 .ident = "Lenovo Ideapad Z570",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "102434U"),
-		},
-	},
-	{
-	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1217249 */
-	 .callback = video_enable_native_backlight,
-	 .ident = "Apple MacBook Pro 12,1",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro12,1"),
-		},
-	},
-
-	/*
-	 * Some machines have a broken acpi-video interface for brightness
-	 * control, but still need an acpi_video_device_lcd_set_level() call
-	 * on resume to turn the backlight power on.  We Enable backlight
-	 * control on these systems, but do not register a backlight sysfs
-	 * as brightness control does not work.
-	 */
-	{
-	 /* https://bugs.freedesktop.org/show_bug.cgi?id=82634 */
-	 .callback = video_disable_backlight_sysfs_if,
-	 .ident = "Toshiba Portege R830",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "PORTEGE R830"),
-		},
-	},
-	{}
-};
-
-static unsigned long long
-acpi_video_bqc_value_to_level(struct acpi_video_device *device,
-			      unsigned long long bqc_value)
-{
-	unsigned long long level;
-
-	if (device->brightness->flags._BQC_use_index) {
-		/*
-		 * _BQC returns an index that doesn't account for
-		 * the first 2 items with special meaning, so we need
-		 * to compensate for that by offsetting ourselves
-		 */
-		if (device->brightness->flags._BCL_reversed)
-			bqc_value = device->brightness->count - 3 - bqc_value;
-
-		level = device->brightness->levels[bqc_value + 2];
-	} else {
-		level = bqc_value;
-	}
-
-	level += bqc_offset_aml_bug_workaround;
-
-	return level;
-}
-
-static int
-acpi_video_device_lcd_get_level_current(struct acpi_video_device *device,
-					unsigned long long *level, bool raw)
-{
-	acpi_status status = AE_OK;
-	int i;
-
-	if (device->cap._BQC || device->cap._BCQ) {
-		char *buf = device->cap._BQC ? "_BQC" : "_BCQ";
-
-		status = acpi_evaluate_integer(device->dev->handle, buf,
-						NULL, level);
-		if (ACPI_SUCCESS(status)) {
-			if (raw) {
-				/*
-				 * Caller has indicated he wants the raw
-				 * value returned by _BQC, so don't furtherly
-				 * mess with the value.
-				 */
-				return 0;
-			}
-
-			*level = acpi_video_bqc_value_to_level(device, *level);
-
-			for (i = 2; i < device->brightness->count; i++)
-				if (device->brightness->levels[i] == *level) {
-					device->brightness->curr = *level;
-					return 0;
-				}
-			/*
-			 * BQC returned an invalid level.
-			 * Stop using it.
-			 */
-			ACPI_WARNING((AE_INFO,
-				      "%s returned an invalid level",
-				      buf));
-			device->cap._BQC = device->cap._BCQ = 0;
-		} else {
-			/*
-			 * Fixme:
-			 * should we return an error or ignore this failure?
-			 * dev->brightness->curr is a cached value which stores
-			 * the correct current backlight level in most cases.
-			 * ACPI video backlight still works w/ buggy _BQC.
-			 * http://bugzilla.kernel.org/show_bug.cgi?id=12233
-			 */
-			ACPI_WARNING((AE_INFO, "Evaluating %s failed", buf));
-			device->cap._BQC = device->cap._BCQ = 0;
-		}
-	}
-
-	*level = device->brightness->curr;
-	return 0;
-}
-
-static int
-acpi_video_device_EDID(struct acpi_video_device *device,
-		       union acpi_object **edid, ssize_t length)
-{
-	int status;
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	union acpi_object *obj;
-	union acpi_object arg0 = { ACPI_TYPE_INTEGER };
-	struct acpi_object_list args = { 1, &arg0 };
-
-
-	*edid = NULL;
-
-	if (!device)
-		return -ENODEV;
-	if (length == 128)
-		arg0.integer.value = 1;
-	else if (length == 256)
-		arg0.integer.value = 2;
-	else
-		return -EINVAL;
-
-	status = acpi_evaluate_object(device->dev->handle, "_DDC", &args, &buffer);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	obj = buffer.pointer;
-
-	if (obj && obj->type == ACPI_TYPE_BUFFER)
-		*edid = obj;
-	else {
-		printk(KERN_ERR PREFIX "Invalid _DDC data\n");
-		status = -EFAULT;
-		kfree(obj);
-	}
-
-	return status;
-}
-
-/* bus */
-
-/*
- *  Arg:
- *	video		: video bus device pointer
- *	bios_flag	:
- *		0.	The system BIOS should NOT automatically switch(toggle)
- *			the active display output.
- *		1.	The system BIOS should automatically switch (toggle) the
- *			active display output. No switch event.
- *		2.	The _DGS value should be locked.
- *		3.	The system BIOS should not automatically switch (toggle) the
- *			active display output, but instead generate the display switch
- *			event notify code.
- *	lcd_flag	:
- *		0.	The system BIOS should automatically control the brightness level
- *			of the LCD when the power changes from AC to DC
- *		1.	The system BIOS should NOT automatically control the brightness
- *			level of the LCD when the power changes from AC to DC.
- *  Return Value:
- *		-EINVAL	wrong arg.
- */
-
-static int
-acpi_video_bus_DOS(struct acpi_video_bus *video, int bios_flag, int lcd_flag)
-{
-	acpi_status status;
-
-	if (!video->cap._DOS)
-		return 0;
-
-	if (bios_flag < 0 || bios_flag > 3 || lcd_flag < 0 || lcd_flag > 1)
-		return -EINVAL;
-	video->dos_setting = (lcd_flag << 2) | bios_flag;
-	status = acpi_execute_simple_method(video->device->handle, "_DOS",
-					    (lcd_flag << 2) | bios_flag);
-	if (ACPI_FAILURE(status))
-		return -EIO;
-
-	return 0;
-}
-
-/*
- * Simple comparison function used to sort backlight levels.
- */
-
-static int
-acpi_video_cmp_level(const void *a, const void *b)
-{
-	return *(int *)a - *(int *)b;
-}
-
-/*
- * Decides if _BQC/_BCQ for this system is usable
- *
- * We do this by changing the level first and then read out the current
- * brightness level, if the value does not match, find out if it is using
- * index. If not, clear the _BQC/_BCQ capability.
- */
-static int acpi_video_bqc_quirk(struct acpi_video_device *device,
-				int max_level, int current_level)
-{
-	struct acpi_video_device_brightness *br = device->brightness;
-	int result;
-	unsigned long long level;
-	int test_level;
-
-	/* don't mess with existing known broken systems */
-	if (bqc_offset_aml_bug_workaround)
-		return 0;
-
-	/*
-	 * Some systems always report current brightness level as maximum
-	 * through _BQC, we need to test another value for them.
-	 */
-	test_level = current_level == max_level ? br->levels[3] : max_level;
-
-	result = acpi_video_device_lcd_set_level(device, test_level);
-	if (result)
-		return result;
-
-	result = acpi_video_device_lcd_get_level_current(device, &level, true);
-	if (result)
-		return result;
-
-	if (level != test_level) {
-		/* buggy _BQC found, need to find out if it uses index */
-		if (level < br->count) {
-			if (br->flags._BCL_reversed)
-				level = br->count - 3 - level;
-			if (br->levels[level + 2] == test_level)
-				br->flags._BQC_use_index = 1;
-		}
-
-		if (!br->flags._BQC_use_index)
-			device->cap._BQC = device->cap._BCQ = 0;
-	}
-
-	return 0;
-}
-
-
-/*
- *  Arg:
- *	device	: video output device (LCD, CRT, ..)
- *
- *  Return Value:
- *	Maximum brightness level
- *
- *  Allocate and initialize device->brightness.
- */
-
-static int
-acpi_video_init_brightness(struct acpi_video_device *device)
-{
-	union acpi_object *obj = NULL;
-	int i, max_level = 0, count = 0, level_ac_battery = 0;
-	unsigned long long level, level_old;
-	union acpi_object *o;
-	struct acpi_video_device_brightness *br = NULL;
-	int result = -EINVAL;
-	u32 value;
-
-	if (!ACPI_SUCCESS(acpi_video_device_lcd_query_levels(device, &obj))) {
-		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Could not query available "
-						"LCD brightness level\n"));
-		goto out;
-	}
-
-	if (obj->package.count < 2)
-		goto out;
-
-	br = kzalloc(sizeof(*br), GFP_KERNEL);
-	if (!br) {
-		printk(KERN_ERR "can't allocate memory\n");
-		result = -ENOMEM;
-		goto out;
-	}
-
-	br->levels = kmalloc((obj->package.count + 2) * sizeof *(br->levels),
-				GFP_KERNEL);
-	if (!br->levels) {
-		result = -ENOMEM;
-		goto out_free;
-	}
-
-	for (i = 0; i < obj->package.count; i++) {
-		o = (union acpi_object *)&obj->package.elements[i];
-		if (o->type != ACPI_TYPE_INTEGER) {
-			printk(KERN_ERR PREFIX "Invalid data\n");
-			continue;
-		}
-		value = (u32) o->integer.value;
-		/* Skip duplicate entries */
-		if (count > 2 && br->levels[count - 1] == value)
-			continue;
-
-		br->levels[count] = value;
-
-		if (br->levels[count] > max_level)
-			max_level = br->levels[count];
-		count++;
-	}
-
-	/*
-	 * some buggy BIOS don't export the levels
-	 * when machine is on AC/Battery in _BCL package.
-	 * In this case, the first two elements in _BCL packages
-	 * are also supported brightness levels that OS should take care of.
-	 */
-	for (i = 2; i < count; i++) {
-		if (br->levels[i] == br->levels[0])
-			level_ac_battery++;
-		if (br->levels[i] == br->levels[1])
-			level_ac_battery++;
-	}
-
-	if (level_ac_battery < 2) {
-		level_ac_battery = 2 - level_ac_battery;
-		br->flags._BCL_no_ac_battery_levels = 1;
-		for (i = (count - 1 + level_ac_battery); i >= 2; i--)
-			br->levels[i] = br->levels[i - level_ac_battery];
-		count += level_ac_battery;
-	} else if (level_ac_battery > 2)
-		ACPI_ERROR((AE_INFO, "Too many duplicates in _BCL package"));
-
-	/* Check if the _BCL package is in a reversed order */
-	if (max_level == br->levels[2]) {
-		br->flags._BCL_reversed = 1;
-		sort(&br->levels[2], count - 2, sizeof(br->levels[2]),
-			acpi_video_cmp_level, NULL);
-	} else if (max_level != br->levels[count - 1])
-		ACPI_ERROR((AE_INFO,
-			    "Found unordered _BCL package"));
-
-	br->count = count;
-	device->brightness = br;
-
-	/* _BQC uses INDEX while _BCL uses VALUE in some laptops */
-	br->curr = level = max_level;
-
-	if (!device->cap._BQC)
-		goto set_level;
-
-	result = acpi_video_device_lcd_get_level_current(device,
-							 &level_old, true);
-	if (result)
-		goto out_free_levels;
-
-	result = acpi_video_bqc_quirk(device, max_level, level_old);
-	if (result)
-		goto out_free_levels;
-	/*
-	 * cap._BQC may get cleared due to _BQC is found to be broken
-	 * in acpi_video_bqc_quirk, so check again here.
-	 */
-	if (!device->cap._BQC)
-		goto set_level;
-
-	level = acpi_video_bqc_value_to_level(device, level_old);
-	/*
-	 * On some buggy laptops, _BQC returns an uninitialized
-	 * value when invoked for the first time, i.e.
-	 * level_old is invalid (no matter whether it's a level
-	 * or an index). Set the backlight to max_level in this case.
-	 */
-	for (i = 2; i < br->count; i++)
-		if (level == br->levels[i])
-			break;
-	if (i == br->count || !level)
-		level = max_level;
-
-set_level:
-	result = acpi_video_device_lcd_set_level(device, level);
-	if (result)
-		goto out_free_levels;
-
-	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
-			  "found %d brightness levels\n", count - 2));
-	kfree(obj);
-	return result;
-
-out_free_levels:
-	kfree(br->levels);
-out_free:
-	kfree(br);
-out:
-	device->brightness = NULL;
-	kfree(obj);
-	return result;
-}
-
-/*
- *  Arg:
- *	device	: video output device (LCD, CRT, ..)
- *
- *  Return Value:
- *	None
- *
- *  Find out all required AML methods defined under the output
- *  device.
- */
-
-static void acpi_video_device_find_cap(struct acpi_video_device *device)
-{
-	if (acpi_has_method(device->dev->handle, "_ADR"))
-		device->cap._ADR = 1;
-	if (acpi_has_method(device->dev->handle, "_BCL"))
-		device->cap._BCL = 1;
-	if (acpi_has_method(device->dev->handle, "_BCM"))
-		device->cap._BCM = 1;
-	if (acpi_has_method(device->dev->handle, "_BQC")) {
-		device->cap._BQC = 1;
-	} else if (acpi_has_method(device->dev->handle, "_BCQ")) {
-		printk(KERN_WARNING FW_BUG "_BCQ is used instead of _BQC\n");
-		device->cap._BCQ = 1;
-	}
-
-	if (acpi_has_method(device->dev->handle, "_DDC"))
-		device->cap._DDC = 1;
-}
-
-/*
- *  Arg:
- *	device	: video output device (VGA)
- *
- *  Return Value:
- *	None
- *
- *  Find out all required AML methods defined under the video bus device.
- */
-
-static void acpi_video_bus_find_cap(struct acpi_video_bus *video)
-{
-	if (acpi_has_method(video->device->handle, "_DOS"))
-		video->cap._DOS = 1;
-	if (acpi_has_method(video->device->handle, "_DOD"))
-		video->cap._DOD = 1;
-	if (acpi_has_method(video->device->handle, "_ROM"))
-		video->cap._ROM = 1;
-	if (acpi_has_method(video->device->handle, "_GPD"))
-		video->cap._GPD = 1;
-	if (acpi_has_method(video->device->handle, "_SPD"))
-		video->cap._SPD = 1;
-	if (acpi_has_method(video->device->handle, "_VPO"))
-		video->cap._VPO = 1;
-}
-
-/*
- * Check whether the video bus device has required AML method to
- * support the desired features
- */
-
-static int acpi_video_bus_check(struct acpi_video_bus *video)
-{
-	acpi_status status = -ENOENT;
-	struct pci_dev *dev;
-
-	if (!video)
-		return -EINVAL;
-
-	dev = acpi_get_pci_dev(video->device->handle);
-	if (!dev)
-		return -ENODEV;
-	pci_dev_put(dev);
-
-	/*
-	 * Since there is no HID, CID and so on for VGA driver, we have
-	 * to check well known required nodes.
-	 */
-
-	/* Does this device support video switching? */
-	if (video->cap._DOS || video->cap._DOD) {
-		if (!video->cap._DOS) {
-			printk(KERN_WARNING FW_BUG
-				"ACPI(%s) defines _DOD but not _DOS\n",
-				acpi_device_bid(video->device));
-		}
-		video->flags.multihead = 1;
-		status = 0;
-	}
-
-	/* Does this device support retrieving a video ROM? */
-	if (video->cap._ROM) {
-		video->flags.rom = 1;
-		status = 0;
-	}
-
-	/* Does this device support configuring which video device to POST? */
-	if (video->cap._GPD && video->cap._SPD && video->cap._VPO) {
-		video->flags.post = 1;
-		status = 0;
-	}
-
-	return status;
-}
-
-/*
- * --------------------------------------------------------------------------
- *                               Driver Interface
- * --------------------------------------------------------------------------
- */
-
-/* device interface */
-static struct acpi_video_device_attrib *
-acpi_video_get_device_attr(struct acpi_video_bus *video, unsigned long device_id)
-{
-	struct acpi_video_enumerated_device *ids;
-	int i;
-
-	for (i = 0; i < video->attached_count; i++) {
-		ids = &video->attached_array[i];
-		if ((ids->value.int_val & 0xffff) == device_id)
-			return &ids->value.attrib;
-	}
-
-	return NULL;
-}
-
-static int
-acpi_video_get_device_type(struct acpi_video_bus *video,
-			   unsigned long device_id)
-{
-	struct acpi_video_enumerated_device *ids;
-	int i;
-
-	for (i = 0; i < video->attached_count; i++) {
-		ids = &video->attached_array[i];
-		if ((ids->value.int_val & 0xffff) == device_id)
-			return ids->value.int_val;
-	}
-
-	return 0;
-}
-
-static int
-acpi_video_bus_get_one_device(struct acpi_device *device,
-			      struct acpi_video_bus *video)
-{
-	unsigned long long device_id;
-	int status, device_type;
-	struct acpi_video_device *data;
-	struct acpi_video_device_attrib *attribute;
-
-	status =
-	    acpi_evaluate_integer(device->handle, "_ADR", NULL, &device_id);
-	/* Some device omits _ADR, we skip them instead of fail */
-	if (ACPI_FAILURE(status))
-		return 0;
-
-	data = kzalloc(sizeof(struct acpi_video_device), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	strcpy(acpi_device_name(device), ACPI_VIDEO_DEVICE_NAME);
-	strcpy(acpi_device_class(device), ACPI_VIDEO_CLASS);
-	device->driver_data = data;
-
-	data->device_id = device_id;
-	data->video = video;
-	data->dev = device;
-	INIT_DELAYED_WORK(&data->switch_brightness_work,
-			  acpi_video_switch_brightness);
-
-	attribute = acpi_video_get_device_attr(video, device_id);
-
-	if (attribute && attribute->device_id_scheme) {
-		switch (attribute->display_type) {
-		case ACPI_VIDEO_DISPLAY_CRT:
-			data->flags.crt = 1;
-			break;
-		case ACPI_VIDEO_DISPLAY_TV:
-			data->flags.tvout = 1;
-			break;
-		case ACPI_VIDEO_DISPLAY_DVI:
-			data->flags.dvi = 1;
-			break;
-		case ACPI_VIDEO_DISPLAY_LCD:
-			data->flags.lcd = 1;
-			break;
-		default:
-			data->flags.unknown = 1;
-			break;
-		}
-		if (attribute->bios_can_detect)
-			data->flags.bios = 1;
-	} else {
-		/* Check for legacy IDs */
-		device_type = acpi_video_get_device_type(video, device_id);
-		/* Ignore bits 16 and 18-20 */
-		switch (device_type & 0xffe2ffff) {
-		case ACPI_VIDEO_DISPLAY_LEGACY_MONITOR:
-			data->flags.crt = 1;
-			break;
-		case ACPI_VIDEO_DISPLAY_LEGACY_PANEL:
-			data->flags.lcd = 1;
-			break;
-		case ACPI_VIDEO_DISPLAY_LEGACY_TV:
-			data->flags.tvout = 1;
-			break;
-		default:
-			data->flags.unknown = 1;
-		}
-	}
-
-	acpi_video_device_bind(video, data);
-	acpi_video_device_find_cap(data);
-
-	mutex_lock(&video->device_list_lock);
-	list_add_tail(&data->entry, &video->video_device_list);
-	mutex_unlock(&video->device_list_lock);
-
-	return status;
-}
-
-/*
- *  Arg:
- *	video	: video bus device
- *
- *  Return:
- *	none
- *
- *  Enumerate the video device list of the video bus,
- *  bind the ids with the corresponding video devices
- *  under the video bus.
- */
-
-static void acpi_video_device_rebind(struct acpi_video_bus *video)
-{
-	struct acpi_video_device *dev;
-
-	mutex_lock(&video->device_list_lock);
-
-	list_for_each_entry(dev, &video->video_device_list, entry)
-		acpi_video_device_bind(video, dev);
-
-	mutex_unlock(&video->device_list_lock);
-}
-
-/*
- *  Arg:
- *	video	: video bus device
- *	device	: video output device under the video
- *		bus
- *
- *  Return:
- *	none
- *
- *  Bind the ids with the corresponding video devices
- *  under the video bus.
- */
-
-static void
-acpi_video_device_bind(struct acpi_video_bus *video,
-		       struct acpi_video_device *device)
-{
-	struct acpi_video_enumerated_device *ids;
-	int i;
-
-	for (i = 0; i < video->attached_count; i++) {
-		ids = &video->attached_array[i];
-		if (device->device_id == (ids->value.int_val & 0xffff)) {
-			ids->bind_info = device;
-			ACPI_DEBUG_PRINT((ACPI_DB_INFO, "device_bind %d\n", i));
-		}
-	}
-}
-
-static bool acpi_video_device_in_dod(struct acpi_video_device *device)
-{
-	struct acpi_video_bus *video = device->video;
-	int i;
-
-	/*
-	 * If we have a broken _DOD or we have more than 8 output devices
-	 * under the graphics controller node that we can't proper deal with
-	 * in the operation region code currently, no need to test.
-	 */
-	if (!video->attached_count || video->child_count > 8)
-		return true;
-
-	for (i = 0; i < video->attached_count; i++) {
-		if ((video->attached_array[i].value.int_val & 0xfff) ==
-		    (device->device_id & 0xfff))
-			return true;
-	}
-
-	return false;
-}
-
-/*
- *  Arg:
- *	video	: video bus device
- *
- *  Return:
- *	< 0	: error
- *
- *  Call _DOD to enumerate all devices attached to display adapter
- *
- */
-
-static int acpi_video_device_enumerate(struct acpi_video_bus *video)
-{
-	int status;
-	int count;
-	int i;
-	struct acpi_video_enumerated_device *active_list;
-	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
-	union acpi_object *dod = NULL;
-	union acpi_object *obj;
-
-	status = acpi_evaluate_object(video->device->handle, "_DOD", NULL, &buffer);
-	if (!ACPI_SUCCESS(status)) {
-		ACPI_EXCEPTION((AE_INFO, status, "Evaluating _DOD"));
-		return status;
-	}
-
-	dod = buffer.pointer;
-	if (!dod || (dod->type != ACPI_TYPE_PACKAGE)) {
-		ACPI_EXCEPTION((AE_INFO, status, "Invalid _DOD data"));
-		status = -EFAULT;
-		goto out;
-	}
-
-	ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d video heads in _DOD\n",
-			  dod->package.count));
-
-	active_list = kcalloc(1 + dod->package.count,
-			      sizeof(struct acpi_video_enumerated_device),
-			      GFP_KERNEL);
-	if (!active_list) {
-		status = -ENOMEM;
-		goto out;
-	}
-
-	count = 0;
-	for (i = 0; i < dod->package.count; i++) {
-		obj = &dod->package.elements[i];
-
-		if (obj->type != ACPI_TYPE_INTEGER) {
-			printk(KERN_ERR PREFIX
-				"Invalid _DOD data in element %d\n", i);
-			continue;
-		}
-
-		active_list[count].value.int_val = obj->integer.value;
-		active_list[count].bind_info = NULL;
-		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "dod element[%d] = %d\n", i,
-				  (int)obj->integer.value));
-		count++;
-	}
-
-	kfree(video->attached_array);
-
-	video->attached_array = active_list;
-	video->attached_count = count;
-
-out:
-	kfree(buffer.pointer);
-	return status;
-}
-
-static int
-acpi_video_get_next_level(struct acpi_video_device *device,
-			  u32 level_current, u32 event)
-{
-	int min, max, min_above, max_below, i, l, delta = 255;
-	max = max_below = 0;
-	min = min_above = 255;
-	/* Find closest level to level_current */
-	for (i = 2; i < device->brightness->count; i++) {
-		l = device->brightness->levels[i];
-		if (abs(l - level_current) < abs(delta)) {
-			delta = l - level_current;
-			if (!delta)
-				break;
-		}
-	}
-	/* Ajust level_current to closest available level */
-	level_current += delta;
-	for (i = 2; i < device->brightness->count; i++) {
-		l = device->brightness->levels[i];
-		if (l < min)
-			min = l;
-		if (l > max)
-			max = l;
-		if (l < min_above && l > level_current)
-			min_above = l;
-		if (l > max_below && l < level_current)
-			max_below = l;
-	}
-
-	switch (event) {
-	case ACPI_VIDEO_NOTIFY_CYCLE_BRIGHTNESS:
-		return (level_current < max) ? min_above : min;
-	case ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS:
-		return (level_current < max) ? min_above : max;
-	case ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS:
-		return (level_current > min) ? max_below : min;
-	case ACPI_VIDEO_NOTIFY_ZERO_BRIGHTNESS:
-	case ACPI_VIDEO_NOTIFY_DISPLAY_OFF:
-		return 0;
-	default:
-		return level_current;
-	}
-}
-
-static void
-acpi_video_switch_brightness(struct work_struct *work)
-{
-	struct acpi_video_device *device = container_of(to_delayed_work(work),
-			     struct acpi_video_device, switch_brightness_work);
-	unsigned long long level_current, level_next;
-	int event = device->switch_brightness_event;
-	int result = -EINVAL;
-
-	/* no warning message if acpi_backlight=vendor or a quirk is used */
-	if (!device->backlight)
-		return;
-
-	if (!device->brightness)
-		goto out;
-
-	result = acpi_video_device_lcd_get_level_current(device,
-							 &level_current,
-							 false);
-	if (result)
-		goto out;
-
-	level_next = acpi_video_get_next_level(device, level_current, event);
-
-	result = acpi_video_device_lcd_set_level(device, level_next);
-
-	if (!result)
-		backlight_force_update(device->backlight,
-				       BACKLIGHT_UPDATE_HOTKEY);
-
-out:
-	if (result)
-		printk(KERN_ERR PREFIX "Failed to switch the brightness\n");
-}
-
-int acpi_video_get_edid(struct acpi_device *device, int type, int device_id,
-			void **edid)
-{
-	struct acpi_video_bus *video;
-	struct acpi_video_device *video_device;
-	union acpi_object *buffer = NULL;
-	acpi_status status;
-	int i, length;
-
-	if (!device || !acpi_driver_data(device))
-		return -EINVAL;
-
-	video = acpi_driver_data(device);
-
-	for (i = 0; i < video->attached_count; i++) {
-		video_device = video->attached_array[i].bind_info;
-		length = 256;
-
-		if (!video_device)
-			continue;
-
-		if (!video_device->cap._DDC)
-			continue;
-
-		if (type) {
-			switch (type) {
-			case ACPI_VIDEO_DISPLAY_CRT:
-				if (!video_device->flags.crt)
-					continue;
-				break;
-			case ACPI_VIDEO_DISPLAY_TV:
-				if (!video_device->flags.tvout)
-					continue;
-				break;
-			case ACPI_VIDEO_DISPLAY_DVI:
-				if (!video_device->flags.dvi)
-					continue;
-				break;
-			case ACPI_VIDEO_DISPLAY_LCD:
-				if (!video_device->flags.lcd)
-					continue;
-				break;
-			}
-		} else if (video_device->device_id != device_id) {
-			continue;
-		}
-
-		status = acpi_video_device_EDID(video_device, &buffer, length);
-
-		if (ACPI_FAILURE(status) || !buffer ||
-		    buffer->type != ACPI_TYPE_BUFFER) {
-			length = 128;
-			status = acpi_video_device_EDID(video_device, &buffer,
-							length);
-			if (ACPI_FAILURE(status) || !buffer ||
-			    buffer->type != ACPI_TYPE_BUFFER) {
-				continue;
-			}
-		}
-
-		*edid = buffer->buffer.pointer;
-		return length;
-	}
-
-	return -ENODEV;
-}
-EXPORT_SYMBOL(acpi_video_get_edid);
-
-static int
-acpi_video_bus_get_devices(struct acpi_video_bus *video,
-			   struct acpi_device *device)
-{
-	int status = 0;
-	struct acpi_device *dev;
-
-	/*
-	 * There are systems where video module known to work fine regardless
-	 * of broken _DOD and ignoring returned value here doesn't cause
-	 * any issues later.
-	 */
-	acpi_video_device_enumerate(video);
-
-	list_for_each_entry(dev, &device->children, node) {
-
-		status = acpi_video_bus_get_one_device(dev, video);
-		if (status) {
-			dev_err(&dev->dev, "Can't attach device\n");
-			break;
-		}
-		video->child_count++;
-	}
-	return status;
-}
-
-/* acpi_video interface */
-
-/*
- * Win8 requires setting bit2 of _DOS to let firmware know it shouldn't
- * preform any automatic brightness change on receiving a notification.
- */
-static int acpi_video_bus_start_devices(struct acpi_video_bus *video)
-{
-	return acpi_video_bus_DOS(video, 0,
-				  acpi_osi_is_win8() ? 1 : 0);
-}
-
-static int acpi_video_bus_stop_devices(struct acpi_video_bus *video)
-{
-	return acpi_video_bus_DOS(video, 0,
-				  acpi_osi_is_win8() ? 0 : 1);
-}
-
-static void acpi_video_bus_notify(struct acpi_device *device, u32 event)
-{
-	struct acpi_video_bus *video = acpi_driver_data(device);
-	struct input_dev *input;
-	int keycode = 0;
-
-	if (!video || !video->input)
-		return;
-
-	input = video->input;
-
-	switch (event) {
-	case ACPI_VIDEO_NOTIFY_SWITCH:	/* User requested a switch,
-					 * most likely via hotkey. */
-		keycode = KEY_SWITCHVIDEOMODE;
-		break;
-
-	case ACPI_VIDEO_NOTIFY_PROBE:	/* User plugged in or removed a video
-					 * connector. */
-		acpi_video_device_enumerate(video);
-		acpi_video_device_rebind(video);
-		keycode = KEY_SWITCHVIDEOMODE;
-		break;
-
-	case ACPI_VIDEO_NOTIFY_CYCLE:	/* Cycle Display output hotkey pressed. */
-		keycode = KEY_SWITCHVIDEOMODE;
-		break;
-	case ACPI_VIDEO_NOTIFY_NEXT_OUTPUT:	/* Next Display output hotkey pressed. */
-		keycode = KEY_VIDEO_NEXT;
-		break;
-	case ACPI_VIDEO_NOTIFY_PREV_OUTPUT:	/* previous Display output hotkey pressed. */
-		keycode = KEY_VIDEO_PREV;
-		break;
-
-	default:
-		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
-				  "Unsupported event [0x%x]\n", event));
-		break;
-	}
-
-	if (acpi_notifier_call_chain(device, event, 0))
-		/* Something vetoed the keypress. */
-		keycode = 0;
-
-	if (keycode) {
-		input_report_key(input, keycode, 1);
-		input_sync(input);
-		input_report_key(input, keycode, 0);
-		input_sync(input);
-	}
-
-	return;
-}
-
-static void brightness_switch_event(struct acpi_video_device *video_device,
-				    u32 event)
-{
-	if (!brightness_switch_enabled)
-		return;
-
-	video_device->switch_brightness_event = event;
-	schedule_delayed_work(&video_device->switch_brightness_work, HZ / 10);
-}
-
-static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data)
-{
-	struct acpi_video_device *video_device = data;
-	struct acpi_device *device = NULL;
-	struct acpi_video_bus *bus;
-	struct input_dev *input;
-	int keycode = 0;
-
-	if (!video_device)
-		return;
-
-	device = video_device->dev;
-	bus = video_device->video;
-	input = bus->input;
-
-	switch (event) {
-	case ACPI_VIDEO_NOTIFY_CYCLE_BRIGHTNESS:	/* Cycle brightness */
-		brightness_switch_event(video_device, event);
-		keycode = KEY_BRIGHTNESS_CYCLE;
-		break;
-	case ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS:	/* Increase brightness */
-		brightness_switch_event(video_device, event);
-		keycode = KEY_BRIGHTNESSUP;
-		break;
-	case ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS:	/* Decrease brightness */
-		brightness_switch_event(video_device, event);
-		keycode = KEY_BRIGHTNESSDOWN;
-		break;
-	case ACPI_VIDEO_NOTIFY_ZERO_BRIGHTNESS:	/* zero brightness */
-		brightness_switch_event(video_device, event);
-		keycode = KEY_BRIGHTNESS_ZERO;
-		break;
-	case ACPI_VIDEO_NOTIFY_DISPLAY_OFF:	/* display device off */
-		brightness_switch_event(video_device, event);
-		keycode = KEY_DISPLAY_OFF;
-		break;
-	default:
-		ACPI_DEBUG_PRINT((ACPI_DB_INFO,
-				  "Unsupported event [0x%x]\n", event));
-		break;
-	}
-
-	acpi_notifier_call_chain(device, event, 0);
-
-	if (keycode) {
-		input_report_key(input, keycode, 1);
-		input_sync(input);
-		input_report_key(input, keycode, 0);
-		input_sync(input);
-	}
-
-	return;
-}
-
-static int acpi_video_resume(struct notifier_block *nb,
-				unsigned long val, void *ign)
-{
-	struct acpi_video_bus *video;
-	struct acpi_video_device *video_device;
-	int i;
-
-	switch (val) {
-	case PM_HIBERNATION_PREPARE:
-	case PM_SUSPEND_PREPARE:
-	case PM_RESTORE_PREPARE:
-		return NOTIFY_DONE;
-	}
-
-	video = container_of(nb, struct acpi_video_bus, pm_nb);
-
-	dev_info(&video->device->dev, "Restoring backlight state\n");
-
-	for (i = 0; i < video->attached_count; i++) {
-		video_device = video->attached_array[i].bind_info;
-		if (video_device && video_device->brightness)
-			acpi_video_device_lcd_set_level(video_device,
-					video_device->brightness->curr);
-	}
-
-	return NOTIFY_OK;
-}
-
-static acpi_status
-acpi_video_bus_match(acpi_handle handle, u32 level, void *context,
-			void **return_value)
-{
-	struct acpi_device *device = context;
-	struct acpi_device *sibling;
-	int result;
-
-	if (handle == device->handle)
-		return AE_CTRL_TERMINATE;
-
-	result = acpi_bus_get_device(handle, &sibling);
-	if (result)
-		return AE_OK;
-
-	if (!strcmp(acpi_device_name(sibling), ACPI_VIDEO_BUS_NAME))
-			return AE_ALREADY_EXISTS;
-
-	return AE_OK;
-}
-
-static void acpi_video_dev_register_backlight(struct acpi_video_device *device)
-{
-	struct backlight_properties props;
-	struct pci_dev *pdev;
-	acpi_handle acpi_parent;
-	struct device *parent = NULL;
-	int result;
-	static int count;
-	char *name;
-
-	/*
-	 * Do not create backlight device for video output
-	 * device that is not in the enumerated list.
-	 */
-	if (!acpi_video_device_in_dod(device)) {
-		dev_dbg(&device->dev->dev, "not in _DOD list, ignore\n");
-		return;
-	}
-
-	result = acpi_video_init_brightness(device);
-	if (result)
-		return;
-
-	if (disable_backlight_sysfs_if > 0)
-		return;
-
-	name = kasprintf(GFP_KERNEL, "acpi_video%d", count);
-	if (!name)
-		return;
-	count++;
-
-	acpi_get_parent(device->dev->handle, &acpi_parent);
-
-	pdev = acpi_get_pci_dev(acpi_parent);
-	if (pdev) {
-		parent = &pdev->dev;
-		pci_dev_put(pdev);
-	}
-
-	memset(&props, 0, sizeof(struct backlight_properties));
-	props.type = BACKLIGHT_FIRMWARE;
-	props.max_brightness = device->brightness->count - 3;
-	device->backlight = backlight_device_register(name,
-						      parent,
-						      device,
-						      &acpi_backlight_ops,
-						      &props);
-	kfree(name);
-	if (IS_ERR(device->backlight)) {
-		device->backlight = NULL;
-		return;
-	}
-
-	/*
-	 * Save current brightness level in case we have to restore it
-	 * before acpi_video_device_lcd_set_level() is called next time.
-	 */
-	device->backlight->props.brightness =
-			acpi_video_get_brightness(device->backlight);
-
-	device->cooling_dev = thermal_cooling_device_register("LCD",
-				device->dev, &video_cooling_ops);
-	if (IS_ERR(device->cooling_dev)) {
-		/*
-		 * Set cooling_dev to NULL so we don't crash trying to free it.
-		 * Also, why the hell we are returning early and not attempt to
-		 * register video output if cooling device registration failed?
-		 * -- dtor
-		 */
-		device->cooling_dev = NULL;
-		return;
-	}
-
-	dev_info(&device->dev->dev, "registered as cooling_device%d\n",
-		 device->cooling_dev->id);
-	result = sysfs_create_link(&device->dev->dev.kobj,
-			&device->cooling_dev->device.kobj,
-			"thermal_cooling");
-	if (result)
-		printk(KERN_ERR PREFIX "Create sysfs link\n");
-	result = sysfs_create_link(&device->cooling_dev->device.kobj,
-			&device->dev->dev.kobj, "device");
-	if (result)
-		printk(KERN_ERR PREFIX "Create sysfs link\n");
-}
-
-static void acpi_video_run_bcl_for_osi(struct acpi_video_bus *video)
-{
-	struct acpi_video_device *dev;
-	union acpi_object *levels;
-
-	mutex_lock(&video->device_list_lock);
-	list_for_each_entry(dev, &video->video_device_list, entry) {
-		if (!acpi_video_device_lcd_query_levels(dev, &levels))
-			kfree(levels);
-	}
-	mutex_unlock(&video->device_list_lock);
-}
-
-static int acpi_video_bus_register_backlight(struct acpi_video_bus *video)
-{
-	struct acpi_video_device *dev;
-
-	if (video->backlight_registered)
-		return 0;
-
-	acpi_video_run_bcl_for_osi(video);
-
-	if (!acpi_video_verify_backlight_support())
-		return 0;
-
-	mutex_lock(&video->device_list_lock);
-	list_for_each_entry(dev, &video->video_device_list, entry)
-		acpi_video_dev_register_backlight(dev);
-	mutex_unlock(&video->device_list_lock);
-
-	video->backlight_registered = true;
-
-	video->pm_nb.notifier_call = acpi_video_resume;
-	video->pm_nb.priority = 0;
-	return register_pm_notifier(&video->pm_nb);
-}
-
-static void acpi_video_dev_unregister_backlight(struct acpi_video_device *device)
-{
-	if (device->backlight) {
-		backlight_device_unregister(device->backlight);
-		device->backlight = NULL;
-	}
-	if (device->brightness) {
-		kfree(device->brightness->levels);
-		kfree(device->brightness);
-		device->brightness = NULL;
-	}
-	if (device->cooling_dev) {
-		sysfs_remove_link(&device->dev->dev.kobj, "thermal_cooling");
-		sysfs_remove_link(&device->cooling_dev->device.kobj, "device");
-		thermal_cooling_device_unregister(device->cooling_dev);
-		device->cooling_dev = NULL;
-	}
-}
-
-static int acpi_video_bus_unregister_backlight(struct acpi_video_bus *video)
-{
-	struct acpi_video_device *dev;
-	int error;
-
-	if (!video->backlight_registered)
-		return 0;
-
-	error = unregister_pm_notifier(&video->pm_nb);
-
-	mutex_lock(&video->device_list_lock);
-	list_for_each_entry(dev, &video->video_device_list, entry)
-		acpi_video_dev_unregister_backlight(dev);
-	mutex_unlock(&video->device_list_lock);
-
-	video->backlight_registered = false;
-
-	return error;
-}
-
-static void acpi_video_dev_add_notify_handler(struct acpi_video_device *device)
-{
-	acpi_status status;
-	struct acpi_device *adev = device->dev;
-
-	status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
-					     acpi_video_device_notify, device);
-	if (ACPI_FAILURE(status))
-		dev_err(&adev->dev, "Error installing notify handler\n");
-	else
-		device->flags.notify = 1;
-}
-
-static int acpi_video_bus_add_notify_handler(struct acpi_video_bus *video)
-{
-	struct input_dev *input;
-	struct acpi_video_device *dev;
-	int error;
-
-	video->input = input = input_allocate_device();
-	if (!input) {
-		error = -ENOMEM;
-		goto out;
-	}
-
-	error = acpi_video_bus_start_devices(video);
-	if (error)
-		goto err_free_input;
-
-	snprintf(video->phys, sizeof(video->phys),
-			"%s/video/input0", acpi_device_hid(video->device));
-
-	input->name = acpi_device_name(video->device);
-	input->phys = video->phys;
-	input->id.bustype = BUS_HOST;
-	input->id.product = 0x06;
-	input->dev.parent = &video->device->dev;
-	input->evbit[0] = BIT(EV_KEY);
-	set_bit(KEY_SWITCHVIDEOMODE, input->keybit);
-	set_bit(KEY_VIDEO_NEXT, input->keybit);
-	set_bit(KEY_VIDEO_PREV, input->keybit);
-	set_bit(KEY_BRIGHTNESS_CYCLE, input->keybit);
-	set_bit(KEY_BRIGHTNESSUP, input->keybit);
-	set_bit(KEY_BRIGHTNESSDOWN, input->keybit);
-	set_bit(KEY_BRIGHTNESS_ZERO, input->keybit);
-	set_bit(KEY_DISPLAY_OFF, input->keybit);
-
-	error = input_register_device(input);
-	if (error)
-		goto err_stop_dev;
-
-	mutex_lock(&video->device_list_lock);
-	list_for_each_entry(dev, &video->video_device_list, entry)
-		acpi_video_dev_add_notify_handler(dev);
-	mutex_unlock(&video->device_list_lock);
-
-	return 0;
-
-err_stop_dev:
-	acpi_video_bus_stop_devices(video);
-err_free_input:
-	input_free_device(input);
-	video->input = NULL;
-out:
-	return error;
-}
-
-static void acpi_video_dev_remove_notify_handler(struct acpi_video_device *dev)
-{
-	if (dev->flags.notify) {
-		acpi_remove_notify_handler(dev->dev->handle, ACPI_DEVICE_NOTIFY,
-					   acpi_video_device_notify);
-		dev->flags.notify = 0;
-	}
-}
-
-static void acpi_video_bus_remove_notify_handler(struct acpi_video_bus *video)
-{
-	struct acpi_video_device *dev;
-
-	mutex_lock(&video->device_list_lock);
-	list_for_each_entry(dev, &video->video_device_list, entry)
-		acpi_video_dev_remove_notify_handler(dev);
-	mutex_unlock(&video->device_list_lock);
-
-	acpi_video_bus_stop_devices(video);
-	input_unregister_device(video->input);
-	video->input = NULL;
-}
-
-static int acpi_video_backlight_notify(struct notifier_block *nb,
-					unsigned long val, void *bd)
-{
-	struct backlight_device *backlight = bd;
-	struct acpi_video_bus *video;
-
-	/* acpi_video_verify_backlight_support only cares about raw devices */
-	if (backlight->props.type != BACKLIGHT_RAW)
-		return NOTIFY_DONE;
-
-	video = container_of(nb, struct acpi_video_bus, backlight_nb);
-
-	switch (val) {
-	case BACKLIGHT_REGISTERED:
-		if (!acpi_video_verify_backlight_support())
-			acpi_video_bus_unregister_backlight(video);
-		break;
-	case BACKLIGHT_UNREGISTERED:
-		acpi_video_bus_register_backlight(video);
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static int acpi_video_bus_add_backlight_notify_handler(
-						struct acpi_video_bus *video)
-{
-	int error;
-
-	video->backlight_nb.notifier_call = acpi_video_backlight_notify;
-	video->backlight_nb.priority = 0;
-	error = backlight_register_notifier(&video->backlight_nb);
-	if (error == 0)
-		video->backlight_notifier_registered = true;
-
-	return error;
-}
-
-static int acpi_video_bus_remove_backlight_notify_handler(
-						struct acpi_video_bus *video)
-{
-	if (!video->backlight_notifier_registered)
-		return 0;
-
-	video->backlight_notifier_registered = false;
-
-	return backlight_unregister_notifier(&video->backlight_nb);
-}
-
-static int acpi_video_bus_put_devices(struct acpi_video_bus *video)
-{
-	struct acpi_video_device *dev, *next;
-
-	mutex_lock(&video->device_list_lock);
-	list_for_each_entry_safe(dev, next, &video->video_device_list, entry) {
-		list_del(&dev->entry);
-		kfree(dev);
-	}
-	mutex_unlock(&video->device_list_lock);
-
-	return 0;
-}
-
-static int instance;
-
-static int acpi_video_bus_add(struct acpi_device *device)
-{
-	struct acpi_video_bus *video;
-	int error;
-	acpi_status status;
-
-	status = acpi_walk_namespace(ACPI_TYPE_DEVICE,
-				device->parent->handle, 1,
-				acpi_video_bus_match, NULL,
-				device, NULL);
-	if (status == AE_ALREADY_EXISTS) {
-		printk(KERN_WARNING FW_BUG
-			"Duplicate ACPI video bus devices for the"
-			" same VGA controller, please try module "
-			"parameter \"video.allow_duplicates=1\""
-			"if the current driver doesn't work.\n");
-		if (!allow_duplicates)
-			return -ENODEV;
-	}
-
-	video = kzalloc(sizeof(struct acpi_video_bus), GFP_KERNEL);
-	if (!video)
-		return -ENOMEM;
-
-	/* a hack to fix the duplicate name "VID" problem on T61 */
-	if (!strcmp(device->pnp.bus_id, "VID")) {
-		if (instance)
-			device->pnp.bus_id[3] = '0' + instance;
-		instance++;
-	}
-	/* a hack to fix the duplicate name "VGA" problem on Pa 3553 */
-	if (!strcmp(device->pnp.bus_id, "VGA")) {
-		if (instance)
-			device->pnp.bus_id[3] = '0' + instance;
-		instance++;
-	}
-
-	video->device = device;
-	strcpy(acpi_device_name(device), ACPI_VIDEO_BUS_NAME);
-	strcpy(acpi_device_class(device), ACPI_VIDEO_CLASS);
-	device->driver_data = video;
-
-	acpi_video_bus_find_cap(video);
-	error = acpi_video_bus_check(video);
-	if (error)
-		goto err_free_video;
-
-	mutex_init(&video->device_list_lock);
-	INIT_LIST_HEAD(&video->video_device_list);
-
-	error = acpi_video_bus_get_devices(video, device);
-	if (error)
-		goto err_put_video;
-
-	printk(KERN_INFO PREFIX "%s [%s] (multi-head: %s  rom: %s  post: %s)\n",
-	       ACPI_VIDEO_DEVICE_NAME, acpi_device_bid(device),
-	       video->flags.multihead ? "yes" : "no",
-	       video->flags.rom ? "yes" : "no",
-	       video->flags.post ? "yes" : "no");
-	mutex_lock(&video_list_lock);
-	list_add_tail(&video->entry, &video_bus_head);
-	mutex_unlock(&video_list_lock);
-
-	acpi_video_bus_register_backlight(video);
-	acpi_video_bus_add_notify_handler(video);
-	acpi_video_bus_add_backlight_notify_handler(video);
-
-	return 0;
-
-err_put_video:
-	acpi_video_bus_put_devices(video);
-	kfree(video->attached_array);
-err_free_video:
-	kfree(video);
-	device->driver_data = NULL;
-
-	return error;
-}
-
-static int acpi_video_bus_remove(struct acpi_device *device)
-{
-	struct acpi_video_bus *video = NULL;
-
-
-	if (!device || !acpi_driver_data(device))
-		return -EINVAL;
-
-	video = acpi_driver_data(device);
-
-	acpi_video_bus_remove_backlight_notify_handler(video);
-	acpi_video_bus_remove_notify_handler(video);
-	acpi_video_bus_unregister_backlight(video);
-	acpi_video_bus_put_devices(video);
-
-	mutex_lock(&video_list_lock);
-	list_del(&video->entry);
-	mutex_unlock(&video_list_lock);
-
-	kfree(video->attached_array);
-	kfree(video);
-
-	return 0;
-}
-
-static int __init is_i740(struct pci_dev *dev)
-{
-	if (dev->device == 0x00D1)
-		return 1;
-	if (dev->device == 0x7000)
-		return 1;
-	return 0;
-}
-
-static int __init intel_opregion_present(void)
-{
-	int opregion = 0;
-	struct pci_dev *dev = NULL;
-	u32 address;
-
-	for_each_pci_dev(dev) {
-		if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
-			continue;
-		if (dev->vendor != PCI_VENDOR_ID_INTEL)
-			continue;
-		/* We don't want to poke around undefined i740 registers */
-		if (is_i740(dev))
-			continue;
-		pci_read_config_dword(dev, 0xfc, &address);
-		if (!address)
-			continue;
-		opregion = 1;
-	}
-	return opregion;
-}
-
-int acpi_video_register(void)
-{
-	int ret;
-
-	if (register_count) {
-		/*
-		 * if the function of acpi_video_register is already called,
-		 * don't register the acpi_vide_bus again and return no error.
-		 */
-		return 0;
-	}
-
-	mutex_init(&video_list_lock);
-	INIT_LIST_HEAD(&video_bus_head);
-
-	ret = acpi_bus_register_driver(&acpi_video_bus);
-	if (ret)
-		return ret;
-
-	/*
-	 * When the acpi_video_bus is loaded successfully, increase
-	 * the counter reference.
-	 */
-	register_count = 1;
-
-	return 0;
-}
-EXPORT_SYMBOL(acpi_video_register);
-
-void acpi_video_unregister(void)
-{
-	if (!register_count) {
-		/*
-		 * If the acpi video bus is already unloaded, don't
-		 * unload it again and return directly.
-		 */
-		return;
-	}
-	acpi_bus_unregister_driver(&acpi_video_bus);
-
-	register_count = 0;
-
-	return;
-}
-EXPORT_SYMBOL(acpi_video_unregister);
-
-void acpi_video_unregister_backlight(void)
-{
-	struct acpi_video_bus *video;
-
-	if (!register_count)
-		return;
-
-	mutex_lock(&video_list_lock);
-	list_for_each_entry(video, &video_bus_head, entry)
-		acpi_video_bus_unregister_backlight(video);
-	mutex_unlock(&video_list_lock);
-}
-EXPORT_SYMBOL(acpi_video_unregister_backlight);
-
-/*
- * This is kind of nasty. Hardware using Intel chipsets may require
- * the video opregion code to be run first in order to initialise
- * state before any ACPI video calls are made. To handle this we defer
- * registration of the video class until the opregion code has run.
- */
-
-static int __init acpi_video_init(void)
-{
-	/*
-	 * Let the module load even if ACPI is disabled (e.g. due to
-	 * a broken BIOS) so that i915.ko can still be loaded on such
-	 * old systems without an AcpiOpRegion.
-	 *
-	 * acpi_video_register() will report -ENODEV later as well due
-	 * to acpi_disabled when i915.ko tries to register itself afterwards.
-	 */
-	if (acpi_disabled)
-		return 0;
-
-	dmi_check_system(video_dmi_table);
-
-	if (intel_opregion_present())
-		return 0;
-
-	return acpi_video_register();
-}
-
-static void __exit acpi_video_exit(void)
-{
-	acpi_video_unregister();
-
-	return;
-}
-
-module_init(acpi_video_init);
-module_exit(acpi_video_exit);
diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index b2270ca21538..bb6133c60175 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -32,16 +32,23 @@
 #include <linux/export.h>
 #include <linux/acpi.h>
 #include <linux/dmi.h>
+#include <linux/module.h>
 #include <linux/pci.h>
 
-#include "internal.h"
-
 ACPI_MODULE_NAME("video");
 #define _COMPONENT		ACPI_VIDEO_COMPONENT
 
 static long acpi_video_support;
 static bool acpi_video_caps_checked;
 
+static void acpi_video_parse_cmdline(void)
+{
+	if (!strcmp("vendor", acpi_video_backlight_string))
+		acpi_video_support |= ACPI_VIDEO_BACKLIGHT_FORCE_VENDOR;
+	if (!strcmp("video", acpi_video_backlight_string))
+		acpi_video_support |= ACPI_VIDEO_BACKLIGHT_FORCE_VIDEO;
+}
+
 static acpi_status
 find_video(acpi_handle handle, u32 lvl, void *context, void **rv)
 {
@@ -174,8 +181,10 @@ static void acpi_video_caps_check(void)
 	 * We must check whether the ACPI graphics device is physically plugged
 	 * in. Therefore this must be called after binding PCI and ACPI devices
 	 */
-	if (!acpi_video_caps_checked)
+	if (!acpi_video_caps_checked) {
+		acpi_video_parse_cmdline();
 		acpi_video_get_capabilities(NULL);
+	}
 }
 
 /* Promote the vendor interface instead of the generic video module.
@@ -212,23 +221,3 @@ int acpi_video_backlight_support(void)
 	return acpi_video_support & ACPI_VIDEO_BACKLIGHT;
 }
 EXPORT_SYMBOL(acpi_video_backlight_support);
-
-/*
- * Use acpi_backlight=vendor/video to force that backlight switching
- * is processed by vendor specific acpi drivers or video.ko driver.
- */
-static int __init acpi_backlight(char *str)
-{
-	if (str == NULL || *str == '\0')
-		return 1;
-	else {
-		if (!strcmp("vendor", str))
-			acpi_video_support |=
-				ACPI_VIDEO_BACKLIGHT_FORCE_VENDOR;
-		if (!strcmp("video", str))
-			acpi_video_support |=
-				ACPI_VIDEO_BACKLIGHT_FORCE_VIDEO;
-	}
-	return 1;
-}
-__setup("acpi_backlight=", acpi_backlight);
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index f9f205cb1f11..909133c6ffb7 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -71,9 +71,10 @@ config ASUS_LAPTOP
 	depends on ACPI
 	select LEDS_CLASS
 	select NEW_LEDS
-	select BACKLIGHT_CLASS_DEVICE
+	depends on BACKLIGHT_CLASS_DEVICE
 	depends on INPUT
 	depends on RFKILL || RFKILL = n
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	select INPUT_SPARSEKMAP
 	select INPUT_POLLDEV
 	---help---
@@ -95,6 +96,7 @@ config DELL_LAPTOP
 	depends on X86
 	depends on DCDBAS
 	depends on BACKLIGHT_CLASS_DEVICE
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on RFKILL || RFKILL = n
 	depends on SERIO_I8042
 	select POWER_SUPPLY
@@ -109,6 +111,7 @@ config DELL_WMI
 	tristate "Dell WMI extras"
 	depends on ACPI_WMI
 	depends on INPUT
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	select INPUT_SPARSEKMAP
 	---help---
 	  Say Y here if you want to support WMI-based hotkeys on Dell laptops.
@@ -144,6 +147,7 @@ config FUJITSU_LAPTOP
 	depends on ACPI
 	depends on INPUT
 	depends on BACKLIGHT_CLASS_DEVICE
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on LEDS_CLASS || LEDS_CLASS=n
 	---help---
 	  This is a driver for laptops built by Fujitsu:
@@ -247,6 +251,7 @@ config MSI_LAPTOP
 	tristate "MSI Laptop Extras"
 	depends on ACPI
 	depends on BACKLIGHT_CLASS_DEVICE
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on RFKILL
 	depends on INPUT && SERIO_I8042
 	select INPUT_SPARSEKMAP
@@ -280,6 +285,7 @@ config COMPAL_LAPTOP
 	tristate "Compal (and others) Laptop Extras"
 	depends on ACPI
 	depends on BACKLIGHT_CLASS_DEVICE
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on RFKILL
 	depends on HWMON
 	depends on POWER_SUPPLY
@@ -296,7 +302,8 @@ config COMPAL_LAPTOP
 config SONY_LAPTOP
 	tristate "Sony Laptop Extras"
 	depends on ACPI
-	select BACKLIGHT_CLASS_DEVICE
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
+	depends on BACKLIGHT_CLASS_DEVICE
 	depends on INPUT
 	depends on RFKILL
 	  ---help---
@@ -321,6 +328,7 @@ config IDEAPAD_LAPTOP
 	depends on RFKILL && INPUT
 	depends on SERIO_I8042
 	depends on BACKLIGHT_CLASS_DEVICE
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	select INPUT_SPARSEKMAP
 	help
 	  This is a driver for Lenovo IdeaPad netbooks contains drivers for
@@ -331,8 +339,8 @@ config THINKPAD_ACPI
 	depends on ACPI
 	depends on INPUT
 	depends on RFKILL || RFKILL = n
-	select BACKLIGHT_LCD_SUPPORT
-	select BACKLIGHT_CLASS_DEVICE
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
+	depends on BACKLIGHT_CLASS_DEVICE
 	select HWMON
 	select NVRAM
 	select NEW_LEDS
@@ -500,8 +508,9 @@ config EEEPC_LAPTOP
 	depends on ACPI
 	depends on INPUT
 	depends on RFKILL || RFKILL = n
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on HOTPLUG_PCI
-	select BACKLIGHT_CLASS_DEVICE
+	depends on BACKLIGHT_CLASS_DEVICE
 	select HWMON
 	select LEDS_CLASS
 	select NEW_LEDS
@@ -587,6 +596,7 @@ config MSI_WMI
 	depends on ACPI_WMI
 	depends on INPUT
 	depends on BACKLIGHT_CLASS_DEVICE
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	select INPUT_SPARSEKMAP
 	help
 	 Say Y here if you want to support WMI-based hotkeys on MSI laptops.
@@ -824,6 +834,7 @@ config MXM_WMI
 config INTEL_OAKTRAIL
 	tristate "Intel Oaktrail Platform Extras"
 	depends on ACPI
+	depends on ACPI_VIDEO || ACPI_VIDEO = n
 	depends on RFKILL && BACKLIGHT_CLASS_DEVICE && ACPI
 	---help---
 	  Intel Oaktrail platform need this driver to provide interfaces to
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 913a1c19818a..f097c0a2718e 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -243,6 +243,7 @@ extern bool wmi_has_guid(const char *guid);
 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR		0x0400
 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO		0x0800
 
+extern char acpi_video_backlight_string[];
 extern long acpi_is_video_device(acpi_handle handle);
 
 #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE)
-- 
cgit v1.2.3


From d0a530ba424ec1be7630f7fce2db9860b9429b8f Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 16 Jun 2015 16:28:12 +0200
Subject: acpi-video-detect: Remove old API

Remove the old backlight interface selection API now that all drivers
have been ported to the new API.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/video_detect.c | 31 -------------------------------
 include/linux/acpi.h        | 19 -------------------
 2 files changed, 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index 0df15673eed2..c3925790203b 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -340,37 +340,6 @@ void acpi_video_set_dmi_backlight_type(enum acpi_backlight_type type)
 }
 EXPORT_SYMBOL(acpi_video_set_dmi_backlight_type);
 
-/*
- * Compatiblity function, this is going away as soon as all drivers are
- * converted to acpi_video_set_dmi_backlight_type().
- *
- * Promote the vendor interface instead of the generic video module.
- * After calling this function you will probably want to call
- * acpi_video_unregister() to make sure the video module is not loaded
- */
-void acpi_video_dmi_promote_vendor(void)
-{
-	acpi_video_set_dmi_backlight_type(acpi_backlight_vendor);
-}
-EXPORT_SYMBOL(acpi_video_dmi_promote_vendor);
-
-/*
- * Compatiblity function, this is going away as soon as all drivers are
- * converted to acpi_video_get_backlight_type().
- *
- * Returns true if video.ko can do backlight switching.
- */
-int acpi_video_backlight_support(void)
-{
-	/*
-	 * This is done this way since vendor drivers call this to see
-	 * if they should load, and we do not want them to load for both
-	 * the acpi_backlight_video and acpi_backlight_native cases.
-	 */
-	return acpi_video_get_backlight_type() != acpi_backlight_vendor;
-}
-EXPORT_SYMBOL(acpi_video_backlight_support);
-
 void __exit acpi_video_detect_exit(void)
 {
 	if (backlight_notifier_registered)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index f097c0a2718e..5966d1d9280e 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -245,25 +245,6 @@ extern bool wmi_has_guid(const char *guid);
 
 extern char acpi_video_backlight_string[];
 extern long acpi_is_video_device(acpi_handle handle);
-
-#if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE)
-
-extern void acpi_video_dmi_promote_vendor(void);
-extern int acpi_video_backlight_support(void);
-
-#else
-
-static inline void acpi_video_dmi_promote_vendor(void)
-{
-}
-
-static inline int acpi_video_backlight_support(void)
-{
-	return 0;
-}
-
-#endif /* defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) */
-
 extern int acpi_blacklisted(void);
 extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d);
 extern void acpi_osi_setup(char *str);
-- 
cgit v1.2.3


From edf18b9108f5025f9e83b2c167c9122954acbc62 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 18 Jun 2015 14:00:48 +0800
Subject: crypto: api - Add CRYPTO_MINALIGN_ATTR to struct crypto_alg

The struct crypto_alg is embedded into various type-specific structs
such as aead_alg.  This is then used as part of instances such as
struct aead_instance.  It is also embedded into the generic struct
crypto_instance.  In order to ensure that struct aead_instance can
be converted to struct crypto_instance when necessary, we need to
ensure that crypto_alg is aligned properly.

This patch adds an alignment attribute to struct crypto_alg to
ensure this.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/crypto.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 0e3f71a73e3b..964e5735a6a9 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -513,7 +513,7 @@ struct crypto_alg {
 	void (*cra_destroy)(struct crypto_alg *alg);
 	
 	struct module *cra_module;
-};
+} CRYPTO_MINALIGN_ATTR;
 
 /*
  * Algorithm registration interface.
-- 
cgit v1.2.3


From 4bacc9c9234c7c8eec44f5ed4e960d9f96fa0f01 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 18 Jun 2015 14:32:31 +0100
Subject: overlayfs: Make f_path always point to the overlay and f_inode to the
 underlay

Make file->f_path always point to the overlay dentry so that the path in
/proc/pid/fd is correct and to ensure that label-based LSMs have access to the
overlay as well as the underlay (path-based LSMs probably don't need it).

Using my union testsuite to set things up, before the patch I see:

	[root@andromeda union-testsuite]# bash 5</mnt/a/foo107
	[root@andromeda union-testsuite]# ls -l /proc/$$/fd/
	...
	lr-x------. 1 root root 64 Jun  5 14:38 5 -> /a/foo107
	[root@andromeda union-testsuite]# stat /mnt/a/foo107
	...
	Device: 23h/35d Inode: 13381       Links: 1
	...
	[root@andromeda union-testsuite]# stat -L /proc/$$/fd/5
	...
	Device: 23h/35d Inode: 13381       Links: 1
	...

After the patch:

	[root@andromeda union-testsuite]# bash 5</mnt/a/foo107
	[root@andromeda union-testsuite]# ls -l /proc/$$/fd/
	...
	lr-x------. 1 root root 64 Jun  5 14:22 5 -> /mnt/a/foo107
	[root@andromeda union-testsuite]# stat /mnt/a/foo107
	...
	Device: 23h/35d Inode: 40346       Links: 1
	...
	[root@andromeda union-testsuite]# stat -L /proc/$$/fd/5
	...
	Device: 23h/35d Inode: 40346       Links: 1
	...

Note the change in where /proc/$$/fd/5 points to in the ls command.  It was
pointing to /a/foo107 (which doesn't exist) and now points to /mnt/a/foo107
(which is correct).

The inode accessed, however, is the lower layer.  The union layer is on device
25h/37d and the upper layer on 24h/36d.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c              |  5 ++++-
 fs/internal.h            |  1 +
 fs/open.c                | 49 +++++++++++++++++++++++++-----------------------
 fs/overlayfs/inode.c     | 14 ++++++--------
 fs/overlayfs/overlayfs.h |  1 +
 fs/overlayfs/super.c     |  1 +
 include/linux/dcache.h   |  2 ++
 include/linux/fs.h       |  2 --
 8 files changed, 41 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 37b5afdaf698..c4ce35110704 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1673,7 +1673,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 				DCACHE_OP_COMPARE	|
 				DCACHE_OP_REVALIDATE	|
 				DCACHE_OP_WEAK_REVALIDATE	|
-				DCACHE_OP_DELETE ));
+				DCACHE_OP_DELETE	|
+				DCACHE_OP_SELECT_INODE));
 	dentry->d_op = op;
 	if (!op)
 		return;
@@ -1689,6 +1690,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 		dentry->d_flags |= DCACHE_OP_DELETE;
 	if (op->d_prune)
 		dentry->d_flags |= DCACHE_OP_PRUNE;
+	if (op->d_select_inode)
+		dentry->d_flags |= DCACHE_OP_SELECT_INODE;
 
 }
 EXPORT_SYMBOL(d_set_d_op);
diff --git a/fs/internal.h b/fs/internal.h
index 01dce1d1476b..4d5af583ab03 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -107,6 +107,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 extern long do_handle_open(int mountdirfd,
 			   struct file_handle __user *ufh, int open_flag);
 extern int open_check_o_direct(struct file *f);
+extern int vfs_open(const struct path *, struct file *, const struct cred *);
 
 /*
  * inode.c
diff --git a/fs/open.c b/fs/open.c
index e0250bdcc440..b1c5823b7f11 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -678,18 +678,18 @@ int open_check_o_direct(struct file *f)
 }
 
 static int do_dentry_open(struct file *f,
+			  struct inode *inode,
 			  int (*open)(struct inode *, struct file *),
 			  const struct cred *cred)
 {
 	static const struct file_operations empty_fops = {};
-	struct inode *inode;
 	int error;
 
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 
 	path_get(&f->f_path);
-	inode = f->f_inode = f->f_path.dentry->d_inode;
+	f->f_inode = inode;
 	f->f_mapping = inode->i_mapping;
 
 	if (unlikely(f->f_flags & O_PATH)) {
@@ -793,7 +793,8 @@ int finish_open(struct file *file, struct dentry *dentry,
 	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
 
 	file->f_path.dentry = dentry;
-	error = do_dentry_open(file, open, current_cred());
+	error = do_dentry_open(file, d_backing_inode(dentry), open,
+			       current_cred());
 	if (!error)
 		*opened |= FILE_OPENED;
 
@@ -822,6 +823,28 @@ int finish_no_open(struct file *file, struct dentry *dentry)
 }
 EXPORT_SYMBOL(finish_no_open);
 
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @file: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *file,
+	     const struct cred *cred)
+{
+	struct dentry *dentry = path->dentry;
+	struct inode *inode = dentry->d_inode;
+
+	file->f_path = *path;
+	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
+		inode = dentry->d_op->d_select_inode(dentry, file->f_flags);
+		if (IS_ERR(inode))
+			return PTR_ERR(inode);
+	}
+
+	return do_dentry_open(file, inode, NULL, cred);
+}
+
 struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *cred)
 {
@@ -853,26 +876,6 @@ struct file *dentry_open(const struct path *path, int flags,
 }
 EXPORT_SYMBOL(dentry_open);
 
-/**
- * vfs_open - open the file at the given path
- * @path: path to open
- * @filp: newly allocated file with f_flag initialized
- * @cred: credentials to use
- */
-int vfs_open(const struct path *path, struct file *filp,
-	     const struct cred *cred)
-{
-	struct inode *inode = path->dentry->d_inode;
-
-	if (inode->i_op->dentry_open)
-		return inode->i_op->dentry_open(path->dentry, filp, cred);
-	else {
-		filp->f_path = *path;
-		return do_dentry_open(filp, NULL, cred);
-	}
-}
-EXPORT_SYMBOL(vfs_open);
-
 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 {
 	int lookup_flags = 0;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 21079d1ca2aa..f140e3dbfb7b 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -337,31 +337,30 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
 	return true;
 }
 
-static int ovl_dentry_open(struct dentry *dentry, struct file *file,
-		    const struct cred *cred)
+struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
 {
 	int err;
 	struct path realpath;
 	enum ovl_path_type type;
 
 	type = ovl_path_real(dentry, &realpath);
-	if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
+	if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
 		err = ovl_want_write(dentry);
 		if (err)
-			return err;
+			return ERR_PTR(err);
 
-		if (file->f_flags & O_TRUNC)
+		if (file_flags & O_TRUNC)
 			err = ovl_copy_up_last(dentry, NULL, true);
 		else
 			err = ovl_copy_up(dentry);
 		ovl_drop_write(dentry);
 		if (err)
-			return err;
+			return ERR_PTR(err);
 
 		ovl_path_upper(dentry, &realpath);
 	}
 
-	return vfs_open(&realpath, file, cred);
+	return d_backing_inode(realpath.dentry);
 }
 
 static const struct inode_operations ovl_file_inode_operations = {
@@ -372,7 +371,6 @@ static const struct inode_operations ovl_file_inode_operations = {
 	.getxattr	= ovl_getxattr,
 	.listxattr	= ovl_listxattr,
 	.removexattr	= ovl_removexattr,
-	.dentry_open	= ovl_dentry_open,
 };
 
 static const struct inode_operations ovl_symlink_inode_operations = {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 17ac5afc9ffb..ea5a40b06e3a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
 		     void *value, size_t size);
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
 int ovl_removexattr(struct dentry *dentry, const char *name);
+struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
 
 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
 			    struct ovl_entry *oe);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 5f0d1993e6e3..84c5e27fbfd9 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -275,6 +275,7 @@ static void ovl_dentry_release(struct dentry *dentry)
 
 static const struct dentry_operations ovl_dentry_operations = {
 	.d_release = ovl_dentry_release,
+	.d_select_inode = ovl_d_select_inode,
 };
 
 static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index df334cbacc6d..167ec0934049 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -160,6 +160,7 @@ struct dentry_operations {
 	char *(*d_dname)(struct dentry *, char *, int);
 	struct vfsmount *(*d_automount)(struct path *);
 	int (*d_manage)(struct dentry *, bool);
+	struct inode *(*d_select_inode)(struct dentry *, unsigned);
 } ____cacheline_aligned;
 
 /*
@@ -225,6 +226,7 @@ struct dentry_operations {
 
 #define DCACHE_MAY_FREE			0x00800000
 #define DCACHE_FALLTHRU			0x01000000 /* Fall through to lower layer */
+#define DCACHE_OP_SELECT_INODE		0x02000000 /* Unioned entry: dcache op selects inode */
 
 extern seqlock_t rename_lock;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b577e801b4af..2bd77e10e8e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1641,7 +1641,6 @@ struct inode_operations {
 	int (*set_acl)(struct inode *, struct posix_acl *, int);
 
 	/* WARNING: probably going away soon, do not use! */
-	int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
 } ____cacheline_aligned;
 
 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
@@ -2194,7 +2193,6 @@ extern struct file *file_open_name(struct filename *, int, umode_t);
 extern struct file *filp_open(const char *, int, umode_t);
 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
 				   const char *, int);
-extern int vfs_open(const struct path *, struct file *, const struct cred *);
 extern struct file * dentry_open(const struct path *, int, const struct cred *);
 extern int filp_close(struct file *, fl_owner_t id);
 
-- 
cgit v1.2.3


From 68722101ec3a0e179408a13708dd020e04f54aab Mon Sep 17 00:00:00 2001
From: George Beshers <gbeshers@sgi.com>
Date: Thu, 18 Jun 2015 10:25:13 -0500
Subject: locking/lockdep: Remove hard coded array size dependency

An apparent oversight left a hardcoded '4' in place when
LOCKSTAT_POINTS was introduced.

The contention_point[] and contending_point[] arrays in the
structs lock_class and lock_class_stats need to be the same
size for the loops in lock_stats() to be correct.

This patch allows LOCKSTAT_POINTS to be changed without
affecting the correctness of the code.

Signed-off-by: George Beshers <gbeshers@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 066ba4157541..2722111591a3 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -130,8 +130,8 @@ enum bounce_type {
 };
 
 struct lock_class_stats {
-	unsigned long			contention_point[4];
-	unsigned long			contending_point[4];
+	unsigned long			contention_point[LOCKSTAT_POINTS];
+	unsigned long			contending_point[LOCKSTAT_POINTS];
 	struct lock_time		read_waittime;
 	struct lock_time		write_waittime;
 	struct lock_time		read_holdtime;
-- 
cgit v1.2.3


From b17718d02f54b90978d0e0146368b512b11c3e84 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 5 Jun 2015 17:30:23 +0200
Subject: sched/stop_machine: Fix deadlock between multiple stop_two_cpus()

Jiri reported a machine stuck in multi_cpu_stop() with
migrate_swap_stop() as function and with the following src,dst cpu
pairs: {11,  4} {13, 11} { 4, 13}

                        4       11      13

cpuM: queue(4 ,13)
                        *Ma
cpuN: queue(13,11)
                                *N      Na
                        *M              Mb
cpuO: queue(11, 4)
                        *O      Oa
                                *Nb
                        *Ob

Where *X denotes the cpu running the queueing of cpu-X and X[ab] denotes
the first/second queued work.

You'll observe the top of the workqueue for each cpu: 4,11,13 to be work
from cpus: M, O, N resp. IOW. deadlock.

Do away with the queueing trickery and introduce lg_double_lock() to
lock both CPUs and fully serialize the stop_two_cpus() callers instead
of the partial (and buggy) serialization we have now.

Reported-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150605153023.GH19282@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lglock.h  |  5 +++++
 kernel/locking/lglock.c | 22 ++++++++++++++++++++++
 kernel/stop_machine.c   | 42 +++++-------------------------------------
 3 files changed, 32 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 0081f000e34b..c92ebd100d9b 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -52,10 +52,15 @@ struct lglock {
 	static struct lglock name = { .lock = &name ## _lock }
 
 void lg_lock_init(struct lglock *lg, char *name);
+
 void lg_local_lock(struct lglock *lg);
 void lg_local_unlock(struct lglock *lg);
 void lg_local_lock_cpu(struct lglock *lg, int cpu);
 void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
+
 void lg_global_lock(struct lglock *lg);
 void lg_global_unlock(struct lglock *lg);
 
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..951cfcd10b4a 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
 }
 EXPORT_SYMBOL(lg_local_unlock_cpu);
 
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+	BUG_ON(cpu1 == cpu2);
+
+	/* lock in cpu order, just like lg_global_lock */
+	if (cpu2 < cpu1)
+		swap(cpu1, cpu2);
+
+	preempt_disable();
+	lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+	arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+	arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+	arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+	preempt_enable();
+}
+
 void lg_global_lock(struct lglock *lg)
 {
 	int i;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 695f0c6cd169..fd643d8c4b42 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -211,25 +211,6 @@ static int multi_cpu_stop(void *data)
 	return err;
 }
 
-struct irq_cpu_stop_queue_work_info {
-	int cpu1;
-	int cpu2;
-	struct cpu_stop_work *work1;
-	struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
-	struct irq_cpu_stop_queue_work_info *info = arg;
-	cpu_stop_queue_work(info->cpu1, info->work1);
-	cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
 /**
  * stop_two_cpus - stops two cpus
  * @cpu1: the cpu to stop
@@ -245,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 {
 	struct cpu_stop_done done;
 	struct cpu_stop_work work1, work2;
-	struct irq_cpu_stop_queue_work_info call_args;
 	struct multi_stop_data msdata;
 
 	preempt_disable();
@@ -262,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		.done = &done
 	};
 
-	call_args = (struct irq_cpu_stop_queue_work_info){
-		.cpu1 = cpu1,
-		.cpu2 = cpu2,
-		.work1 = &work1,
-		.work2 = &work2,
-	};
-
 	cpu_stop_init_done(&done, 2);
 	set_state(&msdata, MULTI_STOP_PREPARE);
 
@@ -285,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		return -ENOENT;
 	}
 
-	lg_local_lock(&stop_cpus_lock);
-	/*
-	 * Queuing needs to be done by the lowest numbered CPU, to ensure
-	 * that works are always queued in the same order on every CPU.
-	 * This prevents deadlocks.
-	 */
-	smp_call_function_single(min(cpu1, cpu2),
-				 &irq_cpu_stop_queue_work,
-				 &call_args, 1);
-	lg_local_unlock(&stop_cpus_lock);
+	lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+	cpu_stop_queue_work(cpu1, &work1);
+	cpu_stop_queue_work(cpu2, &work2);
+	lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
 	preempt_enable();
 
 	wait_for_completion(&done.completion);
-- 
cgit v1.2.3


From 1dabbcec2c0a36fe43509d06499b9e512e70a028 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 26 May 2015 22:50:28 +0000
Subject: timer: Use hlist for the timer wheel hash buckets

This reduces the size of struct tvec_base by 50% and results in
slightly smaller code as well.

Before:
   struct tvec_base: size: 8256, cachelines: 129

   text	   data	    bss	    dec	    hex	filename
  17698	  13297	   8256	  39251	   9953	../build/kernel/time/timer.o

After:
  struct tvec_base: 4160, cachelines: 65

   text	   data	    bss	    dec	    hex	filename
  17491	   9201	   4160	  30852	   7884	../build/kernel/time/timer.o

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224511.854731214@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h |  6 ++---
 kernel/time/timer.c   | 64 ++++++++++++++++++++++-----------------------------
 2 files changed, 30 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index fbb80e0030bf..064ee24d3f38 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -14,7 +14,7 @@ struct timer_list {
 	 * All fields that change during normal runtime grouped to the
 	 * same cacheline
 	 */
-	struct list_head entry;
+	struct hlist_node entry;
 	unsigned long expires;
 	struct tvec_base *base;
 
@@ -71,7 +71,7 @@ extern struct tvec_base boot_tvec_bases;
 #define TIMER_FLAG_MASK			0x3LU
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
-		.entry = { .prev = TIMER_ENTRY_STATIC },	\
+		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
@@ -168,7 +168,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
  */
 static inline int timer_pending(const struct timer_list * timer)
 {
-	return timer->entry.next != NULL;
+	return timer->entry.pprev != NULL;
 }
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e212df24ad3f..3a5e0c840884 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -70,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64);
 #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
 
 struct tvec {
-	struct list_head vec[TVN_SIZE];
+	struct hlist_head vec[TVN_SIZE];
 };
 
 struct tvec_root {
-	struct list_head vec[TVR_SIZE];
+	struct hlist_head vec[TVR_SIZE];
 };
 
 struct tvec_base {
@@ -356,7 +356,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
 	unsigned long idx = expires - base->timer_jiffies;
-	struct list_head *vec;
+	struct hlist_head *vec;
 
 	if (idx < TVR_SIZE) {
 		int i = expires & TVR_MASK;
@@ -390,7 +390,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 		vec = base->tv5.vec + i;
 	}
 
-	list_add(&timer->entry, vec);
+	hlist_add_head(&timer->entry, vec);
 }
 
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
@@ -504,8 +504,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
 		 * statically initialized. We just make sure that it
 		 * is tracked in the object tracker.
 		 */
-		if (timer->entry.next == NULL &&
-		    timer->entry.prev == TIMER_ENTRY_STATIC) {
+		if (timer->entry.pprev == NULL &&
+		    timer->entry.next == TIMER_ENTRY_STATIC) {
 			debug_object_init(timer, &timer_debug_descr);
 			debug_object_activate(timer, &timer_debug_descr);
 			return 0;
@@ -551,7 +551,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+		if (timer->entry.next == TIMER_ENTRY_STATIC) {
 			/*
 			 * This is not really a fixup. The timer was
 			 * statically initialized. We just make sure that it
@@ -655,7 +655,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
 {
 	struct tvec_base *base = raw_cpu_read(tvec_bases);
 
-	timer->entry.next = NULL;
+	timer->entry.pprev = NULL;
 	timer->base = (void *)((unsigned long)base | flags);
 	timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
@@ -687,14 +687,14 @@ EXPORT_SYMBOL(init_timer_key);
 
 static inline void detach_timer(struct timer_list *timer, bool clear_pending)
 {
-	struct list_head *entry = &timer->entry;
+	struct hlist_node *entry = &timer->entry;
 
 	debug_deactivate(timer);
 
-	__list_del(entry->prev, entry->next);
+	__hlist_del(entry);
 	if (clear_pending)
-		entry->next = NULL;
-	entry->prev = LIST_POISON2;
+		entry->pprev = NULL;
+	entry->next = LIST_POISON2;
 }
 
 static inline void
@@ -1095,16 +1095,17 @@ EXPORT_SYMBOL(del_timer_sync);
 static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 {
 	/* cascade all the timers from tv up one level */
-	struct timer_list *timer, *tmp;
-	struct list_head tv_list;
+	struct timer_list *timer;
+	struct hlist_node *tmp;
+	struct hlist_head tv_list;
 
-	list_replace_init(tv->vec + index, &tv_list);
+	hlist_move_list(tv->vec + index, &tv_list);
 
 	/*
 	 * We are removing _all_ timers from the list, so we
 	 * don't have to detach them individually.
 	 */
-	list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
+	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
 		BUG_ON(tbase_get_base(timer->base) != base);
 		/* No accounting, while moving them */
 		__internal_add_timer(base, timer);
@@ -1172,8 +1173,8 @@ static inline void __run_timers(struct tvec_base *base)
 	spin_lock_irq(&base->lock);
 
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
-		struct list_head work_list;
-		struct list_head *head = &work_list;
+		struct hlist_head work_list;
+		struct hlist_head *head = &work_list;
 		int index;
 
 		if (!base->all_timers) {
@@ -1192,13 +1193,13 @@ static inline void __run_timers(struct tvec_base *base)
 					!cascade(base, &base->tv4, INDEX(2)))
 			cascade(base, &base->tv5, INDEX(3));
 		++base->timer_jiffies;
-		list_replace_init(base->tv1.vec + index, head);
-		while (!list_empty(head)) {
+		hlist_move_list(base->tv1.vec + index, head);
+		while (!hlist_empty(head)) {
 			void (*fn)(unsigned long);
 			unsigned long data;
 			bool irqsafe;
 
-			timer = list_first_entry(head, struct timer_list,entry);
+			timer = hlist_entry(head->first, struct timer_list, entry);
 			fn = timer->function;
 			data = timer->data;
 			irqsafe = tbase_get_irqsafe(timer->base);
@@ -1240,7 +1241,7 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
 	/* Look for timer events in tv1. */
 	index = slot = timer_jiffies & TVR_MASK;
 	do {
-		list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
 			if (tbase_get_deferrable(nte->base))
 				continue;
 
@@ -1271,7 +1272,7 @@ cascade:
 
 		index = slot = timer_jiffies & TVN_MASK;
 		do {
-			list_for_each_entry(nte, varp->vec + slot, entry) {
+			hlist_for_each_entry(nte, varp->vec + slot, entry) {
 				if (tbase_get_deferrable(nte->base))
 					continue;
 
@@ -1530,12 +1531,12 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
+static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
 {
 	struct timer_list *timer;
 
-	while (!list_empty(head)) {
-		timer = list_first_entry(head, struct timer_list, entry);
+	while (!hlist_empty(head)) {
+		timer = hlist_entry(head->first, struct timer_list, entry);
 		/* We ignore the accounting on the dying cpu */
 		detach_timer(timer, false);
 		timer_set_base(timer, new_base);
@@ -1603,23 +1604,12 @@ static inline void timer_register_cpu_notifier(void) { }
 
 static void __init init_timer_cpu(struct tvec_base *base, int cpu)
 {
-	int j;
-
 	BUG_ON(base != tbase_get_base(base));
 
 	base->cpu = cpu;
 	per_cpu(tvec_bases, cpu) = base;
 	spin_lock_init(&base->lock);
 
-	for (j = 0; j < TVN_SIZE; j++) {
-		INIT_LIST_HEAD(base->tv5.vec + j);
-		INIT_LIST_HEAD(base->tv4.vec + j);
-		INIT_LIST_HEAD(base->tv3.vec + j);
-		INIT_LIST_HEAD(base->tv2.vec + j);
-	}
-	for (j = 0; j < TVR_SIZE; j++)
-		INIT_LIST_HEAD(base->tv1.vec + j);
-
 	base->timer_jiffies = jiffies;
 	base->next_timer = base->timer_jiffies;
 }
-- 
cgit v1.2.3


From 0eeda71bc30d74f66f8231f45621d5ace3419186 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 26 May 2015 22:50:29 +0000
Subject: timer: Replace timer base by a cpu index

Instead of storing a pointer to the per cpu tvec_base we can simply
cache a CPU index in the timer_list and use that to get hold of the
correct per cpu tvec_base. This is only used in lock_timer_base() and
the slightly larger code is peanuts versus the spinlock operation and
the d-cache foot print of the timer wheel.

Aside of that this allows to get rid of following nuisances:

 - boot_tvec_base

   That statically allocated 4k bss data is just kept around so the
   timer has a home when it gets statically initialized. It serves no
   other purpose.

   With the CPU index we assign the timer to CPU0 at static
   initialization time and therefor can avoid the whole boot_tvec_base
   dance.  That also simplifies the init code, which just can use the
   per cpu base.

   Before:
     text	   data	    bss	    dec	    hex	filename
    17491	   9201	   4160	  30852	   7884	../build/kernel/time/timer.o
   After:
     text	   data	    bss	    dec	    hex	filename
    17440	   9193	      0	  26633	   6809	../build/kernel/time/timer.o

 - Overloading the base pointer with various flags

   The CPU index has enough space to hold the flags (deferrable,
   irqsafe) so we can get rid of the extra masking and bit fiddling
   with the base pointer.

As a benefit we reduce the size of struct timer_list on 64 bit
machines. 4 - 8 bytes, a size reduction up to 15% per struct timer_list,
which is a real win as we have tons of them embedded in other structs.

This changes also the newly added deferrable printout of the timer
start trace point to capture and print all timer->flags, which allows
us to decode the target cpu of the timer as well.

We might have used bitfields for this, but that would change the
static initializers and the init function for no value to accomodate
big endian bitfields.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Badhri Jagan Sridharan <Badhri@google.com>
Link: http://lkml.kernel.org/r/20150526224511.950084301@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h        |  38 ++++++-------
 include/trace/events/timer.h |  13 ++---
 kernel/time/timer.c          | 127 ++++++++++++-------------------------------
 3 files changed, 58 insertions(+), 120 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 064ee24d3f38..4a0d52bc2073 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -14,27 +14,23 @@ struct timer_list {
 	 * All fields that change during normal runtime grouped to the
 	 * same cacheline
 	 */
-	struct hlist_node entry;
-	unsigned long expires;
-	struct tvec_base *base;
-
-	void (*function)(unsigned long);
-	unsigned long data;
-
-	int slack;
+	struct hlist_node	entry;
+	unsigned long		expires;
+	void			(*function)(unsigned long);
+	unsigned long		data;
+	u32			flags;
+	int			slack;
 
 #ifdef CONFIG_TIMER_STATS
-	int start_pid;
-	void *start_site;
-	char start_comm[16];
+	int			start_pid;
+	void			*start_site;
+	char			start_comm[16];
 #endif
 #ifdef CONFIG_LOCKDEP
-	struct lockdep_map lockdep_map;
+	struct lockdep_map	lockdep_map;
 #endif
 };
 
-extern struct tvec_base boot_tvec_bases;
-
 #ifdef CONFIG_LOCKDEP
 /*
  * NB: because we have to copy the lockdep_map, setting the lockdep_map key
@@ -49,9 +45,6 @@ extern struct tvec_base boot_tvec_bases;
 #endif
 
 /*
- * Note that all tvec_bases are at least 4 byte aligned and lower two bits
- * of base in timer_list is guaranteed to be zero. Use them for flags.
- *
  * A deferrable timer will work normally when the system is busy, but
  * will not cause a CPU to come out of idle just to service it; instead,
  * the timer will be serviced when the CPU eventually wakes up with a
@@ -65,17 +58,18 @@ extern struct tvec_base boot_tvec_bases;
  * workqueue locking issues. It's not meant for executing random crap
  * with interrupts disabled. Abuse is monitored!
  */
-#define TIMER_DEFERRABLE		0x1LU
-#define TIMER_IRQSAFE			0x2LU
-
-#define TIMER_FLAG_MASK			0x3LU
+#define TIMER_CPUMASK		0x0007FFFF
+#define TIMER_MIGRATING		0x00080000
+#define TIMER_BASEMASK		(TIMER_CPUMASK | TIMER_MIGRATING)
+#define TIMER_DEFERRABLE	0x00100000
+#define TIMER_IRQSAFE		0x00200000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \
+		.flags = (_flags),				\
 		.slack = -1,					\
 		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
 			__FILE__ ":" __stringify(__LINE__))	\
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index d7abef1fe6e0..073b9ac245ba 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -45,16 +45,16 @@ TRACE_EVENT(timer_start,
 
 	TP_PROTO(struct timer_list *timer,
 		unsigned long expires,
-		unsigned int deferrable),
+		unsigned int flags),
 
-	TP_ARGS(timer, expires, deferrable),
+	TP_ARGS(timer, expires, flags),
 
 	TP_STRUCT__entry(
 		__field( void *,	timer		)
 		__field( void *,	function	)
 		__field( unsigned long,	expires		)
 		__field( unsigned long,	now		)
-		__field( unsigned int,	deferrable	)
+		__field( unsigned int,	flags		)
 	),
 
 	TP_fast_assign(
@@ -62,13 +62,12 @@ TRACE_EVENT(timer_start,
 		__entry->function	= timer->function;
 		__entry->expires	= expires;
 		__entry->now		= jiffies;
-		__entry->deferrable     = deferrable;
+		__entry->flags		= flags;
 	),
 
-	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] defer=%c",
+	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] flags=0x%08x",
 		  __entry->timer, __entry->function, __entry->expires,
-		  (long)__entry->expires - __entry->now,
-		  __entry->deferrable > 0 ? 'y':'n')
+		  (long)__entry->expires - __entry->now, __entry->flags)
 );
 
 /**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3a5e0c840884..1540af9f62eb 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -92,43 +92,8 @@ struct tvec_base {
 	struct tvec tv5;
 } ____cacheline_aligned;
 
-/*
- * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
- * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
- * pointer to per-cpu entries because we don't know where we'll map the section,
- * even for the boot cpu.
- *
- * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
- * rest of them.
- */
-struct tvec_base boot_tvec_bases;
-EXPORT_SYMBOL(boot_tvec_bases);
-
-static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
-
-/* Functions below help us manage 'deferrable' flag */
-static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
-{
-	return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
-}
-
-static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
-{
-	return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
-}
-
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
-{
-	return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
-}
-
-static inline void
-timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
-{
-	unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
 
-	timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
-}
+static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
 
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
 		bool force_up)
@@ -403,7 +368,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	/*
 	 * Update base->active_timers and base->next_timer
 	 */
-	if (!tbase_get_deferrable(timer->base)) {
+	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		if (!base->active_timers++ ||
 		    time_before(timer->expires, base->next_timer))
 			base->next_timer = timer->expires;
@@ -422,7 +387,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(base->cpu))
+	if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu))
 		wake_up_nohz_cpu(base->cpu);
 }
 
@@ -443,7 +408,7 @@ static void timer_stats_account_timer(struct timer_list *timer)
 
 	if (likely(!timer->start_site))
 		return;
-	if (unlikely(tbase_get_deferrable(timer->base)))
+	if (unlikely(timer->flags & TIMER_DEFERRABLE))
 		flag |= TIMER_STATS_FLAG_DEFERRABLE;
 
 	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
@@ -636,7 +601,7 @@ static inline void
 debug_activate(struct timer_list *timer, unsigned long expires)
 {
 	debug_timer_activate(timer);
-	trace_timer_start(timer, expires, tbase_get_deferrable(timer->base));
+	trace_timer_start(timer, expires, timer->flags);
 }
 
 static inline void debug_deactivate(struct timer_list *timer)
@@ -653,10 +618,8 @@ static inline void debug_assert_init(struct timer_list *timer)
 static void do_init_timer(struct timer_list *timer, unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
-	struct tvec_base *base = raw_cpu_read(tvec_bases);
-
 	timer->entry.pprev = NULL;
-	timer->base = (void *)((unsigned long)base | flags);
+	timer->flags = flags | raw_smp_processor_id();
 	timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
 	timer->start_site = NULL;
@@ -701,7 +664,7 @@ static inline void
 detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
 {
 	detach_timer(timer, true);
-	if (!tbase_get_deferrable(timer->base))
+	if (!(timer->flags & TIMER_DEFERRABLE))
 		base->active_timers--;
 	base->all_timers--;
 }
@@ -713,7 +676,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
 		return 0;
 
 	detach_timer(timer, clear_pending);
-	if (!tbase_get_deferrable(timer->base)) {
+	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		base->active_timers--;
 		if (timer->expires == base->next_timer)
 			base->next_timer = base->timer_jiffies;
@@ -732,24 +695,22 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
  * So __run_timers/migrate_timers can safely modify all timers which could
  * be found on ->tvX lists.
  *
- * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * When the timer's base is locked and removed from the list, the
+ * TIMER_MIGRATING flag is set, FIXME
  */
 static struct tvec_base *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
 	__acquires(timer->base->lock)
 {
-	struct tvec_base *base;
-
 	for (;;) {
-		struct tvec_base *prelock_base = timer->base;
-		base = tbase_get_base(prelock_base);
-		if (likely(base != NULL)) {
+		u32 tf = timer->flags;
+		struct tvec_base *base;
+
+		if (!(tf & TIMER_MIGRATING)) {
+			base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
 			spin_lock_irqsave(&base->lock, *flags);
-			if (likely(prelock_base == timer->base))
+			if (timer->flags == tf)
 				return base;
-			/* The timer has migrated to another CPU */
 			spin_unlock_irqrestore(&base->lock, *flags);
 		}
 		cpu_relax();
@@ -776,7 +737,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 	debug_activate(timer, expires);
 
 	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu(tvec_bases, cpu);
+	new_base = per_cpu_ptr(&tvec_bases, cpu);
 
 	if (base != new_base) {
 		/*
@@ -788,11 +749,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 		 */
 		if (likely(base->running_timer != timer)) {
 			/* See the comment in lock_timer_base() */
-			timer_set_base(timer, NULL);
+			timer->flags |= TIMER_MIGRATING;
+
 			spin_unlock(&base->lock);
 			base = new_base;
 			spin_lock(&base->lock);
-			timer_set_base(timer, base);
+			timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		}
 	}
 
@@ -954,13 +916,13 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct tvec_base *base = per_cpu(tvec_bases, cpu);
+	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(timer_pending(timer) || !timer->function);
 	spin_lock_irqsave(&base->lock, flags);
-	timer_set_base(timer, base);
+	timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
 	spin_unlock_irqrestore(&base->lock, flags);
@@ -1025,8 +987,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
 #ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
-
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -1081,7 +1041,7 @@ int del_timer_sync(struct timer_list *timer)
 	 * don't use it in hardirq context, because it
 	 * could lead to deadlock.
 	 */
-	WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
+	WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
@@ -1106,7 +1066,6 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 	 * don't have to detach them individually.
 	 */
 	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
-		BUG_ON(tbase_get_base(timer->base) != base);
 		/* No accounting, while moving them */
 		__internal_add_timer(base, timer);
 	}
@@ -1202,7 +1161,7 @@ static inline void __run_timers(struct tvec_base *base)
 			timer = hlist_entry(head->first, struct timer_list, entry);
 			fn = timer->function;
 			data = timer->data;
-			irqsafe = tbase_get_irqsafe(timer->base);
+			irqsafe = timer->flags & TIMER_IRQSAFE;
 
 			timer_stats_account_timer(timer);
 
@@ -1242,7 +1201,7 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
 	index = slot = timer_jiffies & TVR_MASK;
 	do {
 		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
-			if (tbase_get_deferrable(nte->base))
+			if (nte->flags & TIMER_DEFERRABLE)
 				continue;
 
 			found = 1;
@@ -1273,7 +1232,7 @@ cascade:
 		index = slot = timer_jiffies & TVN_MASK;
 		do {
 			hlist_for_each_entry(nte, varp->vec + slot, entry) {
-				if (tbase_get_deferrable(nte->base))
+				if (nte->flags & TIMER_DEFERRABLE)
 					continue;
 
 				found = 1;
@@ -1343,7 +1302,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct tvec_base *base = __this_cpu_read(tvec_bases);
+	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
 
@@ -1395,7 +1354,7 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct tvec_base *base = __this_cpu_read(tvec_bases);
+	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
@@ -1534,12 +1493,13 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
 {
 	struct timer_list *timer;
+	int cpu = new_base->cpu;
 
 	while (!hlist_empty(head)) {
 		timer = hlist_entry(head->first, struct timer_list, entry);
 		/* We ignore the accounting on the dying cpu */
 		detach_timer(timer, false);
-		timer_set_base(timer, new_base);
+		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		internal_add_timer(new_base, timer);
 	}
 }
@@ -1551,8 +1511,8 @@ static void migrate_timers(int cpu)
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu(tvec_bases, cpu);
-	new_base = get_cpu_var(tvec_bases);
+	old_base = per_cpu_ptr(&tvec_bases, cpu);
+	new_base = this_cpu_ptr(&tvec_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
@@ -1576,7 +1536,6 @@ static void migrate_timers(int cpu)
 
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
-	put_cpu_var(tvec_bases);
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1602,12 +1561,11 @@ static inline void timer_register_cpu_notifier(void)
 static inline void timer_register_cpu_notifier(void) { }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+static void __init init_timer_cpu(int cpu)
 {
-	BUG_ON(base != tbase_get_base(base));
+	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
 
 	base->cpu = cpu;
-	per_cpu(tvec_bases, cpu) = base;
 	spin_lock_init(&base->lock);
 
 	base->timer_jiffies = jiffies;
@@ -1616,27 +1574,14 @@ static void __init init_timer_cpu(struct tvec_base *base, int cpu)
 
 static void __init init_timer_cpus(void)
 {
-	struct tvec_base *base;
-	int local_cpu = smp_processor_id();
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		if (cpu == local_cpu)
-			base = &boot_tvec_bases;
-#ifdef CONFIG_SMP
-		else
-			base = per_cpu_ptr(&__tvec_bases, cpu);
-#endif
-
-		init_timer_cpu(base, cpu);
-	}
+	for_each_possible_cpu(cpu)
+		init_timer_cpu(cpu);
 }
 
 void __init init_timers(void)
 {
-	/* ensure there are enough low bits for flags in timer->base pointer */
-	BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
-
 	init_timer_cpus();
 	init_timer_stats();
 	timer_register_cpu_notifier();
-- 
cgit v1.2.3


From c74441a17eb975b604e339ca6c11b9ab9aaca11f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 26 May 2015 22:50:31 +0000
Subject: timer: Stats: Simplify the flags handling

Simplify the handling of the flag storage for the timer statistics. No
intermediate storage anymore. Just hand over the flags field.

I left the printout of 'deferrable' for now because changing this
would be an ABI update and I have no idea how strong people feel about
that. OTOH, I wonder whether we should kill the whole timer stats
stuff because all of that information can be retrieved via ftrace/perf
as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224512.046626248@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h     |  5 +----
 kernel/time/timer.c       |  7 ++-----
 kernel/time/timer_stats.c | 10 +++++-----
 3 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 4a0d52bc2073..ff0689b6e297 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -188,13 +188,10 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz);
 
 extern int timer_stats_active;
 
-#define TIMER_STATS_FLAG_DEFERRABLE	0x1
-
 extern void init_timer_stats(void);
 
 extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-				     void *timerf, char *comm,
-				     unsigned int timer_flag);
+				     void *timerf, char *comm, u32 flags);
 
 extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
 					       void *addr);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1540af9f62eb..3398d93c74a7 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -404,15 +404,12 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
 
 static void timer_stats_account_timer(struct timer_list *timer)
 {
-	unsigned int flag = 0;
-
 	if (likely(!timer->start_site))
 		return;
-	if (unlikely(timer->flags & TIMER_DEFERRABLE))
-		flag |= TIMER_STATS_FLAG_DEFERRABLE;
 
 	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
-				 timer->function, timer->start_comm, flag);
+				 timer->function, timer->start_comm,
+				 timer->flags);
 }
 
 #else
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1fb08f21302e..1adecb4b87c8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,7 +68,7 @@ struct entry {
 	 * Number of timeout events:
 	 */
 	unsigned long		count;
-	unsigned int		timer_flag;
+	u32			flags;
 
 	/*
 	 * We save the command-line string to preserve
@@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
  * @startf:	pointer to the function which did the timer setup
  * @timerf:	pointer to the timer callback function of the timer
  * @comm:	name of the process which set up the timer
+ * @tflags:	The flags field of the timer
  *
  * When the timer is already registered, then the event counter is
  * incremented. Otherwise the timer is registered in a free slot.
  */
 void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-			      void *timerf, char *comm,
-			      unsigned int timer_flag)
+			      void *timerf, char *comm, u32 tflags)
 {
 	/*
 	 * It doesn't matter which lock we take:
@@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	input.start_func = startf;
 	input.expire_func = timerf;
 	input.pid = pid;
-	input.timer_flag = timer_flag;
+	input.flags = tflags;
 
 	raw_spin_lock_irqsave(lock, flags);
 	if (!timer_stats_active)
@@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v)
 
 	for (i = 0; i < nr_entries; i++) {
 		entry = entries + i;
-		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+		if (entry->flags & TIMER_DEFERRABLE) {
 			seq_printf(m, "%4luD, %5d %-16s ",
 				entry->count, entry->pid, entry->comm);
 		} else {
-- 
cgit v1.2.3


From bc7a34b8b9ebfb0f4b8a35a72a0b134fd6c5ef50 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 26 May 2015 22:50:33 +0000
Subject: timer: Reduce timer migration overhead if disabled

Eric reported that the timer_migration sysctl is not really nice
performance wise as it needs to check at every timer insertion whether
the feature is enabled or not. Further the check does not live in the
timer code, so we have an extra function call which checks an extra
cache line to figure out that it is disabled.

We can do better and store that information in the per cpu (hr)timer
bases. I pondered to use a static key, but that's a nightmare to
update from the nohz code and the timer base cache line is hot anyway
when we select a timer base.

The old logic enabled the timer migration unconditionally if
CONFIG_NO_HZ was set even if nohz was disabled on the kernel command
line.

With this modification, we start off with migration disabled. The user
visible sysctl is still set to enabled. If the kernel switches to NOHZ
migration is enabled, if the user did not disable it via the sysctl
prior to the switch. If nohz=off is on the kernel command line,
migration stays disabled no matter what.

Before:
  47.76%  hog       [.] main
  14.84%  [kernel]  [k] _raw_spin_lock_irqsave
   9.55%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.71%  [kernel]  [k] mod_timer
   6.24%  [kernel]  [k] lock_timer_base.isra.38
   3.76%  [kernel]  [k] detach_if_pending
   3.71%  [kernel]  [k] del_timer
   2.50%  [kernel]  [k] internal_add_timer
   1.51%  [kernel]  [k] get_nohz_timer_target
   1.28%  [kernel]  [k] __internal_add_timer
   0.78%  [kernel]  [k] timerfn
   0.48%  [kernel]  [k] wake_up_nohz_cpu

After:
  48.10%  hog       [.] main
  15.25%  [kernel]  [k] _raw_spin_lock_irqsave
   9.76%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.50%  [kernel]  [k] mod_timer
   6.44%  [kernel]  [k] lock_timer_base.isra.38
   3.87%  [kernel]  [k] detach_if_pending
   3.80%  [kernel]  [k] del_timer
   2.67%  [kernel]  [k] internal_add_timer
   1.33%  [kernel]  [k] __internal_add_timer
   0.73%  [kernel]  [k] timerfn
   0.54%  [kernel]  [k] wake_up_nohz_cpu


Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h      |  2 ++
 include/linux/sched.h        |  6 +----
 include/linux/sched/sysctl.h | 12 ---------
 include/linux/timer.h        |  9 +++++++
 kernel/rcu/tree_plugin.h     |  2 --
 kernel/sched/core.c          |  9 +++----
 kernel/sysctl.c              | 18 +++++++-------
 kernel/time/hrtimer.c        | 35 ++++++++++++++++++++------
 kernel/time/tick-internal.h  | 14 +++++++++++
 kernel/time/tick-sched.c     | 25 ++++++++++---------
 kernel/time/timer.c          | 59 ++++++++++++++++++++++++++++++++++++++++----
 kernel/time/timer_list.c     |  2 --
 12 files changed, 133 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 5db055821ef3..69551020bb97 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -163,6 +163,7 @@ enum  hrtimer_base_type {
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
+ * @migration_enabled:	The migration of hrtimers to other cpus is enabled
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
  * @next_timer:		Pointer to the first expiring timer
@@ -186,6 +187,7 @@ struct hrtimer_cpu_base {
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
+	bool				migration_enabled;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			in_hrtirq	: 1,
 					hres_active	: 1,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734..d7151460b0cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -335,14 +335,10 @@ extern int runqueue_is_locked(int cpu);
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
-extern int get_nohz_timer_target(int pinned);
+extern int get_nohz_timer_target(void);
 #else
 static inline void nohz_balance_enter_idle(int cpu) { }
 static inline void set_cpu_sd_state_idle(void) { }
-static inline int get_nohz_timer_target(int pinned)
-{
-	return smp_processor_id();
-}
 #endif
 
 /*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 596a0e007c62..c9e4731cf10b 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_timer_migration;
 extern unsigned int sysctl_sched_shares_window;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
-#ifdef CONFIG_SCHED_DEBUG
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-	return sysctl_timer_migration;
-}
-#else
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-	return 1;
-}
-#endif
 
 /*
  *  control realtime throttling:
diff --git a/include/linux/timer.h b/include/linux/timer.h
index ff0689b6e297..61aa61dc410c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -238,6 +238,15 @@ extern void run_local_timers(void);
 struct hrtimer;
 extern enum hrtimer_restart it_real_fn(struct hrtimer *);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#include <linux/sysctl.h>
+
+extern unsigned int sysctl_timer_migration;
+int timer_migration_handler(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp,
+			    loff_t *ppos);
+#endif
+
 unsigned long __round_jiffies(unsigned long j, int cpu);
 unsigned long __round_jiffies_relative(unsigned long j, int cpu);
 unsigned long round_jiffies(unsigned long j);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0ef80a0bbabb..d72fa24f2312 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1432,8 +1432,6 @@ module_param(rcu_idle_gp_delay, int, 0644);
 static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
 module_param(rcu_idle_lazy_gp_delay, int, 0644);
 
-extern int tick_nohz_active;
-
 /*
  * Try to advance callbacks for all flavors of RCU on the current CPU, but
  * only if it has been awhile since the last time we did so.  Afterwards,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ecb7c4216350..e9f25ce70c77 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -572,13 +572,12 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
 {
-	int cpu = smp_processor_id();
-	int i;
+	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+	if (!idle_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
@@ -7050,8 +7049,6 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
-const_debug unsigned int sysctl_timer_migration = 1;
-
 int in_sched_functions(unsigned long addr)
 {
 	return in_lock_functions(addr) ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2082b1a88fb9..b13e9d2de302 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "timer_migration",
-		.data		= &sysctl_timer_migration,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
-	},
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
 	{
@@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+	{
+		.procname	= "timer_migration",
+		.data		= &sysctl_timer_migration,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= timer_migration_handler,
+	},
+#endif
 	{ }
 };
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f026413de4d6..6115f4df119b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -177,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
 #endif
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+					 int pinned)
+{
+	if (pinned || !base->migration_enabled)
+		return this_cpu_ptr(&hrtimer_bases);
+	return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+}
+#else
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+					 int pinned)
+{
+	return this_cpu_ptr(&hrtimer_bases);
+}
+#endif
+
 /*
  * Switch the timer base to the current CPU when possible.
  */
@@ -184,14 +202,13 @@ static inline struct hrtimer_clock_base *
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		    int pinned)
 {
+	struct hrtimer_cpu_base *new_cpu_base, *this_base;
 	struct hrtimer_clock_base *new_base;
-	struct hrtimer_cpu_base *new_cpu_base;
-	int this_cpu = smp_processor_id();
-	int cpu = get_nohz_timer_target(pinned);
 	int basenum = base->index;
 
+	this_base = this_cpu_ptr(&hrtimer_bases);
+	new_cpu_base = get_target_base(this_base, pinned);
 again:
-	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
 	new_base = &new_cpu_base->clock_base[basenum];
 
 	if (base != new_base) {
@@ -212,17 +229,19 @@ again:
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
-		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-			cpu = this_cpu;
+		if (new_cpu_base != this_base &&
+		    hrtimer_check_target(timer, new_base)) {
 			raw_spin_unlock(&new_base->cpu_base->lock);
 			raw_spin_lock(&base->cpu_base->lock);
+			new_cpu_base = this_base;
 			timer->base = base;
 			goto again;
 		}
 		timer->base = new_base;
 	} else {
-		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-			cpu = this_cpu;
+		if (new_cpu_base != this_base &&
+		    hrtimer_check_target(timer, new_base)) {
+			new_cpu_base = this_base;
 			goto again;
 		}
 	}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index ec2208aabdd1..2edde84744df 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -149,4 +149,18 @@ extern void tick_nohz_init(void);
 static inline void tick_nohz_init(void) { }
 #endif
 
+#ifdef CONFIG_NO_HZ_COMMON
+extern unsigned long tick_nohz_active;
+#else
+#define tick_nohz_active (0)
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void timers_update_migration(void);
+#else
+static inline void timers_update_migration(void) { }
+#endif
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 812f7a3b9898..b1cb01699355 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -399,7 +399,7 @@ void __init tick_nohz_init(void)
  * NO HZ enabled ?
  */
 static int tick_nohz_enabled __read_mostly  = 1;
-int tick_nohz_active  __read_mostly;
+unsigned long tick_nohz_active  __read_mostly;
 /*
  * Enable / Disable tickless mode
  */
@@ -956,6 +956,16 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 }
 
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
+{
+	if (!tick_nohz_enabled)
+		return;
+	ts->nohz_mode = mode;
+	/* One update is enough */
+	if (!test_and_set_bit(0, &tick_nohz_active))
+		timers_update_migration();
+}
+
 /**
  * tick_nohz_switch_to_nohz - switch to nohz mode
  */
@@ -970,9 +980,6 @@ static void tick_nohz_switch_to_nohz(void)
 	if (tick_switch_to_oneshot(tick_nohz_handler))
 		return;
 
-	tick_nohz_active = 1;
-	ts->nohz_mode = NOHZ_MODE_LOWRES;
-
 	/*
 	 * Recycle the hrtimer in ts, so we can share the
 	 * hrtimer_forward with the highres code.
@@ -984,6 +991,7 @@ static void tick_nohz_switch_to_nohz(void)
 	hrtimer_forward_now(&ts->sched_timer, tick_period);
 	hrtimer_set_expires(&ts->sched_timer, next);
 	tick_program_event(next, 1);
+	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
 
 /*
@@ -1035,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void)
 
 static inline void tick_nohz_switch_to_nohz(void) { }
 static inline void tick_nohz_irq_enter(void) { }
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
@@ -1117,13 +1126,7 @@ void tick_setup_sched_timer(void)
 
 	hrtimer_forward(&ts->sched_timer, now, tick_period);
 	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
-
-#ifdef CONFIG_NO_HZ_COMMON
-	if (tick_nohz_enabled) {
-		ts->nohz_mode = NOHZ_MODE_HIGHRES;
-		tick_nohz_active = 1;
-	}
-#endif
+	tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
 }
 #endif /* HIGH_RES_TIMERS */
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3398d93c74a7..343142ed996a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -85,6 +85,7 @@ struct tvec_base {
 	unsigned long active_timers;
 	unsigned long all_timers;
 	int cpu;
+	bool migration_enabled;
 	struct tvec_root tv1;
 	struct tvec tv2;
 	struct tvec tv3;
@@ -95,6 +96,54 @@ struct tvec_base {
 
 static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
+
+void timers_update_migration(void)
+{
+	bool on = sysctl_timer_migration && tick_nohz_active;
+	unsigned int cpu;
+
+	/* Avoid the loop, if nothing to update */
+	if (this_cpu_read(tvec_bases.migration_enabled) == on)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(tvec_bases.migration_enabled, cpu) = on;
+		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+	}
+}
+
+int timer_migration_handler(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp,
+			    loff_t *ppos)
+{
+	static DEFINE_MUTEX(mutex);
+	int ret;
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write)
+		timers_update_migration();
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+						int pinned)
+{
+	if (pinned || !base->migration_enabled)
+		return this_cpu_ptr(&tvec_bases);
+	return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
+}
+#else
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+						int pinned)
+{
+	return this_cpu_ptr(&tvec_bases);
+}
+#endif
+
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
 		bool force_up)
 {
@@ -716,11 +765,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires,
-						bool pending_only, int pinned)
+	    bool pending_only, int pinned)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
-	int ret = 0 , cpu;
+	int ret = 0;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
@@ -733,8 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	debug_activate(timer, expires);
 
-	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu_ptr(&tvec_bases, cpu);
+	new_base = get_target_base(base, pinned);
 
 	if (base != new_base) {
 		/*
@@ -751,7 +799,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 			spin_unlock(&base->lock);
 			base = new_base;
 			spin_lock(&base->lock);
-			timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
+			timer->flags &= ~TIMER_BASEMASK;
+			timer->flags |= base->cpu;
 		}
 	}
 
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1327004429be..a4536e1e3e2a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -29,8 +29,6 @@ struct timer_list_iter {
 
 typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
 
-DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
-
 /*
  * This allows printing both to /proc/timer_list and
  * to the console (on SysRq-Q):
-- 
cgit v1.2.3


From 683be13a284720205228e29207ef11a1c3c322b9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 26 May 2015 22:50:35 +0000
Subject: timer: Minimize nohz off overhead

If nohz is disabled on the kernel command line the [hr]timer code
still calls wake_up_nohz_cpu() and tick_nohz_full_cpu(), a pretty
pointless exercise. Cache nohz_active in [hr]timer per cpu bases and
avoid the overhead.

Before:
  48.10%  hog       [.] main
  15.25%  [kernel]  [k] _raw_spin_lock_irqsave
   9.76%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.50%  [kernel]  [k] mod_timer
   6.44%  [kernel]  [k] lock_timer_base.isra.38
   3.87%  [kernel]  [k] detach_if_pending
   3.80%  [kernel]  [k] del_timer
   2.67%  [kernel]  [k] internal_add_timer
   1.33%  [kernel]  [k] __internal_add_timer
   0.73%  [kernel]  [k] timerfn
   0.54%  [kernel]  [k] wake_up_nohz_cpu

After:
  48.73%  hog       [.] main
  15.36%  [kernel]  [k] _raw_spin_lock_irqsave
   9.77%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.61%  [kernel]  [k] lock_timer_base.isra.38
   6.42%  [kernel]  [k] mod_timer
   3.90%  [kernel]  [k] detach_if_pending
   3.76%  [kernel]  [k] del_timer
   2.41%  [kernel]  [k] internal_add_timer
   1.39%  [kernel]  [k] __internal_add_timer
   0.76%  [kernel]  [k] timerfn

We probably should have a cached value for nohz full in the per cpu
bases as well to avoid the cpumask check. The base cache line is hot
already, the cpumask not necessarily.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224512.207378134@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h     |  2 ++
 kernel/time/hrtimer.c       |  3 ++-
 kernel/time/tick-internal.h |  4 ++--
 kernel/time/tick-sched.c    |  2 +-
 kernel/time/timer.c         | 16 ++++++++++++----
 5 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 69551020bb97..76dd4f0da5ca 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -164,6 +164,7 @@ enum  hrtimer_base_type {
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
  * @migration_enabled:	The migration of hrtimers to other cpus is enabled
+ * @nohz_active:	The nohz functionality is enabled
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
  * @next_timer:		Pointer to the first expiring timer
@@ -188,6 +189,7 @@ struct hrtimer_cpu_base {
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
 	bool				migration_enabled;
+	bool				nohz_active;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			in_hrtirq	: 1,
 					hres_active	: 1,
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 6115f4df119b..db5c9508ed95 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -994,7 +994,8 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * Kick to reschedule the next tick to handle the new timer
 		 * on dynticks target.
 		 */
-		wake_up_nohz_cpu(new_base->cpu_base->cpu);
+		if (new_base->cpu_base->nohz_active)
+			wake_up_nohz_cpu(new_base->cpu_base->cpu);
 	} else {
 		hrtimer_reprogram(timer, new_base);
 	}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 2edde84744df..966a5a6fdd0a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -156,9 +156,9 @@ extern unsigned long tick_nohz_active;
 #endif
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void timers_update_migration(void);
+extern void timers_update_migration(bool update_nohz);
 #else
-static inline void timers_update_migration(void) { }
+static inline void timers_update_migration(bool update_nohz) { }
 #endif
 
 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b1cb01699355..c792429e98c6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -963,7 +963,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
 	ts->nohz_mode = mode;
 	/* One update is enough */
 	if (!test_and_set_bit(0, &tick_nohz_active))
-		timers_update_migration();
+		timers_update_migration(true);
 }
 
 /**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 343142ed996a..520499dd85af 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -86,6 +86,7 @@ struct tvec_base {
 	unsigned long all_timers;
 	int cpu;
 	bool migration_enabled;
+	bool nohz_active;
 	struct tvec_root tv1;
 	struct tvec tv2;
 	struct tvec tv3;
@@ -99,7 +100,7 @@ static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 unsigned int sysctl_timer_migration = 1;
 
-void timers_update_migration(void)
+void timers_update_migration(bool update_nohz)
 {
 	bool on = sysctl_timer_migration && tick_nohz_active;
 	unsigned int cpu;
@@ -111,6 +112,10 @@ void timers_update_migration(void)
 	for_each_possible_cpu(cpu) {
 		per_cpu(tvec_bases.migration_enabled, cpu) = on;
 		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+		if (!update_nohz)
+			continue;
+		per_cpu(tvec_bases.nohz_active, cpu) = true;
+		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
 	}
 }
 
@@ -124,7 +129,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
 	mutex_lock(&mutex);
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (!ret && write)
-		timers_update_migration();
+		timers_update_migration(false);
 	mutex_unlock(&mutex);
 	return ret;
 }
@@ -436,8 +441,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu))
-		wake_up_nohz_cpu(base->cpu);
+	if (base->nohz_active) {
+		if (!(timer->flags & TIMER_DEFERRABLE) ||
+		    tick_nohz_full_cpu(base->cpu))
+			wake_up_nohz_cpu(base->cpu);
+	}
 }
 
 #ifdef CONFIG_TIMER_STATS
-- 
cgit v1.2.3


From 1796dcce2daacc125f2d60afc3f631ca29e36684 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Sun, 3 May 2015 18:57:10 +0900
Subject: rtc: interface: Fix coding style violations

Fix issues reported by checkpatch:
  ERROR: open brace '{' following struct go on the same line
  ERROR: "foo* bar" should be "foo *bar"
Additionally adjust alignment of wrapped function arguments.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 include/linux/rtc.h | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 8dcf6825fa88..b0709f80dfb0 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -101,8 +101,7 @@ struct rtc_timer {
 /* flags */
 #define RTC_DEV_BUSY 0
 
-struct rtc_device
-{
+struct rtc_device {
 	struct device dev;
 	struct module *owner;
 
@@ -198,10 +197,10 @@ int rtc_register(rtc_task_t *task);
 int rtc_unregister(rtc_task_t *task);
 int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg);
 
-void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data);
-int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer,
-			ktime_t expires, ktime_t period);
-int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer);
+void rtc_timer_init(struct rtc_timer *timer, void (*f)(void *p), void *data);
+int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer,
+		    ktime_t expires, ktime_t period);
+int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer);
 void rtc_timer_do_work(struct work_struct *work);
 
 static inline bool is_leap_year(unsigned int year)
-- 
cgit v1.2.3


From 73744a64aab872703b851b9678a7f488b507eb81 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Sun, 3 May 2015 18:57:11 +0900
Subject: rtc: interface: Remove unused return value from rtc_timer_cancel()

The rtc_timer_cancel() always returns 0 and cannot fail (calls only
other void-returning functions).

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 drivers/rtc/interface.c | 4 +---
 include/linux/rtc.h     | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index 166fc60d8b55..a6b14c24d7e3 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -976,14 +976,12 @@ int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer,
  *
  * Kernel interface to cancel an rtc_timer
  */
-int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer)
+void rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer)
 {
-	int ret = 0;
 	mutex_lock(&rtc->ops_lock);
 	if (timer->enabled)
 		rtc_timer_remove(rtc, timer);
 	mutex_unlock(&rtc->ops_lock);
-	return ret;
 }
 
 
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index b0709f80dfb0..587017e7939c 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -200,7 +200,7 @@ int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg);
 void rtc_timer_init(struct rtc_timer *timer, void (*f)(void *p), void *data);
 int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer,
 		    ktime_t expires, ktime_t period);
-int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer);
+void rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer);
 void rtc_timer_do_work(struct work_struct *work);
 
 static inline bool is_leap_year(unsigned int year)
-- 
cgit v1.2.3


From d8d919879e9a645061a560a0a26abb9f3bca97df Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Fri, 3 Apr 2015 18:43:44 +0200
Subject: clk: add CLK_RECALC_NEW_RATES clock flag for Exynos cpu clock support

This flag is needed to fix the issue with wrong dividers being setup
by Common Clock Framework when using the new Exynos cpu clock support.

The issue happens because clk_core_set_rate_nolock()  calls
clk_calc_new_rates(clk, rate) before both pre/post clock notifiers have
a chance to run.  In case of Exynos cpu clock support pre/post clock
notifiers are registered for mout_apll clock which is a parent of armclk
cpu clock and dividers are modified in both pre and post clock notifier.
This results in wrong dividers values being later programmed by
clk_change_rate(top).  To workaround the problem CLK_RECALC_NEW_RATES
flag is added and it is set for mout_apll clock later so the correct
divider values are re-calculated after both pre and post clock notifiers
had run.

For example when using "performance" governor on Exynos4210 Origen board
the cpufreq-dt driver requests to change the frequency from 1000MHz to
1200MHz and after the change state of the relevant clocks is following:

Without use of CLK_GET_RATE_NOCACHE flag:

 fout_apll rate: 1200000000
         fout_apll_div_2 rate: 600000000
                 mout_clkout_cpu rate: 600000000
                         div_clkout_cpu rate: 600000000
                                 clkout_cpu rate: 600000000
         mout_apll rate: 1200000000
                 armclk rate: 1200000000
                 mout_hpm rate: 1200000000
                         div_copy rate: 300000000
                                 div_hpm rate: 300000000
                 mout_core rate: 1200000000
                         div_core rate: 1200000000
                                 div_core2 rate: 1200000000
                                         arm_clk_div_2 rate: 600000000
                                         div_corem0 rate: 300000000
                                         div_corem1 rate: 150000000
                                         div_periph rate: 300000000
                         div_atb rate: 300000000
                                 div_pclk_dbg rate: 150000000
                 sclk_apll rate: 1200000000
                         sclk_apll_div_2 rate: 600000000

With use of CLK_GET_RATE_NOCACHE flag:

 fout_apll rate: 1200000000
         fout_apll_div_2 rate: 600000000
                 mout_clkout_cpu rate: 600000000
                         div_clkout_cpu rate: 600000000
                                 clkout_cpu rate: 600000000
         mout_apll rate: 1200000000
                 armclk rate: 1200000000
                 mout_hpm rate: 1200000000
                         div_copy rate: 200000000
                                 div_hpm rate: 200000000
                 mout_core rate: 1200000000
                         div_core rate: 1200000000
                                 div_core2 rate: 1200000000
                                         arm_clk_div_2 rate: 600000000
                                         div_corem0 rate: 300000000
                                         div_corem1 rate: 150000000
                                         div_periph rate: 300000000
                         div_atb rate: 240000000
                                 div_pclk_dbg rate: 120000000
                 sclk_apll rate: 150000000
                         sclk_apll_div_2 rate: 75000000

Without this change cpufreq-dt driver showed ~10 mA larger energy
consumption when compared to cpufreq-exynos one when "performance"
cpufreq governor was used on Exynos4210 SoC based Origen board.

This issue was probably meant to be workarounded by use of
CLK_GET_RATE_NOCACHE and CLK_DIVIDER_READ_ONLY clock flags in
the original Exynos cpu clock patchset (in "[PATCH v12 6/6] clk:
samsung: remove unused clock aliases and update clock flags" patch)
but usage of these flags is not sufficient to fix the issue observed.

Cc: Thomas Abraham <thomas.ab@samsung.com>
Cc: Tomasz Figa <tomasz.figa@gmail.com>
Cc: Mike Turquette <mturquette@linaro.org>
Cc: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Michael Turquette <mturquette@baylibre.com>
---
 drivers/clk/clk.c            | 3 +++
 include/linux/clk-provider.h | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 5b0f41868b42..0c0124c4eeeb 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -1788,6 +1788,9 @@ static void clk_change_rate(struct clk_core *clk)
 	if (clk->notifier_count && old_rate != clk->rate)
 		__clk_notify(clk, POST_RATE_CHANGE, old_rate, clk->rate);
 
+	if (clk->flags & CLK_RECALC_NEW_RATES)
+		(void)clk_calc_new_rates(clk, clk->new_rate);
+
 	/*
 	 * Use safe iteration, as change_rate can actually swap parents
 	 * for certain clock types.
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index df695313f975..82f59ca8188a 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -31,6 +31,7 @@
 #define CLK_GET_RATE_NOCACHE	BIT(6) /* do not use the cached clk rate */
 #define CLK_SET_RATE_NO_REPARENT BIT(7) /* don't re-parent on rate change */
 #define CLK_GET_ACCURACY_NOCACHE BIT(8) /* do not use the cached clk accuracy */
+#define CLK_RECALC_NEW_RATES	BIT(9) /* recalc rates after notifications */
 
 struct clk_hw;
 struct clk_core;
-- 
cgit v1.2.3


From 1387fe7d292b66677dae31d25a8e3c953bf21748 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Tue, 12 May 2015 11:31:02 +0200
Subject: MIPS: BCM47xx: Extract all boardflags to new u32 fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For years we planned to get rid of old u16 fields, let's start doing it
with MIPS code. This process will take some time, it requires doing the
same in ssb/bcma and then switching all drivers to new fields. This will
be handled in separated patches submitted to appropriate trees.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Cc: linux-mips@linux-mips.org
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Patchwork: https://patchwork.linux-mips.org/patch/10026/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/bcm47xx/sprom.c | 3 +++
 include/linux/ssb/ssb.h   | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/mips/bcm47xx/sprom.c b/arch/mips/bcm47xx/sprom.c
index 68ebf2322f8b..404808388c03 100644
--- a/arch/mips/bcm47xx/sprom.c
+++ b/arch/mips/bcm47xx/sprom.c
@@ -201,6 +201,9 @@ static void bcm47xx_sprom_fill_auto(struct ssb_sprom *sprom,
 	bool fb = fallback;
 
 	ENTRY(0xfffffffe, u16, pre, "boardrev", board_rev, 0, true);
+	ENTRY(0xfffffffe, u32, pre, "boardflags", boardflags, 0, fb);
+	ENTRY(0xfffffff0, u32, pre, "boardflags2", boardflags2, 0, fb);
+	ENTRY(0xfffff800, u32, pre, "boardflags3", boardflags3, 0, fb);
 	ENTRY(0x00000002, u16, pre, "boardflags", boardflags_lo, 0, fb);
 	ENTRY(0xfffffffc, u16, pre, "boardtype", board_type, 0, true);
 	ENTRY(0xfffffffe, u16, pre, "boardnum", board_num, 0, fb);
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 4568a5cc9ab8..ee90e32a0607 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -88,11 +88,14 @@ struct ssb_sprom {
 	u32 ofdm5glpo;		/* 5.2GHz OFDM power offset */
 	u32 ofdm5gpo;		/* 5.3GHz OFDM power offset */
 	u32 ofdm5ghpo;		/* 5.8GHz OFDM power offset */
+	u32 boardflags;
+	u32 boardflags2;
+	u32 boardflags3;
+	/* TODO: Switch all drivers to new u32 fields and drop below ones */
 	u16 boardflags_lo;	/* Board flags (bits 0-15) */
 	u16 boardflags_hi;	/* Board flags (bits 16-31) */
 	u16 boardflags2_lo;	/* Board flags (bits 32-47) */
 	u16 boardflags2_hi;	/* Board flags (bits 48-63) */
-	/* TODO store board flags in a single u64 */
 
 	struct ssb_sprom_core_pwr_info core_pwr_info[4];
 
-- 
cgit v1.2.3


From 6e122ac0053d071976686dd04cdd60ea8039bb7a Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Tue, 12 May 2015 11:54:48 +0200
Subject: MIPS: BCM47xx: Extract info about et2 interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New devices may have more than 1 Ethernet core (device). We should
extract info about them to make it available to Ethernet drivers.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Cc: linux-mips@linux-mips.org
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: Hante Meuleman <meuleman@broadcom.com>
Cc: Ian Kent <raven@themaw.net>
Patchwork: https://patchwork.linux-mips.org/patch/10027/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/bcm47xx/sprom.c | 6 ++++++
 include/linux/ssb/ssb.h   | 3 +++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/arch/mips/bcm47xx/sprom.c b/arch/mips/bcm47xx/sprom.c
index 404808388c03..92a6c9dc21bd 100644
--- a/arch/mips/bcm47xx/sprom.c
+++ b/arch/mips/bcm47xx/sprom.c
@@ -531,6 +531,8 @@ static int mac_addr_used = 2;
 static void bcm47xx_fill_sprom_ethernet(struct ssb_sprom *sprom,
 					const char *prefix, bool fallback)
 {
+	bool fb = fallback;
+
 	nvram_read_macaddr(prefix, "et0macaddr", sprom->et0mac, fallback);
 	nvram_read_u8(prefix, NULL, "et0mdcport", &sprom->et0mdcport, 0,
 		      fallback);
@@ -543,6 +545,10 @@ static void bcm47xx_fill_sprom_ethernet(struct ssb_sprom *sprom,
 	nvram_read_u8(prefix, NULL, "et1phyaddr", &sprom->et1phyaddr, 0,
 		      fallback);
 
+	nvram_read_macaddr(prefix, "et2macaddr", sprom->et2mac, fb);
+	nvram_read_u8(prefix, NULL, "et2mdcport", &sprom->et2mdcport, 0, fb);
+	nvram_read_u8(prefix, NULL, "et2phyaddr", &sprom->et2phyaddr, 0, fb);
+
 	nvram_read_macaddr(prefix, "macaddr", sprom->il0mac, fallback);
 	nvram_read_macaddr(prefix, "il0macaddr", sprom->il0mac, fallback);
 
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index ee90e32a0607..c3d1a525bacc 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -29,10 +29,13 @@ struct ssb_sprom {
 	u8 il0mac[6] __aligned(sizeof(u16));	/* MAC address for 802.11b/g */
 	u8 et0mac[6] __aligned(sizeof(u16));	/* MAC address for Ethernet */
 	u8 et1mac[6] __aligned(sizeof(u16));	/* MAC address for 802.11a */
+	u8 et2mac[6] __aligned(sizeof(u16));	/* MAC address for extra Ethernet */
 	u8 et0phyaddr;		/* MII address for enet0 */
 	u8 et1phyaddr;		/* MII address for enet1 */
+	u8 et2phyaddr;		/* MII address for enet2 */
 	u8 et0mdcport;		/* MDIO for enet0 */
 	u8 et1mdcport;		/* MDIO for enet1 */
+	u8 et2mdcport;		/* MDIO for enet2 */
 	u16 dev_id;		/* Device ID overriding e.g. PCI ID */
 	u16 board_rev;		/* Board revision number from SPROM. */
 	u16 board_num;		/* Board number from SPROM. */
-- 
cgit v1.2.3


From 44e08e7099c8de226606cfc989b45d6fa27f507f Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@imgtec.com>
Date: Sun, 24 May 2015 16:11:31 +0100
Subject: MIPS/IRQCHIP: Move Ingenic SoC intc driver to drivers/irqchip

Move the driver for Ingenic SoC interrupt controllers into
drivers/irqchip where it belongs.

Signed-off-by: Paul Burton <paul.burton@imgtec.com>
Cc: Lars-Peter Clausen <lars@metafoo.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: linux-mips@linux-mips.org
Cc: linux-kernel@vger.kernel.org
Cc: Brian Norris <computersforpeace@gmail.com>
Patchwork: https://patchwork.linux-mips.org/patch/10147/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/jz4740/Makefile       |   2 +-
 arch/mips/jz4740/gpio.c         |   3 +-
 arch/mips/jz4740/irq.c          | 178 ----------------------------------------
 arch/mips/jz4740/irq.h          |  23 ------
 drivers/irqchip/Kconfig         |   5 ++
 drivers/irqchip/Makefile        |   1 +
 drivers/irqchip/irq-ingenic.c   | 177 +++++++++++++++++++++++++++++++++++++++
 include/linux/irqchip/ingenic.h |  23 ++++++
 8 files changed, 208 insertions(+), 204 deletions(-)
 delete mode 100644 arch/mips/jz4740/irq.c
 delete mode 100644 arch/mips/jz4740/irq.h
 create mode 100644 drivers/irqchip/irq-ingenic.c
 create mode 100644 include/linux/irqchip/ingenic.h

(limited to 'include/linux')

diff --git a/arch/mips/jz4740/Makefile b/arch/mips/jz4740/Makefile
index 28e5535dfa9e..6cf5dd42fa34 100644
--- a/arch/mips/jz4740/Makefile
+++ b/arch/mips/jz4740/Makefile
@@ -4,7 +4,7 @@
 
 # Object file lists.
 
-obj-y += prom.o irq.o time.o reset.o setup.o \
+obj-y += prom.o time.o reset.o setup.o \
 	gpio.o clock.o platform.o timer.o serial.o
 
 obj-$(CONFIG_DEBUG_FS) += clock-debugfs.o
diff --git a/arch/mips/jz4740/gpio.c b/arch/mips/jz4740/gpio.c
index 994a7dfe6f22..54c80d42a88d 100644
--- a/arch/mips/jz4740/gpio.c
+++ b/arch/mips/jz4740/gpio.c
@@ -21,6 +21,7 @@
 #include <linux/gpio.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/irqchip/ingenic.h>
 #include <linux/bitops.h>
 
 #include <linux/debugfs.h>
@@ -28,8 +29,6 @@
 
 #include <asm/mach-jz4740/base.h>
 
-#include "irq.h"
-
 #define JZ4740_GPIO_BASE_A (32*0)
 #define JZ4740_GPIO_BASE_B (32*1)
 #define JZ4740_GPIO_BASE_C (32*2)
diff --git a/arch/mips/jz4740/irq.c b/arch/mips/jz4740/irq.c
deleted file mode 100644
index 64b4c3639280..000000000000
--- a/arch/mips/jz4740/irq.c
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- *  Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de>
- *  JZ4740 platform IRQ support
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under  the terms of the GNU General	 Public License as published by the
- *  Free Software Foundation;  either version 2 of the License, or (at your
- *  option) any later version.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/interrupt.h>
-#include <linux/ioport.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-
-#include <asm/io.h>
-#include <asm/mach-jz4740/irq.h>
-
-#include "irq.h"
-
-#include "../../drivers/irqchip/irqchip.h"
-
-struct ingenic_intc_data {
-	void __iomem *base;
-	unsigned num_chips;
-};
-
-#define JZ_REG_INTC_STATUS	0x00
-#define JZ_REG_INTC_MASK	0x04
-#define JZ_REG_INTC_SET_MASK	0x08
-#define JZ_REG_INTC_CLEAR_MASK	0x0c
-#define JZ_REG_INTC_PENDING	0x10
-#define CHIP_SIZE		0x20
-
-static irqreturn_t intc_cascade(int irq, void *data)
-{
-	struct ingenic_intc_data *intc = irq_get_handler_data(irq);
-	uint32_t irq_reg;
-	unsigned i;
-
-	for (i = 0; i < intc->num_chips; i++) {
-		irq_reg = readl(intc->base + (i * CHIP_SIZE) +
-				JZ_REG_INTC_PENDING);
-		if (!irq_reg)
-			continue;
-
-		generic_handle_irq(__fls(irq_reg) + (i * 32) + JZ4740_IRQ_BASE);
-	}
-
-	return IRQ_HANDLED;
-}
-
-static void intc_irq_set_mask(struct irq_chip_generic *gc, uint32_t mask)
-{
-	struct irq_chip_regs *regs = &gc->chip_types->regs;
-
-	writel(mask, gc->reg_base + regs->enable);
-	writel(~mask, gc->reg_base + regs->disable);
-}
-
-void ingenic_intc_irq_suspend(struct irq_data *data)
-{
-	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
-	intc_irq_set_mask(gc, gc->wake_active);
-}
-
-void ingenic_intc_irq_resume(struct irq_data *data)
-{
-	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
-	intc_irq_set_mask(gc, gc->mask_cache);
-}
-
-static struct irqaction intc_cascade_action = {
-	.handler = intc_cascade,
-	.name = "SoC intc cascade interrupt",
-};
-
-static int __init ingenic_intc_of_init(struct device_node *node,
-				       unsigned num_chips)
-{
-	struct ingenic_intc_data *intc;
-	struct irq_chip_generic *gc;
-	struct irq_chip_type *ct;
-	struct irq_domain *domain;
-	int parent_irq, err = 0;
-	unsigned i;
-
-	intc = kzalloc(sizeof(*intc), GFP_KERNEL);
-	if (!intc) {
-		err = -ENOMEM;
-		goto out_err;
-	}
-
-	parent_irq = irq_of_parse_and_map(node, 0);
-	if (!parent_irq) {
-		err = -EINVAL;
-		goto out_free;
-	}
-
-	err = irq_set_handler_data(parent_irq, intc);
-	if (err)
-		goto out_unmap_irq;
-
-	intc->num_chips = num_chips;
-	intc->base = of_iomap(node, 0);
-	if (!intc->base) {
-		err = -ENODEV;
-		goto out_unmap_irq;
-	}
-
-	for (i = 0; i < num_chips; i++) {
-		/* Mask all irqs */
-		writel(0xffffffff, intc->base + (i * CHIP_SIZE) +
-		       JZ_REG_INTC_SET_MASK);
-
-		gc = irq_alloc_generic_chip("INTC", 1,
-					    JZ4740_IRQ_BASE + (i * 32),
-					    intc->base + (i * CHIP_SIZE),
-					    handle_level_irq);
-
-		gc->wake_enabled = IRQ_MSK(32);
-
-		ct = gc->chip_types;
-		ct->regs.enable = JZ_REG_INTC_CLEAR_MASK;
-		ct->regs.disable = JZ_REG_INTC_SET_MASK;
-		ct->chip.irq_unmask = irq_gc_unmask_enable_reg;
-		ct->chip.irq_mask = irq_gc_mask_disable_reg;
-		ct->chip.irq_mask_ack = irq_gc_mask_disable_reg;
-		ct->chip.irq_set_wake = irq_gc_set_wake;
-		ct->chip.irq_suspend = ingenic_intc_irq_suspend;
-		ct->chip.irq_resume = ingenic_intc_irq_resume;
-
-		irq_setup_generic_chip(gc, IRQ_MSK(32), 0, 0,
-				       IRQ_NOPROBE | IRQ_LEVEL);
-	}
-
-	domain = irq_domain_add_legacy(node, num_chips * 32, JZ4740_IRQ_BASE, 0,
-				       &irq_domain_simple_ops, NULL);
-	if (!domain)
-		pr_warn("unable to register IRQ domain\n");
-
-	setup_irq(parent_irq, &intc_cascade_action);
-	return 0;
-
-out_unmap_irq:
-	irq_dispose_mapping(parent_irq);
-out_free:
-	kfree(intc);
-out_err:
-	return err;
-}
-
-static int __init intc_1chip_of_init(struct device_node *node,
-				     struct device_node *parent)
-{
-	return ingenic_intc_of_init(node, 1);
-}
-IRQCHIP_DECLARE(jz4740_intc, "ingenic,jz4740-intc", intc_1chip_of_init);
-
-static int __init intc_2chip_of_init(struct device_node *node,
-	struct device_node *parent)
-{
-	return ingenic_intc_of_init(node, 2);
-}
-IRQCHIP_DECLARE(jz4770_intc, "ingenic,jz4770-intc", intc_2chip_of_init);
-IRQCHIP_DECLARE(jz4775_intc, "ingenic,jz4775-intc", intc_2chip_of_init);
-IRQCHIP_DECLARE(jz4780_intc, "ingenic,jz4780-intc", intc_2chip_of_init);
diff --git a/arch/mips/jz4740/irq.h b/arch/mips/jz4740/irq.h
deleted file mode 100644
index 601d5274ac69..000000000000
--- a/arch/mips/jz4740/irq.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- *  Copyright (C) 2010, Lars-Peter Clausen <lars@metafoo.de>
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under  the terms of the GNU General	 Public License as published by the
- *  Free Software Foundation;  either version 2 of the License, or (at your
- *  option) any later version.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#ifndef __MIPS_JZ4740_IRQ_H__
-#define __MIPS_JZ4740_IRQ_H__
-
-#include <linux/irq.h>
-
-extern void ingenic_intc_irq_suspend(struct irq_data *data);
-extern void ingenic_intc_irq_resume(struct irq_data *data);
-
-#endif
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 2b7531e0e84c..746daf37454c 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -163,3 +163,8 @@ config KEYSTONE_IRQ
 config MIPS_GIC
 	bool
 	select MIPS_CM
+
+config INGENIC_IRQ
+	bool
+	depends on MACH_INGENIC
+	default y
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 129cde1ff5a7..db014e8e12c9 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -48,3 +48,4 @@ obj-$(CONFIG_KEYSTONE_IRQ)		+= irq-keystone.o
 obj-$(CONFIG_MIPS_GIC)			+= irq-mips-gic.o
 obj-$(CONFIG_ARCH_MEDIATEK)		+= irq-mtk-sysirq.o
 obj-$(CONFIG_ARCH_DIGICOLOR)		+= irq-digicolor.o
+obj-$(CONFIG_INGENIC_IRQ)		+= irq-ingenic.o
diff --git a/drivers/irqchip/irq-ingenic.c b/drivers/irqchip/irq-ingenic.c
new file mode 100644
index 000000000000..005de3f932ae
--- /dev/null
+++ b/drivers/irqchip/irq-ingenic.c
@@ -0,0 +1,177 @@
+/*
+ *  Copyright (C) 2009-2010, Lars-Peter Clausen <lars@metafoo.de>
+ *  JZ4740 platform IRQ support
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under  the terms of the GNU General	 Public License as published by the
+ *  Free Software Foundation;  either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/irqchip/ingenic.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+
+#include <asm/io.h>
+#include <asm/mach-jz4740/irq.h>
+
+#include "irqchip.h"
+
+struct ingenic_intc_data {
+	void __iomem *base;
+	unsigned num_chips;
+};
+
+#define JZ_REG_INTC_STATUS	0x00
+#define JZ_REG_INTC_MASK	0x04
+#define JZ_REG_INTC_SET_MASK	0x08
+#define JZ_REG_INTC_CLEAR_MASK	0x0c
+#define JZ_REG_INTC_PENDING	0x10
+#define CHIP_SIZE		0x20
+
+static irqreturn_t intc_cascade(int irq, void *data)
+{
+	struct ingenic_intc_data *intc = irq_get_handler_data(irq);
+	uint32_t irq_reg;
+	unsigned i;
+
+	for (i = 0; i < intc->num_chips; i++) {
+		irq_reg = readl(intc->base + (i * CHIP_SIZE) +
+				JZ_REG_INTC_PENDING);
+		if (!irq_reg)
+			continue;
+
+		generic_handle_irq(__fls(irq_reg) + (i * 32) + JZ4740_IRQ_BASE);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static void intc_irq_set_mask(struct irq_chip_generic *gc, uint32_t mask)
+{
+	struct irq_chip_regs *regs = &gc->chip_types->regs;
+
+	writel(mask, gc->reg_base + regs->enable);
+	writel(~mask, gc->reg_base + regs->disable);
+}
+
+void ingenic_intc_irq_suspend(struct irq_data *data)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+	intc_irq_set_mask(gc, gc->wake_active);
+}
+
+void ingenic_intc_irq_resume(struct irq_data *data)
+{
+	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(data);
+	intc_irq_set_mask(gc, gc->mask_cache);
+}
+
+static struct irqaction intc_cascade_action = {
+	.handler = intc_cascade,
+	.name = "SoC intc cascade interrupt",
+};
+
+static int __init ingenic_intc_of_init(struct device_node *node,
+				       unsigned num_chips)
+{
+	struct ingenic_intc_data *intc;
+	struct irq_chip_generic *gc;
+	struct irq_chip_type *ct;
+	struct irq_domain *domain;
+	int parent_irq, err = 0;
+	unsigned i;
+
+	intc = kzalloc(sizeof(*intc), GFP_KERNEL);
+	if (!intc) {
+		err = -ENOMEM;
+		goto out_err;
+	}
+
+	parent_irq = irq_of_parse_and_map(node, 0);
+	if (!parent_irq) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	err = irq_set_handler_data(parent_irq, intc);
+	if (err)
+		goto out_unmap_irq;
+
+	intc->num_chips = num_chips;
+	intc->base = of_iomap(node, 0);
+	if (!intc->base) {
+		err = -ENODEV;
+		goto out_unmap_irq;
+	}
+
+	for (i = 0; i < num_chips; i++) {
+		/* Mask all irqs */
+		writel(0xffffffff, intc->base + (i * CHIP_SIZE) +
+		       JZ_REG_INTC_SET_MASK);
+
+		gc = irq_alloc_generic_chip("INTC", 1,
+					    JZ4740_IRQ_BASE + (i * 32),
+					    intc->base + (i * CHIP_SIZE),
+					    handle_level_irq);
+
+		gc->wake_enabled = IRQ_MSK(32);
+
+		ct = gc->chip_types;
+		ct->regs.enable = JZ_REG_INTC_CLEAR_MASK;
+		ct->regs.disable = JZ_REG_INTC_SET_MASK;
+		ct->chip.irq_unmask = irq_gc_unmask_enable_reg;
+		ct->chip.irq_mask = irq_gc_mask_disable_reg;
+		ct->chip.irq_mask_ack = irq_gc_mask_disable_reg;
+		ct->chip.irq_set_wake = irq_gc_set_wake;
+		ct->chip.irq_suspend = ingenic_intc_irq_suspend;
+		ct->chip.irq_resume = ingenic_intc_irq_resume;
+
+		irq_setup_generic_chip(gc, IRQ_MSK(32), 0, 0,
+				       IRQ_NOPROBE | IRQ_LEVEL);
+	}
+
+	domain = irq_domain_add_legacy(node, num_chips * 32, JZ4740_IRQ_BASE, 0,
+				       &irq_domain_simple_ops, NULL);
+	if (!domain)
+		pr_warn("unable to register IRQ domain\n");
+
+	setup_irq(parent_irq, &intc_cascade_action);
+	return 0;
+
+out_unmap_irq:
+	irq_dispose_mapping(parent_irq);
+out_free:
+	kfree(intc);
+out_err:
+	return err;
+}
+
+static int __init intc_1chip_of_init(struct device_node *node,
+				     struct device_node *parent)
+{
+	return ingenic_intc_of_init(node, 1);
+}
+IRQCHIP_DECLARE(jz4740_intc, "ingenic,jz4740-intc", intc_1chip_of_init);
+
+static int __init intc_2chip_of_init(struct device_node *node,
+	struct device_node *parent)
+{
+	return ingenic_intc_of_init(node, 2);
+}
+IRQCHIP_DECLARE(jz4770_intc, "ingenic,jz4770-intc", intc_2chip_of_init);
+IRQCHIP_DECLARE(jz4775_intc, "ingenic,jz4775-intc", intc_2chip_of_init);
+IRQCHIP_DECLARE(jz4780_intc, "ingenic,jz4780-intc", intc_2chip_of_init);
diff --git a/include/linux/irqchip/ingenic.h b/include/linux/irqchip/ingenic.h
new file mode 100644
index 000000000000..0ee319a4029d
--- /dev/null
+++ b/include/linux/irqchip/ingenic.h
@@ -0,0 +1,23 @@
+/*
+ *  Copyright (C) 2010, Lars-Peter Clausen <lars@metafoo.de>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under  the terms of the GNU General	 Public License as published by the
+ *  Free Software Foundation;  either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LINUX_IRQCHIP_INGENIC_H__
+#define __LINUX_IRQCHIP_INGENIC_H__
+
+#include <linux/irq.h>
+
+extern void ingenic_intc_irq_suspend(struct irq_data *data);
+extern void ingenic_intc_irq_resume(struct irq_data *data);
+
+#endif
-- 
cgit v1.2.3


From 55cab93bcf1422ab4298edc65c349c4304b4884e Mon Sep 17 00:00:00 2001
From: Hante Meuleman <meuleman@broadcom.com>
Date: Thu, 21 May 2015 15:27:23 +0200
Subject: mips: bcm47xx: allow retrieval of complete nvram contents

Host platforms such as routers supported by OpenWrt can
support NVRAM reading directly from internal NVRAM store.
The brcmfmac for one requires the complete nvram contents
to select what needs to be sent to wireless device.

Signed-off-by: Arend van Spriel <arend@broadcom.com>
Signed-off-by: Hante Meuleman <meuleman@broadcom.com>
Reviewed-by: Arend Van Spriel <arend@broadcom.com>
Reviewed-by: Franky (Zhenhui) Lin <frankyl@broadcom.com>
Reviewed-by: Pieter-Paul Giesberts <pieterpg@broadcom.com>
Reviewed-by: Daniel (Deognyoun) Kim <dekim@broadcom.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/10093/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/bcm47xx/nvram.c     | 60 ++++++++++++++++++++++++++++++++-----------
 include/linux/bcm47xx_nvram.h | 15 +++++++++++
 2 files changed, 60 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/bcm47xx/nvram.c b/arch/mips/bcm47xx/nvram.c
index 95d028ca6f8a..2ed762ed4006 100644
--- a/arch/mips/bcm47xx/nvram.c
+++ b/arch/mips/bcm47xx/nvram.c
@@ -94,17 +94,22 @@ static int nvram_find_and_copy(void __iomem *iobase, u32 lim)
 	return -ENXIO;
 
 found:
-	if (header->len > size)
-		pr_err("The nvram size accoridng to the header seems to be bigger than the partition on flash\n");
-	if (header->len > NVRAM_SPACE)
-		pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n",
-		       header->len, NVRAM_SPACE - 1);
-
 	src = (u32 *)header;
 	dst = (u32 *)nvram_buf;
 	for (i = 0; i < sizeof(struct nvram_header); i += 4)
 		*dst++ = __raw_readl(src++);
-	for (; i < header->len && i < NVRAM_SPACE && i < size; i += 4)
+	header = (struct nvram_header *)nvram_buf;
+	if (header->len > size) {
+		pr_err("The nvram size according to the header seems to be bigger than the partition on flash\n");
+		header->len = size;
+	}
+	if (header->len >= NVRAM_SPACE) {
+		pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n",
+		       header->len, NVRAM_SPACE - 1);
+		header->len = NVRAM_SPACE - 1;
+	}
+	/* proceed reading data after header */
+	for (; i < header->len; i += 4)
 		*dst++ = readl(src++);
 	nvram_buf[NVRAM_SPACE - 1] = '\0';
 
@@ -139,6 +144,7 @@ static int nvram_init(void)
 #ifdef CONFIG_MTD
 	struct mtd_info *mtd;
 	struct nvram_header header;
+	struct nvram_header *pheader;
 	size_t bytes_read;
 	int err;
 
@@ -147,20 +153,21 @@ static int nvram_init(void)
 		return -ENODEV;
 
 	err = mtd_read(mtd, 0, sizeof(header), &bytes_read, (uint8_t *)&header);
-	if (!err && header.magic == NVRAM_MAGIC) {
-		u8 *dst = (uint8_t *)nvram_buf;
-		size_t len = header.len;
-
-		if (len >= NVRAM_SPACE) {
-			len = NVRAM_SPACE - 1;
+	if (!err && header.magic == NVRAM_MAGIC &&
+	    header.len > sizeof(header)) {
+		if (header.len >= NVRAM_SPACE) {
 			pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n",
-				header.len, len);
+				header.len, NVRAM_SPACE);
+			header.len = NVRAM_SPACE - 1;
 		}
 
-		err = mtd_read(mtd, 0, len, &bytes_read, dst);
+		err = mtd_read(mtd, 0, header.len, &bytes_read,
+			       (u8 *)nvram_buf);
 		if (err)
 			return err;
 
+		pheader = (struct nvram_header *)nvram_buf;
+		pheader->len = header.len;
 		return 0;
 	}
 #endif
@@ -219,3 +226,26 @@ int bcm47xx_nvram_gpio_pin(const char *name)
 	return -ENOENT;
 }
 EXPORT_SYMBOL(bcm47xx_nvram_gpio_pin);
+
+char *bcm47xx_nvram_get_contents(size_t *nvram_size)
+{
+	int err;
+	char *nvram;
+	struct nvram_header *header;
+
+	if (!nvram_buf[0]) {
+		err = nvram_init();
+		if (err)
+			return NULL;
+	}
+
+	header = (struct nvram_header *)nvram_buf;
+	*nvram_size = header->len - sizeof(struct nvram_header);
+	nvram = vmalloc(*nvram_size);
+	if (!nvram)
+		return NULL;
+	memcpy(nvram, &nvram_buf[sizeof(struct nvram_header)], *nvram_size);
+
+	return nvram;
+}
+EXPORT_SYMBOL(bcm47xx_nvram_get_contents);
diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h
index b12b07e75929..c73927c66c2c 100644
--- a/include/linux/bcm47xx_nvram.h
+++ b/include/linux/bcm47xx_nvram.h
@@ -10,11 +10,17 @@
 
 #include <linux/types.h>
 #include <linux/kernel.h>
+#include <linux/vmalloc.h>
 
 #ifdef CONFIG_BCM47XX
 int bcm47xx_nvram_init_from_mem(u32 base, u32 lim);
 int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len);
 int bcm47xx_nvram_gpio_pin(const char *name);
+char *bcm47xx_nvram_get_contents(size_t *val_len);
+static inline void bcm47xx_nvram_release_contents(char *nvram)
+{
+	vfree(nvram);
+};
 #else
 static inline int bcm47xx_nvram_init_from_mem(u32 base, u32 lim)
 {
@@ -29,6 +35,15 @@ static inline int bcm47xx_nvram_gpio_pin(const char *name)
 {
 	return -ENOTSUPP;
 };
+
+static inline char *bcm47xx_nvram_get_contents(size_t *val_len)
+{
+	return NULL;
+};
+
+static inline void bcm47xx_nvram_release_contents(char *nvram)
+{
+};
 #endif
 
 #endif /* __BCM47XX_NVRAM_H */
-- 
cgit v1.2.3


From 2ddf3a792218cddd30140b1f8b32cb6e2d67921f Mon Sep 17 00:00:00 2001
From: Alban Bedel <albeu@free.fr>
Date: Sun, 31 May 2015 02:18:24 +0200
Subject: MIPS: ath79: Add OF support to the GPIO driver

Replace the simple GPIO chip registration by a platform driver
and make ath79_gpio_init() just register the device.

Signed-off-by: Alban Bedel <albeu@free.fr>
Cc: linux-mips@linux-mips.org
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 arch/mips/ath79/dev-common.c             | 51 +++++++++++++++++++++
 arch/mips/ath79/gpio.c                   | 79 +++++++++++++++++++++++---------
 include/linux/platform_data/gpio-ath79.h | 19 ++++++++
 3 files changed, 127 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/platform_data/gpio-ath79.h

(limited to 'include/linux')

diff --git a/arch/mips/ath79/dev-common.c b/arch/mips/ath79/dev-common.c
index 516225d207ee..9d0172a4dc69 100644
--- a/arch/mips/ath79/dev-common.c
+++ b/arch/mips/ath79/dev-common.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
+#include <linux/platform_data/gpio-ath79.h>
 #include <linux/serial_8250.h>
 #include <linux/clk.h>
 #include <linux/err.h>
@@ -106,3 +107,53 @@ void __init ath79_register_wdt(void)
 
 	platform_device_register_simple("ath79-wdt", -1, &res, 1);
 }
+
+static struct ath79_gpio_platform_data ath79_gpio_pdata;
+
+static struct resource ath79_gpio_resources[] = {
+	{
+		.flags = IORESOURCE_MEM,
+		.start = AR71XX_GPIO_BASE,
+		.end = AR71XX_GPIO_BASE + AR71XX_GPIO_SIZE - 1,
+	},
+	{
+		.start	= ATH79_MISC_IRQ(2),
+		.end	= ATH79_MISC_IRQ(2),
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device ath79_gpio_device = {
+	.name		= "ath79-gpio",
+	.id		= -1,
+	.resource	= ath79_gpio_resources,
+	.num_resources	= ARRAY_SIZE(ath79_gpio_resources),
+	.dev = {
+		.platform_data	= &ath79_gpio_pdata
+	},
+};
+
+void __init ath79_gpio_init(void)
+{
+	if (soc_is_ar71xx()) {
+		ath79_gpio_pdata.ngpios = AR71XX_GPIO_COUNT;
+	} else if (soc_is_ar7240()) {
+		ath79_gpio_pdata.ngpios = AR7240_GPIO_COUNT;
+	} else if (soc_is_ar7241() || soc_is_ar7242()) {
+		ath79_gpio_pdata.ngpios = AR7241_GPIO_COUNT;
+	} else if (soc_is_ar913x()) {
+		ath79_gpio_pdata.ngpios = AR913X_GPIO_COUNT;
+	} else if (soc_is_ar933x()) {
+		ath79_gpio_pdata.ngpios = AR933X_GPIO_COUNT;
+	} else if (soc_is_ar934x()) {
+		ath79_gpio_pdata.ngpios = AR934X_GPIO_COUNT;
+		ath79_gpio_pdata.oe_inverted = 1;
+	} else if (soc_is_qca955x()) {
+		ath79_gpio_pdata.ngpios = QCA955X_GPIO_COUNT;
+		ath79_gpio_pdata.oe_inverted = 1;
+	} else {
+		BUG();
+	}
+
+	platform_device_register(&ath79_gpio_device);
+}
diff --git a/arch/mips/ath79/gpio.c b/arch/mips/ath79/gpio.c
index 8d025b028bb1..f59ccb26520a 100644
--- a/arch/mips/ath79/gpio.c
+++ b/arch/mips/ath79/gpio.c
@@ -20,13 +20,15 @@
 #include <linux/io.h>
 #include <linux/ioport.h>
 #include <linux/gpio.h>
+#include <linux/platform_data/gpio-ath79.h>
+#include <linux/of_device.h>
 
 #include <asm/mach-ath79/ar71xx_regs.h>
 #include <asm/mach-ath79/ath79.h>
 #include "common.h"
 
 static void __iomem *ath79_gpio_base;
-static unsigned long ath79_gpio_count;
+static u32 ath79_gpio_count;
 static DEFINE_SPINLOCK(ath79_gpio_lock);
 
 static void __ath79_gpio_set_value(unsigned gpio, int value)
@@ -178,39 +180,72 @@ void ath79_gpio_function_disable(u32 mask)
 	ath79_gpio_function_setup(0, mask);
 }
 
-void __init ath79_gpio_init(void)
+static const struct of_device_id ath79_gpio_of_match[] = {
+	{ .compatible = "qca,ar7100-gpio" },
+	{ .compatible = "qca,ar9340-gpio" },
+	{},
+};
+
+static int ath79_gpio_probe(struct platform_device *pdev)
 {
+	struct ath79_gpio_platform_data *pdata = pdev->dev.platform_data;
+	struct device_node *np = pdev->dev.of_node;
+	struct resource *res;
+	bool oe_inverted;
 	int err;
 
-	if (soc_is_ar71xx())
-		ath79_gpio_count = AR71XX_GPIO_COUNT;
-	else if (soc_is_ar7240())
-		ath79_gpio_count = AR7240_GPIO_COUNT;
-	else if (soc_is_ar7241() || soc_is_ar7242())
-		ath79_gpio_count = AR7241_GPIO_COUNT;
-	else if (soc_is_ar913x())
-		ath79_gpio_count = AR913X_GPIO_COUNT;
-	else if (soc_is_ar933x())
-		ath79_gpio_count = AR933X_GPIO_COUNT;
-	else if (soc_is_ar934x())
-		ath79_gpio_count = AR934X_GPIO_COUNT;
-	else if (soc_is_qca955x())
-		ath79_gpio_count = QCA955X_GPIO_COUNT;
-	else
-		BUG();
+	if (np) {
+		err = of_property_read_u32(np, "ngpios", &ath79_gpio_count);
+		if (err) {
+			dev_err(&pdev->dev, "ngpios property is not valid\n");
+			return err;
+		}
+		if (ath79_gpio_count >= 32) {
+			dev_err(&pdev->dev, "ngpios must be less than 32\n");
+			return -EINVAL;
+		}
+		oe_inverted = of_device_is_compatible(np, "qca,ar9340-gpio");
+	} else if (pdata) {
+		ath79_gpio_count = pdata->ngpios;
+		oe_inverted = pdata->oe_inverted;
+	} else {
+		dev_err(&pdev->dev, "No DT node or platform data found\n");
+		return -EINVAL;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	ath79_gpio_base = devm_ioremap_nocache(
+		&pdev->dev, res->start, resource_size(res));
+	if (!ath79_gpio_base)
+		return -ENOMEM;
 
-	ath79_gpio_base = ioremap_nocache(AR71XX_GPIO_BASE, AR71XX_GPIO_SIZE);
+	ath79_gpio_chip.dev = &pdev->dev;
 	ath79_gpio_chip.ngpio = ath79_gpio_count;
-	if (soc_is_ar934x() || soc_is_qca955x()) {
+	if (oe_inverted) {
 		ath79_gpio_chip.direction_input = ar934x_gpio_direction_input;
 		ath79_gpio_chip.direction_output = ar934x_gpio_direction_output;
 	}
 
 	err = gpiochip_add(&ath79_gpio_chip);
-	if (err)
-		panic("cannot add AR71xx GPIO chip, error=%d", err);
+	if (err) {
+		dev_err(&pdev->dev,
+			"cannot add AR71xx GPIO chip, error=%d", err);
+		return err;
+	}
+
+	return 0;
 }
 
+static struct platform_driver ath79_gpio_driver = {
+	.driver = {
+		.name = "ath79-gpio",
+		.of_match_table	= ath79_gpio_of_match,
+	},
+	.probe = ath79_gpio_probe,
+};
+
+module_platform_driver(ath79_gpio_driver);
+
 int gpio_get_value(unsigned gpio)
 {
 	if (gpio < ath79_gpio_count)
diff --git a/include/linux/platform_data/gpio-ath79.h b/include/linux/platform_data/gpio-ath79.h
new file mode 100644
index 000000000000..88b0db7bee74
--- /dev/null
+++ b/include/linux/platform_data/gpio-ath79.h
@@ -0,0 +1,19 @@
+/*
+ *  Atheros AR7XXX/AR9XXX GPIO controller platform data
+ *
+ * Copyright (C) 2015 Alban Bedel <albeu@free.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_PLATFORM_DATA_GPIO_ATH79_H
+#define __LINUX_PLATFORM_DATA_GPIO_ATH79_H
+
+struct ath79_gpio_platform_data {
+	unsigned ngpios;
+	bool oe_inverted;
+};
+
+#endif
-- 
cgit v1.2.3


From f6e734a8c162297953d7bfc0f3f6bf4f8c33d72f Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <zajec5@gmail.com>
Date: Wed, 10 Jun 2015 23:05:08 +0200
Subject: MIPS: BCM47xx: Move NVRAM driver to the drivers/firmware/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After Broadcom switched from MIPS to ARM for their home routers we need
to have NVRAM driver in some common place (not arch/mips/). As explained
in Kconfig, this driver is responsible for parsing SoC configuration
data that is passed to the kernel in flash from the bootloader firmware
called "CFE".

We were thinking about putting it in bus directory, however there are
two possible buses for MIPS: drivers/ssb/ and drivers/bcma/. So this
won't fit there and this is why I would like to move this driver to the
drivers/firmware/.

Signed-off-by: Rafał Miłecki <zajec5@gmail.com>
Reviewed-by: Paul Walmsley <paul@pwsan.com>
Cc: linux-mips@linux-mips.org
Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: Seiji Aguchi <seiji.aguchi@hds.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Mike Waychison <mikew@google.com>
Cc: Roy Franz <roy.franz@linaro.org>
Cc: Matt Fleming <matt.fleming@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Patchwork: https://patchwork.linux-mips.org/patch/10544/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 MAINTAINERS                               |   6 +
 arch/mips/Kconfig                         |   1 +
 arch/mips/bcm47xx/Makefile                |   2 +-
 arch/mips/bcm47xx/nvram.c                 | 246 -----------------------------
 drivers/firmware/Kconfig                  |   1 +
 drivers/firmware/Makefile                 |   1 +
 drivers/firmware/broadcom/Kconfig         |  11 ++
 drivers/firmware/broadcom/Makefile        |   1 +
 drivers/firmware/broadcom/bcm47xx_nvram.c | 248 ++++++++++++++++++++++++++++++
 include/linux/bcm47xx_nvram.h             |   2 +-
 10 files changed, 271 insertions(+), 248 deletions(-)
 delete mode 100644 arch/mips/bcm47xx/nvram.c
 create mode 100644 drivers/firmware/broadcom/Kconfig
 create mode 100644 drivers/firmware/broadcom/Makefile
 create mode 100644 drivers/firmware/broadcom/bcm47xx_nvram.c

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 43043f0b354e..25a17aa20741 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2265,6 +2265,12 @@ S:	Supported
 F:	drivers/gpio/gpio-bcm-kona.c
 F:	Documentation/devicetree/bindings/gpio/gpio-bcm-kona.txt
 
+BROADCOM NVRAM DRIVER
+M:	Rafał Miłecki <zajec5@gmail.com>
+L:	linux-mips@linux-mips.org
+S:	Maintained
+F:	drivers/firmware/broadcom/*
+
 BROADCOM SPECIFIC AMBA DRIVER (BCMA)
 M:	Rafał Miłecki <zajec5@gmail.com>
 L:	linux-wireless@vger.kernel.org
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d53b43237f7f..dbb1a1d0f9b3 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -190,6 +190,7 @@ config BCM47XX
 	select USE_GENERIC_EARLY_PRINTK_8250
 	select GPIOLIB
 	select LEDS_GPIO_REGISTER
+	select BCM47XX_NVRAM
 	help
 	 Support for BCM47XX based boards
 
diff --git a/arch/mips/bcm47xx/Makefile b/arch/mips/bcm47xx/Makefile
index d58c51b5e501..66bea4ecf449 100644
--- a/arch/mips/bcm47xx/Makefile
+++ b/arch/mips/bcm47xx/Makefile
@@ -3,5 +3,5 @@
 # under Linux.
 #
 
-obj-y				+= irq.o nvram.o prom.o serial.o setup.o time.o sprom.o
+obj-y				+= irq.o prom.o serial.o setup.o time.o sprom.o
 obj-y				+= board.o buttons.o leds.o workarounds.o
diff --git a/arch/mips/bcm47xx/nvram.c b/arch/mips/bcm47xx/nvram.c
deleted file mode 100644
index 9ccdce816f84..000000000000
--- a/arch/mips/bcm47xx/nvram.c
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * BCM947xx nvram variable access
- *
- * Copyright (C) 2005 Broadcom Corporation
- * Copyright (C) 2006 Felix Fietkau <nbd@openwrt.org>
- * Copyright (C) 2010-2012 Hauke Mehrtens <hauke@hauke-m.de>
- *
- * This program is free software; you can redistribute	it and/or modify it
- * under  the terms of	the GNU General	 Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- */
-
-#include <linux/io.h>
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/mtd/mtd.h>
-#include <linux/bcm47xx_nvram.h>
-
-#define NVRAM_MAGIC			0x48534C46	/* 'FLSH' */
-#define NVRAM_SPACE			0x10000
-#define NVRAM_MAX_GPIO_ENTRIES		32
-#define NVRAM_MAX_GPIO_VALUE_LEN	30
-
-#define FLASH_MIN		0x00020000	/* Minimum flash size */
-
-struct nvram_header {
-	u32 magic;
-	u32 len;
-	u32 crc_ver_init;	/* 0:7 crc, 8:15 ver, 16:31 sdram_init */
-	u32 config_refresh;	/* 0:15 sdram_config, 16:31 sdram_refresh */
-	u32 config_ncdl;	/* ncdl values for memc */
-};
-
-static char nvram_buf[NVRAM_SPACE];
-static size_t nvram_len;
-static const u32 nvram_sizes[] = {0x8000, 0xF000, 0x10000};
-
-static u32 find_nvram_size(void __iomem *end)
-{
-	struct nvram_header __iomem *header;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(nvram_sizes); i++) {
-		header = (struct nvram_header *)(end - nvram_sizes[i]);
-		if (header->magic == NVRAM_MAGIC)
-			return nvram_sizes[i];
-	}
-
-	return 0;
-}
-
-/* Probe for NVRAM header */
-static int nvram_find_and_copy(void __iomem *iobase, u32 lim)
-{
-	struct nvram_header __iomem *header;
-	int i;
-	u32 off;
-	u32 *src, *dst;
-	u32 size;
-
-	if (nvram_len) {
-		pr_warn("nvram already initialized\n");
-		return -EEXIST;
-	}
-
-	/* TODO: when nvram is on nand flash check for bad blocks first. */
-	off = FLASH_MIN;
-	while (off <= lim) {
-		/* Windowed flash access */
-		size = find_nvram_size(iobase + off);
-		if (size) {
-			header = (struct nvram_header *)(iobase + off - size);
-			goto found;
-		}
-		off <<= 1;
-	}
-
-	/* Try embedded NVRAM at 4 KB and 1 KB as last resorts */
-	header = (struct nvram_header *)(iobase + 4096);
-	if (header->magic == NVRAM_MAGIC) {
-		size = NVRAM_SPACE;
-		goto found;
-	}
-
-	header = (struct nvram_header *)(iobase + 1024);
-	if (header->magic == NVRAM_MAGIC) {
-		size = NVRAM_SPACE;
-		goto found;
-	}
-
-	pr_err("no nvram found\n");
-	return -ENXIO;
-
-found:
-	src = (u32 *)header;
-	dst = (u32 *)nvram_buf;
-	for (i = 0; i < sizeof(struct nvram_header); i += 4)
-		*dst++ = __raw_readl(src++);
-	header = (struct nvram_header *)nvram_buf;
-	nvram_len = header->len;
-	if (nvram_len > size) {
-		pr_err("The nvram size according to the header seems to be bigger than the partition on flash\n");
-		nvram_len = size;
-	}
-	if (nvram_len >= NVRAM_SPACE) {
-		pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n",
-		       header->len, NVRAM_SPACE - 1);
-		nvram_len = NVRAM_SPACE - 1;
-	}
-	/* proceed reading data after header */
-	for (; i < nvram_len; i += 4)
-		*dst++ = readl(src++);
-	nvram_buf[NVRAM_SPACE - 1] = '\0';
-
-	return 0;
-}
-
-/*
- * On bcm47xx we need access to the NVRAM very early, so we can't use mtd
- * subsystem to access flash. We can't even use platform device / driver to
- * store memory offset.
- * To handle this we provide following symbol. It's supposed to be called as
- * soon as we get info about flash device, before any NVRAM entry is needed.
- */
-int bcm47xx_nvram_init_from_mem(u32 base, u32 lim)
-{
-	void __iomem *iobase;
-	int err;
-
-	iobase = ioremap_nocache(base, lim);
-	if (!iobase)
-		return -ENOMEM;
-
-	err = nvram_find_and_copy(iobase, lim);
-
-	iounmap(iobase);
-
-	return err;
-}
-
-static int nvram_init(void)
-{
-#ifdef CONFIG_MTD
-	struct mtd_info *mtd;
-	struct nvram_header header;
-	size_t bytes_read;
-	int err;
-
-	mtd = get_mtd_device_nm("nvram");
-	if (IS_ERR(mtd))
-		return -ENODEV;
-
-	err = mtd_read(mtd, 0, sizeof(header), &bytes_read, (uint8_t *)&header);
-	if (!err && header.magic == NVRAM_MAGIC &&
-	    header.len > sizeof(header)) {
-		nvram_len = header.len;
-		if (nvram_len >= NVRAM_SPACE) {
-			pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n",
-				header.len, NVRAM_SPACE);
-			nvram_len = NVRAM_SPACE - 1;
-		}
-
-		err = mtd_read(mtd, 0, nvram_len, &nvram_len,
-			       (u8 *)nvram_buf);
-		return err;
-	}
-#endif
-
-	return -ENXIO;
-}
-
-int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len)
-{
-	char *var, *value, *end, *eq;
-	int err;
-
-	if (!name)
-		return -EINVAL;
-
-	if (!nvram_len) {
-		err = nvram_init();
-		if (err)
-			return err;
-	}
-
-	/* Look for name=value and return value */
-	var = &nvram_buf[sizeof(struct nvram_header)];
-	end = nvram_buf + sizeof(nvram_buf);
-	while (var < end && *var) {
-		eq = strchr(var, '=');
-		if (!eq)
-			break;
-		value = eq + 1;
-		if (eq - var == strlen(name) &&
-		    strncmp(var, name, eq - var) == 0)
-			return snprintf(val, val_len, "%s", value);
-		var = value + strlen(value) + 1;
-	}
-	return -ENOENT;
-}
-EXPORT_SYMBOL(bcm47xx_nvram_getenv);
-
-int bcm47xx_nvram_gpio_pin(const char *name)
-{
-	int i, err;
-	char nvram_var[] = "gpioXX";
-	char buf[NVRAM_MAX_GPIO_VALUE_LEN];
-
-	/* TODO: Optimize it to don't call getenv so many times */
-	for (i = 0; i < NVRAM_MAX_GPIO_ENTRIES; i++) {
-		err = snprintf(nvram_var, sizeof(nvram_var), "gpio%i", i);
-		if (err <= 0)
-			continue;
-		err = bcm47xx_nvram_getenv(nvram_var, buf, sizeof(buf));
-		if (err <= 0)
-			continue;
-		if (!strcmp(name, buf))
-			return i;
-	}
-	return -ENOENT;
-}
-EXPORT_SYMBOL(bcm47xx_nvram_gpio_pin);
-
-char *bcm47xx_nvram_get_contents(size_t *nvram_size)
-{
-	int err;
-	char *nvram;
-
-	if (!nvram_len) {
-		err = nvram_init();
-		if (err)
-			return NULL;
-	}
-
-	*nvram_size = nvram_len - sizeof(struct nvram_header);
-	nvram = vmalloc(*nvram_size);
-	if (!nvram)
-		return NULL;
-	memcpy(nvram, &nvram_buf[sizeof(struct nvram_header)], *nvram_size);
-
-	return nvram;
-}
-EXPORT_SYMBOL(bcm47xx_nvram_get_contents);
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 6517132e5d8b..99c69a3205c4 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -136,6 +136,7 @@ config QCOM_SCM
 	bool
 	depends on ARM || ARM64
 
+source "drivers/firmware/broadcom/Kconfig"
 source "drivers/firmware/google/Kconfig"
 source "drivers/firmware/efi/Kconfig"
 
diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile
index 3fdd3912709a..210c6e0550d3 100644
--- a/drivers/firmware/Makefile
+++ b/drivers/firmware/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_FIRMWARE_MEMMAP)	+= memmap.o
 obj-$(CONFIG_QCOM_SCM)		+= qcom_scm.o
 CFLAGS_qcom_scm.o :=$(call as-instr,.arch_extension sec,-DREQUIRES_SEC=1)
 
+obj-y				+= broadcom/
 obj-$(CONFIG_GOOGLE_FIRMWARE)	+= google/
 obj-$(CONFIG_EFI)		+= efi/
 obj-$(CONFIG_UEFI_CPER)		+= efi/
diff --git a/drivers/firmware/broadcom/Kconfig b/drivers/firmware/broadcom/Kconfig
new file mode 100644
index 000000000000..6bed119930dd
--- /dev/null
+++ b/drivers/firmware/broadcom/Kconfig
@@ -0,0 +1,11 @@
+config BCM47XX_NVRAM
+	bool "Broadcom NVRAM driver"
+	depends on BCM47XX || ARCH_BCM_5301X
+	help
+	  Broadcom home routers contain flash partition called "nvram" with all
+	  important hardware configuration as well as some minor user setup.
+	  NVRAM partition contains a text-like data representing name=value
+	  pairs.
+	  This driver provides an easy way to get value of requested parameter.
+	  It simply reads content of NVRAM and parses it. It doesn't control any
+	  hardware part itself.
diff --git a/drivers/firmware/broadcom/Makefile b/drivers/firmware/broadcom/Makefile
new file mode 100644
index 000000000000..d0e683583cd6
--- /dev/null
+++ b/drivers/firmware/broadcom/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_BCM47XX_NVRAM)		+= bcm47xx_nvram.o
diff --git a/drivers/firmware/broadcom/bcm47xx_nvram.c b/drivers/firmware/broadcom/bcm47xx_nvram.c
new file mode 100644
index 000000000000..87add3fdce52
--- /dev/null
+++ b/drivers/firmware/broadcom/bcm47xx_nvram.c
@@ -0,0 +1,248 @@
+/*
+ * BCM947xx nvram variable access
+ *
+ * Copyright (C) 2005 Broadcom Corporation
+ * Copyright (C) 2006 Felix Fietkau <nbd@openwrt.org>
+ * Copyright (C) 2010-2012 Hauke Mehrtens <hauke@hauke-m.de>
+ *
+ * This program is free software; you can redistribute	it and/or modify it
+ * under  the terms of	the GNU General	 Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/io.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mtd/mtd.h>
+#include <linux/bcm47xx_nvram.h>
+
+#define NVRAM_MAGIC			0x48534C46	/* 'FLSH' */
+#define NVRAM_SPACE			0x10000
+#define NVRAM_MAX_GPIO_ENTRIES		32
+#define NVRAM_MAX_GPIO_VALUE_LEN	30
+
+#define FLASH_MIN		0x00020000	/* Minimum flash size */
+
+struct nvram_header {
+	u32 magic;
+	u32 len;
+	u32 crc_ver_init;	/* 0:7 crc, 8:15 ver, 16:31 sdram_init */
+	u32 config_refresh;	/* 0:15 sdram_config, 16:31 sdram_refresh */
+	u32 config_ncdl;	/* ncdl values for memc */
+};
+
+static char nvram_buf[NVRAM_SPACE];
+static size_t nvram_len;
+static const u32 nvram_sizes[] = {0x8000, 0xF000, 0x10000};
+
+static u32 find_nvram_size(void __iomem *end)
+{
+	struct nvram_header __iomem *header;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(nvram_sizes); i++) {
+		header = (struct nvram_header *)(end - nvram_sizes[i]);
+		if (header->magic == NVRAM_MAGIC)
+			return nvram_sizes[i];
+	}
+
+	return 0;
+}
+
+/* Probe for NVRAM header */
+static int nvram_find_and_copy(void __iomem *iobase, u32 lim)
+{
+	struct nvram_header __iomem *header;
+	int i;
+	u32 off;
+	u32 *src, *dst;
+	u32 size;
+
+	if (nvram_len) {
+		pr_warn("nvram already initialized\n");
+		return -EEXIST;
+	}
+
+	/* TODO: when nvram is on nand flash check for bad blocks first. */
+	off = FLASH_MIN;
+	while (off <= lim) {
+		/* Windowed flash access */
+		size = find_nvram_size(iobase + off);
+		if (size) {
+			header = (struct nvram_header *)(iobase + off - size);
+			goto found;
+		}
+		off <<= 1;
+	}
+
+	/* Try embedded NVRAM at 4 KB and 1 KB as last resorts */
+	header = (struct nvram_header *)(iobase + 4096);
+	if (header->magic == NVRAM_MAGIC) {
+		size = NVRAM_SPACE;
+		goto found;
+	}
+
+	header = (struct nvram_header *)(iobase + 1024);
+	if (header->magic == NVRAM_MAGIC) {
+		size = NVRAM_SPACE;
+		goto found;
+	}
+
+	pr_err("no nvram found\n");
+	return -ENXIO;
+
+found:
+	src = (u32 *)header;
+	dst = (u32 *)nvram_buf;
+	for (i = 0; i < sizeof(struct nvram_header); i += 4)
+		*dst++ = __raw_readl(src++);
+	header = (struct nvram_header *)nvram_buf;
+	nvram_len = header->len;
+	if (nvram_len > size) {
+		pr_err("The nvram size according to the header seems to be bigger than the partition on flash\n");
+		nvram_len = size;
+	}
+	if (nvram_len >= NVRAM_SPACE) {
+		pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n",
+		       header->len, NVRAM_SPACE - 1);
+		nvram_len = NVRAM_SPACE - 1;
+	}
+	/* proceed reading data after header */
+	for (; i < nvram_len; i += 4)
+		*dst++ = readl(src++);
+	nvram_buf[NVRAM_SPACE - 1] = '\0';
+
+	return 0;
+}
+
+/*
+ * On bcm47xx we need access to the NVRAM very early, so we can't use mtd
+ * subsystem to access flash. We can't even use platform device / driver to
+ * store memory offset.
+ * To handle this we provide following symbol. It's supposed to be called as
+ * soon as we get info about flash device, before any NVRAM entry is needed.
+ */
+int bcm47xx_nvram_init_from_mem(u32 base, u32 lim)
+{
+	void __iomem *iobase;
+	int err;
+
+	iobase = ioremap_nocache(base, lim);
+	if (!iobase)
+		return -ENOMEM;
+
+	err = nvram_find_and_copy(iobase, lim);
+
+	iounmap(iobase);
+
+	return err;
+}
+
+static int nvram_init(void)
+{
+#ifdef CONFIG_MTD
+	struct mtd_info *mtd;
+	struct nvram_header header;
+	size_t bytes_read;
+	int err;
+
+	mtd = get_mtd_device_nm("nvram");
+	if (IS_ERR(mtd))
+		return -ENODEV;
+
+	err = mtd_read(mtd, 0, sizeof(header), &bytes_read, (uint8_t *)&header);
+	if (!err && header.magic == NVRAM_MAGIC &&
+	    header.len > sizeof(header)) {
+		nvram_len = header.len;
+		if (nvram_len >= NVRAM_SPACE) {
+			pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n",
+				header.len, NVRAM_SPACE);
+			nvram_len = NVRAM_SPACE - 1;
+		}
+
+		err = mtd_read(mtd, 0, nvram_len, &nvram_len,
+			       (u8 *)nvram_buf);
+		return err;
+	}
+#endif
+
+	return -ENXIO;
+}
+
+int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len)
+{
+	char *var, *value, *end, *eq;
+	int err;
+
+	if (!name)
+		return -EINVAL;
+
+	if (!nvram_len) {
+		err = nvram_init();
+		if (err)
+			return err;
+	}
+
+	/* Look for name=value and return value */
+	var = &nvram_buf[sizeof(struct nvram_header)];
+	end = nvram_buf + sizeof(nvram_buf);
+	while (var < end && *var) {
+		eq = strchr(var, '=');
+		if (!eq)
+			break;
+		value = eq + 1;
+		if (eq - var == strlen(name) &&
+		    strncmp(var, name, eq - var) == 0)
+			return snprintf(val, val_len, "%s", value);
+		var = value + strlen(value) + 1;
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(bcm47xx_nvram_getenv);
+
+int bcm47xx_nvram_gpio_pin(const char *name)
+{
+	int i, err;
+	char nvram_var[] = "gpioXX";
+	char buf[NVRAM_MAX_GPIO_VALUE_LEN];
+
+	/* TODO: Optimize it to don't call getenv so many times */
+	for (i = 0; i < NVRAM_MAX_GPIO_ENTRIES; i++) {
+		err = snprintf(nvram_var, sizeof(nvram_var), "gpio%i", i);
+		if (err <= 0)
+			continue;
+		err = bcm47xx_nvram_getenv(nvram_var, buf, sizeof(buf));
+		if (err <= 0)
+			continue;
+		if (!strcmp(name, buf))
+			return i;
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(bcm47xx_nvram_gpio_pin);
+
+char *bcm47xx_nvram_get_contents(size_t *nvram_size)
+{
+	int err;
+	char *nvram;
+
+	if (!nvram_len) {
+		err = nvram_init();
+		if (err)
+			return NULL;
+	}
+
+	*nvram_size = nvram_len - sizeof(struct nvram_header);
+	nvram = vmalloc(*nvram_size);
+	if (!nvram)
+		return NULL;
+	memcpy(nvram, &nvram_buf[sizeof(struct nvram_header)], *nvram_size);
+
+	return nvram;
+}
+EXPORT_SYMBOL(bcm47xx_nvram_get_contents);
+
+MODULE_LICENSE("GPLv2");
diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h
index c73927c66c2c..2793652fbf66 100644
--- a/include/linux/bcm47xx_nvram.h
+++ b/include/linux/bcm47xx_nvram.h
@@ -12,7 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/vmalloc.h>
 
-#ifdef CONFIG_BCM47XX
+#ifdef CONFIG_BCM47XX_NVRAM
 int bcm47xx_nvram_init_from_mem(u32 base, u32 lim);
 int bcm47xx_nvram_getenv(const char *name, char *val, size_t val_len);
 int bcm47xx_nvram_gpio_pin(const char *name);
-- 
cgit v1.2.3


From d0497524658e37956737d7dbee73cc42120255dc Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sun, 21 Jun 2015 19:11:44 +0800
Subject: crypto: user - Move cryptouser.h to uapi

The header file cryptouser.h only contains information that is
exported to user-space.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/cryptouser.h      | 110 ----------------------------------------
 include/uapi/linux/cryptouser.h | 110 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+), 110 deletions(-)
 delete mode 100644 include/linux/cryptouser.h
 create mode 100644 include/uapi/linux/cryptouser.h

(limited to 'include/linux')

diff --git a/include/linux/cryptouser.h b/include/linux/cryptouser.h
deleted file mode 100644
index 36efbbbf2f83..000000000000
--- a/include/linux/cryptouser.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Crypto user configuration API.
- *
- * Copyright (C) 2011 secunet Security Networks AG
- * Copyright (C) 2011 Steffen Klassert <steffen.klassert@secunet.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- */
-
-/* Netlink configuration messages.  */
-enum {
-	CRYPTO_MSG_BASE = 0x10,
-	CRYPTO_MSG_NEWALG = 0x10,
-	CRYPTO_MSG_DELALG,
-	CRYPTO_MSG_UPDATEALG,
-	CRYPTO_MSG_GETALG,
-	__CRYPTO_MSG_MAX
-};
-#define CRYPTO_MSG_MAX (__CRYPTO_MSG_MAX - 1)
-#define CRYPTO_NR_MSGTYPES (CRYPTO_MSG_MAX + 1 - CRYPTO_MSG_BASE)
-
-#define CRYPTO_MAX_NAME CRYPTO_MAX_ALG_NAME
-
-/* Netlink message attributes.  */
-enum crypto_attr_type_t {
-	CRYPTOCFGA_UNSPEC,
-	CRYPTOCFGA_PRIORITY_VAL,	/* __u32 */
-	CRYPTOCFGA_REPORT_LARVAL,	/* struct crypto_report_larval */
-	CRYPTOCFGA_REPORT_HASH,		/* struct crypto_report_hash */
-	CRYPTOCFGA_REPORT_BLKCIPHER,	/* struct crypto_report_blkcipher */
-	CRYPTOCFGA_REPORT_AEAD,		/* struct crypto_report_aead */
-	CRYPTOCFGA_REPORT_COMPRESS,	/* struct crypto_report_comp */
-	CRYPTOCFGA_REPORT_RNG,		/* struct crypto_report_rng */
-	CRYPTOCFGA_REPORT_CIPHER,	/* struct crypto_report_cipher */
-	CRYPTOCFGA_REPORT_AKCIPHER,	/* struct crypto_report_akcipher */
-	__CRYPTOCFGA_MAX
-
-#define CRYPTOCFGA_MAX (__CRYPTOCFGA_MAX - 1)
-};
-
-struct crypto_user_alg {
-	char cru_name[CRYPTO_MAX_ALG_NAME];
-	char cru_driver_name[CRYPTO_MAX_ALG_NAME];
-	char cru_module_name[CRYPTO_MAX_ALG_NAME];
-	__u32 cru_type;
-	__u32 cru_mask;
-	__u32 cru_refcnt;
-	__u32 cru_flags;
-};
-
-struct crypto_report_larval {
-	char type[CRYPTO_MAX_NAME];
-};
-
-struct crypto_report_hash {
-	char type[CRYPTO_MAX_NAME];
-	unsigned int blocksize;
-	unsigned int digestsize;
-};
-
-struct crypto_report_cipher {
-	char type[CRYPTO_MAX_ALG_NAME];
-	unsigned int blocksize;
-	unsigned int min_keysize;
-	unsigned int max_keysize;
-};
-
-struct crypto_report_blkcipher {
-	char type[CRYPTO_MAX_NAME];
-	char geniv[CRYPTO_MAX_NAME];
-	unsigned int blocksize;
-	unsigned int min_keysize;
-	unsigned int max_keysize;
-	unsigned int ivsize;
-};
-
-struct crypto_report_aead {
-	char type[CRYPTO_MAX_NAME];
-	char geniv[CRYPTO_MAX_NAME];
-	unsigned int blocksize;
-	unsigned int maxauthsize;
-	unsigned int ivsize;
-};
-
-struct crypto_report_comp {
-	char type[CRYPTO_MAX_NAME];
-};
-
-struct crypto_report_rng {
-	char type[CRYPTO_MAX_NAME];
-	unsigned int seedsize;
-};
-
-struct crypto_report_akcipher {
-	char type[CRYPTO_MAX_NAME];
-};
-
-#define CRYPTO_REPORT_MAXSIZE (sizeof(struct crypto_user_alg) + \
-			       sizeof(struct crypto_report_blkcipher))
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
new file mode 100644
index 000000000000..36efbbbf2f83
--- /dev/null
+++ b/include/uapi/linux/cryptouser.h
@@ -0,0 +1,110 @@
+/*
+ * Crypto user configuration API.
+ *
+ * Copyright (C) 2011 secunet Security Networks AG
+ * Copyright (C) 2011 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* Netlink configuration messages.  */
+enum {
+	CRYPTO_MSG_BASE = 0x10,
+	CRYPTO_MSG_NEWALG = 0x10,
+	CRYPTO_MSG_DELALG,
+	CRYPTO_MSG_UPDATEALG,
+	CRYPTO_MSG_GETALG,
+	__CRYPTO_MSG_MAX
+};
+#define CRYPTO_MSG_MAX (__CRYPTO_MSG_MAX - 1)
+#define CRYPTO_NR_MSGTYPES (CRYPTO_MSG_MAX + 1 - CRYPTO_MSG_BASE)
+
+#define CRYPTO_MAX_NAME CRYPTO_MAX_ALG_NAME
+
+/* Netlink message attributes.  */
+enum crypto_attr_type_t {
+	CRYPTOCFGA_UNSPEC,
+	CRYPTOCFGA_PRIORITY_VAL,	/* __u32 */
+	CRYPTOCFGA_REPORT_LARVAL,	/* struct crypto_report_larval */
+	CRYPTOCFGA_REPORT_HASH,		/* struct crypto_report_hash */
+	CRYPTOCFGA_REPORT_BLKCIPHER,	/* struct crypto_report_blkcipher */
+	CRYPTOCFGA_REPORT_AEAD,		/* struct crypto_report_aead */
+	CRYPTOCFGA_REPORT_COMPRESS,	/* struct crypto_report_comp */
+	CRYPTOCFGA_REPORT_RNG,		/* struct crypto_report_rng */
+	CRYPTOCFGA_REPORT_CIPHER,	/* struct crypto_report_cipher */
+	CRYPTOCFGA_REPORT_AKCIPHER,	/* struct crypto_report_akcipher */
+	__CRYPTOCFGA_MAX
+
+#define CRYPTOCFGA_MAX (__CRYPTOCFGA_MAX - 1)
+};
+
+struct crypto_user_alg {
+	char cru_name[CRYPTO_MAX_ALG_NAME];
+	char cru_driver_name[CRYPTO_MAX_ALG_NAME];
+	char cru_module_name[CRYPTO_MAX_ALG_NAME];
+	__u32 cru_type;
+	__u32 cru_mask;
+	__u32 cru_refcnt;
+	__u32 cru_flags;
+};
+
+struct crypto_report_larval {
+	char type[CRYPTO_MAX_NAME];
+};
+
+struct crypto_report_hash {
+	char type[CRYPTO_MAX_NAME];
+	unsigned int blocksize;
+	unsigned int digestsize;
+};
+
+struct crypto_report_cipher {
+	char type[CRYPTO_MAX_ALG_NAME];
+	unsigned int blocksize;
+	unsigned int min_keysize;
+	unsigned int max_keysize;
+};
+
+struct crypto_report_blkcipher {
+	char type[CRYPTO_MAX_NAME];
+	char geniv[CRYPTO_MAX_NAME];
+	unsigned int blocksize;
+	unsigned int min_keysize;
+	unsigned int max_keysize;
+	unsigned int ivsize;
+};
+
+struct crypto_report_aead {
+	char type[CRYPTO_MAX_NAME];
+	char geniv[CRYPTO_MAX_NAME];
+	unsigned int blocksize;
+	unsigned int maxauthsize;
+	unsigned int ivsize;
+};
+
+struct crypto_report_comp {
+	char type[CRYPTO_MAX_NAME];
+};
+
+struct crypto_report_rng {
+	char type[CRYPTO_MAX_NAME];
+	unsigned int seedsize;
+};
+
+struct crypto_report_akcipher {
+	char type[CRYPTO_MAX_NAME];
+};
+
+#define CRYPTO_REPORT_MAXSIZE (sizeof(struct crypto_user_alg) + \
+			       sizeof(struct crypto_report_blkcipher))
-- 
cgit v1.2.3


From 3e90950d36f19e5477becd5acb02e9b8d8c8956f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 22 Jun 2015 10:31:40 +0800
Subject: crypto: algif_aead - Temporarily disable all AEAD algorithms

As the AEAD conversion is still ongoing, we do not yet wish to
export legacy AEAD implementations to user-space, as their calling
convention will change.

This patch actually disables all AEAD algorithms because some of
them (e.g., cryptd) will need to be modified to propagate this flag.

Subsequent patches will reenable them on an individual basis.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/algif_aead.c    | 3 ++-
 include/linux/crypto.h | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 38a6cab7aeca..e0408a480d2f 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -514,7 +514,8 @@ static struct proto_ops algif_aead_ops = {
 
 static void *aead_bind(const char *name, u32 type, u32 mask)
 {
-	return crypto_alloc_aead(name, type, mask);
+	return crypto_alloc_aead(name, type | CRYPTO_ALG_AEAD_NEW,
+				 mask | CRYPTO_ALG_AEAD_NEW);
 }
 
 static void aead_release(void *private)
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 964e5735a6a9..81ef938b0a8e 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -101,6 +101,12 @@
  */
 #define CRYPTO_ALG_INTERNAL		0x00002000
 
+/*
+ * Temporary flag used to prevent legacy AEAD implementations from
+ * being used by user-space.
+ */
+#define CRYPTO_ALG_AEAD_NEW		0x00004000
+
 /*
  * Transform masks and values (for crt_flags).
  */
-- 
cgit v1.2.3


From 8ccd0d0ca04147e91890c373677f1e741dda2631 Mon Sep 17 00:00:00 2001
From: Hyungwon Hwang <human.hwang@samsung.com>
Date: Fri, 12 Jun 2015 21:59:01 +0900
Subject: of: add helper for getting endpoint node of specific identifiers

When there are multiple ports or multiple endpoints in a port, they have to be
distinguished by the value of reg property. It is common. The drivers can get
the specific endpoint in the specific port via this function. Now the drivers
have to implement this code in themselves or have to force the order of dt nodes
to get the right node.

Signed-off-by: Hyungwon Hwang <human.hwang@samsung.com>
Acked-by: Rob Herring <robh+dt@kernel.org>
Signed-off-by: Inki Dae <inki.dae@samsung.com>
---
 drivers/of/base.c        | 33 +++++++++++++++++++++++++++++++++
 include/linux/of_graph.h |  8 ++++++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 99764db0875a..f3b583b81105 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -2232,6 +2232,39 @@ struct device_node *of_graph_get_next_endpoint(const struct device_node *parent,
 }
 EXPORT_SYMBOL(of_graph_get_next_endpoint);
 
+/**
+ * of_graph_get_endpoint_by_regs() - get endpoint node of specific identifiers
+ * @parent: pointer to the parent device node
+ * @port_reg: identifier (value of reg property) of the parent port node
+ * @reg: identifier (value of reg property) of the endpoint node
+ *
+ * Return: An 'endpoint' node pointer which is identified by reg and at the same
+ * is the child of a port node identified by port_reg. reg and port_reg are
+ * ignored when they are -1.
+ */
+struct device_node *of_graph_get_endpoint_by_regs(
+	const struct device_node *parent, int port_reg, int reg)
+{
+	struct of_endpoint endpoint;
+	struct device_node *node, *prev_node = NULL;
+
+	while (1) {
+		node = of_graph_get_next_endpoint(parent, prev_node);
+		of_node_put(prev_node);
+		if (!node)
+			break;
+
+		of_graph_parse_endpoint(node, &endpoint);
+		if (((port_reg == -1) || (endpoint.port == port_reg)) &&
+			((reg == -1) || (endpoint.id == reg)))
+			return node;
+
+		prev_node = node;
+	}
+
+	return NULL;
+}
+
 /**
  * of_graph_get_remote_port_parent() - get remote port's parent node
  * @node: pointer to a local endpoint device_node
diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h
index 7bc92e050608..1c1d5b901a72 100644
--- a/include/linux/of_graph.h
+++ b/include/linux/of_graph.h
@@ -45,6 +45,8 @@ int of_graph_parse_endpoint(const struct device_node *node,
 struct device_node *of_graph_get_port_by_id(struct device_node *node, u32 id);
 struct device_node *of_graph_get_next_endpoint(const struct device_node *parent,
 					struct device_node *previous);
+struct device_node *of_graph_get_endpoint_by_regs(
+		const struct device_node *parent, int port_reg, int reg);
 struct device_node *of_graph_get_remote_port_parent(
 					const struct device_node *node);
 struct device_node *of_graph_get_remote_port(const struct device_node *node);
@@ -69,6 +71,12 @@ static inline struct device_node *of_graph_get_next_endpoint(
 	return NULL;
 }
 
+struct device_node *of_graph_get_endpoint_by_regs(
+		const struct device_node *parent, int port_reg, int reg)
+{
+	return NULL;
+}
+
 static inline struct device_node *of_graph_get_remote_port_parent(
 					const struct device_node *node)
 {
-- 
cgit v1.2.3


From 7ce7b26f84cfcbcb04f526f56f685a56ccddf355 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Mon, 27 Apr 2015 21:54:13 +0900
Subject: mfd: Constify regmap and irq configuration data

Constify in various drivers configuration data which is not modified:
 - regmap_irq_chip,
 - individual regmap_irq's in array,
 - regmap_config,
 - irq_domain_ops,

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/88pm860x-core.c       | 2 +-
 drivers/mfd/ab8500-core.c         | 2 +-
 drivers/mfd/arizona-irq.c         | 2 +-
 drivers/mfd/da9052-irq.c          | 4 ++--
 drivers/mfd/da9055-core.c         | 6 +++---
 drivers/mfd/da9063-irq.c          | 4 ++--
 drivers/mfd/da9150-core.c         | 4 ++--
 drivers/mfd/db8500-prcmu.c        | 2 +-
 drivers/mfd/intel_soc_pmic_core.h | 2 +-
 drivers/mfd/intel_soc_pmic_crc.c  | 2 +-
 drivers/mfd/lp8788-irq.c          | 2 +-
 drivers/mfd/max8925-core.c        | 2 +-
 drivers/mfd/max8997-irq.c         | 2 +-
 drivers/mfd/max8998-irq.c         | 2 +-
 drivers/mfd/mt6397-core.c         | 2 +-
 drivers/mfd/stmpe.c               | 2 +-
 drivers/mfd/tc3589x.c             | 2 +-
 drivers/mfd/tps6586x.c            | 2 +-
 drivers/mfd/twl6030-irq.c         | 2 +-
 drivers/mfd/wm831x-irq.c          | 2 +-
 drivers/mfd/wm8994-irq.c          | 6 +++---
 include/linux/mfd/da9055/core.h   | 2 +-
 22 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/88pm860x-core.c b/drivers/mfd/88pm860x-core.c
index d2a85cde68da..e03b7f45b8f7 100644
--- a/drivers/mfd/88pm860x-core.c
+++ b/drivers/mfd/88pm860x-core.c
@@ -566,7 +566,7 @@ static int pm860x_irq_domain_map(struct irq_domain *d, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops pm860x_irq_domain_ops = {
+static const struct irq_domain_ops pm860x_irq_domain_ops = {
 	.map	= pm860x_irq_domain_map,
 	.xlate	= irq_domain_xlate_onetwocell,
 };
diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c
index c80a2925f8e5..000da72a0ae9 100644
--- a/drivers/mfd/ab8500-core.c
+++ b/drivers/mfd/ab8500-core.c
@@ -574,7 +574,7 @@ static int ab8500_irq_map(struct irq_domain *d, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops ab8500_irq_ops = {
+static const struct irq_domain_ops ab8500_irq_ops = {
 	.map    = ab8500_irq_map,
 	.xlate  = irq_domain_xlate_twocell,
 };
diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c
index d063b94b94b5..2b9965d53e4e 100644
--- a/drivers/mfd/arizona-irq.c
+++ b/drivers/mfd/arizona-irq.c
@@ -186,7 +186,7 @@ static int arizona_irq_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops arizona_domain_ops = {
+static const struct irq_domain_ops arizona_domain_ops = {
 	.map	= arizona_irq_map,
 	.xlate	= irq_domain_xlate_twocell,
 };
diff --git a/drivers/mfd/da9052-irq.c b/drivers/mfd/da9052-irq.c
index e65ca194fa98..f4cb4613140b 100644
--- a/drivers/mfd/da9052-irq.c
+++ b/drivers/mfd/da9052-irq.c
@@ -35,7 +35,7 @@
 #define DA9052_IRQ_MASK_POS_7		0x40
 #define DA9052_IRQ_MASK_POS_8		0x80
 
-static struct regmap_irq da9052_irqs[] = {
+static const struct regmap_irq da9052_irqs[] = {
 	[DA9052_IRQ_DCIN] = {
 		.reg_offset = 0,
 		.mask = DA9052_IRQ_MASK_POS_1,
@@ -166,7 +166,7 @@ static struct regmap_irq da9052_irqs[] = {
 	},
 };
 
-static struct regmap_irq_chip da9052_regmap_irq_chip = {
+static const struct regmap_irq_chip da9052_regmap_irq_chip = {
 	.name = "da9052_irq",
 	.status_base = DA9052_EVENT_A_REG,
 	.mask_base = DA9052_IRQ_MASK_A_REG,
diff --git a/drivers/mfd/da9055-core.c b/drivers/mfd/da9055-core.c
index b4d920c1ead1..177e65a12c12 100644
--- a/drivers/mfd/da9055-core.c
+++ b/drivers/mfd/da9055-core.c
@@ -222,7 +222,7 @@ static bool da9055_register_volatile(struct device *dev, unsigned int reg)
 	}
 }
 
-static struct regmap_irq da9055_irqs[] = {
+static const struct regmap_irq da9055_irqs[] = {
 	[DA9055_IRQ_NONKEY] = {
 		.reg_offset = 0,
 		.mask = DA9055_IRQ_NONKEY_MASK,
@@ -245,7 +245,7 @@ static struct regmap_irq da9055_irqs[] = {
 	},
 };
 
-struct regmap_config da9055_regmap_config = {
+const struct regmap_config da9055_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
 
@@ -367,7 +367,7 @@ static const struct mfd_cell da9055_devs[] = {
 	},
 };
 
-static struct regmap_irq_chip da9055_regmap_irq_chip = {
+static const struct regmap_irq_chip da9055_regmap_irq_chip = {
 	.name = "da9055_irq",
 	.status_base = DA9055_REG_EVENT_A,
 	.mask_base = DA9055_REG_IRQ_MASK_A,
diff --git a/drivers/mfd/da9063-irq.c b/drivers/mfd/da9063-irq.c
index 822922602ce9..eaf1ec9208b2 100644
--- a/drivers/mfd/da9063-irq.c
+++ b/drivers/mfd/da9063-irq.c
@@ -34,7 +34,7 @@ struct da9063_irq_data {
 	u8 mask;
 };
 
-static struct regmap_irq da9063_irqs[] = {
+static const struct regmap_irq da9063_irqs[] = {
 	/* DA9063 event A register */
 	[DA9063_IRQ_ONKEY] = {
 		.reg_offset = DA9063_REG_EVENT_A_OFFSET,
@@ -153,7 +153,7 @@ static struct regmap_irq da9063_irqs[] = {
 	},
 };
 
-static struct regmap_irq_chip da9063_irq_chip = {
+static const struct regmap_irq_chip da9063_irq_chip = {
 	.name = "da9063-irq",
 	.irqs = da9063_irqs,
 	.num_irqs = DA9063_NUM_IRQ,
diff --git a/drivers/mfd/da9150-core.c b/drivers/mfd/da9150-core.c
index 5549817df32e..94b9bbd1a69b 100644
--- a/drivers/mfd/da9150-core.c
+++ b/drivers/mfd/da9150-core.c
@@ -164,7 +164,7 @@ void da9150_bulk_write(struct da9150 *da9150, u16 reg, int count, const u8 *buf)
 }
 EXPORT_SYMBOL_GPL(da9150_bulk_write);
 
-static struct regmap_irq da9150_irqs[] = {
+static const struct regmap_irq da9150_irqs[] = {
 	[DA9150_IRQ_VBUS] = {
 		.reg_offset = 0,
 		.mask = DA9150_E_VBUS_MASK,
@@ -251,7 +251,7 @@ static struct regmap_irq da9150_irqs[] = {
 	},
 };
 
-static struct regmap_irq_chip da9150_regmap_irq_chip = {
+static const struct regmap_irq_chip da9150_regmap_irq_chip = {
 	.name = "da9150_irq",
 	.status_base = DA9150_EVENT_E,
 	.mask_base = DA9150_IRQ_MASK_E,
diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index cc1a404328c2..8b14740f9fca 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -2659,7 +2659,7 @@ static int db8500_irq_map(struct irq_domain *d, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops db8500_irq_ops = {
+static const struct irq_domain_ops db8500_irq_ops = {
 	.map    = db8500_irq_map,
 	.xlate  = irq_domain_xlate_twocell,
 };
diff --git a/drivers/mfd/intel_soc_pmic_core.h b/drivers/mfd/intel_soc_pmic_core.h
index 9498d6719847..ff2464bc172f 100644
--- a/drivers/mfd/intel_soc_pmic_core.h
+++ b/drivers/mfd/intel_soc_pmic_core.h
@@ -24,7 +24,7 @@ struct intel_soc_pmic_config {
 	struct mfd_cell *cell_dev;
 	int n_cell_devs;
 	const struct regmap_config *regmap_config;
-	struct regmap_irq_chip *irq_chip;
+	const struct regmap_irq_chip *irq_chip;
 };
 
 extern struct intel_soc_pmic_config intel_soc_pmic_config_crc;
diff --git a/drivers/mfd/intel_soc_pmic_crc.c b/drivers/mfd/intel_soc_pmic_crc.c
index 4cc1b324e971..7436075e8983 100644
--- a/drivers/mfd/intel_soc_pmic_crc.c
+++ b/drivers/mfd/intel_soc_pmic_crc.c
@@ -143,7 +143,7 @@ static const struct regmap_irq crystal_cove_irqs[] = {
 	},
 };
 
-static struct regmap_irq_chip crystal_cove_irq_chip = {
+static const struct regmap_irq_chip crystal_cove_irq_chip = {
 	.name = "Crystal Cove",
 	.irqs = crystal_cove_irqs,
 	.num_irqs = ARRAY_SIZE(crystal_cove_irqs),
diff --git a/drivers/mfd/lp8788-irq.c b/drivers/mfd/lp8788-irq.c
index 23982dbf014d..a87f2b548f71 100644
--- a/drivers/mfd/lp8788-irq.c
+++ b/drivers/mfd/lp8788-irq.c
@@ -151,7 +151,7 @@ static int lp8788_irq_map(struct irq_domain *d, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops lp8788_domain_ops = {
+static const struct irq_domain_ops lp8788_domain_ops = {
 	.map = lp8788_irq_map,
 };
 
diff --git a/drivers/mfd/max8925-core.c b/drivers/mfd/max8925-core.c
index 97a787ab3d51..8520bd68c1ff 100644
--- a/drivers/mfd/max8925-core.c
+++ b/drivers/mfd/max8925-core.c
@@ -658,7 +658,7 @@ static int max8925_irq_domain_map(struct irq_domain *d, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops max8925_irq_domain_ops = {
+static const struct irq_domain_ops max8925_irq_domain_ops = {
 	.map	= max8925_irq_domain_map,
 	.xlate	= irq_domain_xlate_onetwocell,
 };
diff --git a/drivers/mfd/max8997-irq.c b/drivers/mfd/max8997-irq.c
index 43fa61413e93..d3025be57f39 100644
--- a/drivers/mfd/max8997-irq.c
+++ b/drivers/mfd/max8997-irq.c
@@ -303,7 +303,7 @@ static int max8997_irq_domain_map(struct irq_domain *d, unsigned int irq,
 	return 0;
 }
 
-static struct irq_domain_ops max8997_irq_domain_ops = {
+static const struct irq_domain_ops max8997_irq_domain_ops = {
 	.map = max8997_irq_domain_map,
 };
 
diff --git a/drivers/mfd/max8998-irq.c b/drivers/mfd/max8998-irq.c
index c469477eb778..3702056628a8 100644
--- a/drivers/mfd/max8998-irq.c
+++ b/drivers/mfd/max8998-irq.c
@@ -214,7 +214,7 @@ static int max8998_irq_domain_map(struct irq_domain *d, unsigned int irq,
 	return 0;
 }
 
-static struct irq_domain_ops max8998_irq_domain_ops = {
+static const struct irq_domain_ops max8998_irq_domain_ops = {
 	.map = max8998_irq_domain_map,
 };
 
diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c
index 09bc7804952a..32775ebbce18 100644
--- a/drivers/mfd/mt6397-core.c
+++ b/drivers/mfd/mt6397-core.c
@@ -130,7 +130,7 @@ static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq,
 	return 0;
 }
 
-static struct irq_domain_ops mt6397_irq_domain_ops = {
+static const struct irq_domain_ops mt6397_irq_domain_ops = {
 	.map = mt6397_irq_domain_map,
 };
 
diff --git a/drivers/mfd/stmpe.c b/drivers/mfd/stmpe.c
index 2d7fae94c861..18c4d72d1d2a 100644
--- a/drivers/mfd/stmpe.c
+++ b/drivers/mfd/stmpe.c
@@ -989,7 +989,7 @@ static void stmpe_irq_unmap(struct irq_domain *d, unsigned int virq)
 		irq_set_chip_data(virq, NULL);
 }
 
-static struct irq_domain_ops stmpe_irq_ops = {
+static const struct irq_domain_ops stmpe_irq_ops = {
         .map    = stmpe_irq_map,
         .unmap  = stmpe_irq_unmap,
         .xlate  = irq_domain_xlate_twocell,
diff --git a/drivers/mfd/tc3589x.c b/drivers/mfd/tc3589x.c
index cf356395c9e9..96d420dfc15d 100644
--- a/drivers/mfd/tc3589x.c
+++ b/drivers/mfd/tc3589x.c
@@ -233,7 +233,7 @@ static void tc3589x_irq_unmap(struct irq_domain *d, unsigned int virq)
 	irq_set_chip_data(virq, NULL);
 }
 
-static struct irq_domain_ops tc3589x_irq_ops = {
+static const struct irq_domain_ops tc3589x_irq_ops = {
 	.map    = tc3589x_irq_map,
 	.unmap  = tc3589x_irq_unmap,
 	.xlate  = irq_domain_xlate_onecell,
diff --git a/drivers/mfd/tps6586x.c b/drivers/mfd/tps6586x.c
index 8e1dbc469580..e0a2583916ce 100644
--- a/drivers/mfd/tps6586x.c
+++ b/drivers/mfd/tps6586x.c
@@ -311,7 +311,7 @@ static int tps6586x_irq_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops tps6586x_domain_ops = {
+static const struct irq_domain_ops tps6586x_domain_ops = {
 	.map    = tps6586x_irq_map,
 	.xlate  = irq_domain_xlate_twocell,
 };
diff --git a/drivers/mfd/twl6030-irq.c b/drivers/mfd/twl6030-irq.c
index 2807e1a95663..20fb58179ada 100644
--- a/drivers/mfd/twl6030-irq.c
+++ b/drivers/mfd/twl6030-irq.c
@@ -376,7 +376,7 @@ static void twl6030_irq_unmap(struct irq_domain *d, unsigned int virq)
 	irq_set_chip_data(virq, NULL);
 }
 
-static struct irq_domain_ops twl6030_irq_domain_ops = {
+static const struct irq_domain_ops twl6030_irq_domain_ops = {
 	.map	= twl6030_irq_map,
 	.unmap	= twl6030_irq_unmap,
 	.xlate	= irq_domain_xlate_onetwocell,
diff --git a/drivers/mfd/wm831x-irq.c b/drivers/mfd/wm831x-irq.c
index 64e512eadf17..3da81263c764 100644
--- a/drivers/mfd/wm831x-irq.c
+++ b/drivers/mfd/wm831x-irq.c
@@ -564,7 +564,7 @@ static int wm831x_irq_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops wm831x_irq_domain_ops = {
+static const struct irq_domain_ops wm831x_irq_domain_ops = {
 	.map	= wm831x_irq_map,
 	.xlate	= irq_domain_xlate_twocell,
 };
diff --git a/drivers/mfd/wm8994-irq.c b/drivers/mfd/wm8994-irq.c
index a14407edbd89..55c380a67686 100644
--- a/drivers/mfd/wm8994-irq.c
+++ b/drivers/mfd/wm8994-irq.c
@@ -28,7 +28,7 @@
 
 #include <linux/delay.h>
 
-static struct regmap_irq wm8994_irqs[] = {
+static const struct regmap_irq wm8994_irqs[] = {
 	[WM8994_IRQ_TEMP_SHUT] = {
 		.reg_offset = 1,
 		.mask = WM8994_TEMP_SHUT_EINT,
@@ -128,7 +128,7 @@ static struct regmap_irq wm8994_irqs[] = {
 	},
 };
 
-static struct regmap_irq_chip wm8994_irq_chip = {
+static const struct regmap_irq_chip wm8994_irq_chip = {
 	.name = "wm8994",
 	.irqs = wm8994_irqs,
 	.num_irqs = ARRAY_SIZE(wm8994_irqs),
@@ -184,7 +184,7 @@ static int wm8994_edge_irq_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops wm8994_edge_irq_ops = {
+static const struct irq_domain_ops wm8994_edge_irq_ops = {
 	.map	= wm8994_edge_irq_map,
 	.xlate	= irq_domain_xlate_twocell,
 };
diff --git a/include/linux/mfd/da9055/core.h b/include/linux/mfd/da9055/core.h
index 956afa445998..5dc743fd63a6 100644
--- a/include/linux/mfd/da9055/core.h
+++ b/include/linux/mfd/da9055/core.h
@@ -89,6 +89,6 @@ static inline int da9055_reg_update(struct da9055 *da9055, unsigned char reg,
 int da9055_device_init(struct da9055 *da9055);
 void da9055_device_exit(struct da9055 *da9055);
 
-extern struct regmap_config da9055_regmap_config;
+extern const struct regmap_config da9055_regmap_config;
 
 #endif /* __DA9055_CORE_H */
-- 
cgit v1.2.3


From 5c188d748216f67c928d67a42f14b5569b6404a5 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 27 Apr 2015 10:18:14 -0700
Subject: mfd: twl4030-power: Fix pmic for boards that need AC charger disabled

I noticed the PMIC configuration on 37xx-evm won't actually shut down
the voltages during off-idle. Turns out 37xx-evm needs the AC charger
state transitions disabled like we are doing for SDP and LDP in the
legacy booting case.

Let's fix this for device tree based booting by setting up the quirk
flag based on the compatible flag. And let's also use the existing
define for STARTON_CHG.

Note that SDP and EVM do not have the PMIC clken wired to gate the
the oscillator while LDP has.

Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/twl4030-power.c | 45 ++++++++++++++++++++++++++++++++++++++-------
 include/linux/i2c/twl.h     |  1 +
 2 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/twl4030-power.c b/drivers/mfd/twl4030-power.c
index f440aed61305..04b539850e72 100644
--- a/drivers/mfd/twl4030-power.c
+++ b/drivers/mfd/twl4030-power.c
@@ -264,7 +264,9 @@ out:
 	return err;
 }
 
-static int twl4030_config_wakeup12_sequence(u8 address)
+static int
+twl4030_config_wakeup12_sequence(const struct twl4030_power_data *pdata,
+				 u8 address)
 {
 	int err = 0;
 	u8 data;
@@ -293,13 +295,14 @@ static int twl4030_config_wakeup12_sequence(u8 address)
 	if (err)
 		goto out;
 
-	if (machine_is_omap_3430sdp() || machine_is_omap_ldp()) {
+	if (pdata->ac_charger_quirk || machine_is_omap_3430sdp() ||
+	    machine_is_omap_ldp()) {
 		/* Disabling AC charger effect on sleep-active transitions */
 		err = twl_i2c_read_u8(TWL_MODULE_PM_MASTER, &data,
 				      R_CFG_P1_TRANSITION);
 		if (err)
 			goto out;
-		data &= ~(1<<1);
+		data &= ~STARTON_CHG;
 		err = twl_i2c_write_u8(TWL_MODULE_PM_MASTER, data,
 				       R_CFG_P1_TRANSITION);
 		if (err)
@@ -459,8 +462,9 @@ static int twl4030_configure_resource(struct twl4030_resconfig *rconfig)
 	return 0;
 }
 
-static int load_twl4030_script(struct twl4030_script *tscript,
-	       u8 address)
+static int load_twl4030_script(const struct twl4030_power_data *pdata,
+			       struct twl4030_script *tscript,
+			       u8 address)
 {
 	int err;
 	static int order;
@@ -487,7 +491,7 @@ static int load_twl4030_script(struct twl4030_script *tscript,
 		if (err)
 			goto out;
 
-		err = twl4030_config_wakeup12_sequence(address);
+		err = twl4030_config_wakeup12_sequence(pdata, address);
 		if (err)
 			goto out;
 		order = 1;
@@ -567,7 +571,7 @@ twl4030_power_configure_scripts(const struct twl4030_power_data *pdata)
 	u8 address = twl4030_start_script_address;
 
 	for (i = 0; i < pdata->num; i++) {
-		err = load_twl4030_script(pdata->scripts[i], address);
+		err = load_twl4030_script(pdata, pdata->scripts[i], address);
 		if (err)
 			return err;
 		address += pdata->scripts[i]->size;
@@ -829,6 +833,21 @@ static struct twl4030_power_data osc_off_idle = {
 	.board_config		= osc_off_rconfig,
 };
 
+static struct twl4030_power_data omap3_idle_ac_quirk = {
+	.scripts		= omap3_idle_scripts,
+	.num			= ARRAY_SIZE(omap3_idle_scripts),
+	.resource_config	= omap3_idle_rconfig,
+	.ac_charger_quirk	= true,
+};
+
+static struct twl4030_power_data omap3_idle_ac_quirk_osc_off = {
+	.scripts		= omap3_idle_scripts,
+	.num			= ARRAY_SIZE(omap3_idle_scripts),
+	.resource_config	= omap3_idle_rconfig,
+	.board_config		= osc_off_rconfig,
+	.ac_charger_quirk	= true,
+};
+
 static const struct of_device_id twl4030_power_of_match[] = {
 	{
 		.compatible = "ti,twl4030-power",
@@ -845,6 +864,18 @@ static const struct of_device_id twl4030_power_of_match[] = {
 		.compatible = "ti,twl4030-power-idle-osc-off",
 		.data = &osc_off_idle,
 	},
+	{
+		.compatible = "ti,twl4030-power-omap3-sdp",
+		.data = &omap3_idle_ac_quirk,
+	},
+	{
+		.compatible = "ti,twl4030-power-omap3-ldp",
+		.data = &omap3_idle_ac_quirk_osc_off,
+	},
+	{
+		.compatible = "ti,twl4030-power-omap3-evm",
+		.data = &omap3_idle_ac_quirk,
+	},
 	{ },
 };
 MODULE_DEVICE_TABLE(of, twl4030_power_of_match);
diff --git a/include/linux/i2c/twl.h b/include/linux/i2c/twl.h
index 0bc03f100d04..9ad7828d9d34 100644
--- a/include/linux/i2c/twl.h
+++ b/include/linux/i2c/twl.h
@@ -675,6 +675,7 @@ struct twl4030_power_data {
 	struct twl4030_resconfig *board_config;
 #define TWL4030_RESCONFIG_UNDEF	((u8)-1)
 	bool use_poweroff;	/* Board is wired for TWL poweroff */
+	bool ac_charger_quirk;	/* Disable AC charger on board */
 };
 
 extern int twl4030_remove_script(u8 flags);
-- 
cgit v1.2.3


From 8be4efad81d814b607cbdad47176f426be83ba75 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Date: Wed, 6 May 2015 19:07:16 +0200
Subject: mfd: max77686: Remove unused struct max77686_opmode_data

The defined struct max77686_opmode_data isn't used neither by
the max77686 mfd driver nor the drivers for its sub-devices.

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Reviewed-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/max77686.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/max77686.h b/include/linux/mfd/max77686.h
index bb995ab9a575..d4b72d519115 100644
--- a/include/linux/mfd/max77686.h
+++ b/include/linux/mfd/max77686.h
@@ -125,9 +125,4 @@ enum max77686_opmode {
 	MAX77686_OPMODE_STANDBY,
 };
 
-struct max77686_opmode_data {
-	int id;
-	int mode;
-};
-
 #endif /* __LINUX_MFD_MAX77686_H */
-- 
cgit v1.2.3


From e6cb73410a6db70eab266f15b7e25053a45b842d Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Date: Mon, 11 May 2015 13:58:09 +0100
Subject: mfd: arizona: Add better support for system suspend

Allow the chip to completely power off if we enter runtime suspend and
there is no jack detection active. This is helpful for systems where
system suspend might remove the supplies to the CODEC, without informing
us. Note the powering off is done in runtime suspend rather than system
suspend, because we need to hold reset until the first time DCVDD is
powered anyway (which would be in runtime resume), and we might as well
save the extra power.

Signed-off-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/arizona-core.c       | 93 +++++++++++++++++++++++++++++++++-------
 include/linux/mfd/arizona/core.h |  1 +
 2 files changed, 78 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c
index 93024acc145e..4f38ce654558 100644
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -442,12 +442,33 @@ static int arizona_runtime_resume(struct device *dev)
 
 	dev_dbg(arizona->dev, "Leaving AoD mode\n");
 
+	if (arizona->has_fully_powered_off) {
+		dev_dbg(arizona->dev, "Re-enabling core supplies\n");
+
+		ret = regulator_bulk_enable(arizona->num_core_supplies,
+					    arizona->core_supplies);
+		if (ret) {
+			dev_err(dev, "Failed to enable core supplies: %d\n",
+				ret);
+			return ret;
+		}
+	}
+
 	ret = regulator_enable(arizona->dcvdd);
 	if (ret != 0) {
 		dev_err(arizona->dev, "Failed to enable DCVDD: %d\n", ret);
+		if (arizona->has_fully_powered_off)
+			regulator_bulk_disable(arizona->num_core_supplies,
+					       arizona->core_supplies);
 		return ret;
 	}
 
+	if (arizona->has_fully_powered_off) {
+		arizona_disable_reset(arizona);
+		enable_irq(arizona->irq);
+		arizona->has_fully_powered_off = false;
+	}
+
 	regcache_cache_only(arizona->regmap, false);
 
 	switch (arizona->type) {
@@ -508,6 +529,14 @@ static int arizona_runtime_resume(struct device *dev)
 				goto err;
 			}
 		}
+
+		ret = wm5110_apply_sleep_patch(arizona);
+		if (ret) {
+			dev_err(arizona->dev,
+				"Failed to re-apply sleep patch: %d\n",
+				ret);
+			goto err;
+		}
 		break;
 	default:
 		ret = arizona_wait_for_boot(arizona);
@@ -545,10 +574,17 @@ err:
 static int arizona_runtime_suspend(struct device *dev)
 {
 	struct arizona *arizona = dev_get_drvdata(dev);
+	unsigned int val;
 	int ret;
 
 	dev_dbg(arizona->dev, "Entering AoD mode\n");
 
+	ret = regmap_read(arizona->regmap, ARIZONA_JACK_DETECT_ANALOGUE, &val);
+	if (ret) {
+		dev_err(dev, "Failed to check jack det status: %d\n", ret);
+		return ret;
+	}
+
 	if (arizona->external_dcvdd) {
 		ret = regmap_update_bits(arizona->regmap,
 					 ARIZONA_ISOLATION_CONTROL,
@@ -559,33 +595,58 @@ static int arizona_runtime_suspend(struct device *dev)
 				ret);
 			return ret;
 		}
-	} else {
-		switch (arizona->type) {
-		case WM5110:
-		case WM8280:
-			/*
-			 * As this is only called for the internal regulator
-			 * (where we know voltage ranges available) it is ok
-			 * to request an exact range.
-			 */
-			ret = regulator_set_voltage(arizona->dcvdd,
-						    1175000, 1175000);
-			if (ret < 0) {
+	}
+
+	switch (arizona->type) {
+	case WM5110:
+	case WM8280:
+		if (arizona->external_dcvdd)
+			break;
+
+		/*
+		 * As this is only called for the internal regulator
+		 * (where we know voltage ranges available) it is ok
+		 * to request an exact range.
+		 */
+		ret = regulator_set_voltage(arizona->dcvdd, 1175000, 1175000);
+		if (ret < 0) {
+			dev_err(arizona->dev,
+				"Failed to set suspend voltage: %d\n", ret);
+			return ret;
+		}
+		break;
+	case WM5102:
+		if (!(val & ARIZONA_JD1_ENA)) {
+			ret = regmap_write(arizona->regmap,
+					   ARIZONA_WRITE_SEQUENCER_CTRL_3, 0x0);
+			if (ret) {
 				dev_err(arizona->dev,
-					"Failed to set suspend voltage: %d\n",
+					"Failed to clear write sequencer: %d\n",
 					ret);
 				return ret;
 			}
-			break;
-		default:
-			break;
 		}
+		break;
+	default:
+		break;
 	}
 
 	regcache_cache_only(arizona->regmap, true);
 	regcache_mark_dirty(arizona->regmap);
 	regulator_disable(arizona->dcvdd);
 
+	/* Allow us to completely power down if no jack detection */
+	if (!(val & ARIZONA_JD1_ENA)) {
+		dev_dbg(arizona->dev, "Fully powering off\n");
+
+		arizona->has_fully_powered_off = true;
+
+		disable_irq(arizona->irq);
+		arizona_enable_reset(arizona);
+		regulator_bulk_disable(arizona->num_core_supplies,
+				       arizona->core_supplies);
+	}
+
 	return 0;
 }
 #endif
diff --git a/include/linux/mfd/arizona/core.h b/include/linux/mfd/arizona/core.h
index 16a498f48169..847bdaf47f1d 100644
--- a/include/linux/mfd/arizona/core.h
+++ b/include/linux/mfd/arizona/core.h
@@ -117,6 +117,7 @@ struct arizona {
 	int num_core_supplies;
 	struct regulator_bulk_data core_supplies[ARIZONA_MAX_CORE_SUPPLIES];
 	struct regulator *dcvdd;
+	bool has_fully_powered_off;
 
 	struct arizona_pdata pdata;
 
-- 
cgit v1.2.3


From fc027d138b79537e5353f3d3bad2bcaac787cd17 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
Date: Fri, 1 May 2015 16:15:12 +0100
Subject: mfd: arizona: Split INx_MODE into two fields

Later arizona silicon has the single/differential selector
in a different register, and IN1_MODE only selects between
analogue or digital. Prepare for this by splitting the
INx_MODE definition into two fields.

Signed-off-by: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/arizona-core.c            |  8 ++++++--
 include/linux/mfd/arizona/pdata.h     |  5 ++++-
 include/linux/mfd/arizona/registers.h | 27 ++++++++++++++++++---------
 3 files changed, 28 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c
index cb8a1b2f07f1..bebf58a06a6b 100644
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -1206,12 +1206,16 @@ int arizona_dev_init(struct arizona *arizona)
 		/* Default for both is 0 so noop with defaults */
 		val = arizona->pdata.dmic_ref[i]
 			<< ARIZONA_IN1_DMIC_SUP_SHIFT;
-		val |= arizona->pdata.inmode[i] << ARIZONA_IN1_MODE_SHIFT;
+		if (arizona->pdata.inmode[i] & ARIZONA_INMODE_DMIC)
+			val |= 1 << ARIZONA_IN1_MODE_SHIFT;
+		if (arizona->pdata.inmode[i] & ARIZONA_INMODE_SE)
+			val |= 1 << ARIZONA_IN1_SINGLE_ENDED_SHIFT;
 
 		regmap_update_bits(arizona->regmap,
 				   ARIZONA_IN1L_CONTROL + (i * 8),
 				   ARIZONA_IN1_DMIC_SUP_MASK |
-				   ARIZONA_IN1_MODE_MASK, val);
+				   ARIZONA_IN1_MODE_MASK |
+				   ARIZONA_IN1_SINGLE_ENDED_MASK, val);
 	}
 
 	for (i = 0; i < ARIZONA_MAX_OUTPUT; i++) {
diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h
index 1789cb0f4f17..f6722677e6d0 100644
--- a/include/linux/mfd/arizona/pdata.h
+++ b/include/linux/mfd/arizona/pdata.h
@@ -156,7 +156,10 @@ struct arizona_pdata {
 	/** MICBIAS configurations */
 	struct arizona_micbias micbias[ARIZONA_MAX_MICBIAS];
 
-	/** Mode of input structures */
+	/**
+	 * Mode of input structures
+	 * One of the ARIZONA_INMODE_xxx values
+	 */
 	int inmode[ARIZONA_MAX_INPUT];
 
 	/** Mode for outputs */
diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h
index aacc10d7789c..3499d36e6067 100644
--- a/include/linux/mfd/arizona/registers.h
+++ b/include/linux/mfd/arizona/registers.h
@@ -2515,9 +2515,12 @@
 #define ARIZONA_IN1_DMIC_SUP_MASK                0x1800  /* IN1_DMIC_SUP - [12:11] */
 #define ARIZONA_IN1_DMIC_SUP_SHIFT                   11  /* IN1_DMIC_SUP - [12:11] */
 #define ARIZONA_IN1_DMIC_SUP_WIDTH                    2  /* IN1_DMIC_SUP - [12:11] */
-#define ARIZONA_IN1_MODE_MASK                    0x0600  /* IN1_MODE - [10:9] */
-#define ARIZONA_IN1_MODE_SHIFT                        9  /* IN1_MODE - [10:9] */
-#define ARIZONA_IN1_MODE_WIDTH                        2  /* IN1_MODE - [10:9] */
+#define ARIZONA_IN1_MODE_MASK                    0x0400  /* IN1_MODE - [10] */
+#define ARIZONA_IN1_MODE_SHIFT                       10  /* IN1_MODE - [10] */
+#define ARIZONA_IN1_MODE_WIDTH                        1  /* IN1_MODE - [10] */
+#define ARIZONA_IN1_SINGLE_ENDED_MASK            0x0200  /* IN1_MODE - [9] */
+#define ARIZONA_IN1_SINGLE_ENDED_SHIFT                9  /* IN1_MODE - [9] */
+#define ARIZONA_IN1_SINGLE_ENDED_WIDTH                1  /* IN1_MODE - [9] */
 #define ARIZONA_IN1L_PGA_VOL_MASK                0x00FE  /* IN1L_PGA_VOL - [7:1] */
 #define ARIZONA_IN1L_PGA_VOL_SHIFT                    1  /* IN1L_PGA_VOL - [7:1] */
 #define ARIZONA_IN1L_PGA_VOL_WIDTH                    7  /* IN1L_PGA_VOL - [7:1] */
@@ -2588,9 +2591,12 @@
 #define ARIZONA_IN2_DMIC_SUP_MASK                0x1800  /* IN2_DMIC_SUP - [12:11] */
 #define ARIZONA_IN2_DMIC_SUP_SHIFT                   11  /* IN2_DMIC_SUP - [12:11] */
 #define ARIZONA_IN2_DMIC_SUP_WIDTH                    2  /* IN2_DMIC_SUP - [12:11] */
-#define ARIZONA_IN2_MODE_MASK                    0x0600  /* IN2_MODE - [10:9] */
-#define ARIZONA_IN2_MODE_SHIFT                        9  /* IN2_MODE - [10:9] */
-#define ARIZONA_IN2_MODE_WIDTH                        2  /* IN2_MODE - [10:9] */
+#define ARIZONA_IN2_MODE_MASK                    0x0400  /* IN2_MODE - [10] */
+#define ARIZONA_IN2_MODE_SHIFT                       10  /* IN2_MODE - [10] */
+#define ARIZONA_IN2_MODE_WIDTH                        1  /* IN2_MODE - [10] */
+#define ARIZONA_IN2_SINGLE_ENDED_MASK            0x0200  /* IN2_MODE - [9] */
+#define ARIZONA_IN2_SINGLE_ENDED_SHIFT                9  /* IN2_MODE - [9] */
+#define ARIZONA_IN2_SINGLE_ENDED_WIDTH                1  /* IN2_MODE - [9] */
 #define ARIZONA_IN2L_PGA_VOL_MASK                0x00FE  /* IN2L_PGA_VOL - [7:1] */
 #define ARIZONA_IN2L_PGA_VOL_SHIFT                    1  /* IN2L_PGA_VOL - [7:1] */
 #define ARIZONA_IN2L_PGA_VOL_WIDTH                    7  /* IN2L_PGA_VOL - [7:1] */
@@ -2661,9 +2667,12 @@
 #define ARIZONA_IN3_DMIC_SUP_MASK                0x1800  /* IN3_DMIC_SUP - [12:11] */
 #define ARIZONA_IN3_DMIC_SUP_SHIFT                   11  /* IN3_DMIC_SUP - [12:11] */
 #define ARIZONA_IN3_DMIC_SUP_WIDTH                    2  /* IN3_DMIC_SUP - [12:11] */
-#define ARIZONA_IN3_MODE_MASK                    0x0600  /* IN3_MODE - [10:9] */
-#define ARIZONA_IN3_MODE_SHIFT                        9  /* IN3_MODE - [10:9] */
-#define ARIZONA_IN3_MODE_WIDTH                        2  /* IN3_MODE - [10:9] */
+#define ARIZONA_IN3_MODE_MASK                    0x0400  /* IN3_MODE - [10] */
+#define ARIZONA_IN3_MODE_SHIFT                       10  /* IN3_MODE - [10] */
+#define ARIZONA_IN3_MODE_WIDTH                        1  /* IN3_MODE - [10] */
+#define ARIZONA_IN3_SINGLE_ENDED_MASK            0x0200  /* IN3_MODE - [9] */
+#define ARIZONA_IN3_SINGLE_ENDED_SHIFT                9  /* IN3_MODE - [9] */
+#define ARIZONA_IN3_SINGLE_ENDED_WIDTH                1  /* IN3_MODE - [9] */
 #define ARIZONA_IN3L_PGA_VOL_MASK                0x00FE  /* IN3L_PGA_VOL - [7:1] */
 #define ARIZONA_IN3L_PGA_VOL_SHIFT                    1  /* IN3L_PGA_VOL - [7:1] */
 #define ARIZONA_IN3L_PGA_VOL_WIDTH                    7  /* IN3L_PGA_VOL - [7:1] */
-- 
cgit v1.2.3


From 7e2d67e94ab05f70d2a73d00283f46778386ca52 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
Date: Wed, 13 May 2015 16:52:25 +0100
Subject: mfd: arizona: Add stub for wm5102_patch()

For the WM5102 there is a dependency in the core code on wm5102_patch()
which only exists when CONFIG_MFD_WM5102 is defined. To avoid having
to sprinkle #ifdefs around the code it is given an alternative empty
stub version when CONFIG_MFD_WM5102 is deselected

Signed-off-by: Richard Fitzgerald <rf@opensource.wolfsonmicro.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/arizona/core.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/arizona/core.h b/include/linux/mfd/arizona/core.h
index 847bdaf47f1d..2f434f4f79a1 100644
--- a/include/linux/mfd/arizona/core.h
+++ b/include/linux/mfd/arizona/core.h
@@ -154,7 +154,15 @@ int arizona_request_irq(struct arizona *arizona, int irq, char *name,
 void arizona_free_irq(struct arizona *arizona, int irq, void *data);
 int arizona_set_irq_wake(struct arizona *arizona, int irq, int on);
 
+#ifdef CONFIG_MFD_WM5102
 int wm5102_patch(struct arizona *arizona);
+#else
+static inline int wm5102_patch(struct arizona *arizona)
+{
+	return 0;
+}
+#endif
+
 int wm5110_patch(struct arizona *arizona);
 int wm8997_patch(struct arizona *arizona);
 
-- 
cgit v1.2.3


From 5104b7d7678b0029417f6ac08243773a77259ac6 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 17 Jun 2015 06:17:52 +0930
Subject: module: make perm const

Change the struct kernel_param.perm field to a const, as it should never
be changed.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (cut from larger patch)
---
 include/linux/moduleparam.h | 2 +-
 kernel/params.c             | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 7e0079936396..ab5031453807 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -68,7 +68,7 @@ enum {
 struct kernel_param {
 	const char *name;
 	const struct kernel_param_ops *ops;
-	u16 perm;
+	const u16 perm;
 	s8 level;
 	u8 flags;
 	union {
diff --git a/kernel/params.c b/kernel/params.c
index e906874da5fc..a8b09f6c87dc 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -395,12 +395,11 @@ EXPORT_SYMBOL(param_ops_invbool);
 
 int param_set_bint(const char *val, const struct kernel_param *kp)
 {
-	struct kernel_param boolkp;
+	/* Match bool exactly, by re-using it. */
+	struct kernel_param boolkp = *kp;
 	bool v;
 	int ret;
 
-	/* Match bool exactly, by re-using it. */
-	boolkp = *kp;
 	boolkp.arg = &v;
 
 	ret = param_set_bool(val, &boolkp);
@@ -480,9 +479,8 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
 {
 	int i, off, ret;
 	const struct kparam_array *arr = kp->arr;
-	struct kernel_param p;
+	struct kernel_param p = *kp;
 
-	p = *kp;
 	for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
 		if (i)
 			buffer[off++] = ',';
-- 
cgit v1.2.3


From b51d23e4e9fea6f264d39535c2a62d1f51e7ccc3 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 17 Jun 2015 06:18:52 +0930
Subject: module: add per-module param_lock

Add a "param_lock" mutex to each module, and update params.c to use
the correct built-in or module mutex while locking kernel params.
Remove the kparam_block_sysfs_r/w() macros, replace them with direct
calls to kernel_param_[un]lock(module).

The kernel param code currently uses a single mutex to protect
modification of any and all kernel params.  While this generally works,
there is one specific problem with it; a module callback function
cannot safely load another module, i.e. with request_module() or even
with indirect calls such as crypto_has_alg().  If the module to be
loaded has any of its params configured (e.g. with a /etc/modprobe.d/*
config file), then the attempt will result in a deadlock between the
first module param callback waiting for modprobe, and modprobe trying to
lock the single kernel param mutex to set the new module's param.

This fixes that by using per-module mutexes, so that each individual module
is protected against concurrent changes in its own kernel params, but is
not blocked by changes to other module params.  All built-in modules
continue to use the built-in mutex, since they will always be loaded at
runtime and references (e.g. request_module(), crypto_has_alg()) to them
will never cause load-time param changing.

This also simplifies the interface used by modules to block sysfs access
to their params; while there are currently functions to block and unblock
sysfs param access which are split up by read and write and expect a single
kernel param to be passed, their actual operation is identical and applies
to all params, not just the one passed to them; they simply lock and unlock
the global param mutex.  They are replaced with direct calls to
kernel_param_[un]lock(THIS_MODULE), which locks THIS_MODULE's param_lock, or
if the module is built-in, it locks the built-in mutex.

Suggested-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/um/drivers/hostaudio_kern.c                 | 20 ++++----
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c |  6 +--
 drivers/net/wireless/libertas_tf/if_usb.c        |  6 +--
 drivers/usb/atm/ueagle-atm.c                     |  4 +-
 drivers/video/fbdev/vt8623fb.c                   |  4 +-
 include/linux/module.h                           |  1 +
 include/linux/moduleparam.h                      | 61 ++++--------------------
 kernel/module.c                                  |  2 +
 kernel/params.c                                  | 50 +++++++++++--------
 net/mac80211/rate.c                              |  4 +-
 10 files changed, 65 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c
index 9b90fdc4b151..f6b911cc3923 100644
--- a/arch/um/drivers/hostaudio_kern.c
+++ b/arch/um/drivers/hostaudio_kern.c
@@ -185,9 +185,9 @@ static int hostaudio_open(struct inode *inode, struct file *file)
 	int ret;
 
 #ifdef DEBUG
-	kparam_block_sysfs_write(dsp);
+	kernel_param_lock(THIS_MODULE);
 	printk(KERN_DEBUG "hostaudio: open called (host: %s)\n", dsp);
-	kparam_unblock_sysfs_write(dsp);
+	kernel_param_unlock(THIS_MODULE);
 #endif
 
 	state = kmalloc(sizeof(struct hostaudio_state), GFP_KERNEL);
@@ -199,11 +199,11 @@ static int hostaudio_open(struct inode *inode, struct file *file)
 	if (file->f_mode & FMODE_WRITE)
 		w = 1;
 
-	kparam_block_sysfs_write(dsp);
+	kernel_param_lock(THIS_MODULE);
 	mutex_lock(&hostaudio_mutex);
 	ret = os_open_file(dsp, of_set_rw(OPENFLAGS(), r, w), 0);
 	mutex_unlock(&hostaudio_mutex);
-	kparam_unblock_sysfs_write(dsp);
+	kernel_param_unlock(THIS_MODULE);
 
 	if (ret < 0) {
 		kfree(state);
@@ -260,17 +260,17 @@ static int hostmixer_open_mixdev(struct inode *inode, struct file *file)
 	if (file->f_mode & FMODE_WRITE)
 		w = 1;
 
-	kparam_block_sysfs_write(mixer);
+	kernel_param_lock(THIS_MODULE);
 	mutex_lock(&hostaudio_mutex);
 	ret = os_open_file(mixer, of_set_rw(OPENFLAGS(), r, w), 0);
 	mutex_unlock(&hostaudio_mutex);
-	kparam_unblock_sysfs_write(mixer);
+	kernel_param_unlock(THIS_MODULE);
 
 	if (ret < 0) {
-		kparam_block_sysfs_write(dsp);
+		kernel_param_lock(THIS_MODULE);
 		printk(KERN_ERR "hostaudio_open_mixdev failed to open '%s', "
 		       "err = %d\n", dsp, -ret);
-		kparam_unblock_sysfs_write(dsp);
+		kernel_param_unlock(THIS_MODULE);
 		kfree(state);
 		return ret;
 	}
@@ -326,10 +326,10 @@ MODULE_LICENSE("GPL");
 
 static int __init hostaudio_init_module(void)
 {
-	__kernel_param_lock();
+	kernel_param_lock(THIS_MODULE);
 	printk(KERN_INFO "UML Audio Relay (host dsp = %s, host mixer = %s)\n",
 	       dsp, mixer);
-	__kernel_param_unlock();
+	kernel_param_unlock(THIS_MODULE);
 
 	module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1);
 	if (module_data.dev_audio < 0) {
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index 2bae50292dcd..83651ac8ddb9 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -279,7 +279,7 @@ MODULE_FIRMWARE("myri10ge_eth_z8e.dat");
 MODULE_FIRMWARE("myri10ge_rss_ethp_z8e.dat");
 MODULE_FIRMWARE("myri10ge_rss_eth_z8e.dat");
 
-/* Careful: must be accessed under kparam_block_sysfs_write */
+/* Careful: must be accessed under kernel_param_lock() */
 static char *myri10ge_fw_name = NULL;
 module_param(myri10ge_fw_name, charp, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(myri10ge_fw_name, "Firmware image name");
@@ -3427,7 +3427,7 @@ static void myri10ge_select_firmware(struct myri10ge_priv *mgp)
 		}
 	}
 
-	kparam_block_sysfs_write(myri10ge_fw_name);
+	kernel_param_lock(THIS_MODULE);
 	if (myri10ge_fw_name != NULL) {
 		char *fw_name = kstrdup(myri10ge_fw_name, GFP_KERNEL);
 		if (fw_name) {
@@ -3435,7 +3435,7 @@ static void myri10ge_select_firmware(struct myri10ge_priv *mgp)
 			set_fw_name(mgp, fw_name, true);
 		}
 	}
-	kparam_unblock_sysfs_write(myri10ge_fw_name);
+	kernel_param_unlock(THIS_MODULE);
 
 	if (mgp->board_number < MYRI10GE_MAX_BOARDS &&
 	    myri10ge_fw_names[mgp->board_number] != NULL &&
diff --git a/drivers/net/wireless/libertas_tf/if_usb.c b/drivers/net/wireless/libertas_tf/if_usb.c
index 1a20cee5febe..799a2efe5793 100644
--- a/drivers/net/wireless/libertas_tf/if_usb.c
+++ b/drivers/net/wireless/libertas_tf/if_usb.c
@@ -821,15 +821,15 @@ static int if_usb_prog_firmware(struct if_usb_card *cardp)
 
 	lbtf_deb_enter(LBTF_DEB_USB);
 
-	kparam_block_sysfs_write(fw_name);
+	kernel_param_lock(THIS_MODULE);
 	ret = request_firmware(&cardp->fw, lbtf_fw_name, &cardp->udev->dev);
 	if (ret < 0) {
 		pr_err("request_firmware() failed with %#x\n", ret);
 		pr_err("firmware %s not found\n", lbtf_fw_name);
-		kparam_unblock_sysfs_write(fw_name);
+		kernel_param_unlock(THIS_MODULE);
 		goto done;
 	}
-	kparam_unblock_sysfs_write(fw_name);
+	kernel_param_unlock(THIS_MODULE);
 
 	if (check_fwfile_format(cardp->fw->data, cardp->fw->size))
 		goto release_fw;
diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c
index 888998a7fe31..a2ae88dbda78 100644
--- a/drivers/usb/atm/ueagle-atm.c
+++ b/drivers/usb/atm/ueagle-atm.c
@@ -1599,7 +1599,7 @@ static void cmvs_file_name(struct uea_softc *sc, char *const cmv_name, int ver)
 	char file_arr[] = "CMVxy.bin";
 	char *file;
 
-	kparam_block_sysfs_write(cmv_file);
+	kernel_param_lock(THIS_MODULE);
 	/* set proper name corresponding modem version and line type */
 	if (cmv_file[sc->modem_index] == NULL) {
 		if (UEA_CHIP_VERSION(sc) == ADI930)
@@ -1618,7 +1618,7 @@ static void cmvs_file_name(struct uea_softc *sc, char *const cmv_name, int ver)
 	strlcat(cmv_name, file, UEA_FW_NAME_MAX);
 	if (ver == 2)
 		strlcat(cmv_name, ".v2", UEA_FW_NAME_MAX);
-	kparam_unblock_sysfs_write(cmv_file);
+	kernel_param_unlock(THIS_MODULE);
 }
 
 static int request_cmvs_old(struct uea_softc *sc,
diff --git a/drivers/video/fbdev/vt8623fb.c b/drivers/video/fbdev/vt8623fb.c
index ea7f056ed5fe..8bac309c24b9 100644
--- a/drivers/video/fbdev/vt8623fb.c
+++ b/drivers/video/fbdev/vt8623fb.c
@@ -754,9 +754,9 @@ static int vt8623_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 
 	/* Prepare startup mode */
 
-	kparam_block_sysfs_write(mode_option);
+	kernel_param_lock(THIS_MODULE);
 	rc = fb_find_mode(&(info->var), info, mode_option, NULL, 0, NULL, 8);
-	kparam_unblock_sysfs_write(mode_option);
+	kernel_param_unlock(THIS_MODULE);
 	if (! ((rc == 1) || (rc == 2))) {
 		rc = -EINVAL;
 		dev_err(info->device, "mode %s not found\n", mode_option);
diff --git a/include/linux/module.h b/include/linux/module.h
index 4c1b02e1361d..6ba0e87fa804 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -240,6 +240,7 @@ struct module {
 	unsigned int num_syms;
 
 	/* Kernel parameters. */
+	struct mutex param_lock;
 	struct kernel_param *kp;
 	unsigned int num_kp;
 
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index ab5031453807..f1fdc50520d8 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -67,6 +67,7 @@ enum {
 
 struct kernel_param {
 	const char *name;
+	struct module *mod;
 	const struct kernel_param_ops *ops;
 	const u16 perm;
 	s8 level;
@@ -108,7 +109,7 @@ struct kparam_array
  *
  * @perm is 0 if the the variable is not to appear in sysfs, or 0444
  * for world-readable, 0644 for root-writable, etc.  Note that if it
- * is writable, you may need to use kparam_block_sysfs_write() around
+ * is writable, you may need to use kernel_param_lock() around
  * accesses (esp. charp, which can be kfreed when it changes).
  *
  * The @type is simply pasted to refer to a param_ops_##type and a
@@ -216,12 +217,12 @@ struct kparam_array
    parameters. */
 #define __module_param_call(prefix, name, ops, arg, perm, level, flags)	\
 	/* Default value instead of permissions? */			\
-	static const char __param_str_##name[] = prefix #name; \
+	static const char __param_str_##name[] = prefix #name;		\
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used								\
     __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
-	= { __param_str_##name, ops, VERIFY_OCTAL_PERMISSIONS(perm),	\
-	    level, flags, { arg } }
+	= { __param_str_##name, THIS_MODULE, ops,			\
+	    VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } }
 
 /* Obsolete - use module_param_cb() */
 #define module_param_call(name, set, get, arg, perm)			\
@@ -238,58 +239,14 @@ __check_old_set_param(int (*oldset)(const char *, struct kernel_param *))
 	return 0;
 }
 
-/**
- * kparam_block_sysfs_write - make sure a parameter isn't written via sysfs.
- * @name: the name of the parameter
- *
- * There's no point blocking write on a paramter that isn't writable via sysfs!
- */
-#define kparam_block_sysfs_write(name)			\
-	do {						\
-		BUG_ON(!(__param_##name.perm & 0222));	\
-		__kernel_param_lock();			\
-	} while (0)
-
-/**
- * kparam_unblock_sysfs_write - allows sysfs to write to a parameter again.
- * @name: the name of the parameter
- */
-#define kparam_unblock_sysfs_write(name)		\
-	do {						\
-		BUG_ON(!(__param_##name.perm & 0222));	\
-		__kernel_param_unlock();		\
-	} while (0)
-
-/**
- * kparam_block_sysfs_read - make sure a parameter isn't read via sysfs.
- * @name: the name of the parameter
- *
- * This also blocks sysfs writes.
- */
-#define kparam_block_sysfs_read(name)			\
-	do {						\
-		BUG_ON(!(__param_##name.perm & 0444));	\
-		__kernel_param_lock();			\
-	} while (0)
-
-/**
- * kparam_unblock_sysfs_read - allows sysfs to read a parameter again.
- * @name: the name of the parameter
- */
-#define kparam_unblock_sysfs_read(name)			\
-	do {						\
-		BUG_ON(!(__param_##name.perm & 0444));	\
-		__kernel_param_unlock();		\
-	} while (0)
-
 #ifdef CONFIG_SYSFS
-extern void __kernel_param_lock(void);
-extern void __kernel_param_unlock(void);
+extern void kernel_param_lock(struct module *mod);
+extern void kernel_param_unlock(struct module *mod);
 #else
-static inline void __kernel_param_lock(void)
+static inline void kernel_param_lock(struct module *mod)
 {
 }
-static inline void __kernel_param_unlock(void)
+static inline void kernel_param_unlock(struct module *mod)
 {
 }
 #endif
diff --git a/kernel/module.c b/kernel/module.c
index 427b99f1a4b3..8ec33ce202a6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3442,6 +3442,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	if (err)
 		goto unlink_mod;
 
+	mutex_init(&mod->param_lock);
+
 	/* Now we've got everything in the final locations, we can
 	 * find optional sections. */
 	err = find_module_sections(mod, info);
diff --git a/kernel/params.c b/kernel/params.c
index a8b09f6c87dc..8890d0b8dffc 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,15 +25,20 @@
 #include <linux/slab.h>
 #include <linux/ctype.h>
 
-/* Protects all parameters, and incidentally kmalloced_param list. */
+/* Protects all built-in parameters, modules use their own param_lock */
 static DEFINE_MUTEX(param_lock);
 
+/* Use the module's mutex, or if built-in use the built-in mutex */
+#define KPARAM_MUTEX(mod)	((mod) ? &(mod)->param_lock : &param_lock)
+#define KPARAM_IS_LOCKED(mod)	mutex_is_locked(KPARAM_MUTEX(mod))
+
 /* This just allows us to keep track of which parameters are kmalloced. */
 struct kmalloced_param {
 	struct list_head list;
 	char val[];
 };
 static LIST_HEAD(kmalloced_params);
+static DEFINE_SPINLOCK(kmalloced_params_lock);
 
 static void *kmalloc_parameter(unsigned int size)
 {
@@ -43,7 +48,10 @@ static void *kmalloc_parameter(unsigned int size)
 	if (!p)
 		return NULL;
 
+	spin_lock(&kmalloced_params_lock);
 	list_add(&p->list, &kmalloced_params);
+	spin_unlock(&kmalloced_params_lock);
+
 	return p->val;
 }
 
@@ -52,6 +60,7 @@ static void maybe_kfree_parameter(void *param)
 {
 	struct kmalloced_param *p;
 
+	spin_lock(&kmalloced_params_lock);
 	list_for_each_entry(p, &kmalloced_params, list) {
 		if (p->val == param) {
 			list_del(&p->list);
@@ -59,6 +68,7 @@ static void maybe_kfree_parameter(void *param)
 			break;
 		}
 	}
+	spin_unlock(&kmalloced_params_lock);
 }
 
 static char dash2underscore(char c)
@@ -118,10 +128,10 @@ static int parse_one(char *param,
 				return -EINVAL;
 			pr_debug("handling %s with %p\n", param,
 				params[i].ops->set);
-			mutex_lock(&param_lock);
+			kernel_param_lock(params[i].mod);
 			param_check_unsafe(&params[i]);
 			err = params[i].ops->set(val, &params[i]);
-			mutex_unlock(&param_lock);
+			kernel_param_unlock(params[i].mod);
 			return err;
 		}
 	}
@@ -417,7 +427,8 @@ const struct kernel_param_ops param_ops_bint = {
 EXPORT_SYMBOL(param_ops_bint);
 
 /* We break the rule and mangle the string. */
-static int param_array(const char *name,
+static int param_array(struct module *mod,
+		       const char *name,
 		       const char *val,
 		       unsigned int min, unsigned int max,
 		       void *elem, int elemsize,
@@ -448,7 +459,7 @@ static int param_array(const char *name,
 		/* nul-terminate and parse */
 		save = val[len];
 		((char *)val)[len] = '\0';
-		BUG_ON(!mutex_is_locked(&param_lock));
+		BUG_ON(!KPARAM_IS_LOCKED(mod));
 		ret = set(val, &kp);
 
 		if (ret != 0)
@@ -470,7 +481,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
 	const struct kparam_array *arr = kp->arr;
 	unsigned int temp_num;
 
-	return param_array(kp->name, val, 1, arr->max, arr->elem,
+	return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem,
 			   arr->elemsize, arr->ops->set, kp->level,
 			   arr->num ?: &temp_num);
 }
@@ -485,7 +496,7 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
 		if (i)
 			buffer[off++] = ',';
 		p.arg = arr->elem + arr->elemsize * i;
-		BUG_ON(!mutex_is_locked(&param_lock));
+		BUG_ON(!KPARAM_IS_LOCKED(p.mod));
 		ret = arr->ops->get(buffer + off, &p);
 		if (ret < 0)
 			return ret;
@@ -568,9 +579,9 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 	if (!attribute->param->ops->get)
 		return -EPERM;
 
-	mutex_lock(&param_lock);
+	kernel_param_lock(mk->mod);
 	count = attribute->param->ops->get(buf, attribute->param);
-	mutex_unlock(&param_lock);
+	kernel_param_unlock(mk->mod);
 	if (count > 0) {
 		strcat(buf, "\n");
 		++count;
@@ -580,7 +591,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
 
 /* sysfs always hands a nul-terminated string in buf.  We rely on that. */
 static ssize_t param_attr_store(struct module_attribute *mattr,
-				struct module_kobject *km,
+				struct module_kobject *mk,
 				const char *buf, size_t len)
 {
  	int err;
@@ -589,10 +600,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
 	if (!attribute->param->ops->set)
 		return -EPERM;
 
-	mutex_lock(&param_lock);
+	kernel_param_lock(mk->mod);
 	param_check_unsafe(attribute->param);
 	err = attribute->param->ops->set(buf, attribute->param);
-	mutex_unlock(&param_lock);
+	kernel_param_unlock(mk->mod);
 	if (!err)
 		return len;
 	return err;
@@ -605,18 +616,19 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
 #define __modinit __init
 #endif
 
-#ifdef CONFIG_SYSFS
-void __kernel_param_lock(void)
+void kernel_param_lock(struct module *mod)
 {
-	mutex_lock(&param_lock);
+	mutex_lock(KPARAM_MUTEX(mod));
 }
-EXPORT_SYMBOL(__kernel_param_lock);
 
-void __kernel_param_unlock(void)
+void kernel_param_unlock(struct module *mod)
 {
-	mutex_unlock(&param_lock);
+	mutex_unlock(KPARAM_MUTEX(mod));
 }
-EXPORT_SYMBOL(__kernel_param_unlock);
+
+#ifdef CONFIG_SYSFS
+EXPORT_SYMBOL(kernel_param_lock);
+EXPORT_SYMBOL(kernel_param_unlock);
 
 /*
  * add_sysfs_param - add a parameter to sysfs
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index d53355b011f5..8544e2eb570e 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -103,7 +103,7 @@ ieee80211_rate_control_ops_get(const char *name)
 	const struct rate_control_ops *ops;
 	const char *alg_name;
 
-	kparam_block_sysfs_write(ieee80211_default_rc_algo);
+	kernel_param_lock(THIS_MODULE);
 	if (!name)
 		alg_name = ieee80211_default_rc_algo;
 	else
@@ -117,7 +117,7 @@ ieee80211_rate_control_ops_get(const char *name)
 	/* try built-in one if specific alg requested but not found */
 	if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT))
 		ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT);
-	kparam_unblock_sysfs_write(ieee80211_default_rc_algo);
+	kernel_param_unlock(THIS_MODULE);
 
 	return ops;
 }
-- 
cgit v1.2.3


From dfe816c5e37272f2f3c1311f0e9934e1b4229261 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pagupta@redhat.com>
Date: Fri, 19 Jun 2015 19:47:53 +0530
Subject: macvtap: Increase limit of macvtap queues

Macvtap should be compatible with tuntap for
maximum number of queues.

commit 'baf71c5c1f80d82e92924050a60b5baaf97e3094 (tuntap:
Increase the number of queues in tun.)' removes
the limitations and increases number of queues in tuntap.
Now, Its safe to increase number of queues in Macvtap as well.

This patch also modifies 'macvtap_del_queues' function
to avoid extra memory allocation in stack.

Changes from v1->v2 :
Michael S. Tsirkin, Jason Wang  :
                  Better way to use linked list to
avoid use of extra memory in stack.
Sergei Shtylyov : Specify dependent commit's summary.

Signed-off-by: Pankaj Gupta <pagupta@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap.c      | 10 ++--------
 include/linux/if_macvlan.h |  2 +-
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 483afb19596c..6a64197f5bce 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -263,27 +263,21 @@ out:
 static void macvtap_del_queues(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
-	struct macvtap_queue *q, *tmp, *qlist[MAX_MACVTAP_QUEUES];
-	int i, j = 0;
+	struct macvtap_queue *q, *tmp;
 
 	ASSERT_RTNL();
 	list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) {
 		list_del_init(&q->next);
-		qlist[j++] = q;
 		RCU_INIT_POINTER(q->vlan, NULL);
 		if (q->enabled)
 			vlan->numvtaps--;
 		vlan->numqueues--;
+		sock_put(&q->sk);
 	}
-	for (i = 0; i < vlan->numvtaps; i++)
-		RCU_INIT_POINTER(vlan->taps[i], NULL);
 	BUG_ON(vlan->numvtaps);
 	BUG_ON(vlan->numqueues);
 	/* guarantee that any future macvtap_set_queue will fail */
 	vlan->numvtaps = MAX_MACVTAP_QUEUES;
-
-	for (--j; j >= 0; j--)
-		sock_put(&qlist[j]->sk);
 }
 
 static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb)
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 6f6929ea8a0c..a4ccc3122f93 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -29,7 +29,7 @@ struct macvtap_queue;
  * Maximum times a macvtap device can be opened. This can be used to
  * configure the number of receive queue, e.g. for multiqueue virtio.
  */
-#define MAX_MACVTAP_QUEUES	16
+#define MAX_MACVTAP_QUEUES	256
 
 #define MACVLAN_MC_FILTER_BITS	8
 #define MACVLAN_MC_FILTER_SZ	(1 << MACVLAN_MC_FILTER_BITS)
-- 
cgit v1.2.3


From 7d4f8d871ab15bd50a5771382ca2c9355b38d73c Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Mon, 22 Jun 2015 00:27:17 -0700
Subject: switchdev; add VLAN support for port's bridge_getlink

One more missing piece of the puzzle.  Add vlan dump support to switchdev
port's bridge_getlink.  iproute2 "bridge vlan show" cmd already knows how
to show the vlans installed on the bridge and the device , but (until now)
no one implemented the port vlan part of the netlink PF_BRIDGE:RTM_GETLINK
msg.  Before this patch, "bridge vlan show":

	$ bridge -c vlan show
	port    vlan ids
	sw1p1    30-34			<< bridge side vlans
		 57

	sw1p1				<< device side vlans (missing)

	sw1p2    57

	sw1p2

	sw1p3

	sw1p4

	br0     None

(When the port is bridged, the output repeats the vlan list for the vlans
on the bridge side of the port and the vlans on the device side of the
port.  The listing above show no vlans for the device side even though they
are installed).

After this patch:

	$ bridge -c vlan show
	port    vlan ids
	sw1p1    30-34			<< bridge side vlan
		 57

	sw1p1    30-34			<< device side vlans
		 57
		 3840 PVID

	sw1p2    57

	sw1p2    57
		 3840 PVID

	sw1p3    3842 PVID

	sw1p4    3843 PVID

	br0     None

I re-used ndo_dflt_bridge_getlink to add vlan fill call-back func.
switchdev support adds an obj dump for VLAN objects, using the same
call-back scheme as FDB dump.  Support included for both compressed and
un-compressed vlan dumps.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/emulex/benet/be_main.c   |   2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c   |   4 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   3 +-
 drivers/net/ethernet/rocker/rocker.c          |  25 ++++++
 include/linux/rtnetlink.h                     |   6 +-
 net/core/rtnetlink.c                          |  18 +++-
 net/switchdev/switchdev.c                     | 123 +++++++++++++++++++++++++-
 7 files changed, 172 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index c0f34845cf59..6f642426308c 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -5096,7 +5096,7 @@ static int be_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	return ndo_dflt_bridge_getlink(skb, pid, seq, dev,
 				       hsw_mode == PORT_FWD_TYPE_VEPA ?
 				       BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB,
-				       0, 0, nlflags);
+				       0, 0, nlflags, filter_mask, NULL);
 }
 
 #ifdef CONFIG_BE2NET_VXLAN
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 52d7d8b8f1f9..48a52b35b614 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8069,7 +8069,7 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev,
 #ifdef HAVE_BRIDGE_FILTER
 static int i40e_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 				   struct net_device *dev,
-				   u32 __always_unused filter_mask, int nlflags)
+				   u32 filter_mask, int nlflags)
 #else
 static int i40e_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 				   struct net_device *dev, int nlflags)
@@ -8095,7 +8095,7 @@ static int i40e_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 		return 0;
 
 	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, veb->bridge_mode,
-				       nlflags);
+				       nlflags, 0, 0, filter_mask, NULL);
 }
 #endif /* HAVE_BRIDGE_ATTRIBS */
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 3bf2f3cfd9f6..9aa6104e34ea 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8095,7 +8095,8 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 		return 0;
 
 	return ndo_dflt_bridge_getlink(skb, pid, seq, dev,
-				       adapter->bridge_mode, 0, 0, nlflags);
+				       adapter->bridge_mode, 0, 0, nlflags,
+				       filter_mask, NULL);
 }
 
 static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index 7d5d92a10284..83e913b60d9c 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4456,6 +4456,28 @@ static int rocker_port_fdb_dump(const struct rocker_port *rocker_port,
 	return err;
 }
 
+static int rocker_port_vlan_dump(const struct rocker_port *rocker_port,
+				 struct switchdev_obj *obj)
+{
+	struct switchdev_obj_vlan *vlan = &obj->u.vlan;
+	u16 vid;
+	int err = 0;
+
+	for (vid = 1; vid < VLAN_N_VID; vid++) {
+		if (!test_bit(vid, rocker_port->vlan_bitmap))
+			continue;
+		vlan->flags = 0;
+		if (rocker_vlan_id_is_internal(htons(vid)))
+			vlan->flags |= BRIDGE_VLAN_INFO_PVID;
+		vlan->vid_begin = vlan->vid_end = vid;
+		err = obj->cb(rocker_port->dev, obj);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
 static int rocker_port_obj_dump(struct net_device *dev,
 				struct switchdev_obj *obj)
 {
@@ -4466,6 +4488,9 @@ static int rocker_port_obj_dump(struct net_device *dev,
 	case SWITCHDEV_OBJ_PORT_FDB:
 		err = rocker_port_fdb_dump(rocker_port, obj);
 		break;
+	case SWITCHDEV_OBJ_PORT_VLAN:
+		err = rocker_port_vlan_dump(rocker_port, obj);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index a2324fb45cf4..39adaa9529eb 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -114,5 +114,9 @@ extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
 
 extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 				   struct net_device *dev, u16 mode,
-				   u32 flags, u32 mask, int nlflags);
+				   u32 flags, u32 mask, int nlflags,
+				   u32 filter_mask,
+				   int (*vlan_fill)(struct sk_buff *skb,
+						    struct net_device *dev,
+						    u32 filter_mask));
 #endif	/* __LINUX_RTNETLINK_H */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2d102ce1474f..01ced4a889e0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2908,7 +2908,11 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
 
 int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 			    struct net_device *dev, u16 mode,
-			    u32 flags, u32 mask, int nlflags)
+			    u32 flags, u32 mask, int nlflags,
+			    u32 filter_mask,
+			    int (*vlan_fill)(struct sk_buff *skb,
+					     struct net_device *dev,
+					     u32 filter_mask))
 {
 	struct nlmsghdr *nlh;
 	struct ifinfomsg *ifm;
@@ -2916,6 +2920,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	struct nlattr *protinfo;
 	u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
 	struct net_device *br_dev = netdev_master_upper_dev_get(dev);
+	int err = 0;
 
 	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
 	if (nlh == NULL)
@@ -2956,6 +2961,13 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 			goto nla_put_failure;
 		}
 	}
+	if (vlan_fill) {
+		err = vlan_fill(skb, dev, filter_mask);
+		if (err) {
+			nla_nest_cancel(skb, br_afspec);
+			goto nla_put_failure;
+		}
+	}
 	nla_nest_end(skb, br_afspec);
 
 	protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
@@ -2989,9 +3001,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	return 0;
 nla_put_failure:
 	nlmsg_cancel(skb, nlh);
-	return -EMSGSIZE;
+	return err ? err : -EMSGSIZE;
 }
-EXPORT_SYMBOL(ndo_dflt_bridge_getlink);
+EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);
 
 static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
 {
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 448d9199cea2..3e4b35a0c8cb 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -391,6 +391,126 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
 
+struct switchdev_vlan_dump {
+	struct switchdev_obj obj;
+	struct sk_buff *skb;
+	u32 filter_mask;
+	u16 flags;
+	u16 begin;
+	u16 end;
+};
+
+static int switchdev_port_vlan_dump_put(struct net_device *dev,
+					struct switchdev_vlan_dump *dump)
+{
+	struct bridge_vlan_info vinfo;
+
+	vinfo.flags = dump->flags;
+
+	if (dump->begin == 0 && dump->end == 0) {
+		return 0;
+	} else if (dump->begin == dump->end) {
+		vinfo.vid = dump->begin;
+		if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO,
+			    sizeof(vinfo), &vinfo))
+			return -EMSGSIZE;
+	} else {
+		vinfo.vid = dump->begin;
+		vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN;
+		if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO,
+			    sizeof(vinfo), &vinfo))
+			return -EMSGSIZE;
+		vinfo.vid = dump->end;
+		vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN;
+		vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_END;
+		if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO,
+			    sizeof(vinfo), &vinfo))
+			return -EMSGSIZE;
+	}
+
+	return 0;
+}
+
+static int switchdev_port_vlan_dump_cb(struct net_device *dev,
+				       struct switchdev_obj *obj)
+{
+	struct switchdev_vlan_dump *dump =
+		container_of(obj, struct switchdev_vlan_dump, obj);
+	struct switchdev_obj_vlan *vlan = &dump->obj.u.vlan;
+	int err = 0;
+
+	if (vlan->vid_begin > vlan->vid_end)
+		return -EINVAL;
+
+	if (dump->filter_mask & RTEXT_FILTER_BRVLAN) {
+		dump->flags = vlan->flags;
+		for (dump->begin = dump->end = vlan->vid_begin;
+		     dump->begin <= vlan->vid_end;
+		     dump->begin++, dump->end++) {
+			err = switchdev_port_vlan_dump_put(dev, dump);
+			if (err)
+				return err;
+		}
+	} else if (dump->filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) {
+		if (dump->begin > vlan->vid_begin &&
+		    dump->begin >= vlan->vid_end) {
+			if ((dump->begin - 1) == vlan->vid_end &&
+			    dump->flags == vlan->flags) {
+				/* prepend */
+				dump->begin = vlan->vid_begin;
+			} else {
+				err = switchdev_port_vlan_dump_put(dev, dump);
+				dump->flags = vlan->flags;
+				dump->begin = vlan->vid_begin;
+				dump->end = vlan->vid_end;
+			}
+		} else if (dump->end <= vlan->vid_begin &&
+		           dump->end < vlan->vid_end) {
+			if ((dump->end  + 1) == vlan->vid_begin &&
+			    dump->flags == vlan->flags) {
+				/* append */
+				dump->end = vlan->vid_end;
+			} else {
+				err = switchdev_port_vlan_dump_put(dev, dump);
+				dump->flags = vlan->flags;
+				dump->begin = vlan->vid_begin;
+				dump->end = vlan->vid_end;
+			}
+		} else {
+			err = -EINVAL;
+		}
+	}
+
+	return err;
+}
+
+static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev,
+				    u32 filter_mask)
+{
+	struct switchdev_vlan_dump dump = {
+		.obj = {
+			.id = SWITCHDEV_OBJ_PORT_VLAN,
+			.cb = switchdev_port_vlan_dump_cb,
+		},
+		.skb = skb,
+		.filter_mask = filter_mask,
+	};
+	int err = 0;
+
+	if ((filter_mask & RTEXT_FILTER_BRVLAN) ||
+	    (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) {
+		err = switchdev_port_obj_dump(dev, &dump.obj);
+		if (err)
+			goto err_out;
+		if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)
+			/* last one */
+			err = switchdev_port_vlan_dump_put(dev, &dump);
+	}
+
+err_out:
+	return err == -EOPNOTSUPP ? 0 : err;
+}
+
 /**
  *	switchdev_port_bridge_getlink - Get bridge port attributes
  *
@@ -415,7 +535,8 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 		return err;
 
 	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode,
-				       attr.u.brport_flags, mask, nlflags);
+				       attr.u.brport_flags, mask, nlflags,
+				       filter_mask, switchdev_port_vlan_fill);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink);
 
-- 
cgit v1.2.3


From cca0ba2df3d4000bacd9b7807d46ffafac62d53a Mon Sep 17 00:00:00 2001
From: Hyungwon Hwang <human.hwang@samsung.com>
Date: Thu, 28 May 2015 16:25:15 +0900
Subject: backlight: Change the return type of backlight_update_status() to int

Backlight device returns the result of update_status(), but
backlight_update_status() ignores it. So the consumers cannot confirm the
result of their function call. This patch makes the result to be returned
back for consumers.

Signed-off-by: Hyungwon Hwang <human.hwang@samsung.com>
Acked-by: Jingoo Han <jingoohan1@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/backlight.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backlight.h b/include/linux/backlight.h
index adb14a8616df..1e7a69adbe6f 100644
--- a/include/linux/backlight.h
+++ b/include/linux/backlight.h
@@ -117,12 +117,16 @@ struct backlight_device {
 	int use_count;
 };
 
-static inline void backlight_update_status(struct backlight_device *bd)
+static inline int backlight_update_status(struct backlight_device *bd)
 {
+	int ret = -ENOENT;
+
 	mutex_lock(&bd->update_lock);
 	if (bd->ops && bd->ops->update_status)
-		bd->ops->update_status(bd);
+		ret = bd->ops->update_status(bd);
 	mutex_unlock(&bd->update_lock);
+
+	return ret;
 }
 
 extern struct backlight_device *backlight_device_register(const char *name,
-- 
cgit v1.2.3


From ce16b9d2356125eb791bd920c710b8512eecce54 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 17 Jun 2015 11:53:53 -0500
Subject: of: define of_find_node_by_phandle for !CONFIG_OF

Define stub implementation for of_find_node_by_phandle() API
so that users of this API can build properly even when CONFIG_OF
is not defined.

Fixes x86 randconfig build failure of remoteproc.

Signed-off-by: Suman Anna <s-anna@ti.com>
[robh: add details on fixing remoteproc compile]
Signed-off-by: Rob Herring <robh@kernel.org>
---
 include/linux/of.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index a3eb9d1e5800..54f858798e8c 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -422,6 +422,11 @@ static inline struct device_node *of_find_node_opts_by_path(const char *path,
 	return NULL;
 }
 
+static inline struct device_node *of_find_node_by_phandle(phandle handle)
+{
+	return NULL;
+}
+
 static inline struct device_node *of_get_parent(const struct device_node *node)
 {
 	return NULL;
-- 
cgit v1.2.3


From 9bf39ab2adafd7cf8740859cb49e7b7952813a5d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 19 Jun 2015 10:29:13 +0200
Subject: vfs: add file_path() helper

Turn
	d_path(&file->f_path, ...);
into
	file_path(file, ...);

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/arc/kernel/troubleshoot.c               | 10 +++-------
 arch/blackfin/kernel/trace.c                 |  2 +-
 arch/tile/kernel/stack.c                     |  2 +-
 arch/tile/mm/elf.c                           |  2 +-
 drivers/block/loop.c                         |  2 +-
 drivers/md/bitmap.c                          |  2 +-
 drivers/md/md.c                              |  2 +-
 drivers/usb/gadget/function/f_mass_storage.c |  2 +-
 drivers/usb/gadget/function/storage_common.c |  2 +-
 fs/binfmt_elf.c                              |  4 ++--
 fs/coredump.c                                |  2 +-
 fs/ext4/super.c                              |  2 +-
 fs/open.c                                    |  6 ++++++
 include/linux/fs.h                           |  2 ++
 kernel/events/core.c                         |  2 +-
 mm/memory.c                                  |  2 +-
 16 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index e00a01879025..9f80c5adcb68 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -67,15 +67,12 @@ static void print_task_path_n_nm(struct task_struct *tsk, char *buf)
 	mmput(mm);
 
 	if (exe_file) {
-		path = exe_file->f_path;
-		path_get(&exe_file->f_path);
+		path_nm = file_path(exe_file, buf, 255);
 		fput(exe_file);
-		path_nm = d_path(&path, buf, 255);
-		path_put(&path);
 	}
 
 done:
-	pr_info("Path: %s\n", path_nm);
+	pr_info("Path: %s\n", !IS_ERR(path_nm) ? path_nm : "?");
 }
 
 static void show_faulting_vma(unsigned long address, char *buf)
@@ -99,8 +96,7 @@ static void show_faulting_vma(unsigned long address, char *buf)
 	if (vma && (vma->vm_start <= address)) {
 		struct file *file = vma->vm_file;
 		if (file) {
-			struct path *path = &file->f_path;
-			nm = d_path(path, buf, PAGE_SIZE - 1);
+			nm = file_path(file, buf, PAGE_SIZE - 1);
 			inode = file_inode(vma->vm_file);
 			dev = inode->i_sb->s_dev;
 			ino = inode->i_ino;
diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c
index c36efa0c7163..719dd796c12c 100644
--- a/arch/blackfin/kernel/trace.c
+++ b/arch/blackfin/kernel/trace.c
@@ -136,7 +136,7 @@ void decode_address(char *buf, unsigned long address)
 				struct file *file = vma->vm_file;
 
 				if (file) {
-					char *d_name = d_path(&file->f_path, _tmpbuf,
+					char *d_name = file_path(file, _tmpbuf,
 						      sizeof(_tmpbuf));
 					if (!IS_ERR(d_name))
 						name = d_name;
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index c42dce50acd8..8d62cf12c2c0 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -334,7 +334,7 @@ static void describe_addr(struct KBacktraceIterator *kbt,
 	}
 
 	if (vma->vm_file) {
-		p = d_path(&vma->vm_file->f_path, buf, bufsize);
+		p = file_path(vma->vm_file, buf, bufsize);
 		if (IS_ERR(p))
 			p = "?";
 		name = kbasename(p);
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index f7ddae3725a4..6225cc998db1 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -56,7 +56,7 @@ static int notify_exec(struct mm_struct *mm)
 	if (exe_file == NULL)
 		goto done_free;
 
-	path = d_path(&exe_file->f_path, buf, PAGE_SIZE);
+	path = file_path(exe_file, buf, PAGE_SIZE);
 	if (IS_ERR(path))
 		goto done_put;
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d7173cb1ea76..0d8ad59413cd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -568,7 +568,7 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
 
 	spin_lock_irq(&lo->lo_lock);
 	if (lo->lo_backing_file)
-		p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+		p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
 	spin_unlock_irq(&lo->lo_lock);
 
 	if (IS_ERR_OR_NULL(p))
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2bc56e2a3526..dda33d648354 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -834,7 +834,7 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 		if (bitmap->storage.file) {
 			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
 			if (path)
-				ptr = d_path(&bitmap->storage.file->f_path,
+				ptr = file_path(bitmap->storage.file,
 					     path, PAGE_SIZE);
 
 			printk(KERN_ALERT
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 593a02476c78..e67f3ac137bf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5741,7 +5741,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
 	/* bitmap disabled, zero the first byte and copy out */
 	if (!mddev->bitmap_info.file)
 		file->pathname[0] = '\0';
-	else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+	else if ((ptr = file_path(mddev->bitmap_info.file,
 			       file->pathname, sizeof(file->pathname))),
 		 IS_ERR(ptr))
 		err = PTR_ERR(ptr);
diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c
index 3cc109f3c9c8..d2259c663996 100644
--- a/drivers/usb/gadget/function/f_mass_storage.c
+++ b/drivers/usb/gadget/function/f_mass_storage.c
@@ -2936,7 +2936,7 @@ int fsg_common_create_lun(struct fsg_common *common, struct fsg_lun_config *cfg,
 	if (fsg_lun_is_open(lun)) {
 		p = "(error)";
 		if (pathbuf) {
-			p = d_path(&lun->filp->f_path, pathbuf, PATH_MAX);
+			p = file_path(lun->filp, pathbuf, PATH_MAX);
 			if (IS_ERR(p))
 				p = "(error)";
 		}
diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c
index 648f9e489b39..d62683017cf3 100644
--- a/drivers/usb/gadget/function/storage_common.c
+++ b/drivers/usb/gadget/function/storage_common.c
@@ -341,7 +341,7 @@ ssize_t fsg_show_file(struct fsg_lun *curlun, struct rw_semaphore *filesem,
 
 	down_read(filesem);
 	if (fsg_lun_is_open(curlun)) {	/* Get the complete pathname */
-		p = d_path(&curlun->filp->f_path, buf, PAGE_SIZE - 1);
+		p = file_path(curlun->filp, buf, PAGE_SIZE - 1);
 		if (IS_ERR(p))
 			rc = PTR_ERR(p);
 		else {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 241ef68d2893..5046b6247103 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1530,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note)
 		file = vma->vm_file;
 		if (!file)
 			continue;
-		filename = d_path(&file->f_path, name_curpos, remaining);
+		filename = file_path(file, name_curpos, remaining);
 		if (IS_ERR(filename)) {
 			if (PTR_ERR(filename) == -ENAMETOOLONG) {
 				vfree(data);
@@ -1540,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note)
 			continue;
 		}
 
-		/* d_path() fills at the end, move name down */
+		/* file_path() fills at the end, move name down */
 		/* n = strlen(filename) + 1: */
 		n = (name_curpos + remaining) - filename;
 		remaining = filename - name_curpos;
diff --git a/fs/coredump.c b/fs/coredump.c
index bbbe139ab280..5b771b36cc6e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -138,7 +138,7 @@ static int cn_print_exe_file(struct core_name *cn)
 		goto put_exe_file;
 	}
 
-	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+	path = file_path(exe_file, pathbuf, PATH_MAX);
 	if (IS_ERR(path)) {
 		ret = PTR_ERR(path);
 		goto free_buf;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f06d0589ddba..0ae853d2e1f1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -449,7 +449,7 @@ void __ext4_error_file(struct file *file, const char *function,
 	es = EXT4_SB(inode->i_sb)->s_es;
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	if (ext4_error_ratelimit(inode->i_sb)) {
-		path = d_path(&(file->f_path), pathname, sizeof(pathname));
+		path = file_path(file, pathname, sizeof(pathname));
 		if (IS_ERR(path))
 			path = "(unknown)";
 		va_start(args, fmt);
diff --git a/fs/open.c b/fs/open.c
index b1c5823b7f11..1dbc79358d59 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -823,6 +823,12 @@ int finish_no_open(struct file *file, struct dentry *dentry)
 }
 EXPORT_SYMBOL(finish_no_open);
 
+char *file_path(struct file *filp, char *buf, int buflen)
+{
+	return d_path(&filp->f_path, buf, buflen);
+}
+EXPORT_SYMBOL(file_path);
+
 /**
  * vfs_open - open the file at the given path
  * @path: path to open
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2bd77e10e8e5..2c135ad741a9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2500,6 +2500,8 @@ extern struct file * open_exec(const char *);
 extern int is_subdir(struct dentry *, struct dentry *);
 extern int path_is_under(struct path *, struct path *);
 
+extern char *file_path(struct file *, char *, int);
+
 #include <linux/err.h>
 
 /* needed for stackable file system support */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81aa3a4ece9f..5c964e845483 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5791,7 +5791,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
-		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+		name = file_path(file, buf, PATH_MAX - sizeof(u64));
 		if (IS_ERR(name)) {
 			name = "//toolong";
 			goto cpy_name;
diff --git a/mm/memory.c b/mm/memory.c
index 22e037e3364e..28c10da1efbc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3724,7 +3724,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 		if (buf) {
 			char *p;
 
-			p = d_path(&f->f_path, buf, PAGE_SIZE);
+			p = file_path(f, buf, PAGE_SIZE);
 			if (IS_ERR(p))
 				p = "?";
 			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
-- 
cgit v1.2.3


From 2726d56620ce71f40dd583d51391b86e1ab8cc57 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 19 Jun 2015 10:30:28 +0200
Subject: vfs: add seq_file_path() helper

Turn
	seq_path(..., &file->f_path, ...);
into
	seq_file_path(..., file, ...);

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/md/bitmap.c      |  2 +-
 fs/proc/nommu.c          |  2 +-
 fs/proc/task_mmu.c       |  4 ++--
 fs/proc/task_nommu.c     |  2 +-
 fs/seq_file.c            | 14 ++++++++++++++
 include/linux/seq_file.h |  1 +
 mm/swapfile.c            |  2 +-
 7 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index dda33d648354..3813fdfee4be 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1922,7 +1922,7 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 		   chunk_kb ? "KB" : "B");
 	if (bitmap->storage.file) {
 		seq_printf(seq, ", file: ");
-		seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
+		seq_file_path(seq, bitmap->storage.file, " \t\n");
 	}
 
 	seq_printf(seq, "\n");
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index d4a35746cab9..f8595e8b5cd0 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -64,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "");
+		seq_file_path(m, file, "");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6dee68d013ff..ca1e091881d4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -310,7 +310,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	 */
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "\n");
+		seq_file_path(m, file, "\n");
 		goto done;
 	}
 
@@ -1509,7 +1509,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 
 	if (file) {
 		seq_puts(m, " file=");
-		seq_path(m, &file->f_path, "\n\t= ");
+		seq_file_path(m, file, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_puts(m, " heap");
 	} else {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 599ec2e20104..e0d64c92e4f6 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -180,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "");
+		seq_file_path(m, file, "");
 	} else if (mm) {
 		pid_t tid = pid_of_stack(priv, vma, is_pid);
 
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555f82155be8..d8a0545ad7ea 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -487,6 +487,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc)
 }
 EXPORT_SYMBOL(seq_path);
 
+/**
+ * seq_file_path - seq_file interface to print a pathname of a file
+ * @m: the seq_file handle
+ * @file: the struct file to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path to the file.
+ */
+int seq_file_path(struct seq_file *m, struct file *file, const char *esc)
+{
+	return seq_path(m, &file->f_path, esc);
+}
+EXPORT_SYMBOL(seq_file_path);
+
 /*
  * Same as seq_path, but relative to supplied root.
  */
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index afbb1fd77c77..912a7c482649 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -123,6 +123,7 @@ __printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
 __printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
 
 int seq_path(struct seq_file *, const struct path *, const char *);
+int seq_file_path(struct seq_file *, struct file *, const char *);
 int seq_dentry(struct seq_file *, struct dentry *, const char *);
 int seq_path_root(struct seq_file *m, const struct path *path,
 		  const struct path *root, const char *esc);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a7e72103f23b..41e4581af7c5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2032,7 +2032,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	}
 
 	file = si->swap_file;
-	len = seq_path(swap, &file->f_path, " \t\n\\");
+	len = seq_file_path(swap, file, " \t\n\\");
 	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
 			len < 40 ? 40 - len : 1, " ",
 			S_ISBLK(file_inode(file)->i_mode) ?
-- 
cgit v1.2.3


From 5fa8e0a1c6a762857ae67d1628c58b9a02362003 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 May 2015 16:05:53 +0200
Subject: fs: Rename file_remove_suid() to file_remove_privs()

file_remove_suid() is a misnomer since it removes also file capabilities
stored in xattrs and sets S_NOSEC flag. Also should_remove_suid() tells
something else than whether file_remove_suid() call is necessary which
leads to bugs.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/file.c    |  2 +-
 fs/ceph/file.c     |  2 +-
 fs/fuse/file.c     |  2 +-
 fs/inode.c         | 13 ++++++++-----
 fs/ntfs/file.c     |  2 +-
 fs/xfs/xfs_file.c  |  2 +-
 include/linux/fs.h |  2 +-
 mm/filemap.c       |  2 +-
 8 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b072e17479aa..86f97282779a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	}
 
 	current->backing_dev_info = inode_to_bdi(inode);
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		goto out;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b6b522b4b31..e55fe32c6224 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -959,7 +959,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	pos = iocb->ki_pos;
 	count = iov_iter_count(from);
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err)
 		goto out;
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ef05b5c4cff..1344647965dc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1169,7 +1169,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (err <= 0)
 		goto out;
 
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err)
 		goto out;
 
diff --git a/fs/inode.c b/fs/inode.c
index 07f4cb5eab4b..849210c155dc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1685,7 +1685,11 @@ static int __remove_suid(struct dentry *dentry, int kill)
 	return notify_change(dentry, &newattrs, NULL);
 }
 
-int file_remove_suid(struct file *file)
+/*
+ * Remove special file priviledges (suid, capabilities) when file is written
+ * to or truncated.
+ */
+int file_remove_privs(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
@@ -1711,7 +1715,7 @@ int file_remove_suid(struct file *file)
 
 	return error;
 }
-EXPORT_SYMBOL(file_remove_suid);
+EXPORT_SYMBOL(file_remove_privs);
 
 /**
  *	file_update_time	-	update mtime and ctime time
@@ -1966,9 +1970,8 @@ EXPORT_SYMBOL(inode_dio_wait);
  * inode is being instantiated).  The reason for the cmpxchg() loop
  * --- which wouldn't be necessary if all code paths which modify
  * i_flags actually followed this rule, is that there is at least one
- * code path which doesn't today --- for example,
- * __generic_file_aio_write() calls file_remove_suid() without holding
- * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+ * code path which doesn't today so we use cmpxchg() out of an abundance
+ * of caution.
  *
  * In the long run, i_mutex is overkill, and we should probably look
  * at using the i_lock spinlock to protect i_flags, and then make sure
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7bb487e663b4..182bb93aa79c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -382,7 +382,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
 	base_ni = ni;
 	if (NInoAttr(ni))
 		base_ni = ni->ext.base_ntfs_ino;
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (unlikely(err))
 		goto out;
 	/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8121e75352ee..f3e4fbb59985 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -623,7 +623,7 @@ restart:
 	 * setgid bits if the process is not being run by root.  This keeps
 	 * people from modifying setuid and setgid binaries.
 	 */
-	return file_remove_suid(file);
+	return file_remove_privs(file);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2c135ad741a9..641e68d850cf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2553,7 +2553,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb);
 extern struct inode *new_inode(struct super_block *sb);
 extern void free_inode_nonrcu(struct inode *inode);
 extern int should_remove_suid(struct dentry *);
-extern int file_remove_suid(struct file *);
+extern int file_remove_privs(struct file *);
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 static inline void insert_inode_hash(struct inode *inode)
diff --git a/mm/filemap.c b/mm/filemap.c
index 6bf5e42d560a..f851e36802d5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2536,7 +2536,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err)
 		goto out;
 
-- 
cgit v1.2.3


From dbfae0cdcd87602737101d4417811f4323156b54 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 May 2015 16:05:54 +0200
Subject: fs: Provide function telling whether file_remove_privs() will do
 anything

Provide function telling whether file_remove_privs() will do anything.
Currently we only have should_remove_suid() and that does something
slightly different.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 44 ++++++++++++++++++++++++++++++++------------
 include/linux/fs.h |  1 +
 2 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 849210c155dc..8c2dd74455c9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1673,7 +1673,32 @@ int should_remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(should_remove_suid);
 
-static int __remove_suid(struct dentry *dentry, int kill)
+/*
+ * Return mask of changes for notify_change() that need to be done as a
+ * response to write or truncate. Return 0 if nothing has to be changed.
+ * Negative value on error (change should be denied).
+ */
+int file_needs_remove_privs(struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = d_inode(dentry);
+	int mask = 0;
+	int ret;
+
+	if (IS_NOSEC(inode))
+		return 0;
+
+	mask = should_remove_suid(dentry);
+	ret = security_inode_need_killpriv(dentry);
+	if (ret < 0)
+		return ret;
+	if (ret)
+		mask |= ATTR_KILL_PRIV;
+	return mask;
+}
+EXPORT_SYMBOL(file_needs_remove_privs);
+
+static int __remove_privs(struct dentry *dentry, int kill)
 {
 	struct iattr newattrs;
 
@@ -1693,23 +1718,18 @@ int file_remove_privs(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
-	int killsuid;
-	int killpriv;
+	int kill;
 	int error = 0;
 
 	/* Fast path for nothing security related */
 	if (IS_NOSEC(inode))
 		return 0;
 
-	killsuid = should_remove_suid(dentry);
-	killpriv = security_inode_need_killpriv(dentry);
-
-	if (killpriv < 0)
-		return killpriv;
-	if (killpriv)
-		error = security_inode_killpriv(dentry);
-	if (!error && killsuid)
-		error = __remove_suid(dentry, killsuid);
+	kill = file_needs_remove_privs(file);
+	if (kill < 0)
+		return kill;
+	if (kill)
+		error = __remove_privs(dentry, kill);
 	if (!error)
 		inode_has_no_xattr(inode);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 641e68d850cf..ee60e8ab210f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2554,6 +2554,7 @@ extern struct inode *new_inode(struct super_block *sb);
 extern void free_inode_nonrcu(struct inode *inode);
 extern int should_remove_suid(struct dentry *);
 extern int file_remove_privs(struct file *);
+extern int file_needs_remove_privs(struct file *file);
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 static inline void insert_inode_hash(struct inode *inode)
-- 
cgit v1.2.3


From 45f147a1bc97c743c6101a8d2741c69a51f583e4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 May 2015 16:05:55 +0200
Subject: fs: Call security_ops->inode_killpriv on truncate

Comment in include/linux/security.h says that ->inode_killpriv() should
be called when setuid bit is being removed and that similar security
labels (in fact this applies only to file capabilities) should be
removed at this time as well. However we don't call ->inode_killpriv()
when we remove suid bit on truncate.

We fix the problem by calling ->inode_need_killpriv() and subsequently
->inode_killpriv() on truncate the same way as we do it on file write.

After this patch there's only one user of should_remove_suid() - ocfs2 -
and indeed it's buggy because it doesn't call ->inode_killpriv() on
write. However fixing it is difficult because of special locking
constraints.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 5 ++---
 fs/open.c          | 6 ++++--
 include/linux/fs.h | 6 +++++-
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 8c2dd74455c9..0401d2c6d087 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1678,9 +1678,8 @@ EXPORT_SYMBOL(should_remove_suid);
  * response to write or truncate. Return 0 if nothing has to be changed.
  * Negative value on error (change should be denied).
  */
-int file_needs_remove_privs(struct file *file)
+int dentry_needs_remove_privs(struct dentry *dentry)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
 	int mask = 0;
 	int ret;
@@ -1696,7 +1695,7 @@ int file_needs_remove_privs(struct file *file)
 		mask |= ATTR_KILL_PRIV;
 	return mask;
 }
-EXPORT_SYMBOL(file_needs_remove_privs);
+EXPORT_SYMBOL(dentry_needs_remove_privs);
 
 static int __remove_privs(struct dentry *dentry, int kill)
 {
diff --git a/fs/open.c b/fs/open.c
index 1dbc79358d59..e33dab287fa0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -51,8 +51,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 		newattrs.ia_valid |= ATTR_FILE;
 	}
 
-	/* Remove suid/sgid on truncate too */
-	ret = should_remove_suid(dentry);
+	/* Remove suid, sgid, and file capabilities on truncate too */
+	ret = dentry_needs_remove_privs(dentry);
+	if (ret < 0)
+		return ret;
 	if (ret)
 		newattrs.ia_valid |= ret | ATTR_FORCE;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ee60e8ab210f..1e658b11c265 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2554,7 +2554,11 @@ extern struct inode *new_inode(struct super_block *sb);
 extern void free_inode_nonrcu(struct inode *inode);
 extern int should_remove_suid(struct dentry *);
 extern int file_remove_privs(struct file *);
-extern int file_needs_remove_privs(struct file *file);
+extern int dentry_needs_remove_privs(struct dentry *dentry);
+static inline int file_needs_remove_privs(struct file *file)
+{
+	return dentry_needs_remove_privs(file->f_path.dentry);
+}
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 static inline void insert_inode_hash(struct inode *inode)
-- 
cgit v1.2.3


From b57c2cb9ea1a02c2ae08e16de8c20cc13ffbf85a Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Sun, 24 May 2015 17:19:41 +0200
Subject: pagemap.h: move dir_pages() over there

That function was declared in a lot of filesystems to calculate
directory pages.

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exofs/dir.c            | 6 ------
 fs/ext2/dir.c             | 5 -----
 fs/freevxfs/vxfs_lookup.c | 7 -------
 fs/minix/dir.c            | 5 -----
 fs/nilfs2/dir.c           | 5 -----
 fs/qnx6/dir.c             | 5 -----
 fs/sysv/dir.c             | 5 -----
 include/linux/pagemap.h   | 6 ++++++
 8 files changed, 6 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4deb0b05b011..e5bb2abf77f9 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-/* Accesses dir's inode->i_size must be called under inode lock */
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
-
 static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
 {
 	loff_t last_byte = inode->i_size;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 796b491e6978..0c6638b40f21 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 /*
  * Return the offset into page `page_nr' of the last valid
  * byte in that page, plus one.
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 99c7f0a37af4..484b32d3234a 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = {
 	.iterate =		vxfs_readdir,
 };
 
- 
-static inline u_long
-dir_pages(struct inode *inode)
-{
-	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
- 
 static inline u_long
 dir_blocks(struct inode *ip)
 {
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 118e4e7bc935..d19ac258105a 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr)
 	return last_byte;
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
 {
 	struct address_space *mapping = page->mapping;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 0ee0bed3649b..6b8b92b19cec 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 /*
  * Return the offset into page `page_nr' of the last valid
  * byte in that page, plus one.
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 8d64bb5366bf..e1f37278cf97 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
 	return page;
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static unsigned last_entry(struct inode *inode, unsigned long page_nr)
 {
 	unsigned long last_byte = inode->i_size;
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 8f3555f00c54..63c1bcb224ee 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
 {
 	struct address_space *mapping = page->mapping;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b3736f7065c..808942d31062 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -670,4 +670,10 @@ static inline int add_to_page_cache(struct page *page,
 	return error;
 }
 
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >>
+			       PAGE_CACHE_SHIFT;
+}
+
 #endif /* _LINUX_PAGEMAP_H */
-- 
cgit v1.2.3


From dc3f4198eac14e52a98dfc79cd84b45e280f59cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 18 May 2015 10:10:34 -0400
Subject: make simple_positive() public

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/inode.c |  2 +-
 arch/s390/hypfs/inode.c                   |  7 +------
 drivers/block/drbd/drbd_debugfs.c         | 10 +---------
 drivers/infiniband/hw/ipath/ipath_fs.c    |  2 +-
 drivers/infiniband/hw/qib/qib_fs.c        |  2 +-
 fs/autofs4/autofs_i.h                     |  5 -----
 fs/ceph/dir.c                             |  2 +-
 fs/configfs/inode.c                       |  2 +-
 fs/debugfs/inode.c                        | 11 +++--------
 fs/libfs.c                                |  5 -----
 fs/nfs/dir.c                              |  2 +-
 fs/tracefs/inode.c                        | 11 +++--------
 include/linux/dcache.h                    |  5 +++++
 security/inode.c                          | 19 ++++++-------------
 14 files changed, 25 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 1ba6307be4db..11634fa7ab3c 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -166,7 +166,7 @@ static void spufs_prune_dir(struct dentry *dir)
 	mutex_lock(&d_inode(dir)->i_mutex);
 	list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) {
 		spin_lock(&dentry->d_lock);
-		if (!(d_unhashed(dentry)) && d_really_is_positive(dentry)) {
+		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index d3f896a35b98..8ffad5437232 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -62,18 +62,13 @@ static void hypfs_add_dentry(struct dentry *dentry)
 	hypfs_last_dentry = dentry;
 }
 
-static inline int hypfs_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 static void hypfs_remove(struct dentry *dentry)
 {
 	struct dentry *parent;
 
 	parent = dentry->d_parent;
 	mutex_lock(&d_inode(parent)->i_mutex);
-	if (hypfs_positive(dentry)) {
+	if (simple_positive(dentry)) {
 		if (d_is_dir(dentry))
 			simple_rmdir(d_inode(parent), dentry);
 		else
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index a6ee3d750c30..6b88a35fb048 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -419,14 +419,6 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
 	return 0;
 }
 
-/* simple_positive(file->f_path.dentry) respectively debugfs_positive(),
- * but neither is "reachable" from here.
- * So we have our own inline version of it above.  :-( */
-static inline int debugfs_positive(struct dentry *dentry)
-{
-        return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 /* make sure at *open* time that the respective object won't go away. */
 static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
 		                void *data, struct kref *kref,
@@ -444,7 +436,7 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo
 	/* serialize with d_delete() */
 	mutex_lock(&d_inode(parent)->i_mutex);
 	/* Make sure the object is still alive */
-	if (debugfs_positive(file->f_path.dentry)
+	if (simple_positive(file->f_path.dentry)
 	&& kref_get_unless_zero(kref))
 		ret = 0;
 	mutex_unlock(&d_inode(parent)->i_mutex);
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index 1ca8e32a9592..25422a3a7238 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -277,7 +277,7 @@ static int remove_file(struct dentry *parent, char *name)
 	}
 
 	spin_lock(&tmp->d_lock);
-	if (!d_unhashed(tmp) && d_really_is_positive(tmp)) {
+	if (simple_positive(tmp)) {
 		dget_dlock(tmp);
 		__d_drop(tmp);
 		spin_unlock(&tmp->d_lock);
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index bdd5d3857203..13ef22bd9459 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -455,7 +455,7 @@ static int remove_file(struct dentry *parent, char *name)
 	}
 
 	spin_lock(&tmp->d_lock);
-	if (!d_unhashed(tmp) && d_really_is_positive(tmp)) {
+	if (simple_positive(tmp)) {
 		__d_drop(tmp);
 		spin_unlock(&tmp->d_lock);
 		simple_unlink(d_inode(parent), tmp);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 5b700ef1e59d..c37149b929be 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -238,11 +238,6 @@ static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
 	return d_inode(sbi->sb->s_root)->i_ino;
 }
 
-static inline int simple_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4248307fea90..edbb8da02a6a 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -161,7 +161,7 @@ more:
 		}
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 		if (di->lease_shared_gen == shared_gen &&
-		    !d_unhashed(dentry) && d_really_is_positive(dentry) &&
+		    simple_positive(dentry) &&
 		    ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
 		    ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
 		    fpos_cmp(ctx->pos, di->offset) <= 0)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 8d89f5fd0331..eae87575e681 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -236,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 
 	if (dentry) {
 		spin_lock(&dentry->d_lock);
-		if (!d_unhashed(dentry) && d_really_is_positive(dentry)) {
+		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7eaec88ea970..ef86ad6bdc3e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
 	return inode;
 }
 
-static inline int debugfs_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 struct debugfs_mount_opts {
 	kuid_t uid;
 	kgid_t gid;
@@ -522,7 +517,7 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
-	if (debugfs_positive(dentry)) {
+	if (simple_positive(dentry)) {
 		dget(dentry);
 		if (d_is_dir(dentry))
 			ret = simple_rmdir(d_inode(parent), dentry);
@@ -602,7 +597,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	 */
 	spin_lock(&parent->d_lock);
 	list_for_each_entry(child, &parent->d_subdirs, d_child) {
-		if (!debugfs_positive(child))
+		if (!simple_positive(child))
 			continue;
 
 		/* perhaps simple_empty(child) makes more sense */
@@ -623,7 +618,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 		 * from d_subdirs. When releasing the parent->d_lock we can
 		 * no longer trust that the next pointer is valid.
 		 * Restart the loop. We'll skip this one with the
-		 * debugfs_positive() check.
+		 * simple_positive() check.
 		 */
 		goto loop;
 	}
diff --git a/fs/libfs.c b/fs/libfs.c
index 65e1feca8b98..4d9e6c118fe1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,11 +20,6 @@
 
 #include "internal.h"
 
-static inline int simple_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		   struct kstat *stat)
 {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b2c8b31b2be7..b9108f4254a7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1771,7 +1771,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir);
 
 static void nfs_dentry_handle_enoent(struct dentry *dentry)
 {
-	if (d_really_is_positive(dentry) && !d_unhashed(dentry))
+	if (simple_positive(dentry))
 		d_delete(dentry);
 }
 
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index d92bdf3b079a..6e8a1400d662 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -496,16 +496,11 @@ struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *pare
 	return dentry;
 }
 
-static inline int tracefs_positive(struct dentry *dentry)
-{
-	return dentry->d_inode && !d_unhashed(dentry);
-}
-
 static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
-	if (tracefs_positive(dentry)) {
+	if (simple_positive(dentry)) {
 		if (dentry->d_inode) {
 			dget(dentry);
 			switch (dentry->d_inode->i_mode & S_IFMT) {
@@ -582,7 +577,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 	 */
 	spin_lock(&parent->d_lock);
 	list_for_each_entry(child, &parent->d_subdirs, d_child) {
-		if (!tracefs_positive(child))
+		if (!simple_positive(child))
 			continue;
 
 		/* perhaps simple_empty(child) makes more sense */
@@ -603,7 +598,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 		 * from d_subdirs. When releasing the parent->d_lock we can
 		 * no longer trust that the next pointer is valid.
 		 * Restart the loop. We'll skip this one with the
-		 * tracefs_positive() check.
+		 * simple_positive() check.
 		 */
 		goto loop;
 	}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 167ec0934049..d2d50249b7b2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -507,6 +507,11 @@ static inline bool d_really_is_positive(const struct dentry *dentry)
 	return dentry->d_inode != NULL;
 }
 
+static inline int simple_positive(struct dentry *dentry)
+{
+	return d_really_is_positive(dentry) && !d_unhashed(dentry);
+}
+
 extern void d_set_fallthru(struct dentry *dentry);
 
 static inline bool d_is_fallthru(const struct dentry *dentry)
diff --git a/security/inode.c b/security/inode.c
index 91503b79c5f8..6df0d8dae1e0 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -25,11 +25,6 @@
 static struct vfsmount *mount;
 static int mount_count;
 
-static inline int positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 static int fill_super(struct super_block *sb, void *data, int silent)
 {
 	static struct tree_descr files[] = {{""}};
@@ -201,14 +196,12 @@ void securityfs_remove(struct dentry *dentry)
 		return;
 
 	mutex_lock(&d_inode(parent)->i_mutex);
-	if (positive(dentry)) {
-		if (d_really_is_positive(dentry)) {
-			if (d_is_dir(dentry))
-				simple_rmdir(d_inode(parent), dentry);
-			else
-				simple_unlink(d_inode(parent), dentry);
-			dput(dentry);
-		}
+	if (simple_positive(dentry)) {
+		if (d_is_dir(dentry))
+			simple_rmdir(d_inode(parent), dentry);
+		else
+			simple_unlink(d_inode(parent), dentry);
+		dput(dentry);
 	}
 	mutex_unlock(&d_inode(parent)->i_mutex);
 	simple_release_fs(&mount, &mount_count);
-- 
cgit v1.2.3


From 38183b9c31cf21d8996d6eee2e3a14508b20c418 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 28 May 2015 17:20:58 +1000
Subject: rcu: merge fix for Convert ACCESS_ONCE() to READ_ONCE() and
 WRITE_ONCE()

This mirrors the change introduced by 7d0ae8086b8 of same title
in Linus' tree; it's not obvious as a merge resolution since we moved
the function.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index eae42c21d5fd..52bdec710ed7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -467,7 +467,7 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  */
 #define lockless_dereference(p) \
 ({ \
-	typeof(p) _________p1 = ACCESS_ONCE(p); \
+	typeof(p) _________p1 = READ_ONCE(p); \
 	smp_read_barrier_depends(); /* Dependency order vs. p above. */ \
 	(_________p1); \
 })
-- 
cgit v1.2.3


From ce0bdb849ad46e4b4e4cae6913b447ae9938bdcf Mon Sep 17 00:00:00 2001
From: Inki Dae <inki.dae@samsung.com>
Date: Tue, 23 Jun 2015 16:06:44 +0900
Subject: of: fix a build error to of_graph_get_endpoint_by_regs function

This patch fixes the below build error reported by Stephen,

     Stephen reported:
     After merging the drm-exynos tree, today's linux-next build (x86_64
     allmodconfig) failed like this:

     drivers/media/i2c/adv7604.o: In function `of_graph_get_endpoint_by_regs':
     adv7604.c:(.text+0x586c): multiple definition of `of_graph_get_endpoint_by_regs'
     drivers/media/i2c/adv7343.o:adv7343.c:(.text+0xa13): first defined here
     drivers/media/platform/soc_camera/atmel-isi.o: In function `of_graph_get_endpoint_by_regs':
     atmel-isi.c:(.text+0x1ec9): multiple definition of `of_graph_get_endpoint_by_regs'
     drivers/media/platform/soc_camera/soc_camera.o:soc_camera.c:(.text+0x2ce3): first defined here
     drivers/media/platform/soc_camera/rcar_vin.o: In function `of_graph_get_endpoint_by_regs':
     rcar_vin.c:(.text+0x307c): multiple definition of `of_graph_get_endpoint_by_regs'
     drivers/media/platform/soc_camera/soc_camera.o:soc_camera.c:(.text+0x2ce3): first defined here

     Caused by commit:
       a0f7001c18ca ("of: add helper for getting endpoint node of specific identifiers")

To fix the error, this patch declares of_graph_get_endpoint_by_regs function
with "static inline".

Signed-off-by: Inki Dae <inki.dae@samsung.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/of_graph.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/of_graph.h b/include/linux/of_graph.h
index 1c1d5b901a72..f8bcd0e21a26 100644
--- a/include/linux/of_graph.h
+++ b/include/linux/of_graph.h
@@ -71,7 +71,7 @@ static inline struct device_node *of_graph_get_next_endpoint(
 	return NULL;
 }
 
-struct device_node *of_graph_get_endpoint_by_regs(
+static inline struct device_node *of_graph_get_endpoint_by_regs(
 		const struct device_node *parent, int port_reg, int reg)
 {
 	return NULL;
-- 
cgit v1.2.3


From 0eeb075fad736fb92620af995c47c204bbb5e829 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@cumulusnetworks.com>
Date: Tue, 23 Jun 2015 13:45:37 -0400
Subject: net: ipv4 sysctl option to ignore routes when nexthop link is down

This feature is only enabled with the new per-interface or ipv4 global
sysctls called 'ignore_routes_with_linkdown'.

net.ipv4.conf.all.ignore_routes_with_linkdown = 0
net.ipv4.conf.default.ignore_routes_with_linkdown = 0
net.ipv4.conf.lo.ignore_routes_with_linkdown = 0
...

When the above sysctls are set, will report to userspace that a route is
dead and will no longer resolve to this nexthop when performing a fib
lookup.  This will signal to userspace that the route will not be
selected.  The signalling of a RTNH_F_DEAD is only passed to userspace
if the sysctl is enabled and link is down.  This was done as without it
the netlink listeners would have no idea whether or not a nexthop would
be selected.   The kernel only sets RTNH_F_DEAD internally if the
interface has IFF_UP cleared.

With the new sysctl set, the following behavior can be observed
(interface p8p1 is link-down):

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 dead linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1 dead linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2
90.0.0.1 via 70.0.0.2 dev p7p1  src 70.0.0.1
    cache
local 80.0.0.1 dev lo  src 80.0.0.1
    cache <local>
80.0.0.2 via 10.0.5.2 dev p9p1  src 10.0.5.15
    cache

While the route does remain in the table (so it can be modified if
needed rather than being wiped away as it would be if IFF_UP was
cleared), the proper next-hop is chosen automatically when the link is
down.  Now interface p8p1 is linked-up:

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2
192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2
90.0.0.1 via 80.0.0.2 dev p8p1  src 80.0.0.1
    cache
local 80.0.0.1 dev lo  src 80.0.0.1
    cache <local>
80.0.0.2 dev p8p1  src 80.0.0.1
    cache

and the output changes to what one would expect.

If the sysctl is not set, the following output would be expected when
p8p1 is down:

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1 linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2

Since the dead flag does not appear, there should be no expectation that
the kernel would skip using this route due to link being down.

v2: Split kernel changes into 2 patches, this actually makes a
behavioral change if the sysctl is set.  Also took suggestion from Alex
to simplify code by only checking sysctl during fib lookup and
suggestion from Scott to add a per-interface sysctl.

v3: Code clean-ups to make it more readable and efficient as well as a
reverse path check fix.

v4: Drop binary sysctl

v5: Whitespace fixups from Dave

v6: Style changes from Dave and checkpatch suggestions

v7: One more checkpatch fixup

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Acked-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h        |  3 +++
 include/net/fib_rules.h           |  3 ++-
 include/net/ip_fib.h              | 16 +++++++++-------
 include/uapi/linux/ip.h           |  1 +
 net/ipv4/devinet.c                |  2 ++
 net/ipv4/fib_frontend.c           |  6 +++---
 net/ipv4/fib_rules.c              |  5 +++--
 net/ipv4/fib_semantics.c          | 33 ++++++++++++++++++++++++++++-----
 net/ipv4/fib_trie.c               |  6 ++++++
 net/ipv4/netfilter/ipt_rpfilter.c |  2 +-
 net/ipv4/route.c                  | 10 +++++-----
 11 files changed, 63 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 0a21fbefdfbe..a4328cea376a 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -120,6 +120,9 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 	 || (!IN_DEV_FORWARD(in_dev) && \
 	  IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS)))
 
+#define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \
+	IN_DEV_CONF_GET((in_dev), IGNORE_ROUTES_WITH_LINKDOWN)
+
 #define IN_DEV_ARPFILTER(in_dev)	IN_DEV_ORCONF((in_dev), ARPFILTER)
 #define IN_DEV_ARP_ACCEPT(in_dev)	IN_DEV_ORCONF((in_dev), ARP_ACCEPT)
 #define IN_DEV_ARP_ANNOUNCE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 6d67383a5114..903a55efbffe 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -36,7 +36,8 @@ struct fib_lookup_arg {
 	void			*result;
 	struct fib_rule		*rule;
 	int			flags;
-#define FIB_LOOKUP_NOREF	1
+#define FIB_LOOKUP_NOREF		1
+#define FIB_LOOKUP_IGNORE_LINKSTATE	2
 };
 
 struct fib_rules_ops {
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f73d27c5575a..49c142bdf01e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -226,7 +226,7 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id)
 }
 
 static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
-			     struct fib_result *res)
+			     struct fib_result *res, unsigned int flags)
 {
 	struct fib_table *tb;
 	int err = -ENETUNREACH;
@@ -234,7 +234,7 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
 	rcu_read_lock();
 
 	tb = fib_get_table(net, RT_TABLE_MAIN);
-	if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
+	if (tb && !fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF))
 		err = 0;
 
 	rcu_read_unlock();
@@ -249,16 +249,18 @@ void __net_exit fib4_rules_exit(struct net *net);
 struct fib_table *fib_new_table(struct net *net, u32 id);
 struct fib_table *fib_get_table(struct net *net, u32 id);
 
-int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res);
+int __fib_lookup(struct net *net, struct flowi4 *flp,
+		 struct fib_result *res, unsigned int flags);
 
 static inline int fib_lookup(struct net *net, struct flowi4 *flp,
-			     struct fib_result *res)
+			     struct fib_result *res, unsigned int flags)
 {
 	struct fib_table *tb;
 	int err;
 
+	flags |= FIB_LOOKUP_NOREF;
 	if (net->ipv4.fib_has_custom_rules)
-		return __fib_lookup(net, flp, res);
+		return __fib_lookup(net, flp, res, flags);
 
 	rcu_read_lock();
 
@@ -266,11 +268,11 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 
 	for (err = 0; !err; err = -ENETUNREACH) {
 		tb = rcu_dereference_rtnl(net->ipv4.fib_main);
-		if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
+		if (tb && !fib_table_lookup(tb, flp, res, flags))
 			break;
 
 		tb = rcu_dereference_rtnl(net->ipv4.fib_default);
-		if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
+		if (tb && !fib_table_lookup(tb, flp, res, flags))
 			break;
 	}
 
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 411959405ab6..08f894d2ddbd 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -164,6 +164,7 @@ enum
 	IPV4_DEVCONF_ROUTE_LOCALNET,
 	IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
+	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 419d23c53ec7..7498716e8f54 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2169,6 +2169,8 @@ static struct devinet_sysctl_table {
 					"igmpv2_unsolicited_report_interval"),
 		DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 					"igmpv3_unsolicited_report_interval"),
+		DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
+					"ignore_routes_with_linkdown"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 534eb1485045..6bbc54940eb4 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -280,7 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
 		fl4.flowi4_scope = scope;
 		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
-		if (!fib_lookup(net, &fl4, &res))
+		if (!fib_lookup(net, &fl4, &res, 0))
 			return FIB_RES_PREFSRC(net, res);
 	} else {
 		scope = RT_SCOPE_LINK;
@@ -319,7 +319,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 
 	net = dev_net(dev);
-	if (fib_lookup(net, &fl4, &res))
+	if (fib_lookup(net, &fl4, &res, 0))
 		goto last_resort;
 	if (res.type != RTN_UNICAST &&
 	    (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
@@ -354,7 +354,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	fl4.flowi4_oif = dev->ifindex;
 
 	ret = 0;
-	if (fib_lookup(net, &fl4, &res) == 0) {
+	if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
 		if (res.type == RTN_UNICAST)
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 	}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 56151982f74e..18123d50f576 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -47,11 +47,12 @@ struct fib4_rule {
 #endif
 };
 
-int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+int __fib_lookup(struct net *net, struct flowi4 *flp,
+		 struct fib_result *res, unsigned int flags)
 {
 	struct fib_lookup_arg arg = {
 		.result = res,
-		.flags = FIB_LOOKUP_NOREF,
+		.flags = flags,
 	};
 	int err;
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b1b305b1e340..3bfccd83551c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -623,7 +623,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			/* It is not necessary, but requires a bit of thinking */
 			if (fl4.flowi4_scope < RT_SCOPE_LINK)
 				fl4.flowi4_scope = RT_SCOPE_LINK;
-			err = fib_lookup(net, &fl4, &res);
+			err = fib_lookup(net, &fl4, &res,
+					 FIB_LOOKUP_IGNORE_LINKSTATE);
 			if (err) {
 				rcu_read_unlock();
 				return err;
@@ -1035,12 +1036,20 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
 		goto nla_put_failure;
 	if (fi->fib_nhs == 1) {
+		struct in_device *in_dev;
+
 		if (fi->fib_nh->nh_gw &&
 		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
 			goto nla_put_failure;
 		if (fi->fib_nh->nh_oif &&
 		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
 			goto nla_put_failure;
+		if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
+			in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev);
+			if (in_dev &&
+			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+				rtm->rtm_flags |= RTNH_F_DEAD;
+		}
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		if (fi->fib_nh[0].nh_tclassid &&
 		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
@@ -1057,11 +1066,19 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			goto nla_put_failure;
 
 		for_nexthops(fi) {
+			struct in_device *in_dev;
+
 			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
 			if (!rtnh)
 				goto nla_put_failure;
 
 			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+			if (nh->nh_flags & RTNH_F_LINKDOWN) {
+				in_dev = __in_dev_get_rcu(nh->nh_dev);
+				if (in_dev &&
+				    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+					rtnh->rtnh_flags |= RTNH_F_DEAD;
+			}
 			rtnh->rtnh_hops = nh->nh_weight - 1;
 			rtnh->rtnh_ifindex = nh->nh_oif;
 
@@ -1310,16 +1327,22 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
 void fib_select_multipath(struct fib_result *res)
 {
 	struct fib_info *fi = res->fi;
+	struct in_device *in_dev;
 	int w;
 
 	spin_lock_bh(&fib_multipath_lock);
 	if (fi->fib_power <= 0) {
 		int power = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
-				power += nexthop_nh->nh_weight;
-				nexthop_nh->nh_power = nexthop_nh->nh_weight;
-			}
+			in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);
+			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
+				continue;
+			if (in_dev &&
+			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+			    nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+				continue;
+			power += nexthop_nh->nh_weight;
+			nexthop_nh->nh_power = nexthop_nh->nh_weight;
 		} endfor_nexthops(fi);
 		fi->fib_power = power;
 		if (power <= 0) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 6c666a9f1bd5..15d32612e3c6 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1412,9 +1412,15 @@ found:
 			continue;
 		for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
 			const struct fib_nh *nh = &fi->fib_nh[nhsel];
+			struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);
 
 			if (nh->nh_flags & RTNH_F_DEAD)
 				continue;
+			if (in_dev &&
+			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+			    nh->nh_flags & RTNH_F_LINKDOWN &&
+			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
+				continue;
 			if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
 				continue;
 
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 4bfaedf9b34e..8618fd150c96 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -40,7 +40,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
 	struct net *net = dev_net(dev);
 	int ret __maybe_unused;
 
-	if (fib_lookup(net, fl4, &res))
+	if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
 		return false;
 
 	if (res.type != RTN_UNICAST) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f6055984c307..d0362a2de3d3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -747,7 +747,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 		if (!(n->nud_state & NUD_VALID)) {
 			neigh_event_send(n, NULL);
 		} else {
-			if (fib_lookup(net, fl4, &res) == 0) {
+			if (fib_lookup(net, fl4, &res, 0) == 0) {
 				struct fib_nh *nh = &FIB_RES_NH(res);
 
 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
@@ -975,7 +975,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		return;
 
 	rcu_read_lock();
-	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
+	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
@@ -1186,7 +1186,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
 		fl4.flowi4_mark = skb->mark;
 
 		rcu_read_lock();
-		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
+		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
 		else
 			src = inet_select_addr(rt->dst.dev,
@@ -1716,7 +1716,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 	fl4.daddr = daddr;
 	fl4.saddr = saddr;
-	err = fib_lookup(net, &fl4, &res);
+	err = fib_lookup(net, &fl4, &res, 0);
 	if (err != 0) {
 		if (!IN_DEV_FORWARD(in_dev))
 			err = -EHOSTUNREACH;
@@ -2123,7 +2123,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 		goto make_route;
 	}
 
-	if (fib_lookup(net, fl4, &res)) {
+	if (fib_lookup(net, fl4, &res, 0)) {
 		res.fi = NULL;
 		res.table = NULL;
 		if (fl4->flowi4_oif) {
-- 
cgit v1.2.3


From be3a5d233922d73f27002ce2767f6ec03c3f473d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 23 Jun 2015 19:51:55 +0800
Subject: NFSv.2/pnfs Add a LAYOUTSTATS rpc function

Reviewed-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42.h          |   9 +++-
 fs/nfs/nfs42proc.c      |  27 +++++++++++
 fs/nfs/nfs42xdr.c       | 122 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4_fs.h        |   1 +
 fs/nfs/nfs4proc.c       |   4 +-
 fs/nfs/nfs4xdr.c        |   1 +
 include/linux/nfs4.h    |   1 +
 include/linux/nfs_xdr.h |  43 +++++++++++++++++
 8 files changed, 205 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 7afb8947dfdf..ff66ae700b89 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -5,11 +5,18 @@
 #ifndef __LINUX_FS_NFS_NFS4_2_H
 #define __LINUX_FS_NFS_NFS4_2_H
 
+/*
+ * FIXME:  four LAYOUTSTATS calls per compound at most! Do we need to support
+ * more? Need to consider not to pre-alloc too much for a compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
-
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+				   struct nfs42_layoutstat_data *);
 /* nfs4.2xdr.h */
 extern struct rpc_procinfo nfs4_2_procedures[];
 
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 3a9e75235f30..ac929685350b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -165,3 +165,30 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 
 	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
 }
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = {
+};
+
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+				   struct nfs42_layoutstat_data *data)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_layoutstat_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	task = rpc_run_task(&task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	return 0;
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 1a25b27248f2..9aae0205ef11 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -4,6 +4,8 @@
 #ifndef __LINUX_FS_NFS_NFS4_2XDR_H
 #define __LINUX_FS_NFS_NFS4_2XDR_H
 
+#include "nfs42.h"
+
 #define encode_fallocate_maxsz		(encode_stateid_maxsz + \
 					 2 /* offset */ + \
 					 2 /* length */)
@@ -22,6 +24,16 @@
 					 1 /* whence */ + \
 					 2 /* offset */ + \
 					 2 /* length */)
+#define encode_io_info_maxsz		4
+#define encode_layoutstats_maxsz	(op_decode_hdr_maxsz + \
+					2 /* offset */ + \
+					2 /* length */ + \
+					encode_stateid_maxsz + \
+					encode_io_info_maxsz + \
+					encode_io_info_maxsz + \
+					1 /* opaque devaddr4 length */ + \
+					XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz	(op_decode_hdr_maxsz)
 
 #define NFS4_enc_allocate_sz		(compound_encode_hdr_maxsz + \
 					 encode_putfh_maxsz + \
@@ -45,6 +57,14 @@
 #define NFS4_dec_seek_sz		(compound_decode_hdr_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_seek_maxsz)
+#define NFS4_enc_layoutstats_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putfh_maxsz + \
+					 PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
+#define NFS4_dec_layoutstats_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putfh_maxsz + \
+					 PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
 
 
 static void encode_fallocate(struct xdr_stream *xdr,
@@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr,
 	encode_uint32(xdr, args->sa_what);
 }
 
+static void encode_layoutstats(struct xdr_stream *xdr,
+			       struct nfs42_layoutstat_args *args,
+			       struct nfs42_layoutstat_devinfo *devinfo,
+			       struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
+	p = reserve_space(xdr, 8 + 8);
+	p = xdr_encode_hyper(p, devinfo->offset);
+	p = xdr_encode_hyper(p, devinfo->length);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
+	p = xdr_encode_hyper(p, devinfo->read_count);
+	p = xdr_encode_hyper(p, devinfo->read_bytes);
+	p = xdr_encode_hyper(p, devinfo->write_count);
+	p = xdr_encode_hyper(p, devinfo->write_bytes);
+	p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
+			NFS4_DEVICEID4_SIZE);
+	/* Encode layoutupdate4 */
+	*p++ = cpu_to_be32(devinfo->layout_type);
+	if (devinfo->layoutstats_encode != NULL)
+		devinfo->layoutstats_encode(xdr, args, devinfo);
+	else
+		encode_uint32(xdr, 0);
+}
+
 /*
  * Encode ALLOCATE request
  */
@@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode LAYOUTSTATS request
+ */
+static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs42_layoutstat_args *args)
+{
+	int i;
+
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+	for (i = 0; i < args->num_dev; i++)
+		encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
+	encode_nops(&hdr);
+}
+
 static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
 	return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -169,6 +238,28 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_layoutstats(struct xdr_stream *xdr,
+			      struct nfs42_layoutstat_res *res)
+{
+	int status;
+	__be32 *p;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTSTATS);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	res->rpc_status = be32_to_cpup(p++);
+	return 0;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 /*
  * Decode ALLOCATE request
  */
@@ -246,4 +337,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
 out:
 	return status;
 }
+
+/*
+ * Decode LAYOUTSTATS request
+ */
+static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs42_layoutstat_res *res)
+{
+	struct compound_hdr hdr;
+	int status, i;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+	for (i = 0; i < res->num_dev; i++) {
+		status = decode_layoutstats(xdr, res);
+		if (status)
+			goto out;
+	}
+out:
+	res->rpc_status = status;
+	return status;
+}
+
 #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index fdef424b0cd3..ea3bee919a76 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception
 extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
 			  struct rpc_message *, struct nfs4_sequence_args *,
 			  struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int);
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e9cd45f1d60f..643ce3a91b22 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -476,8 +476,8 @@ struct nfs4_call_sync_data {
 	struct nfs4_sequence_res *seq_res;
 };
 
-static void nfs4_init_sequence(struct nfs4_sequence_args *args,
-			       struct nfs4_sequence_res *res, int cache_reply)
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
+			struct nfs4_sequence_res *res, int cache_reply)
 {
 	args->sa_slot = NULL;
 	args->sa_cache_this = cache_reply;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a07555d6968c..558cd65dbdb7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7431,6 +7431,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(SEEK,		enc_seek,		dec_seek),
 	PROC(ALLOCATE,		enc_allocate,		dec_allocate),
 	PROC(DEALLOCATE,	enc_deallocate,		dec_deallocate),
+	PROC(LAYOUTSTATS,	enc_layoutstats,	dec_layoutstats),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 32201c269890..b8e72aad919c 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -500,6 +500,7 @@ enum {
 	NFSPROC4_CLNT_SEEK,
 	NFSPROC4_CLNT_ALLOCATE,
 	NFSPROC4_CLNT_DEALLOCATE,
+	NFSPROC4_CLNT_LAYOUTSTATS,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c959e7eb5bd4..7bbe50504211 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -316,6 +316,49 @@ struct nfs4_layoutreturn {
 	int rpc_status;
 };
 
+#define PNFS_LAYOUTSTATS_MAXSIZE 256
+
+struct nfs42_layoutstat_args;
+struct nfs42_layoutstat_devinfo;
+typedef	void (*layoutstats_encode_t)(struct xdr_stream *,
+		struct nfs42_layoutstat_args *,
+		struct nfs42_layoutstat_devinfo *);
+
+/* Per file per deviceid layoutstats */
+struct nfs42_layoutstat_devinfo {
+	struct nfs4_deviceid dev_id;
+	__u64 offset;
+	__u64 length;
+	__u64 read_count;
+	__u64 read_bytes;
+	__u64 write_count;
+	__u64 write_bytes;
+	__u32 layout_type;
+	layoutstats_encode_t layoutstats_encode;
+	void *layout_private;
+};
+
+struct nfs42_layoutstat_args {
+	struct nfs4_sequence_args seq_args;
+	struct nfs_fh *fh;
+	struct inode *inode;
+	nfs4_stateid stateid;
+	int num_dev;
+	struct nfs42_layoutstat_devinfo *devinfo;
+};
+
+struct nfs42_layoutstat_res {
+	struct nfs4_sequence_res seq_res;
+	int num_dev;
+	int rpc_status;
+};
+
+struct nfs42_layoutstat_data {
+	struct inode *inode;
+	struct nfs42_layoutstat_args args;
+	struct nfs42_layoutstat_res res;
+};
+
 struct stateowner_id {
 	__u64	create_time;
 	__u32	uniquifier;
-- 
cgit v1.2.3


From 1bfe3b259ff2e579b2acb190f874397d53274497 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Tue, 23 Jun 2015 19:52:03 +0800
Subject: nfs42: serialize LAYOUTSTATS calls of the same file

There is no need to report concurrently.

Reviewed-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42proc.c     | 3 +++
 fs/nfs/pnfs.c          | 7 +++++++
 include/linux/nfs_fs.h | 1 +
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index ee0248340a42..06c74cd93875 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -204,6 +204,9 @@ nfs42_layoutstat_release(void *calldata)
 		nfss->pnfs_curr_ld->cleanup_layoutstats(data);
 
 	pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+	smp_mb__before_atomic();
+	clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+	smp_mb__after_atomic();
 	nfs_iput_and_deactive(data->inode);
 	kfree(data->args.devinfo);
 	kfree(data);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6279cff7df74..5b5224341bcd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2257,6 +2257,7 @@ pnfs_report_layoutstat(struct inode *inode)
 {
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs42_layoutstat_data *data;
 	struct pnfs_layout_hdr *hdr;
 	int status = 0;
@@ -2264,6 +2265,9 @@ pnfs_report_layoutstat(struct inode *inode)
 	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
 		goto out;
 
+	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
+		goto out;
+
 	spin_lock(&inode->i_lock);
 	if (!NFS_I(inode)->layout) {
 		spin_unlock(&inode->i_lock);
@@ -2296,6 +2300,9 @@ out_free:
 	kfree(data);
 out_put:
 	pnfs_put_layout_hdr(hdr);
+	smp_mb__before_atomic();
+	clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
+	smp_mb__after_atomic();
 	goto out;
 }
 EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b95f914ce083..f91b5ade30c9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -219,6 +219,7 @@ struct nfs_inode {
 #define NFS_INO_COMMIT		(7)		/* inode is committing unstable writes */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
+#define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
-- 
cgit v1.2.3


From c181fb3e723351e2f7a1f76b6c0627a4b8ad1723 Mon Sep 17 00:00:00 2001
From: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Date: Mon, 22 Jun 2015 22:38:53 +0200
Subject: ACPI / OF: Rename of_node() and acpi_node() to to_of_node() and
 to_acpi_node()

Commit 8a0662d9 introduced of_node and acpi_node symbols in global namespace
but there were already ~63 of_node local variables or function parameters
(no single acpi_node though, but anyway).

After debugging undefined but used of_node local varible (which turned out
to reference static function of_node() instead) it became clear that the names
for the functions are too short and too generic for global scope.

Signed-off-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 26 +++++++++++++-------------
 drivers/gpio/gpiolib.c   |  4 ++--
 drivers/leds/leds-gpio.c |  2 +-
 include/acpi/acpi_bus.h  |  2 +-
 include/linux/acpi.h     |  4 ++--
 include/linux/of.h       |  4 ++--
 6 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 1d0b116cae95..dfd4de69b67b 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -128,9 +128,9 @@ EXPORT_SYMBOL_GPL(device_property_present);
 bool fwnode_property_present(struct fwnode_handle *fwnode, const char *propname)
 {
 	if (is_of_node(fwnode))
-		return of_property_read_bool(of_node(fwnode), propname);
+		return of_property_read_bool(to_of_node(fwnode), propname);
 	else if (is_acpi_node(fwnode))
-		return !acpi_dev_prop_get(acpi_node(fwnode), propname, NULL);
+		return !acpi_dev_prop_get(to_acpi_node(fwnode), propname, NULL);
 
 	return !!pset_prop_get(to_pset(fwnode), propname);
 }
@@ -285,10 +285,10 @@ EXPORT_SYMBOL_GPL(device_property_read_string);
 ({ \
 	int _ret_; \
 	if (is_of_node(_fwnode_)) \
-		_ret_ = OF_DEV_PROP_READ_ARRAY(of_node(_fwnode_), _propname_, \
+		_ret_ = OF_DEV_PROP_READ_ARRAY(to_of_node(_fwnode_), _propname_, \
 					       _type_, _val_, _nval_); \
 	else if (is_acpi_node(_fwnode_)) \
-		_ret_ = acpi_dev_prop_read(acpi_node(_fwnode_), _propname_, \
+		_ret_ = acpi_dev_prop_read(to_acpi_node(_fwnode_), _propname_, \
 					   _proptype_, _val_, _nval_); \
 	else \
 		_ret_ = pset_prop_read_array(to_pset(_fwnode_), _propname_, \
@@ -424,11 +424,11 @@ int fwnode_property_read_string_array(struct fwnode_handle *fwnode,
 {
 	if (is_of_node(fwnode))
 		return val ?
-			of_property_read_string_array(of_node(fwnode), propname,
-						      val, nval) :
-			of_property_count_strings(of_node(fwnode), propname);
+			of_property_read_string_array(to_of_node(fwnode),
+						      propname, val, nval) :
+			of_property_count_strings(to_of_node(fwnode), propname);
 	else if (is_acpi_node(fwnode))
-		return acpi_dev_prop_read(acpi_node(fwnode), propname,
+		return acpi_dev_prop_read(to_acpi_node(fwnode), propname,
 					  DEV_PROP_STRING, val, nval);
 
 	return pset_prop_read_array(to_pset(fwnode), propname,
@@ -455,9 +455,9 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode,
 				const char *propname, const char **val)
 {
 	if (is_of_node(fwnode))
-		return of_property_read_string(of_node(fwnode), propname, val);
+		return of_property_read_string(to_of_node(fwnode), propname, val);
 	else if (is_acpi_node(fwnode))
-		return acpi_dev_prop_read(acpi_node(fwnode), propname,
+		return acpi_dev_prop_read(to_acpi_node(fwnode), propname,
 					  DEV_PROP_STRING, val, 1);
 
 	return -ENXIO;
@@ -475,13 +475,13 @@ struct fwnode_handle *device_get_next_child_node(struct device *dev,
 	if (IS_ENABLED(CONFIG_OF) && dev->of_node) {
 		struct device_node *node;
 
-		node = of_get_next_available_child(dev->of_node, of_node(child));
+		node = of_get_next_available_child(dev->of_node, to_of_node(child));
 		if (node)
 			return &node->fwnode;
 	} else if (IS_ENABLED(CONFIG_ACPI)) {
 		struct acpi_device *node;
 
-		node = acpi_get_next_child(dev, acpi_node(child));
+		node = acpi_get_next_child(dev, to_acpi_node(child));
 		if (node)
 			return acpi_fwnode_handle(node);
 	}
@@ -500,7 +500,7 @@ EXPORT_SYMBOL_GPL(device_get_next_child_node);
 void fwnode_handle_put(struct fwnode_handle *fwnode)
 {
 	if (is_of_node(fwnode))
-		of_node_put(of_node(fwnode));
+		of_node_put(to_of_node(fwnode));
 }
 EXPORT_SYMBOL_GPL(fwnode_handle_put);
 
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 6bc612b8a49f..5d8b2b35e2a2 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2040,14 +2040,14 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode,
 	if (is_of_node(fwnode)) {
 		enum of_gpio_flags flags;
 
-		desc = of_get_named_gpiod_flags(of_node(fwnode), propname, 0,
+		desc = of_get_named_gpiod_flags(to_of_node(fwnode), propname, 0,
 						&flags);
 		if (!IS_ERR(desc))
 			active_low = flags & OF_GPIO_ACTIVE_LOW;
 	} else if (is_acpi_node(fwnode)) {
 		struct acpi_gpio_info info;
 
-		desc = acpi_get_gpiod_by_index(acpi_node(fwnode), propname, 0,
+		desc = acpi_get_gpiod_by_index(to_acpi_node(fwnode), propname, 0,
 					       &info);
 		if (!IS_ERR(desc))
 			active_low = info.active_low;
diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c
index 15eb3f86f670..d2d54d62afee 100644
--- a/drivers/leds/leds-gpio.c
+++ b/drivers/leds/leds-gpio.c
@@ -191,7 +191,7 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev)
 			goto err;
 		}
 
-		np = of_node(child);
+		np = to_of_node(child);
 
 		if (fwnode_property_present(child, "label")) {
 			fwnode_property_read_string(child, "label", &led.name);
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 8de4fa90e8c4..1224be8184fc 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -385,7 +385,7 @@ static inline bool is_acpi_node(struct fwnode_handle *fwnode)
 	return fwnode && fwnode->type == FWNODE_ACPI;
 }
 
-static inline struct acpi_device *acpi_node(struct fwnode_handle *fwnode)
+static inline struct acpi_device *to_acpi_node(struct fwnode_handle *fwnode)
 {
 	return is_acpi_node(fwnode) ?
 		container_of(fwnode, struct acpi_device, fwnode) : NULL;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e35e29c..ec3c98ed9dca 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -53,7 +53,7 @@ static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
 	return adev ? adev->handle : NULL;
 }
 
-#define ACPI_COMPANION(dev)		acpi_node((dev)->fwnode)
+#define ACPI_COMPANION(dev)		to_acpi_node((dev)->fwnode)
 #define ACPI_COMPANION_SET(dev, adev)	set_primary_fwnode(dev, (adev) ? \
 	acpi_fwnode_handle(adev) : NULL)
 #define ACPI_HANDLE(dev)		acpi_device_handle(ACPI_COMPANION(dev))
@@ -473,7 +473,7 @@ static inline bool is_acpi_node(struct fwnode_handle *fwnode)
 	return false;
 }
 
-static inline struct acpi_device *acpi_node(struct fwnode_handle *fwnode)
+static inline struct acpi_device *to_acpi_node(struct fwnode_handle *fwnode)
 {
 	return NULL;
 }
diff --git a/include/linux/of.h b/include/linux/of.h
index b871ff9d81d7..f05fdcea4e66 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -128,7 +128,7 @@ static inline bool is_of_node(struct fwnode_handle *fwnode)
 	return fwnode && fwnode->type == FWNODE_OF;
 }
 
-static inline struct device_node *of_node(struct fwnode_handle *fwnode)
+static inline struct device_node *to_of_node(struct fwnode_handle *fwnode)
 {
 	return fwnode ? container_of(fwnode, struct device_node, fwnode) : NULL;
 }
@@ -387,7 +387,7 @@ static inline bool is_of_node(struct fwnode_handle *fwnode)
 	return false;
 }
 
-static inline struct device_node *of_node(struct fwnode_handle *fwnode)
+static inline struct device_node *to_of_node(struct fwnode_handle *fwnode)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 9200025724619d83f9fc366281f0bde36afe6e5a Mon Sep 17 00:00:00 2001
From: Xunlei Pang <pang.xunlei@linaro.org>
Date: Fri, 12 Jun 2015 10:04:10 +0800
Subject: rtc: Introduce rtc_tm_sub() helper function

There're many sites need comparing the two rtc_time variants for many
rtc drivers, especially in the instances of rtc_class_ops::set_alarm().

So add this common helper function to make things easy.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 include/linux/rtc.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index 587017e7939c..b36160321458 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -24,6 +24,14 @@ extern void rtc_time64_to_tm(time64_t time, struct rtc_time *tm);
 ktime_t rtc_tm_to_ktime(struct rtc_time tm);
 struct rtc_time rtc_ktime_to_tm(ktime_t kt);
 
+/*
+ * rtc_tm_sub - Return the difference in seconds.
+ */
+static inline time64_t rtc_tm_sub(struct rtc_time *lhs, struct rtc_time *rhs)
+{
+	return rtc_tm_to_time64(lhs) - rtc_tm_to_time64(rhs);
+}
+
 /**
  * Deprecated. Use rtc_time64_to_tm().
  */
-- 
cgit v1.2.3


From c86a6c28957a9e8e9a71582a32e96971ad411ffe Mon Sep 17 00:00:00 2001
From: Xunlei Pang <pang.xunlei@linaro.org>
Date: Fri, 12 Jun 2015 11:10:18 +0800
Subject: rtc: interface: Remove rtc_set_mmss()

Now rtc_set_mmss() has no users, just remove it.

We still have rtc_set_time() doing similar things.

Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 drivers/rtc/interface.c | 45 ---------------------------------------------
 include/linux/rtc.h     |  1 -
 2 files changed, 46 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index a6b14c24d7e3..11b639067312 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -91,51 +91,6 @@ int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm)
 }
 EXPORT_SYMBOL_GPL(rtc_set_time);
 
-int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs)
-{
-	int err;
-
-	err = mutex_lock_interruptible(&rtc->ops_lock);
-	if (err)
-		return err;
-
-	if (!rtc->ops)
-		err = -ENODEV;
-	else if (rtc->ops->set_mmss64)
-		err = rtc->ops->set_mmss64(rtc->dev.parent, secs);
-	else if (rtc->ops->set_mmss)
-		err = rtc->ops->set_mmss(rtc->dev.parent, secs);
-	else if (rtc->ops->read_time && rtc->ops->set_time) {
-		struct rtc_time new, old;
-
-		err = rtc->ops->read_time(rtc->dev.parent, &old);
-		if (err == 0) {
-			rtc_time64_to_tm(secs, &new);
-
-			/*
-			 * avoid writing when we're going to change the day of
-			 * the month. We will retry in the next minute. This
-			 * basically means that if the RTC must not drift
-			 * by more than 1 minute in 11 minutes.
-			 */
-			if (!((old.tm_hour == 23 && old.tm_min == 59) ||
-				(new.tm_hour == 23 && new.tm_min == 59)))
-				err = rtc->ops->set_time(rtc->dev.parent,
-						&new);
-		}
-	} else {
-		err = -EINVAL;
-	}
-
-	pm_stay_awake(rtc->dev.parent);
-	mutex_unlock(&rtc->ops_lock);
-	/* A timer might have just expired */
-	schedule_work(&rtc->irqwork);
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(rtc_set_mmss);
-
 static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm)
 {
 	int err;
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index b36160321458..3359f0422c6b 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -168,7 +168,6 @@ extern void devm_rtc_device_unregister(struct device *dev,
 
 extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm);
 extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm);
-extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs);
 extern int rtc_set_ntp_time(struct timespec64 now);
 int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm);
 extern int rtc_read_alarm(struct rtc_device *rtc,
-- 
cgit v1.2.3


From c3cddc4c296c3a1498df7181015cbcea178a0546 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Wed, 24 Jun 2015 16:54:45 -0700
Subject: fsnotify: remove obsolete documentation

should_send_event is no longer part of struct fsnotify_ops, so remove it.

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fsnotify_backend.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0f313f93c586..65a517dd32f7 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -84,8 +84,6 @@ struct fsnotify_fname;
  * Each group much define these ops.  The fsnotify infrastructure will call
  * these operations for each relevant group.
  *
- * should_send_event - given a group, inode, and mask this function determines
- *		if the group is interested in this event.
  * handle_event - main call for a group to handle an fs event
  * free_group_priv - called when a group refcnt hits 0 to clean up the private union
  * freeing_mark - called when a mark is being destroyed for some reason.  The group
-- 
cgit v1.2.3


From 5286d20c4eb7b0c29217f8756652609df74f5489 Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Wed, 24 Jun 2015 16:54:51 -0700
Subject: configfs: unexport/make static config_item_init()

config_item_init() is only used in item.c

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/configfs/item.c       | 3 +--
 include/linux/configfs.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e65f9ffbb999..4d6a30e76168 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref);
  *	config_item_init - initialize item.
  *	@item:	item in question.
  */
-void config_item_init(struct config_item *item)
+static void config_item_init(struct config_item *item)
 {
 	kref_init(&item->ci_kref);
 	INIT_LIST_HEAD(&item->ci_entry);
 }
-EXPORT_SYMBOL(config_item_init);
 
 /**
  *	config_item_set_name - Set the name of an item
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index 34025df61829..c9e5c57e4edf 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -71,7 +71,6 @@ static inline char *config_item_name(struct config_item * item)
 	return item->ci_name;
 }
 
-extern void config_item_init(struct config_item *);
 extern void config_item_init_type_name(struct config_item *item,
 				       const char *name,
 				       struct config_item_type *type);
-- 
cgit v1.2.3


From b5242e98c1cb834feb1e84026f09a4796b49eb4d Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@ezchip.com>
Date: Wed, 24 Jun 2015 16:55:42 -0700
Subject: smpboot: allow excluding cpus from the smpboot threads

This patch series allows the watchdog to run by default only on the
housekeeping cores when nohz_full is in effect; this seems to be a good
compromise short of turning it off completely (since the nohz_full cores
can't tolerate a watchdog).

To provide customizability, we add /proc/sys/kernel/watchdog_cpumask so
that the set of cores running the watchdog can be tuned to different
values after bootup.

To implement this customizability, we add a new
smpboot_update_cpumask_percpu_thread() API to the smpboot_thread
subsystem that lets us park or unpark "unwanted" threads.

And now that threads can be parked for long periods of time, we tweak the
/proc/<pid>/stat and /proc/<pid>/status code so parked threads aren't
reported as running, which is otherwise confusing.

This patch (of 3):

This change allows some cores to be excluded from running the
smp_hotplug_thread tasks.  The following commit to update
kernel/watchdog.c to use this functionality is the motivating example, and
more information on the motivation is provided there.

A new smp_hotplug_thread field is introduced, "cpumask", which is cpumask
field managed by the smpboot subsystem that indicates whether or not the
given smp_hotplug_thread should run on that core; the cpumask is checked
when deciding whether to unpark the thread.

To limit the cpumask to less than cpu_possible, you must call
smpboot_update_cpumask_percpu_thread() after registering.

Signed-off-by: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/smpboot.h |  5 +++++
 kernel/smpboot.c        | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index d600afb21926..da3c593f9845 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -27,6 +27,8 @@ struct smpboot_thread_data;
  * @pre_unpark:		Optional unpark function, called before the thread is
  *			unparked (cpu online). This is not guaranteed to be
  *			called on the target cpu of the thread. Careful!
+ * @cpumask:		Internal state.  To update which threads are unparked,
+ *			call smpboot_update_cpumask_percpu_thread().
  * @selfparking:	Thread is not parked by the park function.
  * @thread_comm:	The base name of the thread
  */
@@ -41,11 +43,14 @@ struct smp_hotplug_thread {
 	void				(*park)(unsigned int cpu);
 	void				(*unpark)(unsigned int cpu);
 	void				(*pre_unpark)(unsigned int cpu);
+	cpumask_var_t			cpumask;
 	bool				selfparking;
 	const char			*thread_comm;
 };
 
 int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+					 const struct cpumask *);
 
 #endif
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index c697f73d82d6..5e46c2a75d59 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -232,7 +232,8 @@ void smpboot_unpark_threads(unsigned int cpu)
 
 	mutex_lock(&smpboot_threads_lock);
 	list_for_each_entry(cur, &hotplug_threads, list)
-		smpboot_unpark_thread(cur, cpu);
+		if (cpumask_test_cpu(cpu, cur->cpumask))
+			smpboot_unpark_thread(cur, cpu);
 	mutex_unlock(&smpboot_threads_lock);
 }
 
@@ -258,6 +259,15 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
 {
 	unsigned int cpu;
 
+	/* Unpark any threads that were voluntarily parked. */
+	for_each_cpu_not(cpu, ht->cpumask) {
+		if (cpu_online(cpu)) {
+			struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+			if (tsk)
+				kthread_unpark(tsk);
+		}
+	}
+
 	/* We need to destroy also the parked threads of offline cpus */
 	for_each_possible_cpu(cpu) {
 		struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,6 +291,10 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 	unsigned int cpu;
 	int ret = 0;
 
+	if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+
 	get_online_cpus();
 	mutex_lock(&smpboot_threads_lock);
 	for_each_online_cpu(cpu) {
@@ -313,9 +327,52 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
 	smpboot_destroy_threads(plug_thread);
 	mutex_unlock(&smpboot_threads_lock);
 	put_online_cpus();
+	free_cpumask_var(plug_thread->cpumask);
 }
 EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
 
+/**
+ * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
+ * @plug_thread:	Hotplug thread descriptor
+ * @new:		Revised mask to use
+ *
+ * The cpumask field in the smp_hotplug_thread must not be updated directly
+ * by the client, but only by calling this function.
+ */
+int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+					 const struct cpumask *new)
+{
+	struct cpumask *old = plug_thread->cpumask;
+	cpumask_var_t tmp;
+	unsigned int cpu;
+
+	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
+		return -ENOMEM;
+
+	get_online_cpus();
+	mutex_lock(&smpboot_threads_lock);
+
+	/* Park threads that were exclusively enabled on the old mask. */
+	cpumask_andnot(tmp, old, new);
+	for_each_cpu_and(cpu, tmp, cpu_online_mask)
+		smpboot_park_thread(plug_thread, cpu);
+
+	/* Unpark threads that are exclusively enabled on the new mask. */
+	cpumask_andnot(tmp, new, old);
+	for_each_cpu_and(cpu, tmp, cpu_online_mask)
+		smpboot_unpark_thread(plug_thread, cpu);
+
+	cpumask_copy(old, new);
+
+	mutex_unlock(&smpboot_threads_lock);
+	put_online_cpus();
+
+	free_cpumask_var(tmp);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
+
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
 
 /*
-- 
cgit v1.2.3


From fe4ba3c34352b7e8068b7f18eb233444aed17011 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@ezchip.com>
Date: Wed, 24 Jun 2015 16:55:45 -0700
Subject: watchdog: add watchdog_cpumask sysctl to assist nohz

Change the default behavior of watchdog so it only runs on the
housekeeping cores when nohz_full is enabled at build and boot time.
Allow modifying the set of cores the watchdog is currently running on
with a new kernel.watchdog_cpumask sysctl.

In the current system, the watchdog subsystem runs a periodic timer that
schedules the watchdog kthread to run.  However, nohz_full cores are
designed to allow userspace application code running on those cores to
have 100% access to the CPU.  So the watchdog system prevents the
nohz_full application code from being able to run the way it wants to,
thus the motivation to suppress the watchdog on nohz_full cores, which
this patchset provides by default.

However, if we disable the watchdog globally, then the housekeeping
cores can't benefit from the watchdog functionality.  So we allow
disabling it only on some cores.  See Documentation/lockup-watchdogs.txt
for more information.

[jhubbard@nvidia.com: fix a watchdog crash in some configurations]
Signed-off-by: Chris Metcalf <cmetcalf@ezchip.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/lockup-watchdogs.txt | 18 ++++++++++
 Documentation/sysctl/kernel.txt    | 21 ++++++++++++
 include/linux/nmi.h                |  3 ++
 kernel/smpboot.c                   |  1 +
 kernel/sysctl.c                    |  7 ++++
 kernel/watchdog.c                  | 67 +++++++++++++++++++++++++++++++++++---
 6 files changed, 112 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt
index ab0baa692c13..22dd6af2e4bd 100644
--- a/Documentation/lockup-watchdogs.txt
+++ b/Documentation/lockup-watchdogs.txt
@@ -61,3 +61,21 @@ As explained above, a kernel knob is provided that allows
 administrators to configure the period of the hrtimer and the perf
 event. The right value for a particular environment is a trade-off
 between fast response to lockups and detection overhead.
+
+By default, the watchdog runs on all online cores.  However, on a
+kernel configured with NO_HZ_FULL, by default the watchdog runs only
+on the housekeeping cores, not the cores specified in the "nohz_full"
+boot argument.  If we allowed the watchdog to run by default on
+the "nohz_full" cores, we would have to run timer ticks to activate
+the scheduler, which would prevent the "nohz_full" functionality
+from protecting the user code on those cores from the kernel.
+Of course, disabling it by default on the nohz_full cores means that
+when those cores do enter the kernel, by default we will not be
+able to detect if they lock up.  However, allowing the watchdog
+to continue to run on the housekeeping (non-tickless) cores means
+that we will continue to detect lockups properly on those cores.
+
+In either case, the set of cores excluded from running the watchdog
+may be adjusted via the kernel.watchdog_cpumask sysctl.  For
+nohz_full cores, this may be useful for debugging a case where the
+kernel seems to be hanging on the nohz_full cores.
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index c831001c45f1..e5d528e0c46e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -923,6 +923,27 @@ and nmi_watchdog.
 
 ==============================================================
 
+watchdog_cpumask:
+
+This value can be used to control on which cpus the watchdog may run.
+The default cpumask is all possible cores, but if NO_HZ_FULL is
+enabled in the kernel config, and cores are specified with the
+nohz_full= boot argument, those cores are excluded by default.
+Offline cores can be included in this mask, and if the core is later
+brought online, the watchdog will be started based on the mask value.
+
+Typically this value would only be touched in the nohz_full case
+to re-enable cores that by default were not running the watchdog,
+if a kernel lockup was suspected on those cores.
+
+The argument value is the standard cpulist format for cpumasks,
+so for example to enable the watchdog on cores 0, 2, 3, and 4 you
+might say:
+
+  echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
+
+==============================================================
+
 watchdog_thresh:
 
 This value can be used to control the frequency of hrtimer and NMI
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 3d46fb4708e0..f94da0e65dea 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -67,6 +67,7 @@ extern int nmi_watchdog_enabled;
 extern int soft_watchdog_enabled;
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
+extern unsigned long *watchdog_cpumask_bits;
 extern int sysctl_softlockup_all_cpu_backtrace;
 struct ctl_table;
 extern int proc_watchdog(struct ctl_table *, int ,
@@ -77,6 +78,8 @@ extern int proc_soft_watchdog(struct ctl_table *, int ,
 			      void __user *, size_t *, loff_t *);
 extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
+extern int proc_watchdog_cpumask(struct ctl_table *, int,
+				 void __user *, size_t *, loff_t *);
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 5e46c2a75d59..7c434c39f02a 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -338,6 +338,7 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
  *
  * The cpumask field in the smp_hotplug_thread must not be updated directly
  * by the client, but only by calling this function.
+ * This function can only be called on a registered smp_hotplug_thread.
  */
 int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
 					 const struct cpumask *new)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b13e9d2de302..812fcc3fd390 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -871,6 +871,13 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "watchdog_cpumask",
+		.data		= &watchdog_cpumask_bits,
+		.maxlen		= NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= proc_watchdog_cpumask,
+	},
 	{
 		.procname	= "softlockup_panic",
 		.data		= &softlockup_panic,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 581a68a04c64..a6ffa43f2993 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,6 +19,7 @@
 #include <linux/sysctl.h>
 #include <linux/smpboot.h>
 #include <linux/sched/rt.h>
+#include <linux/tick.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
@@ -58,6 +59,12 @@ int __read_mostly sysctl_softlockup_all_cpu_backtrace;
 #else
 #define sysctl_softlockup_all_cpu_backtrace 0
 #endif
+static struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
 static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
@@ -207,7 +214,7 @@ void touch_all_softlockup_watchdogs(void)
 	 * do we care if a 0 races with a timestamp?
 	 * all it means is the softlock check starts one cycle later
 	 */
-	for_each_online_cpu(cpu)
+	for_each_watchdog_cpu(cpu)
 		per_cpu(watchdog_touch_ts, cpu) = 0;
 }
 
@@ -616,7 +623,7 @@ void watchdog_nmi_enable_all(void)
 		goto unlock;
 
 	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_watchdog_cpu(cpu)
 		watchdog_nmi_enable(cpu);
 	put_online_cpus();
 
@@ -634,7 +641,7 @@ void watchdog_nmi_disable_all(void)
 		goto unlock;
 
 	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_watchdog_cpu(cpu)
 		watchdog_nmi_disable(cpu);
 	put_online_cpus();
 
@@ -696,7 +703,7 @@ static void update_watchdog_all_cpus(void)
 	int cpu;
 
 	get_online_cpus();
-	for_each_online_cpu(cpu)
+	for_each_watchdog_cpu(cpu)
 		update_watchdog(cpu);
 	put_online_cpus();
 }
@@ -709,8 +716,12 @@ static int watchdog_enable_all_cpus(void)
 		err = smpboot_register_percpu_thread(&watchdog_threads);
 		if (err)
 			pr_err("Failed to create watchdog threads, disabled\n");
-		else
+		else {
+			if (smpboot_update_cpumask_percpu_thread(
+				    &watchdog_threads, &watchdog_cpumask))
+				pr_err("Failed to set cpumask for watchdog threads\n");
 			watchdog_running = 1;
+		}
 	} else {
 		/*
 		 * Enable/disable the lockup detectors or
@@ -879,12 +890,58 @@ out:
 	mutex_unlock(&watchdog_proc_mutex);
 	return err;
 }
+
+/*
+ * The cpumask is the mask of possible cpus that the watchdog can run
+ * on, not the mask of cpus it is actually running on.  This allows the
+ * user to specify a mask that will include cpus that have not yet
+ * been brought online, if desired.
+ */
+int proc_watchdog_cpumask(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int err;
+
+	mutex_lock(&watchdog_proc_mutex);
+	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+	if (!err && write) {
+		/* Remove impossible cpus to keep sysctl output cleaner. */
+		cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
+			    cpu_possible_mask);
+
+		if (watchdog_running) {
+			/*
+			 * Failure would be due to being unable to allocate
+			 * a temporary cpumask, so we are likely not in a
+			 * position to do much else to make things better.
+			 */
+			if (smpboot_update_cpumask_percpu_thread(
+				    &watchdog_threads, &watchdog_cpumask) != 0)
+				pr_err("cpumask update failed\n");
+		}
+	}
+	mutex_unlock(&watchdog_proc_mutex);
+	return err;
+}
+
 #endif /* CONFIG_SYSCTL */
 
 void __init lockup_detector_init(void)
 {
 	set_sample_period();
 
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled()) {
+		if (!cpumask_empty(tick_nohz_full_mask))
+			pr_info("Disabling watchdog on nohz_full cores by default\n");
+		cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
+			       tick_nohz_full_mask);
+	} else
+		cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#else
+	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
+#endif
+
 	if (watchdog_enabled)
 		watchdog_enable_all_cpus();
 }
-- 
cgit v1.2.3


From 4066c33d0308f87e9a3b0c7fafb9141c0bfbfa77 Mon Sep 17 00:00:00 2001
From: Gavin Guo <gavin.guo@canonical.com>
Date: Wed, 24 Jun 2015 16:55:54 -0700
Subject: mm/slab_common: support the slub_debug boot option on specific object
 size

The slub_debug=PU,kmalloc-xx cannot work because in the
create_kmalloc_caches() the s->name is created after the
create_kmalloc_cache() is called.  The name is NULL in the
create_kmalloc_cache() so the kmem_cache_flags() would not set the
slub_debug flags to the s->flags.  The fix here set up a kmalloc_names
string array for the initialization purpose and delete the dynamic name
creation of kmalloc_caches.

[akpm@linux-foundation.org: s/kmalloc_names/kmalloc_info/, tweak comment text]
Signed-off-by: Gavin Guo <gavin.guo@canonical.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 22 +++++++++++++++++++
 mm/slab_common.c     | 62 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 61 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index ffd24c830151..96f0ea506b5c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -153,8 +153,30 @@ size_t ksize(const void *);
 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
 #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
+/*
+ * The KMALLOC_LOOP_LOW is the definition for the for loop index start number
+ * to create the kmalloc_caches object in create_kmalloc_caches(). The first
+ * and the second are 96 and 192. You can see that in the kmalloc_index(), if
+ * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64,
+ * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't
+ * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW.
+ */
+#if KMALLOC_MIN_SIZE <= 32
+#define KMALLOC_LOOP_LOW 1
+#elif KMALLOC_MIN_SIZE <= 64
+#define KMALLOC_LOOP_LOW 2
+#else
+#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW
+#endif
+
 #else
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
+/*
+ * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used.
+ * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be
+ * initialized.
+ */
+#define KMALLOC_LOOP_LOW 1
 #endif
 
 /*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 999bb3424d44..84e14582a14c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -783,6 +783,31 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
 	return kmalloc_caches[index];
 }
 
+/*
+ * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
+ * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
+ * kmalloc-67108864.
+ */
+static struct {
+	const char *name;
+	unsigned long size;
+} const kmalloc_info[] __initconst = {
+	{NULL,                      0},		{"kmalloc-96",             96},
+	{"kmalloc-192",           192},		{"kmalloc-8",               8},
+	{"kmalloc-16",             16},		{"kmalloc-32",             32},
+	{"kmalloc-64",             64},		{"kmalloc-128",           128},
+	{"kmalloc-256",           256},		{"kmalloc-512",           512},
+	{"kmalloc-1024",         1024},		{"kmalloc-2048",         2048},
+	{"kmalloc-4096",         4096},		{"kmalloc-8192",         8192},
+	{"kmalloc-16384",       16384},		{"kmalloc-32768",       32768},
+	{"kmalloc-65536",       65536},		{"kmalloc-131072",     131072},
+	{"kmalloc-262144",     262144},		{"kmalloc-524288",     524288},
+	{"kmalloc-1048576",   1048576},		{"kmalloc-2097152",   2097152},
+	{"kmalloc-4194304",   4194304},		{"kmalloc-8388608",   8388608},
+	{"kmalloc-16777216", 16777216},		{"kmalloc-33554432", 33554432},
+	{"kmalloc-67108864", 67108864}
+};
+
 /*
  * Create the kmalloc array. Some of the regular kmalloc arrays
  * may already have been created because they were needed to
@@ -833,39 +858,30 @@ void __init create_kmalloc_caches(unsigned long flags)
 		for (i = 128 + 8; i <= 192; i += 8)
 			size_index[size_index_elem(i)] = 8;
 	}
-	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+	for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
 		if (!kmalloc_caches[i]) {
-			kmalloc_caches[i] = create_kmalloc_cache(NULL,
-							1 << i, flags);
+			kmalloc_caches[i] = create_kmalloc_cache(
+						kmalloc_info[i].name,
+						kmalloc_info[i].size,
+						flags);
 		}
 
 		/*
-		 * Caches that are not of the two-to-the-power-of size.
-		 * These have to be created immediately after the
-		 * earlier power of two caches
+		 * "i == 2" is the "kmalloc-192" case which is the last special
+		 * case for initialization and it's the point to jump to
+		 * allocate the minimize size of the object. In slab allocator,
+		 * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4
+		 * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is
+		 * defined, it may be larger than 2^5 and here is also the
+		 * trick to skip the empty gap.
 		 */
-		if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
-			kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
-
-		if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
-			kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
+		if (i == 2)
+			i = (KMALLOC_SHIFT_LOW - 1);
 	}
 
 	/* Kmalloc array is now usable */
 	slab_state = UP;
 
-	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
-		struct kmem_cache *s = kmalloc_caches[i];
-		char *n;
-
-		if (s) {
-			n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
-
-			BUG_ON(!n);
-			s->name = n;
-		}
-	}
-
 #ifdef CONFIG_ZONE_DMA
 	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
 		struct kmem_cache *s = kmalloc_caches[i];
-- 
cgit v1.2.3


From 1ed58b6051b67e5cfe9e465fb60bf7d5f55e0a64 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 24 Jun 2015 16:55:59 -0700
Subject: linux/slab.h: fix three off-by-one typos in comment

The first is a keyboard-off-by-one, the other two the ordinary mathy kind.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 96f0ea506b5c..9de2fdc8b5e4 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -262,8 +262,8 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
  * belongs to.
  * 0 = zero alloc
  * 1 =  65 .. 96 bytes
- * 2 = 120 .. 192 bytes
- * n = 2^(n-1) .. 2^n -1
+ * 2 = 129 .. 192 bytes
+ * n = 2^(n-1)+1 .. 2^n
  */
 static __always_inline int kmalloc_index(size_t size)
 {
-- 
cgit v1.2.3


From 2ae416b142b625c58c9ccb039aa3ef48ad0e9bae Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Wed, 24 Jun 2015 16:56:16 -0700
Subject: mm: new mm hook framework

CRIU is recreating the process memory layout by remapping the checkpointee
memory area on top of the current process (criu).  This includes remapping
the vDSO to the place it has at checkpoint time.

However some architectures like powerpc are keeping a reference to the
vDSO base address to build the signal return stack frame by calling the
vDSO sigreturn service.  So once the vDSO has been moved, this reference
is no more valid and the signal frame built later are not usable.

This patch serie is introducing a new mm hook framework, and a new
arch_remap hook which is called when mremap is done and the mm lock still
hold.  The next patch is adding the vDSO remap and unmap tracking to the
powerpc architecture.

This patch (of 3):

This patch introduces a new set of header file to manage mm hooks:
- per architecture empty header file (arch/x/include/asm/mm-arch-hooks.h)
- a generic header (include/linux/mm-arch-hooks.h)

The architecture which need to overwrite a hook as to redefine it in its
header file, while architecture which doesn't need have nothing to do.

The default hooks are defined in the generic header and are used in the
case the architecture is not defining it.

In a next step, mm hooks defined in include/asm-generic/mm_hooks.h should
be moved here.

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/mm-arch-hooks.h      | 15 +++++++++++++++
 arch/arc/include/asm/mm-arch-hooks.h        | 15 +++++++++++++++
 arch/arm/include/asm/mm-arch-hooks.h        | 15 +++++++++++++++
 arch/arm64/include/asm/mm-arch-hooks.h      | 15 +++++++++++++++
 arch/avr32/include/asm/mm-arch-hooks.h      | 15 +++++++++++++++
 arch/blackfin/include/asm/mm-arch-hooks.h   | 15 +++++++++++++++
 arch/c6x/include/asm/mm-arch-hooks.h        | 15 +++++++++++++++
 arch/cris/include/asm/mm-arch-hooks.h       | 15 +++++++++++++++
 arch/frv/include/asm/mm-arch-hooks.h        | 15 +++++++++++++++
 arch/hexagon/include/asm/mm-arch-hooks.h    | 15 +++++++++++++++
 arch/ia64/include/asm/mm-arch-hooks.h       | 15 +++++++++++++++
 arch/m32r/include/asm/mm-arch-hooks.h       | 15 +++++++++++++++
 arch/m68k/include/asm/mm-arch-hooks.h       | 15 +++++++++++++++
 arch/metag/include/asm/mm-arch-hooks.h      | 15 +++++++++++++++
 arch/microblaze/include/asm/mm-arch-hooks.h | 15 +++++++++++++++
 arch/mips/include/asm/mm-arch-hooks.h       | 15 +++++++++++++++
 arch/mn10300/include/asm/mm-arch-hooks.h    | 15 +++++++++++++++
 arch/nios2/include/asm/mm-arch-hooks.h      | 15 +++++++++++++++
 arch/openrisc/include/asm/mm-arch-hooks.h   | 15 +++++++++++++++
 arch/parisc/include/asm/mm-arch-hooks.h     | 15 +++++++++++++++
 arch/powerpc/include/asm/mm-arch-hooks.h    | 15 +++++++++++++++
 arch/s390/include/asm/mm-arch-hooks.h       | 15 +++++++++++++++
 arch/score/include/asm/mm-arch-hooks.h      | 15 +++++++++++++++
 arch/sh/include/asm/mm-arch-hooks.h         | 15 +++++++++++++++
 arch/sparc/include/asm/mm-arch-hooks.h      | 15 +++++++++++++++
 arch/tile/include/asm/mm-arch-hooks.h       | 15 +++++++++++++++
 arch/um/include/asm/mm-arch-hooks.h         | 15 +++++++++++++++
 arch/unicore32/include/asm/mm-arch-hooks.h  | 15 +++++++++++++++
 arch/x86/include/asm/mm-arch-hooks.h        | 15 +++++++++++++++
 arch/xtensa/include/asm/mm-arch-hooks.h     | 15 +++++++++++++++
 include/linux/mm-arch-hooks.h               | 16 ++++++++++++++++
 31 files changed, 466 insertions(+)
 create mode 100644 arch/alpha/include/asm/mm-arch-hooks.h
 create mode 100644 arch/arc/include/asm/mm-arch-hooks.h
 create mode 100644 arch/arm/include/asm/mm-arch-hooks.h
 create mode 100644 arch/arm64/include/asm/mm-arch-hooks.h
 create mode 100644 arch/avr32/include/asm/mm-arch-hooks.h
 create mode 100644 arch/blackfin/include/asm/mm-arch-hooks.h
 create mode 100644 arch/c6x/include/asm/mm-arch-hooks.h
 create mode 100644 arch/cris/include/asm/mm-arch-hooks.h
 create mode 100644 arch/frv/include/asm/mm-arch-hooks.h
 create mode 100644 arch/hexagon/include/asm/mm-arch-hooks.h
 create mode 100644 arch/ia64/include/asm/mm-arch-hooks.h
 create mode 100644 arch/m32r/include/asm/mm-arch-hooks.h
 create mode 100644 arch/m68k/include/asm/mm-arch-hooks.h
 create mode 100644 arch/metag/include/asm/mm-arch-hooks.h
 create mode 100644 arch/microblaze/include/asm/mm-arch-hooks.h
 create mode 100644 arch/mips/include/asm/mm-arch-hooks.h
 create mode 100644 arch/mn10300/include/asm/mm-arch-hooks.h
 create mode 100644 arch/nios2/include/asm/mm-arch-hooks.h
 create mode 100644 arch/openrisc/include/asm/mm-arch-hooks.h
 create mode 100644 arch/parisc/include/asm/mm-arch-hooks.h
 create mode 100644 arch/powerpc/include/asm/mm-arch-hooks.h
 create mode 100644 arch/s390/include/asm/mm-arch-hooks.h
 create mode 100644 arch/score/include/asm/mm-arch-hooks.h
 create mode 100644 arch/sh/include/asm/mm-arch-hooks.h
 create mode 100644 arch/sparc/include/asm/mm-arch-hooks.h
 create mode 100644 arch/tile/include/asm/mm-arch-hooks.h
 create mode 100644 arch/um/include/asm/mm-arch-hooks.h
 create mode 100644 arch/unicore32/include/asm/mm-arch-hooks.h
 create mode 100644 arch/x86/include/asm/mm-arch-hooks.h
 create mode 100644 arch/xtensa/include/asm/mm-arch-hooks.h
 create mode 100644 include/linux/mm-arch-hooks.h

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/mm-arch-hooks.h b/arch/alpha/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b07fd862fec3
--- /dev/null
+++ b/arch/alpha/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ALPHA_MM_ARCH_HOOKS_H
+#define _ASM_ALPHA_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ALPHA_MM_ARCH_HOOKS_H */
diff --git a/arch/arc/include/asm/mm-arch-hooks.h b/arch/arc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..c37541c5f8ba
--- /dev/null
+++ b/arch/arc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARC_MM_ARCH_HOOKS_H
+#define _ASM_ARC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARC_MM_ARCH_HOOKS_H */
diff --git a/arch/arm/include/asm/mm-arch-hooks.h b/arch/arm/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..7056660c7cc4
--- /dev/null
+++ b/arch/arm/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARM_MM_ARCH_HOOKS_H
+#define _ASM_ARM_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARM_MM_ARCH_HOOKS_H */
diff --git a/arch/arm64/include/asm/mm-arch-hooks.h b/arch/arm64/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..562b655f5ba9
--- /dev/null
+++ b/arch/arm64/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_ARM64_MM_ARCH_HOOKS_H
+#define _ASM_ARM64_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_ARM64_MM_ARCH_HOOKS_H */
diff --git a/arch/avr32/include/asm/mm-arch-hooks.h b/arch/avr32/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..145452ffbdad
--- /dev/null
+++ b/arch/avr32/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_AVR32_MM_ARCH_HOOKS_H
+#define _ASM_AVR32_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_AVR32_MM_ARCH_HOOKS_H */
diff --git a/arch/blackfin/include/asm/mm-arch-hooks.h b/arch/blackfin/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..1c5211ec338f
--- /dev/null
+++ b/arch/blackfin/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_BLACKFIN_MM_ARCH_HOOKS_H
+#define _ASM_BLACKFIN_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_BLACKFIN_MM_ARCH_HOOKS_H */
diff --git a/arch/c6x/include/asm/mm-arch-hooks.h b/arch/c6x/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..bb3c4a6ce8e9
--- /dev/null
+++ b/arch/c6x/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_C6X_MM_ARCH_HOOKS_H
+#define _ASM_C6X_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_C6X_MM_ARCH_HOOKS_H */
diff --git a/arch/cris/include/asm/mm-arch-hooks.h b/arch/cris/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..314f774db2b0
--- /dev/null
+++ b/arch/cris/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_CRIS_MM_ARCH_HOOKS_H
+#define _ASM_CRIS_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_CRIS_MM_ARCH_HOOKS_H */
diff --git a/arch/frv/include/asm/mm-arch-hooks.h b/arch/frv/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..51d13a870404
--- /dev/null
+++ b/arch/frv/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_FRV_MM_ARCH_HOOKS_H
+#define _ASM_FRV_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_FRV_MM_ARCH_HOOKS_H */
diff --git a/arch/hexagon/include/asm/mm-arch-hooks.h b/arch/hexagon/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..05e8b939e416
--- /dev/null
+++ b/arch/hexagon/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_HEXAGON_MM_ARCH_HOOKS_H
+#define _ASM_HEXAGON_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_HEXAGON_MM_ARCH_HOOKS_H */
diff --git a/arch/ia64/include/asm/mm-arch-hooks.h b/arch/ia64/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..ab4b5c698322
--- /dev/null
+++ b/arch/ia64/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_IA64_MM_ARCH_HOOKS_H
+#define _ASM_IA64_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_IA64_MM_ARCH_HOOKS_H */
diff --git a/arch/m32r/include/asm/mm-arch-hooks.h b/arch/m32r/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..6d60b4750f41
--- /dev/null
+++ b/arch/m32r/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_M32R_MM_ARCH_HOOKS_H
+#define _ASM_M32R_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_M32R_MM_ARCH_HOOKS_H */
diff --git a/arch/m68k/include/asm/mm-arch-hooks.h b/arch/m68k/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..7e8709bc90ae
--- /dev/null
+++ b/arch/m68k/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_M68K_MM_ARCH_HOOKS_H
+#define _ASM_M68K_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_M68K_MM_ARCH_HOOKS_H */
diff --git a/arch/metag/include/asm/mm-arch-hooks.h b/arch/metag/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b0072b2eb0de
--- /dev/null
+++ b/arch/metag/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_METAG_MM_ARCH_HOOKS_H
+#define _ASM_METAG_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_METAG_MM_ARCH_HOOKS_H */
diff --git a/arch/microblaze/include/asm/mm-arch-hooks.h b/arch/microblaze/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..5c4065911bda
--- /dev/null
+++ b/arch/microblaze/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
+#define _ASM_MICROBLAZE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MICROBLAZE_MM_ARCH_HOOKS_H */
diff --git a/arch/mips/include/asm/mm-arch-hooks.h b/arch/mips/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b5609fe8e475
--- /dev/null
+++ b/arch/mips/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MIPS_MM_ARCH_HOOKS_H
+#define _ASM_MIPS_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MIPS_MM_ARCH_HOOKS_H */
diff --git a/arch/mn10300/include/asm/mm-arch-hooks.h b/arch/mn10300/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..e2029a652f4c
--- /dev/null
+++ b/arch/mn10300/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_MN10300_MM_ARCH_HOOKS_H
+#define _ASM_MN10300_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_MN10300_MM_ARCH_HOOKS_H */
diff --git a/arch/nios2/include/asm/mm-arch-hooks.h b/arch/nios2/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d7290dc68558
--- /dev/null
+++ b/arch/nios2/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_NIOS2_MM_ARCH_HOOKS_H
+#define _ASM_NIOS2_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_NIOS2_MM_ARCH_HOOKS_H */
diff --git a/arch/openrisc/include/asm/mm-arch-hooks.h b/arch/openrisc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..6d33cb555fe1
--- /dev/null
+++ b/arch/openrisc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_OPENRISC_MM_ARCH_HOOKS_H
+#define _ASM_OPENRISC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_OPENRISC_MM_ARCH_HOOKS_H */
diff --git a/arch/parisc/include/asm/mm-arch-hooks.h b/arch/parisc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..654ec63b0ee9
--- /dev/null
+++ b/arch/parisc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_PARISC_MM_ARCH_HOOKS_H
+#define _ASM_PARISC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_PARISC_MM_ARCH_HOOKS_H */
diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..63091a19de9f
--- /dev/null
+++ b/arch/powerpc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H
+#define _ASM_POWERPC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */
diff --git a/arch/s390/include/asm/mm-arch-hooks.h b/arch/s390/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..07680b2f3c59
--- /dev/null
+++ b/arch/s390/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_S390_MM_ARCH_HOOKS_H
+#define _ASM_S390_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_S390_MM_ARCH_HOOKS_H */
diff --git a/arch/score/include/asm/mm-arch-hooks.h b/arch/score/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..5e38689f189a
--- /dev/null
+++ b/arch/score/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SCORE_MM_ARCH_HOOKS_H
+#define _ASM_SCORE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SCORE_MM_ARCH_HOOKS_H */
diff --git a/arch/sh/include/asm/mm-arch-hooks.h b/arch/sh/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..18087298b728
--- /dev/null
+++ b/arch/sh/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SH_MM_ARCH_HOOKS_H
+#define _ASM_SH_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SH_MM_ARCH_HOOKS_H */
diff --git a/arch/sparc/include/asm/mm-arch-hooks.h b/arch/sparc/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..b89ba44c16f1
--- /dev/null
+++ b/arch/sparc/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_SPARC_MM_ARCH_HOOKS_H
+#define _ASM_SPARC_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_SPARC_MM_ARCH_HOOKS_H */
diff --git a/arch/tile/include/asm/mm-arch-hooks.h b/arch/tile/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d1709ea774f7
--- /dev/null
+++ b/arch/tile/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_TILE_MM_ARCH_HOOKS_H
+#define _ASM_TILE_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_TILE_MM_ARCH_HOOKS_H */
diff --git a/arch/um/include/asm/mm-arch-hooks.h b/arch/um/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..a7c8b0dfdd4e
--- /dev/null
+++ b/arch/um/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_UM_MM_ARCH_HOOKS_H
+#define _ASM_UM_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_UM_MM_ARCH_HOOKS_H */
diff --git a/arch/unicore32/include/asm/mm-arch-hooks.h b/arch/unicore32/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..4d79a850c509
--- /dev/null
+++ b/arch/unicore32/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_UNICORE32_MM_ARCH_HOOKS_H
+#define _ASM_UNICORE32_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_UNICORE32_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..4e881a342236
--- /dev/null
+++ b/arch/x86/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_MM_ARCH_HOOKS_H
+#define _ASM_X86_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_X86_MM_ARCH_HOOKS_H */
diff --git a/arch/xtensa/include/asm/mm-arch-hooks.h b/arch/xtensa/include/asm/mm-arch-hooks.h
new file mode 100644
index 000000000000..d2e5cfd3dd02
--- /dev/null
+++ b/arch/xtensa/include/asm/mm-arch-hooks.h
@@ -0,0 +1,15 @@
+/*
+ * Architecture specific mm hooks
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_XTENSA_MM_ARCH_HOOKS_H
+#define _ASM_XTENSA_MM_ARCH_HOOKS_H
+
+#endif /* _ASM_XTENSA_MM_ARCH_HOOKS_H */
diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
new file mode 100644
index 000000000000..63005e367abd
--- /dev/null
+++ b/include/linux/mm-arch-hooks.h
@@ -0,0 +1,16 @@
+/*
+ * Generic mm no-op hooks.
+ *
+ * Copyright (C) 2015, IBM Corporation
+ * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _LINUX_MM_ARCH_HOOKS_H
+#define _LINUX_MM_ARCH_HOOKS_H
+
+#include <asm/mm-arch-hooks.h>
+
+#endif /* _LINUX_MM_ARCH_HOOKS_H */
-- 
cgit v1.2.3


From 4abad2ca4a4dbdd4a218c12451231ab628f2e60c Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Wed, 24 Jun 2015 16:56:19 -0700
Subject: mm: new arch_remap() hook

Some architectures would like to be triggered when a memory area is moved
through the mremap system call.

This patch introduces a new arch_remap() mm hook which is placed in the
path of mremap, and is called before the old area is unmapped (and the
arch_unmap() hook is called).

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm-arch-hooks.h |  9 +++++++++
 mm/mremap.c                   | 17 +++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm-arch-hooks.h b/include/linux/mm-arch-hooks.h
index 63005e367abd..4efc3f56e6df 100644
--- a/include/linux/mm-arch-hooks.h
+++ b/include/linux/mm-arch-hooks.h
@@ -13,4 +13,13 @@
 
 #include <asm/mm-arch-hooks.h>
 
+#ifndef arch_remap
+static inline void arch_remap(struct mm_struct *mm,
+			      unsigned long old_start, unsigned long old_end,
+			      unsigned long new_start, unsigned long new_end)
+{
+}
+#define arch_remap arch_remap
+#endif
+
 #endif /* _LINUX_MM_ARCH_HOOKS_H */
diff --git a/mm/mremap.c b/mm/mremap.c
index 034e2d360652..a7c93eceb1c8 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/sched/sysctl.h>
 #include <linux/uaccess.h>
+#include <linux/mm-arch-hooks.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -286,13 +287,17 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		old_len = new_len;
 		old_addr = new_addr;
 		new_addr = -ENOMEM;
-	} else if (vma->vm_file && vma->vm_file->f_op->mremap) {
-		err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
-		if (err < 0) {
-			move_page_tables(new_vma, new_addr, vma, old_addr,
-					 moved_len, true);
-			return err;
+	} else {
+		if (vma->vm_file && vma->vm_file->f_op->mremap) {
+			err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+			if (err < 0) {
+				move_page_tables(new_vma, new_addr, vma,
+						 old_addr, moved_len, true);
+				return err;
+			}
 		}
+		arch_remap(mm, old_addr, old_addr + old_len,
+			   new_addr, new_addr + new_len);
 	}
 
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
-- 
cgit v1.2.3


From a9919c79359df9a706ff9e14fc0a93cd15791c9b Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 24 Jun 2015 16:56:28 -0700
Subject: mm: only define hashdist variable when needed

For !CONFIG_NUMA, hashdist will always be 0, since it's setter is
otherwise compiled out.  So we can save 4 bytes of data and some .text
(although mostly in __init functions) by only defining it for
CONFIG_NUMA.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 8 ++++----
 mm/page_alloc.c         | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 0995c2de8162..f589222bfa87 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -357,12 +357,12 @@ extern void *alloc_large_system_hash(const char *tablename,
 /* Only NUMA needs hash distribution. 64bit NUMA architectures have
  * sufficient vmalloc space.
  */
-#if defined(CONFIG_NUMA) && defined(CONFIG_64BIT)
-#define HASHDIST_DEFAULT 1
+#ifdef CONFIG_NUMA
+#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT)
+extern int hashdist;		/* Distribute hashes across NUMA nodes? */
 #else
-#define HASHDIST_DEFAULT 0
+#define hashdist (0)
 #endif
-extern int hashdist;		/* Distribute hashes across NUMA nodes? */
 
 
 #endif /* _LINUX_BOOTMEM_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ebffa0e4a9c0..159dbbc3375d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6013,9 +6013,9 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_NUMA
 int hashdist = HASHDIST_DEFAULT;
 
-#ifdef CONFIG_NUMA
 static int __init set_hashdist(char *str)
 {
 	if (!str)
-- 
cgit v1.2.3


From c761471b58e6138938ebc6eafec20b2f60cb3397 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 24 Jun 2015 16:56:33 -0700
Subject: mm: avoid tail page refcounting on non-THP compound pages

Reintroduce 8d63d99a5dfb ("mm: avoid tail page refcounting on non-THP
compound pages") after removing bogus VM_BUG_ON_PAGE() in
put_unrefcounted_compound_page().

THP uses tail page refcounting to be able to split huge pages at any time.
 Tail page refcounting is not needed for other users of compound pages and
it's harmful because of overhead.

We try to exclude non-THP pages from tail page refcounting using
__compound_tail_refcounted() check.  It excludes most common non-THP
compound pages: SL*B and hugetlb, but it doesn't catch rest of __GFP_COMP
users -- drivers.

And it's not only about overhead.

Drivers might want to use compound pages to get refcounting semantics
suitable for mapping high-order pages to userspace.  But tail page
refcounting breaks it.

Tail page refcounting uses ->_mapcount in tail pages to store GUP pins on
them.  It means GUP pins would affect page_mapcount() for tail pages.
It's not a problem for THP, because it never maps tail pages.  But unlike
THP, drivers map parts of compound pages with PTEs and it makes
page_mapcount() be called for tail pages.

In particular, GUP pins would shift PSS up and affect /proc/kpagecount for
such pages.  But, I'm not aware about anything which can lead to crash or
other serious misbehaviour.

Since currently all THP pages are anonymous and all drivers pages are not,
we can fix the __compound_tail_refcounted() check by requiring PageAnon()
to enable tail page refcounting.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0755b9fd03a7..8b086070c3a5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -499,7 +499,7 @@ static inline int page_count(struct page *page)
 
 static inline bool __compound_tail_refcounted(struct page *page)
 {
-	return !PageSlab(page) && !PageHeadHuge(page);
+	return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
 }
 
 /*
-- 
cgit v1.2.3


From ead07f6a867b5b1b41cf703735e8b39094987a7d Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Wed, 24 Jun 2015 16:56:48 -0700
Subject: mm/memory-failure: introduce get_hwpoison_page() for consistent
 refcount handling

memory_failure() can run in 2 different mode (specified by
MF_COUNT_INCREASED) in page refcount perspective.  When
MF_COUNT_INCREASED is set, memory_failure() assumes that the caller
takes a refcount of the target page.  And if cleared, memory_failure()
takes it in it's own.

In current code, however, refcounting is done differently in each caller.
For example, madvise_hwpoison() uses get_user_pages_fast() and
hwpoison_inject() uses get_page_unless_zero().  So this inconsistent
refcounting causes refcount failure especially for thp tail pages.
Typical user visible effects are like memory leak or
VM_BUG_ON_PAGE(!page_count(page)) in isolate_lru_page().

To fix this refcounting issue, this patch introduces get_hwpoison_page()
to handle thp tail pages in the same manner for each caller of hwpoison
code.

memory_failure() might fail to split thp and in such case it returns
without completing page isolation.  This is not good because PageHWPoison
on the thp is still set and there's no easy way to unpoison such thps.  So
this patch try to roll back any action to the thp in "non anonymous thp"
case and "thp split failed" case, expecting an MCE(SRAR) generated by
later access afterward will properly free such thps.

[akpm@linux-foundation.org: fix CONFIG_HWPOISON_INJECT=m]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h   |  1 +
 mm/hwpoison-inject.c |  4 ++--
 mm/memory-failure.c  | 50 +++++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 48 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b086070c3a5..cd9df66138a1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2146,6 +2146,7 @@ enum mf_flags {
 extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
+extern int get_hwpoison_page(struct page *page);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4ca5fe0042e1..bf73ac17dad4 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val)
 	/*
 	 * This implies unable to support free buddy pages.
 	 */
-	if (!get_page_unless_zero(hpage))
+	if (!get_hwpoison_page(p))
 		return 0;
 
 	if (!hwpoison_filter_enable)
@@ -58,7 +58,7 @@ inject:
 	pr_info("Injecting memory failure at pfn %#lx\n", pfn);
 	return memory_failure(pfn, 18, MF_COUNT_INCREASED);
 put_out:
-	put_page(hpage);
+	put_page(p);
 	return 0;
 }
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 17a8e3bc3b01..a810ab1519f0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -915,6 +915,39 @@ static int page_action(struct page_state *ps, struct page *p,
 	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
 }
 
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling:
+ * @page:	raw error page (hit by memory error)
+ *
+ * Return: return 0 if failed to grab the refcount, otherwise true (some
+ * non-zero value.)
+ */
+int get_hwpoison_page(struct page *page)
+{
+	struct page *head = compound_head(page);
+
+	if (PageHuge(head))
+		return get_page_unless_zero(head);
+
+	/*
+	 * Thp tail page has special refcounting rule (refcount of tail pages
+	 * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
+	 * directly for tail pages.
+	 */
+	if (PageTransHuge(head)) {
+		if (get_page_unless_zero(head)) {
+			if (PageTail(page))
+				get_page(page);
+			return 1;
+		} else {
+			return 0;
+		}
+	}
+
+	return get_page_unless_zero(page);
+}
+EXPORT_SYMBOL_GPL(get_hwpoison_page);
+
 /*
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
@@ -1097,8 +1130,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * In fact it's dangerous to directly bump up page count from 0,
 	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
 	 */
-	if (!(flags & MF_COUNT_INCREASED) &&
-		!get_page_unless_zero(hpage)) {
+	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
 		if (is_free_buddy_page(p)) {
 			action_result(pfn, MSG_BUDDY, DELAYED);
 			return 0;
@@ -1130,12 +1162,20 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	if (!PageHuge(p) && PageTransHuge(hpage)) {
 		if (!PageAnon(hpage)) {
 			pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+			if (TestClearPageHWPoison(p))
+				atomic_long_sub(nr_pages, &num_poisoned_pages);
 			put_page(p);
+			if (p != hpage)
+				put_page(hpage);
 			return -EBUSY;
 		}
 		if (unlikely(split_huge_page(hpage))) {
 			pr_err("MCE: %#lx: thp split failed\n", pfn);
+			if (TestClearPageHWPoison(p))
+				atomic_long_sub(nr_pages, &num_poisoned_pages);
 			put_page(p);
+			if (p != hpage)
+				put_page(hpage);
 			return -EBUSY;
 		}
 		VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1413,12 +1453,12 @@ int unpoison_memory(unsigned long pfn)
 	 */
 	if (!PageHuge(page) && PageTransHuge(page)) {
 		pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
-			return 0;
+		return 0;
 	}
 
 	nr_pages = 1 << compound_order(page);
 
-	if (!get_page_unless_zero(page)) {
+	if (!get_hwpoison_page(p)) {
 		/*
 		 * Since HWPoisoned hugepage should have non-zero refcount,
 		 * race between memory failure and unpoison seems to happen.
@@ -1486,7 +1526,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 	 * When the target page is a free hugepage, just remove it
 	 * from free hugepage list.
 	 */
-	if (!get_page_unless_zero(compound_head(p))) {
+	if (!get_hwpoison_page(p)) {
 		if (PageHuge(p)) {
 			pr_info("%s: %#lx free huge page\n", __func__, pfn);
 			ret = 0;
-- 
cgit v1.2.3


From 16e951966f05da5ccd650104176f6ba289f7fa20 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 24 Jun 2015 16:57:07 -0700
Subject: mm: oom_kill: clean up victim marking and exiting interfaces

Rename unmark_oom_victim() to exit_oom_victim().  Marking and unmarking
are related in functionality, but the interface is not symmetrical at
all: one is an internal OOM killer function used during the killing, the
other is for an OOM victim to signal its own death on exit later on.
This has locking implications, see follow-up changes.

While at it, rename mark_tsk_oom_victim() to mark_oom_victim(), which
is easier on the eye.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/staging/android/lowmemorykiller.c |  2 +-
 include/linux/oom.h                       |  7 ++++---
 kernel/exit.c                             |  2 +-
 mm/memcontrol.c                           |  2 +-
 mm/oom_kill.c                             | 16 +++++++---------
 5 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index feafa172b155..2345ee7342d9 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -165,7 +165,7 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 		 * infrastructure. There is no real reason why the selected
 		 * task should have access to the memory reserves.
 		 */
-		mark_tsk_oom_victim(selected);
+		mark_oom_victim(selected);
 		send_sig(SIGKILL, selected, 0);
 		rem += selected_tasksize;
 	}
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 44b2f6f7bbd8..a8e6a498cbcb 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -47,9 +47,7 @@ static inline bool oom_task_origin(const struct task_struct *p)
 	return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
 }
 
-extern void mark_tsk_oom_victim(struct task_struct *tsk);
-
-extern void unmark_oom_victim(void);
+extern void mark_oom_victim(struct task_struct *tsk);
 
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
@@ -75,6 +73,9 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 
 extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *mask, bool force_kill);
+
+extern void exit_oom_victim(void);
+
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 22fcc05dec40..185752a729f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk)
 	mm_update_next_owner(mm);
 	mmput(mm);
 	if (test_thread_flag(TIF_MEMDIE))
-		unmark_oom_victim();
+		exit_oom_victim();
 }
 
 static struct task_struct *find_alive_thread(struct task_struct *p)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a04225d372ba..20a7e874f719 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1536,7 +1536,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * quickly exit and free its memory.
 	 */
 	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-		mark_tsk_oom_victim(current);
+		mark_oom_victim(current);
 		return;
 	}
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 73763e489e86..b2f081fe4b1a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -408,13 +408,13 @@ bool oom_killer_disabled __read_mostly;
 static DECLARE_RWSEM(oom_sem);
 
 /**
- * mark_tsk_oom_victim - marks the given task as OOM victim.
+ * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
  *
  * Has to be called with oom_sem taken for read and never after
  * oom has been disabled already.
  */
-void mark_tsk_oom_victim(struct task_struct *tsk)
+void mark_oom_victim(struct task_struct *tsk)
 {
 	WARN_ON(oom_killer_disabled);
 	/* OOM killer might race with memcg OOM */
@@ -431,11 +431,9 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
 }
 
 /**
- * unmark_oom_victim - unmarks the current task as OOM victim.
- *
- * Wakes up all waiters in oom_killer_disable()
+ * exit_oom_victim - note the exit of an OOM victim
  */
-void unmark_oom_victim(void)
+void exit_oom_victim(void)
 {
 	if (!test_and_clear_thread_flag(TIF_MEMDIE))
 		return;
@@ -515,7 +513,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 */
 	task_lock(p);
 	if (p->mm && task_will_free_mem(p)) {
-		mark_tsk_oom_victim(p);
+		mark_oom_victim(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -570,7 +568,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 	/* mm cannot safely be dereferenced after task_unlock(victim) */
 	mm = victim->mm;
-	mark_tsk_oom_victim(victim);
+	mark_oom_victim(victim);
 	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
 		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
 		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -728,7 +726,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 */
 	if (current->mm &&
 	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
-		mark_tsk_oom_victim(current);
+		mark_oom_victim(current);
 		return;
 	}
 
-- 
cgit v1.2.3


From dc56401fc9f25e8f93899991ec858c98a331d88c Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 24 Jun 2015 16:57:19 -0700
Subject: mm: oom_kill: simplify OOM killer locking

The zonelist locking and the oom_sem are two overlapping locks that are
used to serialize global OOM killing against different things.

The historical zonelist locking serializes OOM kills from allocations with
overlapping zonelists against each other to prevent killing more tasks
than necessary in the same memory domain.  Only when neither tasklists nor
zonelists from two concurrent OOM kills overlap (tasks in separate memcgs
bound to separate nodes) are OOM kills allowed to execute in parallel.

The younger oom_sem is a read-write lock to serialize OOM killing against
the PM code trying to disable the OOM killer altogether.

However, the OOM killer is a fairly cold error path, there is really no
reason to optimize for highly performant and concurrent OOM kills.  And
the oom_sem is just flat-out redundant.

Replace both locking schemes with a single global mutex serializing OOM
kills regardless of context.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/tty/sysrq.c |   2 +
 include/linux/oom.h |   5 +--
 mm/memcontrol.c     |  18 +++++---
 mm/oom_kill.c       | 127 +++++++++++-----------------------------------------
 mm/page_alloc.c     |   8 ++--
 5 files changed, 46 insertions(+), 114 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 843f2cdc280b..b20d2c0ec451 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -356,9 +356,11 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
+	mutex_lock(&oom_lock);
 	if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
 			   GFP_KERNEL, 0, NULL, true))
 		pr_info("OOM request ignored because killer is disabled\n");
+	mutex_unlock(&oom_lock);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index a8e6a498cbcb..7deecb7bca5e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -32,6 +32,8 @@ enum oom_scan_t {
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
+extern struct mutex oom_lock;
+
 static inline void set_current_oom_origin(void)
 {
 	current->signal->oom_flags |= OOM_FLAG_ORIGIN;
@@ -60,9 +62,6 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
 			     const char *message);
 
-extern bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_flags);
-
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			       int order, const nodemask_t *nodemask,
 			       struct mem_cgroup *memcg);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a7e874f719..8da44a083397 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1530,6 +1530,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned int points = 0;
 	struct task_struct *chosen = NULL;
 
+	mutex_lock(&oom_lock);
+
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
 	 * select it.  The goal is to allow it to allocate so that it may
@@ -1537,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 */
 	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		return;
+		goto unlock;
 	}
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
@@ -1564,7 +1566,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				mem_cgroup_iter_break(memcg, iter);
 				if (chosen)
 					put_task_struct(chosen);
-				return;
+				goto unlock;
 			case OOM_SCAN_OK:
 				break;
 			};
@@ -1585,11 +1587,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		css_task_iter_end(&it);
 	}
 
-	if (!chosen)
-		return;
-	points = chosen_points * 1000 / totalpages;
-	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
-			 NULL, "Memory cgroup out of memory");
+	if (chosen) {
+		points = chosen_points * 1000 / totalpages;
+		oom_kill_process(chosen, gfp_mask, order, points, totalpages,
+				 memcg, NULL, "Memory cgroup out of memory");
+	}
+unlock:
+	mutex_unlock(&oom_lock);
 }
 
 #if MAX_NUMNODES > 1
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d3490b019d46..5cfda39b3268 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,7 +42,8 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
-static DEFINE_SPINLOCK(zone_scan_lock);
+
+DEFINE_MUTEX(oom_lock);
 
 #ifdef CONFIG_NUMA
 /**
@@ -405,13 +406,12 @@ static atomic_t oom_victims = ATOMIC_INIT(0);
 static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
 
 bool oom_killer_disabled __read_mostly;
-static DECLARE_RWSEM(oom_sem);
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
  * @tsk: task to mark
  *
- * Has to be called with oom_sem taken for read and never after
+ * Has to be called with oom_lock held and never after
  * oom has been disabled already.
  */
 void mark_oom_victim(struct task_struct *tsk)
@@ -460,14 +460,14 @@ bool oom_killer_disable(void)
 	 * Make sure to not race with an ongoing OOM killer
 	 * and that the current is not the victim.
 	 */
-	down_write(&oom_sem);
+	mutex_lock(&oom_lock);
 	if (test_thread_flag(TIF_MEMDIE)) {
-		up_write(&oom_sem);
+		mutex_unlock(&oom_lock);
 		return false;
 	}
 
 	oom_killer_disabled = true;
-	up_write(&oom_sem);
+	mutex_unlock(&oom_lock);
 
 	wait_event(oom_victims_wait, !atomic_read(&oom_victims));
 
@@ -634,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
- */
-bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-	bool ret = true;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-		if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
-			ret = false;
-			goto out;
-		}
-
-	/*
-	 * Lock each zone in the zonelist under zone_scan_lock so a parallel
-	 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
-	 */
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-		set_bit(ZONE_OOM_LOCKED, &zone->flags);
-
-out:
-	spin_unlock(&zone_scan_lock);
-	return ret;
-}
-
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
-		clear_bit(ZONE_OOM_LOCKED, &zone->flags);
-	spin_unlock(&zone_scan_lock);
-}
-
 /**
  * __out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
@@ -693,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		   int order, nodemask_t *nodemask, bool force_kill)
 {
 	const nodemask_t *mpol_mask;
 	struct task_struct *p;
@@ -704,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 	int killed = 0;
 
+	if (oom_killer_disabled)
+		return false;
+
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
 		/* Got some memory back in the last second. */
-		return;
+		goto out;
 
 	/*
 	 * If current has a pending SIGKILL or is exiting, then automatically
@@ -720,7 +677,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	if (current->mm &&
 	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
 		mark_oom_victim(current);
-		return;
+		goto out;
 	}
 
 	/*
@@ -760,32 +717,8 @@ out:
 	 */
 	if (killed)
 		schedule_timeout_killable(1);
-}
-
-/**
- * out_of_memory -  tries to invoke OOM killer.
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
- *
- * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
- * when it returns false. Otherwise returns true.
- */
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		int order, nodemask_t *nodemask, bool force_kill)
-{
-	bool ret = false;
-
-	down_read(&oom_sem);
-	if (!oom_killer_disabled) {
-		__out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
-		ret = true;
-	}
-	up_read(&oom_sem);
 
-	return ret;
+	return true;
 }
 
 /*
@@ -795,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist;
-
-	down_read(&oom_sem);
 	if (mem_cgroup_oom_synchronize(true))
-		goto unlock;
+		return;
 
-	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
-	if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
-		if (!oom_killer_disabled)
-			__out_of_memory(NULL, 0, 0, NULL, false);
-		else
-			/*
-			 * There shouldn't be any user tasks runable while the
-			 * OOM killer is disabled so the current task has to
-			 * be a racing OOM victim for which oom_killer_disable()
-			 * is waiting for.
-			 */
-			WARN_ON(test_thread_flag(TIF_MEMDIE));
+	if (!mutex_trylock(&oom_lock))
+		return;
 
-		oom_zonelist_unlock(zonelist, GFP_KERNEL);
+	if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+		/*
+		 * There shouldn't be any user tasks runnable while the
+		 * OOM killer is disabled, so the current task has to
+		 * be a racing OOM victim for which oom_killer_disable()
+		 * is waiting for.
+		 */
+		WARN_ON(test_thread_flag(TIF_MEMDIE));
 	}
-unlock:
-	up_read(&oom_sem);
+
+	mutex_unlock(&oom_lock);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3b02be4def90..cae21dc9d54e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2360,10 +2360,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	*did_some_progress = 0;
 
 	/*
-	 * Acquire the per-zone oom lock for each zone.  If that
-	 * fails, somebody else is making progress for us.
+	 * Acquire the oom lock.  If that fails, somebody else is
+	 * making progress for us.
 	 */
-	if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
+	if (!mutex_trylock(&oom_lock)) {
 		*did_some_progress = 1;
 		schedule_timeout_uninterruptible(1);
 		return NULL;
@@ -2408,7 +2408,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
 		*did_some_progress = 1;
 out:
-	oom_zonelist_unlock(ac->zonelist, gfp_mask);
+	mutex_unlock(&oom_lock);
 	return page;
 }
 
-- 
cgit v1.2.3


From cc637b1704d78b068c2eb700eec384c69ea56cdf Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Wed, 24 Jun 2015 16:57:30 -0700
Subject: memory-failure: export page_type and action result

Export 'outcome' and 'action_page_type' to mm.h, so we could use
this emnus outside.

This patch is preparation for adding trace events for memory-failure
recovery action.

Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Jim Davis <jim.epost@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h  |  34 +++++++++++
 mm/memory-failure.c | 168 +++++++++++++++++++++-------------------------------
 2 files changed, 101 insertions(+), 101 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd9df66138a1..986a9a221ee0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2153,6 +2153,40 @@ extern void shake_page(struct page *p, int access);
 extern atomic_long_t num_poisoned_pages;
 extern int soft_offline_page(struct page *page, int flags);
 
+
+/*
+ * Error handlers for various types of pages.
+ */
+enum mf_outcome {
+	MF_IGNORED,	/* Error: cannot be handled */
+	MF_FAILED,	/* Error: handling failed */
+	MF_DELAYED,	/* Will be handled later */
+	MF_RECOVERED,	/* Successfully recovered */
+};
+
+enum mf_action_page_type {
+	MF_MSG_KERNEL,
+	MF_MSG_KERNEL_HIGH_ORDER,
+	MF_MSG_SLAB,
+	MF_MSG_DIFFERENT_COMPOUND,
+	MF_MSG_POISONED_HUGE,
+	MF_MSG_HUGE,
+	MF_MSG_FREE_HUGE,
+	MF_MSG_UNMAP_FAILED,
+	MF_MSG_DIRTY_SWAPCACHE,
+	MF_MSG_CLEAN_SWAPCACHE,
+	MF_MSG_DIRTY_MLOCKED_LRU,
+	MF_MSG_CLEAN_MLOCKED_LRU,
+	MF_MSG_DIRTY_UNEVICTABLE_LRU,
+	MF_MSG_CLEAN_UNEVICTABLE_LRU,
+	MF_MSG_DIRTY_LRU,
+	MF_MSG_CLEAN_LRU,
+	MF_MSG_TRUNCATED_LRU,
+	MF_MSG_BUDDY,
+	MF_MSG_BUDDY_2ND,
+	MF_MSG_UNKNOWN,
+};
+
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
 extern void clear_huge_page(struct page *page,
 			    unsigned long addr,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c72f41bfbaaf..b71a3cd3d0b0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -504,68 +504,34 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 	kfree(tk);
 }
 
-/*
- * Error handlers for various types of pages.
- */
-
-enum outcome {
-	IGNORED,	/* Error: cannot be handled */
-	FAILED,		/* Error: handling failed */
-	DELAYED,	/* Will be handled later */
-	RECOVERED,	/* Successfully recovered */
-};
-
 static const char *action_name[] = {
-	[IGNORED] = "Ignored",
-	[FAILED] = "Failed",
-	[DELAYED] = "Delayed",
-	[RECOVERED] = "Recovered",
-};
-
-enum action_page_type {
-	MSG_KERNEL,
-	MSG_KERNEL_HIGH_ORDER,
-	MSG_SLAB,
-	MSG_DIFFERENT_COMPOUND,
-	MSG_POISONED_HUGE,
-	MSG_HUGE,
-	MSG_FREE_HUGE,
-	MSG_UNMAP_FAILED,
-	MSG_DIRTY_SWAPCACHE,
-	MSG_CLEAN_SWAPCACHE,
-	MSG_DIRTY_MLOCKED_LRU,
-	MSG_CLEAN_MLOCKED_LRU,
-	MSG_DIRTY_UNEVICTABLE_LRU,
-	MSG_CLEAN_UNEVICTABLE_LRU,
-	MSG_DIRTY_LRU,
-	MSG_CLEAN_LRU,
-	MSG_TRUNCATED_LRU,
-	MSG_BUDDY,
-	MSG_BUDDY_2ND,
-	MSG_UNKNOWN,
+	[MF_IGNORED] = "Ignored",
+	[MF_FAILED] = "Failed",
+	[MF_DELAYED] = "Delayed",
+	[MF_RECOVERED] = "Recovered",
 };
 
 static const char * const action_page_types[] = {
-	[MSG_KERNEL]			= "reserved kernel page",
-	[MSG_KERNEL_HIGH_ORDER]		= "high-order kernel page",
-	[MSG_SLAB]			= "kernel slab page",
-	[MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
-	[MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
-	[MSG_HUGE]			= "huge page",
-	[MSG_FREE_HUGE]			= "free huge page",
-	[MSG_UNMAP_FAILED]		= "unmapping failed page",
-	[MSG_DIRTY_SWAPCACHE]		= "dirty swapcache page",
-	[MSG_CLEAN_SWAPCACHE]		= "clean swapcache page",
-	[MSG_DIRTY_MLOCKED_LRU]		= "dirty mlocked LRU page",
-	[MSG_CLEAN_MLOCKED_LRU]		= "clean mlocked LRU page",
-	[MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
-	[MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
-	[MSG_DIRTY_LRU]			= "dirty LRU page",
-	[MSG_CLEAN_LRU]			= "clean LRU page",
-	[MSG_TRUNCATED_LRU]		= "already truncated LRU page",
-	[MSG_BUDDY]			= "free buddy page",
-	[MSG_BUDDY_2ND]			= "free buddy page (2nd try)",
-	[MSG_UNKNOWN]			= "unknown page",
+	[MF_MSG_KERNEL]			= "reserved kernel page",
+	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
+	[MF_MSG_SLAB]			= "kernel slab page",
+	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
+	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
+	[MF_MSG_HUGE]			= "huge page",
+	[MF_MSG_FREE_HUGE]		= "free huge page",
+	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
+	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
+	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
+	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
+	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
+	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
+	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
+	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
+	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
+	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
+	[MF_MSG_BUDDY]			= "free buddy page",
+	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
+	[MF_MSG_UNKNOWN]		= "unknown page",
 };
 
 /*
@@ -599,7 +565,7 @@ static int delete_from_lru_cache(struct page *p)
  */
 static int me_kernel(struct page *p, unsigned long pfn)
 {
-	return IGNORED;
+	return MF_IGNORED;
 }
 
 /*
@@ -608,7 +574,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
 static int me_unknown(struct page *p, unsigned long pfn)
 {
 	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
-	return FAILED;
+	return MF_FAILED;
 }
 
 /*
@@ -617,7 +583,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
 static int me_pagecache_clean(struct page *p, unsigned long pfn)
 {
 	int err;
-	int ret = FAILED;
+	int ret = MF_FAILED;
 	struct address_space *mapping;
 
 	delete_from_lru_cache(p);
@@ -627,7 +593,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	 * should be the one m_f() holds.
 	 */
 	if (PageAnon(p))
-		return RECOVERED;
+		return MF_RECOVERED;
 
 	/*
 	 * Now truncate the page in the page cache. This is really
@@ -641,7 +607,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 		/*
 		 * Page has been teared down in the meanwhile
 		 */
-		return FAILED;
+		return MF_FAILED;
 	}
 
 	/*
@@ -658,7 +624,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 				!try_to_release_page(p, GFP_NOIO)) {
 			pr_info("MCE %#lx: failed to release buffers\n", pfn);
 		} else {
-			ret = RECOVERED;
+			ret = MF_RECOVERED;
 		}
 	} else {
 		/*
@@ -666,7 +632,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 		 * This fails on dirty or anything with private pages
 		 */
 		if (invalidate_inode_page(p))
-			ret = RECOVERED;
+			ret = MF_RECOVERED;
 		else
 			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
 				pfn);
@@ -752,9 +718,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 	ClearPageUptodate(p);
 
 	if (!delete_from_lru_cache(p))
-		return DELAYED;
+		return MF_DELAYED;
 	else
-		return FAILED;
+		return MF_FAILED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -762,9 +728,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
 	delete_from_swap_cache(p);
 
 	if (!delete_from_lru_cache(p))
-		return RECOVERED;
+		return MF_RECOVERED;
 	else
-		return FAILED;
+		return MF_FAILED;
 }
 
 /*
@@ -794,9 +760,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 	if (!(page_mapping(hpage) || PageAnon(hpage))) {
 		res = dequeue_hwpoisoned_huge_page(hpage);
 		if (!res)
-			return RECOVERED;
+			return MF_RECOVERED;
 	}
-	return DELAYED;
+	return MF_DELAYED;
 }
 
 /*
@@ -828,10 +794,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 static struct page_state {
 	unsigned long mask;
 	unsigned long res;
-	enum action_page_type type;
+	enum mf_action_page_type type;
 	int (*action)(struct page *p, unsigned long pfn);
 } error_states[] = {
-	{ reserved,	reserved,	MSG_KERNEL,	me_kernel },
+	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
 	/*
 	 * free pages are specially detected outside this table:
 	 * PG_buddy pages only make a small fraction of all free pages.
@@ -842,31 +808,31 @@ static struct page_state {
 	 * currently unused objects without touching them. But just
 	 * treat it as standard kernel for now.
 	 */
-	{ slab,		slab,		MSG_SLAB,	me_kernel },
+	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },
 
 #ifdef CONFIG_PAGEFLAGS_EXTENDED
-	{ head,		head,		MSG_HUGE,		me_huge_page },
-	{ tail,		tail,		MSG_HUGE,		me_huge_page },
+	{ head,		head,		MF_MSG_HUGE,		me_huge_page },
+	{ tail,		tail,		MF_MSG_HUGE,		me_huge_page },
 #else
-	{ compound,	compound,	MSG_HUGE,		me_huge_page },
+	{ compound,	compound,	MF_MSG_HUGE,		me_huge_page },
 #endif
 
-	{ sc|dirty,	sc|dirty,	MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
-	{ sc|dirty,	sc,		MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },
+	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
+	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },
 
-	{ mlock|dirty,	mlock|dirty,	MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
-	{ mlock|dirty,	mlock,		MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },
+	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
+	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },
 
-	{ unevict|dirty, unevict|dirty,	MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
-	{ unevict|dirty, unevict,	MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },
+	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
+	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },
 
-	{ lru|dirty,	lru|dirty,	MSG_DIRTY_LRU,	me_pagecache_dirty },
-	{ lru|dirty,	lru,		MSG_CLEAN_LRU,	me_pagecache_clean },
+	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
+	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },
 
 	/*
 	 * Catchall entry: must be at end.
 	 */
-	{ 0,		0,		MSG_UNKNOWN,	me_unknown },
+	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
 };
 
 #undef dirty
@@ -886,7 +852,7 @@ static struct page_state {
  * "Dirty/Clean" indication is not 100% accurate due to the possibility of
  * setting PG_dirty outside page lock. See also comment above set_page_dirty().
  */
-static void action_result(unsigned long pfn, enum action_page_type type, int result)
+static void action_result(unsigned long pfn, enum mf_action_page_type type, int result)
 {
 	pr_err("MCE %#lx: recovery action for %s: %s\n",
 		pfn, action_page_types[type], action_name[result]);
@@ -901,13 +867,13 @@ static int page_action(struct page_state *ps, struct page *p,
 	result = ps->action(p, pfn);
 
 	count = page_count(p) - 1;
-	if (ps->action == me_swapcache_dirty && result == DELAYED)
+	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
 		count--;
 	if (count != 0) {
 		printk(KERN_ERR
 		       "MCE %#lx: %s still referenced by %d users\n",
 		       pfn, action_page_types[ps->type], count);
-		result = FAILED;
+		result = MF_FAILED;
 	}
 	action_result(pfn, ps->type, result);
 
@@ -916,7 +882,7 @@ static int page_action(struct page_state *ps, struct page *p,
 	 * Could adjust zone counters here to correct for the missing page.
 	 */
 
-	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
+	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
 }
 
 /**
@@ -1136,7 +1102,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 */
 	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
 		if (is_free_buddy_page(p)) {
-			action_result(pfn, MSG_BUDDY, DELAYED);
+			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
 			return 0;
 		} else if (PageHuge(hpage)) {
 			/*
@@ -1153,12 +1119,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 			}
 			set_page_hwpoison_huge_page(hpage);
 			res = dequeue_hwpoisoned_huge_page(hpage);
-			action_result(pfn, MSG_FREE_HUGE,
-				      res ? IGNORED : DELAYED);
+			action_result(pfn, MF_MSG_FREE_HUGE,
+				      res ? MF_IGNORED : MF_DELAYED);
 			unlock_page(hpage);
 			return res;
 		} else {
-			action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED);
+			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
 			return -EBUSY;
 		}
 	}
@@ -1203,10 +1169,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 			 */
 			if (is_free_buddy_page(p)) {
 				if (flags & MF_COUNT_INCREASED)
-					action_result(pfn, MSG_BUDDY, DELAYED);
+					action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
 				else
-					action_result(pfn, MSG_BUDDY_2ND,
-						      DELAYED);
+					action_result(pfn, MF_MSG_BUDDY_2ND,
+						      MF_DELAYED);
 				return 0;
 			}
 		}
@@ -1219,7 +1185,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * If this happens just bail out.
 	 */
 	if (PageCompound(p) && compound_head(p) != orig_head) {
-		action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
+		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
 		res = -EBUSY;
 		goto out;
 	}
@@ -1259,7 +1225,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * on the head page to show that the hugepage is hwpoisoned
 	 */
 	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
-		action_result(pfn, MSG_POISONED_HUGE, IGNORED);
+		action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
 		unlock_page(hpage);
 		put_page(hpage);
 		return 0;
@@ -1288,7 +1254,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 */
 	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
 	    != SWAP_SUCCESS) {
-		action_result(pfn, MSG_UNMAP_FAILED, IGNORED);
+		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		res = -EBUSY;
 		goto out;
 	}
@@ -1297,7 +1263,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * Torn down by someone else?
 	 */
 	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
-		action_result(pfn, MSG_TRUNCATED_LRU, IGNORED);
+		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
 		res = -EBUSY;
 		goto out;
 	}
-- 
cgit v1.2.3


From cc3e2af42e7b7e0457b93bf17c19b44c635cd40c Mon Sep 17 00:00:00 2001
From: Xie XiuQi <xiexiuqi@huawei.com>
Date: Wed, 24 Jun 2015 16:57:33 -0700
Subject: memory-failure: change type of action_result's param 3 to enum

Change type of action_result's param 3 to enum for type consistency,
and rename mf_outcome to mf_result for clearly.

Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Jim Davis <jim.epost@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h  | 2 +-
 mm/memory-failure.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 986a9a221ee0..24ad583596d1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2157,7 +2157,7 @@ extern int soft_offline_page(struct page *page, int flags);
 /*
  * Error handlers for various types of pages.
  */
-enum mf_outcome {
+enum mf_result {
 	MF_IGNORED,	/* Error: cannot be handled */
 	MF_FAILED,	/* Error: handling failed */
 	MF_DELAYED,	/* Will be handled later */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b71a3cd3d0b0..15c0d5ab0893 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -852,7 +852,8 @@ static struct page_state {
  * "Dirty/Clean" indication is not 100% accurate due to the possibility of
  * setting PG_dirty outside page lock. See also comment above set_page_dirty().
  */
-static void action_result(unsigned long pfn, enum mf_action_page_type type, int result)
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+			  enum mf_result result)
 {
 	pr_err("MCE %#lx: recovery action for %s: %s\n",
 		pfn, action_page_types[type], action_name[result]);
-- 
cgit v1.2.3


From 8809aa2d28d74111ff2f1928edaa4e9845c97a7d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 24 Jun 2015 16:57:44 -0700
Subject: mm: clarify that the function operates on hugepage pte

We have confusing functions to clear pmd, pmd_clear_* and pmd_clear.  Add
_huge_ to pmdp_clear functions so that we are clear that they operate on
hugepage pte.

We don't bother about other functions like pmdp_set_wrprotect,
pmdp_clear_flush_young, because they operate on PTE bits and hence
indicate they are operating on hugepage ptes

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/include/asm/pgtable.h          |  8 ++++----
 arch/powerpc/include/asm/pgtable-ppc64.h |  6 +++---
 arch/powerpc/mm/pgtable_64.c             |  4 ++--
 arch/s390/include/asm/pgtable.h          | 24 ++++++++++++------------
 arch/sparc/include/asm/pgtable_64.h      |  8 ++++----
 arch/tile/include/asm/pgtable.h          |  8 ++++----
 arch/x86/include/asm/pgtable.h           |  4 ++--
 include/asm-generic/pgtable.h            | 18 +++++++++---------
 include/linux/mmu_notifier.h             | 12 ++++++------
 mm/huge_memory.c                         | 16 ++++++++--------
 mm/migrate.c                             |  2 +-
 mm/pgtable-generic.c                     | 14 +++++++++-----
 mm/rmap.c                                |  2 +-
 13 files changed, 65 insertions(+), 61 deletions(-)

(limited to 'include/linux')

diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 819af9d057a8..9d8106758142 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -568,12 +568,12 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 }
 
 /*
- * The generic version pmdp_get_and_clear uses a version of pmd_clear() with a
+ * The generic version pmdp_huge_get_and_clear uses a version of pmd_clear() with a
  * different prototype.
  */
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-				       unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+					    unsigned long address, pmd_t *pmdp)
 {
 	pmd_t old = *pmdp;
 
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 34ffbf52d078..3bb7488bd24b 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -569,9 +569,9 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
 				  unsigned long address, pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-extern pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-				unsigned long addr, pmd_t *pmdp);
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+				     unsigned long addr, pmd_t *pmdp);
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
 static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index f7775193d745..876232d64126 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -812,8 +812,8 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return;
 }
 
-pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-			 unsigned long addr, pmd_t *pmdp)
+pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+			      unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t old_pmd;
 	pgtable_t pgtable;
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7242dd386ab7..f66d82798a6a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1498,9 +1498,9 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 	return pmd_young(pmd);
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-				       unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+					    unsigned long address, pmd_t *pmdp)
 {
 	pmd_t pmd = *pmdp;
 
@@ -1509,10 +1509,10 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 	return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
-					    unsigned long address,
-					    pmd_t *pmdp, int full)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
+						 unsigned long address,
+						 pmd_t *pmdp, int full)
 {
 	pmd_t pmd = *pmdp;
 
@@ -1522,11 +1522,11 @@ static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
 	return pmd;
 }
 
-#define __HAVE_ARCH_PMDP_CLEAR_FLUSH
-static inline pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
-				     unsigned long address, pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
+					  unsigned long address, pmd_t *pmdp)
 {
-	return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
@@ -1552,7 +1552,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
 					unsigned long address,
 					pmd_t *pmdp)
 {
-	return pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 }
 #define pmdp_collapse_flush pmdp_collapse_flush
 
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 2a52c91d2c8a..131d36fcd07a 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -865,10 +865,10 @@ static inline unsigned long pud_pfn(pud_t pud)
 void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
 		   pte_t *ptep, pte_t orig, int fullmm);
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-				       unsigned long addr,
-				       pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+					    unsigned long addr,
+					    pmd_t *pmdp)
 {
 	pmd_t pmd = *pmdp;
 	set_pmd_at(mm, addr, pmdp, __pmd(0UL));
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h
index 95a4f19d16c5..2b05ccbebed9 100644
--- a/arch/tile/include/asm/pgtable.h
+++ b/arch/tile/include/asm/pgtable.h
@@ -414,10 +414,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 }
 
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-				       unsigned long address,
-				       pmd_t *pmdp)
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+					    unsigned long address,
+					    pmd_t *pmdp)
 {
 	return pte_pmd(ptep_get_and_clear(mm, address, pmdp_ptep(pmdp)));
 }
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2562e303405b..867da5bbb4a3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -805,8 +805,8 @@ static inline int pmd_write(pmd_t pmd)
 	return pmd_flags(pmd) & _PAGE_RW;
 }
 
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
 				       pmd_t *pmdp)
 {
 	pmd_t pmd = native_pmdp_get_and_clear(pmdp);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f1108a72a672..29c57b2cb344 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -96,11 +96,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 }
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
-				       unsigned long address,
-				       pmd_t *pmdp)
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
+					    unsigned long address,
+					    pmd_t *pmdp)
 {
 	pmd_t pmd = *pmdp;
 	pmd_clear(pmdp);
@@ -109,13 +109,13 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR_FULL
+#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pmdp_get_and_clear_full(struct mm_struct *mm,
+static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm,
 					    unsigned long address, pmd_t *pmdp,
 					    int full)
 {
-	return pmdp_get_and_clear(mm, address, pmdp);
+	return pmdp_huge_get_and_clear(mm, address, pmdp);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
@@ -152,8 +152,8 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
 			      pte_t *ptep);
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
-extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma,
+#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
+extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 			      unsigned long address,
 			      pmd_t *pmdp);
 #endif
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 95243d28a0ee..61cd67f4d788 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -324,25 +324,25 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	___pte;								\
 })
 
-#define pmdp_clear_flush_notify(__vma, __haddr, __pmd)			\
+#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd)		\
 ({									\
 	unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;		\
 	struct mm_struct *___mm = (__vma)->vm_mm;			\
 	pmd_t ___pmd;							\
 									\
-	___pmd = pmdp_clear_flush(__vma, __haddr, __pmd);		\
+	___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd);		\
 	mmu_notifier_invalidate_range(___mm, ___haddr,			\
 				      ___haddr + HPAGE_PMD_SIZE);	\
 									\
 	___pmd;								\
 })
 
-#define pmdp_get_and_clear_notify(__mm, __haddr, __pmd)			\
+#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd)		\
 ({									\
 	unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;		\
 	pmd_t ___pmd;							\
 									\
-	___pmd = pmdp_get_and_clear(__mm, __haddr, __pmd);		\
+	___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd);		\
 	mmu_notifier_invalidate_range(__mm, ___haddr,			\
 				      ___haddr + HPAGE_PMD_SIZE);	\
 									\
@@ -428,8 +428,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define	ptep_clear_flush_notify ptep_clear_flush
-#define pmdp_clear_flush_notify pmdp_clear_flush
-#define pmdp_get_and_clear_notify pmdp_get_and_clear
+#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
+#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 65dd8d67287b..c107094f79ba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		goto out_free_pages;
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
-	pmdp_clear_flush_notify(vma, haddr, pmd);
+	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1174,7 +1174,7 @@ alloc:
 		pmd_t entry;
 		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		pmdp_clear_flush_notify(vma, haddr, pmd);
+		pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		mem_cgroup_commit_charge(new_page, memcg, false);
 		lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1396,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pmd_t orig_pmd;
 		/*
 		 * For architectures like ppc64 we look at deposited pgtable
-		 * when calling pmdp_get_and_clear. So do the
+		 * when calling pmdp_huge_get_and_clear. So do the
 		 * pgtable_trans_huge_withdraw after finishing pmdp related
 		 * operations.
 		 */
-		orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
-						   tlb->fullmm);
+		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+							tlb->fullmm);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
@@ -1459,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 		new_ptl = pmd_lockptr(mm, new_pmd);
 		if (new_ptl != old_ptl)
 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
-		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
 
 		if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1505,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		}
 
 		if (!prot_numa || !pmd_protnone(*pmd)) {
-			entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
 			entry = pmd_modify(entry, newprot);
 			if (preserve_write)
 				entry = pmd_mkwrite(entry);
@@ -2863,7 +2863,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmd_t _pmd;
 	int i;
 
-	pmdp_clear_flush_notify(vma, haddr, pmd);
+	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
diff --git a/mm/migrate.c b/mm/migrate.c
index d4fe1f94120b..ee401e4e5ef1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1799,7 +1799,7 @@ fail_putback:
 	 */
 	flush_cache_range(vma, mmun_start, mmun_end);
 	page_add_anon_rmap(new_page, vma, mmun_start);
-	pmdp_clear_flush_notify(vma, mmun_start, pmd);
+	pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
 	set_pmd_at(mm, mmun_start, pmd, entry);
 	flush_tlb_range(vma, mmun_start, mmun_end);
 	update_mmu_cache_pmd(vma, address, &entry);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index f21dc5fbc6cd..6b674e00153c 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -119,15 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 }
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
-		       pmd_t *pmdp)
+pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
+			    pmd_t *pmdp)
 {
 	pmd_t pmd;
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 	VM_BUG_ON(!pmd_trans_huge(*pmdp));
-	pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	return pmd;
 }
@@ -205,11 +205,15 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
 			  pmd_t *pmdp)
 {
+	/*
+	 * pmd and hugepage pte format are same. So we could
+	 * use the same function.
+	 */
 	pmd_t pmd;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 	VM_BUG_ON(pmd_trans_huge(*pmdp));
-	pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	return pmd;
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 9f47f152b01e..7af1ecb21ccb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -625,7 +625,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 
 	pmd = pmd_offset(pud, address);
 	/*
-	 * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
 	 * without holding anon_vma lock for write.  So when looking for a
 	 * genuine pmde (in which to find pte), test present and !THP together.
 	 */
-- 
cgit v1.2.3


From fc6daaf93151877748f8096af6b3fddb147f22d6 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 24 Jun 2015 16:58:09 -0700
Subject: mm/memblock: add extra "flags" to memblock to allow selection of
 memory based on attribute

Some high end Intel Xeon systems report uncorrectable memory errors as a
recoverable machine check.  Linux has included code for some time to
process these and just signal the affected processes (or even recover
completely if the error was in a read only page that can be replaced by
reading from disk).

But we have no recovery path for errors encountered during kernel code
execution.  Except for some very specific cases were are unlikely to ever
be able to recover.

Enter memory mirroring. Actually 3rd generation of memory mirroing.

Gen1: All memory is mirrored
	Pro: No s/w enabling - h/w just gets good data from other side of the
	     mirror
	Con: Halves effective memory capacity available to OS/applications

Gen2: Partial memory mirror - just mirror memory begind some memory controllers
	Pro: Keep more of the capacity
	Con: Nightmare to enable. Have to choose between allocating from
	     mirrored memory for safety vs. NUMA local memory for performance

Gen3: Address range partial memory mirror - some mirror on each memory
      controller
	Pro: Can tune the amount of mirror and keep NUMA performance
	Con: I have to write memory management code to implement

The current plan is just to use mirrored memory for kernel allocations.
This has been broken into two phases:

1) This patch series - find the mirrored memory, use it for boot time
   allocations

2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the
   unused mirrored memory from mm/memblock.c and only give it out to
   select kernel allocations (this is still being scoped because
   page_alloc.c is scary).

This patch (of 3):

Add extra "flags" to memblock to allow selection of memory based on
attribute.  No functional changes

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/kernel/crash_dump.c |  5 ++--
 arch/sparc/mm/init_64.c       |  6 +++--
 arch/x86/kernel/check.c       |  3 ++-
 arch/x86/kernel/e820.c        |  3 ++-
 arch/x86/mm/init_32.c         |  2 +-
 include/linux/memblock.h      | 41 ++++++++++++++++++++------------
 mm/cma.c                      |  6 +++--
 mm/memblock.c                 | 55 +++++++++++++++++++++++++++----------------
 mm/memtest.c                  |  3 ++-
 mm/nobootmem.c                |  6 +++--
 10 files changed, 83 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index d9f0dcfcae5e..7a75ad4594e3 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -33,11 +33,12 @@ static struct memblock_type oldmem_type = {
 };
 
 #define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid)		\
-	for (i = 0, __next_mem_range(&i, nid, &memblock.physmem,	\
+	for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE,		\
+				     &memblock.physmem,			\
 				     &oldmem_type, p_start,		\
 				     p_end, p_nid);			\
 	     i != (u64)ULLONG_MAX;					\
-	     __next_mem_range(&i, nid, &memblock.physmem,		\
+	     __next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\
 			      &oldmem_type,				\
 			      p_start, p_end, p_nid))
 
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index c5d08b89a96c..4ac88b757514 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1966,7 +1966,8 @@ static phys_addr_t __init available_memory(void)
 	phys_addr_t pa_start, pa_end;
 	u64 i;
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL)
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+				&pa_end, NULL)
 		available = available + (pa_end  - pa_start);
 
 	return available;
@@ -1992,7 +1993,8 @@ static void __init reduce_memory(phys_addr_t limit_ram)
 	if (limit_ram >= avail_ram)
 		return;
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &pa_start, &pa_end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &pa_start,
+				&pa_end, NULL) {
 		phys_addr_t region_size = pa_end - pa_start;
 		phys_addr_t clip_start = pa_start;
 
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 83a7995625a6..58118e207a69 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,8 @@ void __init setup_bios_corruption_check(void)
 
 	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+				NULL) {
 		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
 				PAGE_SIZE, corruption_check_size);
 		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e2ce85db2283..c8dda42cb6a3 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1123,7 +1123,8 @@ void __init memblock_find_dma_reserve(void)
 		nr_pages += end_pfn - start_pfn;
 	}
 
-	for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+				NULL) {
 		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
 		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
 		if (start_pfn < end_pfn)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c8140e12816a..8340e45c891a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -433,7 +433,7 @@ void __init add_highpages_with_active_regions(int nid,
 	phys_addr_t start, end;
 	u64 i;
 
-	for_each_free_mem_range(i, nid, &start, &end, NULL) {
+	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
 		unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
 					    start_pfn, end_pfn);
 		unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9497ec7c77ea..7aeec0cb4c27 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -21,7 +21,10 @@
 #define INIT_PHYSMEM_REGIONS	4
 
 /* Definition of memblock flags. */
-#define MEMBLOCK_HOTPLUG	0x1	/* hotpluggable region */
+enum {
+	MEMBLOCK_NONE		= 0x0,	/* No special request */
+	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
+};
 
 struct memblock_region {
 	phys_addr_t base;
@@ -61,7 +64,7 @@ extern bool movable_node_enabled;
 
 phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
 					    phys_addr_t start, phys_addr_t end,
-					    int nid);
+					    int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
 phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -85,11 +88,13 @@ int memblock_remove_range(struct memblock_type *type,
 			  phys_addr_t base,
 			  phys_addr_t size);
 
-void __next_mem_range(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range(u64 *idx, int nid, ulong flags,
+		      struct memblock_type *type_a,
 		      struct memblock_type *type_b, phys_addr_t *out_start,
 		      phys_addr_t *out_end, int *out_nid);
 
-void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
+void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
+			  struct memblock_type *type_a,
 			  struct memblock_type *type_b, phys_addr_t *out_start,
 			  phys_addr_t *out_end, int *out_nid);
 
@@ -100,16 +105,17 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
  * @type_a: ptr to memblock_type to iterate
  * @type_b: ptr to memblock_type which excludes from the iteration
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
  */
-#define for_each_mem_range(i, type_a, type_b, nid,			\
+#define for_each_mem_range(i, type_a, type_b, nid, flags,		\
 			   p_start, p_end, p_nid)			\
-	for (i = 0, __next_mem_range(&i, nid, type_a, type_b,		\
+	for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b,	\
 				     p_start, p_end, p_nid);		\
 	     i != (u64)ULLONG_MAX;					\
-	     __next_mem_range(&i, nid, type_a, type_b,			\
+	     __next_mem_range(&i, nid, flags, type_a, type_b,		\
 			      p_start, p_end, p_nid))
 
 /**
@@ -119,17 +125,18 @@ void __next_mem_range_rev(u64 *idx, int nid, struct memblock_type *type_a,
  * @type_a: ptr to memblock_type to iterate
  * @type_b: ptr to memblock_type which excludes from the iteration
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
  */
-#define for_each_mem_range_rev(i, type_a, type_b, nid,			\
+#define for_each_mem_range_rev(i, type_a, type_b, nid, flags,		\
 			       p_start, p_end, p_nid)			\
 	for (i = (u64)ULLONG_MAX,					\
-		     __next_mem_range_rev(&i, nid, type_a, type_b,	\
+		     __next_mem_range_rev(&i, nid, flags, type_a, type_b,\
 					 p_start, p_end, p_nid);	\
 	     i != (u64)ULLONG_MAX;					\
-	     __next_mem_range_rev(&i, nid, type_a, type_b,		\
+	     __next_mem_range_rev(&i, nid, flags, type_a, type_b,	\
 				  p_start, p_end, p_nid))
 
 #ifdef CONFIG_MOVABLE_NODE
@@ -181,13 +188,14 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flags: pick from blocks based on memory attributes
  *
  * Walks over free (memory && !reserved) areas of memblock.  Available as
  * soon as memblock is initialized.
  */
-#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid)		\
+#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid)	\
 	for_each_mem_range(i, &memblock.memory, &memblock.reserved,	\
-			   nid, p_start, p_end, p_nid)
+			   nid, flags, p_start, p_end, p_nid)
 
 /**
  * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
@@ -196,13 +204,15 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @flags: pick from blocks based on memory attributes
  *
  * Walks over free (memory && !reserved) areas of memblock in reverse
  * order.  Available as soon as memblock is initialized.
  */
-#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid)	\
+#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end,	\
+					p_nid)				\
 	for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved,	\
-			       nid, p_start, p_end, p_nid)
+			       nid, flags, p_start, p_end, p_nid)
 
 static inline void memblock_set_region_flags(struct memblock_region *r,
 					     unsigned long flags)
@@ -273,7 +283,8 @@ static inline bool memblock_bottom_up(void) { return false; }
 #define MEMBLOCK_ALLOC_ACCESSIBLE	0
 
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
-					phys_addr_t start, phys_addr_t end);
+					phys_addr_t start, phys_addr_t end,
+					ulong flags);
 phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
 				phys_addr_t max_addr);
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
diff --git a/mm/cma.c b/mm/cma.c
index 661278025c46..e7d1db533025 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base,
 		 */
 		if (base < highmem_start && limit > highmem_start) {
 			addr = memblock_alloc_range(size, alignment,
-						    highmem_start, limit);
+						    highmem_start, limit,
+						    MEMBLOCK_NONE);
 			limit = highmem_start;
 		}
 
 		if (!addr) {
 			addr = memblock_alloc_range(size, alignment, base,
-						    limit);
+						    limit,
+						    MEMBLOCK_NONE);
 			if (!addr) {
 				ret = -ENOMEM;
 				goto err;
diff --git a/mm/memblock.c b/mm/memblock.c
index 9318b567ed79..b9ff2f4f0285 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -107,6 +107,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
  *
  * Utility called from memblock_find_in_range_node(), find free area bottom-up.
  *
@@ -115,12 +116,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
  */
 static phys_addr_t __init_memblock
 __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
-				phys_addr_t size, phys_addr_t align, int nid)
+				phys_addr_t size, phys_addr_t align, int nid,
+				ulong flags)
 {
 	phys_addr_t this_start, this_end, cand;
 	u64 i;
 
-	for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+	for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
 		this_start = clamp(this_start, start, end);
 		this_end = clamp(this_end, start, end);
 
@@ -139,6 +141,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
  * @size: size of free area to find
  * @align: alignment of free area to find
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
  *
  * Utility called from memblock_find_in_range_node(), find free area top-down.
  *
@@ -147,12 +150,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
  */
 static phys_addr_t __init_memblock
 __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
-			       phys_addr_t size, phys_addr_t align, int nid)
+			       phys_addr_t size, phys_addr_t align, int nid,
+			       ulong flags)
 {
 	phys_addr_t this_start, this_end, cand;
 	u64 i;
 
-	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+	for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
+					NULL) {
 		this_start = clamp(this_start, start, end);
 		this_end = clamp(this_end, start, end);
 
@@ -174,6 +179,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  * @start: start of candidate range
  * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
  * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
  *
  * Find @size free area aligned to @align in the specified range and node.
  *
@@ -190,7 +196,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
  */
 phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
-					phys_addr_t end, int nid)
+					phys_addr_t end, int nid, ulong flags)
 {
 	phys_addr_t kernel_end, ret;
 
@@ -215,7 +221,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 
 		/* ok, try bottom-up allocation first */
 		ret = __memblock_find_range_bottom_up(bottom_up_start, end,
-						      size, align, nid);
+						      size, align, nid, flags);
 		if (ret)
 			return ret;
 
@@ -233,7 +239,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 			     "memory hotunplug may be affected\n");
 	}
 
-	return __memblock_find_range_top_down(start, end, size, align, nid);
+	return __memblock_find_range_top_down(start, end, size, align, nid,
+					      flags);
 }
 
 /**
@@ -253,7 +260,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					phys_addr_t align)
 {
 	return memblock_find_in_range_node(size, align, start, end,
-					    NUMA_NO_NODE);
+					    NUMA_NO_NODE, MEMBLOCK_NONE);
 }
 
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -782,6 +789,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
  * __next__mem_range - next function for for_each_free_mem_range() etc.
  * @idx: pointer to u64 loop variable
  * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @type_a: pointer to memblock_type from where the range is taken
  * @type_b: pointer to memblock_type which excludes memory from being taken
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -803,7 +811,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
  * As both region arrays are sorted, the function advances the two indices
  * in lockstep and returns each intersection.
  */
-void __init_memblock __next_mem_range(u64 *idx, int nid,
+void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
 				      struct memblock_type *type_a,
 				      struct memblock_type *type_b,
 				      phys_addr_t *out_start,
@@ -895,6 +903,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
  *
  * @idx: pointer to u64 loop variable
  * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
  * @type_a: pointer to memblock_type from where the range is taken
  * @type_b: pointer to memblock_type which excludes memory from being taken
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -903,7 +912,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
  *
  * Reverse of __next_mem_range().
  */
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 					  struct memblock_type *type_a,
 					  struct memblock_type *type_b,
 					  phys_addr_t *out_start,
@@ -1050,14 +1059,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
 
 static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
-					phys_addr_t end, int nid)
+					phys_addr_t end, int nid, ulong flags)
 {
 	phys_addr_t found;
 
 	if (!align)
 		align = SMP_CACHE_BYTES;
 
-	found = memblock_find_in_range_node(size, align, start, end, nid);
+	found = memblock_find_in_range_node(size, align, start, end, nid,
+					    flags);
 	if (found && !memblock_reserve(found, size)) {
 		/*
 		 * The min_count is set to 0 so that memblock allocations are
@@ -1070,26 +1080,30 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
 }
 
 phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
-					phys_addr_t start, phys_addr_t end)
+					phys_addr_t start, phys_addr_t end,
+					ulong flags)
 {
-	return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+	return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+					flags);
 }
 
 static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
 					phys_addr_t align, phys_addr_t max_addr,
-					int nid)
+					int nid, ulong flags)
 {
-	return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+	return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
 }
 
 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
-	return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+	return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
+				       nid, MEMBLOCK_NONE);
 }
 
 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
 {
-	return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
+	return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
+				       MEMBLOCK_NONE);
 }
 
 phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1173,13 +1187,14 @@ static void * __init memblock_virt_alloc_internal(
 
 again:
 	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
-					    nid);
+					    nid, MEMBLOCK_NONE);
 	if (alloc)
 		goto done;
 
 	if (nid != NUMA_NO_NODE) {
 		alloc = memblock_find_in_range_node(size, align, min_addr,
-						    max_addr,  NUMA_NO_NODE);
+						    max_addr, NUMA_NO_NODE,
+						    MEMBLOCK_NONE);
 		if (alloc)
 			goto done;
 	}
diff --git a/mm/memtest.c b/mm/memtest.c
index 1997d934b13b..0a1cc133f6d7 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -74,7 +74,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
 	u64 i;
 	phys_addr_t this_start, this_end;
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
+				&this_end, NULL) {
 		this_start = clamp(this_start, start, end);
 		this_end = clamp(this_end, start, end);
 		if (this_start < this_end) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 90b50468333e..ad3641dcdbe7 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,7 +41,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 	if (limit > memblock.current_limit)
 		limit = memblock.current_limit;
 
-	addr = memblock_find_in_range_node(size, align, goal, limit, nid);
+	addr = memblock_find_in_range_node(size, align, goal, limit, nid,
+					   MEMBLOCK_NONE);
 	if (!addr)
 		return NULL;
 
@@ -121,7 +122,8 @@ static unsigned long __init free_low_memory_core_early(void)
 
 	memblock_clear_hotplug(0, -1);
 
-	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
+	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+				NULL)
 		count += __free_memory_core(start, end);
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-- 
cgit v1.2.3


From a3f5bafcc04aaf62990e0cf3ced1cc6d8dc6fe95 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 24 Jun 2015 16:58:12 -0700
Subject: mm/memblock: allocate boot time data structures from mirrored memory

Try to allocate all boot time kernel data structures from mirrored
memory.

If we run out of mirrored memory print warnings, but fall back to using
non-mirrored memory to make sure that we still boot.

By number of bytes, most of what we allocate at boot time is the page
structures.  64 bytes per 4K page on x86_64 ...  or about 1.5% of total
system memory.  For workloads where the bulk of memory is allocated to
applications this may represent a useful improvement to system
availability since 1.5% of total memory might be a third of the memory
allocated to the kernel.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h |  8 +++++
 mm/memblock.c            | 78 +++++++++++++++++++++++++++++++++++++++++-------
 mm/nobootmem.c           | 10 ++++++-
 3 files changed, 84 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 7aeec0cb4c27..0215ffd63069 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -24,6 +24,7 @@
 enum {
 	MEMBLOCK_NONE		= 0x0,	/* No special request */
 	MEMBLOCK_HOTPLUG	= 0x1,	/* hotpluggable region */
+	MEMBLOCK_MIRROR		= 0x2,	/* mirrored region */
 };
 
 struct memblock_region {
@@ -78,6 +79,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
 void memblock_trim_memory(phys_addr_t align);
 int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
 int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
+int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
+ulong choose_memblock_flags(void);
 
 /* Low level functions */
 int memblock_add_range(struct memblock_type *type,
@@ -160,6 +163,11 @@ static inline bool movable_node_is_enabled(void)
 }
 #endif
 
+static inline bool memblock_is_mirror(struct memblock_region *m)
+{
+	return m->flags & MEMBLOCK_MIRROR;
+}
+
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
 			    unsigned long  *end_pfn);
diff --git a/mm/memblock.c b/mm/memblock.c
index b9ff2f4f0285..1b444c730846 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock;
 #ifdef CONFIG_MOVABLE_NODE
 bool movable_node_enabled __initdata_memblock = false;
 #endif
+static bool system_has_some_mirror __initdata_memblock = false;
 static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
 static int memblock_reserved_in_slab __initdata_memblock = 0;
 
+ulong __init_memblock choose_memblock_flags(void)
+{
+	return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
+}
+
 /* inline so we don't get a warning when pr_debug is compiled out */
 static __init_memblock const char *
 memblock_type_name(struct memblock_type *type)
@@ -259,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
 					phys_addr_t end, phys_addr_t size,
 					phys_addr_t align)
 {
-	return memblock_find_in_range_node(size, align, start, end,
-					    NUMA_NO_NODE, MEMBLOCK_NONE);
+	phys_addr_t ret;
+	ulong flags = choose_memblock_flags();
+
+again:
+	ret = memblock_find_in_range_node(size, align, start, end,
+					    NUMA_NO_NODE, flags);
+
+	if (!ret && (flags & MEMBLOCK_MIRROR)) {
+		pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+			&size);
+		flags &= ~MEMBLOCK_MIRROR;
+		goto again;
+	}
+
+	return ret;
 }
 
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -785,6 +804,21 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
 	return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG);
 }
 
+/**
+ * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
+{
+	system_has_some_mirror = true;
+
+	return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
+}
+
+
 /**
  * __next__mem_range - next function for for_each_free_mem_range() etc.
  * @idx: pointer to u64 loop variable
@@ -839,6 +873,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
 		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
 			continue;
 
+		/* if we want mirror memory skip non-mirror memory regions */
+		if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+			continue;
+
 		if (!type_b) {
 			if (out_start)
 				*out_start = m_start;
@@ -944,6 +982,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 		if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
 			continue;
 
+		/* if we want mirror memory skip non-mirror memory regions */
+		if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+			continue;
+
 		if (!type_b) {
 			if (out_start)
 				*out_start = m_start;
@@ -1096,8 +1138,18 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
 
 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
-	return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
-				       nid, MEMBLOCK_NONE);
+	ulong flags = choose_memblock_flags();
+	phys_addr_t ret;
+
+again:
+	ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
+				      nid, flags);
+
+	if (!ret && (flags & MEMBLOCK_MIRROR)) {
+		flags &= ~MEMBLOCK_MIRROR;
+		goto again;
+	}
+	return ret;
 }
 
 phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1167,6 +1219,7 @@ static void * __init memblock_virt_alloc_internal(
 {
 	phys_addr_t alloc;
 	void *ptr;
+	ulong flags = choose_memblock_flags();
 
 	if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
 		nid = NUMA_NO_NODE;
@@ -1187,14 +1240,14 @@ static void * __init memblock_virt_alloc_internal(
 
 again:
 	alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
-					    nid, MEMBLOCK_NONE);
+					    nid, flags);
 	if (alloc)
 		goto done;
 
 	if (nid != NUMA_NO_NODE) {
 		alloc = memblock_find_in_range_node(size, align, min_addr,
 						    max_addr, NUMA_NO_NODE,
-						    MEMBLOCK_NONE);
+						    flags);
 		if (alloc)
 			goto done;
 	}
@@ -1202,10 +1255,16 @@ again:
 	if (min_addr) {
 		min_addr = 0;
 		goto again;
-	} else {
-		goto error;
 	}
 
+	if (flags & MEMBLOCK_MIRROR) {
+		flags &= ~MEMBLOCK_MIRROR;
+		pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+			&size);
+		goto again;
+	}
+
+	return NULL;
 done:
 	memblock_reserve(alloc, size);
 	ptr = phys_to_virt(alloc);
@@ -1220,9 +1279,6 @@ done:
 	kmemleak_alloc(ptr, size, 0, 0);
 
 	return ptr;
-
-error:
-	return NULL;
 }
 
 /**
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index ad3641dcdbe7..5258386fa1be 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -37,12 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
 {
 	void *ptr;
 	u64 addr;
+	ulong flags = choose_memblock_flags();
 
 	if (limit > memblock.current_limit)
 		limit = memblock.current_limit;
 
+again:
 	addr = memblock_find_in_range_node(size, align, goal, limit, nid,
-					   MEMBLOCK_NONE);
+					   flags);
+	if (!addr && (flags & MEMBLOCK_MIRROR)) {
+		flags &= ~MEMBLOCK_MIRROR;
+		pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+			&size);
+		goto again;
+	}
 	if (!addr)
 		return NULL;
 
-- 
cgit v1.2.3


From b05b9f5f9dcf593a0e9327676b78e6c17b4218e8 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 24 Jun 2015 16:58:15 -0700
Subject: x86, mirror: x86 enabling - find mirrored memory ranges

UEFI GetMemoryMap() uses a new attribute bit to mark mirrored memory
address ranges.  See UEFI 2.5 spec pages 157-158:

  http://www.uefi.org/sites/default/files/resources/UEFI%202_5.pdf

On EFI enabled systems scan the memory map and tell memblock about any
mirrored ranges.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/setup.c     |  3 +++
 arch/x86/platform/efi/efi.c | 21 +++++++++++++++++++++
 include/linux/efi.h         |  3 +++
 3 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 39ca113676fe..d3b95b89e9b2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1105,6 +1105,9 @@ void __init setup_arch(char **cmdline_p)
 	memblock_set_current_limit(ISA_END_ADDRESS);
 	memblock_x86_fill();
 
+	if (efi_enabled(EFI_BOOT))
+		efi_find_mirror();
+
 	/*
 	 * The EFI specification says that boot service code won't be called
 	 * after ExitBootServices(). This is, in fact, a lie.
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 3b984c3aa1b0..c1c382c58c60 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -117,6 +117,27 @@ void efi_get_time(struct timespec *now)
 	now->tv_nsec = 0;
 }
 
+void __init efi_find_mirror(void)
+{
+	void *p;
+	u64 mirror_size = 0, total_size = 0;
+
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		efi_memory_desc_t *md = p;
+		unsigned long long start = md->phys_addr;
+		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+		total_size += size;
+		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+			memblock_mark_mirror(start, size);
+			mirror_size += size;
+		}
+	}
+	if (mirror_size)
+		pr_info("Memory: %lldM/%lldM mirrored memory\n",
+			mirror_size>>20, total_size>>20);
+}
+
 /*
  * Tell the kernel about the EFI memory map.  This might include
  * more than the max 128 entries that can fit in the e820 legacy
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 2092965afca3..5f19efe4eb3f 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -96,6 +96,8 @@ typedef	struct {
 #define EFI_MEMORY_WP		((u64)0x0000000000001000ULL)	/* write-protect */
 #define EFI_MEMORY_RP		((u64)0x0000000000002000ULL)	/* read-protect */
 #define EFI_MEMORY_XP		((u64)0x0000000000004000ULL)	/* execute-protect */
+#define EFI_MEMORY_MORE_RELIABLE \
+				((u64)0x0000000000010000ULL)	/* higher reliability */
 #define EFI_MEMORY_RUNTIME	((u64)0x8000000000000000ULL)	/* range requires runtime mapping */
 #define EFI_MEMORY_DESCRIPTOR_VERSION	1
 
@@ -868,6 +870,7 @@ extern void efi_enter_virtual_mode (void);	/* switch EFI to virtual mode, if pos
 extern void efi_late_init(void);
 extern void efi_free_boot_services(void);
 extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size);
+extern void efi_find_mirror(void);
 #else
 static inline void efi_late_init(void) {}
 static inline void efi_free_boot_services(void) {}
-- 
cgit v1.2.3


From d1dc6f1bcf1e998e7ce65fc120da371ab047a999 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Wed, 24 Jun 2015 16:58:18 -0700
Subject: frontswap: allow multiple backends

Change frontswap single pointer to a singly linked list of frontswap
implementations.  Update Xen tmem implementation as register no longer
returns anything.

Frontswap only keeps track of a single implementation; any
implementation that registers second (or later) will replace the
previously registered implementation, and gets a pointer to the previous
implementation that the new implementation is expected to pass all
frontswap functions to if it can't handle the function itself.  However
that method doesn't really make much sense, as passing that work on to
every implementation adds unnecessary work to implementations; instead,
frontswap should simply keep a list of all registered implementations
and try each implementation for any function.  Most importantly, neither
of the two currently existing frontswap implementations in the kernel
actually do anything with any previous frontswap implementation that
they replace when registering.

This allows frontswap to successfully manage multiple implementations by
keeping a list of them all.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/xen/tmem.c        |   8 +-
 include/linux/frontswap.h |  14 +--
 mm/frontswap.c            | 215 ++++++++++++++++++++++++++++------------------
 3 files changed, 139 insertions(+), 98 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index c4211a31612d..d88f36754bf7 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -381,15 +381,9 @@ static int __init xen_tmem_init(void)
 #ifdef CONFIG_FRONTSWAP
 	if (tmem_enabled && frontswap) {
 		char *s = "";
-		struct frontswap_ops *old_ops;
 
 		tmem_frontswap_poolid = -1;
-		old_ops = frontswap_register_ops(&tmem_frontswap_ops);
-		if (IS_ERR(old_ops) || old_ops) {
-			if (IS_ERR(old_ops))
-				return PTR_ERR(old_ops);
-			s = " (WARNING: frontswap_ops overridden)";
-		}
+		frontswap_register_ops(&tmem_frontswap_ops);
 		pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n",
 			s);
 	}
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index 8293262401de..e65ef959546c 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -6,16 +6,16 @@
 #include <linux/bitops.h>
 
 struct frontswap_ops {
-	void (*init)(unsigned);
-	int (*store)(unsigned, pgoff_t, struct page *);
-	int (*load)(unsigned, pgoff_t, struct page *);
-	void (*invalidate_page)(unsigned, pgoff_t);
-	void (*invalidate_area)(unsigned);
+	void (*init)(unsigned); /* this swap type was just swapon'ed */
+	int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
+	int (*load)(unsigned, pgoff_t, struct page *); /* load a page */
+	void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
+	void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
+	struct frontswap_ops *next; /* private pointer to next ops */
 };
 
 extern bool frontswap_enabled;
-extern struct frontswap_ops *
-	frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_register_ops(struct frontswap_ops *ops);
 extern void frontswap_shrink(unsigned long);
 extern unsigned long frontswap_curr_pages(void);
 extern void frontswap_writethrough(bool);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 8d82809eb085..27a9924caf61 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -21,11 +21,16 @@
 #include <linux/swapfile.h>
 
 /*
- * frontswap_ops is set by frontswap_register_ops to contain the pointers
- * to the frontswap "backend" implementation functions.
+ * frontswap_ops are added by frontswap_register_ops, and provide the
+ * frontswap "backend" implementation functions.  Multiple implementations
+ * may be registered, but implementations can never deregister.  This
+ * is a simple singly-linked list of all registered implementations.
  */
 static struct frontswap_ops *frontswap_ops __read_mostly;
 
+#define for_each_frontswap_ops(ops)		\
+	for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
+
 /*
  * If enabled, frontswap_store will return failure even on success.  As
  * a result, the swap subsystem will always write the page to swap, in
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { }
  * on all frontswap functions to not call the backend until the backend
  * has registered.
  *
- * Specifically when no backend is registered (nobody called
- * frontswap_register_ops) all calls to frontswap_init (which is done via
- * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
- * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
- * backend registers with frontswap at some later point the previous
- * calls to frontswap_init are executed (by iterating over the need_init
- * bitmap) to create tmem_pools and set the respective poolids. All of that is
- * guarded by us using atomic bit operations on the 'need_init' bitmap.
- *
  * This would not guards us against the user deciding to call swapoff right as
  * we are calling the backend to initialize (so swapon is in action).
  * Fortunatly for us, the swapon_mutex has been taked by the callee so we are
@@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { }
  *
  * Obviously the opposite (unloading the backend) must be done after all
  * the frontswap_[store|load|invalidate_area|invalidate_page] start
- * ignorning or failing the requests - at which point frontswap_ops
- * would have to be made in some fashion atomic.
+ * ignoring or failing the requests.  However, there is currently no way
+ * to unload a backend once it is registered.
  */
-static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
 
 /*
- * Register operations for frontswap, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for frontswap
  */
-struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
+void frontswap_register_ops(struct frontswap_ops *ops)
 {
-	struct frontswap_ops *old = frontswap_ops;
-	int i;
-
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		if (test_and_clear_bit(i, need_init)) {
-			struct swap_info_struct *sis = swap_info[i];
-			/* __frontswap_init _should_ have set it! */
-			if (!sis->frontswap_map)
-				return ERR_PTR(-EINVAL);
-			ops->init(i);
-		}
+	DECLARE_BITMAP(a, MAX_SWAPFILES);
+	DECLARE_BITMAP(b, MAX_SWAPFILES);
+	struct swap_info_struct *si;
+	unsigned int i;
+
+	bitmap_zero(a, MAX_SWAPFILES);
+	bitmap_zero(b, MAX_SWAPFILES);
+
+	spin_lock(&swap_lock);
+	plist_for_each_entry(si, &swap_active_head, list) {
+		if (!WARN_ON(!si->frontswap_map))
+			set_bit(si->type, a);
 	}
+	spin_unlock(&swap_lock);
+
+	/* the new ops needs to know the currently active swap devices */
+	for_each_set_bit(i, a, MAX_SWAPFILES)
+		ops->init(i);
+
 	/*
-	 * We MUST have frontswap_ops set _after_ the frontswap_init's
-	 * have been called. Otherwise __frontswap_store might fail. Hence
-	 * the barrier to make sure compiler does not re-order us.
+	 * Setting frontswap_ops must happen after the ops->init() calls
+	 * above; cmpxchg implies smp_mb() which will ensure the init is
+	 * complete at this point.
 	 */
-	barrier();
-	frontswap_ops = ops;
-	return old;
+	do {
+		ops->next = frontswap_ops;
+	} while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+
+	spin_lock(&swap_lock);
+	plist_for_each_entry(si, &swap_active_head, list) {
+		if (si->frontswap_map)
+			set_bit(si->type, b);
+	}
+	spin_unlock(&swap_lock);
+
+	/*
+	 * On the very unlikely chance that a swap device was added or
+	 * removed between setting the "a" list bits and the ops init
+	 * calls, we re-check and do init or invalidate for any changed
+	 * bits.
+	 */
+	if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
+		for (i = 0; i < MAX_SWAPFILES; i++) {
+			if (!test_bit(i, a) && test_bit(i, b))
+				ops->init(i);
+			else if (test_bit(i, a) && !test_bit(i, b))
+				ops->invalidate_area(i);
+		}
+	}
 }
 EXPORT_SYMBOL(frontswap_register_ops);
 
@@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
 void __frontswap_init(unsigned type, unsigned long *map)
 {
 	struct swap_info_struct *sis = swap_info[type];
+	struct frontswap_ops *ops;
 
 	BUG_ON(sis == NULL);
 
@@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map)
 	 * p->frontswap set to something valid to work properly.
 	 */
 	frontswap_map_set(sis, map);
-	if (frontswap_ops)
-		frontswap_ops->init(type);
-	else {
-		BUG_ON(type >= MAX_SWAPFILES);
-		set_bit(type, need_init);
-	}
+
+	for_each_frontswap_ops(ops)
+		ops->init(type);
 }
 EXPORT_SYMBOL(__frontswap_init);
 
 bool __frontswap_test(struct swap_info_struct *sis,
 				pgoff_t offset)
 {
-	bool ret = false;
-
-	if (frontswap_ops && sis->frontswap_map)
-		ret = test_bit(offset, sis->frontswap_map);
-	return ret;
+	if (sis->frontswap_map)
+		return test_bit(offset, sis->frontswap_map);
+	return false;
 }
 EXPORT_SYMBOL(__frontswap_test);
 
+static inline void __frontswap_set(struct swap_info_struct *sis,
+				   pgoff_t offset)
+{
+	set_bit(offset, sis->frontswap_map);
+	atomic_inc(&sis->frontswap_pages);
+}
+
 static inline void __frontswap_clear(struct swap_info_struct *sis,
-				pgoff_t offset)
+				     pgoff_t offset)
 {
 	clear_bit(offset, sis->frontswap_map);
 	atomic_dec(&sis->frontswap_pages);
@@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis,
  */
 int __frontswap_store(struct page *page)
 {
-	int ret = -1, dup = 0;
+	int ret = -1;
 	swp_entry_t entry = { .val = page_private(page), };
 	int type = swp_type(entry);
 	struct swap_info_struct *sis = swap_info[type];
 	pgoff_t offset = swp_offset(entry);
+	struct frontswap_ops *ops;
 
 	/*
 	 * Return if no backend registed.
 	 * Don't need to inc frontswap_failed_stores here.
 	 */
 	if (!frontswap_ops)
-		return ret;
+		return -1;
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(sis == NULL);
-	if (__frontswap_test(sis, offset))
-		dup = 1;
-	ret = frontswap_ops->store(type, offset, page);
+
+	/*
+	 * If a dup, we must remove the old page first; we can't leave the
+	 * old page no matter if the store of the new page succeeds or fails,
+	 * and we can't rely on the new page replacing the old page as we may
+	 * not store to the same implementation that contains the old page.
+	 */
+	if (__frontswap_test(sis, offset)) {
+		__frontswap_clear(sis, offset);
+		for_each_frontswap_ops(ops)
+			ops->invalidate_page(type, offset);
+	}
+
+	/* Try to store in each implementation, until one succeeds. */
+	for_each_frontswap_ops(ops) {
+		ret = ops->store(type, offset, page);
+		if (!ret) /* successful store */
+			break;
+	}
 	if (ret == 0) {
-		set_bit(offset, sis->frontswap_map);
+		__frontswap_set(sis, offset);
 		inc_frontswap_succ_stores();
-		if (!dup)
-			atomic_inc(&sis->frontswap_pages);
 	} else {
-		/*
-		  failed dup always results in automatic invalidate of
-		  the (older) page from frontswap
-		 */
 		inc_frontswap_failed_stores();
-		if (dup) {
-			__frontswap_clear(sis, offset);
-			frontswap_ops->invalidate_page(type, offset);
-		}
 	}
 	if (frontswap_writethrough_enabled)
 		/* report failure so swap also writes to swap device */
@@ -268,14 +301,22 @@ int __frontswap_load(struct page *page)
 	int type = swp_type(entry);
 	struct swap_info_struct *sis = swap_info[type];
 	pgoff_t offset = swp_offset(entry);
+	struct frontswap_ops *ops;
+
+	if (!frontswap_ops)
+		return -1;
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(sis == NULL);
-	/*
-	 * __frontswap_test() will check whether there is backend registered
-	 */
-	if (__frontswap_test(sis, offset))
-		ret = frontswap_ops->load(type, offset, page);
+	if (!__frontswap_test(sis, offset))
+		return -1;
+
+	/* Try loading from each implementation, until one succeeds. */
+	for_each_frontswap_ops(ops) {
+		ret = ops->load(type, offset, page);
+		if (!ret) /* successful load */
+			break;
+	}
 	if (ret == 0) {
 		inc_frontswap_loads();
 		if (frontswap_tmem_exclusive_gets_enabled) {
@@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load);
 void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
 {
 	struct swap_info_struct *sis = swap_info[type];
+	struct frontswap_ops *ops;
+
+	if (!frontswap_ops)
+		return;
 
 	BUG_ON(sis == NULL);
-	/*
-	 * __frontswap_test() will check whether there is backend registered
-	 */
-	if (__frontswap_test(sis, offset)) {
-		frontswap_ops->invalidate_page(type, offset);
-		__frontswap_clear(sis, offset);
-		inc_frontswap_invalidates();
-	}
+	if (!__frontswap_test(sis, offset))
+		return;
+
+	for_each_frontswap_ops(ops)
+		ops->invalidate_page(type, offset);
+	__frontswap_clear(sis, offset);
+	inc_frontswap_invalidates();
 }
 EXPORT_SYMBOL(__frontswap_invalidate_page);
 
@@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
 void __frontswap_invalidate_area(unsigned type)
 {
 	struct swap_info_struct *sis = swap_info[type];
+	struct frontswap_ops *ops;
 
-	if (frontswap_ops) {
-		BUG_ON(sis == NULL);
-		if (sis->frontswap_map == NULL)
-			return;
-		frontswap_ops->invalidate_area(type);
-		atomic_set(&sis->frontswap_pages, 0);
-		bitmap_zero(sis->frontswap_map, sis->max);
-	}
-	clear_bit(type, need_init);
+	if (!frontswap_ops)
+		return;
+
+	BUG_ON(sis == NULL);
+	if (sis->frontswap_map == NULL)
+		return;
+
+	for_each_frontswap_ops(ops)
+		ops->invalidate_area(type);
+	atomic_set(&sis->frontswap_pages, 0);
+	bitmap_zero(sis->frontswap_map, sis->max);
 }
 EXPORT_SYMBOL(__frontswap_invalidate_area);
 
-- 
cgit v1.2.3


From 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Wed, 24 Jun 2015 16:58:51 -0700
Subject: mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc()

Beginning at commit d52d3997f843 ("ipv6: Create percpu rt6_info"), the
following INFO splat is logged:

  ===============================
  [ INFO: suspicious RCU usage. ]
  4.1.0-rc7-next-20150612 #1 Not tainted
  -------------------------------
  kernel/sched/core.c:7318 Illegal context switch in RCU-bh read-side critical section!
  other info that might help us debug this:
  rcu_scheduler_active = 1, debug_locks = 0
   3 locks held by systemd/1:
   #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff815f0c8f>] rtnetlink_rcv+0x1f/0x40
   #1:  (rcu_read_lock_bh){......}, at: [<ffffffff816a34e2>] ipv6_add_addr+0x62/0x540
   #2:  (addrconf_hash_lock){+...+.}, at: [<ffffffff816a3604>] ipv6_add_addr+0x184/0x540
  stack backtrace:
  CPU: 0 PID: 1 Comm: systemd Not tainted 4.1.0-rc7-next-20150612 #1
  Hardware name: TOSHIBA TECRA A50-A/TECRA A50-A, BIOS Version 4.20   04/17/2014
  Call Trace:
    dump_stack+0x4c/0x6e
    lockdep_rcu_suspicious+0xe7/0x120
    ___might_sleep+0x1d5/0x1f0
    __might_sleep+0x4d/0x90
    kmem_cache_alloc+0x47/0x250
    create_object+0x39/0x2e0
    kmemleak_alloc_percpu+0x61/0xe0
    pcpu_alloc+0x370/0x630

Additional backtrace lines are truncated.  In addition, the above splat
is followed by several "BUG: sleeping function called from invalid
context at mm/slub.c:1268" outputs.  As suggested by Martin KaFai Lau,
these are the clue to the fix.  Routine kmemleak_alloc_percpu() always
uses GFP_KERNEL for its allocations, whereas it should follow the gfp
from its callers.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: <stable@vger.kernel.org>	[3.18+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmemleak.h | 6 ++++--
 mm/kmemleak.c            | 9 +++++----
 mm/percpu.c              | 2 +-
 3 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index e705467ddb47..d0a1f99e24e3 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -28,7 +28,8 @@
 extern void kmemleak_init(void) __ref;
 extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
 			   gfp_t gfp) __ref;
-extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref;
+extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+				  gfp_t gfp) __ref;
 extern void kmemleak_free(const void *ptr) __ref;
 extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
 extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
@@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
 					    gfp_t gfp)
 {
 }
-static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+					 gfp_t gfp)
 {
 }
 static inline void kmemleak_free(const void *ptr)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index ca9e5a5969a8..cf79f110157c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -930,12 +930,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
  * kmemleak_alloc_percpu - register a newly allocated __percpu object
  * @ptr:	__percpu pointer to beginning of the object
  * @size:	size of the object
+ * @gfp:	flags used for kmemleak internal memory allocations
  *
  * This function is called from the kernel percpu allocator when a new object
- * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL
- * allocation.
+ * (memory block) is allocated (alloc_percpu).
  */
-void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+				 gfp_t gfp)
 {
 	unsigned int cpu;
 
@@ -948,7 +949,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
 	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
 		for_each_possible_cpu(cpu)
 			create_object((unsigned long)per_cpu_ptr(ptr, cpu),
-				      size, 0, GFP_KERNEL);
+				      size, 0, gfp);
 	else if (kmemleak_early_log)
 		log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
 }
diff --git a/mm/percpu.c b/mm/percpu.c
index dfd02484e8de..2dd74487a0af 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1030,7 +1030,7 @@ area_found:
 		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
 
 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
-	kmemleak_alloc_percpu(ptr, size);
+	kmemleak_alloc_percpu(ptr, size, gfp);
 	return ptr;
 
 fail_unlock:
-- 
cgit v1.2.3


From b94d5230d06eb930be82e67fb1a9a58271e78297 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 19 May 2015 22:54:31 -0400
Subject: libnvdimm, nfit: initial libnvdimm infrastructure and NFIT support

A struct nvdimm_bus is the anchor device for registering nvdimm
resources and interfaces, for example, a character control device,
nvdimm devices, and I/O region devices.  The ACPI NFIT (NVDIMM Firmware
Interface Table) is one possible platform description for such
non-volatile memory resources in a system.  The nfit.ko driver attaches
to the "ACPI0012" device that indicates the presence of the NFIT and
parses the table to register a struct nvdimm_bus instance.

Cc: <linux-acpi@vger.kernel.org>
Cc: Lv Zheng <lv.zheng@intel.com>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/Kconfig           |   2 +
 drivers/Makefile          |   1 +
 drivers/acpi/Kconfig      |  14 ++
 drivers/acpi/Makefile     |   1 +
 drivers/acpi/nfit.c       | 481 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/nfit.h       |  90 +++++++++
 drivers/nvdimm/Kconfig    |  15 ++
 drivers/nvdimm/Makefile   |   3 +
 drivers/nvdimm/core.c     |  69 +++++++
 drivers/nvdimm/nd-core.h  |  23 +++
 include/linux/libnvdimm.h |  34 ++++
 11 files changed, 733 insertions(+)
 create mode 100644 drivers/acpi/nfit.c
 create mode 100644 drivers/acpi/nfit.h
 create mode 100644 drivers/nvdimm/Kconfig
 create mode 100644 drivers/nvdimm/Makefile
 create mode 100644 drivers/nvdimm/core.c
 create mode 100644 drivers/nvdimm/nd-core.h
 create mode 100644 include/linux/libnvdimm.h

(limited to 'include/linux')

diff --git a/drivers/Kconfig b/drivers/Kconfig
index c0cc96bab9e7..6e973b8e3a3b 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -182,4 +182,6 @@ source "drivers/thunderbolt/Kconfig"
 
 source "drivers/android/Kconfig"
 
+source "drivers/nvdimm/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 46d2554be404..692adf659028 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_FB_INTEL)          += video/fbdev/intelfb/
 
 obj-$(CONFIG_PARPORT)		+= parport/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
+obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-y				+= macintosh/
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index ab2cbb51c6aa..300b4ef3712b 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -383,6 +383,20 @@ config ACPI_REDUCED_HARDWARE_ONLY
 
 	  If you are unsure what to do, do not enable this option.
 
+config ACPI_NFIT
+	tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
+	depends on PHYS_ADDR_T_64BIT
+	depends on BLK_DEV
+	select LIBNVDIMM
+	help
+	  Infrastructure to probe ACPI 6 compliant platforms for
+	  NVDIMMs (NFIT) and register a libnvdimm device tree.  In
+	  addition to storage devices this also enables libnvdimm to pass
+	  ACPI._DSM messages for platform/dimm configuration.
+
+	  To compile this driver as a module, choose M here:
+	  the module will be called nfit.
+
 source "drivers/acpi/apei/Kconfig"
 
 config ACPI_EXTLOG
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 8a063e276530..f7e9c92ccdcb 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_ACPI_PCI_SLOT)	+= pci_slot.o
 obj-$(CONFIG_ACPI_PROCESSOR)	+= processor.o
 obj-y				+= container.o
 obj-$(CONFIG_ACPI_THERMAL)	+= thermal.o
+obj-$(CONFIG_ACPI_NFIT)		+= nfit.o
 obj-y				+= acpi_memhotplug.o
 obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o
 obj-$(CONFIG_ACPI_BATTERY)	+= battery.o
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
new file mode 100644
index 000000000000..9f9a20bae0ac
--- /dev/null
+++ b/drivers/acpi/nfit.c
@@ -0,0 +1,481 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/list_sort.h>
+#include <linux/libnvdimm.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/acpi.h>
+#include "nfit.h"
+
+static u8 nfit_uuid[NFIT_UUID_MAX][16];
+
+static const u8 *to_nfit_uuid(enum nfit_uuids id)
+{
+	return nfit_uuid[id];
+}
+
+static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
+		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+		unsigned int buf_len)
+{
+	return -ENOTTY;
+}
+
+static const char *spa_type_name(u16 type)
+{
+	static const char *to_name[] = {
+		[NFIT_SPA_VOLATILE] = "volatile",
+		[NFIT_SPA_PM] = "pmem",
+		[NFIT_SPA_DCR] = "dimm-control-region",
+		[NFIT_SPA_BDW] = "block-data-window",
+		[NFIT_SPA_VDISK] = "volatile-disk",
+		[NFIT_SPA_VCD] = "volatile-cd",
+		[NFIT_SPA_PDISK] = "persistent-disk",
+		[NFIT_SPA_PCD] = "persistent-cd",
+
+	};
+
+	if (type > NFIT_SPA_PCD)
+		return "unknown";
+
+	return to_name[type];
+}
+
+static int nfit_spa_type(struct acpi_nfit_system_address *spa)
+{
+	int i;
+
+	for (i = 0; i < NFIT_UUID_MAX; i++)
+		if (memcmp(to_nfit_uuid(i), spa->range_guid, 16) == 0)
+			return i;
+	return -1;
+}
+
+static bool add_spa(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_system_address *spa)
+{
+	struct device *dev = acpi_desc->dev;
+	struct nfit_spa *nfit_spa = devm_kzalloc(dev, sizeof(*nfit_spa),
+			GFP_KERNEL);
+
+	if (!nfit_spa)
+		return false;
+	INIT_LIST_HEAD(&nfit_spa->list);
+	nfit_spa->spa = spa;
+	list_add_tail(&nfit_spa->list, &acpi_desc->spas);
+	dev_dbg(dev, "%s: spa index: %d type: %s\n", __func__,
+			spa->range_index,
+			spa_type_name(nfit_spa_type(spa)));
+	return true;
+}
+
+static bool add_memdev(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_memory_map *memdev)
+{
+	struct device *dev = acpi_desc->dev;
+	struct nfit_memdev *nfit_memdev = devm_kzalloc(dev,
+			sizeof(*nfit_memdev), GFP_KERNEL);
+
+	if (!nfit_memdev)
+		return false;
+	INIT_LIST_HEAD(&nfit_memdev->list);
+	nfit_memdev->memdev = memdev;
+	list_add_tail(&nfit_memdev->list, &acpi_desc->memdevs);
+	dev_dbg(dev, "%s: memdev handle: %#x spa: %d dcr: %d\n",
+			__func__, memdev->device_handle, memdev->range_index,
+			memdev->region_index);
+	return true;
+}
+
+static bool add_dcr(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_control_region *dcr)
+{
+	struct device *dev = acpi_desc->dev;
+	struct nfit_dcr *nfit_dcr = devm_kzalloc(dev, sizeof(*nfit_dcr),
+			GFP_KERNEL);
+
+	if (!nfit_dcr)
+		return false;
+	INIT_LIST_HEAD(&nfit_dcr->list);
+	nfit_dcr->dcr = dcr;
+	list_add_tail(&nfit_dcr->list, &acpi_desc->dcrs);
+	dev_dbg(dev, "%s: dcr index: %d windows: %d\n", __func__,
+			dcr->region_index, dcr->windows);
+	return true;
+}
+
+static bool add_bdw(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_data_region *bdw)
+{
+	struct device *dev = acpi_desc->dev;
+	struct nfit_bdw *nfit_bdw = devm_kzalloc(dev, sizeof(*nfit_bdw),
+			GFP_KERNEL);
+
+	if (!nfit_bdw)
+		return false;
+	INIT_LIST_HEAD(&nfit_bdw->list);
+	nfit_bdw->bdw = bdw;
+	list_add_tail(&nfit_bdw->list, &acpi_desc->bdws);
+	dev_dbg(dev, "%s: bdw dcr: %d windows: %d\n", __func__,
+			bdw->region_index, bdw->windows);
+	return true;
+}
+
+static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table,
+		const void *end)
+{
+	struct device *dev = acpi_desc->dev;
+	struct acpi_nfit_header *hdr;
+	void *err = ERR_PTR(-ENOMEM);
+
+	if (table >= end)
+		return NULL;
+
+	hdr = table;
+	switch (hdr->type) {
+	case ACPI_NFIT_TYPE_SYSTEM_ADDRESS:
+		if (!add_spa(acpi_desc, table))
+			return err;
+		break;
+	case ACPI_NFIT_TYPE_MEMORY_MAP:
+		if (!add_memdev(acpi_desc, table))
+			return err;
+		break;
+	case ACPI_NFIT_TYPE_CONTROL_REGION:
+		if (!add_dcr(acpi_desc, table))
+			return err;
+		break;
+	case ACPI_NFIT_TYPE_DATA_REGION:
+		if (!add_bdw(acpi_desc, table))
+			return err;
+		break;
+	/* TODO */
+	case ACPI_NFIT_TYPE_INTERLEAVE:
+		dev_dbg(dev, "%s: idt\n", __func__);
+		break;
+	case ACPI_NFIT_TYPE_FLUSH_ADDRESS:
+		dev_dbg(dev, "%s: flush\n", __func__);
+		break;
+	case ACPI_NFIT_TYPE_SMBIOS:
+		dev_dbg(dev, "%s: smbios\n", __func__);
+		break;
+	default:
+		dev_err(dev, "unknown table '%d' parsing nfit\n", hdr->type);
+		break;
+	}
+
+	return table + hdr->length;
+}
+
+static void nfit_mem_find_spa_bdw(struct acpi_nfit_desc *acpi_desc,
+		struct nfit_mem *nfit_mem)
+{
+	u32 device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
+	u16 dcr = nfit_mem->dcr->region_index;
+	struct nfit_spa *nfit_spa;
+
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+		u16 range_index = nfit_spa->spa->range_index;
+		int type = nfit_spa_type(nfit_spa->spa);
+		struct nfit_memdev *nfit_memdev;
+
+		if (type != NFIT_SPA_BDW)
+			continue;
+
+		list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+			if (nfit_memdev->memdev->range_index != range_index)
+				continue;
+			if (nfit_memdev->memdev->device_handle != device_handle)
+				continue;
+			if (nfit_memdev->memdev->region_index != dcr)
+				continue;
+
+			nfit_mem->spa_bdw = nfit_spa->spa;
+			return;
+		}
+	}
+
+	dev_dbg(acpi_desc->dev, "SPA-BDW not found for SPA-DCR %d\n",
+			nfit_mem->spa_dcr->range_index);
+	nfit_mem->bdw = NULL;
+}
+
+static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc,
+		struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa)
+{
+	u16 dcr = __to_nfit_memdev(nfit_mem)->region_index;
+	struct nfit_dcr *nfit_dcr;
+	struct nfit_bdw *nfit_bdw;
+
+	list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) {
+		if (nfit_dcr->dcr->region_index != dcr)
+			continue;
+		nfit_mem->dcr = nfit_dcr->dcr;
+		break;
+	}
+
+	if (!nfit_mem->dcr) {
+		dev_dbg(acpi_desc->dev, "SPA %d missing:%s%s\n",
+				spa->range_index, __to_nfit_memdev(nfit_mem)
+				? "" : " MEMDEV", nfit_mem->dcr ? "" : " DCR");
+		return -ENODEV;
+	}
+
+	/*
+	 * We've found enough to create an nvdimm, optionally
+	 * find an associated BDW
+	 */
+	list_add(&nfit_mem->list, &acpi_desc->dimms);
+
+	list_for_each_entry(nfit_bdw, &acpi_desc->bdws, list) {
+		if (nfit_bdw->bdw->region_index != dcr)
+			continue;
+		nfit_mem->bdw = nfit_bdw->bdw;
+		break;
+	}
+
+	if (!nfit_mem->bdw)
+		return 0;
+
+	nfit_mem_find_spa_bdw(acpi_desc, nfit_mem);
+	return 0;
+}
+
+static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_system_address *spa)
+{
+	struct nfit_mem *nfit_mem, *found;
+	struct nfit_memdev *nfit_memdev;
+	int type = nfit_spa_type(spa);
+	u16 dcr;
+
+	switch (type) {
+	case NFIT_SPA_DCR:
+	case NFIT_SPA_PM:
+		break;
+	default:
+		return 0;
+	}
+
+	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+		int rc;
+
+		if (nfit_memdev->memdev->range_index != spa->range_index)
+			continue;
+		found = NULL;
+		dcr = nfit_memdev->memdev->region_index;
+		list_for_each_entry(nfit_mem, &acpi_desc->dimms, list)
+			if (__to_nfit_memdev(nfit_mem)->region_index == dcr) {
+				found = nfit_mem;
+				break;
+			}
+
+		if (found)
+			nfit_mem = found;
+		else {
+			nfit_mem = devm_kzalloc(acpi_desc->dev,
+					sizeof(*nfit_mem), GFP_KERNEL);
+			if (!nfit_mem)
+				return -ENOMEM;
+			INIT_LIST_HEAD(&nfit_mem->list);
+		}
+
+		if (type == NFIT_SPA_DCR) {
+			/* multiple dimms may share a SPA when interleaved */
+			nfit_mem->spa_dcr = spa;
+			nfit_mem->memdev_dcr = nfit_memdev->memdev;
+		} else {
+			/*
+			 * A single dimm may belong to multiple SPA-PM
+			 * ranges, record at least one in addition to
+			 * any SPA-DCR range.
+			 */
+			nfit_mem->memdev_pmem = nfit_memdev->memdev;
+		}
+
+		if (found)
+			continue;
+
+		rc = nfit_mem_add(acpi_desc, nfit_mem, spa);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static int nfit_mem_cmp(void *priv, struct list_head *_a, struct list_head *_b)
+{
+	struct nfit_mem *a = container_of(_a, typeof(*a), list);
+	struct nfit_mem *b = container_of(_b, typeof(*b), list);
+	u32 handleA, handleB;
+
+	handleA = __to_nfit_memdev(a)->device_handle;
+	handleB = __to_nfit_memdev(b)->device_handle;
+	if (handleA < handleB)
+		return -1;
+	else if (handleA > handleB)
+		return 1;
+	return 0;
+}
+
+static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
+{
+	struct nfit_spa *nfit_spa;
+
+	/*
+	 * For each SPA-DCR or SPA-PMEM address range find its
+	 * corresponding MEMDEV(s).  From each MEMDEV find the
+	 * corresponding DCR.  Then, if we're operating on a SPA-DCR,
+	 * try to find a SPA-BDW and a corresponding BDW that references
+	 * the DCR.  Throw it all into an nfit_mem object.  Note, that
+	 * BDWs are optional.
+	 */
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+		int rc;
+
+		rc = nfit_mem_dcr_init(acpi_desc, nfit_spa->spa);
+		if (rc)
+			return rc;
+	}
+
+	list_sort(NULL, &acpi_desc->dimms, nfit_mem_cmp);
+
+	return 0;
+}
+
+static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
+{
+	struct device *dev = acpi_desc->dev;
+	const void *end;
+	u8 *data;
+
+	INIT_LIST_HEAD(&acpi_desc->spas);
+	INIT_LIST_HEAD(&acpi_desc->dcrs);
+	INIT_LIST_HEAD(&acpi_desc->bdws);
+	INIT_LIST_HEAD(&acpi_desc->memdevs);
+	INIT_LIST_HEAD(&acpi_desc->dimms);
+
+	data = (u8 *) acpi_desc->nfit;
+	end = data + sz;
+	data += sizeof(struct acpi_table_nfit);
+	while (!IS_ERR_OR_NULL(data))
+		data = add_table(acpi_desc, data, end);
+
+	if (IS_ERR(data)) {
+		dev_dbg(dev, "%s: nfit table parsing error: %ld\n", __func__,
+				PTR_ERR(data));
+		return PTR_ERR(data);
+	}
+
+	if (nfit_mem_init(acpi_desc) != 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int acpi_nfit_add(struct acpi_device *adev)
+{
+	struct nvdimm_bus_descriptor *nd_desc;
+	struct acpi_nfit_desc *acpi_desc;
+	struct device *dev = &adev->dev;
+	struct acpi_table_header *tbl;
+	acpi_status status = AE_OK;
+	acpi_size sz;
+	int rc;
+
+	status = acpi_get_table_with_size("NFIT", 0, &tbl, &sz);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "failed to find NFIT\n");
+		return -ENXIO;
+	}
+
+	acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL);
+	if (!acpi_desc)
+		return -ENOMEM;
+
+	dev_set_drvdata(dev, acpi_desc);
+	acpi_desc->dev = dev;
+	acpi_desc->nfit = (struct acpi_table_nfit *) tbl;
+	nd_desc = &acpi_desc->nd_desc;
+	nd_desc->provider_name = "ACPI.NFIT";
+	nd_desc->ndctl = acpi_nfit_ctl;
+
+	acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, nd_desc);
+	if (!acpi_desc->nvdimm_bus)
+		return -ENXIO;
+
+	rc = acpi_nfit_init(acpi_desc, sz);
+	if (rc) {
+		nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+		return rc;
+	}
+	return 0;
+}
+
+static int acpi_nfit_remove(struct acpi_device *adev)
+{
+	struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev);
+
+	nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
+	return 0;
+}
+
+static const struct acpi_device_id acpi_nfit_ids[] = {
+	{ "ACPI0012", 0 },
+	{ "", 0 },
+};
+MODULE_DEVICE_TABLE(acpi, acpi_nfit_ids);
+
+static struct acpi_driver acpi_nfit_driver = {
+	.name = KBUILD_MODNAME,
+	.ids = acpi_nfit_ids,
+	.ops = {
+		.add = acpi_nfit_add,
+		.remove = acpi_nfit_remove,
+	},
+};
+
+static __init int nfit_init(void)
+{
+	BUILD_BUG_ON(sizeof(struct acpi_table_nfit) != 40);
+	BUILD_BUG_ON(sizeof(struct acpi_nfit_system_address) != 56);
+	BUILD_BUG_ON(sizeof(struct acpi_nfit_memory_map) != 48);
+	BUILD_BUG_ON(sizeof(struct acpi_nfit_interleave) != 20);
+	BUILD_BUG_ON(sizeof(struct acpi_nfit_smbios) != 9);
+	BUILD_BUG_ON(sizeof(struct acpi_nfit_control_region) != 80);
+	BUILD_BUG_ON(sizeof(struct acpi_nfit_data_region) != 40);
+
+	acpi_str_to_uuid(UUID_VOLATILE_MEMORY, nfit_uuid[NFIT_SPA_VOLATILE]);
+	acpi_str_to_uuid(UUID_PERSISTENT_MEMORY, nfit_uuid[NFIT_SPA_PM]);
+	acpi_str_to_uuid(UUID_CONTROL_REGION, nfit_uuid[NFIT_SPA_DCR]);
+	acpi_str_to_uuid(UUID_DATA_REGION, nfit_uuid[NFIT_SPA_BDW]);
+	acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_VDISK]);
+	acpi_str_to_uuid(UUID_VOLATILE_VIRTUAL_CD, nfit_uuid[NFIT_SPA_VCD]);
+	acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_DISK, nfit_uuid[NFIT_SPA_PDISK]);
+	acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_CD, nfit_uuid[NFIT_SPA_PCD]);
+	acpi_str_to_uuid(UUID_NFIT_BUS, nfit_uuid[NFIT_DEV_BUS]);
+	acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]);
+
+	return acpi_bus_register_driver(&acpi_nfit_driver);
+}
+
+static __exit void nfit_exit(void)
+{
+	acpi_bus_unregister_driver(&acpi_nfit_driver);
+}
+
+module_init(nfit_init);
+module_exit(nfit_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
new file mode 100644
index 000000000000..c6baba64703f
--- /dev/null
+++ b/drivers/acpi/nfit.h
@@ -0,0 +1,90 @@
+/*
+ * NVDIMM Firmware Interface Table - NFIT
+ *
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __NFIT_H__
+#define __NFIT_H__
+#include <linux/libnvdimm.h>
+#include <linux/types.h>
+#include <linux/uuid.h>
+#include <linux/acpi.h>
+#include <acpi/acuuid.h>
+
+#define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
+#define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
+
+enum nfit_uuids {
+	NFIT_SPA_VOLATILE,
+	NFIT_SPA_PM,
+	NFIT_SPA_DCR,
+	NFIT_SPA_BDW,
+	NFIT_SPA_VDISK,
+	NFIT_SPA_VCD,
+	NFIT_SPA_PDISK,
+	NFIT_SPA_PCD,
+	NFIT_DEV_BUS,
+	NFIT_DEV_DIMM,
+	NFIT_UUID_MAX,
+};
+
+struct nfit_spa {
+	struct acpi_nfit_system_address *spa;
+	struct list_head list;
+};
+
+struct nfit_dcr {
+	struct acpi_nfit_control_region *dcr;
+	struct list_head list;
+};
+
+struct nfit_bdw {
+	struct acpi_nfit_data_region *bdw;
+	struct list_head list;
+};
+
+struct nfit_memdev {
+	struct acpi_nfit_memory_map *memdev;
+	struct list_head list;
+};
+
+/* assembled tables for a given dimm/memory-device */
+struct nfit_mem {
+	struct acpi_nfit_memory_map *memdev_dcr;
+	struct acpi_nfit_memory_map *memdev_pmem;
+	struct acpi_nfit_control_region *dcr;
+	struct acpi_nfit_data_region *bdw;
+	struct acpi_nfit_system_address *spa_dcr;
+	struct acpi_nfit_system_address *spa_bdw;
+	struct list_head list;
+};
+
+struct acpi_nfit_desc {
+	struct nvdimm_bus_descriptor nd_desc;
+	struct acpi_table_nfit *nfit;
+	struct list_head memdevs;
+	struct list_head dimms;
+	struct list_head spas;
+	struct list_head dcrs;
+	struct list_head bdws;
+	struct nvdimm_bus *nvdimm_bus;
+	struct device *dev;
+};
+
+static inline struct acpi_nfit_memory_map *__to_nfit_memdev(
+		struct nfit_mem *nfit_mem)
+{
+	if (nfit_mem->memdev_dcr)
+		return nfit_mem->memdev_dcr;
+	return nfit_mem->memdev_pmem;
+}
+#endif /* __NFIT_H__ */
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
new file mode 100644
index 000000000000..92933551f846
--- /dev/null
+++ b/drivers/nvdimm/Kconfig
@@ -0,0 +1,15 @@
+config LIBNVDIMM
+	tristate "NVDIMM (Non-Volatile Memory Device) Support"
+	depends on PHYS_ADDR_T_64BIT
+	depends on BLK_DEV
+	help
+	  Generic support for non-volatile memory devices including
+	  ACPI-6-NFIT defined resources.  On platforms that define an
+	  NFIT, or otherwise can discover NVDIMM resources, a libnvdimm
+	  bus is registered to advertise PMEM (persistent memory)
+	  namespaces (/dev/pmemX) and BLK (sliding mmio window(s))
+	  namespaces (/dev/ndX). A PMEM namespace refers to a memory
+	  resource that may span multiple DIMMs and support DAX (see
+	  CONFIG_DAX).  A BLK namespace refers to an NVDIMM control
+	  region which exposes an mmio register set for windowed
+	  access mode to non-volatile memory.
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
new file mode 100644
index 000000000000..10bc7af47992
--- /dev/null
+++ b/drivers/nvdimm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
+
+libnvdimm-y := core.o
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
new file mode 100644
index 000000000000..c578a49867ac
--- /dev/null
+++ b/drivers/nvdimm/core.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/libnvdimm.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include "nd-core.h"
+
+static DEFINE_IDA(nd_ida);
+
+static void nvdimm_bus_release(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus;
+
+	nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+	ida_simple_remove(&nd_ida, nvdimm_bus->id);
+	kfree(nvdimm_bus);
+}
+
+struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+		struct nvdimm_bus_descriptor *nd_desc)
+{
+	struct nvdimm_bus *nvdimm_bus;
+	int rc;
+
+	nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
+	if (!nvdimm_bus)
+		return NULL;
+	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
+	if (nvdimm_bus->id < 0) {
+		kfree(nvdimm_bus);
+		return NULL;
+	}
+	nvdimm_bus->nd_desc = nd_desc;
+	nvdimm_bus->dev.parent = parent;
+	nvdimm_bus->dev.release = nvdimm_bus_release;
+	dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
+	rc = device_register(&nvdimm_bus->dev);
+	if (rc) {
+		dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
+		put_device(&nvdimm_bus->dev);
+		return NULL;
+	}
+
+	return nvdimm_bus;
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_register);
+
+void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
+{
+	if (!nvdimm_bus)
+		return;
+	device_unregister(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
new file mode 100644
index 000000000000..291b2fdcd96b
--- /dev/null
+++ b/drivers/nvdimm/nd-core.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ND_CORE_H__
+#define __ND_CORE_H__
+#include <linux/libnvdimm.h>
+#include <linux/device.h>
+
+struct nvdimm_bus {
+	struct nvdimm_bus_descriptor *nd_desc;
+	struct device dev;
+	int id;
+};
+#endif /* __ND_CORE_H__ */
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
new file mode 100644
index 000000000000..2b3c63950c91
--- /dev/null
+++ b/include/linux/libnvdimm.h
@@ -0,0 +1,34 @@
+/*
+ * libnvdimm - Non-volatile-memory Devices Subsystem
+ *
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __LIBNVDIMM_H__
+#define __LIBNVDIMM_H__
+struct nvdimm;
+struct nvdimm_bus_descriptor;
+typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
+		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+		unsigned int buf_len);
+
+struct nvdimm_bus_descriptor {
+	unsigned long dsm_mask;
+	char *provider_name;
+	ndctl_fn ndctl;
+};
+
+struct device;
+struct nvdimm_bus;
+struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+		struct nvdimm_bus_descriptor *nfit_desc);
+void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
+#endif /* __LIBNVDIMM_H__ */
-- 
cgit v1.2.3


From 45def22c1fab85764646746ce38d45b2f3281fa5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 26 Apr 2015 19:26:48 -0400
Subject: libnvdimm: control character device and nvdimm_bus sysfs attributes

The control device for a nvdimm_bus is registered as an "nd" class
device.  The expectation is that there will usually only be one "nd" bus
registered under /sys/class/nd.  However, we allow for the possibility
of multiple buses and they will listed in discovery order as
ndctl0...ndctlN.  This character device hosts the ioctl for passing
control messages.  The initial command set has a 1:1 correlation with
the commands listed in the by the "NFIT DSM Example" document [1], but
this scheme is extensible to future command sets.

Note, nd_ioctl() and the backing ->ndctl() implementation are defined in
a subsequent patch.  This is simply the initial registrations and sysfs
attributes.

[1]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf

Cc: Neil Brown <neilb@suse.de>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: <linux-acpi@vger.kernel.org>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c       | 28 +++++++++++++++
 drivers/acpi/nfit.h       |  6 ++++
 drivers/nvdimm/Makefile   |  1 +
 drivers/nvdimm/bus.c      | 83 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/core.c     | 88 +++++++++++++++++++++++++++++++++++++++++++++--
 drivers/nvdimm/nd-core.h  |  6 ++++
 include/linux/libnvdimm.h |  6 +++-
 7 files changed, 215 insertions(+), 3 deletions(-)
 create mode 100644 drivers/nvdimm/bus.c

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 9f9a20bae0ac..162391b32022 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -354,6 +354,33 @@ static int nfit_mem_init(struct acpi_nfit_desc *acpi_desc)
 	return 0;
 }
 
+static ssize_t revision_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+
+	return sprintf(buf, "%d\n", acpi_desc->nfit->header.revision);
+}
+static DEVICE_ATTR_RO(revision);
+
+static struct attribute *acpi_nfit_attributes[] = {
+	&dev_attr_revision.attr,
+	NULL,
+};
+
+static struct attribute_group acpi_nfit_attribute_group = {
+	.name = "nfit",
+	.attrs = acpi_nfit_attributes,
+};
+
+static const struct attribute_group *acpi_nfit_attribute_groups[] = {
+	&nvdimm_bus_attribute_group,
+	&acpi_nfit_attribute_group,
+	NULL,
+};
+
 static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 {
 	struct device *dev = acpi_desc->dev;
@@ -410,6 +437,7 @@ static int acpi_nfit_add(struct acpi_device *adev)
 	nd_desc = &acpi_desc->nd_desc;
 	nd_desc->provider_name = "ACPI.NFIT";
 	nd_desc->ndctl = acpi_nfit_ctl;
+	nd_desc->attr_groups = acpi_nfit_attribute_groups;
 
 	acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, nd_desc);
 	if (!acpi_desc->nvdimm_bus)
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index c6baba64703f..eda565fa81ef 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -87,4 +87,10 @@ static inline struct acpi_nfit_memory_map *__to_nfit_memdev(
 		return nfit_mem->memdev_dcr;
 	return nfit_mem->memdev_pmem;
 }
+
+static inline struct acpi_nfit_desc *to_acpi_desc(
+		struct nvdimm_bus_descriptor *nd_desc)
+{
+	return container_of(nd_desc, struct acpi_nfit_desc, nd_desc);
+}
 #endif /* __NFIT_H__ */
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 10bc7af47992..8a8293c72c7c 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 
 libnvdimm-y := core.o
+libnvdimm-y += bus.o
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
new file mode 100644
index 000000000000..3f7c690a5d0c
--- /dev/null
+++ b/drivers/nvdimm/bus.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/uaccess.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include "nd-core.h"
+
+static int nvdimm_bus_major;
+static struct class *nd_class;
+
+int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
+{
+	dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id);
+	struct device *dev;
+
+	dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus,
+			"ndctl%d", nvdimm_bus->id);
+
+	if (IS_ERR(dev)) {
+		dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n",
+				nvdimm_bus->id, PTR_ERR(dev));
+		return PTR_ERR(dev);
+	}
+	return 0;
+}
+
+void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus)
+{
+	device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id));
+}
+
+static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	return -ENXIO;
+}
+
+static const struct file_operations nvdimm_bus_fops = {
+	.owner = THIS_MODULE,
+	.open = nonseekable_open,
+	.unlocked_ioctl = nd_ioctl,
+	.compat_ioctl = nd_ioctl,
+	.llseek = noop_llseek,
+};
+
+int __init nvdimm_bus_init(void)
+{
+	int rc;
+
+	rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops);
+	if (rc < 0)
+		return rc;
+	nvdimm_bus_major = rc;
+
+	nd_class = class_create(THIS_MODULE, "nd");
+	if (IS_ERR(nd_class))
+		goto err_class;
+
+	return 0;
+
+ err_class:
+	unregister_chrdev(nvdimm_bus_major, "ndctl");
+
+	return rc;
+}
+
+void __exit nvdimm_bus_exit(void)
+{
+	class_destroy(nd_class);
+	unregister_chrdev(nvdimm_bus_major, "ndctl");
+}
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index c578a49867ac..fa7ab5ad0318 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -14,9 +14,12 @@
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/mutex.h>
 #include <linux/slab.h>
 #include "nd-core.h"
 
+static LIST_HEAD(nvdimm_bus_list);
+static DEFINE_MUTEX(nvdimm_bus_list_mutex);
 static DEFINE_IDA(nd_ida);
 
 static void nvdimm_bus_release(struct device *dev)
@@ -28,6 +31,55 @@ static void nvdimm_bus_release(struct device *dev)
 	kfree(nvdimm_bus);
 }
 
+struct nvdimm_bus *to_nvdimm_bus(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus;
+
+	nvdimm_bus = container_of(dev, struct nvdimm_bus, dev);
+	WARN_ON(nvdimm_bus->dev.release != nvdimm_bus_release);
+	return nvdimm_bus;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm_bus);
+
+struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus)
+{
+	/* struct nvdimm_bus definition is private to libnvdimm */
+	return nvdimm_bus->nd_desc;
+}
+EXPORT_SYMBOL_GPL(to_nd_desc);
+
+static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus)
+{
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+	struct device *parent = nvdimm_bus->dev.parent;
+
+	if (nd_desc->provider_name)
+		return nd_desc->provider_name;
+	else if (parent)
+		return dev_name(parent);
+	else
+		return "unknown";
+}
+
+static ssize_t provider_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+
+	return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus));
+}
+static DEVICE_ATTR_RO(provider);
+
+static struct attribute *nvdimm_bus_attributes[] = {
+	&dev_attr_provider.attr,
+	NULL,
+};
+
+struct attribute_group nvdimm_bus_attribute_group = {
+	.attrs = nvdimm_bus_attributes,
+};
+EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
+
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nd_desc)
 {
@@ -37,6 +89,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 	nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL);
 	if (!nvdimm_bus)
 		return NULL;
+	INIT_LIST_HEAD(&nvdimm_bus->list);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
 	if (nvdimm_bus->id < 0) {
 		kfree(nvdimm_bus);
@@ -45,15 +98,26 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 	nvdimm_bus->nd_desc = nd_desc;
 	nvdimm_bus->dev.parent = parent;
 	nvdimm_bus->dev.release = nvdimm_bus_release;
+	nvdimm_bus->dev.groups = nd_desc->attr_groups;
 	dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id);
 	rc = device_register(&nvdimm_bus->dev);
 	if (rc) {
 		dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc);
-		put_device(&nvdimm_bus->dev);
-		return NULL;
+		goto err;
 	}
 
+	rc = nvdimm_bus_create_ndctl(nvdimm_bus);
+	if (rc)
+		goto err;
+
+	mutex_lock(&nvdimm_bus_list_mutex);
+	list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list);
+	mutex_unlock(&nvdimm_bus_list_mutex);
+
 	return nvdimm_bus;
+ err:
+	put_device(&nvdimm_bus->dev);
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_register);
 
@@ -61,9 +125,29 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
 {
 	if (!nvdimm_bus)
 		return;
+
+	mutex_lock(&nvdimm_bus_list_mutex);
+	list_del_init(&nvdimm_bus->list);
+	mutex_unlock(&nvdimm_bus_list_mutex);
+
+	nvdimm_bus_destroy_ndctl(nvdimm_bus);
+
 	device_unregister(&nvdimm_bus->dev);
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
 
+static __init int libnvdimm_init(void)
+{
+	return nvdimm_bus_init();
+}
+
+static __exit void libnvdimm_exit(void)
+{
+	WARN_ON(!list_empty(&nvdimm_bus_list));
+	nvdimm_bus_exit();
+}
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Intel Corporation");
+subsys_initcall(libnvdimm_init);
+module_exit(libnvdimm_exit);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 291b2fdcd96b..0b0ff2423161 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -17,7 +17,13 @@
 
 struct nvdimm_bus {
 	struct nvdimm_bus_descriptor *nd_desc;
+	struct list_head list;
 	struct device dev;
 	int id;
 };
+
+int __init nvdimm_bus_init(void);
+void __exit nvdimm_bus_exit(void);
+int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
+void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
 #endif /* __ND_CORE_H__ */
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 2b3c63950c91..d375cdc4abd5 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -14,6 +14,8 @@
  */
 #ifndef __LIBNVDIMM_H__
 #define __LIBNVDIMM_H__
+extern struct attribute_group nvdimm_bus_attribute_group;
+
 struct nvdimm;
 struct nvdimm_bus_descriptor;
 typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
@@ -21,14 +23,16 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
 		unsigned int buf_len);
 
 struct nvdimm_bus_descriptor {
+	const struct attribute_group **attr_groups;
 	unsigned long dsm_mask;
 	char *provider_name;
 	ndctl_fn ndctl;
 };
 
 struct device;
-struct nvdimm_bus;
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc);
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
+struct nvdimm_bus *to_nvdimm_bus(struct device *dev);
+struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
 #endif /* __LIBNVDIMM_H__ */
-- 
cgit v1.2.3


From e6dfb2de47768efe8cc37c9a1863d2aff81440fb Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 25 Apr 2015 03:56:17 -0400
Subject: libnvdimm, nfit: dimm/memory-devices

Enable nvdimm devices to be registered on a nvdimm_bus.  The kernel
assigned device id for nvdimm devicesis dynamic.  If userspace needs a
more static identifier it should consult a provider-specific attribute.
In the case where NFIT is the provider, the 'nmemX/nfit/handle' or
'nmemX/nfit/serial' attributes may be used for this purpose.

Cc: Neil Brown <neilb@suse.de>
Cc: <linux-acpi@vger.kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c        | 161 ++++++++++++++++++++++++++++++++++++++++++++-
 drivers/acpi/nfit.h        |   1 +
 drivers/nvdimm/Makefile    |   1 +
 drivers/nvdimm/bus.c       |  14 +++-
 drivers/nvdimm/core.c      |  33 +++++++++-
 drivers/nvdimm/dimm_devs.c |  92 ++++++++++++++++++++++++++
 drivers/nvdimm/nd-core.h   |  12 ++++
 include/linux/libnvdimm.h  |  11 ++++
 8 files changed, 321 insertions(+), 4 deletions(-)
 create mode 100644 drivers/nvdimm/dimm_devs.c

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 162391b32022..9fd7781f966e 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -381,6 +381,165 @@ static const struct attribute_group *acpi_nfit_attribute_groups[] = {
 	NULL,
 };
 
+static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
+	return __to_nfit_memdev(nfit_mem);
+}
+
+static struct acpi_nfit_control_region *to_nfit_dcr(struct device *dev)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
+	return nfit_mem->dcr;
+}
+
+static ssize_t handle_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev);
+
+	return sprintf(buf, "%#x\n", memdev->device_handle);
+}
+static DEVICE_ATTR_RO(handle);
+
+static ssize_t phys_id_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_memory_map *memdev = to_nfit_memdev(dev);
+
+	return sprintf(buf, "%#x\n", memdev->physical_id);
+}
+static DEVICE_ATTR_RO(phys_id);
+
+static ssize_t vendor_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	return sprintf(buf, "%#x\n", dcr->vendor_id);
+}
+static DEVICE_ATTR_RO(vendor);
+
+static ssize_t rev_id_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	return sprintf(buf, "%#x\n", dcr->revision_id);
+}
+static DEVICE_ATTR_RO(rev_id);
+
+static ssize_t device_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	return sprintf(buf, "%#x\n", dcr->device_id);
+}
+static DEVICE_ATTR_RO(device);
+
+static ssize_t format_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	return sprintf(buf, "%#x\n", dcr->code);
+}
+static DEVICE_ATTR_RO(format);
+
+static ssize_t serial_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	return sprintf(buf, "%#x\n", dcr->serial_number);
+}
+static DEVICE_ATTR_RO(serial);
+
+static struct attribute *acpi_nfit_dimm_attributes[] = {
+	&dev_attr_handle.attr,
+	&dev_attr_phys_id.attr,
+	&dev_attr_vendor.attr,
+	&dev_attr_device.attr,
+	&dev_attr_format.attr,
+	&dev_attr_serial.attr,
+	&dev_attr_rev_id.attr,
+	NULL,
+};
+
+static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj,
+		struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+
+	if (to_nfit_dcr(dev))
+		return a->mode;
+	else
+		return 0;
+}
+
+static struct attribute_group acpi_nfit_dimm_attribute_group = {
+	.name = "nfit",
+	.attrs = acpi_nfit_dimm_attributes,
+	.is_visible = acpi_nfit_dimm_attr_visible,
+};
+
+static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = {
+	&acpi_nfit_dimm_attribute_group,
+	NULL,
+};
+
+static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc,
+		u32 device_handle)
+{
+	struct nfit_mem *nfit_mem;
+
+	list_for_each_entry(nfit_mem, &acpi_desc->dimms, list)
+		if (__to_nfit_memdev(nfit_mem)->device_handle == device_handle)
+			return nfit_mem->nvdimm;
+
+	return NULL;
+}
+
+static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
+{
+	struct nfit_mem *nfit_mem;
+
+	list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
+		struct nvdimm *nvdimm;
+		unsigned long flags = 0;
+		u32 device_handle;
+
+		device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
+		nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle);
+		if (nvdimm) {
+			/*
+			 * If for some reason we find multiple DCRs the
+			 * first one wins
+			 */
+			dev_err(acpi_desc->dev, "duplicate DCR detected: %s\n",
+					nvdimm_name(nvdimm));
+			continue;
+		}
+
+		if (nfit_mem->bdw && nfit_mem->memdev_pmem)
+			flags |= NDD_ALIASING;
+
+		nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
+				acpi_nfit_dimm_attribute_groups, flags);
+		if (!nvdimm)
+			return -ENOMEM;
+
+		nfit_mem->nvdimm = nvdimm;
+	}
+
+	return 0;
+}
+
 static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 {
 	struct device *dev = acpi_desc->dev;
@@ -408,7 +567,7 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 	if (nfit_mem_init(acpi_desc) != 0)
 		return -ENOMEM;
 
-	return 0;
+	return acpi_nfit_register_dimms(acpi_desc);
 }
 
 static int acpi_nfit_add(struct acpi_device *adev)
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index eda565fa81ef..9dd437fe5563 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -59,6 +59,7 @@ struct nfit_memdev {
 
 /* assembled tables for a given dimm/memory-device */
 struct nfit_mem {
+	struct nvdimm *nvdimm;
 	struct acpi_nfit_memory_map *memdev_dcr;
 	struct acpi_nfit_memory_map *memdev_pmem;
 	struct acpi_nfit_control_region *dcr;
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 8a8293c72c7c..5b68738ba406 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -2,3 +2,4 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
+libnvdimm-y += dimm_devs.o
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 3f7c690a5d0c..a8802577fb55 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -13,6 +13,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/uaccess.h>
 #include <linux/fcntl.h>
+#include <linux/async.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/io.h>
@@ -21,6 +22,10 @@
 static int nvdimm_bus_major;
 static struct class *nd_class;
 
+struct bus_type nvdimm_bus_type = {
+	.name = "nd",
+};
+
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
 {
 	dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id);
@@ -59,9 +64,13 @@ int __init nvdimm_bus_init(void)
 {
 	int rc;
 
+	rc = bus_register(&nvdimm_bus_type);
+	if (rc)
+		return rc;
+
 	rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops);
 	if (rc < 0)
-		return rc;
+		goto err_chrdev;
 	nvdimm_bus_major = rc;
 
 	nd_class = class_create(THIS_MODULE, "nd");
@@ -72,6 +81,8 @@ int __init nvdimm_bus_init(void)
 
  err_class:
 	unregister_chrdev(nvdimm_bus_major, "ndctl");
+ err_chrdev:
+	bus_unregister(&nvdimm_bus_type);
 
 	return rc;
 }
@@ -80,4 +91,5 @@ void __exit nvdimm_bus_exit(void)
 {
 	class_destroy(nd_class);
 	unregister_chrdev(nvdimm_bus_major, "ndctl");
+	bus_unregister(&nvdimm_bus_type);
 }
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index fa7ab5ad0318..ef957eb37c90 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -18,8 +18,8 @@
 #include <linux/slab.h>
 #include "nd-core.h"
 
-static LIST_HEAD(nvdimm_bus_list);
-static DEFINE_MUTEX(nvdimm_bus_list_mutex);
+LIST_HEAD(nvdimm_bus_list);
+DEFINE_MUTEX(nvdimm_bus_list_mutex);
 static DEFINE_IDA(nd_ida);
 
 static void nvdimm_bus_release(struct device *dev)
@@ -48,6 +48,19 @@ struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus)
 }
 EXPORT_SYMBOL_GPL(to_nd_desc);
 
+struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
+{
+	struct device *dev;
+
+	for (dev = nd_dev; dev; dev = dev->parent)
+		if (dev->release == nvdimm_bus_release)
+			break;
+	dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n");
+	if (dev)
+		return to_nvdimm_bus(dev);
+	return NULL;
+}
+
 static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus)
 {
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
@@ -121,6 +134,21 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_register);
 
+static int child_unregister(struct device *dev, void *data)
+{
+	/*
+	 * the singular ndctl class device per bus needs to be
+	 * "device_destroy"ed, so skip it here
+	 *
+	 * i.e. remove classless children
+	 */
+	if (dev->class)
+		/* pass */;
+	else
+		device_unregister(dev);
+	return 0;
+}
+
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
 {
 	if (!nvdimm_bus)
@@ -130,6 +158,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
 	list_del_init(&nvdimm_bus->list);
 	mutex_unlock(&nvdimm_bus_list_mutex);
 
+	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
 
 	device_unregister(&nvdimm_bus->dev);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
new file mode 100644
index 000000000000..51ea52cc2079
--- /dev/null
+++ b/drivers/nvdimm/dimm_devs.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+
+static DEFINE_IDA(dimm_ida);
+
+static void nvdimm_release(struct device *dev)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+
+	ida_simple_remove(&dimm_ida, nvdimm->id);
+	kfree(nvdimm);
+}
+
+static struct device_type nvdimm_device_type = {
+	.name = "nvdimm",
+	.release = nvdimm_release,
+};
+
+static bool is_nvdimm(struct device *dev)
+{
+	return dev->type == &nvdimm_device_type;
+}
+
+struct nvdimm *to_nvdimm(struct device *dev)
+{
+	struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev);
+
+	WARN_ON(!is_nvdimm(dev));
+	return nvdimm;
+}
+EXPORT_SYMBOL_GPL(to_nvdimm);
+
+const char *nvdimm_name(struct nvdimm *nvdimm)
+{
+	return dev_name(&nvdimm->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_name);
+
+void *nvdimm_provider_data(struct nvdimm *nvdimm)
+{
+	return nvdimm->provider_data;
+}
+EXPORT_SYMBOL_GPL(nvdimm_provider_data);
+
+struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
+		const struct attribute_group **groups, unsigned long flags)
+{
+	struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
+	struct device *dev;
+
+	if (!nvdimm)
+		return NULL;
+
+	nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL);
+	if (nvdimm->id < 0) {
+		kfree(nvdimm);
+		return NULL;
+	}
+	nvdimm->provider_data = provider_data;
+	nvdimm->flags = flags;
+
+	dev = &nvdimm->dev;
+	dev_set_name(dev, "nmem%d", nvdimm->id);
+	dev->parent = &nvdimm_bus->dev;
+	dev->type = &nvdimm_device_type;
+	dev->bus = &nvdimm_bus_type;
+	dev->groups = groups;
+	if (device_register(dev) != 0) {
+		put_device(dev);
+		return NULL;
+	}
+
+	return nvdimm;
+}
+EXPORT_SYMBOL_GPL(nvdimm_create);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 0b0ff2423161..9b8303413b60 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -15,6 +15,10 @@
 #include <linux/libnvdimm.h>
 #include <linux/device.h>
 
+extern struct list_head nvdimm_bus_list;
+extern struct mutex nvdimm_bus_list_mutex;
+extern struct bus_type nvdimm_bus_type;
+
 struct nvdimm_bus {
 	struct nvdimm_bus_descriptor *nd_desc;
 	struct list_head list;
@@ -22,6 +26,14 @@ struct nvdimm_bus {
 	int id;
 };
 
+struct nvdimm {
+	unsigned long flags;
+	void *provider_data;
+	struct device dev;
+	int id;
+};
+
+struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void __exit nvdimm_bus_exit(void);
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index d375cdc4abd5..07787f0dd7de 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -14,6 +14,12 @@
  */
 #ifndef __LIBNVDIMM_H__
 #define __LIBNVDIMM_H__
+
+enum {
+	/* when a dimm supports both PMEM and BLK access a label is required */
+	NDD_ALIASING = 1 << 0,
+};
+
 extern struct attribute_group nvdimm_bus_attribute_group;
 
 struct nvdimm;
@@ -34,5 +40,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc);
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
 struct nvdimm_bus *to_nvdimm_bus(struct device *dev);
+struct nvdimm *to_nvdimm(struct device *dev);
 struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
+const char *nvdimm_name(struct nvdimm *nvdimm);
+void *nvdimm_provider_data(struct nvdimm *nvdimm);
+struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
+		const struct attribute_group **groups, unsigned long flags);
 #endif /* __LIBNVDIMM_H__ */
-- 
cgit v1.2.3


From 62232e45f4a265abb43f0acf16e58f5d0b6e1ec9 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 8 Jun 2015 14:27:06 -0400
Subject: libnvdimm: control (ioctl) messages for nvdimm_bus and nvdimm devices

Most discovery/configuration of the nvdimm-subsystem is done via sysfs
attributes.  However, some nvdimm_bus instances, particularly the
ACPI.NFIT bus, define a small set of messages that can be passed to the
platform.  For convenience we derive the initial libnvdimm-ioctl command
formats directly from the NFIT DSM Interface Example formats.

    ND_CMD_SMART: media health and diagnostics
    ND_CMD_GET_CONFIG_SIZE: size of the label space
    ND_CMD_GET_CONFIG_DATA: read label space
    ND_CMD_SET_CONFIG_DATA: write label space
    ND_CMD_VENDOR: vendor-specific command passthrough
    ND_CMD_ARS_CAP: report address-range-scrubbing capabilities
    ND_CMD_ARS_START: initiate scrubbing
    ND_CMD_ARS_STATUS: report on scrubbing state
    ND_CMD_SMART_THRESHOLD: configure alarm thresholds for smart events

If a platform later defines different commands than this set it is
straightforward to extend support to those formats.

Most of the commands target a specific dimm.  However, the
address-range-scrubbing commands target the bus.  The 'commands'
attribute in sysfs of an nvdimm_bus, or nvdimm, enumerate the supported
commands for that object.

Cc: <linux-acpi@vger.kernel.org>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reported-by: Nicholas Moulin <nicholas.w.moulin@linux.intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/Kconfig       |  12 ++
 drivers/acpi/nfit.c        | 216 +++++++++++++++++++++++++++++-
 drivers/acpi/nfit.h        |   3 +
 drivers/nvdimm/bus.c       | 326 ++++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvdimm/core.c      |  16 +++
 drivers/nvdimm/dimm_devs.c |  38 +++++-
 drivers/nvdimm/nd-core.h   |   3 +
 include/linux/libnvdimm.h  |  27 +++-
 include/uapi/linux/Kbuild  |   1 +
 include/uapi/linux/ndctl.h | 178 +++++++++++++++++++++++++
 10 files changed, 810 insertions(+), 10 deletions(-)
 create mode 100644 include/uapi/linux/ndctl.h

(limited to 'include/linux')

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 300b4ef3712b..9c43ae301300 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -397,6 +397,18 @@ config ACPI_NFIT
 	  To compile this driver as a module, choose M here:
 	  the module will be called nfit.
 
+config ACPI_NFIT_DEBUG
+	bool "NFIT DSM debug"
+	depends on ACPI_NFIT
+	depends on DYNAMIC_DEBUG
+	default n
+	help
+	  Enabling this option causes the nfit driver to dump the
+	  input and output buffers of _DSM operations on the ACPI0012
+	  device and its children.  This can be very verbose, so leave
+	  it disabled unless you are debugging a hardware / firmware
+	  issue.
+
 source "drivers/acpi/apei/Kconfig"
 
 config ACPI_EXTLOG
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 9fd7781f966e..9112a6210a4b 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -13,6 +13,7 @@
 #include <linux/list_sort.h>
 #include <linux/libnvdimm.h>
 #include <linux/module.h>
+#include <linux/ndctl.h>
 #include <linux/list.h>
 #include <linux/acpi.h>
 #include "nfit.h"
@@ -24,11 +25,153 @@ static const u8 *to_nfit_uuid(enum nfit_uuids id)
 	return nfit_uuid[id];
 }
 
+static struct acpi_nfit_desc *to_acpi_nfit_desc(
+		struct nvdimm_bus_descriptor *nd_desc)
+{
+	return container_of(nd_desc, struct acpi_nfit_desc, nd_desc);
+}
+
+static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc)
+{
+	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+
+	/*
+	 * If provider == 'ACPI.NFIT' we can assume 'dev' is a struct
+	 * acpi_device.
+	 */
+	if (!nd_desc->provider_name
+			|| strcmp(nd_desc->provider_name, "ACPI.NFIT") != 0)
+		return NULL;
+
+	return to_acpi_device(acpi_desc->dev);
+}
+
 static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len)
 {
-	return -ENOTTY;
+	struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
+	const struct nd_cmd_desc *desc = NULL;
+	union acpi_object in_obj, in_buf, *out_obj;
+	struct device *dev = acpi_desc->dev;
+	const char *cmd_name, *dimm_name;
+	unsigned long dsm_mask;
+	acpi_handle handle;
+	const u8 *uuid;
+	u32 offset;
+	int rc, i;
+
+	if (nvdimm) {
+		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+		struct acpi_device *adev = nfit_mem->adev;
+
+		if (!adev)
+			return -ENOTTY;
+		dimm_name = dev_name(&adev->dev);
+		cmd_name = nvdimm_cmd_name(cmd);
+		dsm_mask = nfit_mem->dsm_mask;
+		desc = nd_cmd_dimm_desc(cmd);
+		uuid = to_nfit_uuid(NFIT_DEV_DIMM);
+		handle = adev->handle;
+	} else {
+		struct acpi_device *adev = to_acpi_dev(acpi_desc);
+
+		cmd_name = nvdimm_bus_cmd_name(cmd);
+		dsm_mask = nd_desc->dsm_mask;
+		desc = nd_cmd_bus_desc(cmd);
+		uuid = to_nfit_uuid(NFIT_DEV_BUS);
+		handle = adev->handle;
+		dimm_name = "bus";
+	}
+
+	if (!desc || (cmd && (desc->out_num + desc->in_num == 0)))
+		return -ENOTTY;
+
+	if (!test_bit(cmd, &dsm_mask))
+		return -ENOTTY;
+
+	in_obj.type = ACPI_TYPE_PACKAGE;
+	in_obj.package.count = 1;
+	in_obj.package.elements = &in_buf;
+	in_buf.type = ACPI_TYPE_BUFFER;
+	in_buf.buffer.pointer = buf;
+	in_buf.buffer.length = 0;
+
+	/* libnvdimm has already validated the input envelope */
+	for (i = 0; i < desc->in_num; i++)
+		in_buf.buffer.length += nd_cmd_in_size(nvdimm, cmd, desc,
+				i, buf);
+
+	if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
+		dev_dbg(dev, "%s:%s cmd: %s input length: %d\n", __func__,
+				dimm_name, cmd_name, in_buf.buffer.length);
+		print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
+				4, in_buf.buffer.pointer, min_t(u32, 128,
+					in_buf.buffer.length), true);
+	}
+
+	out_obj = acpi_evaluate_dsm(handle, uuid, 1, cmd, &in_obj);
+	if (!out_obj) {
+		dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
+				cmd_name);
+		return -EINVAL;
+	}
+
+	if (out_obj->package.type != ACPI_TYPE_BUFFER) {
+		dev_dbg(dev, "%s:%s unexpected output object type cmd: %s type: %d\n",
+				__func__, dimm_name, cmd_name, out_obj->type);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
+		dev_dbg(dev, "%s:%s cmd: %s output length: %d\n", __func__,
+				dimm_name, cmd_name, out_obj->buffer.length);
+		print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
+				4, out_obj->buffer.pointer, min_t(u32, 128,
+					out_obj->buffer.length), true);
+	}
+
+	for (i = 0, offset = 0; i < desc->out_num; i++) {
+		u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, buf,
+				(u32 *) out_obj->buffer.pointer);
+
+		if (offset + out_size > out_obj->buffer.length) {
+			dev_dbg(dev, "%s:%s output object underflow cmd: %s field: %d\n",
+					__func__, dimm_name, cmd_name, i);
+			break;
+		}
+
+		if (in_buf.buffer.length + offset + out_size > buf_len) {
+			dev_dbg(dev, "%s:%s output overrun cmd: %s field: %d\n",
+					__func__, dimm_name, cmd_name, i);
+			rc = -ENXIO;
+			goto out;
+		}
+		memcpy(buf + in_buf.buffer.length + offset,
+				out_obj->buffer.pointer + offset, out_size);
+		offset += out_size;
+	}
+	if (offset + in_buf.buffer.length < buf_len) {
+		if (i >= 1) {
+			/*
+			 * status valid, return the number of bytes left
+			 * unfilled in the output buffer
+			 */
+			rc = buf_len - offset - in_buf.buffer.length;
+		} else {
+			dev_err(dev, "%s:%s underrun cmd: %s buf_len: %d out_len: %d\n",
+					__func__, dimm_name, cmd_name, buf_len,
+					offset);
+			rc = -ENXIO;
+		}
+	} else
+		rc = 0;
+
+ out:
+	ACPI_FREE(out_obj);
+
+	return rc;
 }
 
 static const char *spa_type_name(u16 type)
@@ -489,6 +632,7 @@ static struct attribute_group acpi_nfit_dimm_attribute_group = {
 };
 
 static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = {
+	&nvdimm_attribute_group,
 	&acpi_nfit_dimm_attribute_group,
 	NULL,
 };
@@ -505,6 +649,50 @@ static struct nvdimm *acpi_nfit_dimm_by_handle(struct acpi_nfit_desc *acpi_desc,
 	return NULL;
 }
 
+static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
+		struct nfit_mem *nfit_mem, u32 device_handle)
+{
+	struct acpi_device *adev, *adev_dimm;
+	struct device *dev = acpi_desc->dev;
+	const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM);
+	unsigned long long sta;
+	int i, rc = -ENODEV;
+	acpi_status status;
+
+	nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en;
+	adev = to_acpi_dev(acpi_desc);
+	if (!adev)
+		return 0;
+
+	adev_dimm = acpi_find_child_device(adev, device_handle, false);
+	nfit_mem->adev = adev_dimm;
+	if (!adev_dimm) {
+		dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n",
+				device_handle);
+		return -ENODEV;
+	}
+
+	status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta);
+	if (status == AE_NOT_FOUND) {
+		dev_dbg(dev, "%s missing _STA, assuming enabled...\n",
+				dev_name(&adev_dimm->dev));
+		rc = 0;
+	} else if (ACPI_FAILURE(status))
+		dev_err(dev, "%s failed to retrieve_STA, disabling...\n",
+				dev_name(&adev_dimm->dev));
+	else if ((sta & ACPI_STA_DEVICE_ENABLED) == 0)
+		dev_info(dev, "%s disabled by firmware\n",
+				dev_name(&adev_dimm->dev));
+	else
+		rc = 0;
+
+	for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++)
+		if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i))
+			set_bit(i, &nfit_mem->dsm_mask);
+
+	return rc;
+}
+
 static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nfit_mem *nfit_mem;
@@ -513,6 +701,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		struct nvdimm *nvdimm;
 		unsigned long flags = 0;
 		u32 device_handle;
+		int rc;
 
 		device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
 		nvdimm = acpi_nfit_dimm_by_handle(acpi_desc, device_handle);
@@ -529,8 +718,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		if (nfit_mem->bdw && nfit_mem->memdev_pmem)
 			flags |= NDD_ALIASING;
 
+		rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle);
+		if (rc)
+			continue;
+
 		nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
-				acpi_nfit_dimm_attribute_groups, flags);
+				acpi_nfit_dimm_attribute_groups,
+				flags, &nfit_mem->dsm_mask);
 		if (!nvdimm)
 			return -ENOMEM;
 
@@ -540,6 +734,22 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 	return 0;
 }
 
+static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
+{
+	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+	const u8 *uuid = to_nfit_uuid(NFIT_DEV_BUS);
+	struct acpi_device *adev;
+	int i;
+
+	adev = to_acpi_dev(acpi_desc);
+	if (!adev)
+		return;
+
+	for (i = ND_CMD_ARS_CAP; i <= ND_CMD_ARS_STATUS; i++)
+		if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i))
+			set_bit(i, &nd_desc->dsm_mask);
+}
+
 static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 {
 	struct device *dev = acpi_desc->dev;
@@ -567,6 +777,8 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 	if (nfit_mem_init(acpi_desc) != 0)
 		return -ENOMEM;
 
+	acpi_nfit_init_dsms(acpi_desc);
+
 	return acpi_nfit_register_dimms(acpi_desc);
 }
 
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index 9dd437fe5563..b76e33629098 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -67,6 +67,8 @@ struct nfit_mem {
 	struct acpi_nfit_system_address *spa_dcr;
 	struct acpi_nfit_system_address *spa_bdw;
 	struct list_head list;
+	struct acpi_device *adev;
+	unsigned long dsm_mask;
 };
 
 struct acpi_nfit_desc {
@@ -79,6 +81,7 @@ struct acpi_nfit_desc {
 	struct list_head bdws;
 	struct nvdimm_bus *nvdimm_bus;
 	struct device *dev;
+	unsigned long dimm_dsm_force_en;
 };
 
 static inline struct acpi_nfit_memory_map *__to_nfit_memdev(
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index a8802577fb55..15f3a3ddc225 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -11,14 +11,18 @@
  * General Public License for more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/vmalloc.h>
 #include <linux/uaccess.h>
 #include <linux/fcntl.h>
 #include <linux/async.h>
+#include <linux/ndctl.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/io.h>
+#include <linux/mm.h>
 #include "nd-core.h"
 
+int nvdimm_major;
 static int nvdimm_bus_major;
 static struct class *nd_class;
 
@@ -47,19 +51,325 @@ void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus)
 	device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id));
 }
 
+static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = {
+	[ND_CMD_IMPLEMENTED] = { },
+	[ND_CMD_SMART] = {
+		.out_num = 2,
+		.out_sizes = { 4, 8, },
+	},
+	[ND_CMD_SMART_THRESHOLD] = {
+		.out_num = 2,
+		.out_sizes = { 4, 8, },
+	},
+	[ND_CMD_DIMM_FLAGS] = {
+		.out_num = 2,
+		.out_sizes = { 4, 4 },
+	},
+	[ND_CMD_GET_CONFIG_SIZE] = {
+		.out_num = 3,
+		.out_sizes = { 4, 4, 4, },
+	},
+	[ND_CMD_GET_CONFIG_DATA] = {
+		.in_num = 2,
+		.in_sizes = { 4, 4, },
+		.out_num = 2,
+		.out_sizes = { 4, UINT_MAX, },
+	},
+	[ND_CMD_SET_CONFIG_DATA] = {
+		.in_num = 3,
+		.in_sizes = { 4, 4, UINT_MAX, },
+		.out_num = 1,
+		.out_sizes = { 4, },
+	},
+	[ND_CMD_VENDOR] = {
+		.in_num = 3,
+		.in_sizes = { 4, 4, UINT_MAX, },
+		.out_num = 3,
+		.out_sizes = { 4, 4, UINT_MAX, },
+	},
+};
+
+const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd)
+{
+	if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs))
+		return &__nd_cmd_dimm_descs[cmd];
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc);
+
+static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
+	[ND_CMD_IMPLEMENTED] = { },
+	[ND_CMD_ARS_CAP] = {
+		.in_num = 2,
+		.in_sizes = { 8, 8, },
+		.out_num = 2,
+		.out_sizes = { 4, 4, },
+	},
+	[ND_CMD_ARS_START] = {
+		.in_num = 4,
+		.in_sizes = { 8, 8, 2, 6, },
+		.out_num = 1,
+		.out_sizes = { 4, },
+	},
+	[ND_CMD_ARS_STATUS] = {
+		.out_num = 2,
+		.out_sizes = { 4, UINT_MAX, },
+	},
+};
+
+const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd)
+{
+	if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs))
+		return &__nd_cmd_bus_descs[cmd];
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_bus_desc);
+
+u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
+		const struct nd_cmd_desc *desc, int idx, void *buf)
+{
+	if (idx >= desc->in_num)
+		return UINT_MAX;
+
+	if (desc->in_sizes[idx] < UINT_MAX)
+		return desc->in_sizes[idx];
+
+	if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) {
+		struct nd_cmd_set_config_hdr *hdr = buf;
+
+		return hdr->in_length;
+	} else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) {
+		struct nd_cmd_vendor_hdr *hdr = buf;
+
+		return hdr->in_length;
+	}
+
+	return UINT_MAX;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_in_size);
+
+u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
+		const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
+		const u32 *out_field)
+{
+	if (idx >= desc->out_num)
+		return UINT_MAX;
+
+	if (desc->out_sizes[idx] < UINT_MAX)
+		return desc->out_sizes[idx];
+
+	if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1)
+		return in_field[1];
+	else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2)
+		return out_field[1];
+	else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 1)
+		return ND_CMD_ARS_STATUS_MAX;
+
+	return UINT_MAX;
+}
+EXPORT_SYMBOL_GPL(nd_cmd_out_size);
+
+static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
+		int read_only, unsigned int ioctl_cmd, unsigned long arg)
+{
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+	size_t buf_len = 0, in_len = 0, out_len = 0;
+	static char out_env[ND_CMD_MAX_ENVELOPE];
+	static char in_env[ND_CMD_MAX_ENVELOPE];
+	const struct nd_cmd_desc *desc = NULL;
+	unsigned int cmd = _IOC_NR(ioctl_cmd);
+	void __user *p = (void __user *) arg;
+	struct device *dev = &nvdimm_bus->dev;
+	const char *cmd_name, *dimm_name;
+	unsigned long dsm_mask;
+	void *buf;
+	int rc, i;
+
+	if (nvdimm) {
+		desc = nd_cmd_dimm_desc(cmd);
+		cmd_name = nvdimm_cmd_name(cmd);
+		dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0;
+		dimm_name = dev_name(&nvdimm->dev);
+	} else {
+		desc = nd_cmd_bus_desc(cmd);
+		cmd_name = nvdimm_bus_cmd_name(cmd);
+		dsm_mask = nd_desc->dsm_mask;
+		dimm_name = "bus";
+	}
+
+	if (!desc || (desc->out_num + desc->in_num == 0) ||
+			!test_bit(cmd, &dsm_mask))
+		return -ENOTTY;
+
+	/* fail write commands (when read-only) */
+	if (read_only)
+		switch (ioctl_cmd) {
+		case ND_IOCTL_VENDOR:
+		case ND_IOCTL_SET_CONFIG_DATA:
+		case ND_IOCTL_ARS_START:
+			dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
+					nvdimm ? nvdimm_cmd_name(cmd)
+					: nvdimm_bus_cmd_name(cmd));
+			return -EPERM;
+		default:
+			break;
+		}
+
+	/* process an input envelope */
+	for (i = 0; i < desc->in_num; i++) {
+		u32 in_size, copy;
+
+		in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env);
+		if (in_size == UINT_MAX) {
+			dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n",
+					__func__, dimm_name, cmd_name, i);
+			return -ENXIO;
+		}
+		if (!access_ok(VERIFY_READ, p + in_len, in_size))
+			return -EFAULT;
+		if (in_len < sizeof(in_env))
+			copy = min_t(u32, sizeof(in_env) - in_len, in_size);
+		else
+			copy = 0;
+		if (copy && copy_from_user(&in_env[in_len], p + in_len, copy))
+			return -EFAULT;
+		in_len += in_size;
+	}
+
+	/* process an output envelope */
+	for (i = 0; i < desc->out_num; i++) {
+		u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i,
+				(u32 *) in_env, (u32 *) out_env);
+		u32 copy;
+
+		if (out_size == UINT_MAX) {
+			dev_dbg(dev, "%s:%s unknown output size cmd: %s field: %d\n",
+					__func__, dimm_name, cmd_name, i);
+			return -EFAULT;
+		}
+		if (!access_ok(VERIFY_WRITE, p + in_len + out_len, out_size))
+			return -EFAULT;
+		if (out_len < sizeof(out_env))
+			copy = min_t(u32, sizeof(out_env) - out_len, out_size);
+		else
+			copy = 0;
+		if (copy && copy_from_user(&out_env[out_len],
+					p + in_len + out_len, copy))
+			return -EFAULT;
+		out_len += out_size;
+	}
+
+	buf_len = out_len + in_len;
+	if (!access_ok(VERIFY_WRITE, p, sizeof(buf_len)))
+		return -EFAULT;
+
+	if (buf_len > ND_IOCTL_MAX_BUFLEN) {
+		dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__,
+				dimm_name, cmd_name, buf_len,
+				ND_IOCTL_MAX_BUFLEN);
+		return -EINVAL;
+	}
+
+	buf = vmalloc(buf_len);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, p, buf_len)) {
+		rc = -EFAULT;
+		goto out;
+	}
+
+	rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len);
+	if (rc < 0)
+		goto out;
+	if (copy_to_user(p, buf, buf_len))
+		rc = -EFAULT;
+ out:
+	vfree(buf);
+	return rc;
+}
+
 static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	return -ENXIO;
+	long id = (long) file->private_data;
+	int rc = -ENXIO, read_only;
+	struct nvdimm_bus *nvdimm_bus;
+
+	read_only = (O_RDWR != (file->f_flags & O_ACCMODE));
+	mutex_lock(&nvdimm_bus_list_mutex);
+	list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+		if (nvdimm_bus->id == id) {
+			rc = __nd_ioctl(nvdimm_bus, NULL, read_only, cmd, arg);
+			break;
+		}
+	}
+	mutex_unlock(&nvdimm_bus_list_mutex);
+
+	return rc;
+}
+
+static int match_dimm(struct device *dev, void *data)
+{
+	long id = (long) data;
+
+	if (is_nvdimm(dev)) {
+		struct nvdimm *nvdimm = to_nvdimm(dev);
+
+		return nvdimm->id == id;
+	}
+
+	return 0;
+}
+
+static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int rc = -ENXIO, read_only;
+	struct nvdimm_bus *nvdimm_bus;
+
+	read_only = (O_RDWR != (file->f_flags & O_ACCMODE));
+	mutex_lock(&nvdimm_bus_list_mutex);
+	list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
+		struct device *dev = device_find_child(&nvdimm_bus->dev,
+				file->private_data, match_dimm);
+		struct nvdimm *nvdimm;
+
+		if (!dev)
+			continue;
+
+		nvdimm = to_nvdimm(dev);
+		rc = __nd_ioctl(nvdimm_bus, nvdimm, read_only, cmd, arg);
+		put_device(dev);
+		break;
+	}
+	mutex_unlock(&nvdimm_bus_list_mutex);
+
+	return rc;
+}
+
+static int nd_open(struct inode *inode, struct file *file)
+{
+	long minor = iminor(inode);
+
+	file->private_data = (void *) minor;
+	return 0;
 }
 
 static const struct file_operations nvdimm_bus_fops = {
 	.owner = THIS_MODULE,
-	.open = nonseekable_open,
+	.open = nd_open,
 	.unlocked_ioctl = nd_ioctl,
 	.compat_ioctl = nd_ioctl,
 	.llseek = noop_llseek,
 };
 
+static const struct file_operations nvdimm_fops = {
+	.owner = THIS_MODULE,
+	.open = nd_open,
+	.unlocked_ioctl = nvdimm_ioctl,
+	.compat_ioctl = nvdimm_ioctl,
+	.llseek = noop_llseek,
+};
+
 int __init nvdimm_bus_init(void)
 {
 	int rc;
@@ -70,9 +380,14 @@ int __init nvdimm_bus_init(void)
 
 	rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops);
 	if (rc < 0)
-		goto err_chrdev;
+		goto err_bus_chrdev;
 	nvdimm_bus_major = rc;
 
+	rc = register_chrdev(0, "dimmctl", &nvdimm_fops);
+	if (rc < 0)
+		goto err_dimm_chrdev;
+	nvdimm_major = rc;
+
 	nd_class = class_create(THIS_MODULE, "nd");
 	if (IS_ERR(nd_class))
 		goto err_class;
@@ -80,8 +395,10 @@ int __init nvdimm_bus_init(void)
 	return 0;
 
  err_class:
+	unregister_chrdev(nvdimm_major, "dimmctl");
+ err_dimm_chrdev:
 	unregister_chrdev(nvdimm_bus_major, "ndctl");
- err_chrdev:
+ err_bus_chrdev:
 	bus_unregister(&nvdimm_bus_type);
 
 	return rc;
@@ -91,5 +408,6 @@ void __exit nvdimm_bus_exit(void)
 {
 	class_destroy(nd_class);
 	unregister_chrdev(nvdimm_bus_major, "ndctl");
+	unregister_chrdev(nvdimm_major, "dimmctl");
 	bus_unregister(&nvdimm_bus_type);
 }
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index ef957eb37c90..1ce159095c52 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -14,6 +14,7 @@
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/ndctl.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include "nd-core.h"
@@ -61,6 +62,20 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
 	return NULL;
 }
 
+static ssize_t commands_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	int cmd, len = 0;
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+
+	for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG)
+		len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd));
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+static DEVICE_ATTR_RO(commands);
+
 static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus)
 {
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
@@ -84,6 +99,7 @@ static ssize_t provider_show(struct device *dev,
 static DEVICE_ATTR_RO(provider);
 
 static struct attribute *nvdimm_bus_attributes[] = {
+	&dev_attr_commands.attr,
 	&dev_attr_provider.attr,
 	NULL,
 };
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 51ea52cc2079..c3dd7227d1bb 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -12,6 +12,7 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/device.h>
+#include <linux/ndctl.h>
 #include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/fs.h>
@@ -33,7 +34,7 @@ static struct device_type nvdimm_device_type = {
 	.release = nvdimm_release,
 };
 
-static bool is_nvdimm(struct device *dev)
+bool is_nvdimm(struct device *dev)
 {
 	return dev->type == &nvdimm_device_type;
 }
@@ -55,12 +56,41 @@ EXPORT_SYMBOL_GPL(nvdimm_name);
 
 void *nvdimm_provider_data(struct nvdimm *nvdimm)
 {
-	return nvdimm->provider_data;
+	if (nvdimm)
+		return nvdimm->provider_data;
+	return NULL;
 }
 EXPORT_SYMBOL_GPL(nvdimm_provider_data);
 
+static ssize_t commands_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	int cmd, len = 0;
+
+	if (!nvdimm->dsm_mask)
+		return sprintf(buf, "\n");
+
+	for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG)
+		len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd));
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+static DEVICE_ATTR_RO(commands);
+
+static struct attribute *nvdimm_attributes[] = {
+	&dev_attr_commands.attr,
+	NULL,
+};
+
+struct attribute_group nvdimm_attribute_group = {
+	.attrs = nvdimm_attributes,
+};
+EXPORT_SYMBOL_GPL(nvdimm_attribute_group);
+
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
-		const struct attribute_group **groups, unsigned long flags)
+		const struct attribute_group **groups, unsigned long flags,
+		unsigned long *dsm_mask)
 {
 	struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
 	struct device *dev;
@@ -75,12 +105,14 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 	}
 	nvdimm->provider_data = provider_data;
 	nvdimm->flags = flags;
+	nvdimm->dsm_mask = dsm_mask;
 
 	dev = &nvdimm->dev;
 	dev_set_name(dev, "nmem%d", nvdimm->id);
 	dev->parent = &nvdimm_bus->dev;
 	dev->type = &nvdimm_device_type;
 	dev->bus = &nvdimm_bus_type;
+	dev->devt = MKDEV(nvdimm_major, nvdimm->id);
 	dev->groups = groups;
 	if (device_register(dev) != 0) {
 		put_device(dev);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 9b8303413b60..59528b3c9de8 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -18,6 +18,7 @@
 extern struct list_head nvdimm_bus_list;
 extern struct mutex nvdimm_bus_list_mutex;
 extern struct bus_type nvdimm_bus_type;
+extern int nvdimm_major;
 
 struct nvdimm_bus {
 	struct nvdimm_bus_descriptor *nd_desc;
@@ -29,10 +30,12 @@ struct nvdimm_bus {
 struct nvdimm {
 	unsigned long flags;
 	void *provider_data;
+	unsigned long *dsm_mask;
 	struct device dev;
 	int id;
 };
 
+bool is_nvdimm(struct device *dev);
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void __exit nvdimm_bus_exit(void);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 07787f0dd7de..a39235819af3 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -14,13 +14,22 @@
  */
 #ifndef __LIBNVDIMM_H__
 #define __LIBNVDIMM_H__
+#include <linux/sizes.h>
+#include <linux/types.h>
 
 enum {
 	/* when a dimm supports both PMEM and BLK access a label is required */
 	NDD_ALIASING = 1 << 0,
+
+	/* need to set a limit somewhere, but yes, this is likely overkill */
+	ND_IOCTL_MAX_BUFLEN = SZ_4M,
+	ND_CMD_MAX_ELEM = 4,
+	ND_CMD_MAX_ENVELOPE = 16,
+	ND_CMD_ARS_STATUS_MAX = SZ_4K,
 };
 
 extern struct attribute_group nvdimm_bus_attribute_group;
+extern struct attribute_group nvdimm_attribute_group;
 
 struct nvdimm;
 struct nvdimm_bus_descriptor;
@@ -35,6 +44,14 @@ struct nvdimm_bus_descriptor {
 	ndctl_fn ndctl;
 };
 
+struct nd_cmd_desc {
+	int in_num;
+	int out_num;
+	u32 in_sizes[ND_CMD_MAX_ELEM];
+	int out_sizes[ND_CMD_MAX_ELEM];
+};
+
+struct nvdimm_bus;
 struct device;
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc);
@@ -45,5 +62,13 @@ struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
 const char *nvdimm_name(struct nvdimm *nvdimm);
 void *nvdimm_provider_data(struct nvdimm *nvdimm);
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
-		const struct attribute_group **groups, unsigned long flags);
+		const struct attribute_group **groups, unsigned long flags,
+		unsigned long *dsm_mask);
+const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
+const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
+u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
+		const struct nd_cmd_desc *desc, int idx, void *buf);
+u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
+		const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
+		const u32 *out_field);
 #endif /* __LIBNVDIMM_H__ */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1a0006a76b00..200cc5ea2998 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -271,6 +271,7 @@ header-y += ncp_fs.h
 header-y += ncp.h
 header-y += ncp_mount.h
 header-y += ncp_no.h
+header-y += ndctl.h
 header-y += neighbour.h
 header-y += netconf.h
 header-y += netdevice.h
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
new file mode 100644
index 000000000000..ff13c23b26df
--- /dev/null
+++ b/include/uapi/linux/ndctl.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU Lesser General Public License,
+ * version 2.1, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
+ * more details.
+ */
+#ifndef __NDCTL_H__
+#define __NDCTL_H__
+
+#include <linux/types.h>
+
+struct nd_cmd_smart {
+	__u32 status;
+	__u8 data[128];
+} __packed;
+
+struct nd_cmd_smart_threshold {
+	__u32 status;
+	__u8 data[8];
+} __packed;
+
+struct nd_cmd_dimm_flags {
+	__u32 status;
+	__u32 flags;
+} __packed;
+
+struct nd_cmd_get_config_size {
+	__u32 status;
+	__u32 config_size;
+	__u32 max_xfer;
+} __packed;
+
+struct nd_cmd_get_config_data_hdr {
+	__u32 in_offset;
+	__u32 in_length;
+	__u32 status;
+	__u8 out_buf[0];
+} __packed;
+
+struct nd_cmd_set_config_hdr {
+	__u32 in_offset;
+	__u32 in_length;
+	__u8 in_buf[0];
+} __packed;
+
+struct nd_cmd_vendor_hdr {
+	__u32 opcode;
+	__u32 in_length;
+	__u8 in_buf[0];
+} __packed;
+
+struct nd_cmd_vendor_tail {
+	__u32 status;
+	__u32 out_length;
+	__u8 out_buf[0];
+} __packed;
+
+struct nd_cmd_ars_cap {
+	__u64 address;
+	__u64 length;
+	__u32 status;
+	__u32 max_ars_out;
+} __packed;
+
+struct nd_cmd_ars_start {
+	__u64 address;
+	__u64 length;
+	__u16 type;
+	__u8 reserved[6];
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_status {
+	__u32 status;
+	__u32 out_length;
+	__u64 address;
+	__u64 length;
+	__u16 type;
+	__u32 num_records;
+	struct nd_ars_record {
+		__u32 handle;
+		__u32 flags;
+		__u64 err_address;
+		__u64 mask;
+	} __packed records[0];
+} __packed;
+
+enum {
+	ND_CMD_IMPLEMENTED = 0,
+
+	/* bus commands */
+	ND_CMD_ARS_CAP = 1,
+	ND_CMD_ARS_START = 2,
+	ND_CMD_ARS_STATUS = 3,
+
+	/* per-dimm commands */
+	ND_CMD_SMART = 1,
+	ND_CMD_SMART_THRESHOLD = 2,
+	ND_CMD_DIMM_FLAGS = 3,
+	ND_CMD_GET_CONFIG_SIZE = 4,
+	ND_CMD_GET_CONFIG_DATA = 5,
+	ND_CMD_SET_CONFIG_DATA = 6,
+	ND_CMD_VENDOR_EFFECT_LOG_SIZE = 7,
+	ND_CMD_VENDOR_EFFECT_LOG = 8,
+	ND_CMD_VENDOR = 9,
+};
+
+static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
+{
+	static const char * const names[] = {
+		[ND_CMD_ARS_CAP] = "ars_cap",
+		[ND_CMD_ARS_START] = "ars_start",
+		[ND_CMD_ARS_STATUS] = "ars_status",
+	};
+
+	if (cmd < ARRAY_SIZE(names) && names[cmd])
+		return names[cmd];
+	return "unknown";
+}
+
+static inline const char *nvdimm_cmd_name(unsigned cmd)
+{
+	static const char * const names[] = {
+		[ND_CMD_SMART] = "smart",
+		[ND_CMD_SMART_THRESHOLD] = "smart_thresh",
+		[ND_CMD_DIMM_FLAGS] = "flags",
+		[ND_CMD_GET_CONFIG_SIZE] = "get_size",
+		[ND_CMD_GET_CONFIG_DATA] = "get_data",
+		[ND_CMD_SET_CONFIG_DATA] = "set_data",
+		[ND_CMD_VENDOR_EFFECT_LOG_SIZE] = "effect_size",
+		[ND_CMD_VENDOR_EFFECT_LOG] = "effect_log",
+		[ND_CMD_VENDOR] = "vendor",
+	};
+
+	if (cmd < ARRAY_SIZE(names) && names[cmd])
+		return names[cmd];
+	return "unknown";
+}
+
+#define ND_IOCTL 'N'
+
+#define ND_IOCTL_SMART			_IOWR(ND_IOCTL, ND_CMD_SMART,\
+					struct nd_cmd_smart)
+
+#define ND_IOCTL_SMART_THRESHOLD	_IOWR(ND_IOCTL, ND_CMD_SMART_THRESHOLD,\
+					struct nd_cmd_smart_threshold)
+
+#define ND_IOCTL_DIMM_FLAGS		_IOWR(ND_IOCTL, ND_CMD_DIMM_FLAGS,\
+					struct nd_cmd_dimm_flags)
+
+#define ND_IOCTL_GET_CONFIG_SIZE	_IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_SIZE,\
+					struct nd_cmd_get_config_size)
+
+#define ND_IOCTL_GET_CONFIG_DATA	_IOWR(ND_IOCTL, ND_CMD_GET_CONFIG_DATA,\
+					struct nd_cmd_get_config_data_hdr)
+
+#define ND_IOCTL_SET_CONFIG_DATA	_IOWR(ND_IOCTL, ND_CMD_SET_CONFIG_DATA,\
+					struct nd_cmd_set_config_hdr)
+
+#define ND_IOCTL_VENDOR			_IOWR(ND_IOCTL, ND_CMD_VENDOR,\
+					struct nd_cmd_vendor_hdr)
+
+#define ND_IOCTL_ARS_CAP		_IOWR(ND_IOCTL, ND_CMD_ARS_CAP,\
+					struct nd_cmd_ars_cap)
+
+#define ND_IOCTL_ARS_START		_IOWR(ND_IOCTL, ND_CMD_ARS_START,\
+					struct nd_cmd_ars_start)
+
+#define ND_IOCTL_ARS_STATUS		_IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\
+					struct nd_cmd_ars_status)
+
+#endif /* __NDCTL_H__ */
-- 
cgit v1.2.3


From 4d88a97aa9e8cfa6460aab119c5da60ad2267423 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 31 May 2015 14:41:48 -0400
Subject: libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver
 infrastructure

* Implement the device-model infrastructure for loading modules and
  attaching drivers to nvdimm devices.  This is a simple association of a
  nd-device-type number with a driver that has a bitmask of supported
  device types.  To facilitate userspace bind/unbind operations 'modalias'
  and 'devtype', that also appear in the uevent, are added as generic
  sysfs attributes for all nvdimm devices.  The reason for the device-type
  number is to support sub-types within a given parent devtype, be it a
  vendor-specific sub-type or otherwise.

* The first consumer of this infrastructure is the driver
  for dimm devices.  It simply uses control messages to retrieve and
  store the configuration-data image (label set) from each dimm.

Note: nd_device_register() arranges for asynchronous registration of
      nvdimm bus devices by default.

Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Neil Brown <neilb@suse.de>
Acked-by: Christoph Hellwig <hch@lst.de>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c        |  13 +++-
 drivers/nvdimm/Makefile    |   1 +
 drivers/nvdimm/bus.c       | 168 ++++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvdimm/core.c      |  43 +++++++++++-
 drivers/nvdimm/dimm.c      |  92 +++++++++++++++++++++++++
 drivers/nvdimm/dimm_devs.c | 136 ++++++++++++++++++++++++++++++++++--
 drivers/nvdimm/nd-core.h   |   6 +-
 drivers/nvdimm/nd.h        |  36 ++++++++++
 include/linux/libnvdimm.h  |   2 +
 include/linux/nd.h         |  39 +++++++++++
 include/uapi/linux/ndctl.h |   6 ++
 11 files changed, 527 insertions(+), 15 deletions(-)
 create mode 100644 drivers/nvdimm/dimm.c
 create mode 100644 drivers/nvdimm/nd.h
 create mode 100644 include/linux/nd.h

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 9112a6210a4b..c4ccec1bc60b 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -18,6 +18,10 @@
 #include <linux/acpi.h>
 #include "nfit.h"
 
+static bool force_enable_dimms;
+module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status");
+
 static u8 nfit_uuid[NFIT_UUID_MAX][16];
 
 static const u8 *to_nfit_uuid(enum nfit_uuids id)
@@ -633,6 +637,7 @@ static struct attribute_group acpi_nfit_dimm_attribute_group = {
 
 static const struct attribute_group *acpi_nfit_dimm_attribute_groups[] = {
 	&nvdimm_attribute_group,
+	&nd_device_attribute_group,
 	&acpi_nfit_dimm_attribute_group,
 	NULL,
 };
@@ -669,7 +674,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	if (!adev_dimm) {
 		dev_err(dev, "no ACPI.NFIT device with _ADR %#x, disabling...\n",
 				device_handle);
-		return -ENODEV;
+		return force_enable_dimms ? 0 : -ENODEV;
 	}
 
 	status = acpi_evaluate_integer(adev_dimm->handle, "_STA", NULL, &sta);
@@ -690,12 +695,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 		if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i))
 			set_bit(i, &nfit_mem->dsm_mask);
 
-	return rc;
+	return force_enable_dimms ? 0 : rc;
 }
 
 static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 {
 	struct nfit_mem *nfit_mem;
+	int dimm_count = 0;
 
 	list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
 		struct nvdimm *nvdimm;
@@ -729,9 +735,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 			return -ENOMEM;
 
 		nfit_mem->nvdimm = nvdimm;
+		dimm_count++;
 	}
 
-	return 0;
+	return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count);
 }
 
 static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 5b68738ba406..d44b5c1fcd3b 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
+libnvdimm-y += dimm.o
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 15f3a3ddc225..a0308f1872bf 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -16,19 +16,183 @@
 #include <linux/fcntl.h>
 #include <linux/async.h>
 #include <linux/ndctl.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/nd.h>
 #include "nd-core.h"
+#include "nd.h"
 
 int nvdimm_major;
 static int nvdimm_bus_major;
 static struct class *nd_class;
 
-struct bus_type nvdimm_bus_type = {
+static int to_nd_device_type(struct device *dev)
+{
+	if (is_nvdimm(dev))
+		return ND_DEVICE_DIMM;
+
+	return 0;
+}
+
+static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT,
+			to_nd_device_type(dev));
+}
+
+static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct nd_device_driver *nd_drv = to_nd_device_driver(drv);
+
+	return test_bit(to_nd_device_type(dev), &nd_drv->type);
+}
+
+static int nvdimm_bus_probe(struct device *dev)
+{
+	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+	int rc;
+
+	rc = nd_drv->probe(dev);
+	dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
+			dev_name(dev), rc);
+	return rc;
+}
+
+static int nvdimm_bus_remove(struct device *dev)
+{
+	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+	int rc;
+
+	rc = nd_drv->remove(dev);
+	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
+			dev_name(dev), rc);
+	return rc;
+}
+
+static struct bus_type nvdimm_bus_type = {
 	.name = "nd",
+	.uevent = nvdimm_bus_uevent,
+	.match = nvdimm_bus_match,
+	.probe = nvdimm_bus_probe,
+	.remove = nvdimm_bus_remove,
+};
+
+static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain);
+
+void nd_synchronize(void)
+{
+	async_synchronize_full_domain(&nd_async_domain);
+}
+EXPORT_SYMBOL_GPL(nd_synchronize);
+
+static void nd_async_device_register(void *d, async_cookie_t cookie)
+{
+	struct device *dev = d;
+
+	if (device_add(dev) != 0) {
+		dev_err(dev, "%s: failed\n", __func__);
+		put_device(dev);
+	}
+	put_device(dev);
+}
+
+static void nd_async_device_unregister(void *d, async_cookie_t cookie)
+{
+	struct device *dev = d;
+
+	device_unregister(dev);
+	put_device(dev);
+}
+
+void nd_device_register(struct device *dev)
+{
+	dev->bus = &nvdimm_bus_type;
+	device_initialize(dev);
+	get_device(dev);
+	async_schedule_domain(nd_async_device_register, dev,
+			&nd_async_domain);
+}
+EXPORT_SYMBOL(nd_device_register);
+
+void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
+{
+	switch (mode) {
+	case ND_ASYNC:
+		get_device(dev);
+		async_schedule_domain(nd_async_device_unregister, dev,
+				&nd_async_domain);
+		break;
+	case ND_SYNC:
+		nd_synchronize();
+		device_unregister(dev);
+		break;
+	}
+}
+EXPORT_SYMBOL(nd_device_unregister);
+
+/**
+ * __nd_driver_register() - register a region or a namespace driver
+ * @nd_drv: driver to register
+ * @owner: automatically set by nd_driver_register() macro
+ * @mod_name: automatically set by nd_driver_register() macro
+ */
+int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner,
+		const char *mod_name)
+{
+	struct device_driver *drv = &nd_drv->drv;
+
+	if (!nd_drv->type) {
+		pr_debug("driver type bitmask not set (%pf)\n",
+				__builtin_return_address(0));
+		return -EINVAL;
+	}
+
+	if (!nd_drv->probe || !nd_drv->remove) {
+		pr_debug("->probe() and ->remove() must be specified\n");
+		return -EINVAL;
+	}
+
+	drv->bus = &nvdimm_bus_type;
+	drv->owner = owner;
+	drv->mod_name = mod_name;
+
+	return driver_register(drv);
+}
+EXPORT_SYMBOL(__nd_driver_register);
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n",
+			to_nd_device_type(dev));
+}
+static DEVICE_ATTR_RO(modalias);
+
+static ssize_t devtype_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	return sprintf(buf, "%s\n", dev->type->name);
+}
+static DEVICE_ATTR_RO(devtype);
+
+static struct attribute *nd_device_attributes[] = {
+	&dev_attr_modalias.attr,
+	&dev_attr_devtype.attr,
+	NULL,
+};
+
+/**
+ * nd_device_attribute_group - generic attributes for all devices on an nd bus
+ */
+struct attribute_group nd_device_attribute_group = {
+	.attrs = nd_device_attributes,
 };
+EXPORT_SYMBOL_GPL(nd_device_attribute_group);
 
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
 {
@@ -404,7 +568,7 @@ int __init nvdimm_bus_init(void)
 	return rc;
 }
 
-void __exit nvdimm_bus_exit(void)
+void nvdimm_bus_exit(void)
 {
 	class_destroy(nd_class);
 	unregister_chrdev(nvdimm_bus_major, "ndctl");
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 1ce159095c52..50ab880f0dc0 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include "nd-core.h"
+#include "nd.h"
 
 LIST_HEAD(nvdimm_bus_list);
 DEFINE_MUTEX(nvdimm_bus_list_mutex);
@@ -98,8 +99,33 @@ static ssize_t provider_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(provider);
 
+static int flush_namespaces(struct device *dev, void *data)
+{
+	device_lock(dev);
+	device_unlock(dev);
+	return 0;
+}
+
+static int flush_regions_dimms(struct device *dev, void *data)
+{
+	device_lock(dev);
+	device_unlock(dev);
+	device_for_each_child(dev, NULL, flush_namespaces);
+	return 0;
+}
+
+static ssize_t wait_probe_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	nd_synchronize();
+	device_for_each_child(dev, NULL, flush_regions_dimms);
+	return sprintf(buf, "1\n");
+}
+static DEVICE_ATTR_RO(wait_probe);
+
 static struct attribute *nvdimm_bus_attributes[] = {
 	&dev_attr_commands.attr,
+	&dev_attr_wait_probe.attr,
 	&dev_attr_provider.attr,
 	NULL,
 };
@@ -161,7 +187,7 @@ static int child_unregister(struct device *dev, void *data)
 	if (dev->class)
 		/* pass */;
 	else
-		device_unregister(dev);
+		nd_device_unregister(dev, ND_SYNC);
 	return 0;
 }
 
@@ -174,6 +200,7 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
 	list_del_init(&nvdimm_bus->list);
 	mutex_unlock(&nvdimm_bus_list_mutex);
 
+	nd_synchronize();
 	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
 
@@ -183,12 +210,24 @@ EXPORT_SYMBOL_GPL(nvdimm_bus_unregister);
 
 static __init int libnvdimm_init(void)
 {
-	return nvdimm_bus_init();
+	int rc;
+
+	rc = nvdimm_bus_init();
+	if (rc)
+		return rc;
+	rc = nvdimm_init();
+	if (rc)
+		goto err_dimm;
+	return 0;
+ err_dimm:
+	nvdimm_bus_exit();
+	return rc;
 }
 
 static __exit void libnvdimm_exit(void)
 {
 	WARN_ON(!list_empty(&nvdimm_bus_list));
+	nvdimm_exit();
 	nvdimm_bus_exit();
 }
 
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
new file mode 100644
index 000000000000..28001a6ccd4e
--- /dev/null
+++ b/drivers/nvdimm/dimm.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include <linux/ndctl.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/nd.h>
+#include "nd.h"
+
+static void free_data(struct nvdimm_drvdata *ndd)
+{
+	if (!ndd)
+		return;
+
+	if (ndd->data && is_vmalloc_addr(ndd->data))
+		vfree(ndd->data);
+	else
+		kfree(ndd->data);
+	kfree(ndd);
+}
+
+static int nvdimm_probe(struct device *dev)
+{
+	struct nvdimm_drvdata *ndd;
+	int rc;
+
+	ndd = kzalloc(sizeof(*ndd), GFP_KERNEL);
+	if (!ndd)
+		return -ENOMEM;
+
+	dev_set_drvdata(dev, ndd);
+	ndd->dev = dev;
+
+	rc = nvdimm_init_nsarea(ndd);
+	if (rc)
+		goto err;
+
+	rc = nvdimm_init_config_data(ndd);
+	if (rc)
+		goto err;
+
+	dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size);
+
+	return 0;
+
+ err:
+	free_data(ndd);
+	return rc;
+}
+
+static int nvdimm_remove(struct device *dev)
+{
+	struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);
+
+	free_data(ndd);
+
+	return 0;
+}
+
+static struct nd_device_driver nvdimm_driver = {
+	.probe = nvdimm_probe,
+	.remove = nvdimm_remove,
+	.drv = {
+		.name = "nvdimm",
+	},
+	.type = ND_DRIVER_DIMM,
+};
+
+int __init nvdimm_init(void)
+{
+	return nd_driver_register(&nvdimm_driver);
+}
+
+void __exit nvdimm_exit(void)
+{
+	driver_unregister(&nvdimm_driver.drv);
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index c3dd7227d1bb..b3ae86f2e1da 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -11,6 +11,7 @@
  * General Public License for more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/vmalloc.h>
 #include <linux/device.h>
 #include <linux/ndctl.h>
 #include <linux/slab.h>
@@ -18,9 +19,115 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include "nd-core.h"
+#include "nd.h"
 
 static DEFINE_IDA(dimm_ida);
 
+/*
+ * Retrieve bus and dimm handle and return if this bus supports
+ * get_config_data commands
+ */
+static int __validate_dimm(struct nvdimm_drvdata *ndd)
+{
+	struct nvdimm *nvdimm;
+
+	if (!ndd)
+		return -EINVAL;
+
+	nvdimm = to_nvdimm(ndd->dev);
+
+	if (!nvdimm->dsm_mask)
+		return -ENXIO;
+	if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask))
+		return -ENXIO;
+
+	return 0;
+}
+
+static int validate_dimm(struct nvdimm_drvdata *ndd)
+{
+	int rc = __validate_dimm(ndd);
+
+	if (rc && ndd)
+		dev_dbg(ndd->dev, "%pf: %s error: %d\n",
+				__builtin_return_address(0), __func__, rc);
+	return rc;
+}
+
+/**
+ * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area
+ * @nvdimm: dimm to initialize
+ */
+int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
+{
+	struct nd_cmd_get_config_size *cmd = &ndd->nsarea;
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+	struct nvdimm_bus_descriptor *nd_desc;
+	int rc = validate_dimm(ndd);
+
+	if (rc)
+		return rc;
+
+	if (cmd->config_size)
+		return 0; /* already valid */
+
+	memset(cmd, 0, sizeof(*cmd));
+	nd_desc = nvdimm_bus->nd_desc;
+	return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+			ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd));
+}
+
+int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev);
+	struct nd_cmd_get_config_data_hdr *cmd;
+	struct nvdimm_bus_descriptor *nd_desc;
+	int rc = validate_dimm(ndd);
+	u32 max_cmd_size, config_size;
+	size_t offset;
+
+	if (rc)
+		return rc;
+
+	if (ndd->data)
+		return 0;
+
+	if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0)
+		return -ENXIO;
+
+	ndd->data = kmalloc(ndd->nsarea.config_size, GFP_KERNEL);
+	if (!ndd->data)
+		ndd->data = vmalloc(ndd->nsarea.config_size);
+
+	if (!ndd->data)
+		return -ENOMEM;
+
+	max_cmd_size = min_t(u32, PAGE_SIZE, ndd->nsarea.max_xfer);
+	cmd = kzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	nd_desc = nvdimm_bus->nd_desc;
+	for (config_size = ndd->nsarea.config_size, offset = 0;
+			config_size; config_size -= cmd->in_length,
+			offset += cmd->in_length) {
+		cmd->in_length = min(config_size, max_cmd_size);
+		cmd->in_offset = offset;
+		rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
+				ND_CMD_GET_CONFIG_DATA, cmd,
+				cmd->in_length + sizeof(*cmd));
+		if (rc || cmd->status) {
+			rc = -ENXIO;
+			break;
+		}
+		memcpy(ndd->data + offset, cmd->out_buf, cmd->in_length);
+	}
+	dev_dbg(ndd->dev, "%s: len: %zu rc: %d\n", __func__, offset, rc);
+	kfree(cmd);
+
+	return rc;
+}
+
 static void nvdimm_release(struct device *dev)
 {
 	struct nvdimm *nvdimm = to_nvdimm(dev);
@@ -111,14 +218,33 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 	dev_set_name(dev, "nmem%d", nvdimm->id);
 	dev->parent = &nvdimm_bus->dev;
 	dev->type = &nvdimm_device_type;
-	dev->bus = &nvdimm_bus_type;
 	dev->devt = MKDEV(nvdimm_major, nvdimm->id);
 	dev->groups = groups;
-	if (device_register(dev) != 0) {
-		put_device(dev);
-		return NULL;
-	}
+	nd_device_register(dev);
 
 	return nvdimm;
 }
 EXPORT_SYMBOL_GPL(nvdimm_create);
+
+static int count_dimms(struct device *dev, void *c)
+{
+	int *count = c;
+
+	if (is_nvdimm(dev))
+		(*count)++;
+	return 0;
+}
+
+int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count)
+{
+	int count = 0;
+	/* Flush any possible dimm registration failures */
+	nd_synchronize();
+
+	device_for_each_child(&nvdimm_bus->dev, &count, count_dimms);
+	dev_dbg(&nvdimm_bus->dev, "%s: count: %d\n", __func__, count);
+	if (count != dimm_count)
+		return -ENXIO;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 59528b3c9de8..f2004b790874 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -17,7 +17,6 @@
 
 extern struct list_head nvdimm_bus_list;
 extern struct mutex nvdimm_bus_list_mutex;
-extern struct bus_type nvdimm_bus_type;
 extern int nvdimm_major;
 
 struct nvdimm_bus {
@@ -35,10 +34,11 @@ struct nvdimm {
 	int id;
 };
 
-bool is_nvdimm(struct device *dev);
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
-void __exit nvdimm_bus_exit(void);
+void nvdimm_bus_exit(void);
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
 void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
+void nd_synchronize(void);
+bool is_nvdimm(struct device *dev);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
new file mode 100644
index 000000000000..1f7f6ecab0fc
--- /dev/null
+++ b/drivers/nvdimm/nd.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ND_H__
+#define __ND_H__
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/ndctl.h>
+
+struct nvdimm_drvdata {
+	struct device *dev;
+	struct nd_cmd_get_config_size nsarea;
+	void *data;
+};
+
+enum nd_async_mode {
+	ND_SYNC,
+	ND_ASYNC,
+};
+
+void nd_device_register(struct device *dev);
+void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
+int __init nvdimm_init(void);
+void nvdimm_exit(void);
+int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
+int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
+#endif /* __ND_H__ */
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index a39235819af3..d3ebccf4ea8b 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -30,6 +30,7 @@ enum {
 
 extern struct attribute_group nvdimm_bus_attribute_group;
 extern struct attribute_group nvdimm_attribute_group;
+extern struct attribute_group nd_device_attribute_group;
 
 struct nvdimm;
 struct nvdimm_bus_descriptor;
@@ -71,4 +72,5 @@ u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
 u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
 		const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
 		const u32 *out_field);
+int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count);
 #endif /* __LIBNVDIMM_H__ */
diff --git a/include/linux/nd.h b/include/linux/nd.h
new file mode 100644
index 000000000000..e074f67e53a3
--- /dev/null
+++ b/include/linux/nd.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __LINUX_ND_H__
+#define __LINUX_ND_H__
+#include <linux/ndctl.h>
+#include <linux/device.h>
+
+struct nd_device_driver {
+	struct device_driver drv;
+	unsigned long type;
+	int (*probe)(struct device *dev);
+	int (*remove)(struct device *dev);
+};
+
+static inline struct nd_device_driver *to_nd_device_driver(
+		struct device_driver *drv)
+{
+	return container_of(drv, struct nd_device_driver, drv);
+}
+
+#define MODULE_ALIAS_ND_DEVICE(type) \
+	MODULE_ALIAS("nd:t" __stringify(type) "*")
+#define ND_DEVICE_MODALIAS_FMT "nd:t%d"
+
+int __must_check __nd_driver_register(struct nd_device_driver *nd_drv,
+		struct module *module, const char *mod_name);
+#define nd_driver_register(driver) \
+	__nd_driver_register(driver, THIS_MODULE, KBUILD_MODNAME)
+#endif /* __LINUX_ND_H__ */
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index ff13c23b26df..37640916d146 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -175,4 +175,10 @@ static inline const char *nvdimm_cmd_name(unsigned cmd)
 #define ND_IOCTL_ARS_STATUS		_IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\
 					struct nd_cmd_ars_status)
 
+
+#define ND_DEVICE_DIMM 1            /* nd_dimm: container for "config data" */
+
+enum nd_driver_flags {
+	ND_DRIVER_DIMM            = 1 << ND_DEVICE_DIMM,
+};
 #endif /* __NDCTL_H__ */
-- 
cgit v1.2.3


From 1f7df6f88b9245a7f2d0f8ecbc97dc88c8d0d8e1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 9 Jun 2015 20:13:14 -0400
Subject: libnvdimm, nfit: regions (block-data-window, persistent memory,
 volatile memory)

A "region" device represents the maximum capacity of a BLK range (mmio
block-data-window(s)), or a PMEM range (DAX-capable persistent memory or
volatile memory), without regard for aliasing.  Aliasing, in the
dimm-local address space (DPA), is resolved by metadata on a dimm to
designate which exclusive interface will access the aliased DPA ranges.
Support for the per-dimm metadata/label arrvies is in a subsequent
patch.

The name format of "region" devices is "regionN" where, like dimms, N is
a global ida index assigned at discovery time.  This id is not reliable
across reboots nor in the presence of hotplug.  Look to attributes of
the region or static id-data of the sub-namespace to generate a
persistent name.  However, if the platform configuration does not change
it is reasonable to expect the same region id to be assigned at the next
boot.

"region"s have 2 generic attributes "size", and "mapping"s where:
- size: the BLK accessible capacity or the span of the
  system physical address range in the case of PMEM.

- mappingN: a tuple describing a dimm's contribution to the region's
  capacity in the format (<nmemX>,<dpa>,<size>).  For a PMEM-region
  there will be at least one mapping per dimm in the interleave set.  For
  a BLK-region there is only "mapping0" listing the starting DPA of the
  BLK-region and the available DPA capacity of that space (matches "size"
  above).

The max number of mappings per "region" is hard coded per the
constraints of sysfs attribute groups.  That said the number of mappings
per region should never exceed the maximum number of possible dimms in
the system.  If the current number turns out to not be enough then the
"mappings" attribute clarifies how many there are supposed to be. "32
should be enough for anybody...".

Cc: Neil Brown <neilb@suse.de>
Cc: <linux-acpi@vger.kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c          | 148 ++++++++++++++++++++-
 drivers/nvdimm/Makefile      |   1 +
 drivers/nvdimm/nd-core.h     |   3 +
 drivers/nvdimm/nd.h          |  11 ++
 drivers/nvdimm/region_devs.c | 297 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/libnvdimm.h    |  25 ++++
 6 files changed, 484 insertions(+), 1 deletion(-)
 create mode 100644 drivers/nvdimm/region_devs.c

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index c4ccec1bc60b..068f69d70c9e 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -757,11 +757,153 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 			set_bit(i, &nd_desc->dsm_mask);
 }
 
+static ssize_t range_index_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
+
+	return sprintf(buf, "%d\n", nfit_spa->spa->range_index);
+}
+static DEVICE_ATTR_RO(range_index);
+
+static struct attribute *acpi_nfit_region_attributes[] = {
+	&dev_attr_range_index.attr,
+	NULL,
+};
+
+static struct attribute_group acpi_nfit_region_attribute_group = {
+	.name = "nfit",
+	.attrs = acpi_nfit_region_attributes,
+};
+
+static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
+	&nd_region_attribute_group,
+	&nd_mapping_attribute_group,
+	&acpi_nfit_region_attribute_group,
+	NULL,
+};
+
+static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
+		struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
+		struct acpi_nfit_memory_map *memdev,
+		struct acpi_nfit_system_address *spa)
+{
+	struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc,
+			memdev->device_handle);
+	struct nfit_mem *nfit_mem;
+	int blk_valid = 0;
+
+	if (!nvdimm) {
+		dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n",
+				spa->range_index, memdev->device_handle);
+		return -ENODEV;
+	}
+
+	nd_mapping->nvdimm = nvdimm;
+	switch (nfit_spa_type(spa)) {
+	case NFIT_SPA_PM:
+	case NFIT_SPA_VOLATILE:
+		nd_mapping->start = memdev->address;
+		nd_mapping->size = memdev->region_size;
+		break;
+	case NFIT_SPA_DCR:
+		nfit_mem = nvdimm_provider_data(nvdimm);
+		if (!nfit_mem || !nfit_mem->bdw) {
+			dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n",
+					spa->range_index, nvdimm_name(nvdimm));
+		} else {
+			nd_mapping->size = nfit_mem->bdw->capacity;
+			nd_mapping->start = nfit_mem->bdw->start_address;
+			blk_valid = 1;
+		}
+
+		ndr_desc->nd_mapping = nd_mapping;
+		ndr_desc->num_mappings = blk_valid;
+		if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc))
+			return -ENOMEM;
+		break;
+	}
+
+	return 0;
+}
+
+static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
+		struct nfit_spa *nfit_spa)
+{
+	static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS];
+	struct acpi_nfit_system_address *spa = nfit_spa->spa;
+	struct nfit_memdev *nfit_memdev;
+	struct nd_region_desc ndr_desc;
+	struct nvdimm_bus *nvdimm_bus;
+	struct resource res;
+	int count = 0;
+
+	if (spa->range_index == 0) {
+		dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n",
+				__func__);
+		return 0;
+	}
+
+	memset(&res, 0, sizeof(res));
+	memset(&nd_mappings, 0, sizeof(nd_mappings));
+	memset(&ndr_desc, 0, sizeof(ndr_desc));
+	res.start = spa->address;
+	res.end = res.start + spa->length - 1;
+	ndr_desc.res = &res;
+	ndr_desc.provider_data = nfit_spa;
+	ndr_desc.attr_groups = acpi_nfit_region_attribute_groups;
+	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+		struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
+		struct nd_mapping *nd_mapping;
+		int rc;
+
+		if (memdev->range_index != spa->range_index)
+			continue;
+		if (count >= ND_MAX_MAPPINGS) {
+			dev_err(acpi_desc->dev, "spa%d exceeds max mappings %d\n",
+					spa->range_index, ND_MAX_MAPPINGS);
+			return -ENXIO;
+		}
+		nd_mapping = &nd_mappings[count++];
+		rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, &ndr_desc,
+				memdev, spa);
+		if (rc)
+			return rc;
+	}
+
+	ndr_desc.nd_mapping = nd_mappings;
+	ndr_desc.num_mappings = count;
+	nvdimm_bus = acpi_desc->nvdimm_bus;
+	if (nfit_spa_type(spa) == NFIT_SPA_PM) {
+		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+			return -ENOMEM;
+	} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
+		if (!nvdimm_volatile_region_create(nvdimm_bus, &ndr_desc))
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
+{
+	struct nfit_spa *nfit_spa;
+
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+		int rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
+
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
+
 static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 {
 	struct device *dev = acpi_desc->dev;
 	const void *end;
 	u8 *data;
+	int rc;
 
 	INIT_LIST_HEAD(&acpi_desc->spas);
 	INIT_LIST_HEAD(&acpi_desc->dcrs);
@@ -786,7 +928,11 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 
 	acpi_nfit_init_dsms(acpi_desc);
 
-	return acpi_nfit_register_dimms(acpi_desc);
+	rc = acpi_nfit_register_dimms(acpi_desc);
+	if (rc)
+		return rc;
+
+	return acpi_nfit_register_regions(acpi_desc);
 }
 
 static int acpi_nfit_add(struct acpi_device *adev)
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index d44b5c1fcd3b..88afd0d849c3 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -4,3 +4,4 @@ libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
 libnvdimm-y += dimm.o
+libnvdimm-y += region_devs.o
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index f2004b790874..1d760bf24857 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -40,5 +40,8 @@ void nvdimm_bus_exit(void);
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
 void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
 void nd_synchronize(void);
+int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus);
+int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus);
+int nd_match_dimm(struct device *dev, void *data);
 bool is_nvdimm(struct device *dev);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 1f7f6ecab0fc..ea0cca337aa6 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -12,6 +12,7 @@
  */
 #ifndef __ND_H__
 #define __ND_H__
+#include <linux/libnvdimm.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
 #include <linux/ndctl.h>
@@ -22,6 +23,16 @@ struct nvdimm_drvdata {
 	void *data;
 };
 
+struct nd_region {
+	struct device dev;
+	u16 ndr_mappings;
+	u64 ndr_size;
+	u64 ndr_start;
+	int id;
+	void *provider_data;
+	struct nd_mapping mapping[0];
+};
+
 enum nd_async_mode {
 	ND_SYNC,
 	ND_ASYNC,
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
new file mode 100644
index 000000000000..4bda2e0df8f7
--- /dev/null
+++ b/drivers/nvdimm/region_devs.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/slab.h>
+#include <linux/io.h>
+#include "nd-core.h"
+#include "nd.h"
+
+static DEFINE_IDA(region_ida);
+
+static void nd_region_release(struct device *dev)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	u16 i;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+		put_device(&nvdimm->dev);
+	}
+	ida_simple_remove(&region_ida, nd_region->id);
+	kfree(nd_region);
+}
+
+static struct device_type nd_blk_device_type = {
+	.name = "nd_blk",
+	.release = nd_region_release,
+};
+
+static struct device_type nd_pmem_device_type = {
+	.name = "nd_pmem",
+	.release = nd_region_release,
+};
+
+static struct device_type nd_volatile_device_type = {
+	.name = "nd_volatile",
+	.release = nd_region_release,
+};
+
+static bool is_nd_pmem(struct device *dev)
+{
+	return dev ? dev->type == &nd_pmem_device_type : false;
+}
+
+struct nd_region *to_nd_region(struct device *dev)
+{
+	struct nd_region *nd_region = container_of(dev, struct nd_region, dev);
+
+	WARN_ON(dev->type->release != nd_region_release);
+	return nd_region;
+}
+EXPORT_SYMBOL_GPL(to_nd_region);
+
+static ssize_t size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	unsigned long long size = 0;
+
+	if (is_nd_pmem(dev)) {
+		size = nd_region->ndr_size;
+	} else if (nd_region->ndr_mappings == 1) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+
+		size = nd_mapping->size;
+	}
+
+	return sprintf(buf, "%llu\n", size);
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t mappings_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	return sprintf(buf, "%d\n", nd_region->ndr_mappings);
+}
+static DEVICE_ATTR_RO(mappings);
+
+static struct attribute *nd_region_attributes[] = {
+	&dev_attr_size.attr,
+	&dev_attr_mappings.attr,
+	NULL,
+};
+
+struct attribute_group nd_region_attribute_group = {
+	.attrs = nd_region_attributes,
+};
+EXPORT_SYMBOL_GPL(nd_region_attribute_group);
+
+static ssize_t mappingN(struct device *dev, char *buf, int n)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	struct nd_mapping *nd_mapping;
+	struct nvdimm *nvdimm;
+
+	if (n >= nd_region->ndr_mappings)
+		return -ENXIO;
+	nd_mapping = &nd_region->mapping[n];
+	nvdimm = nd_mapping->nvdimm;
+
+	return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev),
+			nd_mapping->start, nd_mapping->size);
+}
+
+#define REGION_MAPPING(idx) \
+static ssize_t mapping##idx##_show(struct device *dev,		\
+		struct device_attribute *attr, char *buf)	\
+{								\
+	return mappingN(dev, buf, idx);				\
+}								\
+static DEVICE_ATTR_RO(mapping##idx)
+
+/*
+ * 32 should be enough for a while, even in the presence of socket
+ * interleave a 32-way interleave set is a degenerate case.
+ */
+REGION_MAPPING(0);
+REGION_MAPPING(1);
+REGION_MAPPING(2);
+REGION_MAPPING(3);
+REGION_MAPPING(4);
+REGION_MAPPING(5);
+REGION_MAPPING(6);
+REGION_MAPPING(7);
+REGION_MAPPING(8);
+REGION_MAPPING(9);
+REGION_MAPPING(10);
+REGION_MAPPING(11);
+REGION_MAPPING(12);
+REGION_MAPPING(13);
+REGION_MAPPING(14);
+REGION_MAPPING(15);
+REGION_MAPPING(16);
+REGION_MAPPING(17);
+REGION_MAPPING(18);
+REGION_MAPPING(19);
+REGION_MAPPING(20);
+REGION_MAPPING(21);
+REGION_MAPPING(22);
+REGION_MAPPING(23);
+REGION_MAPPING(24);
+REGION_MAPPING(25);
+REGION_MAPPING(26);
+REGION_MAPPING(27);
+REGION_MAPPING(28);
+REGION_MAPPING(29);
+REGION_MAPPING(30);
+REGION_MAPPING(31);
+
+static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	if (n < nd_region->ndr_mappings)
+		return a->mode;
+	return 0;
+}
+
+static struct attribute *mapping_attributes[] = {
+	&dev_attr_mapping0.attr,
+	&dev_attr_mapping1.attr,
+	&dev_attr_mapping2.attr,
+	&dev_attr_mapping3.attr,
+	&dev_attr_mapping4.attr,
+	&dev_attr_mapping5.attr,
+	&dev_attr_mapping6.attr,
+	&dev_attr_mapping7.attr,
+	&dev_attr_mapping8.attr,
+	&dev_attr_mapping9.attr,
+	&dev_attr_mapping10.attr,
+	&dev_attr_mapping11.attr,
+	&dev_attr_mapping12.attr,
+	&dev_attr_mapping13.attr,
+	&dev_attr_mapping14.attr,
+	&dev_attr_mapping15.attr,
+	&dev_attr_mapping16.attr,
+	&dev_attr_mapping17.attr,
+	&dev_attr_mapping18.attr,
+	&dev_attr_mapping19.attr,
+	&dev_attr_mapping20.attr,
+	&dev_attr_mapping21.attr,
+	&dev_attr_mapping22.attr,
+	&dev_attr_mapping23.attr,
+	&dev_attr_mapping24.attr,
+	&dev_attr_mapping25.attr,
+	&dev_attr_mapping26.attr,
+	&dev_attr_mapping27.attr,
+	&dev_attr_mapping28.attr,
+	&dev_attr_mapping29.attr,
+	&dev_attr_mapping30.attr,
+	&dev_attr_mapping31.attr,
+	NULL,
+};
+
+struct attribute_group nd_mapping_attribute_group = {
+	.is_visible = mapping_visible,
+	.attrs = mapping_attributes,
+};
+EXPORT_SYMBOL_GPL(nd_mapping_attribute_group);
+
+void *nd_region_provider_data(struct nd_region *nd_region)
+{
+	return nd_region->provider_data;
+}
+EXPORT_SYMBOL_GPL(nd_region_provider_data);
+
+static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
+		struct nd_region_desc *ndr_desc, struct device_type *dev_type,
+		const char *caller)
+{
+	struct nd_region *nd_region;
+	struct device *dev;
+	u16 i;
+
+	for (i = 0; i < ndr_desc->num_mappings; i++) {
+		struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
+		struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+		if ((nd_mapping->start | nd_mapping->size) % SZ_4K) {
+			dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
+					caller, dev_name(&nvdimm->dev), i);
+
+			return NULL;
+		}
+	}
+
+	nd_region = kzalloc(sizeof(struct nd_region)
+			+ sizeof(struct nd_mapping) * ndr_desc->num_mappings,
+			GFP_KERNEL);
+	if (!nd_region)
+		return NULL;
+	nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL);
+	if (nd_region->id < 0) {
+		kfree(nd_region);
+		return NULL;
+	}
+
+	memcpy(nd_region->mapping, ndr_desc->nd_mapping,
+			sizeof(struct nd_mapping) * ndr_desc->num_mappings);
+	for (i = 0; i < ndr_desc->num_mappings; i++) {
+		struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
+		struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+		get_device(&nvdimm->dev);
+	}
+	nd_region->ndr_mappings = ndr_desc->num_mappings;
+	nd_region->provider_data = ndr_desc->provider_data;
+	dev = &nd_region->dev;
+	dev_set_name(dev, "region%d", nd_region->id);
+	dev->parent = &nvdimm_bus->dev;
+	dev->type = dev_type;
+	dev->groups = ndr_desc->attr_groups;
+	nd_region->ndr_size = resource_size(ndr_desc->res);
+	nd_region->ndr_start = ndr_desc->res->start;
+	nd_device_register(dev);
+
+	return nd_region;
+}
+
+struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
+		struct nd_region_desc *ndr_desc)
+{
+	return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type,
+			__func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create);
+
+struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
+		struct nd_region_desc *ndr_desc)
+{
+	if (ndr_desc->num_mappings > 1)
+		return NULL;
+	return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type,
+			__func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_blk_region_create);
+
+struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
+		struct nd_region_desc *ndr_desc)
+{
+	return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type,
+			__func__);
+}
+EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index d3ebccf4ea8b..39e7e606092a 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -26,11 +26,14 @@ enum {
 	ND_CMD_MAX_ELEM = 4,
 	ND_CMD_MAX_ENVELOPE = 16,
 	ND_CMD_ARS_STATUS_MAX = SZ_4K,
+	ND_MAX_MAPPINGS = 32,
 };
 
 extern struct attribute_group nvdimm_bus_attribute_group;
 extern struct attribute_group nvdimm_attribute_group;
 extern struct attribute_group nd_device_attribute_group;
+extern struct attribute_group nd_region_attribute_group;
+extern struct attribute_group nd_mapping_attribute_group;
 
 struct nvdimm;
 struct nvdimm_bus_descriptor;
@@ -38,6 +41,12 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len);
 
+struct nd_mapping {
+	struct nvdimm *nvdimm;
+	u64 start;
+	u64 size;
+};
+
 struct nvdimm_bus_descriptor {
 	const struct attribute_group **attr_groups;
 	unsigned long dsm_mask;
@@ -52,6 +61,14 @@ struct nd_cmd_desc {
 	int out_sizes[ND_CMD_MAX_ELEM];
 };
 
+struct nd_region_desc {
+	struct resource *res;
+	struct nd_mapping *nd_mapping;
+	u16 num_mappings;
+	const struct attribute_group **attr_groups;
+	void *provider_data;
+};
+
 struct nvdimm_bus;
 struct device;
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
@@ -59,9 +76,11 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
 struct nvdimm_bus *to_nvdimm_bus(struct device *dev);
 struct nvdimm *to_nvdimm(struct device *dev);
+struct nd_region *to_nd_region(struct device *dev);
 struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
 const char *nvdimm_name(struct nvdimm *nvdimm);
 void *nvdimm_provider_data(struct nvdimm *nvdimm);
+void *nd_region_provider_data(struct nd_region *nd_region);
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 		const struct attribute_group **groups, unsigned long flags,
 		unsigned long *dsm_mask);
@@ -73,4 +92,10 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
 		const struct nd_cmd_desc *desc, int idx, const u32 *in_field,
 		const u32 *out_field);
 int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count);
+struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
+		struct nd_region_desc *ndr_desc);
+struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
+		struct nd_region_desc *ndr_desc);
+struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
+		struct nd_region_desc *ndr_desc);
 #endif /* __LIBNVDIMM_H__ */
-- 
cgit v1.2.3


From 3d88002e4a7bd40f355550284c6cd140e6fe29dc Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 31 May 2015 15:02:11 -0400
Subject: libnvdimm: support for legacy (non-aliasing) nvdimms

The libnvdimm region driver is an intermediary driver that translates
non-volatile "region"s into "namespace" sub-devices that are surfaced by
persistent memory block-device drivers (PMEM and BLK).

ACPI 6 introduces the concept that a given nvdimm may simultaneously
offer multiple access modes to its media through direct PMEM load/store
access, or windowed BLK mode.  Existing nvdimms mostly implement a PMEM
interface, some offer a BLK-like mode, but never both as ACPI 6 defines.
If an nvdimm is single interfaced, then there is no need for dimm
metadata labels.  For these devices we can take the region boundaries
directly to create a child namespace device (nd_namespace_io).

Acked-by: Christoph Hellwig <hch@lst.de>
Tested-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c             |   1 +
 drivers/nvdimm/Makefile         |   2 +
 drivers/nvdimm/bus.c            |  26 ++++++++++
 drivers/nvdimm/core.c           |  44 ++++++++++++++--
 drivers/nvdimm/dimm.c           |   2 +-
 drivers/nvdimm/namespace_devs.c | 111 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/nd-core.h        |   6 ++-
 drivers/nvdimm/nd.h             |  13 +++++
 drivers/nvdimm/region.c         |  93 +++++++++++++++++++++++++++++++++
 drivers/nvdimm/region_devs.c    |  66 +++++++++++++++++++++++-
 include/linux/libnvdimm.h       |   7 ++-
 include/linux/nd.h              |  10 ++++
 include/uapi/linux/ndctl.h      |  10 ++++
 13 files changed, 383 insertions(+), 8 deletions(-)
 create mode 100644 drivers/nvdimm/namespace_devs.c
 create mode 100644 drivers/nvdimm/region.c

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 068f69d70c9e..ce290748fe36 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -780,6 +780,7 @@ static struct attribute_group acpi_nfit_region_attribute_group = {
 static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
 	&nd_region_attribute_group,
 	&nd_mapping_attribute_group,
+	&nd_device_attribute_group,
 	&acpi_nfit_region_attribute_group,
 	NULL,
 };
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 88afd0d849c3..af5e2760ddbd 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -5,3 +5,5 @@ libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
 libnvdimm-y += dimm.o
 libnvdimm-y += region_devs.o
+libnvdimm-y += region.o
+libnvdimm-y += namespace_devs.o
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index a0308f1872bf..4b77665a6cc8 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -13,6 +13,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/module.h>
 #include <linux/fcntl.h>
 #include <linux/async.h>
 #include <linux/ndctl.h>
@@ -33,6 +34,12 @@ static int to_nd_device_type(struct device *dev)
 {
 	if (is_nvdimm(dev))
 		return ND_DEVICE_DIMM;
+	else if (is_nd_pmem(dev))
+		return ND_DEVICE_REGION_PMEM;
+	else if (is_nd_blk(dev))
+		return ND_DEVICE_REGION_BLK;
+	else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent))
+		return nd_region_to_nstype(to_nd_region(dev->parent));
 
 	return 0;
 }
@@ -50,27 +57,46 @@ static int nvdimm_bus_match(struct device *dev, struct device_driver *drv)
 	return test_bit(to_nd_device_type(dev), &nd_drv->type);
 }
 
+static struct module *to_bus_provider(struct device *dev)
+{
+	/* pin bus providers while regions are enabled */
+	if (is_nd_pmem(dev) || is_nd_blk(dev)) {
+		struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+		return nvdimm_bus->module;
+	}
+	return NULL;
+}
+
 static int nvdimm_bus_probe(struct device *dev)
 {
 	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
+	struct module *provider = to_bus_provider(dev);
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
 	int rc;
 
+	if (!try_module_get(provider))
+		return -ENXIO;
+
 	rc = nd_drv->probe(dev);
 	dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
 			dev_name(dev), rc);
+	if (rc != 0)
+		module_put(provider);
 	return rc;
 }
 
 static int nvdimm_bus_remove(struct device *dev)
 {
 	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
+	struct module *provider = to_bus_provider(dev);
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
 	int rc;
 
 	rc = nd_drv->remove(dev);
 	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
 			dev_name(dev), rc);
+	module_put(provider);
 	return rc;
 }
 
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 50ab880f0dc0..1b6b15d11f54 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -24,6 +24,36 @@ LIST_HEAD(nvdimm_bus_list);
 DEFINE_MUTEX(nvdimm_bus_list_mutex);
 static DEFINE_IDA(nd_ida);
 
+void nvdimm_bus_lock(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+	if (!nvdimm_bus)
+		return;
+	mutex_lock(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(nvdimm_bus_lock);
+
+void nvdimm_bus_unlock(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+	if (!nvdimm_bus)
+		return;
+	mutex_unlock(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(nvdimm_bus_unlock);
+
+bool is_nvdimm_bus_locked(struct device *dev)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+	if (!nvdimm_bus)
+		return false;
+	return mutex_is_locked(&nvdimm_bus->reconfig_mutex);
+}
+EXPORT_SYMBOL(is_nvdimm_bus_locked);
+
 static void nvdimm_bus_release(struct device *dev)
 {
 	struct nvdimm_bus *nvdimm_bus;
@@ -135,8 +165,8 @@ struct attribute_group nvdimm_bus_attribute_group = {
 };
 EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
 
-struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
-		struct nvdimm_bus_descriptor *nd_desc)
+struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
+		struct nvdimm_bus_descriptor *nd_desc, struct module *module)
 {
 	struct nvdimm_bus *nvdimm_bus;
 	int rc;
@@ -146,11 +176,13 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		return NULL;
 	INIT_LIST_HEAD(&nvdimm_bus->list);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
+	mutex_init(&nvdimm_bus->reconfig_mutex);
 	if (nvdimm_bus->id < 0) {
 		kfree(nvdimm_bus);
 		return NULL;
 	}
 	nvdimm_bus->nd_desc = nd_desc;
+	nvdimm_bus->module = module;
 	nvdimm_bus->dev.parent = parent;
 	nvdimm_bus->dev.release = nvdimm_bus_release;
 	nvdimm_bus->dev.groups = nd_desc->attr_groups;
@@ -174,7 +206,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 	put_device(&nvdimm_bus->dev);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(nvdimm_bus_register);
+EXPORT_SYMBOL_GPL(__nvdimm_bus_register);
 
 static int child_unregister(struct device *dev, void *data)
 {
@@ -218,7 +250,12 @@ static __init int libnvdimm_init(void)
 	rc = nvdimm_init();
 	if (rc)
 		goto err_dimm;
+	rc = nd_region_init();
+	if (rc)
+		goto err_region;
 	return 0;
+ err_region:
+	nvdimm_exit();
  err_dimm:
 	nvdimm_bus_exit();
 	return rc;
@@ -227,6 +264,7 @@ static __init int libnvdimm_init(void)
 static __exit void libnvdimm_exit(void)
 {
 	WARN_ON(!list_empty(&nvdimm_bus_list));
+	nd_region_exit();
 	nvdimm_exit();
 	nvdimm_bus_exit();
 }
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index 28001a6ccd4e..eb20fc2df32b 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -84,7 +84,7 @@ int __init nvdimm_init(void)
 	return nd_driver_register(&nvdimm_driver);
 }
 
-void __exit nvdimm_exit(void)
+void nvdimm_exit(void)
 {
 	driver_unregister(&nvdimm_driver.drv);
 }
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
new file mode 100644
index 000000000000..4f653d1e61ad
--- /dev/null
+++ b/drivers/nvdimm/namespace_devs.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/nd.h>
+#include "nd.h"
+
+static void namespace_io_release(struct device *dev)
+{
+	struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+	kfree(nsio);
+}
+
+static struct device_type namespace_io_device_type = {
+	.name = "nd_namespace_io",
+	.release = namespace_io_release,
+};
+
+static ssize_t nstype_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+
+	return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
+}
+static DEVICE_ATTR_RO(nstype);
+
+static struct attribute *nd_namespace_attributes[] = {
+	&dev_attr_nstype.attr,
+	NULL,
+};
+
+static struct attribute_group nd_namespace_attribute_group = {
+	.attrs = nd_namespace_attributes,
+};
+
+static const struct attribute_group *nd_namespace_attribute_groups[] = {
+	&nd_device_attribute_group,
+	&nd_namespace_attribute_group,
+	NULL,
+};
+
+static struct device **create_namespace_io(struct nd_region *nd_region)
+{
+	struct nd_namespace_io *nsio;
+	struct device *dev, **devs;
+	struct resource *res;
+
+	nsio = kzalloc(sizeof(*nsio), GFP_KERNEL);
+	if (!nsio)
+		return NULL;
+
+	devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL);
+	if (!devs) {
+		kfree(nsio);
+		return NULL;
+	}
+
+	dev = &nsio->dev;
+	dev->type = &namespace_io_device_type;
+	dev->parent = &nd_region->dev;
+	res = &nsio->res;
+	res->name = dev_name(&nd_region->dev);
+	res->flags = IORESOURCE_MEM;
+	res->start = nd_region->ndr_start;
+	res->end = res->start + nd_region->ndr_size - 1;
+
+	devs[0] = dev;
+	return devs;
+}
+
+int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
+{
+	struct device **devs = NULL;
+	int i;
+
+	*err = 0;
+	switch (nd_region_to_nstype(nd_region)) {
+	case ND_DEVICE_NAMESPACE_IO:
+		devs = create_namespace_io(nd_region);
+		break;
+	default:
+		break;
+	}
+
+	if (!devs)
+		return -ENODEV;
+
+	for (i = 0; devs[i]; i++) {
+		struct device *dev = devs[i];
+
+		dev_set_name(dev, "namespace%d.%d", nd_region->id, i);
+		dev->groups = nd_namespace_attribute_groups;
+		nd_device_register(dev);
+	}
+	kfree(devs);
+
+	return i;
+}
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 1d760bf24857..0e9b41fd2546 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -21,9 +21,11 @@ extern int nvdimm_major;
 
 struct nvdimm_bus {
 	struct nvdimm_bus_descriptor *nd_desc;
+	struct module *module;
 	struct list_head list;
 	struct device dev;
 	int id;
+	struct mutex reconfig_mutex;
 };
 
 struct nvdimm {
@@ -34,6 +36,9 @@ struct nvdimm {
 	int id;
 };
 
+bool is_nvdimm(struct device *dev);
+bool is_nd_blk(struct device *dev);
+bool is_nd_pmem(struct device *dev);
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
@@ -43,5 +48,4 @@ void nd_synchronize(void);
 int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus);
 int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus);
 int nd_match_dimm(struct device *dev, void *data);
-bool is_nvdimm(struct device *dev);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index ea0cca337aa6..bc5a08e36a25 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -23,6 +23,11 @@ struct nvdimm_drvdata {
 	void *data;
 };
 
+struct nd_region_namespaces {
+	int count;
+	int active;
+};
+
 struct nd_region {
 	struct device dev;
 	u16 ndr_mappings;
@@ -41,7 +46,15 @@ enum nd_async_mode {
 void nd_device_register(struct device *dev);
 void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
 int __init nvdimm_init(void);
+int __init nd_region_init(void);
 void nvdimm_exit(void);
+void nd_region_exit(void);
 int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
+struct nd_region *to_nd_region(struct device *dev);
+int nd_region_to_nstype(struct nd_region *nd_region);
+int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
+void nvdimm_bus_lock(struct device *dev);
+void nvdimm_bus_unlock(struct device *dev);
+bool is_nvdimm_bus_locked(struct device *dev);
 #endif /* __ND_H__ */
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
new file mode 100644
index 000000000000..ade3dba81afd
--- /dev/null
+++ b/drivers/nvdimm/region.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/nd.h>
+#include "nd.h"
+
+static int nd_region_probe(struct device *dev)
+{
+	int err;
+	struct nd_region_namespaces *num_ns;
+	struct nd_region *nd_region = to_nd_region(dev);
+	int rc = nd_region_register_namespaces(nd_region, &err);
+
+	num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL);
+	if (!num_ns)
+		return -ENOMEM;
+
+	if (rc < 0)
+		return rc;
+
+	num_ns->active = rc;
+	num_ns->count = rc + err;
+	dev_set_drvdata(dev, num_ns);
+
+	if (err == 0)
+		return 0;
+
+	if (rc == err)
+		return -ENODEV;
+
+	/*
+	 * Given multiple namespaces per region, we do not want to
+	 * disable all the successfully registered peer namespaces upon
+	 * a single registration failure.  If userspace is missing a
+	 * namespace that it expects it can disable/re-enable the region
+	 * to retry discovery after correcting the failure.
+	 * <regionX>/namespaces returns the current
+	 * "<async-registered>/<total>" namespace count.
+	 */
+	dev_err(dev, "failed to register %d namespace%s, continuing...\n",
+			err, err == 1 ? "" : "s");
+	return 0;
+}
+
+static int child_unregister(struct device *dev, void *data)
+{
+	nd_device_unregister(dev, ND_SYNC);
+	return 0;
+}
+
+static int nd_region_remove(struct device *dev)
+{
+	/* flush attribute readers and disable */
+	nvdimm_bus_lock(dev);
+	dev_set_drvdata(dev, NULL);
+	nvdimm_bus_unlock(dev);
+
+	device_for_each_child(dev, NULL, child_unregister);
+	return 0;
+}
+
+static struct nd_device_driver nd_region_driver = {
+	.probe = nd_region_probe,
+	.remove = nd_region_remove,
+	.drv = {
+		.name = "nd_region",
+	},
+	.type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM,
+};
+
+int __init nd_region_init(void)
+{
+	return nd_driver_register(&nd_region_driver);
+}
+
+void nd_region_exit(void)
+{
+	driver_unregister(&nd_region_driver.drv);
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM);
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4bda2e0df8f7..b5c5b9095b28 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -47,11 +47,16 @@ static struct device_type nd_volatile_device_type = {
 	.release = nd_region_release,
 };
 
-static bool is_nd_pmem(struct device *dev)
+bool is_nd_pmem(struct device *dev)
 {
 	return dev ? dev->type == &nd_pmem_device_type : false;
 }
 
+bool is_nd_blk(struct device *dev)
+{
+	return dev ? dev->type == &nd_blk_device_type : false;
+}
+
 struct nd_region *to_nd_region(struct device *dev)
 {
 	struct nd_region *nd_region = container_of(dev, struct nd_region, dev);
@@ -61,6 +66,37 @@ struct nd_region *to_nd_region(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(to_nd_region);
 
+/**
+ * nd_region_to_nstype() - region to an integer namespace type
+ * @nd_region: region-device to interrogate
+ *
+ * This is the 'nstype' attribute of a region as well, an input to the
+ * MODALIAS for namespace devices, and bit number for a nvdimm_bus to match
+ * namespace devices with namespace drivers.
+ */
+int nd_region_to_nstype(struct nd_region *nd_region)
+{
+	if (is_nd_pmem(&nd_region->dev)) {
+		u16 i, alias;
+
+		for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) {
+			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+			struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+			if (nvdimm->flags & NDD_ALIASING)
+				alias++;
+		}
+		if (alias)
+			return ND_DEVICE_NAMESPACE_PMEM;
+		else
+			return ND_DEVICE_NAMESPACE_IO;
+	} else if (is_nd_blk(&nd_region->dev)) {
+		return ND_DEVICE_NAMESPACE_BLK;
+	}
+
+	return 0;
+}
+
 static ssize_t size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -88,9 +124,37 @@ static ssize_t mappings_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(mappings);
 
+static ssize_t nstype_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region));
+}
+static DEVICE_ATTR_RO(nstype);
+
+static ssize_t init_namespaces_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region_namespaces *num_ns = dev_get_drvdata(dev);
+	ssize_t rc;
+
+	nvdimm_bus_lock(dev);
+	if (num_ns)
+		rc = sprintf(buf, "%d/%d\n", num_ns->active, num_ns->count);
+	else
+		rc = -ENXIO;
+	nvdimm_bus_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(init_namespaces);
+
 static struct attribute *nd_region_attributes[] = {
 	&dev_attr_size.attr,
+	&dev_attr_nstype.attr,
 	&dev_attr_mappings.attr,
+	&dev_attr_init_namespaces.attr,
 	NULL,
 };
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 39e7e606092a..37f966aff386 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -71,8 +71,11 @@ struct nd_region_desc {
 
 struct nvdimm_bus;
 struct device;
-struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
-		struct nvdimm_bus_descriptor *nfit_desc);
+struct module;
+struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
+		struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
+#define nvdimm_bus_register(parent, desc) \
+	__nvdimm_bus_register(parent, desc, THIS_MODULE)
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
 struct nvdimm_bus *to_nvdimm_bus(struct device *dev);
 struct nvdimm *to_nvdimm(struct device *dev);
diff --git a/include/linux/nd.h b/include/linux/nd.h
index e074f67e53a3..da70e9962197 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -26,6 +26,16 @@ static inline struct nd_device_driver *to_nd_device_driver(
 		struct device_driver *drv)
 {
 	return container_of(drv, struct nd_device_driver, drv);
+};
+
+struct nd_namespace_io {
+	struct device dev;
+	struct resource res;
+};
+
+static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev)
+{
+	return container_of(dev, struct nd_namespace_io, dev);
 }
 
 #define MODULE_ALIAS_ND_DEVICE(type) \
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 37640916d146..174b6371dcc1 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -177,8 +177,18 @@ static inline const char *nvdimm_cmd_name(unsigned cmd)
 
 
 #define ND_DEVICE_DIMM 1            /* nd_dimm: container for "config data" */
+#define ND_DEVICE_REGION_PMEM 2     /* nd_region: (parent of PMEM namespaces) */
+#define ND_DEVICE_REGION_BLK 3      /* nd_region: (parent of BLK namespaces) */
+#define ND_DEVICE_NAMESPACE_IO 4    /* legacy persistent memory */
+#define ND_DEVICE_NAMESPACE_PMEM 5  /* PMEM namespace (may alias with BLK) */
+#define ND_DEVICE_NAMESPACE_BLK 6   /* BLK namespace (may alias with PMEM) */
 
 enum nd_driver_flags {
 	ND_DRIVER_DIMM            = 1 << ND_DEVICE_DIMM,
+	ND_DRIVER_REGION_PMEM     = 1 << ND_DEVICE_REGION_PMEM,
+	ND_DRIVER_REGION_BLK      = 1 << ND_DEVICE_REGION_BLK,
+	ND_DRIVER_NAMESPACE_IO    = 1 << ND_DEVICE_NAMESPACE_IO,
+	ND_DRIVER_NAMESPACE_PMEM  = 1 << ND_DEVICE_NAMESPACE_PMEM,
+	ND_DRIVER_NAMESPACE_BLK   = 1 << ND_DEVICE_NAMESPACE_BLK,
 };
 #endif /* __NDCTL_H__ */
-- 
cgit v1.2.3


From eaf961536e1622ad21247ac8d44acd48ba65566e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 1 May 2015 13:11:27 -0400
Subject: libnvdimm, nfit: add interleave-set state-tracking infrastructure

On platforms that have firmware support for reading/writing per-dimm
label space, a portion of the dimm may be accessible via an interleave
set PMEM mapping in addition to the dimm's BLK (block-data-window
aperture(s)) interface.  A label, stored in a "configuration data
region" on the dimm, disambiguates which dimm addresses are accessed
through which exclusive interface.

Add infrastructure that allows the kernel to block modifications to a
label in the set while any member dimm is active.  Note that this is
meant only for enforcing "no modifications of active labels" via the
coarse ioctl command.  Adding/deleting namespaces from an active
interleave set is always possible via sysfs.

Another aspect of tracking interleave sets is tracking their integrity
when DIMMs in a set are physically re-ordered.  For this purpose we
generate an "interleave-set cookie" that can be recorded in a label and
validated against the current configuration.  It is the bus provider
implementation's responsibility to calculate the interleave set cookie
and attach it to a given region.

Cc: Neil Brown <neilb@suse.de>
Cc: <linux-acpi@vger.kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c          | 93 +++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvdimm/bus.c         | 59 +++++++++++++++++++++++++++-
 drivers/nvdimm/core.c        | 17 ++++++++
 drivers/nvdimm/dimm_devs.c   | 19 ++++++++-
 drivers/nvdimm/nd-core.h     | 10 ++++-
 drivers/nvdimm/nd.h          |  1 +
 drivers/nvdimm/region_devs.c | 69 ++++++++++++++++++++++++++++++++
 include/linux/libnvdimm.h    |  6 +++
 8 files changed, 269 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index ce290748fe36..35af6f7f0abd 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -16,6 +16,7 @@
 #include <linux/ndctl.h>
 #include <linux/list.h>
 #include <linux/acpi.h>
+#include <linux/sort.h>
 #include "nfit.h"
 
 static bool force_enable_dimms;
@@ -785,6 +786,91 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
 	NULL,
 };
 
+/* enough info to uniquely specify an interleave set */
+struct nfit_set_info {
+	struct nfit_set_info_map {
+		u64 region_offset;
+		u32 serial_number;
+		u32 pad;
+	} mapping[0];
+};
+
+static size_t sizeof_nfit_set_info(int num_mappings)
+{
+	return sizeof(struct nfit_set_info)
+		+ num_mappings * sizeof(struct nfit_set_info_map);
+}
+
+static int cmp_map(const void *m0, const void *m1)
+{
+	const struct nfit_set_info_map *map0 = m0;
+	const struct nfit_set_info_map *map1 = m1;
+
+	return memcmp(&map0->region_offset, &map1->region_offset,
+			sizeof(u64));
+}
+
+/* Retrieve the nth entry referencing this spa */
+static struct acpi_nfit_memory_map *memdev_from_spa(
+		struct acpi_nfit_desc *acpi_desc, u16 range_index, int n)
+{
+	struct nfit_memdev *nfit_memdev;
+
+	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list)
+		if (nfit_memdev->memdev->range_index == range_index)
+			if (n-- == 0)
+				return nfit_memdev->memdev;
+	return NULL;
+}
+
+static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
+		struct nd_region_desc *ndr_desc,
+		struct acpi_nfit_system_address *spa)
+{
+	int i, spa_type = nfit_spa_type(spa);
+	struct device *dev = acpi_desc->dev;
+	struct nd_interleave_set *nd_set;
+	u16 nr = ndr_desc->num_mappings;
+	struct nfit_set_info *info;
+
+	if (spa_type == NFIT_SPA_PM || spa_type == NFIT_SPA_VOLATILE)
+		/* pass */;
+	else
+		return 0;
+
+	nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
+	if (!nd_set)
+		return -ENOMEM;
+
+	info = devm_kzalloc(dev, sizeof_nfit_set_info(nr), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+	for (i = 0; i < nr; i++) {
+		struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
+		struct nfit_set_info_map *map = &info->mapping[i];
+		struct nvdimm *nvdimm = nd_mapping->nvdimm;
+		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+		struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc,
+				spa->range_index, i);
+
+		if (!memdev || !nfit_mem->dcr) {
+			dev_err(dev, "%s: failed to find DCR\n", __func__);
+			return -ENODEV;
+		}
+
+		map->region_offset = memdev->region_offset;
+		map->serial_number = nfit_mem->dcr->serial_number;
+	}
+
+	sort(&info->mapping[0], nr, sizeof(struct nfit_set_info_map),
+			cmp_map, NULL);
+	nd_set->cookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
+	ndr_desc->nd_set = nd_set;
+	devm_kfree(dev, info);
+
+	return 0;
+}
+
 static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 		struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
 		struct acpi_nfit_memory_map *memdev,
@@ -838,7 +924,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 	struct nd_region_desc ndr_desc;
 	struct nvdimm_bus *nvdimm_bus;
 	struct resource res;
-	int count = 0;
+	int count = 0, rc;
 
 	if (spa->range_index == 0) {
 		dev_dbg(acpi_desc->dev, "%s: detected invalid spa index\n",
@@ -857,7 +943,6 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
 		struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
 		struct nd_mapping *nd_mapping;
-		int rc;
 
 		if (memdev->range_index != spa->range_index)
 			continue;
@@ -875,6 +960,10 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 
 	ndr_desc.nd_mapping = nd_mappings;
 	ndr_desc.num_mappings = count;
+	rc = acpi_nfit_init_interleave_set(acpi_desc, &ndr_desc, spa);
+	if (rc)
+		return rc;
+
 	nvdimm_bus = acpi_desc->nvdimm_bus;
 	if (nfit_spa_type(spa) == NFIT_SPA_PM) {
 		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 4b77665a6cc8..ffb43cada625 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -68,6 +68,21 @@ static struct module *to_bus_provider(struct device *dev)
 	return NULL;
 }
 
+static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus)
+{
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	nvdimm_bus->probe_active++;
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
+static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus)
+{
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	if (--nvdimm_bus->probe_active == 0)
+		wake_up(&nvdimm_bus->probe_wait);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+
 static int nvdimm_bus_probe(struct device *dev)
 {
 	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
@@ -78,7 +93,12 @@ static int nvdimm_bus_probe(struct device *dev)
 	if (!try_module_get(provider))
 		return -ENXIO;
 
+	nvdimm_bus_probe_start(nvdimm_bus);
 	rc = nd_drv->probe(dev);
+	if (rc == 0)
+		nd_region_probe_success(nvdimm_bus, dev);
+	nvdimm_bus_probe_end(nvdimm_bus);
+
 	dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
 			dev_name(dev), rc);
 	if (rc != 0)
@@ -94,6 +114,8 @@ static int nvdimm_bus_remove(struct device *dev)
 	int rc;
 
 	rc = nd_drv->remove(dev);
+	nd_region_disable(nvdimm_bus, dev);
+
 	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
 			dev_name(dev), rc);
 	module_put(provider);
@@ -359,6 +381,34 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
 }
 EXPORT_SYMBOL_GPL(nd_cmd_out_size);
 
+static void wait_nvdimm_bus_probe_idle(struct nvdimm_bus *nvdimm_bus)
+{
+	do {
+		if (nvdimm_bus->probe_active == 0)
+			break;
+		nvdimm_bus_unlock(&nvdimm_bus->dev);
+		wait_event(nvdimm_bus->probe_wait,
+				nvdimm_bus->probe_active == 0);
+		nvdimm_bus_lock(&nvdimm_bus->dev);
+	} while (true);
+}
+
+/* set_config requires an idle interleave set */
+static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd)
+{
+	struct nvdimm_bus *nvdimm_bus;
+
+	if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
+		return 0;
+
+	nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev);
+	wait_nvdimm_bus_probe_idle(nvdimm_bus);
+
+	if (atomic_read(&nvdimm->busy))
+		return -EBUSY;
+	return 0;
+}
+
 static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		int read_only, unsigned int ioctl_cmd, unsigned long arg)
 {
@@ -469,11 +519,18 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		goto out;
 	}
 
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	rc = nd_cmd_clear_to_send(nvdimm, cmd);
+	if (rc)
+		goto out_unlock;
+
 	rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len);
 	if (rc < 0)
-		goto out;
+		goto out_unlock;
 	if (copy_to_user(p, buf, buf_len))
 		rc = -EFAULT;
+ out_unlock:
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
  out:
 	vfree(buf);
 	return rc;
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 1b6b15d11f54..7806eaaf4707 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -54,6 +54,22 @@ bool is_nvdimm_bus_locked(struct device *dev)
 }
 EXPORT_SYMBOL(is_nvdimm_bus_locked);
 
+u64 nd_fletcher64(void *addr, size_t len, bool le)
+{
+	u32 *buf = addr;
+	u32 lo32 = 0;
+	u64 hi32 = 0;
+	int i;
+
+	for (i = 0; i < len / sizeof(u32); i++) {
+		lo32 += le ? le32_to_cpu((__le32) buf[i]) : buf[i];
+		hi32 += lo32;
+	}
+
+	return hi32 << 32 | lo32;
+}
+EXPORT_SYMBOL_GPL(nd_fletcher64);
+
 static void nvdimm_bus_release(struct device *dev)
 {
 	struct nvdimm_bus *nvdimm_bus;
@@ -175,6 +191,7 @@ struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
 	if (!nvdimm_bus)
 		return NULL;
 	INIT_LIST_HEAD(&nvdimm_bus->list);
+	init_waitqueue_head(&nvdimm_bus->probe_wait);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
 	mutex_init(&nvdimm_bus->reconfig_mutex);
 	if (nvdimm_bus->id < 0) {
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index b3ae86f2e1da..bdf8241b6525 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -185,7 +185,24 @@ static ssize_t commands_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(commands);
 
+static ssize_t state_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+
+	/*
+	 * The state may be in the process of changing, userspace should
+	 * quiesce probing if it wants a static answer
+	 */
+	nvdimm_bus_lock(dev);
+	nvdimm_bus_unlock(dev);
+	return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy)
+			? "active" : "idle");
+}
+static DEVICE_ATTR_RO(state);
+
 static struct attribute *nvdimm_attributes[] = {
+	&dev_attr_state.attr,
 	&dev_attr_commands.attr,
 	NULL,
 };
@@ -213,7 +230,7 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 	nvdimm->provider_data = provider_data;
 	nvdimm->flags = flags;
 	nvdimm->dsm_mask = dsm_mask;
-
+	atomic_set(&nvdimm->busy, 0);
 	dev = &nvdimm->dev;
 	dev_set_name(dev, "nmem%d", nvdimm->id);
 	dev->parent = &nvdimm_bus->dev;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 0e9b41fd2546..6a4b2c066ee7 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -14,6 +14,9 @@
 #define __ND_CORE_H__
 #include <linux/libnvdimm.h>
 #include <linux/device.h>
+#include <linux/libnvdimm.h>
+#include <linux/sizes.h>
+#include <linux/mutex.h>
 
 extern struct list_head nvdimm_bus_list;
 extern struct mutex nvdimm_bus_list_mutex;
@@ -21,10 +24,11 @@ extern int nvdimm_major;
 
 struct nvdimm_bus {
 	struct nvdimm_bus_descriptor *nd_desc;
+	wait_queue_head_t probe_wait;
 	struct module *module;
 	struct list_head list;
 	struct device dev;
-	int id;
+	int id, probe_active;
 	struct mutex reconfig_mutex;
 };
 
@@ -33,6 +37,7 @@ struct nvdimm {
 	void *provider_data;
 	unsigned long *dsm_mask;
 	struct device dev;
+	atomic_t busy;
 	int id;
 };
 
@@ -42,10 +47,13 @@ bool is_nd_pmem(struct device *dev);
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
 void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
 void nd_synchronize(void);
 int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus);
 int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus);
+int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus);
 int nd_match_dimm(struct device *dev, void *data);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index bc5a08e36a25..0285e4588b03 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -35,6 +35,7 @@ struct nd_region {
 	u64 ndr_start;
 	int id;
 	void *provider_data;
+	struct nd_interleave_set *nd_set;
 	struct nd_mapping mapping[0];
 };
 
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b5c5b9095b28..1571424578f0 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -10,7 +10,10 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  */
+#include <linux/scatterlist.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/sort.h>
 #include <linux/io.h>
 #include "nd-core.h"
 #include "nd.h"
@@ -133,6 +136,21 @@ static ssize_t nstype_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(nstype);
 
+static ssize_t set_cookie_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	struct nd_interleave_set *nd_set = nd_region->nd_set;
+
+	if (is_nd_pmem(dev) && nd_set)
+		/* pass, should be precluded by region_visible */;
+	else
+		return -ENXIO;
+
+	return sprintf(buf, "%#llx\n", nd_set->cookie);
+}
+static DEVICE_ATTR_RO(set_cookie);
+
 static ssize_t init_namespaces_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -154,15 +172,65 @@ static struct attribute *nd_region_attributes[] = {
 	&dev_attr_size.attr,
 	&dev_attr_nstype.attr,
 	&dev_attr_mappings.attr,
+	&dev_attr_set_cookie.attr,
 	&dev_attr_init_namespaces.attr,
 	NULL,
 };
 
+static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, typeof(*dev), kobj);
+	struct nd_region *nd_region = to_nd_region(dev);
+	struct nd_interleave_set *nd_set = nd_region->nd_set;
+
+	if (a != &dev_attr_set_cookie.attr)
+		return a->mode;
+
+	if (is_nd_pmem(dev) && nd_set)
+			return a->mode;
+
+	return 0;
+}
+
 struct attribute_group nd_region_attribute_group = {
 	.attrs = nd_region_attributes,
+	.is_visible = region_visible,
 };
 EXPORT_SYMBOL_GPL(nd_region_attribute_group);
 
+/*
+ * Upon successful probe/remove, take/release a reference on the
+ * associated interleave set (if present)
+ */
+static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
+		struct device *dev, bool probe)
+{
+	if (is_nd_pmem(dev) || is_nd_blk(dev)) {
+		struct nd_region *nd_region = to_nd_region(dev);
+		int i;
+
+		for (i = 0; i < nd_region->ndr_mappings; i++) {
+			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+			struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+			if (probe)
+				atomic_inc(&nvdimm->busy);
+			else
+				atomic_dec(&nvdimm->busy);
+		}
+	}
+}
+
+void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
+{
+	nd_region_notify_driver_action(nvdimm_bus, dev, true);
+}
+
+void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev)
+{
+	nd_region_notify_driver_action(nvdimm_bus, dev, false);
+}
+
 static ssize_t mappingN(struct device *dev, char *buf, int n)
 {
 	struct nd_region *nd_region = to_nd_region(dev);
@@ -322,6 +390,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	}
 	nd_region->ndr_mappings = ndr_desc->num_mappings;
 	nd_region->provider_data = ndr_desc->provider_data;
+	nd_region->nd_set = ndr_desc->nd_set;
 	dev = &nd_region->dev;
 	dev_set_name(dev, "region%d", nd_region->id);
 	dev->parent = &nvdimm_bus->dev;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 37f966aff386..1b627b109360 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -61,11 +61,16 @@ struct nd_cmd_desc {
 	int out_sizes[ND_CMD_MAX_ELEM];
 };
 
+struct nd_interleave_set {
+	u64 cookie;
+};
+
 struct nd_region_desc {
 	struct resource *res;
 	struct nd_mapping *nd_mapping;
 	u16 num_mappings;
 	const struct attribute_group **attr_groups;
+	struct nd_interleave_set *nd_set;
 	void *provider_data;
 };
 
@@ -101,4 +106,5 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc);
 struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc);
+u64 nd_fletcher64(void *addr, size_t len, bool le);
 #endif /* __LIBNVDIMM_H__ */
-- 
cgit v1.2.3


From bf9bccc14c05dae8caba29df6187c731710f5380 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 17 Jun 2015 17:14:46 -0400
Subject: libnvdimm: pmem label sets and namespace instantiation.

A complete label set is a PMEM-label per-dimm per-interleave-set where
all the UUIDs match and the interleave set cookie matches the hosting
interleave set.

Present sysfs attributes for manipulation of a PMEM-namespace's
'alt_name', 'uuid', and 'size' attributes.  A later patch will make
these settings persistent by writing back the label.

Note that PMEM allocations grow forwards from the start of an interleave
set (lowest dimm-physical-address (DPA)).  BLK-namespaces that alias
with a PMEM interleave set will grow allocations backward from the
highest DPA.

Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Neil Brown <neilb@suse.de>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/bus.c            |    8 +-
 drivers/nvdimm/core.c           |   64 +++
 drivers/nvdimm/dimm.c           |   21 +-
 drivers/nvdimm/dimm_devs.c      |  137 ++++++
 drivers/nvdimm/label.c          |   55 ++-
 drivers/nvdimm/label.h          |    2 +
 drivers/nvdimm/namespace_devs.c | 1002 ++++++++++++++++++++++++++++++++++++++-
 drivers/nvdimm/nd-core.h        |   12 +
 drivers/nvdimm/nd.h             |   17 +
 drivers/nvdimm/pmem.c           |   20 +-
 drivers/nvdimm/region.c         |    3 +
 drivers/nvdimm/region_devs.c    |  158 +++++-
 include/linux/libnvdimm.h       |   10 +
 include/linux/nd.h              |   24 +
 include/uapi/linux/ndctl.h      |    4 +
 15 files changed, 1506 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index ffb43cada625..fddc3f2a8f80 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -97,6 +97,8 @@ static int nvdimm_bus_probe(struct device *dev)
 	rc = nd_drv->probe(dev);
 	if (rc == 0)
 		nd_region_probe_success(nvdimm_bus, dev);
+	else
+		nd_region_disable(nvdimm_bus, dev);
 	nvdimm_bus_probe_end(nvdimm_bus);
 
 	dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
@@ -381,8 +383,10 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
 }
 EXPORT_SYMBOL_GPL(nd_cmd_out_size);
 
-static void wait_nvdimm_bus_probe_idle(struct nvdimm_bus *nvdimm_bus)
+void wait_nvdimm_bus_probe_idle(struct device *dev)
 {
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
 	do {
 		if (nvdimm_bus->probe_active == 0)
 			break;
@@ -402,7 +406,7 @@ static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd)
 		return 0;
 
 	nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev);
-	wait_nvdimm_bus_probe_idle(nvdimm_bus);
+	wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev);
 
 	if (atomic_read(&nvdimm->busy))
 		return -EBUSY;
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 7806eaaf4707..cf99cce8ef33 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -14,6 +14,7 @@
 #include <linux/export.h>
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/ctype.h>
 #include <linux/ndctl.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
@@ -109,6 +110,69 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev)
 	return NULL;
 }
 
+static bool is_uuid_sep(char sep)
+{
+	if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0')
+		return true;
+	return false;
+}
+
+static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf,
+		size_t len)
+{
+	const char *str = buf;
+	u8 uuid[16];
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		if (!isxdigit(str[0]) || !isxdigit(str[1])) {
+			dev_dbg(dev, "%s: pos: %d buf[%zd]: %c buf[%zd]: %c\n",
+					__func__, i, str - buf, str[0],
+					str + 1 - buf, str[1]);
+			return -EINVAL;
+		}
+
+		uuid[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
+		str += 2;
+		if (is_uuid_sep(*str))
+			str++;
+	}
+
+	memcpy(uuid_out, uuid, sizeof(uuid));
+	return 0;
+}
+
+/**
+ * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes
+ * @dev: container device for the uuid property
+ * @uuid_out: uuid buffer to replace
+ * @buf: raw sysfs buffer to parse
+ *
+ * Enforce that uuids can only be changed while the device is disabled
+ * (driver detached)
+ * LOCKING: expects device_lock() is held on entry
+ */
+int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
+		size_t len)
+{
+	u8 uuid[16];
+	int rc;
+
+	if (dev->driver)
+		return -EBUSY;
+
+	rc = nd_uuid_parse(dev, uuid, buf, len);
+	if (rc)
+		return rc;
+
+	kfree(*uuid_out);
+	*uuid_out = kmemdup(uuid, sizeof(uuid), GFP_KERNEL);
+	if (!(*uuid_out))
+		return -ENOMEM;
+
+	return 0;
+}
+
 static ssize_t commands_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index 2df97c3c3b34..71d12bb67339 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -21,18 +21,6 @@
 #include "label.h"
 #include "nd.h"
 
-static void free_data(struct nvdimm_drvdata *ndd)
-{
-	if (!ndd)
-		return;
-
-	if (ndd->data && is_vmalloc_addr(ndd->data))
-		vfree(ndd->data);
-	else
-		kfree(ndd->data);
-	kfree(ndd);
-}
-
 static int nvdimm_probe(struct device *dev)
 {
 	struct nvdimm_drvdata *ndd;
@@ -49,6 +37,8 @@ static int nvdimm_probe(struct device *dev)
 	ndd->dpa.start = 0;
 	ndd->dpa.end = -1;
 	ndd->dev = dev;
+	get_device(dev);
+	kref_init(&ndd->kref);
 
 	rc = nvdimm_init_nsarea(ndd);
 	if (rc)
@@ -74,21 +64,18 @@ static int nvdimm_probe(struct device *dev)
 	return 0;
 
  err:
-	free_data(ndd);
+	put_ndd(ndd);
 	return rc;
 }
 
 static int nvdimm_remove(struct device *dev)
 {
 	struct nvdimm_drvdata *ndd = dev_get_drvdata(dev);
-	struct resource *res, *_r;
 
 	nvdimm_bus_lock(dev);
 	dev_set_drvdata(dev, NULL);
-	for_each_dpa_resource_safe(ndd, res, _r)
-		nvdimm_free_dpa(ndd, res);
 	nvdimm_bus_unlock(dev);
-	free_data(ndd);
+	put_ndd(ndd);
 
 	return 0;
 }
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index d2ef02e4be6c..b55acef179ba 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -159,6 +159,48 @@ struct nvdimm *to_nvdimm(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(to_nvdimm);
 
+struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping)
+{
+	struct nvdimm *nvdimm = nd_mapping->nvdimm;
+
+	WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev));
+
+	return dev_get_drvdata(&nvdimm->dev);
+}
+EXPORT_SYMBOL(to_ndd);
+
+void nvdimm_drvdata_release(struct kref *kref)
+{
+	struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref);
+	struct device *dev = ndd->dev;
+	struct resource *res, *_r;
+
+	dev_dbg(dev, "%s\n", __func__);
+
+	nvdimm_bus_lock(dev);
+	for_each_dpa_resource_safe(ndd, res, _r)
+		nvdimm_free_dpa(ndd, res);
+	nvdimm_bus_unlock(dev);
+
+	if (ndd->data && is_vmalloc_addr(ndd->data))
+		vfree(ndd->data);
+	else
+		kfree(ndd->data);
+	kfree(ndd);
+	put_device(dev);
+}
+
+void get_ndd(struct nvdimm_drvdata *ndd)
+{
+	kref_get(&ndd->kref);
+}
+
+void put_ndd(struct nvdimm_drvdata *ndd)
+{
+	if (ndd)
+		kref_put(&ndd->kref, nvdimm_drvdata_release);
+}
+
 const char *nvdimm_name(struct nvdimm *nvdimm)
 {
 	return dev_name(&nvdimm->dev);
@@ -247,6 +289,83 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 }
 EXPORT_SYMBOL_GPL(nvdimm_create);
 
+/**
+ * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa
+ * @nd_mapping: container of dpa-resource-root + labels
+ * @nd_region: constrain available space check to this reference region
+ * @overlap: calculate available space assuming this level of overlap
+ *
+ * Validate that a PMEM label, if present, aligns with the start of an
+ * interleave set and truncate the available size at the lowest BLK
+ * overlap point.
+ *
+ * The expectation is that this routine is called multiple times as it
+ * probes for the largest BLK encroachment for any single member DIMM of
+ * the interleave set.  Once that value is determined the PMEM-limit for
+ * the set can be established.
+ */
+resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
+		struct nd_mapping *nd_mapping, resource_size_t *overlap)
+{
+	resource_size_t map_start, map_end, busy = 0, available, blk_start;
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	struct resource *res;
+	const char *reason;
+
+	if (!ndd)
+		return 0;
+
+	map_start = nd_mapping->start;
+	map_end = map_start + nd_mapping->size - 1;
+	blk_start = max(map_start, map_end + 1 - *overlap);
+	for_each_dpa_resource(ndd, res)
+		if (res->start >= map_start && res->start < map_end) {
+			if (strncmp(res->name, "blk", 3) == 0)
+				blk_start = min(blk_start, res->start);
+			else if (res->start != map_start) {
+				reason = "misaligned to iset";
+				goto err;
+			} else {
+				if (busy) {
+					reason = "duplicate overlapping PMEM reservations?";
+					goto err;
+				}
+				busy += resource_size(res);
+				continue;
+			}
+		} else if (res->end >= map_start && res->end <= map_end) {
+			if (strncmp(res->name, "blk", 3) == 0) {
+				/*
+				 * If a BLK allocation overlaps the start of
+				 * PMEM the entire interleave set may now only
+				 * be used for BLK.
+				 */
+				blk_start = map_start;
+			} else {
+				reason = "misaligned to iset";
+				goto err;
+			}
+		} else if (map_start > res->start && map_start < res->end) {
+			/* total eclipse of the mapping */
+			busy += nd_mapping->size;
+			blk_start = map_start;
+		}
+
+	*overlap = map_end + 1 - blk_start;
+	available = blk_start - map_start;
+	if (busy < available)
+		return available - busy;
+	return 0;
+
+ err:
+	/*
+	 * Something is wrong, PMEM must align with the start of the
+	 * interleave set, and there can only be one allocation per set.
+	 */
+	nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason);
+	return 0;
+}
+
 void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res)
 {
 	WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev));
@@ -271,6 +390,24 @@ struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
 	return res;
 }
 
+/**
+ * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id
+ * @nvdimm: container of dpa-resource-root + labels
+ * @label_id: dpa resource name of the form {pmem|blk}-<human readable uuid>
+ */
+resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
+		struct nd_label_id *label_id)
+{
+	resource_size_t allocated = 0;
+	struct resource *res;
+
+	for_each_dpa_resource(ndd, res)
+		if (strcmp(res->name, label_id->id) == 0)
+			allocated += resource_size(res);
+
+	return allocated;
+}
+
 static int count_dimms(struct device *dev, void *c)
 {
 	int *count = c;
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index db5d7492dc8d..1a3bcd27a57a 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -230,7 +230,7 @@ static bool preamble_current(struct nvdimm_drvdata *ndd,
 	return true;
 }
 
-static char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags)
+char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags)
 {
 	if (!label_id || !uuid)
 		return NULL;
@@ -288,3 +288,56 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
 
 	return 0;
 }
+
+int nd_label_active_count(struct nvdimm_drvdata *ndd)
+{
+	struct nd_namespace_index *nsindex;
+	unsigned long *free;
+	u32 nslot, slot;
+	int count = 0;
+
+	if (!preamble_current(ndd, &nsindex, &free, &nslot))
+		return 0;
+
+	for_each_clear_bit_le(slot, free, nslot) {
+		struct nd_namespace_label *nd_label;
+
+		nd_label = nd_label_base(ndd) + slot;
+
+		if (!slot_valid(nd_label, slot)) {
+			u32 label_slot = __le32_to_cpu(nd_label->slot);
+			u64 size = __le64_to_cpu(nd_label->rawsize);
+			u64 dpa = __le64_to_cpu(nd_label->dpa);
+
+			dev_dbg(ndd->dev,
+				"%s: slot%d invalid slot: %d dpa: %llx size: %llx\n",
+					__func__, slot, label_slot, dpa, size);
+			continue;
+		}
+		count++;
+	}
+	return count;
+}
+
+struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n)
+{
+	struct nd_namespace_index *nsindex;
+	unsigned long *free;
+	u32 nslot, slot;
+
+	if (!preamble_current(ndd, &nsindex, &free, &nslot))
+		return NULL;
+
+	for_each_clear_bit_le(slot, free, nslot) {
+		struct nd_namespace_label *nd_label;
+
+		nd_label = nd_label_base(ndd) + slot;
+		if (!slot_valid(nd_label, slot))
+			continue;
+
+		if (n-- == 0)
+			return nd_label_base(ndd) + slot;
+	}
+
+	return NULL;
+}
diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h
index d6aa0d5c6b4e..8ee1376526c7 100644
--- a/drivers/nvdimm/label.h
+++ b/drivers/nvdimm/label.h
@@ -125,4 +125,6 @@ int nd_label_validate(struct nvdimm_drvdata *ndd);
 void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst,
 		struct nd_namespace_index *src);
 size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd);
+int nd_label_active_count(struct nvdimm_drvdata *ndd);
+struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n);
 #endif /* __LABEL_H__ */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 4f653d1e61ad..5d81032fcfc5 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -14,6 +14,7 @@
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/nd.h>
+#include "nd-core.h"
 #include "nd.h"
 
 static void namespace_io_release(struct device *dev)
@@ -23,11 +24,50 @@ static void namespace_io_release(struct device *dev)
 	kfree(nsio);
 }
 
+static void namespace_pmem_release(struct device *dev)
+{
+	struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+	kfree(nspm->alt_name);
+	kfree(nspm->uuid);
+	kfree(nspm);
+}
+
+static void namespace_blk_release(struct device *dev)
+{
+	/* TODO: blk namespace support */
+}
+
 static struct device_type namespace_io_device_type = {
 	.name = "nd_namespace_io",
 	.release = namespace_io_release,
 };
 
+static struct device_type namespace_pmem_device_type = {
+	.name = "nd_namespace_pmem",
+	.release = namespace_pmem_release,
+};
+
+static struct device_type namespace_blk_device_type = {
+	.name = "nd_namespace_blk",
+	.release = namespace_blk_release,
+};
+
+static bool is_namespace_pmem(struct device *dev)
+{
+	return dev ? dev->type == &namespace_pmem_device_type : false;
+}
+
+static bool is_namespace_blk(struct device *dev)
+{
+	return dev ? dev->type == &namespace_blk_device_type : false;
+}
+
+static bool is_namespace_io(struct device *dev)
+{
+	return dev ? dev->type == &namespace_io_device_type : false;
+}
+
 static ssize_t nstype_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -37,13 +77,676 @@ static ssize_t nstype_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(nstype);
 
+static ssize_t __alt_name_store(struct device *dev, const char *buf,
+		const size_t len)
+{
+	char *input, *pos, *alt_name, **ns_altname;
+	ssize_t rc;
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		ns_altname = &nspm->alt_name;
+	} else if (is_namespace_blk(dev)) {
+		/* TODO: blk namespace support */
+		return -ENXIO;
+	} else
+		return -ENXIO;
+
+	if (dev->driver)
+		return -EBUSY;
+
+	input = kmemdup(buf, len + 1, GFP_KERNEL);
+	if (!input)
+		return -ENOMEM;
+
+	input[len] = '\0';
+	pos = strim(input);
+	if (strlen(pos) + 1 > NSLABEL_NAME_LEN) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL);
+	if (!alt_name) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	kfree(*ns_altname);
+	*ns_altname = alt_name;
+	sprintf(*ns_altname, "%s", pos);
+	rc = len;
+
+out:
+	kfree(input);
+	return rc;
+}
+
+static ssize_t alt_name_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	ssize_t rc;
+
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	wait_nvdimm_bus_probe_idle(dev);
+	rc = __alt_name_store(dev, buf, len);
+	dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? "fail " : "", rc);
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	return rc;
+}
+
+static ssize_t alt_name_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	char *ns_altname;
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		ns_altname = nspm->alt_name;
+	} else if (is_namespace_blk(dev)) {
+		/* TODO: blk namespace support */
+		return -ENXIO;
+	} else
+		return -ENXIO;
+
+	return sprintf(buf, "%s\n", ns_altname ? ns_altname : "");
+}
+static DEVICE_ATTR_RW(alt_name);
+
+static int scan_free(struct nd_region *nd_region,
+		struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
+		resource_size_t n)
+{
+	bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	int rc = 0;
+
+	while (n) {
+		struct resource *res, *last;
+		resource_size_t new_start;
+
+		last = NULL;
+		for_each_dpa_resource(ndd, res)
+			if (strcmp(res->name, label_id->id) == 0)
+				last = res;
+		res = last;
+		if (!res)
+			return 0;
+
+		if (n >= resource_size(res)) {
+			n -= resource_size(res);
+			nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc);
+			nvdimm_free_dpa(ndd, res);
+			/* retry with last resource deleted */
+			continue;
+		}
+
+		/*
+		 * Keep BLK allocations relegated to high DPA as much as
+		 * possible
+		 */
+		if (is_blk)
+			new_start = res->start + n;
+		else
+			new_start = res->start;
+
+		rc = adjust_resource(res, new_start, resource_size(res) - n);
+		nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc);
+		break;
+	}
+
+	return rc;
+}
+
+/**
+ * shrink_dpa_allocation - for each dimm in region free n bytes for label_id
+ * @nd_region: the set of dimms to reclaim @n bytes from
+ * @label_id: unique identifier for the namespace consuming this dpa range
+ * @n: number of bytes per-dimm to release
+ *
+ * Assumes resources are ordered.  Starting from the end try to
+ * adjust_resource() the allocation to @n, but if @n is larger than the
+ * allocation delete it and find the 'new' last allocation in the label
+ * set.
+ */
+static int shrink_dpa_allocation(struct nd_region *nd_region,
+		struct nd_label_id *label_id, resource_size_t n)
+{
+	int i;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		int rc;
+
+		rc = scan_free(nd_region, nd_mapping, label_id, n);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static resource_size_t init_dpa_allocation(struct nd_label_id *label_id,
+		struct nd_region *nd_region, struct nd_mapping *nd_mapping,
+		resource_size_t n)
+{
+	bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	resource_size_t first_dpa;
+	struct resource *res;
+	int rc = 0;
+
+	/* allocate blk from highest dpa first */
+	if (is_blk)
+		first_dpa = nd_mapping->start + nd_mapping->size - n;
+	else
+		first_dpa = nd_mapping->start;
+
+	/* first resource allocation for this label-id or dimm */
+	res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n);
+	if (!res)
+		rc = -EBUSY;
+
+	nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc);
+	return rc ? n : 0;
+}
+
+static bool space_valid(bool is_pmem, struct nd_label_id *label_id,
+		struct resource *res)
+{
+	/*
+	 * For BLK-space any space is valid, for PMEM-space, it must be
+	 * contiguous with an existing allocation.
+	 */
+	if (!is_pmem)
+		return true;
+	if (!res || strcmp(res->name, label_id->id) == 0)
+		return true;
+	return false;
+}
+
+enum alloc_loc {
+	ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER,
+};
+
+static resource_size_t scan_allocate(struct nd_region *nd_region,
+		struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
+		resource_size_t n)
+{
+	resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1;
+	bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	const resource_size_t to_allocate = n;
+	struct resource *res;
+	int first;
+
+ retry:
+	first = 0;
+	for_each_dpa_resource(ndd, res) {
+		resource_size_t allocate, available = 0, free_start, free_end;
+		struct resource *next = res->sibling, *new_res = NULL;
+		enum alloc_loc loc = ALLOC_ERR;
+		const char *action;
+		int rc = 0;
+
+		/* ignore resources outside this nd_mapping */
+		if (res->start > mapping_end)
+			continue;
+		if (res->end < nd_mapping->start)
+			continue;
+
+		/* space at the beginning of the mapping */
+		if (!first++ && res->start > nd_mapping->start) {
+			free_start = nd_mapping->start;
+			available = res->start - free_start;
+			if (space_valid(is_pmem, label_id, NULL))
+				loc = ALLOC_BEFORE;
+		}
+
+		/* space between allocations */
+		if (!loc && next) {
+			free_start = res->start + resource_size(res);
+			free_end = min(mapping_end, next->start - 1);
+			if (space_valid(is_pmem, label_id, res)
+					&& free_start < free_end) {
+				available = free_end + 1 - free_start;
+				loc = ALLOC_MID;
+			}
+		}
+
+		/* space at the end of the mapping */
+		if (!loc && !next) {
+			free_start = res->start + resource_size(res);
+			free_end = mapping_end;
+			if (space_valid(is_pmem, label_id, res)
+					&& free_start < free_end) {
+				available = free_end + 1 - free_start;
+				loc = ALLOC_AFTER;
+			}
+		}
+
+		if (!loc || !available)
+			continue;
+		allocate = min(available, n);
+		switch (loc) {
+		case ALLOC_BEFORE:
+			if (strcmp(res->name, label_id->id) == 0) {
+				/* adjust current resource up */
+				if (is_pmem)
+					return n;
+				rc = adjust_resource(res, res->start - allocate,
+						resource_size(res) + allocate);
+				action = "cur grow up";
+			} else
+				action = "allocate";
+			break;
+		case ALLOC_MID:
+			if (strcmp(next->name, label_id->id) == 0) {
+				/* adjust next resource up */
+				if (is_pmem)
+					return n;
+				rc = adjust_resource(next, next->start
+						- allocate, resource_size(next)
+						+ allocate);
+				new_res = next;
+				action = "next grow up";
+			} else if (strcmp(res->name, label_id->id) == 0) {
+				action = "grow down";
+			} else
+				action = "allocate";
+			break;
+		case ALLOC_AFTER:
+			if (strcmp(res->name, label_id->id) == 0)
+				action = "grow down";
+			else
+				action = "allocate";
+			break;
+		default:
+			return n;
+		}
+
+		if (strcmp(action, "allocate") == 0) {
+			/* BLK allocate bottom up */
+			if (!is_pmem)
+				free_start += available - allocate;
+			else if (free_start != nd_mapping->start)
+				return n;
+
+			new_res = nvdimm_allocate_dpa(ndd, label_id,
+					free_start, allocate);
+			if (!new_res)
+				rc = -EBUSY;
+		} else if (strcmp(action, "grow down") == 0) {
+			/* adjust current resource down */
+			rc = adjust_resource(res, res->start, resource_size(res)
+					+ allocate);
+		}
+
+		if (!new_res)
+			new_res = res;
+
+		nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n",
+				action, loc, rc);
+
+		if (rc)
+			return n;
+
+		n -= allocate;
+		if (n) {
+			/*
+			 * Retry scan with newly inserted resources.
+			 * For example, if we did an ALLOC_BEFORE
+			 * insertion there may also have been space
+			 * available for an ALLOC_AFTER insertion, so we
+			 * need to check this same resource again
+			 */
+			goto retry;
+		} else
+			return 0;
+	}
+
+	if (is_pmem && n == to_allocate)
+		return init_dpa_allocation(label_id, nd_region, nd_mapping, n);
+	return n;
+}
+
+/**
+ * grow_dpa_allocation - for each dimm allocate n bytes for @label_id
+ * @nd_region: the set of dimms to allocate @n more bytes from
+ * @label_id: unique identifier for the namespace consuming this dpa range
+ * @n: number of bytes per-dimm to add to the existing allocation
+ *
+ * Assumes resources are ordered.  For BLK regions, first consume
+ * BLK-only available DPA free space, then consume PMEM-aliased DPA
+ * space starting at the highest DPA.  For PMEM regions start
+ * allocations from the start of an interleave set and end at the first
+ * BLK allocation or the end of the interleave set, whichever comes
+ * first.
+ */
+static int grow_dpa_allocation(struct nd_region *nd_region,
+		struct nd_label_id *label_id, resource_size_t n)
+{
+	int i;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		int rc;
+
+		rc = scan_allocate(nd_region, nd_mapping, label_id, n);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static void nd_namespace_pmem_set_size(struct nd_region *nd_region,
+		struct nd_namespace_pmem *nspm, resource_size_t size)
+{
+	struct resource *res = &nspm->nsio.res;
+
+	res->start = nd_region->ndr_start;
+	res->end = nd_region->ndr_start + size - 1;
+}
+
+static ssize_t __size_store(struct device *dev, unsigned long long val)
+{
+	resource_size_t allocated = 0, available = 0;
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	struct nd_mapping *nd_mapping;
+	struct nvdimm_drvdata *ndd;
+	struct nd_label_id label_id;
+	u32 flags = 0, remainder;
+	u8 *uuid = NULL;
+	int rc, i;
+
+	if (dev->driver)
+		return -EBUSY;
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		uuid = nspm->uuid;
+	} else if (is_namespace_blk(dev)) {
+		/* TODO: blk namespace support */
+		return -ENXIO;
+	}
+
+	/*
+	 * We need a uuid for the allocation-label and dimm(s) on which
+	 * to store the label.
+	 */
+	if (!uuid || nd_region->ndr_mappings == 0)
+		return -ENXIO;
+
+	div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
+	if (remainder) {
+		dev_dbg(dev, "%llu is not %dK aligned\n", val,
+				(SZ_4K * nd_region->ndr_mappings) / SZ_1K);
+		return -EINVAL;
+	}
+
+	nd_label_gen_id(&label_id, uuid, flags);
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		nd_mapping = &nd_region->mapping[i];
+		ndd = to_ndd(nd_mapping);
+
+		/*
+		 * All dimms in an interleave set, or the base dimm for a blk
+		 * region, need to be enabled for the size to be changed.
+		 */
+		if (!ndd)
+			return -ENXIO;
+
+		allocated += nvdimm_allocated_dpa(ndd, &label_id);
+	}
+	available = nd_region_available_dpa(nd_region);
+
+	if (val > available + allocated)
+		return -ENOSPC;
+
+	if (val == allocated)
+		return 0;
+
+	val = div_u64(val, nd_region->ndr_mappings);
+	allocated = div_u64(allocated, nd_region->ndr_mappings);
+	if (val < allocated)
+		rc = shrink_dpa_allocation(nd_region, &label_id,
+				allocated - val);
+	else
+		rc = grow_dpa_allocation(nd_region, &label_id, val - allocated);
+
+	if (rc)
+		return rc;
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		nd_namespace_pmem_set_size(nd_region, nspm,
+				val * nd_region->ndr_mappings);
+	}
+
+	return rc;
+}
+
+static ssize_t size_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	unsigned long long val;
+	u8 **uuid = NULL;
+	int rc;
+
+	rc = kstrtoull(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	wait_nvdimm_bus_probe_idle(dev);
+	rc = __size_store(dev, val);
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		uuid = &nspm->uuid;
+	} else if (is_namespace_blk(dev)) {
+		/* TODO: blk namespace support */
+		rc = -ENXIO;
+	}
+
+	if (rc == 0 && val == 0 && uuid) {
+		/* setting size zero == 'delete namespace' */
+		kfree(*uuid);
+		*uuid = NULL;
+	}
+
+	dev_dbg(dev, "%s: %llx %s (%d)\n", __func__, val, rc < 0
+			? "fail" : "success", rc);
+
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	return rc ? rc : len;
+}
+
+static ssize_t size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		return sprintf(buf, "%llu\n", (unsigned long long)
+				resource_size(&nspm->nsio.res));
+	} else if (is_namespace_blk(dev)) {
+		/* TODO: blk namespace support */
+		return -ENXIO;
+	} else if (is_namespace_io(dev)) {
+		struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+		return sprintf(buf, "%llu\n", (unsigned long long)
+				resource_size(&nsio->res));
+	} else
+		return -ENXIO;
+}
+static DEVICE_ATTR(size, S_IRUGO, size_show, size_store);
+
+static ssize_t uuid_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u8 *uuid;
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		uuid = nspm->uuid;
+	} else if (is_namespace_blk(dev)) {
+		/* TODO: blk namespace support */
+		return -ENXIO;
+	} else
+		return -ENXIO;
+
+	if (uuid)
+		return sprintf(buf, "%pUb\n", uuid);
+	return sprintf(buf, "\n");
+}
+
+/**
+ * namespace_update_uuid - check for a unique uuid and whether we're "renaming"
+ * @nd_region: parent region so we can updates all dimms in the set
+ * @dev: namespace type for generating label_id
+ * @new_uuid: incoming uuid
+ * @old_uuid: reference to the uuid storage location in the namespace object
+ */
+static int namespace_update_uuid(struct nd_region *nd_region,
+		struct device *dev, u8 *new_uuid, u8 **old_uuid)
+{
+	u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0;
+	struct nd_label_id old_label_id;
+	struct nd_label_id new_label_id;
+	int i, rc;
+
+	rc = nd_is_uuid_unique(dev, new_uuid) ? 0 : -EINVAL;
+	if (rc) {
+		kfree(new_uuid);
+		return rc;
+	}
+
+	if (*old_uuid == NULL)
+		goto out;
+
+	nd_label_gen_id(&old_label_id, *old_uuid, flags);
+	nd_label_gen_id(&new_label_id, new_uuid, flags);
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+		struct resource *res;
+
+		for_each_dpa_resource(ndd, res)
+			if (strcmp(res->name, old_label_id.id) == 0)
+				sprintf((void *) res->name, "%s",
+						new_label_id.id);
+	}
+	kfree(*old_uuid);
+ out:
+	*old_uuid = new_uuid;
+	return 0;
+}
+
+static ssize_t uuid_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	u8 *uuid = NULL;
+	u8 **ns_uuid;
+	ssize_t rc;
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		ns_uuid = &nspm->uuid;
+	} else if (is_namespace_blk(dev)) {
+		/* TODO: blk namespace support */
+		return -ENXIO;
+	} else
+		return -ENXIO;
+
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	wait_nvdimm_bus_probe_idle(dev);
+	rc = nd_uuid_store(dev, &uuid, buf, len);
+	if (rc >= 0)
+		rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid);
+	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static ssize_t resource_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct resource *res;
+
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		res = &nspm->nsio.res;
+	} else if (is_namespace_io(dev)) {
+		struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+		res = &nsio->res;
+	} else
+		return -ENXIO;
+
+	/* no address to convey if the namespace has no allocation */
+	if (resource_size(res) == 0)
+		return -ENXIO;
+	return sprintf(buf, "%#llx\n", (unsigned long long) res->start);
+}
+static DEVICE_ATTR_RO(resource);
+
 static struct attribute *nd_namespace_attributes[] = {
 	&dev_attr_nstype.attr,
+	&dev_attr_size.attr,
+	&dev_attr_uuid.attr,
+	&dev_attr_resource.attr,
+	&dev_attr_alt_name.attr,
 	NULL,
 };
 
+static umode_t namespace_visible(struct kobject *kobj,
+		struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+
+	if (a == &dev_attr_resource.attr) {
+		if (is_namespace_blk(dev))
+			return 0;
+		return a->mode;
+	}
+
+	if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
+		if (a == &dev_attr_size.attr)
+			return S_IWUSR | S_IRUGO;
+		return a->mode;
+	}
+
+	if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr)
+		return a->mode;
+
+	return 0;
+}
+
 static struct attribute_group nd_namespace_attribute_group = {
 	.attrs = nd_namespace_attributes,
+	.is_visible = namespace_visible,
 };
 
 static const struct attribute_group *nd_namespace_attribute_groups[] = {
@@ -81,23 +784,318 @@ static struct device **create_namespace_io(struct nd_region *nd_region)
 	return devs;
 }
 
+static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid,
+		u64 cookie, u16 pos)
+{
+	struct nd_namespace_label *found = NULL;
+	int i;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nd_namespace_label *nd_label;
+		bool found_uuid = false;
+		int l;
+
+		for_each_label(l, nd_label, nd_mapping->labels) {
+			u64 isetcookie = __le64_to_cpu(nd_label->isetcookie);
+			u16 position = __le16_to_cpu(nd_label->position);
+			u16 nlabel = __le16_to_cpu(nd_label->nlabel);
+
+			if (isetcookie != cookie)
+				continue;
+
+			if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0)
+				continue;
+
+			if (found_uuid) {
+				dev_dbg(to_ndd(nd_mapping)->dev,
+						"%s duplicate entry for uuid\n",
+						__func__);
+				return false;
+			}
+			found_uuid = true;
+			if (nlabel != nd_region->ndr_mappings)
+				continue;
+			if (position != pos)
+				continue;
+			found = nd_label;
+			break;
+		}
+		if (found)
+			break;
+	}
+	return found != NULL;
+}
+
+static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
+{
+	struct nd_namespace_label *select = NULL;
+	int i;
+
+	if (!pmem_id)
+		return -ENODEV;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nd_namespace_label *nd_label;
+		u64 hw_start, hw_end, pmem_start, pmem_end;
+		int l;
+
+		for_each_label(l, nd_label, nd_mapping->labels)
+			if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0)
+				break;
+
+		if (!nd_label) {
+			WARN_ON(1);
+			return -EINVAL;
+		}
+
+		select = nd_label;
+		/*
+		 * Check that this label is compliant with the dpa
+		 * range published in NFIT
+		 */
+		hw_start = nd_mapping->start;
+		hw_end = hw_start + nd_mapping->size;
+		pmem_start = __le64_to_cpu(select->dpa);
+		pmem_end = pmem_start + __le64_to_cpu(select->rawsize);
+		if (pmem_start == hw_start && pmem_end <= hw_end)
+			/* pass */;
+		else
+			return -EINVAL;
+
+		nd_mapping->labels[0] = select;
+		nd_mapping->labels[1] = NULL;
+	}
+	return 0;
+}
+
+/**
+ * find_pmem_label_set - validate interleave set labelling, retrieve label0
+ * @nd_region: region with mappings to validate
+ */
+static int find_pmem_label_set(struct nd_region *nd_region,
+		struct nd_namespace_pmem *nspm)
+{
+	u64 cookie = nd_region_interleave_set_cookie(nd_region);
+	struct nd_namespace_label *nd_label;
+	u8 select_id[NSLABEL_UUID_LEN];
+	resource_size_t size = 0;
+	u8 *pmem_id = NULL;
+	int rc = -ENODEV, l;
+	u16 i;
+
+	if (cookie == 0)
+		return -ENXIO;
+
+	/*
+	 * Find a complete set of labels by uuid.  By definition we can start
+	 * with any mapping as the reference label
+	 */
+	for_each_label(l, nd_label, nd_region->mapping[0].labels) {
+		u64 isetcookie = __le64_to_cpu(nd_label->isetcookie);
+
+		if (isetcookie != cookie)
+			continue;
+
+		for (i = 0; nd_region->ndr_mappings; i++)
+			if (!has_uuid_at_pos(nd_region, nd_label->uuid,
+						cookie, i))
+				break;
+		if (i < nd_region->ndr_mappings) {
+			/*
+			 * Give up if we don't find an instance of a
+			 * uuid at each position (from 0 to
+			 * nd_region->ndr_mappings - 1), or if we find a
+			 * dimm with two instances of the same uuid.
+			 */
+			rc = -EINVAL;
+			goto err;
+		} else if (pmem_id) {
+			/*
+			 * If there is more than one valid uuid set, we
+			 * need userspace to clean this up.
+			 */
+			rc = -EBUSY;
+			goto err;
+		}
+		memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN);
+		pmem_id = select_id;
+	}
+
+	/*
+	 * Fix up each mapping's 'labels' to have the validated pmem label for
+	 * that position at labels[0], and NULL at labels[1].  In the process,
+	 * check that the namespace aligns with interleave-set.  We know
+	 * that it does not overlap with any blk namespaces by virtue of
+	 * the dimm being enabled (i.e. nd_label_reserve_dpa()
+	 * succeeded).
+	 */
+	rc = select_pmem_id(nd_region, pmem_id);
+	if (rc)
+		goto err;
+
+	/* Calculate total size and populate namespace properties from label0 */
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nd_namespace_label *label0 = nd_mapping->labels[0];
+
+		size += __le64_to_cpu(label0->rawsize);
+		if (__le16_to_cpu(label0->position) != 0)
+			continue;
+		WARN_ON(nspm->alt_name || nspm->uuid);
+		nspm->alt_name = kmemdup((void __force *) label0->name,
+				NSLABEL_NAME_LEN, GFP_KERNEL);
+		nspm->uuid = kmemdup((void __force *) label0->uuid,
+				NSLABEL_UUID_LEN, GFP_KERNEL);
+	}
+
+	if (!nspm->alt_name || !nspm->uuid) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	nd_namespace_pmem_set_size(nd_region, nspm, size);
+
+	return 0;
+ err:
+	switch (rc) {
+	case -EINVAL:
+		dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__);
+		break;
+	case -ENODEV:
+		dev_dbg(&nd_region->dev, "%s: label not found\n", __func__);
+		break;
+	default:
+		dev_dbg(&nd_region->dev, "%s: unexpected err: %d\n",
+				__func__, rc);
+		break;
+	}
+	return rc;
+}
+
+static struct device **create_namespace_pmem(struct nd_region *nd_region)
+{
+	struct nd_namespace_pmem *nspm;
+	struct device *dev, **devs;
+	struct resource *res;
+	int rc;
+
+	nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
+	if (!nspm)
+		return NULL;
+
+	dev = &nspm->nsio.dev;
+	dev->type = &namespace_pmem_device_type;
+	dev->parent = &nd_region->dev;
+	res = &nspm->nsio.res;
+	res->name = dev_name(&nd_region->dev);
+	res->flags = IORESOURCE_MEM;
+	rc = find_pmem_label_set(nd_region, nspm);
+	if (rc == -ENODEV) {
+		int i;
+
+		/* Pass, try to permit namespace creation... */
+		for (i = 0; i < nd_region->ndr_mappings; i++) {
+			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+			kfree(nd_mapping->labels);
+			nd_mapping->labels = NULL;
+		}
+
+		/* Publish a zero-sized namespace for userspace to configure. */
+		nd_namespace_pmem_set_size(nd_region, nspm, 0);
+
+		rc = 0;
+	} else if (rc)
+		goto err;
+
+	devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL);
+	if (!devs)
+		goto err;
+
+	devs[0] = dev;
+	return devs;
+
+ err:
+	namespace_pmem_release(&nspm->nsio.dev);
+	return NULL;
+}
+
+static int init_active_labels(struct nd_region *nd_region)
+{
+	int i;
+
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+		struct nvdimm *nvdimm = nd_mapping->nvdimm;
+		int count, j;
+
+		/*
+		 * If the dimm is disabled then prevent the region from
+		 * being activated if it aliases DPA.
+		 */
+		if (!ndd) {
+			if ((nvdimm->flags & NDD_ALIASING) == 0)
+				return 0;
+			dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n",
+					dev_name(&nd_mapping->nvdimm->dev));
+			return -ENXIO;
+		}
+		nd_mapping->ndd = ndd;
+		atomic_inc(&nvdimm->busy);
+		get_ndd(ndd);
+
+		count = nd_label_active_count(ndd);
+		dev_dbg(ndd->dev, "%s: %d\n", __func__, count);
+		if (!count)
+			continue;
+		nd_mapping->labels = kcalloc(count + 1, sizeof(void *),
+				GFP_KERNEL);
+		if (!nd_mapping->labels)
+			return -ENOMEM;
+		for (j = 0; j < count; j++) {
+			struct nd_namespace_label *label;
+
+			label = nd_label_active(ndd, j);
+			nd_mapping->labels[j] = label;
+		}
+	}
+
+	return 0;
+}
+
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
 {
 	struct device **devs = NULL;
-	int i;
+	int i, rc = 0, type;
 
 	*err = 0;
-	switch (nd_region_to_nstype(nd_region)) {
+	nvdimm_bus_lock(&nd_region->dev);
+	rc = init_active_labels(nd_region);
+	if (rc) {
+		nvdimm_bus_unlock(&nd_region->dev);
+		return rc;
+	}
+
+	type = nd_region_to_nstype(nd_region);
+	switch (type) {
 	case ND_DEVICE_NAMESPACE_IO:
 		devs = create_namespace_io(nd_region);
 		break;
+	case ND_DEVICE_NAMESPACE_PMEM:
+		devs = create_namespace_pmem(nd_region);
+		break;
 	default:
 		break;
 	}
+	nvdimm_bus_unlock(&nd_region->dev);
 
 	if (!devs)
 		return -ENODEV;
 
+	nd_region->ns_seed = devs[0];
 	for (i = 0; devs[i]; i++) {
 		struct device *dev = devs[i];
 
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 6a4b2c066ee7..c6c889292bab 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -56,4 +56,16 @@ int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus);
 int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus);
 int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus);
 int nd_match_dimm(struct device *dev, void *data);
+struct nd_label_id;
+char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags);
+bool nd_is_uuid_unique(struct device *dev, u8 *uuid);
+struct nd_region;
+struct nvdimm_drvdata;
+struct nd_mapping;
+resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
+		struct nd_mapping *nd_mapping, resource_size_t *overlap);
+resource_size_t nd_region_available_dpa(struct nd_region *nd_region);
+resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
+		struct nd_label_id *label_id);
+void get_ndd(struct nvdimm_drvdata *ndd);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 401fa0d5b6ea..03e610cd9f43 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -16,6 +16,7 @@
 #include <linux/device.h>
 #include <linux/mutex.h>
 #include <linux/ndctl.h>
+#include <linux/types.h>
 #include "label.h"
 
 struct nvdimm_drvdata {
@@ -25,6 +26,7 @@ struct nvdimm_drvdata {
 	void *data;
 	int ns_current, ns_next;
 	struct resource dpa;
+	struct kref kref;
 };
 
 struct nd_region_namespaces {
@@ -59,12 +61,19 @@ static inline struct nd_namespace_index *to_next_namespace_index(
 		(unsigned long long) (res ? resource_size(res) : 0), \
 		(unsigned long long) (res ? res->start : 0), ##arg)
 
+#define for_each_label(l, label, labels) \
+	for (l = 0; (label = labels ? labels[l] : NULL); l++)
+
+#define for_each_dpa_resource(ndd, res) \
+	for (res = (ndd)->dpa.child; res; res = res->sibling)
+
 #define for_each_dpa_resource_safe(ndd, res, next) \
 	for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \
 			res; res = next, next = next ? next->sibling : NULL)
 
 struct nd_region {
 	struct device dev;
+	struct device *ns_seed;
 	u16 ndr_mappings;
 	u64 ndr_size;
 	u64 ndr_start;
@@ -88,20 +97,28 @@ enum nd_async_mode {
 	ND_ASYNC,
 };
 
+void wait_nvdimm_bus_probe_idle(struct device *dev);
 void nd_device_register(struct device *dev);
 void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
+int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
+		size_t len);
 int __init nvdimm_init(void);
 int __init nd_region_init(void);
 void nvdimm_exit(void);
 void nd_region_exit(void);
+struct nvdimm;
+struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping);
 int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
 struct nd_region *to_nd_region(struct device *dev);
 int nd_region_to_nstype(struct nd_region *nd_region);
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
+u64 nd_region_interleave_set_cookie(struct nd_region *nd_region);
 void nvdimm_bus_lock(struct device *dev);
 void nvdimm_bus_unlock(struct device *dev);
 bool is_nvdimm_bus_locked(struct device *dev);
+void nvdimm_drvdata_release(struct kref *kref);
+void put_ndd(struct nvdimm_drvdata *ndd);
 int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd);
 void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res);
 struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index d46975ed9e40..90902a142e35 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -203,6 +203,23 @@ static int nd_pmem_probe(struct device *dev)
 	struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
 	struct pmem_device *pmem;
 
+	if (resource_size(&nsio->res) < ND_MIN_NAMESPACE_SIZE) {
+		resource_size_t size = resource_size(&nsio->res);
+
+		dev_dbg(dev, "%s: size: %pa, too small must be at least %#x\n",
+				__func__, &size, ND_MIN_NAMESPACE_SIZE);
+		return -ENODEV;
+	}
+
+	if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_PMEM) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		if (!nspm->uuid) {
+			dev_dbg(dev, "%s: uuid not set\n", __func__);
+			return -ENODEV;
+		}
+	}
+
 	pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
 	if (IS_ERR(pmem))
 		return PTR_ERR(pmem);
@@ -222,13 +239,14 @@ static int nd_pmem_remove(struct device *dev)
 
 MODULE_ALIAS("pmem");
 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
 static struct nd_device_driver nd_pmem_driver = {
 	.probe = nd_pmem_probe,
 	.remove = nd_pmem_remove,
 	.drv = {
 		.name = "nd_pmem",
 	},
-	.type = ND_DRIVER_NAMESPACE_IO,
+	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
 };
 
 static int __init pmem_init(void)
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index ade3dba81afd..9aba44e483e0 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -61,8 +61,11 @@ static int child_unregister(struct device *dev, void *data)
 
 static int nd_region_remove(struct device *dev)
 {
+	struct nd_region *nd_region = to_nd_region(dev);
+
 	/* flush attribute readers and disable */
 	nvdimm_bus_lock(dev);
+	nd_region->ns_seed = NULL;
 	dev_set_drvdata(dev, NULL);
 	nvdimm_bus_unlock(dev);
 
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 1571424578f0..b45806f7176d 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/io.h>
+#include <linux/nd.h>
 #include "nd-core.h"
 #include "nd.h"
 
@@ -99,6 +100,58 @@ int nd_region_to_nstype(struct nd_region *nd_region)
 
 	return 0;
 }
+EXPORT_SYMBOL(nd_region_to_nstype);
+
+static int is_uuid_busy(struct device *dev, void *data)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	u8 *uuid = data;
+
+	switch (nd_region_to_nstype(nd_region)) {
+	case ND_DEVICE_NAMESPACE_PMEM: {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+
+		if (!nspm->uuid)
+			break;
+		if (memcmp(uuid, nspm->uuid, NSLABEL_UUID_LEN) == 0)
+			return -EBUSY;
+		break;
+	}
+	case ND_DEVICE_NAMESPACE_BLK: {
+		/* TODO: blk namespace support */
+		break;
+	}
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int is_namespace_uuid_busy(struct device *dev, void *data)
+{
+	if (is_nd_pmem(dev) || is_nd_blk(dev))
+		return device_for_each_child(dev, data, is_uuid_busy);
+	return 0;
+}
+
+/**
+ * nd_is_uuid_unique - verify that no other namespace has @uuid
+ * @dev: any device on a nvdimm_bus
+ * @uuid: uuid to check
+ */
+bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+	if (!nvdimm_bus)
+		return false;
+	WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev));
+	if (device_for_each_child(&nvdimm_bus->dev, uuid,
+				is_namespace_uuid_busy) != 0)
+		return false;
+	return true;
+}
 
 static ssize_t size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
@@ -151,6 +204,60 @@ static ssize_t set_cookie_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(set_cookie);
 
+resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
+{
+	resource_size_t blk_max_overlap = 0, available, overlap;
+	int i;
+
+	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+
+ retry:
+	available = 0;
+	overlap = blk_max_overlap;
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+
+		/* if a dimm is disabled the available capacity is zero */
+		if (!ndd)
+			return 0;
+
+		if (is_nd_pmem(&nd_region->dev)) {
+			available += nd_pmem_available_dpa(nd_region,
+					nd_mapping, &overlap);
+			if (overlap > blk_max_overlap) {
+				blk_max_overlap = overlap;
+				goto retry;
+			}
+		} else if (is_nd_blk(&nd_region->dev)) {
+			/* TODO: BLK Namespace support */
+		}
+	}
+
+	return available;
+}
+
+static ssize_t available_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	unsigned long long available = 0;
+
+	/*
+	 * Flush in-flight updates and grab a snapshot of the available
+	 * size.  Of course, this value is potentially invalidated the
+	 * memory nvdimm_bus_lock() is dropped, but that's userspace's
+	 * problem to not race itself.
+	 */
+	nvdimm_bus_lock(dev);
+	wait_nvdimm_bus_probe_idle(dev);
+	available = nd_region_available_dpa(nd_region);
+	nvdimm_bus_unlock(dev);
+
+	return sprintf(buf, "%llu\n", available);
+}
+static DEVICE_ATTR_RO(available_size);
+
 static ssize_t init_namespaces_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -168,11 +275,29 @@ static ssize_t init_namespaces_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(init_namespaces);
 
+static ssize_t namespace_seed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	ssize_t rc;
+
+	nvdimm_bus_lock(dev);
+	if (nd_region->ns_seed)
+		rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed));
+	else
+		rc = sprintf(buf, "\n");
+	nvdimm_bus_unlock(dev);
+	return rc;
+}
+static DEVICE_ATTR_RO(namespace_seed);
+
 static struct attribute *nd_region_attributes[] = {
 	&dev_attr_size.attr,
 	&dev_attr_nstype.attr,
 	&dev_attr_mappings.attr,
 	&dev_attr_set_cookie.attr,
+	&dev_attr_available_size.attr,
+	&dev_attr_namespace_seed.attr,
 	&dev_attr_init_namespaces.attr,
 	NULL,
 };
@@ -182,12 +307,18 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	struct device *dev = container_of(kobj, typeof(*dev), kobj);
 	struct nd_region *nd_region = to_nd_region(dev);
 	struct nd_interleave_set *nd_set = nd_region->nd_set;
+	int type = nd_region_to_nstype(nd_region);
 
-	if (a != &dev_attr_set_cookie.attr)
+	if (a != &dev_attr_set_cookie.attr
+			&& a != &dev_attr_available_size.attr)
 		return a->mode;
 
-	if (is_nd_pmem(dev) && nd_set)
-			return a->mode;
+	if ((type == ND_DEVICE_NAMESPACE_PMEM
+				|| type == ND_DEVICE_NAMESPACE_BLK)
+			&& a == &dev_attr_available_size.attr)
+		return a->mode;
+	else if (is_nd_pmem(dev) && nd_set)
+		return a->mode;
 
 	return 0;
 }
@@ -198,6 +329,15 @@ struct attribute_group nd_region_attribute_group = {
 };
 EXPORT_SYMBOL_GPL(nd_region_attribute_group);
 
+u64 nd_region_interleave_set_cookie(struct nd_region *nd_region)
+{
+	struct nd_interleave_set *nd_set = nd_region->nd_set;
+
+	if (nd_set)
+		return nd_set->cookie;
+	return 0;
+}
+
 /*
  * Upon successful probe/remove, take/release a reference on the
  * associated interleave set (if present)
@@ -205,18 +345,20 @@ EXPORT_SYMBOL_GPL(nd_region_attribute_group);
 static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 		struct device *dev, bool probe)
 {
-	if (is_nd_pmem(dev) || is_nd_blk(dev)) {
+	if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) {
 		struct nd_region *nd_region = to_nd_region(dev);
 		int i;
 
 		for (i = 0; i < nd_region->ndr_mappings; i++) {
 			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+			struct nvdimm_drvdata *ndd = nd_mapping->ndd;
 			struct nvdimm *nvdimm = nd_mapping->nvdimm;
 
-			if (probe)
-				atomic_inc(&nvdimm->busy);
-			else
-				atomic_dec(&nvdimm->busy);
+			kfree(nd_mapping->labels);
+			nd_mapping->labels = NULL;
+			put_ndd(ndd);
+			nd_mapping->ndd = NULL;
+			atomic_dec(&nvdimm->busy);
 		}
 	}
 }
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 1b627b109360..c130972e08c4 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -41,10 +41,20 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len);
 
+struct nd_namespace_label;
+struct nvdimm_drvdata;
 struct nd_mapping {
 	struct nvdimm *nvdimm;
+	struct nd_namespace_label **labels;
 	u64 start;
 	u64 size;
+	/*
+	 * @ndd is for private use at region enable / disable time for
+	 * get_ndd() + put_ndd(), all other nd_mapping to ndd
+	 * conversions use to_ndd() which respects enabled state of the
+	 * nvdimm.
+	 */
+	struct nvdimm_drvdata *ndd;
 };
 
 struct nvdimm_bus_descriptor {
diff --git a/include/linux/nd.h b/include/linux/nd.h
index da70e9962197..255c38a83083 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -28,16 +28,40 @@ static inline struct nd_device_driver *to_nd_device_driver(
 	return container_of(drv, struct nd_device_driver, drv);
 };
 
+/**
+ * struct nd_namespace_io - infrastructure for loading an nd_pmem instance
+ * @dev: namespace device created by the nd region driver
+ * @res: struct resource conversion of a NFIT SPA table
+ */
 struct nd_namespace_io {
 	struct device dev;
 	struct resource res;
 };
 
+/**
+ * struct nd_namespace_pmem - namespace device for dimm-backed interleaved memory
+ * @nsio: device and system physical address range to drive
+ * @alt_name: namespace name supplied in the dimm label
+ * @uuid: namespace name supplied in the dimm label
+ */
+struct nd_namespace_pmem {
+	struct nd_namespace_io nsio;
+	char *alt_name;
+	u8 *uuid;
+};
+
 static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev)
 {
 	return container_of(dev, struct nd_namespace_io, dev);
 }
 
+static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev)
+{
+	struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+
+	return container_of(nsio, struct nd_namespace_pmem, nsio);
+}
+
 #define MODULE_ALIAS_ND_DEVICE(type) \
 	MODULE_ALIAS("nd:t" __stringify(type) "*")
 #define ND_DEVICE_MODALIAS_FMT "nd:t%d"
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 1357a87b8714..2b94ea2287bb 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -190,4 +190,8 @@ enum nd_driver_flags {
 	ND_DRIVER_NAMESPACE_PMEM  = 1 << ND_DEVICE_NAMESPACE_PMEM,
 	ND_DRIVER_NAMESPACE_BLK   = 1 << ND_DEVICE_NAMESPACE_BLK,
 };
+
+enum {
+	ND_MIN_NAMESPACE_SIZE = 0x00400000,
+};
 #endif /* __NDCTL_H__ */
-- 
cgit v1.2.3


From 1b40e09a1232de537b193fa1b6b3ef16d3a1e397 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 1 May 2015 13:34:01 -0400
Subject: libnvdimm: blk labels and namespace instantiation

A blk label set describes a namespace comprised of one or more
discontiguous dpa ranges on a single dimm.  They may alias with one or
more pmem interleave sets that include the given dimm.

This is the runtime/volatile configuration infrastructure for sysfs
manipulation of 'alt_name', 'uuid', 'size', and 'sector_size'.  A later
patch will make these settings persistent by writing back the label(s).

Unlike pmem namespaces, multiple blk namespaces can be created per
region.  Once a blk namespace has been created a new seed device
(unconfigured child of a parent blk region) is instantiated.  As long as
a region has 'available_size' != 0 new child namespaces may be created.

Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Neil Brown <neilb@suse.de>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/core.c           |  40 ++++
 drivers/nvdimm/dimm_devs.c      |  36 +++
 drivers/nvdimm/namespace_devs.c | 498 +++++++++++++++++++++++++++++++++++++---
 drivers/nvdimm/nd-core.h        |   8 +
 drivers/nvdimm/nd.h             |   5 +
 drivers/nvdimm/region_devs.c    |  17 +-
 include/linux/libnvdimm.h       |   3 +
 include/linux/nd.h              |  25 ++
 8 files changed, 594 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index cf99cce8ef33..dd824d7c2669 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -173,6 +173,46 @@ int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
 	return 0;
 }
 
+ssize_t nd_sector_size_show(unsigned long current_lbasize,
+		const unsigned long *supported, char *buf)
+{
+	ssize_t len = 0;
+	int i;
+
+	for (i = 0; supported[i]; i++)
+		if (current_lbasize == supported[i])
+			len += sprintf(buf + len, "[%ld] ", supported[i]);
+		else
+			len += sprintf(buf + len, "%ld ", supported[i]);
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+
+ssize_t nd_sector_size_store(struct device *dev, const char *buf,
+		unsigned long *current_lbasize, const unsigned long *supported)
+{
+	unsigned long lbasize;
+	int rc, i;
+
+	if (dev->driver)
+		return -EBUSY;
+
+	rc = kstrtoul(buf, 0, &lbasize);
+	if (rc)
+		return rc;
+
+	for (i = 0; supported[i]; i++)
+		if (lbasize == supported[i])
+			break;
+
+	if (supported[i]) {
+		*current_lbasize = lbasize;
+		return 0;
+	} else {
+		return -EINVAL;
+	}
+}
+
 static ssize_t commands_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index b55acef179ba..101d3b76e405 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -289,6 +289,42 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 }
 EXPORT_SYMBOL_GPL(nvdimm_create);
 
+/**
+ * nd_blk_available_dpa - account the unused dpa of BLK region
+ * @nd_mapping: container of dpa-resource-root + labels
+ *
+ * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges.
+ */
+resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping)
+{
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	resource_size_t map_end, busy = 0, available;
+	struct resource *res;
+
+	if (!ndd)
+		return 0;
+
+	map_end = nd_mapping->start + nd_mapping->size - 1;
+	for_each_dpa_resource(ndd, res)
+		if (res->start >= nd_mapping->start && res->start < map_end) {
+			resource_size_t end = min(map_end, res->end);
+
+			busy += end - res->start + 1;
+		} else if (res->end >= nd_mapping->start
+				&& res->end <= map_end) {
+			busy += res->end - nd_mapping->start;
+		} else if (nd_mapping->start > res->start
+				&& nd_mapping->start < res->end) {
+			/* total eclipse of the BLK region mapping */
+			busy += nd_mapping->size;
+		}
+
+	available = map_end - nd_mapping->start + 1;
+	if (busy < available)
+		return available - busy;
+	return 0;
+}
+
 /**
  * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa
  * @nd_mapping: container of dpa-resource-root + labels
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 5d81032fcfc5..ad0ec09ca40f 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -35,7 +35,15 @@ static void namespace_pmem_release(struct device *dev)
 
 static void namespace_blk_release(struct device *dev)
 {
-	/* TODO: blk namespace support */
+	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+
+	if (nsblk->id >= 0)
+		ida_simple_remove(&nd_region->ns_ida, nsblk->id);
+	kfree(nsblk->alt_name);
+	kfree(nsblk->uuid);
+	kfree(nsblk->res);
+	kfree(nsblk);
 }
 
 static struct device_type namespace_io_device_type = {
@@ -88,8 +96,9 @@ static ssize_t __alt_name_store(struct device *dev, const char *buf,
 
 		ns_altname = &nspm->alt_name;
 	} else if (is_namespace_blk(dev)) {
-		/* TODO: blk namespace support */
-		return -ENXIO;
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+		ns_altname = &nsblk->alt_name;
 	} else
 		return -ENXIO;
 
@@ -122,6 +131,24 @@ out:
 	return rc;
 }
 
+static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
+{
+	struct nd_region *nd_region = to_nd_region(nsblk->dev.parent);
+	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	struct nd_label_id label_id;
+	resource_size_t size = 0;
+	struct resource *res;
+
+	if (!nsblk->uuid)
+		return 0;
+	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+	for_each_dpa_resource(ndd, res)
+		if (strcmp(res->name, label_id.id) == 0)
+			size += resource_size(res);
+	return size;
+}
+
 static ssize_t alt_name_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
@@ -148,8 +175,9 @@ static ssize_t alt_name_show(struct device *dev,
 
 		ns_altname = nspm->alt_name;
 	} else if (is_namespace_blk(dev)) {
-		/* TODO: blk namespace support */
-		return -ENXIO;
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+		ns_altname = nsblk->alt_name;
 	} else
 		return -ENXIO;
 
@@ -195,6 +223,8 @@ static int scan_free(struct nd_region *nd_region,
 			new_start = res->start;
 
 		rc = adjust_resource(res, new_start, resource_size(res) - n);
+		if (rc == 0)
+			res->flags |= DPA_RESOURCE_ADJUSTED;
 		nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc);
 		break;
 	}
@@ -255,14 +285,15 @@ static resource_size_t init_dpa_allocation(struct nd_label_id *label_id,
 	return rc ? n : 0;
 }
 
-static bool space_valid(bool is_pmem, struct nd_label_id *label_id,
-		struct resource *res)
+static bool space_valid(bool is_pmem, bool is_reserve,
+		struct nd_label_id *label_id, struct resource *res)
 {
 	/*
 	 * For BLK-space any space is valid, for PMEM-space, it must be
-	 * contiguous with an existing allocation.
+	 * contiguous with an existing allocation unless we are
+	 * reserving pmem.
 	 */
-	if (!is_pmem)
+	if (is_reserve || !is_pmem)
 		return true;
 	if (!res || strcmp(res->name, label_id->id) == 0)
 		return true;
@@ -278,6 +309,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 		resource_size_t n)
 {
 	resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1;
+	bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0;
 	bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
 	const resource_size_t to_allocate = n;
@@ -303,7 +335,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 		if (!first++ && res->start > nd_mapping->start) {
 			free_start = nd_mapping->start;
 			available = res->start - free_start;
-			if (space_valid(is_pmem, label_id, NULL))
+			if (space_valid(is_pmem, is_reserve, label_id, NULL))
 				loc = ALLOC_BEFORE;
 		}
 
@@ -311,7 +343,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 		if (!loc && next) {
 			free_start = res->start + resource_size(res);
 			free_end = min(mapping_end, next->start - 1);
-			if (space_valid(is_pmem, label_id, res)
+			if (space_valid(is_pmem, is_reserve, label_id, res)
 					&& free_start < free_end) {
 				available = free_end + 1 - free_start;
 				loc = ALLOC_MID;
@@ -322,7 +354,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 		if (!loc && !next) {
 			free_start = res->start + resource_size(res);
 			free_end = mapping_end;
-			if (space_valid(is_pmem, label_id, res)
+			if (space_valid(is_pmem, is_reserve, label_id, res)
 					&& free_start < free_end) {
 				available = free_end + 1 - free_start;
 				loc = ALLOC_AFTER;
@@ -336,7 +368,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 		case ALLOC_BEFORE:
 			if (strcmp(res->name, label_id->id) == 0) {
 				/* adjust current resource up */
-				if (is_pmem)
+				if (is_pmem && !is_reserve)
 					return n;
 				rc = adjust_resource(res, res->start - allocate,
 						resource_size(res) + allocate);
@@ -347,7 +379,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 		case ALLOC_MID:
 			if (strcmp(next->name, label_id->id) == 0) {
 				/* adjust next resource up */
-				if (is_pmem)
+				if (is_pmem && !is_reserve)
 					return n;
 				rc = adjust_resource(next, next->start
 						- allocate, resource_size(next)
@@ -373,7 +405,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 			/* BLK allocate bottom up */
 			if (!is_pmem)
 				free_start += available - allocate;
-			else if (free_start != nd_mapping->start)
+			else if (!is_reserve && free_start != nd_mapping->start)
 				return n;
 
 			new_res = nvdimm_allocate_dpa(ndd, label_id,
@@ -384,6 +416,8 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 			/* adjust current resource down */
 			rc = adjust_resource(res, res->start, resource_size(res)
 					+ allocate);
+			if (rc == 0)
+				res->flags |= DPA_RESOURCE_ADJUSTED;
 		}
 
 		if (!new_res)
@@ -409,11 +443,108 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 			return 0;
 	}
 
-	if (is_pmem && n == to_allocate)
+	/*
+	 * If we allocated nothing in the BLK case it may be because we are in
+	 * an initial "pmem-reserve pass".  Only do an initial BLK allocation
+	 * when none of the DPA space is reserved.
+	 */
+	if ((is_pmem || !ndd->dpa.child) && n == to_allocate)
 		return init_dpa_allocation(label_id, nd_region, nd_mapping, n);
 	return n;
 }
 
+static int merge_dpa(struct nd_region *nd_region,
+		struct nd_mapping *nd_mapping, struct nd_label_id *label_id)
+{
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	struct resource *res;
+
+	if (strncmp("pmem", label_id->id, 4) == 0)
+		return 0;
+ retry:
+	for_each_dpa_resource(ndd, res) {
+		int rc;
+		struct resource *next = res->sibling;
+		resource_size_t end = res->start + resource_size(res);
+
+		if (!next || strcmp(res->name, label_id->id) != 0
+				|| strcmp(next->name, label_id->id) != 0
+				|| end != next->start)
+			continue;
+		end += resource_size(next);
+		nvdimm_free_dpa(ndd, next);
+		rc = adjust_resource(res, res->start, end - res->start);
+		nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc);
+		if (rc)
+			return rc;
+		res->flags |= DPA_RESOURCE_ADJUSTED;
+		goto retry;
+	}
+
+	return 0;
+}
+
+static int __reserve_free_pmem(struct device *dev, void *data)
+{
+	struct nvdimm *nvdimm = data;
+	struct nd_region *nd_region;
+	struct nd_label_id label_id;
+	int i;
+
+	if (!is_nd_pmem(dev))
+		return 0;
+
+	nd_region = to_nd_region(dev);
+	if (nd_region->ndr_mappings == 0)
+		return 0;
+
+	memset(&label_id, 0, sizeof(label_id));
+	strcat(label_id.id, "pmem-reserve");
+	for (i = 0; i < nd_region->ndr_mappings; i++) {
+		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+		resource_size_t n, rem = 0;
+
+		if (nd_mapping->nvdimm != nvdimm)
+			continue;
+
+		n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem);
+		if (n == 0)
+			return 0;
+		rem = scan_allocate(nd_region, nd_mapping, &label_id, n);
+		dev_WARN_ONCE(&nd_region->dev, rem,
+				"pmem reserve underrun: %#llx of %#llx bytes\n",
+				(unsigned long long) n - rem,
+				(unsigned long long) n);
+		return rem ? -ENXIO : 0;
+	}
+
+	return 0;
+}
+
+static void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
+		struct nd_mapping *nd_mapping)
+{
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	struct resource *res, *_res;
+
+	for_each_dpa_resource_safe(ndd, res, _res)
+		if (strcmp(res->name, "pmem-reserve") == 0)
+			nvdimm_free_dpa(ndd, res);
+}
+
+static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
+		struct nd_mapping *nd_mapping)
+{
+	struct nvdimm *nvdimm = nd_mapping->nvdimm;
+	int rc;
+
+	rc = device_for_each_child(&nvdimm_bus->dev, nvdimm,
+			__reserve_free_pmem);
+	if (rc)
+		release_free_pmem(nvdimm_bus, nd_mapping);
+	return rc;
+}
+
 /**
  * grow_dpa_allocation - for each dimm allocate n bytes for @label_id
  * @nd_region: the set of dimms to allocate @n more bytes from
@@ -430,13 +561,45 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 static int grow_dpa_allocation(struct nd_region *nd_region,
 		struct nd_label_id *label_id, resource_size_t n)
 {
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+	bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
 	int i;
 
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
-		int rc;
+		resource_size_t rem = n;
+		int rc, j;
+
+		/*
+		 * In the BLK case try once with all unallocated PMEM
+		 * reserved, and once without
+		 */
+		for (j = is_pmem; j < 2; j++) {
+			bool blk_only = j == 0;
+
+			if (blk_only) {
+				rc = reserve_free_pmem(nvdimm_bus, nd_mapping);
+				if (rc)
+					return rc;
+			}
+			rem = scan_allocate(nd_region, nd_mapping,
+					label_id, rem);
+			if (blk_only)
+				release_free_pmem(nvdimm_bus, nd_mapping);
 
-		rc = scan_allocate(nd_region, nd_mapping, label_id, n);
+			/* try again and allow encroachments into PMEM */
+			if (rem == 0)
+				break;
+		}
+
+		dev_WARN_ONCE(&nd_region->dev, rem,
+				"allocation underrun: %#llx of %#llx bytes\n",
+				(unsigned long long) n - rem,
+				(unsigned long long) n);
+		if (rem)
+			return -ENXIO;
+
+		rc = merge_dpa(nd_region, nd_mapping, label_id);
 		if (rc)
 			return rc;
 	}
@@ -472,8 +635,10 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
 
 		uuid = nspm->uuid;
 	} else if (is_namespace_blk(dev)) {
-		/* TODO: blk namespace support */
-		return -ENXIO;
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+		uuid = nsblk->uuid;
+		flags = NSLABEL_FLAG_LOCAL;
 	}
 
 	/*
@@ -528,6 +693,14 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
 
 		nd_namespace_pmem_set_size(nd_region, nspm,
 				val * nd_region->ndr_mappings);
+	} else if (is_namespace_blk(dev)) {
+		/*
+		 * Try to delete the namespace if we deleted all of its
+		 * allocation and this is not the seed device for the
+		 * region.
+		 */
+		if (val == 0 && nd_region->ns_seed != dev)
+			nd_device_unregister(dev, ND_ASYNC);
 	}
 
 	return rc;
@@ -554,8 +727,9 @@ static ssize_t size_store(struct device *dev,
 
 		uuid = &nspm->uuid;
 	} else if (is_namespace_blk(dev)) {
-		/* TODO: blk namespace support */
-		rc = -ENXIO;
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+		uuid = &nsblk->uuid;
 	}
 
 	if (rc == 0 && val == 0 && uuid) {
@@ -576,21 +750,23 @@ static ssize_t size_store(struct device *dev,
 static ssize_t size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
+	unsigned long long size = 0;
+
+	nvdimm_bus_lock(dev);
 	if (is_namespace_pmem(dev)) {
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
-		return sprintf(buf, "%llu\n", (unsigned long long)
-				resource_size(&nspm->nsio.res));
+		size = resource_size(&nspm->nsio.res);
 	} else if (is_namespace_blk(dev)) {
-		/* TODO: blk namespace support */
-		return -ENXIO;
+		size = nd_namespace_blk_size(to_nd_namespace_blk(dev));
 	} else if (is_namespace_io(dev)) {
 		struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
 
-		return sprintf(buf, "%llu\n", (unsigned long long)
-				resource_size(&nsio->res));
-	} else
-		return -ENXIO;
+		size = resource_size(&nsio->res);
+	}
+	nvdimm_bus_unlock(dev);
+
+	return sprintf(buf, "%llu\n", size);
 }
 static DEVICE_ATTR(size, S_IRUGO, size_show, size_store);
 
@@ -604,8 +780,9 @@ static ssize_t uuid_show(struct device *dev,
 
 		uuid = nspm->uuid;
 	} else if (is_namespace_blk(dev)) {
-		/* TODO: blk namespace support */
-		return -ENXIO;
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+		uuid = nsblk->uuid;
 	} else
 		return -ENXIO;
 
@@ -669,8 +846,9 @@ static ssize_t uuid_store(struct device *dev,
 
 		ns_uuid = &nspm->uuid;
 	} else if (is_namespace_blk(dev)) {
-		/* TODO: blk namespace support */
-		return -ENXIO;
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+		ns_uuid = &nsblk->uuid;
 	} else
 		return -ENXIO;
 
@@ -712,12 +890,48 @@ static ssize_t resource_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(resource);
 
+static const unsigned long ns_lbasize_supported[] = { 512, 0 };
+
+static ssize_t sector_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+	if (!is_namespace_blk(dev))
+		return -ENXIO;
+
+	return nd_sector_size_show(nsblk->lbasize, ns_lbasize_supported, buf);
+}
+
+static ssize_t sector_size_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+	ssize_t rc;
+
+	if (!is_namespace_blk(dev))
+		return -ENXIO;
+
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	rc = nd_sector_size_store(dev, buf, &nsblk->lbasize,
+			ns_lbasize_supported);
+	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(sector_size);
+
 static struct attribute *nd_namespace_attributes[] = {
 	&dev_attr_nstype.attr,
 	&dev_attr_size.attr,
 	&dev_attr_uuid.attr,
 	&dev_attr_resource.attr,
 	&dev_attr_alt_name.attr,
+	&dev_attr_sector_size.attr,
 	NULL,
 };
 
@@ -735,6 +949,10 @@ static umode_t namespace_visible(struct kobject *kobj,
 	if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
 		if (a == &dev_attr_size.attr)
 			return S_IWUSR | S_IRUGO;
+
+		if (is_namespace_pmem(dev) && a == &dev_attr_sector_size.attr)
+			return 0;
+
 		return a->mode;
 	}
 
@@ -1022,6 +1240,176 @@ static struct device **create_namespace_pmem(struct nd_region *nd_region)
 	return NULL;
 }
 
+struct resource *nsblk_add_resource(struct nd_region *nd_region,
+		struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
+		resource_size_t start)
+{
+	struct nd_label_id label_id;
+	struct resource *res;
+
+	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+	res = krealloc(nsblk->res,
+			sizeof(void *) * (nsblk->num_resources + 1),
+			GFP_KERNEL);
+	if (!res)
+		return NULL;
+	nsblk->res = (struct resource **) res;
+	for_each_dpa_resource(ndd, res)
+		if (strcmp(res->name, label_id.id) == 0
+				&& res->start == start) {
+			nsblk->res[nsblk->num_resources++] = res;
+			return res;
+		}
+	return NULL;
+}
+
+static struct device *nd_namespace_blk_create(struct nd_region *nd_region)
+{
+	struct nd_namespace_blk *nsblk;
+	struct device *dev;
+
+	if (!is_nd_blk(&nd_region->dev))
+		return NULL;
+
+	nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+	if (!nsblk)
+		return NULL;
+
+	dev = &nsblk->dev;
+	dev->type = &namespace_blk_device_type;
+	nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL);
+	if (nsblk->id < 0) {
+		kfree(nsblk);
+		return NULL;
+	}
+	dev_set_name(dev, "namespace%d.%d", nd_region->id, nsblk->id);
+	dev->parent = &nd_region->dev;
+	dev->groups = nd_namespace_attribute_groups;
+
+	return &nsblk->dev;
+}
+
+void nd_region_create_blk_seed(struct nd_region *nd_region)
+{
+	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+	nd_region->ns_seed = nd_namespace_blk_create(nd_region);
+	/*
+	 * Seed creation failures are not fatal, provisioning is simply
+	 * disabled until memory becomes available
+	 */
+	if (!nd_region->ns_seed)
+		dev_err(&nd_region->dev, "failed to create blk namespace\n");
+	else
+		nd_device_register(nd_region->ns_seed);
+}
+
+static struct device **create_namespace_blk(struct nd_region *nd_region)
+{
+	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+	struct nd_namespace_label *nd_label;
+	struct device *dev, **devs = NULL;
+	struct nd_namespace_blk *nsblk;
+	struct nvdimm_drvdata *ndd;
+	int i, l, count = 0;
+	struct resource *res;
+
+	if (nd_region->ndr_mappings == 0)
+		return NULL;
+
+	ndd = to_ndd(nd_mapping);
+	for_each_label(l, nd_label, nd_mapping->labels) {
+		u32 flags = __le32_to_cpu(nd_label->flags);
+		char *name[NSLABEL_NAME_LEN];
+		struct device **__devs;
+
+		if (flags & NSLABEL_FLAG_LOCAL)
+			/* pass */;
+		else
+			continue;
+
+		for (i = 0; i < count; i++) {
+			nsblk = to_nd_namespace_blk(devs[i]);
+			if (memcmp(nsblk->uuid, nd_label->uuid,
+						NSLABEL_UUID_LEN) == 0) {
+				res = nsblk_add_resource(nd_region, ndd, nsblk,
+						__le64_to_cpu(nd_label->dpa));
+				if (!res)
+					goto err;
+				nd_dbg_dpa(nd_region, ndd, res, "%s assign\n",
+					dev_name(&nsblk->dev));
+				break;
+			}
+		}
+		if (i < count)
+			continue;
+		__devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL);
+		if (!__devs)
+			goto err;
+		memcpy(__devs, devs, sizeof(dev) * count);
+		kfree(devs);
+		devs = __devs;
+
+		nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+		if (!nsblk)
+			goto err;
+		dev = &nsblk->dev;
+		dev->type = &namespace_blk_device_type;
+		dev->parent = &nd_region->dev;
+		dev_set_name(dev, "namespace%d.%d", nd_region->id, count);
+		devs[count++] = dev;
+		nsblk->id = -1;
+		nsblk->lbasize = __le64_to_cpu(nd_label->lbasize);
+		nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN,
+				GFP_KERNEL);
+		if (!nsblk->uuid)
+			goto err;
+		memcpy(name, nd_label->name, NSLABEL_NAME_LEN);
+		if (name[0])
+			nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN,
+					GFP_KERNEL);
+		res = nsblk_add_resource(nd_region, ndd, nsblk,
+				__le64_to_cpu(nd_label->dpa));
+		if (!res)
+			goto err;
+		nd_dbg_dpa(nd_region, ndd, res, "%s assign\n",
+				dev_name(&nsblk->dev));
+	}
+
+	dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n",
+			__func__, count, count == 1 ? "" : "s");
+
+	if (count == 0) {
+		/* Publish a zero-sized namespace for userspace to configure. */
+		for (i = 0; i < nd_region->ndr_mappings; i++) {
+			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
+
+			kfree(nd_mapping->labels);
+			nd_mapping->labels = NULL;
+		}
+
+		devs = kcalloc(2, sizeof(dev), GFP_KERNEL);
+		if (!devs)
+			goto err;
+		nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
+		if (!nsblk)
+			goto err;
+		dev = &nsblk->dev;
+		dev->type = &namespace_blk_device_type;
+		dev->parent = &nd_region->dev;
+		devs[count++] = dev;
+	}
+
+	return devs;
+
+err:
+	for (i = 0; i < count; i++) {
+		nsblk = to_nd_namespace_blk(devs[i]);
+		namespace_blk_release(&nsblk->dev);
+	}
+	kfree(devs);
+	return NULL;
+}
+
 static int init_active_labels(struct nd_region *nd_region)
 {
 	int i;
@@ -1087,6 +1475,9 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
 	case ND_DEVICE_NAMESPACE_PMEM:
 		devs = create_namespace_pmem(nd_region);
 		break;
+	case ND_DEVICE_NAMESPACE_BLK:
+		devs = create_namespace_blk(nd_region);
+		break;
 	default:
 		break;
 	}
@@ -1095,15 +1486,50 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
 	if (!devs)
 		return -ENODEV;
 
-	nd_region->ns_seed = devs[0];
 	for (i = 0; devs[i]; i++) {
 		struct device *dev = devs[i];
+		int id;
 
-		dev_set_name(dev, "namespace%d.%d", nd_region->id, i);
+		if (type == ND_DEVICE_NAMESPACE_BLK) {
+			struct nd_namespace_blk *nsblk;
+
+			nsblk = to_nd_namespace_blk(dev);
+			id = ida_simple_get(&nd_region->ns_ida, 0, 0,
+					GFP_KERNEL);
+			nsblk->id = id;
+		} else
+			id = i;
+
+		if (id < 0)
+			break;
+		dev_set_name(dev, "namespace%d.%d", nd_region->id, id);
 		dev->groups = nd_namespace_attribute_groups;
 		nd_device_register(dev);
 	}
+	if (i)
+		nd_region->ns_seed = devs[0];
+
+	if (devs[i]) {
+		int j;
+
+		for (j = i; devs[j]; j++) {
+			struct device *dev = devs[j];
+
+			device_initialize(dev);
+			put_device(dev);
+		}
+		*err = j - i;
+		/*
+		 * All of the namespaces we tried to register failed, so
+		 * fail region activation.
+		 */
+		if (*err == 0)
+			rc = -ENODEV;
+	}
 	kfree(devs);
 
+	if (rc == -ENODEV)
+		return rc;
+
 	return i;
 }
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index c6c889292bab..22489555a6f1 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -17,6 +17,7 @@
 #include <linux/libnvdimm.h>
 #include <linux/sizes.h>
 #include <linux/mutex.h>
+#include <linux/nd.h>
 
 extern struct list_head nvdimm_bus_list;
 extern struct mutex nvdimm_bus_list_mutex;
@@ -48,6 +49,8 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
 void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+struct nd_region;
+void nd_region_create_blk_seed(struct nd_region *nd_region);
 void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
 void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
@@ -64,8 +67,13 @@ struct nvdimm_drvdata;
 struct nd_mapping;
 resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
 		struct nd_mapping *nd_mapping, resource_size_t *overlap);
+resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping);
 resource_size_t nd_region_available_dpa(struct nd_region *nd_region);
 resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
 		struct nd_label_id *label_id);
+struct nd_mapping;
+struct resource *nsblk_add_resource(struct nd_region *nd_region,
+		struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
+		resource_size_t start);
 void get_ndd(struct nvdimm_drvdata *ndd);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 03e610cd9f43..9b021b626202 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -73,6 +73,7 @@ static inline struct nd_namespace_index *to_next_namespace_index(
 
 struct nd_region {
 	struct device dev;
+	struct ida ns_ida;
 	struct device *ns_seed;
 	u16 ndr_mappings;
 	u64 ndr_size;
@@ -102,6 +103,10 @@ void nd_device_register(struct device *dev);
 void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
 int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
 		size_t len);
+ssize_t nd_sector_size_show(unsigned long current_lbasize,
+		const unsigned long *supported, char *buf);
+ssize_t nd_sector_size_store(struct device *dev, const char *buf,
+		unsigned long *current_lbasize, const unsigned long *supported);
 int __init nvdimm_init(void);
 int __init nd_region_init(void);
 void nvdimm_exit(void);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b45806f7176d..ac21ce419beb 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -118,7 +118,12 @@ static int is_uuid_busy(struct device *dev, void *data)
 		break;
 	}
 	case ND_DEVICE_NAMESPACE_BLK: {
-		/* TODO: blk namespace support */
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
+		if (!nsblk->uuid)
+			break;
+		if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) == 0)
+			return -EBUSY;
 		break;
 	}
 	default:
@@ -230,7 +235,7 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
 				goto retry;
 			}
 		} else if (is_nd_blk(&nd_region->dev)) {
-			/* TODO: BLK Namespace support */
+			available += nd_blk_available_dpa(nd_mapping);
 		}
 	}
 
@@ -360,6 +365,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 			nd_mapping->ndd = NULL;
 			atomic_dec(&nvdimm->busy);
 		}
+	} else if (dev->parent && is_nd_blk(dev->parent) && probe) {
+		struct nd_region *nd_region = to_nd_region(dev->parent);
+
+		nvdimm_bus_lock(dev);
+		if (nd_region->ns_seed == dev)
+			nd_region_create_blk_seed(nd_region);
+		nvdimm_bus_unlock(dev);
 	}
 }
 
@@ -533,6 +545,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_region->ndr_mappings = ndr_desc->num_mappings;
 	nd_region->provider_data = ndr_desc->provider_data;
 	nd_region->nd_set = ndr_desc->nd_set;
+	ida_init(&nd_region->ns_ida);
 	dev = &nd_region->dev;
 	dev_set_name(dev, "region%d", nd_region->id);
 	dev->parent = &nvdimm_bus->dev;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index c130972e08c4..a59dca17b3aa 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -27,6 +27,9 @@ enum {
 	ND_CMD_MAX_ENVELOPE = 16,
 	ND_CMD_ARS_STATUS_MAX = SZ_4K,
 	ND_MAX_MAPPINGS = 32,
+
+	/* mark newly adjusted resources as requiring a label update */
+	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
 
 extern struct attribute_group nvdimm_bus_attribute_group;
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 255c38a83083..23276ea91690 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -50,6 +50,26 @@ struct nd_namespace_pmem {
 	u8 *uuid;
 };
 
+/**
+ * struct nd_namespace_blk - namespace for dimm-bounded persistent memory
+ * @dev: namespace device creation by the nd region driver
+ * @alt_name: namespace name supplied in the dimm label
+ * @uuid: namespace name supplied in the dimm label
+ * @id: ida allocated id
+ * @lbasize: blk namespaces have a native sector size when btt not present
+ * @num_resources: number of dpa extents to claim
+ * @res: discontiguous dpa extents for given dimm
+ */
+struct nd_namespace_blk {
+	struct device dev;
+	char *alt_name;
+	u8 *uuid;
+	int id;
+	unsigned long lbasize;
+	int num_resources;
+	struct resource **res;
+};
+
 static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev)
 {
 	return container_of(dev, struct nd_namespace_io, dev);
@@ -62,6 +82,11 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev)
 	return container_of(nsio, struct nd_namespace_pmem, nsio);
 }
 
+static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev)
+{
+	return container_of(dev, struct nd_namespace_blk, dev);
+}
+
 #define MODULE_ALIAS_ND_DEVICE(type) \
 	MODULE_ALIAS("nd:t" __stringify(type) "*")
 #define ND_DEVICE_MODALIAS_FMT "nd:t%d"
-- 
cgit v1.2.3


From d7f96f97c4031fa4ffdb7801f9aae23e96170a6f Mon Sep 17 00:00:00 2001
From: Ivan Khoronzhuk <ivan.khoronzhuk@globallogic.com>
Date: Thu, 25 Jun 2015 09:06:56 +0200
Subject: firmware: dmi_scan: add SBMIOS entry and DMI tables

Some utils, like dmidecode and smbios, need to access SMBIOS entry
table area in order to get information like SMBIOS version, size, etc.
Currently it's done via /dev/mem. But for situation when /dev/mem
usage is disabled, the utils have to use dmi sysfs instead, which
doesn't represent SMBIOS entry and adds code/delay redundancy when direct
access for table is needed.

So this patch creates dmi/tables and adds SMBIOS entry point to allow
utils in question to work correctly without /dev/mem. Also patch adds
raw dmi table to simplify dmi table processing in user space, as
proposed by Jean Delvare.

Tested-by: Roy Franz <roy.franz@linaro.org>
Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@globallogic.com>
Signed-off-by: Jean Delvare <jdelvare@suse.de>
---
 .../ABI/testing/sysfs-firmware-dmi-tables          | 22 ++++++
 MAINTAINERS                                        |  1 +
 drivers/firmware/dmi-sysfs.c                       | 17 +++--
 drivers/firmware/dmi_scan.c                        | 78 ++++++++++++++++++++++
 include/linux/dmi.h                                |  2 +
 5 files changed, 111 insertions(+), 9 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-firmware-dmi-tables

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-firmware-dmi-tables b/Documentation/ABI/testing/sysfs-firmware-dmi-tables
new file mode 100644
index 000000000000..ff3cac8ed0bd
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-dmi-tables
@@ -0,0 +1,22 @@
+What:		/sys/firmware/dmi/tables/
+Date:		April 2015
+Contact:	Ivan Khoronzhuk <ivan.khoronzhuk@globallogic.com>
+Description:
+		The firmware provides DMI structures as a packed list of
+		data referenced by a SMBIOS table entry point. The SMBIOS
+		entry point contains general information, like SMBIOS
+		version, DMI table size, etc. The structure, content and
+		size of SMBIOS entry point is dependent on SMBIOS version.
+		The format of SMBIOS entry point and DMI structures
+		can be read in SMBIOS specification.
+
+		The dmi/tables provides raw SMBIOS entry point and DMI tables
+		through sysfs as an alternative to utilities reading them
+		from /dev/mem. The raw SMBIOS entry point and DMI table are
+		presented as binary attributes and are accessible via:
+
+		/sys/firmware/dmi/tables/smbios_entry_point
+		/sys/firmware/dmi/tables/DMI
+
+		The complete DMI information can be obtained using these two
+		tables.
diff --git a/MAINTAINERS b/MAINTAINERS
index 0502f9a7087f..ca473689d0f4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3289,6 +3289,7 @@ DMI/SMBIOS SUPPORT
 M:	Jean Delvare <jdelvare@suse.de>
 S:	Maintained
 T:	quilt http://jdelvare.nerim.net/devel/linux/jdelvare-dmi/
+F:	Documentation/ABI/testing/sysfs-firmware-dmi-tables
 F:	drivers/firmware/dmi-id.c
 F:	drivers/firmware/dmi_scan.c
 F:	include/linux/dmi.h
diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c
index e0f1cb3d3598..ef76e5eecf0b 100644
--- a/drivers/firmware/dmi-sysfs.c
+++ b/drivers/firmware/dmi-sysfs.c
@@ -566,7 +566,6 @@ static struct kobj_type dmi_sysfs_entry_ktype = {
 	.default_attrs = dmi_sysfs_entry_attrs,
 };
 
-static struct kobject *dmi_kobj;
 static struct kset *dmi_kset;
 
 /* Global count of all instances seen.  Only for setup */
@@ -648,17 +647,20 @@ static void cleanup_entry_list(void)
 
 static int __init dmi_sysfs_init(void)
 {
-	int error = -ENOMEM;
+	int error;
 	int val;
 
-	/* Set up our directory */
-	dmi_kobj = kobject_create_and_add("dmi", firmware_kobj);
-	if (!dmi_kobj)
+	if (!dmi_kobj) {
+		pr_err("dmi-sysfs: dmi entry is absent.\n");
+		error = -ENODATA;
 		goto err;
+	}
 
 	dmi_kset = kset_create_and_add("entries", NULL, dmi_kobj);
-	if (!dmi_kset)
+	if (!dmi_kset) {
+		error = -ENOMEM;
 		goto err;
+	}
 
 	val = 0;
 	error = dmi_walk(dmi_sysfs_register_handle, &val);
@@ -675,7 +677,6 @@ static int __init dmi_sysfs_init(void)
 err:
 	cleanup_entry_list();
 	kset_unregister(dmi_kset);
-	kobject_put(dmi_kobj);
 	return error;
 }
 
@@ -685,8 +686,6 @@ static void __exit dmi_sysfs_exit(void)
 	pr_debug("dmi-sysfs: unloading.\n");
 	cleanup_entry_list();
 	kset_unregister(dmi_kset);
-	kobject_del(dmi_kobj);
-	kobject_put(dmi_kobj);
 }
 
 module_init(dmi_sysfs_init);
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 7fdf2868a276..5e0b770fdae5 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -10,6 +10,9 @@
 #include <asm/dmi.h>
 #include <asm/unaligned.h>
 
+struct kobject *dmi_kobj;
+EXPORT_SYMBOL_GPL(dmi_kobj);
+
 /*
  * DMI stands for "Desktop Management Interface".  It is part
  * of and an antecedent to, SMBIOS, which stands for System
@@ -20,6 +23,9 @@ static const char dmi_empty_string[] = "        ";
 static u32 dmi_ver __initdata;
 static u32 dmi_len;
 static u16 dmi_num;
+static u8 smbios_entry_point[32];
+static int smbios_entry_point_size;
+
 /*
  * Catch too early calls to dmi_check_system():
  */
@@ -488,6 +494,8 @@ static int __init dmi_present(const u8 *buf)
 	if (memcmp(buf, "_SM_", 4) == 0 &&
 	    buf[5] < 32 && dmi_checksum(buf, buf[5])) {
 		smbios_ver = get_unaligned_be16(buf + 6);
+		smbios_entry_point_size = buf[5];
+		memcpy(smbios_entry_point, buf, smbios_entry_point_size);
 
 		/* Some BIOS report weird SMBIOS version, fix that up */
 		switch (smbios_ver) {
@@ -522,6 +530,9 @@ static int __init dmi_present(const u8 *buf)
 				pr_info("SMBIOS %d.%d present.\n",
 				       dmi_ver >> 8, dmi_ver & 0xFF);
 			} else {
+				smbios_entry_point_size = 15;
+				memcpy(smbios_entry_point, buf,
+				       smbios_entry_point_size);
 				pr_info("Legacy DMI %d.%d present.\n",
 				       dmi_ver >> 8, dmi_ver & 0xFF);
 			}
@@ -548,6 +559,8 @@ static int __init dmi_smbios3_present(const u8 *buf)
 		dmi_num = 0;			/* No longer specified */
 		dmi_len = get_unaligned_le32(buf + 12);
 		dmi_base = get_unaligned_le64(buf + 16);
+		smbios_entry_point_size = buf[6];
+		memcpy(smbios_entry_point, buf, smbios_entry_point_size);
 
 		if (dmi_walk_early(dmi_decode) == 0) {
 			pr_info("SMBIOS %d.%d.%d present.\n",
@@ -639,6 +652,71 @@ void __init dmi_scan_machine(void)
 	dmi_initialized = 1;
 }
 
+static ssize_t raw_table_read(struct file *file, struct kobject *kobj,
+			      struct bin_attribute *attr, char *buf,
+			      loff_t pos, size_t count)
+{
+	memcpy(buf, attr->private + pos, count);
+	return count;
+}
+
+static BIN_ATTR(smbios_entry_point, S_IRUSR, raw_table_read, NULL, 0);
+static BIN_ATTR(DMI, S_IRUSR, raw_table_read, NULL, 0);
+
+static int __init dmi_init(void)
+{
+	struct kobject *tables_kobj;
+	u8 *dmi_table;
+	int ret = -ENOMEM;
+
+	if (!dmi_available) {
+		ret = -ENODATA;
+		goto err;
+	}
+
+	/*
+	 * Set up dmi directory at /sys/firmware/dmi. This entry should stay
+	 * even after farther error, as it can be used by other modules like
+	 * dmi-sysfs.
+	 */
+	dmi_kobj = kobject_create_and_add("dmi", firmware_kobj);
+	if (!dmi_kobj)
+		goto err;
+
+	tables_kobj = kobject_create_and_add("tables", dmi_kobj);
+	if (!tables_kobj)
+		goto err;
+
+	dmi_table = dmi_remap(dmi_base, dmi_len);
+	if (!dmi_table)
+		goto err_tables;
+
+	bin_attr_smbios_entry_point.size = smbios_entry_point_size;
+	bin_attr_smbios_entry_point.private = smbios_entry_point;
+	ret = sysfs_create_bin_file(tables_kobj, &bin_attr_smbios_entry_point);
+	if (ret)
+		goto err_unmap;
+
+	bin_attr_DMI.size = dmi_len;
+	bin_attr_DMI.private = dmi_table;
+	ret = sysfs_create_bin_file(tables_kobj, &bin_attr_DMI);
+	if (!ret)
+		return 0;
+
+	sysfs_remove_bin_file(tables_kobj,
+			      &bin_attr_smbios_entry_point);
+ err_unmap:
+	dmi_unmap(dmi_table);
+ err_tables:
+	kobject_del(tables_kobj);
+	kobject_put(tables_kobj);
+ err:
+	pr_err("dmi: Firmware registration failed.\n");
+
+	return ret;
+}
+subsys_initcall(dmi_init);
+
 /**
  * dmi_set_dump_stack_arch_desc - set arch description for dump_stack()
  *
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index f820f0a336c9..2f9f98827c0a 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -2,6 +2,7 @@
 #define __DMI_H__
 
 #include <linux/list.h>
+#include <linux/kobject.h>
 #include <linux/mod_devicetable.h>
 
 /* enum dmi_field is in mod_devicetable.h */
@@ -93,6 +94,7 @@ struct dmi_dev_onboard {
 	int devfn;
 };
 
+extern struct kobject *dmi_kobj;
 extern int dmi_check_system(const struct dmi_system_id *list);
 const struct dmi_system_id *dmi_first_match(const struct dmi_system_id *list);
 extern const char * dmi_get_system_info(int field);
-- 
cgit v1.2.3


From 9ea650c804404e55dbf17c928203141282e759ab Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Thu, 25 Jun 2015 09:06:57 +0200
Subject: firmware: dmi: struct dmi_header should be packed

Apparently the compiler does fine without it, but it feels safer and
clearer to add the missing attribute.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
---
 include/linux/dmi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 2f9f98827c0a..5055ac34142d 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -75,7 +75,7 @@ struct dmi_header {
 	u8 type;
 	u8 length;
 	u16 handle;
-};
+} __packed;
 
 struct dmi_device {
 	struct list_head list;
-- 
cgit v1.2.3


From 8c2f7e8658df1d3b7cbfa62706941d14c715823a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 25 Jun 2015 04:20:04 -0400
Subject: libnvdimm: infrastructure for btt devices

NVDIMM namespaces, in addition to accepting "struct bio" based requests,
also have the capability to perform byte-aligned accesses.  By default
only the bio/block interface is used.  However, if another driver can
make effective use of the byte-aligned capability it can claim namespace
interface and use the byte-aligned ->rw_bytes() interface.

The BTT driver is the initial first consumer of this mechanism to allow
adding atomic sector update semantics to a pmem or blk namespace.  This
patch is the sysfs infrastructure to allow configuring a BTT instance
for a namespace.  Enabling that BTT and performing i/o is in a
subsequent patch.

Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/Kconfig          |   3 +
 drivers/nvdimm/Makefile         |   1 +
 drivers/nvdimm/btt.h            |  45 +++++
 drivers/nvdimm/btt_devs.c       | 422 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/bus.c            |  12 +-
 drivers/nvdimm/label.c          |   5 +-
 drivers/nvdimm/namespace_devs.c | 204 +++++++++++++++----
 drivers/nvdimm/nd-core.h        |   4 +
 drivers/nvdimm/nd.h             |  40 ++++
 drivers/nvdimm/pmem.c           | 132 ++++++++-----
 drivers/nvdimm/region.c         |   8 +-
 drivers/nvdimm/region_devs.c    |  39 +++-
 include/linux/nd.h              |  63 +++++-
 13 files changed, 879 insertions(+), 99 deletions(-)
 create mode 100644 drivers/nvdimm/btt.h
 create mode 100644 drivers/nvdimm/btt_devs.c

(limited to 'include/linux')

diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 07a29113b870..5680e8e7a7aa 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -33,4 +33,7 @@ config BLK_DEV_PMEM
 
 	  Say Y if you want to use an NVDIMM
 
+config BTT
+	def_bool y
+
 endif
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index abce98f87f16..6085b4bd7312 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -11,3 +11,4 @@ libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
 libnvdimm-y += namespace_devs.o
 libnvdimm-y += label.o
+libnvdimm-$(CONFIG_BTT) += btt_devs.o
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
new file mode 100644
index 000000000000..e8f6d8e0ddd3
--- /dev/null
+++ b/drivers/nvdimm/btt.h
@@ -0,0 +1,45 @@
+/*
+ * Block Translation Table library
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_BTT_H
+#define _LINUX_BTT_H
+
+#include <linux/types.h>
+
+#define BTT_SIG_LEN 16
+#define BTT_SIG "BTT_ARENA_INFO\0"
+
+struct btt_sb {
+	u8 signature[BTT_SIG_LEN];
+	u8 uuid[16];
+	u8 parent_uuid[16];
+	__le32 flags;
+	__le16 version_major;
+	__le16 version_minor;
+	__le32 external_lbasize;
+	__le32 external_nlba;
+	__le32 internal_lbasize;
+	__le32 internal_nlba;
+	__le32 nfree;
+	__le32 infosize;
+	__le64 nextoff;
+	__le64 dataoff;
+	__le64 mapoff;
+	__le64 logoff;
+	__le64 info2off;
+	u8 padding[3968];
+	__le64 checksum;
+};
+
+#endif
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
new file mode 100644
index 000000000000..effb70a88347
--- /dev/null
+++ b/drivers/nvdimm/btt_devs.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "btt.h"
+#include "nd.h"
+
+static void __nd_btt_detach_ndns(struct nd_btt *nd_btt)
+{
+	struct nd_namespace_common *ndns = nd_btt->ndns;
+
+	dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex)
+			|| ndns->claim != &nd_btt->dev,
+			"%s: invalid claim\n", __func__);
+	ndns->claim = NULL;
+	nd_btt->ndns = NULL;
+	put_device(&ndns->dev);
+}
+
+static void nd_btt_detach_ndns(struct nd_btt *nd_btt)
+{
+	struct nd_namespace_common *ndns = nd_btt->ndns;
+
+	if (!ndns)
+		return;
+	get_device(&ndns->dev);
+	device_lock(&ndns->dev);
+	__nd_btt_detach_ndns(nd_btt);
+	device_unlock(&ndns->dev);
+	put_device(&ndns->dev);
+}
+
+static bool __nd_btt_attach_ndns(struct nd_btt *nd_btt,
+		struct nd_namespace_common *ndns)
+{
+	if (ndns->claim)
+		return false;
+	dev_WARN_ONCE(&nd_btt->dev, !mutex_is_locked(&ndns->dev.mutex)
+			|| nd_btt->ndns,
+			"%s: invalid claim\n", __func__);
+	ndns->claim = &nd_btt->dev;
+	nd_btt->ndns = ndns;
+	get_device(&ndns->dev);
+	return true;
+}
+
+static bool nd_btt_attach_ndns(struct nd_btt *nd_btt,
+		struct nd_namespace_common *ndns)
+{
+	bool claimed;
+
+	device_lock(&ndns->dev);
+	claimed = __nd_btt_attach_ndns(nd_btt, ndns);
+	device_unlock(&ndns->dev);
+	return claimed;
+}
+
+static void nd_btt_release(struct device *dev)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+
+	dev_dbg(dev, "%s\n", __func__);
+	nd_btt_detach_ndns(nd_btt);
+	ida_simple_remove(&nd_region->btt_ida, nd_btt->id);
+	kfree(nd_btt->uuid);
+	kfree(nd_btt);
+}
+
+static struct device_type nd_btt_device_type = {
+	.name = "nd_btt",
+	.release = nd_btt_release,
+};
+
+bool is_nd_btt(struct device *dev)
+{
+	return dev->type == &nd_btt_device_type;
+}
+EXPORT_SYMBOL(is_nd_btt);
+
+struct nd_btt *to_nd_btt(struct device *dev)
+{
+	struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev);
+
+	WARN_ON(!is_nd_btt(dev));
+	return nd_btt;
+}
+EXPORT_SYMBOL(to_nd_btt);
+
+static const unsigned long btt_lbasize_supported[] = { 512, 4096, 0 };
+
+static ssize_t sector_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+
+	return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf);
+}
+
+static ssize_t sector_size_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+	ssize_t rc;
+
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize,
+			btt_lbasize_supported);
+	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(sector_size);
+
+static ssize_t uuid_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+
+	if (nd_btt->uuid)
+		return sprintf(buf, "%pUb\n", nd_btt->uuid);
+	return sprintf(buf, "\n");
+}
+
+static ssize_t uuid_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+	ssize_t rc;
+
+	device_lock(dev);
+	rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len);
+	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+	device_unlock(dev);
+
+	return rc ? rc : len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static ssize_t namespace_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+	ssize_t rc;
+
+	nvdimm_bus_lock(dev);
+	rc = sprintf(buf, "%s\n", nd_btt->ndns
+			? dev_name(&nd_btt->ndns->dev) : "");
+	nvdimm_bus_unlock(dev);
+	return rc;
+}
+
+static int namespace_match(struct device *dev, void *data)
+{
+	char *name = data;
+
+	return strcmp(name, dev_name(dev)) == 0;
+}
+
+static bool is_nd_btt_idle(struct device *dev)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+
+	if (nd_region->btt_seed == dev || nd_btt->ndns || dev->driver)
+		return false;
+	return true;
+}
+
+static ssize_t __namespace_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+	struct nd_namespace_common *ndns;
+	struct device *found;
+	char *name;
+
+	if (dev->driver) {
+		dev_dbg(dev, "%s: -EBUSY\n", __func__);
+		return -EBUSY;
+	}
+
+	name = kstrndup(buf, len, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+	strim(name);
+
+	if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0)
+		/* pass */;
+	else {
+		len = -EINVAL;
+		goto out;
+	}
+
+	ndns = nd_btt->ndns;
+	if (strcmp(name, "") == 0) {
+		/* detach the namespace and destroy / reset the btt device */
+		nd_btt_detach_ndns(nd_btt);
+		if (is_nd_btt_idle(dev))
+			nd_device_unregister(dev, ND_ASYNC);
+		else {
+			nd_btt->lbasize = 0;
+			kfree(nd_btt->uuid);
+			nd_btt->uuid = NULL;
+		}
+		goto out;
+	} else if (ndns) {
+		dev_dbg(dev, "namespace already set to: %s\n",
+				dev_name(&ndns->dev));
+		len = -EBUSY;
+		goto out;
+	}
+
+	found = device_find_child(dev->parent, name, namespace_match);
+	if (!found) {
+		dev_dbg(dev, "'%s' not found under %s\n", name,
+				dev_name(dev->parent));
+		len = -ENODEV;
+		goto out;
+	}
+
+	ndns = to_ndns(found);
+	if (__nvdimm_namespace_capacity(ndns) < SZ_16M) {
+		dev_dbg(dev, "%s too small to host btt\n", name);
+		len = -ENXIO;
+		goto out_attach;
+	}
+
+	WARN_ON_ONCE(!is_nvdimm_bus_locked(&nd_btt->dev));
+	if (!nd_btt_attach_ndns(nd_btt, ndns)) {
+		dev_dbg(dev, "%s already claimed\n",
+				dev_name(&ndns->dev));
+		len = -EBUSY;
+	}
+
+ out_attach:
+	put_device(&ndns->dev); /* from device_find_child */
+ out:
+	kfree(name);
+	return len;
+}
+
+static ssize_t namespace_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	ssize_t rc;
+
+	nvdimm_bus_lock(dev);
+	device_lock(dev);
+	rc = __namespace_store(dev, attr, buf, len);
+	dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
+			rc, buf, buf[len - 1] == '\n' ? "" : "\n");
+	device_unlock(dev);
+	nvdimm_bus_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RW(namespace);
+
+static struct attribute *nd_btt_attributes[] = {
+	&dev_attr_sector_size.attr,
+	&dev_attr_namespace.attr,
+	&dev_attr_uuid.attr,
+	NULL,
+};
+
+static struct attribute_group nd_btt_attribute_group = {
+	.attrs = nd_btt_attributes,
+};
+
+static const struct attribute_group *nd_btt_attribute_groups[] = {
+	&nd_btt_attribute_group,
+	&nd_device_attribute_group,
+	NULL,
+};
+
+static struct device *__nd_btt_create(struct nd_region *nd_region,
+		unsigned long lbasize, u8 *uuid,
+		struct nd_namespace_common *ndns)
+{
+	struct nd_btt *nd_btt;
+	struct device *dev;
+
+	nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL);
+	if (!nd_btt)
+		return NULL;
+
+	nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL);
+	if (nd_btt->id < 0) {
+		kfree(nd_btt);
+		return NULL;
+	}
+
+	nd_btt->lbasize = lbasize;
+	if (uuid)
+		uuid = kmemdup(uuid, 16, GFP_KERNEL);
+	nd_btt->uuid = uuid;
+	dev = &nd_btt->dev;
+	dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id);
+	dev->parent = &nd_region->dev;
+	dev->type = &nd_btt_device_type;
+	dev->groups = nd_btt_attribute_groups;
+	device_initialize(&nd_btt->dev);
+	if (ndns && !__nd_btt_attach_ndns(nd_btt, ndns)) {
+		dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n",
+				__func__, dev_name(ndns->claim));
+		put_device(dev);
+		return NULL;
+	}
+	return dev;
+}
+
+struct device *nd_btt_create(struct nd_region *nd_region)
+{
+	struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL);
+
+	if (dev)
+		__nd_device_register(dev);
+	return dev;
+}
+
+/*
+ * nd_btt_sb_checksum: compute checksum for btt info block
+ *
+ * Returns a fletcher64 checksum of everything in the given info block
+ * except the last field (since that's where the checksum lives).
+ */
+u64 nd_btt_sb_checksum(struct btt_sb *btt_sb)
+{
+	u64 sum, sum_save;
+
+	sum_save = btt_sb->checksum;
+	btt_sb->checksum = 0;
+	sum = nd_fletcher64(btt_sb, sizeof(*btt_sb), 1);
+	btt_sb->checksum = sum_save;
+	return sum;
+}
+EXPORT_SYMBOL(nd_btt_sb_checksum);
+
+static int __nd_btt_probe(struct nd_btt *nd_btt,
+		struct nd_namespace_common *ndns, struct btt_sb *btt_sb)
+{
+	u64 checksum;
+
+	if (!btt_sb || !ndns || !nd_btt)
+		return -ENODEV;
+
+	if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb)))
+		return -ENXIO;
+
+	if (nvdimm_namespace_capacity(ndns) < SZ_16M)
+		return -ENXIO;
+
+	if (memcmp(btt_sb->signature, BTT_SIG, BTT_SIG_LEN) != 0)
+		return -ENODEV;
+
+	checksum = le64_to_cpu(btt_sb->checksum);
+	btt_sb->checksum = 0;
+	if (checksum != nd_btt_sb_checksum(btt_sb))
+		return -ENODEV;
+	btt_sb->checksum = cpu_to_le64(checksum);
+
+	nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize);
+	nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL);
+	if (!nd_btt->uuid)
+		return -ENOMEM;
+
+	__nd_device_register(&nd_btt->dev);
+
+	return 0;
+}
+
+int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
+{
+	int rc;
+	struct device *dev;
+	struct btt_sb *btt_sb;
+	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+
+	if (ndns->force_raw)
+		return -ENODEV;
+
+	nvdimm_bus_lock(&ndns->dev);
+	dev = __nd_btt_create(nd_region, 0, NULL, ndns);
+	nvdimm_bus_unlock(&ndns->dev);
+	if (!dev)
+		return -ENOMEM;
+	dev_set_drvdata(dev, drvdata);
+	btt_sb = kzalloc(sizeof(*btt_sb), GFP_KERNEL);
+	rc = __nd_btt_probe(to_nd_btt(dev), ndns, btt_sb);
+	kfree(btt_sb);
+	dev_dbg(&ndns->dev, "%s: btt: %s\n", __func__,
+			rc == 0 ? dev_name(dev) : "<none>");
+	if (rc < 0) {
+		__nd_btt_detach_ndns(to_nd_btt(dev));
+		put_device(dev);
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(nd_btt_probe);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index ca802702440e..dd12f38397db 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -14,8 +14,10 @@
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
+#include <linux/blkdev.h>
 #include <linux/fcntl.h>
 #include <linux/async.h>
+#include <linux/genhd.h>
 #include <linux/ndctl.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -103,6 +105,7 @@ static int nvdimm_bus_probe(struct device *dev)
 
 	dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
 			dev_name(dev), rc);
+
 	if (rc != 0)
 		module_put(provider);
 	return rc;
@@ -163,14 +166,19 @@ static void nd_async_device_unregister(void *d, async_cookie_t cookie)
 	put_device(dev);
 }
 
-void nd_device_register(struct device *dev)
+void __nd_device_register(struct device *dev)
 {
 	dev->bus = &nvdimm_bus_type;
-	device_initialize(dev);
 	get_device(dev);
 	async_schedule_domain(nd_async_device_register, dev,
 			&nd_async_domain);
 }
+
+void nd_device_register(struct device *dev)
+{
+	device_initialize(dev);
+	__nd_device_register(dev);
+}
 EXPORT_SYMBOL(nd_device_register);
 
 void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 34148003fc73..96526dcfdd37 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -666,7 +666,7 @@ static int __blk_label_update(struct nd_region *nd_region,
 
 	/* don't allow updates that consume the last label */
 	if (nfree - alloc < 0 || nfree - alloc + victims < 1) {
-		dev_info(&nsblk->dev, "insufficient label space\n");
+		dev_info(&nsblk->common.dev, "insufficient label space\n");
 		kfree(victim_map);
 		return -ENOSPC;
 	}
@@ -762,7 +762,8 @@ static int __blk_label_update(struct nd_region *nd_region,
 			continue;
 		res = to_resource(ndd, nd_label);
 		res->flags &= ~DPA_RESOURCE_ADJUSTED;
-		dev_vdbg(&nsblk->dev, "assign label[%d] slot: %d\n", l, slot);
+		dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n",
+				l, slot);
 		nd_mapping->labels[l++] = nd_label;
 	}
 	nd_mapping->labels[l] = NULL;
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 50b502b1908e..2c50a0719f8d 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -102,7 +102,7 @@ static ssize_t __alt_name_store(struct device *dev, const char *buf,
 	} else
 		return -ENXIO;
 
-	if (dev->driver)
+	if (dev->driver || to_ndns(dev)->claim)
 		return -EBUSY;
 
 	input = kmemdup(buf, len + 1, GFP_KERNEL);
@@ -133,7 +133,7 @@ out:
 
 static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
 {
-	struct nd_region *nd_region = to_nd_region(nsblk->dev.parent);
+	struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
 	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
 	struct nd_label_id label_id;
@@ -152,9 +152,9 @@ static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
 static int nd_namespace_label_update(struct nd_region *nd_region,
 		struct device *dev)
 {
-	dev_WARN_ONCE(dev, dev->driver,
+	dev_WARN_ONCE(dev, dev->driver || to_ndns(dev)->claim,
 			"namespace must be idle during label update\n");
-	if (dev->driver)
+	if (dev->driver || to_ndns(dev)->claim)
 		return 0;
 
 	/*
@@ -666,7 +666,7 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
 	u8 *uuid = NULL;
 	int rc, i;
 
-	if (dev->driver)
+	if (dev->driver || to_ndns(dev)->claim)
 		return -EBUSY;
 
 	if (is_namespace_pmem(dev)) {
@@ -733,12 +733,16 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
 		nd_namespace_pmem_set_size(nd_region, nspm,
 				val * nd_region->ndr_mappings);
 	} else if (is_namespace_blk(dev)) {
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+
 		/*
 		 * Try to delete the namespace if we deleted all of its
-		 * allocation and this is not the seed device for the
-		 * region.
+		 * allocation, this is not the seed device for the
+		 * region, and it is not actively claimed by a btt
+		 * instance.
 		 */
-		if (val == 0 && nd_region->ns_seed != dev)
+		if (val == 0 && nd_region->ns_seed != dev
+				&& !nsblk->common.claim)
 			nd_device_unregister(dev, ND_ASYNC);
 	}
 
@@ -789,26 +793,42 @@ static ssize_t size_store(struct device *dev,
 	return rc < 0 ? rc : len;
 }
 
-static ssize_t size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
+resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
 {
-	unsigned long long size = 0;
+	struct device *dev = &ndns->dev;
 
-	nvdimm_bus_lock(dev);
 	if (is_namespace_pmem(dev)) {
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
-		size = resource_size(&nspm->nsio.res);
+		return resource_size(&nspm->nsio.res);
 	} else if (is_namespace_blk(dev)) {
-		size = nd_namespace_blk_size(to_nd_namespace_blk(dev));
+		return nd_namespace_blk_size(to_nd_namespace_blk(dev));
 	} else if (is_namespace_io(dev)) {
 		struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
 
-		size = resource_size(&nsio->res);
-	}
-	nvdimm_bus_unlock(dev);
+		return resource_size(&nsio->res);
+	} else
+		WARN_ONCE(1, "unknown namespace type\n");
+	return 0;
+}
+
+resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
+{
+	resource_size_t size;
 
-	return sprintf(buf, "%llu\n", size);
+	nvdimm_bus_lock(&ndns->dev);
+	size = __nvdimm_namespace_capacity(ndns);
+	nvdimm_bus_unlock(&ndns->dev);
+
+	return size;
+}
+EXPORT_SYMBOL(nvdimm_namespace_capacity);
+
+static ssize_t size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%llu\n", (unsigned long long)
+			nvdimm_namespace_capacity(to_ndns(dev)));
 }
 static DEVICE_ATTR(size, S_IRUGO, size_show, size_store);
 
@@ -897,8 +917,8 @@ static ssize_t uuid_store(struct device *dev,
 {
 	struct nd_region *nd_region = to_nd_region(dev->parent);
 	u8 *uuid = NULL;
+	ssize_t rc = 0;
 	u8 **ns_uuid;
-	ssize_t rc;
 
 	if (is_namespace_pmem(dev)) {
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
@@ -914,7 +934,10 @@ static ssize_t uuid_store(struct device *dev,
 	device_lock(dev);
 	nvdimm_bus_lock(dev);
 	wait_nvdimm_bus_probe_idle(dev);
-	rc = nd_uuid_store(dev, &uuid, buf, len);
+	if (to_ndns(dev)->claim)
+		rc = -EBUSY;
+	if (rc >= 0)
+		rc = nd_uuid_store(dev, &uuid, buf, len);
 	if (rc >= 0)
 		rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid);
 	if (rc >= 0)
@@ -971,15 +994,18 @@ static ssize_t sector_size_store(struct device *dev,
 {
 	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
 	struct nd_region *nd_region = to_nd_region(dev->parent);
-	ssize_t rc;
+	ssize_t rc = 0;
 
 	if (!is_namespace_blk(dev))
 		return -ENXIO;
 
 	device_lock(dev);
 	nvdimm_bus_lock(dev);
-	rc = nd_sector_size_store(dev, buf, &nsblk->lbasize,
-			ns_lbasize_supported);
+	if (to_ndns(dev)->claim)
+		rc = -EBUSY;
+	if (rc >= 0)
+		rc = nd_sector_size_store(dev, buf, &nsblk->lbasize,
+				ns_lbasize_supported);
 	if (rc >= 0)
 		rc = nd_namespace_label_update(nd_region, dev);
 	dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__,
@@ -1034,12 +1060,48 @@ static ssize_t dpa_extents_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(dpa_extents);
 
+static ssize_t holder_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_namespace_common *ndns = to_ndns(dev);
+	ssize_t rc;
+
+	device_lock(dev);
+	rc = sprintf(buf, "%s\n", ndns->claim ? dev_name(ndns->claim) : "");
+	device_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(holder);
+
+static ssize_t force_raw_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	bool force_raw;
+	int rc = strtobool(buf, &force_raw);
+
+	if (rc)
+		return rc;
+
+	to_ndns(dev)->force_raw = force_raw;
+	return len;
+}
+
+static ssize_t force_raw_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", to_ndns(dev)->force_raw);
+}
+static DEVICE_ATTR_RW(force_raw);
+
 static struct attribute *nd_namespace_attributes[] = {
 	&dev_attr_nstype.attr,
 	&dev_attr_size.attr,
 	&dev_attr_uuid.attr,
+	&dev_attr_holder.attr,
 	&dev_attr_resource.attr,
 	&dev_attr_alt_name.attr,
+	&dev_attr_force_raw.attr,
 	&dev_attr_sector_size.attr,
 	&dev_attr_dpa_extents.attr,
 	NULL,
@@ -1066,7 +1128,9 @@ static umode_t namespace_visible(struct kobject *kobj,
 		return a->mode;
 	}
 
-	if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr)
+	if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr
+			|| a == &dev_attr_holder.attr
+			|| a == &dev_attr_force_raw.attr)
 		return a->mode;
 
 	return 0;
@@ -1083,6 +1147,66 @@ static const struct attribute_group *nd_namespace_attribute_groups[] = {
 	NULL,
 };
 
+struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
+{
+	struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
+	struct nd_namespace_common *ndns;
+	resource_size_t size;
+
+	if (nd_btt) {
+		ndns = nd_btt->ndns;
+		if (!ndns)
+			return ERR_PTR(-ENODEV);
+
+		/*
+		 * Flush any in-progess probes / removals in the driver
+		 * for the raw personality of this namespace.
+		 */
+		device_lock(&ndns->dev);
+		device_unlock(&ndns->dev);
+		if (ndns->dev.driver) {
+			dev_dbg(&ndns->dev, "is active, can't bind %s\n",
+					dev_name(&nd_btt->dev));
+			return ERR_PTR(-EBUSY);
+		}
+		if (dev_WARN_ONCE(&ndns->dev, ndns->claim != &nd_btt->dev,
+					"host (%s) vs claim (%s) mismatch\n",
+					dev_name(&nd_btt->dev),
+					dev_name(ndns->claim)))
+			return ERR_PTR(-ENXIO);
+	} else {
+		ndns = to_ndns(dev);
+		if (ndns->claim) {
+			dev_dbg(dev, "claimed by %s, failing probe\n",
+				dev_name(ndns->claim));
+
+			return ERR_PTR(-ENXIO);
+		}
+	}
+
+	size = nvdimm_namespace_capacity(ndns);
+	if (size < ND_MIN_NAMESPACE_SIZE) {
+		dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n",
+				&size, ND_MIN_NAMESPACE_SIZE);
+		return ERR_PTR(-ENODEV);
+	}
+
+	if (is_namespace_pmem(&ndns->dev)) {
+		struct nd_namespace_pmem *nspm;
+
+		nspm = to_nd_namespace_pmem(&ndns->dev);
+		if (!nspm->uuid) {
+			dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__);
+			return ERR_PTR(-ENODEV);
+		}
+	} else if (is_namespace_blk(&ndns->dev)) {
+		return ERR_PTR(-ENODEV); /* TODO */
+	}
+
+	return ndns;
+}
+EXPORT_SYMBOL(nvdimm_namespace_common_probe);
+
 static struct device **create_namespace_io(struct nd_region *nd_region)
 {
 	struct nd_namespace_io *nsio;
@@ -1099,7 +1223,7 @@ static struct device **create_namespace_io(struct nd_region *nd_region)
 		return NULL;
 	}
 
-	dev = &nsio->dev;
+	dev = &nsio->common.dev;
 	dev->type = &namespace_io_device_type;
 	dev->parent = &nd_region->dev;
 	res = &nsio->res;
@@ -1313,7 +1437,7 @@ static struct device **create_namespace_pmem(struct nd_region *nd_region)
 	if (!nspm)
 		return NULL;
 
-	dev = &nspm->nsio.dev;
+	dev = &nspm->nsio.common.dev;
 	dev->type = &namespace_pmem_device_type;
 	dev->parent = &nd_region->dev;
 	res = &nspm->nsio.res;
@@ -1346,7 +1470,7 @@ static struct device **create_namespace_pmem(struct nd_region *nd_region)
 	return devs;
 
  err:
-	namespace_pmem_release(&nspm->nsio.dev);
+	namespace_pmem_release(&nspm->nsio.common.dev);
 	return NULL;
 }
 
@@ -1385,7 +1509,7 @@ static struct device *nd_namespace_blk_create(struct nd_region *nd_region)
 	if (!nsblk)
 		return NULL;
 
-	dev = &nsblk->dev;
+	dev = &nsblk->common.dev;
 	dev->type = &namespace_blk_device_type;
 	nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL);
 	if (nsblk->id < 0) {
@@ -1396,7 +1520,7 @@ static struct device *nd_namespace_blk_create(struct nd_region *nd_region)
 	dev->parent = &nd_region->dev;
 	dev->groups = nd_namespace_attribute_groups;
 
-	return &nsblk->dev;
+	return &nsblk->common.dev;
 }
 
 void nd_region_create_blk_seed(struct nd_region *nd_region)
@@ -1413,6 +1537,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region)
 		nd_device_register(nd_region->ns_seed);
 }
 
+void nd_region_create_btt_seed(struct nd_region *nd_region)
+{
+	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+	nd_region->btt_seed = nd_btt_create(nd_region);
+	/*
+	 * Seed creation failures are not fatal, provisioning is simply
+	 * disabled until memory becomes available
+	 */
+	if (!nd_region->btt_seed)
+		dev_err(&nd_region->dev, "failed to create btt namespace\n");
+}
+
 static struct device **create_namespace_blk(struct nd_region *nd_region)
 {
 	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
@@ -1446,7 +1582,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region)
 				if (!res)
 					goto err;
 				nd_dbg_dpa(nd_region, ndd, res, "%s assign\n",
-					dev_name(&nsblk->dev));
+					dev_name(&nsblk->common.dev));
 				break;
 			}
 		}
@@ -1462,7 +1598,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region)
 		nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
 		if (!nsblk)
 			goto err;
-		dev = &nsblk->dev;
+		dev = &nsblk->common.dev;
 		dev->type = &namespace_blk_device_type;
 		dev->parent = &nd_region->dev;
 		dev_set_name(dev, "namespace%d.%d", nd_region->id, count);
@@ -1482,7 +1618,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region)
 		if (!res)
 			goto err;
 		nd_dbg_dpa(nd_region, ndd, res, "%s assign\n",
-				dev_name(&nsblk->dev));
+				dev_name(&nsblk->common.dev));
 	}
 
 	dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n",
@@ -1503,7 +1639,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region)
 		nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
 		if (!nsblk)
 			goto err;
-		dev = &nsblk->dev;
+		dev = &nsblk->common.dev;
 		dev->type = &namespace_blk_device_type;
 		dev->parent = &nd_region->dev;
 		devs[count++] = dev;
@@ -1514,7 +1650,7 @@ static struct device **create_namespace_blk(struct nd_region *nd_region)
 err:
 	for (i = 0; i < count; i++) {
 		nsblk = to_nd_namespace_blk(devs[i]);
-		namespace_blk_release(&nsblk->dev);
+		namespace_blk_release(&nsblk->common.dev);
 	}
 	kfree(devs);
 	return NULL;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 78d6c51f4bac..5e6413964776 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -45,12 +45,14 @@ struct nvdimm {
 bool is_nvdimm(struct device *dev);
 bool is_nd_blk(struct device *dev);
 bool is_nd_pmem(struct device *dev);
+struct nd_btt;
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
 void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
 struct nd_region;
 void nd_region_create_blk_seed(struct nd_region *nd_region);
+void nd_region_create_btt_seed(struct nd_region *nd_region);
 void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
 void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
@@ -58,6 +60,7 @@ void nd_synchronize(void);
 int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus);
 int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus);
 int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus);
+void __nd_device_register(struct device *dev);
 int nd_match_dimm(struct device *dev, void *data);
 struct nd_label_id;
 char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags);
@@ -77,4 +80,5 @@ struct resource *nsblk_add_resource(struct nd_region *nd_region,
 		resource_size_t start);
 int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd);
 void get_ndd(struct nvdimm_drvdata *ndd);
+resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index bfa849617358..d13eccbb67e9 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -19,6 +19,10 @@
 #include <linux/types.h>
 #include "label.h"
 
+enum {
+	SECTOR_SHIFT = 9,
+};
+
 struct nvdimm_drvdata {
 	struct device *dev;
 	int nsindex_size;
@@ -74,7 +78,9 @@ static inline struct nd_namespace_index *to_next_namespace_index(
 struct nd_region {
 	struct device dev;
 	struct ida ns_ida;
+	struct ida btt_ida;
 	struct device *ns_seed;
+	struct device *btt_seed;
 	u16 ndr_mappings;
 	u64 ndr_size;
 	u64 ndr_start;
@@ -94,6 +100,14 @@ static inline unsigned nd_inc_seq(unsigned seq)
 	return next[seq & 3];
 }
 
+struct nd_btt {
+	struct device dev;
+	struct nd_namespace_common *ndns;
+	unsigned long lbasize;
+	u8 *uuid;
+	int id;
+};
+
 enum nd_async_mode {
 	ND_SYNC,
 	ND_ASYNC,
@@ -118,6 +132,30 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
 int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
 		void *buf, size_t len);
+struct nd_btt *to_nd_btt(struct device *dev);
+struct btt_sb;
+u64 nd_btt_sb_checksum(struct btt_sb *btt_sb);
+#if IS_ENABLED(CONFIG_BTT)
+int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata);
+bool is_nd_btt(struct device *dev);
+struct device *nd_btt_create(struct nd_region *nd_region);
+#else
+static inline nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
+{
+	return -ENODEV;
+}
+
+static inline bool is_nd_btt(struct device *dev)
+{
+	return false;
+}
+
+static inline struct device *nd_btt_create(struct nd_region *nd_region)
+{
+	return NULL;
+}
+
+#endif
 struct nd_region *to_nd_region(struct device *dev);
 int nd_region_to_nstype(struct nd_region *nd_region);
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
@@ -132,4 +170,6 @@ void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res);
 struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
 		struct nd_label_id *label_id, resource_size_t start,
 		resource_size_t n);
+resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
+struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev);
 #endif /* __ND_H__ */
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 90902a142e35..d0c6b4bdba69 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -121,44 +121,61 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 		struct resource *res, int id)
 {
 	struct pmem_device *pmem;
-	struct gendisk *disk;
-	int err;
 
-	err = -ENOMEM;
 	pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
-		goto out;
+		return ERR_PTR(-ENOMEM);
 
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
 
-	err = -EINVAL;
-	if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
+	if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) {
 		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
 				&pmem->phys_addr, pmem->size);
-		goto out_free_dev;
+		kfree(pmem);
+		return ERR_PTR(-EBUSY);
 	}
 
 	/*
 	 * Map the memory as non-cachable, as we can't write back the contents
 	 * of the CPU caches in case of a crash.
 	 */
-	err = -ENOMEM;
 	pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
-	if (!pmem->virt_addr)
-		goto out_release_region;
+	if (!pmem->virt_addr) {
+		release_mem_region(pmem->phys_addr, pmem->size);
+		kfree(pmem);
+		return ERR_PTR(-ENXIO);
+	}
+
+	return pmem;
+}
+
+static void pmem_detach_disk(struct pmem_device *pmem)
+{
+	del_gendisk(pmem->pmem_disk);
+	put_disk(pmem->pmem_disk);
+	blk_cleanup_queue(pmem->pmem_queue);
+}
+
+static int pmem_attach_disk(struct nd_namespace_common *ndns,
+		struct pmem_device *pmem)
+{
+	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+	struct gendisk *disk;
 
 	pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
 	if (!pmem->pmem_queue)
-		goto out_unmap;
+		return -ENOMEM;
 
 	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
 	blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
 	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
 
 	disk = alloc_disk(0);
-	if (!disk)
-		goto out_free_queue;
+	if (!disk) {
+		blk_cleanup_queue(pmem->pmem_queue);
+		return -ENOMEM;
+	}
 
 	disk->major		= pmem_major;
 	disk->first_minor	= 0;
@@ -166,32 +183,47 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 	disk->private_data	= pmem;
 	disk->queue		= pmem->pmem_queue;
 	disk->flags		= GENHD_FL_EXT_DEVT;
-	sprintf(disk->disk_name, "pmem%d", id);
-	disk->driverfs_dev = dev;
+	sprintf(disk->disk_name, "pmem%d", nd_region->id);
+	disk->driverfs_dev = &ndns->dev;
 	set_capacity(disk, pmem->size >> 9);
 	pmem->pmem_disk = disk;
 
 	add_disk(disk);
 
-	return pmem;
+	return 0;
+}
 
-out_free_queue:
-	blk_cleanup_queue(pmem->pmem_queue);
-out_unmap:
-	iounmap(pmem->virt_addr);
-out_release_region:
-	release_mem_region(pmem->phys_addr, pmem->size);
-out_free_dev:
-	kfree(pmem);
-out:
-	return ERR_PTR(err);
+static int pmem_rw_bytes(struct nd_namespace_common *ndns,
+		resource_size_t offset, void *buf, size_t size, int rw)
+{
+	struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
+
+	if (unlikely(offset + size > pmem->size)) {
+		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+		return -EFAULT;
+	}
+
+	if (rw == READ)
+		memcpy(buf, pmem->virt_addr + offset, size);
+	else
+		memcpy(pmem->virt_addr + offset, buf, size);
+
+	return 0;
+}
+
+static int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
+{
+	/* TODO */
+	return -ENXIO;
+}
+
+static void nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns)
+{
+	/* TODO */
 }
 
 static void pmem_free(struct pmem_device *pmem)
 {
-	del_gendisk(pmem->pmem_disk);
-	put_disk(pmem->pmem_disk);
-	blk_cleanup_queue(pmem->pmem_queue);
 	iounmap(pmem->virt_addr);
 	release_mem_region(pmem->phys_addr, pmem->size);
 	kfree(pmem);
@@ -200,40 +232,44 @@ static void pmem_free(struct pmem_device *pmem)
 static int nd_pmem_probe(struct device *dev)
 {
 	struct nd_region *nd_region = to_nd_region(dev->parent);
-	struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
+	struct nd_namespace_common *ndns;
+	struct nd_namespace_io *nsio;
 	struct pmem_device *pmem;
+	int rc;
 
-	if (resource_size(&nsio->res) < ND_MIN_NAMESPACE_SIZE) {
-		resource_size_t size = resource_size(&nsio->res);
-
-		dev_dbg(dev, "%s: size: %pa, too small must be at least %#x\n",
-				__func__, &size, ND_MIN_NAMESPACE_SIZE);
-		return -ENODEV;
-	}
-
-	if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_PMEM) {
-		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
-
-		if (!nspm->uuid) {
-			dev_dbg(dev, "%s: uuid not set\n", __func__);
-			return -ENODEV;
-		}
-	}
+	ndns = nvdimm_namespace_common_probe(dev);
+	if (IS_ERR(ndns))
+		return PTR_ERR(ndns);
 
+	nsio = to_nd_namespace_io(&ndns->dev);
 	pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
 	if (IS_ERR(pmem))
 		return PTR_ERR(pmem);
 
 	dev_set_drvdata(dev, pmem);
-
-	return 0;
+	ndns->rw_bytes = pmem_rw_bytes;
+	if (is_nd_btt(dev))
+		rc = nvdimm_namespace_attach_btt(ndns);
+	else if (nd_btt_probe(ndns, pmem) == 0) {
+		/* we'll come back as btt-pmem */
+		rc = -ENXIO;
+	} else
+		rc = pmem_attach_disk(ndns, pmem);
+	if (rc)
+		pmem_free(pmem);
+	return rc;
 }
 
 static int nd_pmem_remove(struct device *dev)
 {
 	struct pmem_device *pmem = dev_get_drvdata(dev);
 
+	if (is_nd_btt(dev))
+		nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
+	else
+		pmem_detach_disk(pmem);
 	pmem_free(pmem);
+
 	return 0;
 }
 
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 9aba44e483e0..2a5f3f53d79d 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -33,12 +33,13 @@ static int nd_region_probe(struct device *dev)
 	num_ns->count = rc + err;
 	dev_set_drvdata(dev, num_ns);
 
+	if (rc && err && rc == err)
+		return -ENODEV;
+
+	nd_region->btt_seed = nd_btt_create(nd_region);
 	if (err == 0)
 		return 0;
 
-	if (rc == err)
-		return -ENODEV;
-
 	/*
 	 * Given multiple namespaces per region, we do not want to
 	 * disable all the successfully registered peer namespaces upon
@@ -66,6 +67,7 @@ static int nd_region_remove(struct device *dev)
 	/* flush attribute readers and disable */
 	nvdimm_bus_lock(dev);
 	nd_region->ns_seed = NULL;
+	nd_region->btt_seed = NULL;
 	dev_set_drvdata(dev, NULL);
 	nvdimm_bus_unlock(dev);
 
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index ac21ce419beb..4fd92389fa7e 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -296,10 +296,28 @@ static ssize_t namespace_seed_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(namespace_seed);
 
+static ssize_t btt_seed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	ssize_t rc;
+
+	nvdimm_bus_lock(dev);
+	if (nd_region->btt_seed)
+		rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed));
+	else
+		rc = sprintf(buf, "\n");
+	nvdimm_bus_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(btt_seed);
+
 static struct attribute *nd_region_attributes[] = {
 	&dev_attr_size.attr,
 	&dev_attr_nstype.attr,
 	&dev_attr_mappings.attr,
+	&dev_attr_btt_seed.attr,
 	&dev_attr_set_cookie.attr,
 	&dev_attr_available_size.attr,
 	&dev_attr_namespace_seed.attr,
@@ -345,15 +363,18 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region)
 
 /*
  * Upon successful probe/remove, take/release a reference on the
- * associated interleave set (if present)
+ * associated interleave set (if present), and plant new btt + namespace
+ * seeds.
  */
 static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 		struct device *dev, bool probe)
 {
+	struct nd_region *nd_region;
+
 	if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) {
-		struct nd_region *nd_region = to_nd_region(dev);
 		int i;
 
+		nd_region = to_nd_region(dev);
 		for (i = 0; i < nd_region->ndr_mappings; i++) {
 			struct nd_mapping *nd_mapping = &nd_region->mapping[i];
 			struct nvdimm_drvdata *ndd = nd_mapping->ndd;
@@ -365,14 +386,21 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 			nd_mapping->ndd = NULL;
 			atomic_dec(&nvdimm->busy);
 		}
-	} else if (dev->parent && is_nd_blk(dev->parent) && probe) {
-		struct nd_region *nd_region = to_nd_region(dev->parent);
-
+	}
+	if (dev->parent && is_nd_blk(dev->parent) && probe) {
+		nd_region = to_nd_region(dev->parent);
 		nvdimm_bus_lock(dev);
 		if (nd_region->ns_seed == dev)
 			nd_region_create_blk_seed(nd_region);
 		nvdimm_bus_unlock(dev);
 	}
+	if (is_nd_btt(dev) && probe) {
+		nd_region = to_nd_region(dev->parent);
+		nvdimm_bus_lock(dev);
+		if (nd_region->btt_seed == dev)
+			nd_region_create_btt_seed(nd_region);
+		nvdimm_bus_unlock(dev);
+	}
 }
 
 void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
@@ -546,6 +574,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_region->provider_data = ndr_desc->provider_data;
 	nd_region->nd_set = ndr_desc->nd_set;
 	ida_init(&nd_region->ns_ida);
+	ida_init(&nd_region->btt_ida);
 	dev = &nd_region->dev;
 	dev_set_name(dev, "region%d", nd_region->id);
 	dev->parent = &nvdimm_bus->dev;
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 23276ea91690..507e47c86737 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -12,6 +12,7 @@
  */
 #ifndef __LINUX_ND_H__
 #define __LINUX_ND_H__
+#include <linux/fs.h>
 #include <linux/ndctl.h>
 #include <linux/device.h>
 
@@ -28,13 +29,33 @@ static inline struct nd_device_driver *to_nd_device_driver(
 	return container_of(drv, struct nd_device_driver, drv);
 };
 
+/**
+ * struct nd_namespace_common - core infrastructure of a namespace
+ * @force_raw: ignore other personalities for the namespace (e.g. btt)
+ * @dev: device model node
+ * @claim: when set a another personality has taken ownership of the namespace
+ * @rw_bytes: access the raw namespace capacity with byte-aligned transfers
+ */
+struct nd_namespace_common {
+	int force_raw;
+	struct device dev;
+	struct device *claim;
+	int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset,
+			void *buf, size_t size, int rw);
+};
+
+static inline struct nd_namespace_common *to_ndns(struct device *dev)
+{
+	return container_of(dev, struct nd_namespace_common, dev);
+}
+
 /**
  * struct nd_namespace_io - infrastructure for loading an nd_pmem instance
  * @dev: namespace device created by the nd region driver
  * @res: struct resource conversion of a NFIT SPA table
  */
 struct nd_namespace_io {
-	struct device dev;
+	struct nd_namespace_common common;
 	struct resource res;
 };
 
@@ -52,7 +73,6 @@ struct nd_namespace_pmem {
 
 /**
  * struct nd_namespace_blk - namespace for dimm-bounded persistent memory
- * @dev: namespace device creation by the nd region driver
  * @alt_name: namespace name supplied in the dimm label
  * @uuid: namespace name supplied in the dimm label
  * @id: ida allocated id
@@ -61,7 +81,7 @@ struct nd_namespace_pmem {
  * @res: discontiguous dpa extents for given dimm
  */
 struct nd_namespace_blk {
-	struct device dev;
+	struct nd_namespace_common common;
 	char *alt_name;
 	u8 *uuid;
 	int id;
@@ -72,7 +92,7 @@ struct nd_namespace_blk {
 
 static inline struct nd_namespace_io *to_nd_namespace_io(struct device *dev)
 {
-	return container_of(dev, struct nd_namespace_io, dev);
+	return container_of(dev, struct nd_namespace_io, common.dev);
 }
 
 static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev)
@@ -84,7 +104,40 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(struct device *dev)
 
 static inline struct nd_namespace_blk *to_nd_namespace_blk(struct device *dev)
 {
-	return container_of(dev, struct nd_namespace_blk, dev);
+	return container_of(dev, struct nd_namespace_blk, common.dev);
+}
+
+/**
+ * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace
+ * @ndns: device to read
+ * @offset: namespace-relative starting offset
+ * @buf: buffer to fill
+ * @size: transfer length
+ *
+ * @buf is up-to-date upon return from this routine.
+ */
+static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns,
+		resource_size_t offset, void *buf, size_t size)
+{
+	return ndns->rw_bytes(ndns, offset, buf, size, READ);
+}
+
+/**
+ * nvdimm_write_bytes() - synchronously write bytes to an nvdimm namespace
+ * @ndns: device to read
+ * @offset: namespace-relative starting offset
+ * @buf: buffer to drain
+ * @size: transfer length
+ *
+ * NVDIMM Namepaces disks do not implement sectors internally.  Depending on
+ * the @ndns, the contents of @buf may be in cpu cache, platform buffers,
+ * or on backing memory media upon return from this routine.  Flushing
+ * to media is handled internal to the @ndns driver, if at all.
+ */
+static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns,
+		resource_size_t offset, void *buf, size_t size)
+{
+	return ndns->rw_bytes(ndns, offset, buf, size, WRITE);
 }
 
 #define MODULE_ALIAS_ND_DEVICE(type) \
-- 
cgit v1.2.3


From 144cba1493fdd6e3e1980e439a31df877831ebcd Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 27 Apr 2015 11:09:54 +0800
Subject: libceph: allow setting osd_req_op's flags

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c             |  4 ++--
 fs/ceph/addr.c                  |  3 ++-
 fs/ceph/file.c                  |  2 +-
 include/linux/ceph/osd_client.h |  2 +-
 net/ceph/osd_client.c           | 24 +++++++++++++++---------
 5 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ec6c5c6e1ac9..349115ae3bc2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2376,7 +2376,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
 	}
 
 	if (opcode == CEPH_OSD_OP_DELETE)
-		osd_req_op_init(osd_request, num_ops, opcode);
+		osd_req_op_init(osd_request, num_ops, opcode, 0);
 	else
 		osd_req_op_extent_init(osd_request, num_ops, opcode,
 				       offset, length, 0, 0);
@@ -2848,7 +2848,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
 		goto out;
 	stat_request->callback = rbd_img_obj_exists_callback;
 
-	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
+	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
 					false, false);
 	rbd_osd_req_format_read(stat_request);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e162bcd105ee..feeaf3b65fa0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -884,7 +884,8 @@ get_more_pages:
 				}
 
 				if (do_sync)
-					osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+					osd_req_op_init(req, 1,
+							CEPH_OSD_OP_STARTSYNC, 0);
 
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b6b522b4b31..777eb21d556f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -614,7 +614,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 			break;
 		}
 
-		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
 		n = iov_iter_get_pages_alloc(from, &pages, len, &start);
 		if (unlikely(n < 0)) {
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 61b19c46bdb3..7506b485bb6d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
 extern void osd_req_op_init(struct ceph_osd_request *osd_req,
-					unsigned int which, u16 opcode);
+			    unsigned int which, u16 opcode, u32 flags);
 
 extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
 					unsigned int which,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 72459b9df8a1..4cb4fab46e4f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -453,7 +453,7 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
  */
 static struct ceph_osd_req_op *
 _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
-				u16 opcode)
+		 u16 opcode, u32 flags)
 {
 	struct ceph_osd_req_op *op;
 
@@ -463,14 +463,15 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
 	op = &osd_req->r_ops[which];
 	memset(op, 0, sizeof (*op));
 	op->op = opcode;
+	op->flags = flags;
 
 	return op;
 }
 
 void osd_req_op_init(struct ceph_osd_request *osd_req,
-				unsigned int which, u16 opcode)
+		     unsigned int which, u16 opcode, u32 flags)
 {
-	(void)_osd_req_op_init(osd_req, which, opcode);
+	(void)_osd_req_op_init(osd_req, which, opcode, flags);
 }
 EXPORT_SYMBOL(osd_req_op_init);
 
@@ -479,7 +480,8 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
 				u64 offset, u64 length,
 				u64 truncate_size, u32 truncate_seq)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+						      opcode, 0);
 	size_t payload_len = 0;
 
 	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
@@ -518,7 +520,8 @@ EXPORT_SYMBOL(osd_req_op_extent_update);
 void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 			u16 opcode, const char *class, const char *method)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+						      opcode, 0);
 	struct ceph_pagelist *pagelist;
 	size_t payload_len = 0;
 	size_t size;
@@ -555,7 +558,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 			  u16 opcode, const char *name, const void *value,
 			  size_t size, u8 cmp_op, u8 cmp_mode)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+						      opcode, 0);
 	struct ceph_pagelist *pagelist;
 	size_t payload_len;
 
@@ -588,7 +592,8 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
 				unsigned int which, u16 opcode,
 				u64 cookie, u64 version, int flag)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+						      opcode, 0);
 
 	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
 
@@ -605,7 +610,8 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
 				u64 expected_write_size)
 {
 	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
-						      CEPH_OSD_OP_SETALLOCHINT);
+						      CEPH_OSD_OP_SETALLOCHINT,
+						      0);
 
 	op->alloc_hint.expected_object_size = expected_object_size;
 	op->alloc_hint.expected_write_size = expected_write_size;
@@ -789,7 +795,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	}
 
 	if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
-		osd_req_op_init(req, which, opcode);
+		osd_req_op_init(req, which, opcode, 0);
 	} else {
 		u32 object_size = le32_to_cpu(layout->fl_object_size);
 		u32 object_base = off - objoff;
-- 
cgit v1.2.3


From d50c97b566c5bbf990eff472e9feaa58fdebdd33 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 15 May 2015 11:52:20 +0300
Subject: libceph: nuke time_sub()

Unused since ceph got merged into mainline I guess.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 include/linux/ceph/libceph.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 30f92cefaa72..85ae9a889a3f 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -93,15 +93,6 @@ enum {
 	CEPH_MOUNT_SHUTDOWN,
 };
 
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-	BUG_ON(time_after(b, a));
-	return (long)a - (long)b;
-}
-
 struct ceph_mds_client;
 
 /*
-- 
cgit v1.2.3


From a319bf56a617354e62cf5f774d2ca4e1a8a3bff3 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 15 May 2015 12:02:17 +0300
Subject: libceph: store timeouts in jiffies, verify user input

There are currently three libceph-level timeouts that the user can
specify on mount: mount_timeout, osd_idle_ttl and osdkeepalive.  All of
these are in seconds and no checking is done on user input: negative
values are accepted, we multiply them all by HZ which may or may not
overflow, arbitrarily large jiffies then get added together, etc.

There is also a bug in the way mount_timeout=0 is handled.  It's
supposed to mean "infinite timeout", but that's not how wait.h APIs
treat it and so __ceph_open_session() for example will busy loop
without much chance of being interrupted if none of ceph-mons are
there.

Fix all this by verifying user input, storing timeouts capped by
msecs_to_jiffies() in jiffies and using the new ceph_timeout_jiffies()
helper for all user-specified waits to handle infinite timeouts
correctly.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c          |  5 +++--
 fs/ceph/dir.c                |  4 ++--
 fs/ceph/mds_client.c         | 12 ++++++------
 fs/ceph/mds_client.h         |  2 +-
 fs/ceph/super.c              |  2 +-
 include/linux/ceph/libceph.h | 17 +++++++++++------
 net/ceph/ceph_common.c       | 41 +++++++++++++++++++++++++++++++----------
 net/ceph/mon_client.c        | 11 +++++++++--
 net/ceph/osd_client.c        | 15 +++++++--------
 9 files changed, 71 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 349115ae3bc2..992683b6b299 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4963,8 +4963,8 @@ out_err:
  */
 static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 {
+	struct ceph_options *opts = rbdc->client->options;
 	u64 newest_epoch;
-	unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
 	int tries = 0;
 	int ret;
 
@@ -4979,7 +4979,8 @@ again:
 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
-						     newest_epoch, timeout);
+						     newest_epoch,
+						     opts->mount_timeout);
 			goto again;
 		} else {
 			/* the osdmap we have is new enough */
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4248307fea90..173dd4b58c71 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1259,8 +1259,8 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
 		     inode, req->r_tid, last_tid);
 		if (req->r_timeout) {
 			unsigned long time_left = wait_for_completion_timeout(
-							&req->r_safe_completion,
-							req->r_timeout);
+					&req->r_safe_completion,
+					ceph_timeout_jiffies(req->r_timeout));
 			if (time_left > 0)
 				ret = 0;
 			else
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 69a36f40517f..0b0e0a9a81c0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2268,7 +2268,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	dout("do_request waiting\n");
 	if (req->r_timeout) {
 		err = (long)wait_for_completion_killable_timeout(
-			&req->r_completion, req->r_timeout);
+					&req->r_completion,
+					ceph_timeout_jiffies(req->r_timeout));
 		if (err == 0)
 			err = -EIO;
 	} else if (req->r_wait_for_completion) {
@@ -3424,8 +3425,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
  */
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_request *req;
-	struct ceph_fs_client *fsc = mdsc->fsc;
 
 	mutex_lock(&mdsc->mutex);
 	if (__get_oldest_req(mdsc)) {
@@ -3433,7 +3434,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-				    fsc->client->options->mount_timeout * HZ);
+				    ceph_timeout_jiffies(opts->mount_timeout));
 
 		/* tear down remaining requests */
 		mutex_lock(&mdsc->mutex);
@@ -3556,10 +3557,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc)
  */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_session *session;
 	int i;
-	struct ceph_fs_client *fsc = mdsc->fsc;
-	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
@@ -3580,7 +3580,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 
 	dout("waiting for sessions to close\n");
 	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
-			   timeout);
+			   ceph_timeout_jiffies(opts->mount_timeout));
 
 	/* tear down remaining sessions */
 	mutex_lock(&mdsc->mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2ef799961ebb..509d6822e9b1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -227,7 +227,7 @@ struct ceph_mds_request {
 	int r_err;
 	bool r_aborted;
 
-	unsigned long r_timeout;  /* optional.  jiffies */
+	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
 	unsigned long r_started;  /* start time to measure timeout against */
 	unsigned long r_request_started; /* start time for mds request only,
 					    used to measure lease durations */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9a5350030af8..edeb83c43112 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -742,7 +742,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 	req->r_ino1.ino = CEPH_INO_ROOT;
 	req->r_ino1.snap = CEPH_NOSNAP;
 	req->r_started = started;
-	req->r_timeout = fsc->client->options->mount_timeout * HZ;
+	req->r_timeout = fsc->client->options->mount_timeout;
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 85ae9a889a3f..d73a569f9bf5 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -43,9 +43,9 @@ struct ceph_options {
 	int flags;
 	struct ceph_fsid fsid;
 	struct ceph_entity_addr my_addr;
-	int mount_timeout;
-	int osd_idle_ttl;
-	int osd_keepalive_timeout;
+	unsigned long mount_timeout;		/* jiffies */
+	unsigned long osd_idle_ttl;		/* jiffies */
+	unsigned long osd_keepalive_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -63,9 +63,9 @@ struct ceph_options {
 /*
  * defaults
  */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
+#define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
@@ -93,6 +93,11 @@ enum {
 	CEPH_MOUNT_SHUTDOWN,
 };
 
+static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
+{
+	return timeout ?: MAX_SCHEDULE_TIMEOUT;
+}
+
 struct ceph_mds_client;
 
 /*
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 79e8f71aef5b..a80e91c2c9a3 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name,
 	/* start with defaults */
 	opt->flags = CEPH_OPT_DEFAULT;
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
+	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name,
 			pr_warn("ignoring deprecated osdtimeout option\n");
 			break;
 		case Opt_osdkeepalivetimeout:
-			opt->osd_keepalive_timeout = intval;
+			/* 0 isn't well defined right now, reject it */
+			if (intval < 1 || intval > INT_MAX / 1000) {
+				pr_err("osdkeepalive out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_keepalive_timeout =
+					msecs_to_jiffies(intval * 1000);
 			break;
 		case Opt_osd_idle_ttl:
-			opt->osd_idle_ttl = intval;
+			/* 0 isn't well defined right now, reject it */
+			if (intval < 1 || intval > INT_MAX / 1000) {
+				pr_err("osd_idle_ttl out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
 			break;
 		case Opt_mount_timeout:
-			opt->mount_timeout = intval;
+			/* 0 is "wait forever" (i.e. infinite timeout) */
+			if (intval < 0 || intval > INT_MAX / 1000) {
+				pr_err("mount_timeout out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->mount_timeout = msecs_to_jiffies(intval * 1000);
 			break;
 
 		case Opt_share:
@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 		seq_puts(m, "notcp_nodelay,");
 
 	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-		seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+		seq_printf(m, "mount_timeout=%d,",
+			   jiffies_to_msecs(opt->mount_timeout) / 1000);
 	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-		seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+		seq_printf(m, "osd_idle_ttl=%d,",
+			   jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, "osdkeepalivetimeout=%d,",
-			   opt->osd_keepalive_timeout);
+		    jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
 
 	/* drop redundant comma */
 	if (m->count != pos)
@@ -627,7 +648,7 @@ static int have_mon_and_osd_map(struct ceph_client *client)
 int __ceph_open_session(struct ceph_client *client, unsigned long started)
 {
 	int err;
-	unsigned long timeout = client->options->mount_timeout * HZ;
+	unsigned long timeout = client->options->mount_timeout;
 
 	/* open session, and wait for mon and osd maps */
 	err = ceph_monc_open_session(&client->monc);
@@ -643,7 +664,7 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
 		dout("mount waiting for mon_map\n");
 		err = wait_event_interruptible_timeout(client->auth_wq,
 			have_mon_and_osd_map(client) || (client->auth_err < 0),
-			timeout);
+			ceph_timeout_jiffies(timeout));
 		if (err == -EINTR || err == -ERESTARTSYS)
 			return err;
 		if (client->auth_err < 0)
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 2b3cf05e87b0..0da3bdc116f7 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -298,6 +298,12 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 }
 EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
 
+/*
+ * Wait for an osdmap with a given epoch.
+ *
+ * @epoch: epoch to wait for
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
 int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 			  unsigned long timeout)
 {
@@ -308,11 +314,12 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 	while (monc->have_osdmap < epoch) {
 		mutex_unlock(&monc->mutex);
 
-		if (timeout != 0 && time_after_eq(jiffies, started + timeout))
+		if (timeout && time_after_eq(jiffies, started + timeout))
 			return -ETIMEDOUT;
 
 		ret = wait_event_interruptible_timeout(monc->client->auth_wq,
-					 monc->have_osdmap >= epoch, timeout);
+						monc->have_osdmap >= epoch,
+						ceph_timeout_jiffies(timeout));
 		if (ret < 0)
 			return ret;
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 4cb4fab46e4f..50033677c0fa 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1097,7 +1097,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
 	BUG_ON(!list_empty(&osd->o_osd_lru));
 
 	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
 }
 
 static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
@@ -1208,7 +1208,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
 static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
 {
 	schedule_delayed_work(&osdc->timeout_work,
-			osdc->client->options->osd_keepalive_timeout * HZ);
+			      osdc->client->options->osd_keepalive_timeout);
 }
 
 static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -1576,10 +1576,9 @@ static void handle_timeout(struct work_struct *work)
 {
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client, timeout_work.work);
+	struct ceph_options *opts = osdc->client->options;
 	struct ceph_osd_request *req;
 	struct ceph_osd *osd;
-	unsigned long keepalive =
-		osdc->client->options->osd_keepalive_timeout * HZ;
 	struct list_head slow_osds;
 	dout("timeout\n");
 	down_read(&osdc->map_sem);
@@ -1595,7 +1594,8 @@ static void handle_timeout(struct work_struct *work)
 	 */
 	INIT_LIST_HEAD(&slow_osds);
 	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies, req->r_stamp + keepalive))
+		if (time_before(jiffies,
+				req->r_stamp + opts->osd_keepalive_timeout))
 			break;
 
 		osd = req->r_osd;
@@ -1622,8 +1622,7 @@ static void handle_osds_timeout(struct work_struct *work)
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client,
 			     osds_timeout_work.work);
-	unsigned long delay =
-		osdc->client->options->osd_idle_ttl * HZ >> 2;
+	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
 
 	dout("osds timeout\n");
 	down_read(&osdc->map_sem);
@@ -2628,7 +2627,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->event_count = 0;
 
 	schedule_delayed_work(&osdc->osds_timeout_work,
-	   round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
 
 	err = -ENOMEM;
 	osdc->req_mempool = mempool_create_kmalloc_pool(10,
-- 
cgit v1.2.3


From f66fd9f0952187d274c13c136b74548f792c1925 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 10 Jun 2015 17:26:13 +0800
Subject: ceph: pre-allocate data structure that tracks caps flushing

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/addr.c               | 19 +++++++++++++++----
 fs/ceph/caps.c               | 26 ++++++++++++++++++++++----
 fs/ceph/file.c               | 18 ++++++++++++++++--
 fs/ceph/inode.c              | 15 +++++++++++++--
 fs/ceph/mds_client.c         |  6 +++++-
 fs/ceph/super.c              |  8 ++++++++
 fs/ceph/super.h              |  6 +++++-
 fs/ceph/xattr.c              | 20 ++++++++++++++++++--
 include/linux/ceph/libceph.h |  1 +
 9 files changed, 103 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5f53ac0d9d7c..7edf3c49e661 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1308,12 +1308,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_file_info *fi = vma->vm_file->private_data;
+	struct ceph_cap_flush *prealloc_cf;
 	struct page *page = vmf->page;
 	loff_t off = page_offset(page);
 	loff_t size = i_size_read(inode);
 	size_t len;
 	int want, got, ret;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return VM_FAULT_SIGBUS;
+
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		struct page *locked_page = NULL;
 		if (off == 0) {
@@ -1323,8 +1328,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		ret = ceph_uninline_data(vma->vm_file, locked_page);
 		if (locked_page)
 			unlock_page(locked_page);
-		if (ret < 0)
-			return VM_FAULT_SIGBUS;
+		if (ret < 0) {
+			ret = VM_FAULT_SIGBUS;
+			goto out_free;
+		}
 	}
 
 	if (off + PAGE_CACHE_SIZE <= size)
@@ -1346,7 +1353,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 			break;
 		if (ret != -ERESTARTSYS) {
 			WARN_ON(1);
-			return VM_FAULT_SIGBUS;
+			ret = VM_FAULT_SIGBUS;
+			goto out_free;
 		}
 	}
 	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
@@ -1381,7 +1389,8 @@ out:
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1390,6 +1399,8 @@ out:
 	dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
 	     inode, off, len, ceph_cap_string(got), ret);
 	ceph_put_cap_refs(ci, got);
+out_free:
+	ceph_free_cap_flush(prealloc_cf);
 
 	return ret;
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 69a16044ec41..dd7b20adf1d4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1356,7 +1356,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  * Caller is then responsible for calling __mark_inode_dirty with the
  * returned flags value.
  */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+			   struct ceph_cap_flush **pcf)
 {
 	struct ceph_mds_client *mdsc =
 		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
@@ -1376,6 +1377,9 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 	     ceph_cap_string(was | mask));
 	ci->i_dirty_caps |= mask;
 	if (was == 0) {
+		WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+		swap(ci->i_prealloc_cap_flush, *pcf);
+
 		if (!ci->i_head_snapc) {
 			WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
 			ci->i_head_snapc = ceph_get_snap_context(
@@ -1391,6 +1395,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 			ihold(inode);
 			dirty |= I_DIRTY_SYNC;
 		}
+	} else {
+		WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
 	}
 	BUG_ON(list_empty(&ci->i_dirty_item));
 	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
@@ -1446,6 +1452,17 @@ static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
 	rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
 }
 
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+	return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+	if (cf)
+		kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
 static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
 {
 	struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
@@ -1469,11 +1486,12 @@ static int __mark_caps_flushing(struct inode *inode,
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_cap_flush *cf;
+	struct ceph_cap_flush *cf = NULL;
 	int flushing;
 
 	BUG_ON(ci->i_dirty_caps == 0);
 	BUG_ON(list_empty(&ci->i_dirty_item));
+	BUG_ON(!ci->i_prealloc_cap_flush);
 
 	flushing = ci->i_dirty_caps;
 	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
@@ -1484,7 +1502,7 @@ static int __mark_caps_flushing(struct inode *inode,
 	ci->i_dirty_caps = 0;
 	dout(" inode %p now !dirty\n", inode);
 
-	cf = kmalloc(sizeof(*cf), GFP_ATOMIC);
+	swap(cf, ci->i_prealloc_cap_flush);
 	cf->caps = flushing;
 	cf->kick = false;
 
@@ -3075,7 +3093,7 @@ out:
 		cf = list_first_entry(&to_remove,
 				      struct ceph_cap_flush, list);
 		list_del(&cf->list);
-		kfree(cf);
+		ceph_free_cap_flush(cf);
 	}
 	if (drop)
 		iput(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0a76a370d798..8a4eb4d21d3c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -939,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
+	struct ceph_cap_flush *prealloc_cf;
 	ssize_t count, written = 0;
 	int err, want, got;
 	loff_t pos;
@@ -946,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	mutex_lock(&inode->i_mutex);
 
 	/* We can write back this queue in page reclaim */
@@ -1050,7 +1055,8 @@ retry_snap:
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1074,6 +1080,7 @@ retry_snap:
 out:
 	mutex_unlock(&inode->i_mutex);
 out_unlocked:
+	ceph_free_cap_flush(prealloc_cf);
 	current->backing_dev_info = NULL;
 	return written ? written : err;
 }
@@ -1270,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_inode_to_client(inode)->client->osdc;
+	struct ceph_cap_flush *prealloc_cf;
 	int want, got = 0;
 	int dirty;
 	int ret = 0;
@@ -1282,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	mutex_lock(&inode->i_mutex);
 
 	if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -1328,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
 	if (!ret) {
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1337,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
 	ceph_put_cap_refs(ci, got);
 unlock:
 	mutex_unlock(&inode->i_mutex);
+	ceph_free_cap_flush(prealloc_cf);
 	return ret;
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3326302f5884..e86d1a4efc46 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -416,6 +416,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_flushing_caps = 0;
 	INIT_LIST_HEAD(&ci->i_dirty_item);
 	INIT_LIST_HEAD(&ci->i_flushing_item);
+	ci->i_prealloc_cap_flush = NULL;
 	ci->i_cap_flush_tree = RB_ROOT;
 	init_waitqueue_head(&ci->i_cap_wq);
 	ci->i_hold_caps_min = 0;
@@ -1720,6 +1721,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf;
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
@@ -1734,10 +1736,16 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err != 0)
 		return err;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
 				       USE_AUTH_MDS);
-	if (IS_ERR(req))
+	if (IS_ERR(req)) {
+		ceph_free_cap_flush(prealloc_cf);
 		return PTR_ERR(req);
+	}
 
 	spin_lock(&ci->i_ceph_lock);
 	issued = __ceph_caps_issued(ci, NULL);
@@ -1895,7 +1903,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		dout("setattr %p ATTR_FILE ... hrm!\n", inode);
 
 	if (dirtied) {
-		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
+		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+							   &prealloc_cf);
 		inode->i_ctime = CURRENT_TIME;
 	}
 
@@ -1927,9 +1936,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	ceph_mdsc_put_request(req);
 	if (mask & CEPH_SETATTR_SIZE)
 		__ceph_do_pending_vmtruncate(inode);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 out_put:
 	ceph_mdsc_put_request(req);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 }
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 89e4305a94d4..8d73fe9d488b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1189,6 +1189,10 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		}
 		spin_unlock(&mdsc->cap_dirty_lock);
 
+		if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+			list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+			ci->i_prealloc_cap_flush = NULL;
+		}
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	while (!list_empty(&to_remove)) {
@@ -1196,7 +1200,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		cf = list_first_entry(&to_remove,
 				      struct ceph_cap_flush, list);
 		list_del(&cf->list);
-		kfree(cf);
+		ceph_free_cap_flush(cf);
 	}
 	while (drop--)
 		iput(inode);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index edeb83c43112..d1c833c321b9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -622,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
  */
 struct kmem_cache *ceph_inode_cachep;
 struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_flush_cachep;
 struct kmem_cache *ceph_dentry_cachep;
 struct kmem_cache *ceph_file_cachep;
 
@@ -647,6 +648,10 @@ static int __init init_caches(void)
 				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 	if (ceph_cap_cachep == NULL)
 		goto bad_cap;
+	ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
+					   SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_flush_cachep == NULL)
+		goto bad_cap_flush;
 
 	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
 					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
@@ -665,6 +670,8 @@ static int __init init_caches(void)
 bad_file:
 	kmem_cache_destroy(ceph_dentry_cachep);
 bad_dentry:
+	kmem_cache_destroy(ceph_cap_flush_cachep);
+bad_cap_flush:
 	kmem_cache_destroy(ceph_cap_cachep);
 bad_cap:
 	kmem_cache_destroy(ceph_inode_cachep);
@@ -681,6 +688,7 @@ static void destroy_caches(void)
 
 	kmem_cache_destroy(ceph_inode_cachep);
 	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_cap_flush_cachep);
 	kmem_cache_destroy(ceph_dentry_cachep);
 	kmem_cache_destroy(ceph_file_cachep);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e7f13f742357..4415e977d72b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -309,6 +309,7 @@ struct ceph_inode_info {
 	/* we need to track cap writeback on a per-cap-bit basis, to allow
 	 * overlapping, pipelined cap flushes to the mds.  we can probably
 	 * reduce the tid to 8 bits if we're concerned about inode size. */
+	struct ceph_cap_flush *i_prealloc_cap_flush;
 	struct rb_root i_cap_flush_tree;
 	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
 	unsigned long i_hold_caps_min; /* jiffies */
@@ -578,7 +579,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
 {
 	return ci->i_dirty_caps | ci->i_flushing_caps;
 }
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
+extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+				  struct ceph_cap_flush **pcf);
 
 extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
 				      struct ceph_cap *ocap, int mask);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c6f7d9b82085..819163d8313b 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -912,6 +912,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf = NULL;
 	int issued;
 	int err;
 	int dirty = 0;
@@ -950,6 +951,10 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
 	if (!xattr)
 		goto out;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		goto out;
+
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
@@ -991,7 +996,8 @@ retry:
 			  flags, value ? 1 : -1, &xattr);
 
 	if (!err) {
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+					       &prealloc_cf);
 		ci->i_xattrs.dirty = true;
 		inode->i_ctime = CURRENT_TIME;
 	}
@@ -1001,6 +1007,7 @@ retry:
 		up_read(&mdsc->snap_rwsem);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 
 do_sync:
@@ -1010,6 +1017,7 @@ do_sync_unlocked:
 		up_read(&mdsc->snap_rwsem);
 	err = ceph_sync_setxattr(dentry, name, value, size, flags);
 out:
+	ceph_free_cap_flush(prealloc_cf);
 	kfree(newname);
 	kfree(newval);
 	kfree(xattr);
@@ -1062,6 +1070,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf = NULL;
 	int issued;
 	int err;
 	int required_blob_size;
@@ -1079,6 +1088,10 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
 	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
 		goto do_sync_unlocked;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
 retry:
@@ -1120,7 +1133,8 @@ retry:
 
 	err = __remove_xattr_by_name(ceph_inode(inode), name);
 
-	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+				       &prealloc_cf);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
 	spin_unlock(&ci->i_ceph_lock);
@@ -1128,12 +1142,14 @@ retry:
 		up_read(&mdsc->snap_rwsem);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
 do_sync_unlocked:
 	if (lock_snap_rwsem)
 		up_read(&mdsc->snap_rwsem);
+	ceph_free_cap_flush(prealloc_cf);
 	err = ceph_send_removexattr(dentry, name);
 	return err;
 }
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index d73a569f9bf5..9ebee53d3bf5 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -174,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len)
 
 extern struct kmem_cache *ceph_inode_cachep;
 extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_cap_flush_cachep;
 extern struct kmem_cache *ceph_dentry_cachep;
 extern struct kmem_cache *ceph_file_cachep;
 
-- 
cgit v1.2.3


From b459be739f97e2062b2ba77cfe8ea198dbd58904 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 12 Jun 2015 13:21:07 +0300
Subject: crush: sync up with userspace

.. up to ceph.git commit 1db1abc8328d ("crush: eliminate ad hoc diff
between kernel and userspace").  This fixes a bunch of recently pulled
coding style issues and makes includes a bit cleaner.

A patch "crush:Make the function crush_ln static" from Nicholas Krause
<xerofoify@gmail.com> is folded in as crush_ln() has been made static
in userspace as well.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/crush/crush.h     |  40 +++++++++++-
 include/linux/crush/hash.h      |   6 ++
 include/linux/crush/mapper.h    |   2 +-
 net/ceph/crush/crush.c          |  13 ++--
 net/ceph/crush/crush_ln_table.h |  32 +++++-----
 net/ceph/crush/hash.c           |   8 ++-
 net/ceph/crush/mapper.c         | 137 ++++++++++++++++++++++++++--------------
 7 files changed, 160 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 48a1a7d100f1..48b49305716b 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -1,7 +1,11 @@
 #ifndef CEPH_CRUSH_CRUSH_H
 #define CEPH_CRUSH_CRUSH_H
 
-#include <linux/types.h>
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
 
 /*
  * CRUSH is a pseudo-random data distribution algorithm that
@@ -20,7 +24,11 @@
 #define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
 
 #define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_RULESET (1<<8)  /* max crush ruleset number */
+#define CRUSH_MAX_RULES CRUSH_MAX_RULESET  /* should be the same as max rulesets */
 
+#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
+#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
 
 #define CRUSH_ITEM_UNDEF  0x7ffffffe  /* undefined result (internal use only) */
 #define CRUSH_ITEM_NONE   0x7fffffff  /* no result */
@@ -108,6 +116,15 @@ enum {
 };
 extern const char *crush_bucket_alg_name(int alg);
 
+/*
+ * although tree was a legacy algorithm, it has been buggy, so
+ * exclude it.
+ */
+#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS (	\
+		(1 << CRUSH_BUCKET_UNIFORM) |	\
+		(1 << CRUSH_BUCKET_LIST) |	\
+		(1 << CRUSH_BUCKET_STRAW))
+
 struct crush_bucket {
 	__s32 id;        /* this'll be negative */
 	__u16 type;      /* non-zero; type=0 is reserved for devices */
@@ -174,7 +191,7 @@ struct crush_map {
 	/* choose local attempts using a fallback permutation before
 	 * re-descent */
 	__u32 choose_local_fallback_tries;
-	/* choose attempts before giving up */ 
+	/* choose attempts before giving up */
 	__u32 choose_total_tries;
 	/* attempt chooseleaf inner descent once for firstn mode; on
 	 * reject retry outer descent.  Note that this does *not*
@@ -187,6 +204,25 @@ struct crush_map {
 	 * that want to limit reshuffling, a value of 3 or 4 will make the
 	 * mappings line up a bit better with previous mappings. */
 	__u8 chooseleaf_vary_r;
+
+#ifndef __KERNEL__
+	/*
+	 * version 0 (original) of straw_calc has various flaws.  version 1
+	 * fixes a few of them.
+	 */
+	__u8 straw_calc_version;
+
+	/*
+	 * allowed bucket algs is a bitmask, here the bit positions
+	 * are CRUSH_BUCKET_*.  note that these are *bits* and
+	 * CRUSH_BUCKET_* values are not, so we need to or together (1
+	 * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
+	 * minimize confusion (bucket type values start at 1).
+	 */
+	__u32 allowed_bucket_algs;
+
+	__u32 *choose_tries;
+#endif
 };
 
 
diff --git a/include/linux/crush/hash.h b/include/linux/crush/hash.h
index 91e884230d5d..d1d90258242e 100644
--- a/include/linux/crush/hash.h
+++ b/include/linux/crush/hash.h
@@ -1,6 +1,12 @@
 #ifndef CEPH_CRUSH_HASH_H
 #define CEPH_CRUSH_HASH_H
 
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
 #define CRUSH_HASH_RJENKINS1   0
 
 #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index eab367446eea..5dfd5b1125d2 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -8,7 +8,7 @@
  * LGPL2
  */
 
-#include <linux/crush/crush.h>
+#include "crush.h"
 
 extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
 extern int crush_do_rule(const struct crush_map *map,
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 9d84ce4ea0df..80d7c3a97cb8 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -1,15 +1,11 @@
-
 #ifdef __KERNEL__
 # include <linux/slab.h>
+# include <linux/crush/crush.h>
 #else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
+# include "crush_compat.h"
+# include "crush.h"
 #endif
 
-#include <linux/crush/crush.h>
-
 const char *crush_bucket_alg_name(int alg)
 {
 	switch (alg) {
@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map)
 		kfree(map->rules);
 	}
 
+#ifndef __KERNEL__
+	kfree(map->choose_tries);
+#endif
 	kfree(map);
 }
 
diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h
index 6192c7fc958c..aae534c901a4 100644
--- a/net/ceph/crush/crush_ln_table.h
+++ b/net/ceph/crush/crush_ln_table.h
@@ -10,20 +10,20 @@
  *
  */
 
-#if defined(__linux__)
-#include <linux/types.h>
-#elif defined(__FreeBSD__)
-#include <sys/types.h>
-#endif
-
 #ifndef CEPH_CRUSH_LN_H
 #define CEPH_CRUSH_LN_H
 
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
 
-// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
-// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
-
-static int64_t __RH_LH_tbl[128*2+2] = {
+/*
+ * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+ * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+ */
+static __s64 __RH_LH_tbl[128*2+2] = {
   0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
   0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
   0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
@@ -89,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = {
   0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
   0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
   0x0000800000000000ll, 0x0000ffff00000000ll,
-  };
-
+};
 
-    // LL_tbl[k] = 2^48*log2(1.0+k/2^15);
-static int64_t __LL_tbl[256] = {
+/*
+ * LL_tbl[k] = 2^48*log2(1.0+k/2^15)
+ */
+static __s64 __LL_tbl[256] = {
   0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
   0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
   0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
@@ -160,7 +161,4 @@ static int64_t __LL_tbl[256] = {
   0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
 };
 
-
-
-
 #endif
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
index 5bb63e37a8a1..ed123af49eba 100644
--- a/net/ceph/crush/hash.c
+++ b/net/ceph/crush/hash.c
@@ -1,6 +1,8 @@
-
-#include <linux/types.h>
-#include <linux/crush/hash.h>
+#ifdef __KERNEL__
+# include <linux/crush/hash.h>
+#else
+# include "hash.h"
+#endif
 
 /*
  * Robert Jenkins' function for mixing 32-bit values
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 7568cb59b9e5..393bfb22d5bb 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -1,27 +1,31 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
 
 #ifdef __KERNEL__
 # include <linux/string.h>
 # include <linux/slab.h>
 # include <linux/bug.h>
 # include <linux/kernel.h>
-# ifndef dprintk
-#  define dprintk(args...)
-# endif
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
 #else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
 #endif
-
-#include <linux/crush/crush.h>
-#include <linux/crush/hash.h>
 #include "crush_ln_table.h"
 
+#define dprintk(args...) /* printf(args) */
+
 /*
  * Implement the core CRUSH mapping algorithm.
  */
@@ -139,7 +143,7 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
 	int i;
 
 	for (i = bucket->h.size-1; i >= 0; i--) {
-		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+		__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
 					 r, bucket->h.id);
 		w &= 0xffff;
 		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
@@ -238,43 +242,46 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 	return bucket->h.items[high];
 }
 
-// compute 2^44*log2(input+1)
-uint64_t crush_ln(unsigned xin)
+/* compute 2^44*log2(input+1) */
+static __u64 crush_ln(unsigned int xin)
 {
-    unsigned x=xin, x1;
-    int iexpon, index1, index2;
-    uint64_t RH, LH, LL, xl64, result;
+	unsigned int x = xin, x1;
+	int iexpon, index1, index2;
+	__u64 RH, LH, LL, xl64, result;
 
-    x++;
+	x++;
 
-    // normalize input
-    iexpon = 15;
-    while(!(x&0x18000)) { x<<=1; iexpon--; }
+	/* normalize input */
+	iexpon = 15;
+	while (!(x & 0x18000)) {
+		x <<= 1;
+		iexpon--;
+	}
 
-    index1 = (x>>8)<<1;
-    // RH ~ 2^56/index1
-    RH = __RH_LH_tbl[index1 - 256];
-    // LH ~ 2^48 * log2(index1/256)
-    LH = __RH_LH_tbl[index1 + 1 - 256];
+	index1 = (x >> 8) << 1;
+	/* RH ~ 2^56/index1 */
+	RH = __RH_LH_tbl[index1 - 256];
+	/* LH ~ 2^48 * log2(index1/256) */
+	LH = __RH_LH_tbl[index1 + 1 - 256];
 
-    // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
-    xl64 = (int64_t)x * RH;
-    xl64 >>= 48;
-    x1 = xl64;
+	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+	xl64 = (__s64)x * RH;
+	xl64 >>= 48;
+	x1 = xl64;
 
-    result = iexpon;
-    result <<= (12 + 32);
+	result = iexpon;
+	result <<= (12 + 32);
 
-    index2 = x1 & 0xff;
-    // LL ~ 2^48*log2(1.0+index2/2^15)
-    LL = __LL_tbl[index2];
+	index2 = x1 & 0xff;
+	/* LL ~ 2^48*log2(1.0+index2/2^15) */
+	LL = __LL_tbl[index2];
 
-    LH = LH + LL;
+	LH = LH + LL;
 
-    LH >>= (48-12 - 32);
-    result += LH;
+	LH >>= (48 - 12 - 32);
+	result += LH;
 
-    return result;
+	return result;
 }
 
 
@@ -290,9 +297,9 @@ uint64_t crush_ln(unsigned xin)
 static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
 				int x, int r)
 {
-	unsigned i, high = 0;
-	unsigned u;
-	unsigned w;
+	unsigned int i, high = 0;
+	unsigned int u;
+	unsigned int w;
 	__s64 ln, draw, high_draw = 0;
 
 	for (i = 0; i < bucket->h.size; i++) {
@@ -567,6 +574,10 @@ reject:
 		out[outpos] = item;
 		outpos++;
 		count--;
+#ifndef __KERNEL__
+		if (map->choose_tries && ftotal <= map->choose_total_tries)
+			map->choose_tries[ftotal]++;
+#endif
 	}
 
 	dprintk("CHOOSE returns %d\n", outpos);
@@ -610,6 +621,20 @@ static void crush_choose_indep(const struct crush_map *map,
 	}
 
 	for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+#ifdef DEBUG_INDEP
+		if (out2 && ftotal) {
+			dprintk("%u %d a: ", ftotal, left);
+			for (rep = outpos; rep < endpos; rep++) {
+				dprintk(" %d", out[rep]);
+			}
+			dprintk("\n");
+			dprintk("%u %d b: ", ftotal, left);
+			for (rep = outpos; rep < endpos; rep++) {
+				dprintk(" %d", out2[rep]);
+			}
+			dprintk("\n");
+		}
+#endif
 		for (rep = outpos; rep < endpos; rep++) {
 			if (out[rep] != CRUSH_ITEM_UNDEF)
 				continue;
@@ -726,6 +751,24 @@ static void crush_choose_indep(const struct crush_map *map,
 			out2[rep] = CRUSH_ITEM_NONE;
 		}
 	}
+#ifndef __KERNEL__
+	if (map->choose_tries && ftotal <= map->choose_total_tries)
+		map->choose_tries[ftotal]++;
+#endif
+#ifdef DEBUG_INDEP
+	if (out2) {
+		dprintk("%u %d a: ", ftotal, left);
+		for (rep = outpos; rep < endpos; rep++) {
+			dprintk(" %d", out[rep]);
+		}
+		dprintk("\n");
+		dprintk("%u %d b: ", ftotal, left);
+		for (rep = outpos; rep < endpos; rep++) {
+			dprintk(" %d", out2[rep]);
+		}
+		dprintk("\n");
+	}
+#endif
 }
 
 /**
@@ -884,7 +927,7 @@ int crush_do_rule(const struct crush_map *map,
 						0);
 				} else {
 					out_size = ((numrep < (result_max-osize)) ?
-                                                    numrep : (result_max-osize));
+						    numrep : (result_max-osize));
 					crush_choose_indep(
 						map,
 						map->buckets[-1-w[i]],
@@ -930,5 +973,3 @@ int crush_do_rule(const struct crush_map *map,
 	}
 	return result_len;
 }
-
-
-- 
cgit v1.2.3


From daf7ab7c58ac5f6304b63cca47a06cdc213361d7 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 4 Jun 2015 12:13:11 +0800
Subject: genirq: Clean up outdated comments related to include/linux/irqdesc.h

Seems we have little chance to move irqdesc.h from include/linux/ into
kernel/irq/, so remove the outdated comments.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1433391238-19471-2-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h     | 1 -
 include/linux/irqdesc.h | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 812149160d3b..92188b0225bb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -407,7 +407,6 @@ enum {
 	IRQCHIP_EOI_THREADED		= (1 <<  6),
 };
 
-/* This include will go away once we isolated irq_desc usage to core code */
 #include <linux/irqdesc.h>
 
 /*
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index c52d1480f272..66c41b40ca50 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -3,9 +3,6 @@
 
 /*
  * Core internal functions to deal with irq descriptors
- *
- * This include will move to kernel/irq once we cleaned up the tree.
- * For now it's included from <linux/irq.h>
  */
 
 struct irq_affinity_notify;
-- 
cgit v1.2.3


From 08dad486d98e55d49b760cbaa20b71f35566d4e6 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 4 Jun 2015 12:13:12 +0800
Subject: genirq: Remove irq_node()

Macro irq_node() has no user. Remove it.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Link: http://lkml.kernel.org/r/1433391238-19471-3-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqnr.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index fdd5cc16c9c4..9669bf9d4f48 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -23,12 +23,6 @@ unsigned int irq_get_next_irq(unsigned int offset);
 			;						\
 		else
 
-#ifdef CONFIG_SMP
-#define irq_node(irq)	(irq_get_irq_data(irq)->node)
-#else
-#define irq_node(irq)	0
-#endif
-
 # define for_each_active_irq(irq)			\
 	for (irq = irq_get_next_irq(0); irq < nr_irqs;	\
 	     irq = irq_get_next_irq(irq + 1))
-- 
cgit v1.2.3


From 7c494375b773497228502133879072a9e959c063 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Mon, 1 Jun 2015 10:35:16 -0700
Subject: Input: improve parsing OF parameters for touchscreens

When applying touchscreen parameters specified in device tree let's make
sure we keep whatever setup was done by the driver and not reset the
missing values to zero.

Reported-by: Pavel Machek <pavel@ucw.cz>
Tested-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/edt-ft5x06.c     |  2 +-
 drivers/input/touchscreen/of_touchscreen.c | 69 +++++++++++++++++++-----------
 drivers/input/touchscreen/tsc2005.c        |  2 +-
 include/linux/input/touchscreen.h          |  5 ++-
 4 files changed, 49 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c
index 29d179a1953b..394b1de9a2a3 100644
--- a/drivers/input/touchscreen/edt-ft5x06.c
+++ b/drivers/input/touchscreen/edt-ft5x06.c
@@ -1041,7 +1041,7 @@ static int edt_ft5x06_ts_probe(struct i2c_client *client,
 			     0, tsdata->num_y * 64 - 1, 0, 0);
 
 	if (!pdata)
-		touchscreen_parse_of_params(input);
+		touchscreen_parse_of_params(input, true);
 
 	error = input_mt_init_slots(input, MAX_SUPPORT_POINTS, INPUT_MT_DIRECT);
 	if (error) {
diff --git a/drivers/input/touchscreen/of_touchscreen.c b/drivers/input/touchscreen/of_touchscreen.c
index b82b5207c78b..806cd0ad160f 100644
--- a/drivers/input/touchscreen/of_touchscreen.c
+++ b/drivers/input/touchscreen/of_touchscreen.c
@@ -14,14 +14,22 @@
 #include <linux/input/mt.h>
 #include <linux/input/touchscreen.h>
 
-static u32 of_get_optional_u32(struct device_node *np,
-			       const char *property)
+static bool touchscreen_get_prop_u32(struct device_node *np,
+				     const char *property,
+				     unsigned int default_value,
+				     unsigned int *value)
 {
-	u32 val = 0;
+	u32 val;
+	int error;
 
-	of_property_read_u32(np, property, &val);
+	error = of_property_read_u32(np, property, &val);
+	if (error) {
+		*value = default_value;
+		return false;
+	}
 
-	return val;
+	*value = val;
+	return true;
 }
 
 static void touchscreen_set_params(struct input_dev *dev,
@@ -54,34 +62,45 @@ static void touchscreen_set_params(struct input_dev *dev,
  * input device accordingly. The function keeps previously setuped default
  * values if no value is specified via DT.
  */
-void touchscreen_parse_of_params(struct input_dev *dev)
+void touchscreen_parse_of_params(struct input_dev *dev, bool multitouch)
 {
 	struct device_node *np = dev->dev.parent->of_node;
-	u32 maximum, fuzz;
+	unsigned int axis;
+	unsigned int maximum, fuzz;
+	bool data_present;
 
 	input_alloc_absinfo(dev);
 	if (!dev->absinfo)
 		return;
 
-	maximum = of_get_optional_u32(np, "touchscreen-size-x");
-	fuzz = of_get_optional_u32(np, "touchscreen-fuzz-x");
-	if (maximum || fuzz) {
-		touchscreen_set_params(dev, ABS_X, maximum, fuzz);
-		touchscreen_set_params(dev, ABS_MT_POSITION_X, maximum, fuzz);
-	}
+	axis = multitouch ? ABS_MT_POSITION_X : ABS_X;
+	data_present = touchscreen_get_prop_u32(np, "touchscreen-size-x",
+						input_abs_get_max(dev, axis),
+						&maximum) |
+		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-x",
+						input_abs_get_fuzz(dev, axis),
+						&fuzz);
+	if (data_present)
+		touchscreen_set_params(dev, axis, maximum, fuzz);
 
-	maximum = of_get_optional_u32(np, "touchscreen-size-y");
-	fuzz = of_get_optional_u32(np, "touchscreen-fuzz-y");
-	if (maximum || fuzz) {
-		touchscreen_set_params(dev, ABS_Y, maximum, fuzz);
-		touchscreen_set_params(dev, ABS_MT_POSITION_Y, maximum, fuzz);
-	}
+	axis = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
+	data_present = touchscreen_get_prop_u32(np, "touchscreen-size-y",
+						input_abs_get_max(dev, axis),
+						&maximum) |
+		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-y",
+						input_abs_get_fuzz(dev, axis),
+						&fuzz);
+	if (data_present)
+		touchscreen_set_params(dev, axis, maximum, fuzz);
 
-	maximum = of_get_optional_u32(np, "touchscreen-max-pressure");
-	fuzz = of_get_optional_u32(np, "touchscreen-fuzz-pressure");
-	if (maximum || fuzz) {
-		touchscreen_set_params(dev, ABS_PRESSURE, maximum, fuzz);
-		touchscreen_set_params(dev, ABS_MT_PRESSURE, maximum, fuzz);
-	}
+	axis = multitouch ? ABS_MT_PRESSURE : ABS_PRESSURE;
+	data_present = touchscreen_get_prop_u32(np, "touchscreen-max-pressure",
+						input_abs_get_max(dev, axis),
+						&maximum) |
+		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-pressure",
+						input_abs_get_fuzz(dev, axis),
+						&fuzz);
+	if (data_present)
+		touchscreen_set_params(dev, axis, maximum, fuzz);
 }
 EXPORT_SYMBOL(touchscreen_parse_of_params);
diff --git a/drivers/input/touchscreen/tsc2005.c b/drivers/input/touchscreen/tsc2005.c
index 72657c579430..d8c025b0f88c 100644
--- a/drivers/input/touchscreen/tsc2005.c
+++ b/drivers/input/touchscreen/tsc2005.c
@@ -709,7 +709,7 @@ static int tsc2005_probe(struct spi_device *spi)
 	input_set_abs_params(input_dev, ABS_PRESSURE, 0, max_p, fudge_p, 0);
 
 	if (np)
-		touchscreen_parse_of_params(input_dev);
+		touchscreen_parse_of_params(input_dev, false);
 
 	input_dev->open = tsc2005_open;
 	input_dev->close = tsc2005_close;
diff --git a/include/linux/input/touchscreen.h b/include/linux/input/touchscreen.h
index 08a5ef6e8f25..eecc9ea6cd58 100644
--- a/include/linux/input/touchscreen.h
+++ b/include/linux/input/touchscreen.h
@@ -12,9 +12,10 @@
 #include <linux/input.h>
 
 #ifdef CONFIG_OF
-void touchscreen_parse_of_params(struct input_dev *dev);
+void touchscreen_parse_of_params(struct input_dev *dev, bool multitouch);
 #else
-static inline void touchscreen_parse_of_params(struct input_dev *dev)
+static inline void touchscreen_parse_of_params(struct input_dev *dev,
+					       bool multitouch)
 {
 }
 #endif
-- 
cgit v1.2.3


From 479305fd7172503772575997eb6f1b0a2bb4a107 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Thu, 25 Jun 2015 15:00:40 -0700
Subject: zpool: remove zpool_evict()

Remove zpool_evict() helper function.  As zbud is currently the only
zpool implementation that supports eviction, add zpool and zpool_ops
references to struct zbud_pool and directly call zpool_ops->evict(zpool,
handle) on eviction.

Currently zpool provides the zpool_evict helper which locks the zpool
list lock and searches through all pools to find the specific one
matching the caller, and call the corresponding zpool_ops->evict
function.  However, this is unnecessary, as the zbud pool can simply
keep a reference to the zpool that created it, as well as the zpool_ops,
and directly call the zpool_ops->evict function, when it needs to evict
a page.  This avoids a spinlock and list search in zpool for each
eviction.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjennings@variantweb.net>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nitin Gupta <ngupta@vflare.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zpool.h |  5 ++---
 mm/zbud.c             | 23 +++++++++++++++++++----
 mm/zpool.c            | 29 +----------------------------
 mm/zsmalloc.c         |  3 ++-
 4 files changed, 24 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 56529b34dc63..d30eff3d84d5 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -81,7 +81,8 @@ struct zpool_driver {
 	atomic_t refcount;
 	struct list_head list;
 
-	void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops);
+	void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
+			struct zpool *zpool);
 	void (*destroy)(void *pool);
 
 	int (*malloc)(void *pool, size_t size, gfp_t gfp,
@@ -102,6 +103,4 @@ void zpool_register_driver(struct zpool_driver *driver);
 
 int zpool_unregister_driver(struct zpool_driver *driver);
 
-int zpool_evict(void *pool, unsigned long handle);
-
 #endif
diff --git a/mm/zbud.c b/mm/zbud.c
index 2ee4e4520493..f3bf6f7627d8 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -97,6 +97,10 @@ struct zbud_pool {
 	struct list_head lru;
 	u64 pages_nr;
 	struct zbud_ops *ops;
+#ifdef CONFIG_ZPOOL
+	struct zpool *zpool;
+	struct zpool_ops *zpool_ops;
+#endif
 };
 
 /*
@@ -123,7 +127,10 @@ struct zbud_header {
 
 static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
 {
-	return zpool_evict(pool, handle);
+	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+		return pool->zpool_ops->evict(pool->zpool, handle);
+	else
+		return -ENOENT;
 }
 
 static struct zbud_ops zbud_zpool_ops = {
@@ -131,9 +138,17 @@ static struct zbud_ops zbud_zpool_ops = {
 };
 
 static void *zbud_zpool_create(char *name, gfp_t gfp,
-			struct zpool_ops *zpool_ops)
+			       struct zpool_ops *zpool_ops,
+			       struct zpool *zpool)
 {
-	return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+	struct zbud_pool *pool;
+
+	pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+	if (pool) {
+		pool->zpool = zpool;
+		pool->zpool_ops = zpool_ops;
+	}
+	return pool;
 }
 
 static void zbud_zpool_destroy(void *pool)
@@ -292,7 +307,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
 	struct zbud_pool *pool;
 	int i;
 
-	pool = kmalloc(sizeof(struct zbud_pool), gfp);
+	pool = kzalloc(sizeof(struct zbud_pool), gfp);
 	if (!pool)
 		return NULL;
 	spin_lock_init(&pool->lock);
diff --git a/mm/zpool.c b/mm/zpool.c
index 6b1f1035cd99..722a4f60e90b 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -73,33 +73,6 @@ int zpool_unregister_driver(struct zpool_driver *driver)
 }
 EXPORT_SYMBOL(zpool_unregister_driver);
 
-/**
- * zpool_evict() - evict callback from a zpool implementation.
- * @pool:	pool to evict from.
- * @handle:	handle to evict.
- *
- * This can be used by zpool implementations to call the
- * user's evict zpool_ops struct evict callback.
- */
-int zpool_evict(void *pool, unsigned long handle)
-{
-	struct zpool *zpool;
-
-	spin_lock(&pools_lock);
-	list_for_each_entry(zpool, &pools_head, list) {
-		if (zpool->pool == pool) {
-			spin_unlock(&pools_lock);
-			if (!zpool->ops || !zpool->ops->evict)
-				return -EINVAL;
-			return zpool->ops->evict(zpool, handle);
-		}
-	}
-	spin_unlock(&pools_lock);
-
-	return -ENOENT;
-}
-EXPORT_SYMBOL(zpool_evict);
-
 static struct zpool_driver *zpool_get_driver(char *type)
 {
 	struct zpool_driver *driver;
@@ -170,7 +143,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
 
 	zpool->type = driver->type;
 	zpool->driver = driver;
-	zpool->pool = driver->create(name, gfp, ops);
+	zpool->pool = driver->create(name, gfp, ops, zpool);
 	zpool->ops = ops;
 
 	if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c76624080f9c..0a7f81aa2249 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -309,7 +309,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
 
 #ifdef CONFIG_ZPOOL
 
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+			     struct zpool *zpool)
 {
 	return zs_create_pool(name, gfp);
 }
-- 
cgit v1.2.3


From f6d133f877c8bb0a0934dc8c521c758ee771e901 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 25 Jun 2015 15:01:00 -0700
Subject: compiler-gcc.h: neatening

 - Move the inline and noinline blocks together

 - Comment neatening

 - Alignment of __attribute__ uses

 - Consistent naming of __must_be_array macro argument

 - Multiline macro neatening

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Segher Boessenkool <segher@kernel.crashing.org>
Cc: Sasha Levin <levinsasha928@gmail.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Alan Modra <amodra@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 85 +++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 371e560d13cf..5c2c14e3c647 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -5,9 +5,9 @@
 /*
  * Common definitions for all gcc versions go here.
  */
-#define GCC_VERSION (__GNUC__ * 10000 \
-		   + __GNUC_MINOR__ * 100 \
-		   + __GNUC_PATCHLEVEL__)
+#define GCC_VERSION (__GNUC__ * 10000		\
+		     + __GNUC_MINOR__ * 100	\
+		     + __GNUC_PATCHLEVEL__)
 
 /* Optimization barrier */
 
@@ -46,55 +46,63 @@
  * the inline assembly constraint from =g to =r, in this particular
  * case either is valid.
  */
-#define RELOC_HIDE(ptr, off)					\
-  ({ unsigned long __ptr;					\
-    __asm__ ("" : "=r"(__ptr) : "0"(ptr));		\
-    (typeof(ptr)) (__ptr + (off)); })
+#define RELOC_HIDE(ptr, off)						\
+({									\
+	unsigned long __ptr;						\
+	__asm__ ("" : "=r"(__ptr) : "0"(ptr));				\
+	(typeof(ptr)) (__ptr + (off));					\
+})
 
 /* Make the optimizer believe the variable can be manipulated arbitrarily. */
-#define OPTIMIZER_HIDE_VAR(var) __asm__ ("" : "=r" (var) : "0" (var))
+#define OPTIMIZER_HIDE_VAR(var)						\
+	__asm__ ("" : "=r" (var) : "0" (var))
 
 #ifdef __CHECKER__
-#define __must_be_array(arr) 0
+#define __must_be_array(a)	0
 #else
 /* &a[0] degrades to a pointer: a different type from an array */
-#define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
+#define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 #endif
 
 /*
  * Force always-inline if the user requests it so via the .config,
  * or if gcc is too old:
  */
-#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \
+#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) ||		\
     !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4)
-# define inline		inline		__attribute__((always_inline)) notrace
-# define __inline__	__inline__	__attribute__((always_inline)) notrace
-# define __inline	__inline	__attribute__((always_inline)) notrace
+#define inline		inline		__attribute__((always_inline)) notrace
+#define __inline__	__inline__	__attribute__((always_inline)) notrace
+#define __inline	__inline	__attribute__((always_inline)) notrace
 #else
 /* A lot of inline functions can cause havoc with function tracing */
-# define inline		inline		notrace
-# define __inline__	__inline__	notrace
-# define __inline	__inline	notrace
+#define inline		inline		notrace
+#define __inline__	__inline__	notrace
+#define __inline	__inline	notrace
 #endif
 
-#define __deprecated			__attribute__((deprecated))
-#define __packed			__attribute__((packed))
-#define __weak				__attribute__((weak))
-#define __alias(symbol)		__attribute__((alias(#symbol)))
+#define __always_inline	inline __attribute__((always_inline))
+#define  noinline	__attribute__((noinline))
+
+#define __deprecated	__attribute__((deprecated))
+#define __packed	__attribute__((packed))
+#define __weak		__attribute__((weak))
+#define __alias(symbol)	__attribute__((alias(#symbol)))
 
 /*
- * it doesn't make sense on ARM (currently the only user of __naked) to trace
- * naked functions because then mcount is called without stack and frame pointer
- * being set up and there is no chance to restore the lr register to the value
- * before mcount was called.
+ * it doesn't make sense on ARM (currently the only user of __naked)
+ * to trace naked functions because then mcount is called without
+ * stack and frame pointer being set up and there is no chance to
+ * restore the lr register to the value before mcount was called.
+ *
+ * The asm() bodies of naked functions often depend on standard calling
+ * conventions, therefore they must be noinline and noclone.
  *
- * The asm() bodies of naked functions often depend on standard calling conventions,
- * therefore they must be noinline and noclone.  GCC 4.[56] currently fail to enforce
- * this, so we must do so ourselves.  See GCC PR44290.
+ * GCC 4.[56] currently fail to enforce this, so we must do so ourselves.
+ * See GCC PR44290.
  */
-#define __naked				__attribute__((naked)) noinline __noclone notrace
+#define __naked		__attribute__((naked)) noinline __noclone notrace
 
-#define __noreturn			__attribute__((noreturn))
+#define __noreturn	__attribute__((noreturn))
 
 /*
  * From the GCC manual:
@@ -106,14 +114,13 @@
  * would be.
  * [...]
  */
-#define __pure				__attribute__((pure))
-#define __aligned(x)			__attribute__((aligned(x)))
-#define __printf(a, b)			__attribute__((format(printf, a, b)))
-#define __scanf(a, b)			__attribute__((format(scanf, a, b)))
-#define  noinline			__attribute__((noinline))
-#define __attribute_const__		__attribute__((__const__))
-#define __maybe_unused			__attribute__((unused))
-#define __always_unused			__attribute__((unused))
+#define __pure			__attribute__((pure))
+#define __aligned(x)		__attribute__((aligned(x)))
+#define __printf(a, b)		__attribute__((format(printf, a, b)))
+#define __scanf(a, b)		__attribute__((format(scanf, a, b)))
+#define __attribute_const__	__attribute__((__const__))
+#define __maybe_unused		__attribute__((unused))
+#define __always_unused		__attribute__((unused))
 
 #define __gcc_header(x) #x
 #define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h)
@@ -129,5 +136,3 @@
  * code
  */
 #define uninitialized_var(x) x = x
-
-#define __always_inline		inline __attribute__((always_inline))
-- 
cgit v1.2.3


From cb984d101b30eb7478d32df56a0023e4603cba7f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 25 Jun 2015 15:01:02 -0700
Subject: compiler-gcc: integrate the various compiler-gcc[345].h files

As gcc major version numbers are going to advance rather rapidly in the
future, there's no real value in separate files for each compiler
version.

Deduplicate some of the macros #defined in each file too.

Neaten comments using normal kernel commenting style.

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Segher Boessenkool <segher@kernel.crashing.org>
Cc: Sasha Levin <levinsasha928@gmail.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Alan Modra <amodra@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h  | 120 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/compiler-gcc3.h |  23 --------
 include/linux/compiler-gcc4.h |  91 --------------------------------
 include/linux/compiler-gcc5.h |  67 -----------------------
 4 files changed, 116 insertions(+), 185 deletions(-)
 delete mode 100644 include/linux/compiler-gcc3.h
 delete mode 100644 include/linux/compiler-gcc4.h
 delete mode 100644 include/linux/compiler-gcc5.h

(limited to 'include/linux')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 5c2c14e3c647..dfaa7b3e9ae9 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -122,10 +122,122 @@
 #define __maybe_unused		__attribute__((unused))
 #define __always_unused		__attribute__((unused))
 
-#define __gcc_header(x) #x
-#define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h)
-#define gcc_header(x) _gcc_header(x)
-#include gcc_header(__GNUC__)
+/* gcc version specific checks */
+
+#if GCC_VERSION < 30200
+# error Sorry, your compiler is too old - please upgrade it.
+#endif
+
+#if GCC_VERSION < 30300
+# define __used			__attribute__((__unused__))
+#else
+# define __used			__attribute__((__used__))
+#endif
+
+#ifdef CONFIG_GCOV_KERNEL
+# if GCC_VERSION < 30400
+#   error "GCOV profiling support for gcc versions below 3.4 not included"
+# endif /* __GNUC_MINOR__ */
+#endif /* CONFIG_GCOV_KERNEL */
+
+#if GCC_VERSION >= 30400
+#define __must_check		__attribute__((warn_unused_result))
+#endif
+
+#if GCC_VERSION >= 40000
+
+/* GCC 4.1.[01] miscompiles __weak */
+#ifdef __KERNEL__
+# if GCC_VERSION >= 40100 &&  GCC_VERSION <= 40101
+#  error Your version of gcc miscompiles the __weak directive
+# endif
+#endif
+
+#define __used			__attribute__((__used__))
+#define __compiler_offsetof(a, b)					\
+	__builtin_offsetof(a, b)
+
+#if GCC_VERSION >= 40100 && GCC_VERSION < 40600
+# define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
+#endif
+
+#if GCC_VERSION >= 40300
+/* Mark functions as cold. gcc will assume any path leading to a call
+ * to them will be unlikely.  This means a lot of manual unlikely()s
+ * are unnecessary now for any paths leading to the usual suspects
+ * like BUG(), printk(), panic() etc. [but let's keep them for now for
+ * older compilers]
+ *
+ * Early snapshots of gcc 4.3 don't support this and we can't detect this
+ * in the preprocessor, but we can live with this because they're unreleased.
+ * Maketime probing would be overkill here.
+ *
+ * gcc also has a __attribute__((__hot__)) to move hot functions into
+ * a special section, but I don't see any sense in this right now in
+ * the kernel context
+ */
+#define __cold			__attribute__((__cold__))
+
+#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
+
+#ifndef __CHECKER__
+# define __compiletime_warning(message) __attribute__((warning(message)))
+# define __compiletime_error(message) __attribute__((error(message)))
+#endif /* __CHECKER__ */
+#endif /* GCC_VERSION >= 40300 */
+
+#if GCC_VERSION >= 40500
+/*
+ * Mark a position in code as unreachable.  This can be used to
+ * suppress control flow warnings after asm blocks that transfer
+ * control elsewhere.
+ *
+ * Early snapshots of gcc 4.5 don't support this and we can't detect
+ * this in the preprocessor, but we can live with this because they're
+ * unreleased.  Really, we need to have autoconf for the kernel.
+ */
+#define unreachable() __builtin_unreachable()
+
+/* Mark a function definition as prohibited from being cloned. */
+#define __noclone	__attribute__((__noclone__))
+
+#endif /* GCC_VERSION >= 40500 */
+
+#if GCC_VERSION >= 40600
+/*
+ * Tell the optimizer that something else uses this function or variable.
+ */
+#define __visible	__attribute__((externally_visible))
+#endif
+
+/*
+ * GCC 'asm goto' miscompiles certain code sequences:
+ *
+ *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
+ *
+ * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
+ *
+ * (asm goto is automatically volatile - the naming reflects this.)
+ */
+#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
+
+#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
+#if GCC_VERSION >= 40400
+#define __HAVE_BUILTIN_BSWAP32__
+#define __HAVE_BUILTIN_BSWAP64__
+#endif
+#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600)
+#define __HAVE_BUILTIN_BSWAP16__
+#endif
+#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
+
+#if GCC_VERSION >= 50000
+#define KASAN_ABI_VERSION 4
+#elif GCC_VERSION >= 40902
+#define KASAN_ABI_VERSION 3
+#endif
+
+#endif	/* gcc version >= 40000 specific checks */
 
 #if !defined(__noclone)
 #define __noclone	/* not needed */
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
deleted file mode 100644
index 7d89febe4d79..000000000000
--- a/include/linux/compiler-gcc3.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc3.h> directly, include <linux/compiler.h> instead."
-#endif
-
-#if GCC_VERSION < 30200
-# error Sorry, your compiler is too old - please upgrade it.
-#endif
-
-#if GCC_VERSION >= 30300
-# define __used			__attribute__((__used__))
-#else
-# define __used			__attribute__((__unused__))
-#endif
-
-#if GCC_VERSION >= 30400
-#define __must_check		__attribute__((warn_unused_result))
-#endif
-
-#ifdef CONFIG_GCOV_KERNEL
-# if GCC_VERSION < 30400
-#   error "GCOV profiling support for gcc versions below 3.4 not included"
-# endif /* __GNUC_MINOR__ */
-#endif /* CONFIG_GCOV_KERNEL */
diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
deleted file mode 100644
index 769e19864632..000000000000
--- a/include/linux/compiler-gcc4.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc4.h> directly, include <linux/compiler.h> instead."
-#endif
-
-/* GCC 4.1.[01] miscompiles __weak */
-#ifdef __KERNEL__
-# if GCC_VERSION >= 40100 &&  GCC_VERSION <= 40101
-#  error Your version of gcc miscompiles the __weak directive
-# endif
-#endif
-
-#define __used			__attribute__((__used__))
-#define __must_check 		__attribute__((warn_unused_result))
-#define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
-
-#if GCC_VERSION >= 40100 && GCC_VERSION < 40600
-# define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
-#endif
-
-#if GCC_VERSION >= 40300
-/* Mark functions as cold. gcc will assume any path leading to a call
-   to them will be unlikely.  This means a lot of manual unlikely()s
-   are unnecessary now for any paths leading to the usual suspects
-   like BUG(), printk(), panic() etc. [but let's keep them for now for
-   older compilers]
-
-   Early snapshots of gcc 4.3 don't support this and we can't detect this
-   in the preprocessor, but we can live with this because they're unreleased.
-   Maketime probing would be overkill here.
-
-   gcc also has a __attribute__((__hot__)) to move hot functions into
-   a special section, but I don't see any sense in this right now in
-   the kernel context */
-#define __cold			__attribute__((__cold__))
-
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
-#ifndef __CHECKER__
-# define __compiletime_warning(message) __attribute__((warning(message)))
-# define __compiletime_error(message) __attribute__((error(message)))
-#endif /* __CHECKER__ */
-#endif /* GCC_VERSION >= 40300 */
-
-#if GCC_VERSION >= 40500
-/*
- * Mark a position in code as unreachable.  This can be used to
- * suppress control flow warnings after asm blocks that transfer
- * control elsewhere.
- *
- * Early snapshots of gcc 4.5 don't support this and we can't detect
- * this in the preprocessor, but we can live with this because they're
- * unreleased.  Really, we need to have autoconf for the kernel.
- */
-#define unreachable() __builtin_unreachable()
-
-/* Mark a function definition as prohibited from being cloned. */
-#define __noclone	__attribute__((__noclone__))
-
-#endif /* GCC_VERSION >= 40500 */
-
-#if GCC_VERSION >= 40600
-/*
- * Tell the optimizer that something else uses this function or variable.
- */
-#define __visible __attribute__((externally_visible))
-#endif
-
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
-
-#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
-#if GCC_VERSION >= 40400
-#define __HAVE_BUILTIN_BSWAP32__
-#define __HAVE_BUILTIN_BSWAP64__
-#endif
-#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600)
-#define __HAVE_BUILTIN_BSWAP16__
-#endif
-#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
-
-#if GCC_VERSION >= 40902
-#define KASAN_ABI_VERSION 3
-#endif
diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h
deleted file mode 100644
index efee493714eb..000000000000
--- a/include/linux/compiler-gcc5.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc5.h> directly, include <linux/compiler.h> instead."
-#endif
-
-#define __used				__attribute__((__used__))
-#define __must_check			__attribute__((warn_unused_result))
-#define __compiler_offsetof(a, b)	__builtin_offsetof(a, b)
-
-/* Mark functions as cold. gcc will assume any path leading to a call
-   to them will be unlikely.  This means a lot of manual unlikely()s
-   are unnecessary now for any paths leading to the usual suspects
-   like BUG(), printk(), panic() etc. [but let's keep them for now for
-   older compilers]
-
-   Early snapshots of gcc 4.3 don't support this and we can't detect this
-   in the preprocessor, but we can live with this because they're unreleased.
-   Maketime probing would be overkill here.
-
-   gcc also has a __attribute__((__hot__)) to move hot functions into
-   a special section, but I don't see any sense in this right now in
-   the kernel context */
-#define __cold			__attribute__((__cold__))
-
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
-#ifndef __CHECKER__
-# define __compiletime_warning(message) __attribute__((warning(message)))
-# define __compiletime_error(message) __attribute__((error(message)))
-#endif /* __CHECKER__ */
-
-/*
- * Mark a position in code as unreachable.  This can be used to
- * suppress control flow warnings after asm blocks that transfer
- * control elsewhere.
- *
- * Early snapshots of gcc 4.5 don't support this and we can't detect
- * this in the preprocessor, but we can live with this because they're
- * unreleased.  Really, we need to have autoconf for the kernel.
- */
-#define unreachable() __builtin_unreachable()
-
-/* Mark a function definition as prohibited from being cloned. */
-#define __noclone	__attribute__((__noclone__))
-
-/*
- * Tell the optimizer that something else uses this function or variable.
- */
-#define __visible __attribute__((externally_visible))
-
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
-
-#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
-#define __HAVE_BUILTIN_BSWAP32__
-#define __HAVE_BUILTIN_BSWAP64__
-#define __HAVE_BUILTIN_BSWAP16__
-#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
-
-#define KASAN_ABI_VERSION 4
-- 
cgit v1.2.3


From b86a50c3b5414eafdbee7f34af4a201a4a7817c2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 25 Jun 2015 15:01:05 -0700
Subject: compiler-intel: fix wrong compiler barrier() macro

Cleanup commit 73679e508201 ("compiler-intel.h: Remove duplicate
definition") removed the double definition of __memory_barrier()
intrinsics.

However, in doing so, it also removed the preceding #undef barrier by
accident, meaning, the actual barrier() macro from compiler-gcc.h with
inline asm is still in place as __GNUC__ is provided.

Subsequently, barrier() can never be defined as __memory_barrier() from
compiler.h since it already has a definition in place and if we trust
the comment in compiler-intel.h, ecc doesn't support gcc specific asm
statements.

I don't have an ecc at hand (unsure if that's still used in the field?)
and only found this by accident during code review, a revert of that
cleanup would be simplest option.

Fixes: 73679e508201 ("compiler-intel.h: Remove duplicate definition")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Pranith Kumar <bobby.prani@gmail.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: mancha security <mancha1@zoho.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-intel.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/compiler-intel.h b/include/linux/compiler-intel.h
index 0c9a2f2c2802..d4c71132d07f 100644
--- a/include/linux/compiler-intel.h
+++ b/include/linux/compiler-intel.h
@@ -13,10 +13,12 @@
 /* Intel ECC compiler doesn't support gcc specific asm stmts.
  * It uses intrinsics to do the equivalent things.
  */
+#undef barrier
 #undef barrier_data
 #undef RELOC_HIDE
 #undef OPTIMIZER_HIDE_VAR
 
+#define barrier() __memory_barrier()
 #define barrier_data(ptr) barrier()
 
 #define RELOC_HIDE(ptr, off)					\
-- 
cgit v1.2.3


From 8c7fbe5795a016259445a61e072eb0118aaf6a61 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Thu, 25 Jun 2015 15:01:16 -0700
Subject: stddef.h: move offsetofend inside #ifndef/#endif guard, neaten

Commit 3876488444e7 ("include/stddef.h: Move offsetofend() from vfio.h
to a generic kernel header") added offsetofend outside the normal
include #ifndef/#endif guard.  Move it inside.

Miscellanea:

o remove unnecessary blank line
o standardize offsetof macros whitespace style

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/stddef.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index 076af437284d..9c61c7cda936 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -3,7 +3,6 @@
 
 #include <uapi/linux/stddef.h>
 
-
 #undef NULL
 #define NULL ((void *)0)
 
@@ -14,10 +13,9 @@ enum {
 
 #undef offsetof
 #ifdef __compiler_offsetof
-#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
+#define offsetof(TYPE, MEMBER)	__compiler_offsetof(TYPE, MEMBER)
 #else
-#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
-#endif
+#define offsetof(TYPE, MEMBER)	((size_t)&((TYPE *)0)->MEMBER)
 #endif
 
 /**
@@ -28,3 +26,5 @@ enum {
  */
 #define offsetofend(TYPE, MEMBER) \
 	(offsetof(TYPE, MEMBER)	+ sizeof(((TYPE *)0)->MEMBER))
+
+#endif
-- 
cgit v1.2.3


From 3033f14ab78c326871a4902591c2518410add24a Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Thu, 25 Jun 2015 15:01:19 -0700
Subject: clone: support passing tls argument via C rather than pt_regs magic

clone has some of the quirkiest syscall handling in the kernel, with a
pile of special cases, historical curiosities, and architecture-specific
calling conventions.  In particular, clone with CLONE_SETTLS accepts a
parameter "tls" that the C entry point completely ignores and some
assembly entry points overwrite; instead, the low-level arch-specific
code pulls the tls parameter out of the arch-specific register captured
as part of pt_regs on entry to the kernel.  That's a massive hack, and
it makes the arch-specific code only work when called via the specific
existing syscall entry points; because of this hack, any new clone-like
system call would have to accept an identical tls argument in exactly
the same arch-specific position, rather than providing a unified system
call entry point across architectures.

The first patch allows architectures to handle the tls argument via
normal C parameter passing, if they opt in by selecting
HAVE_COPY_THREAD_TLS.  The second patch makes 32-bit and 64-bit x86 opt
into this.

These two patches came out of the clone4 series, which isn't ready for
this merge window, but these first two cleanup patches were entirely
uncontroversial and have acks.  I'd like to go ahead and submit these
two so that other architectures can begin building on top of this and
opting into HAVE_COPY_THREAD_TLS.  However, I'm also happy to wait and
send these through the next merge window (along with v3 of clone4) if
anyone would prefer that.

This patch (of 2):

clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread.  sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass along
that argument.  Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes tls
in).

Apart from being awful and inscrutable, that also only works because only
one code path into copy_thread can pass the CLONE_SETTLS flag, and that
code path comes from sys_clone with its architecture-specific
argument-passing order.  This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.

However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.

Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into,
and a new copy_thread_tls that accepts the tls parameter as an additional
unsigned long (syscall-argument-sized) argument.  Change sys_clone's tls
argument to an unsigned long (which does not change the ABI), and pass
that down to copy_thread_tls.

Architectures that don't opt into copy_thread_tls will continue to ignore
the C argument to sys_clone in favor of the pt_regs captured at kernel
entry, and thus will be unable to introduce new versions of the clone
syscall.

Patch co-authored by Josh Triplett and Thiago Macieira.

Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thiago Macieira <thiago.macieira@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                      |  7 ++++++
 arch/s390/kernel/compat_wrapper.c |  2 +-
 drivers/misc/kgdbts.c             |  2 +-
 include/linux/sched.h             | 15 ++++++++++++
 include/linux/syscalls.h          |  6 ++---
 kernel/fork.c                     | 48 ++++++++++++++++++++++++++-------------
 6 files changed, 59 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index a65eafb24997..bec6666a3cc4 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -499,6 +499,13 @@ config ARCH_HAS_ELF_RANDOMIZE
 	  - arch_mmap_rnd()
 	  - arch_randomize_brk()
 
+config HAVE_COPY_THREAD_TLS
+	bool
+	help
+	  Architecture provides copy_thread_tls to accept tls argument via
+	  normal C parameter passing, rather than extracting the syscall
+	  argument from pt_regs.
+
 #
 # ABI hall of shame
 #
diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c
index d7fa2f0f1425..f8498dde67b1 100644
--- a/arch/s390/kernel/compat_wrapper.c
+++ b/arch/s390/kernel/compat_wrapper.c
@@ -202,7 +202,7 @@ COMPAT_SYSCALL_WRAP1(epoll_create1, int, flags);
 COMPAT_SYSCALL_WRAP2(tkill, int, pid, int, sig);
 COMPAT_SYSCALL_WRAP3(tgkill, int, tgid, int, pid, int, sig);
 COMPAT_SYSCALL_WRAP5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags);
-COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, int, tls_val);
+COMPAT_SYSCALL_WRAP5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls);
 COMPAT_SYSCALL_WRAP2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags);
 COMPAT_SYSCALL_WRAP4(prlimit64, pid_t, pid, unsigned int, resource, const struct rlimit64 __user *, new_rlim, struct rlimit64 __user *, old_rlim);
 COMPAT_SYSCALL_WRAP5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, int __user *, mnt_id, int, flag);
diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c
index 36f5d52775a9..9a60bd4d3c49 100644
--- a/drivers/misc/kgdbts.c
+++ b/drivers/misc/kgdbts.c
@@ -220,7 +220,7 @@ static unsigned long lookup_addr(char *arg)
 	else if (!strcmp(arg, "sys_open"))
 		addr = (unsigned long)do_sys_open;
 	else if (!strcmp(arg, "do_fork"))
-		addr = (unsigned long)do_fork;
+		addr = (unsigned long)_do_fork;
 	else if (!strcmp(arg, "hw_break_val"))
 		addr = (unsigned long)&hw_break_val;
 	addr = (unsigned long) dereference_function_descriptor((void *)addr);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6633e83e608a..93ed0b682adb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2556,8 +2556,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
 /* Remove the current tasks stale references to the old mm_struct */
 extern void mm_release(struct task_struct *, struct mm_struct *);
 
+#ifdef CONFIG_HAVE_COPY_THREAD_TLS
+extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
+			struct task_struct *, unsigned long);
+#else
 extern int copy_thread(unsigned long, unsigned long, unsigned long,
 			struct task_struct *);
+
+/* Architectures that haven't opted into copy_thread_tls get the tls argument
+ * via pt_regs, so ignore the tls argument passed via C. */
+static inline int copy_thread_tls(
+		unsigned long clone_flags, unsigned long sp, unsigned long arg,
+		struct task_struct *p, unsigned long tls)
+{
+	return copy_thread(clone_flags, sp, arg, p);
+}
+#endif
 extern void flush_thread(void);
 extern void exit_thread(void);
 
@@ -2576,6 +2590,7 @@ extern int do_execveat(int, struct filename *,
 		       const char __user * const __user *,
 		       const char __user * const __user *,
 		       int);
+extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 76d1e38aabe1..bb51becf23f8 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd);
 asmlinkage long sys_fork(void);
 asmlinkage long sys_vfork(void);
 #ifdef CONFIG_CLONE_BACKWARDS
-asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
+asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long,
 	       int __user *);
 #else
 #ifdef CONFIG_CLONE_BACKWARDS3
 asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
-			  int __user *, int);
+			  int __user *, unsigned long);
 #else
 asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
-	       int __user *, int);
+	       int __user *, unsigned long);
 #endif
 #endif
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 0bb88b555550..4c95cb34243c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1238,7 +1238,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 					unsigned long stack_size,
 					int __user *child_tidptr,
 					struct pid *pid,
-					int trace)
+					int trace,
+					unsigned long tls)
 {
 	int retval;
 	struct task_struct *p;
@@ -1447,7 +1448,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	retval = copy_io(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread(clone_flags, stack_start, stack_size, p);
+	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
@@ -1659,7 +1660,7 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
+	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task->pids);
 		init_idle(task, cpu);
@@ -1674,11 +1675,12 @@ struct task_struct *fork_idle(int cpu)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long do_fork(unsigned long clone_flags,
+long _do_fork(unsigned long clone_flags,
 	      unsigned long stack_start,
 	      unsigned long stack_size,
 	      int __user *parent_tidptr,
-	      int __user *child_tidptr)
+	      int __user *child_tidptr,
+	      unsigned long tls)
 {
 	struct task_struct *p;
 	int trace = 0;
@@ -1703,7 +1705,7 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	p = copy_process(clone_flags, stack_start, stack_size,
-			 child_tidptr, NULL, trace);
+			 child_tidptr, NULL, trace, tls);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1744,20 +1746,34 @@ long do_fork(unsigned long clone_flags,
 	return nr;
 }
 
+#ifndef CONFIG_HAVE_COPY_THREAD_TLS
+/* For compatibility with architectures that call do_fork directly rather than
+ * using the syscall entry points below. */
+long do_fork(unsigned long clone_flags,
+	      unsigned long stack_start,
+	      unsigned long stack_size,
+	      int __user *parent_tidptr,
+	      int __user *child_tidptr)
+{
+	return _do_fork(clone_flags, stack_start, stack_size,
+			parent_tidptr, child_tidptr, 0);
+}
+#endif
+
 /*
  * Create a kernel thread.
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
-	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-		(unsigned long)arg, NULL, NULL);
+	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
+		(unsigned long)arg, NULL, NULL, 0);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-	return do_fork(SIGCHLD, 0, 0, NULL, NULL);
+	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
 #else
 	/* can not support in nommu mode */
 	return -EINVAL;
@@ -1768,8 +1784,8 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-			0, NULL, NULL);
+	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
+			0, NULL, NULL, 0);
 }
 #endif
 
@@ -1777,27 +1793,27 @@ SYSCALL_DEFINE0(vfork)
 #ifdef CONFIG_CLONE_BACKWARDS
 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 int __user *, parent_tidptr,
-		 int, tls_val,
+		 unsigned long, tls,
 		 int __user *, child_tidptr)
 #elif defined(CONFIG_CLONE_BACKWARDS2)
 SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
 		 int __user *, parent_tidptr,
 		 int __user *, child_tidptr,
-		 int, tls_val)
+		 unsigned long, tls)
 #elif defined(CONFIG_CLONE_BACKWARDS3)
 SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
 		int, stack_size,
 		int __user *, parent_tidptr,
 		int __user *, child_tidptr,
-		int, tls_val)
+		unsigned long, tls)
 #else
 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 int __user *, parent_tidptr,
 		 int __user *, child_tidptr,
-		 int, tls_val)
+		 unsigned long, tls)
 #endif
 {
-	return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
 }
 #endif
 
-- 
cgit v1.2.3


From d43ff430f434d862db59582c0f1f02382a678036 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 25 Jun 2015 15:01:24 -0700
Subject: printk: guard the amount written per line by devkmsg_read()

This patchset updates netconsole so that it can emit messages with the
same header as used in /dev/kmsg which gives neconsole receiver full log
information which enables things like structured logging and detection
of lost messages.

This patch (of 7):

devkmsg_read() uses 8k buffer and assumes that the formatted output
message won't overrun which seems safe given LOG_LINE_MAX, the current use
of dict and the escaping method being used; however, we're planning to use
devkmsg formatting wider and accounting for the buffer size properly isn't
that complicated.

This patch defines CONSOLE_EXT_LOG_MAX as 8192 and updates devkmsg_read()
so that it limits output accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Miller <davem@davemloft.net>
Cc: Kay Sievers <kay@vrfy.org>
Reviewed-by: Petr Mladek <pmladek@suse.cz>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h |  2 ++
 kernel/printk/printk.c | 34 ++++++++++++++++++++++------------
 2 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9b30871c9149..58b1fec40d37 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -30,6 +30,8 @@ static inline const char *printk_skip_level(const char *buffer)
 	return buffer;
 }
 
+#define CONSOLE_EXT_LOG_MAX	8192
+
 /* printk's without a loglevel use this.. */
 #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c099b082cd02..a11549034e07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -505,6 +505,11 @@ int check_syslog_permissions(int type, bool from_file)
 	return security_syslog(type);
 }
 
+static void append_char(char **pp, char *e, char c)
+{
+	if (*pp < e)
+		*(*pp)++ = c;
+}
 
 /* /dev/kmsg - userspace message inject/listen interface */
 struct devkmsg_user {
@@ -512,7 +517,7 @@ struct devkmsg_user {
 	u32 idx;
 	enum log_flags prev;
 	struct mutex lock;
-	char buf[8192];
+	char buf[CONSOLE_EXT_LOG_MAX];
 };
 
 static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
@@ -570,6 +575,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 {
 	struct devkmsg_user *user = file->private_data;
 	struct printk_log *msg;
+	char *p, *e;
 	u64 ts_usec;
 	size_t i;
 	char cont = '-';
@@ -579,6 +585,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (!user)
 		return -EBADF;
 
+	p = user->buf;
+	e = user->buf + sizeof(user->buf);
+
 	ret = mutex_lock_interruptible(&user->lock);
 	if (ret)
 		return ret;
@@ -625,9 +634,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 		 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
 		cont = '+';
 
-	len = sprintf(user->buf, "%u,%llu,%llu,%c;",
-		      (msg->facility << 3) | msg->level,
-		      user->seq, ts_usec, cont);
+	p += scnprintf(p, e - p, "%u,%llu,%llu,%c;",
+		       (msg->facility << 3) | msg->level,
+		       user->seq, ts_usec, cont);
 	user->prev = msg->flags;
 
 	/* escape non-printable characters */
@@ -635,11 +644,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 		unsigned char c = log_text(msg)[i];
 
 		if (c < ' ' || c >= 127 || c == '\\')
-			len += sprintf(user->buf + len, "\\x%02x", c);
+			p += scnprintf(p, e - p, "\\x%02x", c);
 		else
-			user->buf[len++] = c;
+			append_char(&p, e, c);
 	}
-	user->buf[len++] = '\n';
+	append_char(&p, e, '\n');
 
 	if (msg->dict_len) {
 		bool line = true;
@@ -648,30 +657,31 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			unsigned char c = log_dict(msg)[i];
 
 			if (line) {
-				user->buf[len++] = ' ';
+				append_char(&p, e, ' ');
 				line = false;
 			}
 
 			if (c == '\0') {
-				user->buf[len++] = '\n';
+				append_char(&p, e, '\n');
 				line = true;
 				continue;
 			}
 
 			if (c < ' ' || c >= 127 || c == '\\') {
-				len += sprintf(user->buf + len, "\\x%02x", c);
+				p += scnprintf(p, e - p, "\\x%02x", c);
 				continue;
 			}
 
-			user->buf[len++] = c;
+			append_char(&p, e, c);
 		}
-		user->buf[len++] = '\n';
+		append_char(&p, e, '\n');
 	}
 
 	user->idx = log_next(user->idx);
 	user->seq++;
 	raw_spin_unlock_irq(&logbuf_lock);
 
+	len = p - user->buf;
 	if (len > count) {
 		ret = -EINVAL;
 		goto out;
-- 
cgit v1.2.3


From 6fe29354befe4c46eb308b662155d4d8017358e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 25 Jun 2015 15:01:30 -0700
Subject: printk: implement support for extended console drivers

printk log_buf keeps various metadata for each message including its
sequence number and timestamp.  The metadata is currently available only
through /dev/kmsg and stripped out before passed onto console drivers.  We
want this metadata to be available to console drivers too so that console
consumers can get full information including the metadata and dictionary,
which among other things can be used to detect whether messages got lost
in transit.

This patch implements support for extended console drivers.  Consoles can
indicate that they want extended messages by setting the new CON_EXTENDED
flag and they'll be fed messages formatted the same way as /dev/kmsg.

 "<level>,<sequnum>,<timestamp>,<contflag>;<message text>\n"

If extended consoles exist, in-kernel fragment assembly is disabled.  This
ensures that all messages emitted to consoles have full metadata including
sequence number.  The contflag carries enough information to reassemble
the fragments from the reader side trivially.  Note that this only affects
/dev/kmsg.  Regular console and /proc/kmsg outputs are not affected by
this change.

* Extended message formatting for console drivers is enabled iff there
  are registered extended consoles.

* Comment describing /dev/kmsg message format updated to add missing
  contflag field and help distinguishing variable from verbatim terms.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Miller <davem@davemloft.net>
Cc: Kay Sievers <kay@vrfy.org>
Reviewed-by: Petr Mladek <pmladek@suse.cz>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/ABI/testing/dev-kmsg |  9 ++++++
 include/linux/console.h            |  1 +
 kernel/printk/printk.c             | 66 +++++++++++++++++++++++++++++++++-----
 3 files changed, 68 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/dev-kmsg b/Documentation/ABI/testing/dev-kmsg
index bb820be48179..fff817efa508 100644
--- a/Documentation/ABI/testing/dev-kmsg
+++ b/Documentation/ABI/testing/dev-kmsg
@@ -98,4 +98,13 @@ Description:	The /dev/kmsg character device node provides userspace access
 		logic is used internally when messages are printed to the
 		console, /proc/kmsg or the syslog() syscall.
 
+		By default, kernel tries to avoid fragments by concatenating
+		when it can and fragments are rare; however, when extended
+		console support is enabled, the in-kernel concatenation is
+		disabled and /dev/kmsg output will contain more fragments. If
+		the log consumer performs concatenation, the end result
+		should be the same. In the future, the in-kernel concatenation
+		may be removed entirely and /dev/kmsg users are recommended to
+		implement fragment handling.
+
 Users:		dmesg(1), userspace kernel log consumers
diff --git a/include/linux/console.h b/include/linux/console.h
index 9f50fb413c11..bd194343c346 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -115,6 +115,7 @@ static inline int con_debug_leave(void)
 #define CON_BOOT	(8)
 #define CON_ANYTIME	(16) /* Safe to call when cpu is offline */
 #define CON_BRL		(32) /* Used for a braille device */
+#define CON_EXTENDED	(64) /* Use the extended output format a la /dev/kmsg */
 
 struct console {
 	char	name[16];
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 51ce4f1838b3..ae980dc3ac1e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -84,6 +84,18 @@ static struct lockdep_map console_lock_dep_map = {
 };
 #endif
 
+/*
+ * Number of registered extended console drivers.
+ *
+ * If extended consoles are present, in-kernel cont reassembly is disabled
+ * and each fragment is stored as a separate log entry with proper
+ * continuation flag so that every emitted message has full metadata.  This
+ * doesn't change the result for regular consoles or /proc/kmsg.  For
+ * /dev/kmsg, as long as the reader concatenates messages according to
+ * consecutive continuation flags, the end result should be the same too.
+ */
+static int nr_ext_console_drivers;
+
 /*
  * Helper macros to handle lockdep when locking/unlocking console_sem. We use
  * macros instead of functions so that _RET_IP_ contains useful information.
@@ -195,7 +207,7 @@ static int console_may_schedule;
  * need to be changed in the future, when the requirements change.
  *
  * /dev/kmsg exports the structured data in the following line format:
- *   "level,sequnum,timestamp;<message text>\n"
+ *   "<level>,<sequnum>,<timestamp>,<contflag>;<message text>\n"
  *
  * The optional key/value pairs are attached as continuation lines starting
  * with a space character and terminated by a newline. All possible
@@ -1417,7 +1429,9 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
  * log_buf[start] to log_buf[end - 1].
  * The console_lock must be held.
  */
-static void call_console_drivers(int level, const char *text, size_t len)
+static void call_console_drivers(int level,
+				 const char *ext_text, size_t ext_len,
+				 const char *text, size_t len)
 {
 	struct console *con;
 
@@ -1438,7 +1452,10 @@ static void call_console_drivers(int level, const char *text, size_t len)
 		if (!cpu_online(smp_processor_id()) &&
 		    !(con->flags & CON_ANYTIME))
 			continue;
-		con->write(con, text, len);
+		if (con->flags & CON_EXTENDED)
+			con->write(con, ext_text, ext_len);
+		else
+			con->write(con, text, len);
 	}
 }
 
@@ -1581,8 +1598,12 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
 	if (cont.len && cont.flushed)
 		return false;
 
-	if (cont.len + len > sizeof(cont.buf)) {
-		/* the line gets too long, split it up in separate records */
+	/*
+	 * If ext consoles are present, flush and skip in-kernel
+	 * continuation.  See nr_ext_console_drivers definition.  Also, if
+	 * the line gets too long, split it up in separate records.
+	 */
+	if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) {
 		cont_flush(LOG_CONT);
 		return false;
 	}
@@ -1917,9 +1938,19 @@ static struct cont {
 	u8 level;
 	bool flushed:1;
 } cont;
+static char *log_text(const struct printk_log *msg) { return NULL; }
+static char *log_dict(const struct printk_log *msg) { return NULL; }
 static struct printk_log *log_from_idx(u32 idx) { return NULL; }
 static u32 log_next(u32 idx) { return 0; }
-static void call_console_drivers(int level, const char *text, size_t len) {}
+static ssize_t msg_print_ext_header(char *buf, size_t size,
+				    struct printk_log *msg, u64 seq,
+				    enum log_flags prev_flags) { return 0; }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+				  char *dict, size_t dict_len,
+				  char *text, size_t text_len) { return 0; }
+static void call_console_drivers(int level,
+				 const char *ext_text, size_t ext_len,
+				 const char *text, size_t len) {}
 static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
 			     bool syslog, char *buf, size_t size) { return 0; }
 static size_t cont_print_text(char *text, size_t size) { return 0; }
@@ -2172,7 +2203,7 @@ static void console_cont_flush(char *text, size_t size)
 	len = cont_print_text(text, size);
 	raw_spin_unlock(&logbuf_lock);
 	stop_critical_timings();
-	call_console_drivers(cont.level, text, len);
+	call_console_drivers(cont.level, NULL, 0, text, len);
 	start_critical_timings();
 	local_irq_restore(flags);
 	return;
@@ -2196,6 +2227,7 @@ out:
  */
 void console_unlock(void)
 {
+	static char ext_text[CONSOLE_EXT_LOG_MAX];
 	static char text[LOG_LINE_MAX + PREFIX_MAX];
 	static u64 seen_seq;
 	unsigned long flags;
@@ -2214,6 +2246,7 @@ void console_unlock(void)
 again:
 	for (;;) {
 		struct printk_log *msg;
+		size_t ext_len = 0;
 		size_t len;
 		int level;
 
@@ -2259,13 +2292,22 @@ skip:
 		level = msg->level;
 		len += msg_print_text(msg, console_prev, false,
 				      text + len, sizeof(text) - len);
+		if (nr_ext_console_drivers) {
+			ext_len = msg_print_ext_header(ext_text,
+						sizeof(ext_text),
+						msg, console_seq, console_prev);
+			ext_len += msg_print_ext_body(ext_text + ext_len,
+						sizeof(ext_text) - ext_len,
+						log_dict(msg), msg->dict_len,
+						log_text(msg), msg->text_len);
+		}
 		console_idx = log_next(console_idx);
 		console_seq++;
 		console_prev = msg->flags;
 		raw_spin_unlock(&logbuf_lock);
 
 		stop_critical_timings();	/* don't trace print latency */
-		call_console_drivers(level, text, len);
+		call_console_drivers(level, ext_text, ext_len, text, len);
 		start_critical_timings();
 		local_irq_restore(flags);
 	}
@@ -2521,6 +2563,11 @@ void register_console(struct console *newcon)
 		newcon->next = console_drivers->next;
 		console_drivers->next = newcon;
 	}
+
+	if (newcon->flags & CON_EXTENDED)
+		if (!nr_ext_console_drivers++)
+			pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n");
+
 	if (newcon->flags & CON_PRINTBUFFER) {
 		/*
 		 * console_unlock(); will print out the buffered messages
@@ -2593,6 +2640,9 @@ int unregister_console(struct console *console)
 		}
 	}
 
+	if (!res && (console->flags & CON_EXTENDED))
+		nr_ext_console_drivers--;
+
 	/*
 	 * If this isn't the last console and it has CON_CONSDEV set, we
 	 * need to set it on the next preferred console.
-- 
cgit v1.2.3


From 3ea4331c60be3eee4c97e5ddabad95399f879b76 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 25 Jun 2015 15:01:47 -0700
Subject: check_syslog_permissions() cleanup

Patch fixes drawbacks in heck_syslog_permissions() noticed by AKPM:
"from_file handling makes me cry.

That's not a boolean - it's an enumerated value with two values
currently defined.

But the code in check_syslog_permissions() treats it as a boolean and
also hardwires the knowledge that SYSLOG_FROM_PROC == 1 (or == `true`).

And the name is wrong: it should be called from_proc to match
SYSLOG_FROM_PROC."

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Josh Boyer <jwboyer@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/syslog.h |  6 +++---
 kernel/printk/printk.c | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/syslog.h b/include/linux/syslog.h
index 4b7b875a7ce1..c3a7f0cc3a27 100644
--- a/include/linux/syslog.h
+++ b/include/linux/syslog.h
@@ -47,12 +47,12 @@
 #define SYSLOG_FROM_READER           0
 #define SYSLOG_FROM_PROC             1
 
-int do_syslog(int type, char __user *buf, int count, bool from_file);
+int do_syslog(int type, char __user *buf, int count, int source);
 
 #ifdef CONFIG_PRINTK
-int check_syslog_permissions(int type, bool from_file);
+int check_syslog_permissions(int type, int source);
 #else
-static inline int check_syslog_permissions(int type, bool from_file)
+static inline int check_syslog_permissions(int type, int source)
 {
 	return 0;
 }
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 45fa8c88ac47..de553849f3ac 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -489,13 +489,13 @@ static int syslog_action_restricted(int type)
 	       type != SYSLOG_ACTION_SIZE_BUFFER;
 }
 
-int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, int source)
 {
 	/*
 	 * If this is from /proc/kmsg and we've already opened it, then we've
 	 * already done the capabilities checks at open time.
 	 */
-	if (from_file && type != SYSLOG_ACTION_OPEN)
+	if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN)
 		goto ok;
 
 	if (syslog_action_restricted(type)) {
@@ -1290,13 +1290,13 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 	return len;
 }
 
-int do_syslog(int type, char __user *buf, int len, bool from_file)
+int do_syslog(int type, char __user *buf, int len, int source)
 {
 	bool clear = false;
 	static int saved_console_loglevel = LOGLEVEL_DEFAULT;
 	int error;
 
-	error = check_syslog_permissions(type, from_file);
+	error = check_syslog_permissions(type, source);
 	if (error)
 		goto out;
 
@@ -1379,7 +1379,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			syslog_prev = 0;
 			syslog_partial = 0;
 		}
-		if (from_file) {
+		if (source == SYSLOG_FROM_PROC) {
 			/*
 			 * Short-cut for poll(/"proc/kmsg") which simply checks
 			 * for pending data, not the size; return the count of
-- 
cgit v1.2.3


From 94df290404cd0da8016698bf3f398410f29d9a64 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 25 Jun 2015 15:02:22 -0700
Subject: lib/string.c: introduce strreplace()

Strings are sometimes sanitized by replacing a certain character (often
'/') by another (often '!').  In a few places, this is done the same way
Schlemiel the Painter would do it.  Others are slightly smarter but still
do multiple strchr() calls.  Introduce strreplace() to do this using a
single function call and a single pass over the string.

One would expect the return value to be one of three things: void, s, or
the number of replacements made.  I chose the fourth, returning a pointer
to the end of the string.  This is more likely to be useful (for example
allowing the caller to avoid a strlen call).

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Neil Brown <neilb@suse.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Joe Perches <joe@perches.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/string.h |  1 +
 lib/string.c           | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/string.h b/include/linux/string.h
index e40099e585c9..a8d90db9c4b0 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -111,6 +111,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 void *memchr_inv(const void *s, int c, size_t n);
+char *strreplace(char *s, char old, char new);
 
 extern void kfree_const(const void *x);
 
diff --git a/lib/string.c b/lib/string.c
index bb3d4b6993c4..13d1e84ddb80 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -849,3 +849,20 @@ void *memchr_inv(const void *start, int c, size_t bytes)
 	return check_bytes8(start, value, bytes % 8);
 }
 EXPORT_SYMBOL(memchr_inv);
+
+/**
+ * strreplace - Replace all occurrences of character in string.
+ * @s: The string to operate on.
+ * @old: The character being replaced.
+ * @new: The character @old is replaced with.
+ *
+ * Returns pointer to the nul byte at the end of @s.
+ */
+char *strreplace(char *s, char old, char new)
+{
+	for (; *s; ++s)
+		if (*s == old)
+			*s = new;
+	return s;
+}
+EXPORT_SYMBOL(strreplace);
-- 
cgit v1.2.3


From 78d8e58a086b214dddf1fd463e20a7e1d82d7866 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 26 Jun 2015 10:01:13 -0400
Subject: Revert "block, dm: don't copy bios for request clones"

This reverts commit 5f1b670d0bef508a5554d92525f5f6d00d640b38.

Justification for revert as reported in this dm-devel post:
https://www.redhat.com/archives/dm-devel/2015-June/msg00160.html

this change should not be pushed to mainline yet.

Firstly, Christoph has a newer version of the patch that fixes silent
data corruption problem:
  https://www.redhat.com/archives/dm-devel/2015-May/msg00229.html

And the new version still depends on LLDDs to always complete requests
to the end when error happens, while block API doesn't enforce such a
requirement. If the assumption is ever broken, the inconsistency between
request and bio (e.g. rq->__sector and rq->bio) will cause silent data
corruption:
  https://www.redhat.com/archives/dm-devel/2015-June/msg00022.html

Reported-by: Junichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 block/blk-core.c          |  94 ++++++++++++++++++++++---
 drivers/md/dm-table.c     |  25 +++----
 drivers/md/dm.c           | 171 +++++++++++++++++++++++++++++++++++-----------
 drivers/md/dm.h           |   5 +-
 include/linux/blk_types.h |   2 -
 include/linux/blkdev.h    |   6 +-
 6 files changed, 230 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index f6ab750060fe..706dd087022a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	if (error && !(rq->cmd_flags & REQ_CLONE))
+	if (error)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
@@ -128,8 +128,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 &&
-	    !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
+	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
 		bio_endio(bio, error);
 }
 
@@ -2913,22 +2912,95 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
 
-void blk_rq_prep_clone(struct request *dst, struct request *src)
+/**
+ * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
+ * @rq: the clone request to be cleaned up
+ *
+ * Description:
+ *     Free all bios in @rq for a cloned request.
+ */
+void blk_rq_unprep_clone(struct request *rq)
+{
+	struct bio *bio;
+
+	while ((bio = rq->bio) != NULL) {
+		rq->bio = bio->bi_next;
+
+		bio_put(bio);
+	}
+}
+EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
+
+/*
+ * Copy attributes of the original request to the clone request.
+ * The actual data parts (e.g. ->cmd, ->sense) are not copied.
+ */
+static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
-	dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
+	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
 	dst->ioprio = src->ioprio;
 	dst->extra_len = src->extra_len;
-	dst->bio = src->bio;
-	dst->biotail = src->biotail;
-	dst->cmd = src->cmd;
-	dst->cmd_len = src->cmd_len;
-	dst->sense = src->sense;
+}
+
+/**
+ * blk_rq_prep_clone - Helper function to setup clone request
+ * @rq: the request to be setup
+ * @rq_src: original request to be cloned
+ * @bs: bio_set that bios for clone are allocated from
+ * @gfp_mask: memory allocation mask for bio
+ * @bio_ctr: setup function to be called for each clone bio.
+ *           Returns %0 for success, non %0 for failure.
+ * @data: private data to be passed to @bio_ctr
+ *
+ * Description:
+ *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
+ *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
+ *     are not copied, and copying such parts is the caller's responsibility.
+ *     Also, pages which the original bios are pointing to are not copied
+ *     and the cloned bios just point same pages.
+ *     So cloned bios must be completed before original bios, which means
+ *     the caller must complete @rq before @rq_src.
+ */
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+		      struct bio_set *bs, gfp_t gfp_mask,
+		      int (*bio_ctr)(struct bio *, struct bio *, void *),
+		      void *data)
+{
+	struct bio *bio, *bio_src;
+
+	if (!bs)
+		bs = fs_bio_set;
+
+	__rq_for_each_bio(bio_src, rq_src) {
+		bio = bio_clone_fast(bio_src, gfp_mask, bs);
+		if (!bio)
+			goto free_and_out;
+
+		if (bio_ctr && bio_ctr(bio, bio_src, data))
+			goto free_and_out;
+
+		if (rq->bio) {
+			rq->biotail->bi_next = bio;
+			rq->biotail = bio;
+		} else
+			rq->bio = rq->biotail = bio;
+	}
+
+	__blk_rq_prep_clone(rq, rq_src);
+
+	return 0;
+
+free_and_out:
+	if (bio)
+		bio_put(bio);
+	blk_rq_unprep_clone(rq);
+
+	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a5f94125ad01..16ba55ad7089 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -942,28 +942,21 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
+	struct dm_target *tgt;
 	unsigned i;
 
-	switch (type) {
-	case DM_TYPE_BIO_BASED:
-		for (i = 0; i < t->num_targets; i++) {
-			struct dm_target *tgt = t->targets + i;
-
-			per_bio_data_size = max(per_bio_data_size,
-						tgt->per_bio_data_size);
-		}
-		t->mempools = dm_alloc_bio_mempools(t->integrity_supported,
-						    per_bio_data_size);
-		break;
-	case DM_TYPE_REQUEST_BASED:
-	case DM_TYPE_MQ_REQUEST_BASED:
-		t->mempools = dm_alloc_rq_mempools(md, type);
-		break;
-	default:
+	if (unlikely(type == DM_TYPE_NONE)) {
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
 
+	if (type == DM_TYPE_BIO_BASED)
+		for (i = 0; i < t->num_targets; i++) {
+			tgt = t->targets + i;
+			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
+		}
+
+	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 492181e16c69..9d942aef0f75 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -993,6 +993,57 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
 
+/*
+ * Partial completion handling for request-based dm
+ */
+static void end_clone_bio(struct bio *clone, int error)
+{
+	struct dm_rq_clone_bio_info *info =
+		container_of(clone, struct dm_rq_clone_bio_info, clone);
+	struct dm_rq_target_io *tio = info->tio;
+	struct bio *bio = info->orig;
+	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
+
+	bio_put(clone);
+
+	if (tio->error)
+		/*
+		 * An error has already been detected on the request.
+		 * Once error occurred, just let clone->end_io() handle
+		 * the remainder.
+		 */
+		return;
+	else if (error) {
+		/*
+		 * Don't notice the error to the upper layer yet.
+		 * The error handling decision is made by the target driver,
+		 * when the request is completed.
+		 */
+		tio->error = error;
+		return;
+	}
+
+	/*
+	 * I/O for the bio successfully completed.
+	 * Notice the data completion to the upper layer.
+	 */
+
+	/*
+	 * bios are processed from the head of the list.
+	 * So the completing bio should always be rq->bio.
+	 * If it's not, something wrong is happening.
+	 */
+	if (tio->orig->bio != bio)
+		DMERR("bio completion is going in the middle of the request");
+
+	/*
+	 * Update the original request.
+	 * Do not use blk_end_request() here, because it may complete
+	 * the original request before the clone, and break the ordering.
+	 */
+	blk_update_request(tio->orig, 0, nr_bytes);
+}
+
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
@@ -1050,6 +1101,8 @@ static void free_rq_clone(struct request *clone)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 
+	blk_rq_unprep_clone(clone);
+
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
@@ -1784,13 +1837,39 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 		dm_complete_request(rq, r);
 }
 
-static void setup_clone(struct request *clone, struct request *rq,
-		        struct dm_rq_target_io *tio)
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+				 void *data)
 {
-	blk_rq_prep_clone(clone, rq);
+	struct dm_rq_target_io *tio = data;
+	struct dm_rq_clone_bio_info *info =
+		container_of(bio, struct dm_rq_clone_bio_info, clone);
+
+	info->orig = bio_orig;
+	info->tio = tio;
+	bio->bi_end_io = end_clone_bio;
+
+	return 0;
+}
+
+static int setup_clone(struct request *clone, struct request *rq,
+		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+	int r;
+
+	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+			      dm_rq_bio_constructor, tio);
+	if (r)
+		return r;
+
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
+
 	tio->clone = clone;
+
+	return 0;
 }
 
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
@@ -1811,7 +1890,12 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
-	setup_clone(clone, rq, tio);
+	if (setup_clone(clone, rq, tio, gfp_mask)) {
+		/* -ENOMEM */
+		if (alloc_clone)
+			free_clone_request(md, clone);
+		return NULL;
+	}
 
 	return clone;
 }
@@ -1905,7 +1989,11 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (r != DM_MAPIO_REMAPPED)
 			return r;
-		setup_clone(clone, rq, tio);
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+			/* -ENOMEM */
+			ti->type->release_clone_rq(clone);
+			return DM_MAPIO_REQUEUE;
+		}
 	}
 
 	switch (r) {
@@ -2375,6 +2463,8 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
+	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
@@ -3481,23 +3571,48 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
-					     unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools;
-	unsigned int pool_size = dm_get_reserved_bio_based_ios();
+	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	struct kmem_cache *cachep = NULL;
+	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
-	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
 		return NULL;
 
-	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
-		offsetof(struct dm_target_io, clone);
+	type = filter_md_type(type, md);
 
-	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
-	if (!pools->io_pool)
-		goto out;
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
+		cachep = _io_cache;
+		pool_size = dm_get_reserved_bio_based_ios();
+		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+		break;
+	case DM_TYPE_REQUEST_BASED:
+		cachep = _rq_tio_cache;
+		pool_size = dm_get_reserved_rq_based_ios();
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+		/* fall through to setup remaining rq-based pools */
+	case DM_TYPE_MQ_REQUEST_BASED:
+		if (!pool_size)
+			pool_size = dm_get_reserved_rq_based_ios();
+		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
+		/* per_bio_data_size is not used. See __bind_mempools(). */
+		WARN_ON(per_bio_data_size != 0);
+		break;
+	default:
+		BUG();
+	}
+
+	if (cachep) {
+		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+		if (!pools->io_pool)
+			goto out;
+	}
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3507,34 +3622,10 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 		goto out;
 
 	return pools;
-out:
-	dm_free_md_mempools(pools);
-	return NULL;
-}
-
-struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
-					    unsigned type)
-{
-	unsigned int pool_size = dm_get_reserved_rq_based_ios();
-	struct dm_md_mempools *pools;
-
-	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	if (!pools)
-		return NULL;
 
-	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-	}
-
-	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
-	if (!pools->io_pool)
-		goto out;
-
-	return pools;
 out:
 	dm_free_md_mempools(pools);
+
 	return NULL;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e6e66d087b26..6123c2bf9150 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -222,9 +222,8 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
-					     unsigned per_bio_data_size);
-struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type);
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6ab9d12d1f17..7303b3405520 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -192,7 +192,6 @@ enum rq_flag_bits {
 	__REQ_HASHED,		/* on IO scheduler merge hash */
 	__REQ_MQ_INFLIGHT,	/* track inflight for MQ */
 	__REQ_NO_TIMEOUT,	/* requests may never expire */
-	__REQ_CLONE,		/* cloned bios */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -247,6 +246,5 @@ enum rq_flag_bits {
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 #define REQ_NO_TIMEOUT		(1ULL << __REQ_NO_TIMEOUT)
-#define REQ_CLONE		(1ULL << __REQ_CLONE)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 776d2ee43ba6..41c0fb573dff 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -775,7 +775,11 @@ extern void blk_add_request_payload(struct request *rq, struct page *page,
 		unsigned int len);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
-extern void blk_rq_prep_clone(struct request *rq, struct request *rq_src);
+extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+			     struct bio_set *bs, gfp_t gfp_mask,
+			     int (*bio_ctr)(struct bio *, struct bio *, void *),
+			     void *data);
+extern void blk_rq_unprep_clone(struct request *rq);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
-- 
cgit v1.2.3


From 5212e11fde4d40fa627668b4f2222d20db488f71 Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Thu, 25 Jun 2015 04:20:32 -0400
Subject: nd_btt: atomic sector updates

BTT stands for Block Translation Table, and is a way to provide power
fail sector atomicity semantics for block devices that have the ability
to perform byte granularity IO. It relies on the capability of libnvdimm
namespace devices to do byte aligned IO.

The BTT works as a stacked blocked device, and reserves a chunk of space
from the backing device for its accounting metadata. It is a bio-based
driver because all IO is done synchronously, and there is no queuing or
asynchronous completions at either the device or the driver level.

The BTT uses 'lanes' to index into various 'on-disk' data structures,
and lanes also act as a synchronization mechanism in case there are more
CPUs than available lanes. We did a comparison between two lane lock
strategies - first where we kept an atomic counter around that tracked
which was the last lane that was used, and 'our' lane was determined by
atomically incrementing that. That way, for the nr_cpus > nr_lanes case,
theoretically, no CPU would be blocked waiting for a lane. The other
strategy was to use the cpu number we're scheduled on to and hash it to
a lane number. Theoretically, this could block an IO that could've
otherwise run using a different, free lane. But some fio workloads
showed that the direct cpu -> lane hash performed faster than tracking
'last lane' - my reasoning is the cache thrash caused by moving the
atomic variable made that approach slower than simply waiting out the
in-progress IO. This supports the conclusion that the driver can be a
very simple bio-based one that does synchronous IOs instead of queuing.

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Neil Brown <neilb@suse.de>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
[jmoyer: fix nmi watchdog timeout in btt_map_init]
[jmoyer: move btt initialization to module load path]
[jmoyer: fix memory leak in the btt initialization path]
[jmoyer: Don't overwrite corrupted arenas]
Signed-off-by: Vishal Verma <vishal.l.verma@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/nvdimm/btt.txt    |  273 ++++++++
 drivers/acpi/nfit.c             |    1 +
 drivers/nvdimm/Kconfig          |   28 +-
 drivers/nvdimm/Makefile         |    3 +
 drivers/nvdimm/btt.c            | 1371 +++++++++++++++++++++++++++++++++++++++
 drivers/nvdimm/btt.h            |  141 ++++
 drivers/nvdimm/btt_devs.c       |    3 +-
 drivers/nvdimm/namespace_devs.c |   24 +
 drivers/nvdimm/nd.h             |   22 +-
 drivers/nvdimm/pmem.c           |   14 +-
 drivers/nvdimm/region.c         |   12 +
 drivers/nvdimm/region_devs.c    |   82 ++-
 include/linux/libnvdimm.h       |    1 +
 13 files changed, 1950 insertions(+), 25 deletions(-)
 create mode 100644 Documentation/nvdimm/btt.txt
 create mode 100644 drivers/nvdimm/btt.c

(limited to 'include/linux')

diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt
new file mode 100644
index 000000000000..95134d5ec4a0
--- /dev/null
+++ b/Documentation/nvdimm/btt.txt
@@ -0,0 +1,273 @@
+BTT - Block Translation Table
+=============================
+
+
+1. Introduction
+---------------
+
+Persistent memory based storage is able to perform IO at byte (or more
+accurately, cache line) granularity. However, we often want to expose such
+storage as traditional block devices. The block drivers for persistent memory
+will do exactly this. However, they do not provide any atomicity guarantees.
+Traditional SSDs typically provide protection against torn sectors in hardware,
+using stored energy in capacitors to complete in-flight block writes, or perhaps
+in firmware. We don't have this luxury with persistent memory - if a write is in
+progress, and we experience a power failure, the block will contain a mix of old
+and new data. Applications may not be prepared to handle such a scenario.
+
+The Block Translation Table (BTT) provides atomic sector update semantics for
+persistent memory devices, so that applications that rely on sector writes not
+being torn can continue to do so. The BTT manifests itself as a stacked block
+device, and reserves a portion of the underlying storage for its metadata. At
+the heart of it, is an indirection table that re-maps all the blocks on the
+volume. It can be thought of as an extremely simple file system that only
+provides atomic sector updates.
+
+
+2. Static Layout
+----------------
+
+The underlying storage on which a BTT can be laid out is not limited in any way.
+The BTT, however, splits the available space into chunks of up to 512 GiB,
+called "Arenas".
+
+Each arena follows the same layout for its metadata, and all references in an
+arena are internal to it (with the exception of one field that points to the
+next arena). The following depicts the "On-disk" metadata layout:
+
+
+  Backing Store     +------->  Arena
++---------------+   |   +------------------+
+|               |   |   | Arena info block |
+|    Arena 0    +---+   |       4K         |
+|     512G      |       +------------------+
+|               |       |                  |
++---------------+       |                  |
+|               |       |                  |
+|    Arena 1    |       |   Data Blocks    |
+|     512G      |       |                  |
+|               |       |                  |
++---------------+       |                  |
+|       .       |       |                  |
+|       .       |       |                  |
+|       .       |       |                  |
+|               |       |                  |
+|               |       |                  |
++---------------+       +------------------+
+                        |                  |
+                        |     BTT Map      |
+                        |                  |
+                        |                  |
+                        +------------------+
+                        |                  |
+                        |     BTT Flog     |
+                        |                  |
+                        +------------------+
+                        | Info block copy  |
+                        |       4K         |
+                        +------------------+
+
+
+3. Theory of Operation
+----------------------
+
+
+a. The BTT Map
+--------------
+
+The map is a simple lookup/indirection table that maps an LBA to an internal
+block. Each map entry is 32 bits. The two most significant bits are special
+flags, and the remaining form the internal block number.
+
+Bit      Description
+31     : TRIM flag - marks if the block was trimmed or discarded
+30     : ERROR flag - marks an error block. Cleared on write.
+29 - 0 : Mappings to internal 'postmap' blocks
+
+
+Some of the terminology that will be subsequently used:
+
+External LBA  : LBA as made visible to upper layers.
+ABA           : Arena Block Address - Block offset/number within an arena
+Premap ABA    : The block offset into an arena, which was decided upon by range
+		checking the External LBA
+Postmap ABA   : The block number in the "Data Blocks" area obtained after
+		indirection from the map
+nfree	      : The number of free blocks that are maintained at any given time.
+		This is the number of concurrent writes that can happen to the
+		arena.
+
+
+For example, after adding a BTT, we surface a disk of 1024G. We get a read for
+the external LBA at 768G. This falls into the second arena, and of the 512G
+worth of blocks that this arena contributes, this block is at 256G. Thus, the
+premap ABA is 256G. We now refer to the map, and find out the mapping for block
+'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64.
+
+
+b. The BTT Flog
+---------------
+
+The BTT provides sector atomicity by making every write an "allocating write",
+i.e. Every write goes to a "free" block. A running list of free blocks is
+maintained in the form of the BTT flog. 'Flog' is a combination of the words
+"free list" and "log". The flog contains 'nfree' entries, and an entry contains:
+
+lba     : The premap ABA that is being written to
+old_map : The old postmap ABA - after 'this' write completes, this will be a
+	  free block.
+new_map : The new postmap ABA. The map will up updated to reflect this
+	  lba->postmap_aba mapping, but we log it here in case we have to
+	  recover.
+seq	: Sequence number to mark which of the 2 sections of this flog entry is
+	  valid/newest. It cycles between 01->10->11->01 (binary) under normal
+	  operation, with 00 indicating an uninitialized state.
+lba'	: alternate lba entry
+old_map': alternate old postmap entry
+new_map': alternate new postmap entry
+seq'	: alternate sequence number.
+
+Each of the above fields is 32-bit, making one entry 16 bytes. Flog updates are
+done such that for any entry being written, it:
+a. overwrites the 'old' section in the entry based on sequence numbers
+b. writes the new entry such that the sequence number is written last.
+
+
+c. The concept of lanes
+-----------------------
+
+While 'nfree' describes the number of concurrent IOs an arena can process
+concurrently, 'nlanes' is the number of IOs the BTT device as a whole can
+process.
+ nlanes = min(nfree, num_cpus)
+A lane number is obtained at the start of any IO, and is used for indexing into
+all the on-disk and in-memory data structures for the duration of the IO. It is
+protected by a spinlock.
+
+
+d. In-memory data structure: Read Tracking Table (RTT)
+------------------------------------------------------
+
+Consider a case where we have two threads, one doing reads and the other,
+writes. We can hit a condition where the writer thread grabs a free block to do
+a new IO, but the (slow) reader thread is still reading from it. In other words,
+the reader consulted a map entry, and started reading the corresponding block. A
+writer started writing to the same external LBA, and finished the write updating
+the map for that external LBA to point to its new postmap ABA. At this point the
+internal, postmap block that the reader is (still) reading has been inserted
+into the list of free blocks. If another write comes in for the same LBA, it can
+grab this free block, and start writing to it, causing the reader to read
+incorrect data. To prevent this, we introduce the RTT.
+
+The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts
+into rtt[lane_number], the postmap ABA it is reading, and clears it after the
+read is complete. Every writer thread, after grabbing a free block, checks the
+RTT for its presence. If the postmap free block is in the RTT, it waits till the
+reader clears the RTT entry, and only then starts writing to it.
+
+
+e. In-memory data structure: map locks
+--------------------------------------
+
+Consider a case where two writer threads are writing to the same LBA. There can
+be a race in the following sequence of steps:
+
+free[lane] = map[premap_aba]
+map[premap_aba] = postmap_aba
+
+Both threads can update their respective free[lane] with the same old, freed
+postmap_aba. This has made the layout inconsistent by losing a free entry, and
+at the same time, duplicating another free entry for two lanes.
+
+To solve this, we could have a single map lock (per arena) that has to be taken
+before performing the above sequence, but we feel that could be too contentious.
+Instead we use an array of (nfree) map_locks that is indexed by
+(premap_aba modulo nfree).
+
+
+f. Reconstruction from the Flog
+-------------------------------
+
+On startup, we analyze the BTT flog to create our list of free blocks. We walk
+through all the entries, and for each lane, of the set of two possible
+'sections', we always look at the most recent one only (based on the sequence
+number). The reconstruction rules/steps are simple:
+- Read map[log_entry.lba].
+- If log_entry.new matches the map entry, then log_entry.old is free.
+- If log_entry.new does not match the map entry, then log_entry.new is free.
+  (This case can only be caused by power-fails/unsafe shutdowns)
+
+
+g. Summarizing - Read and Write flows
+-------------------------------------
+
+Read:
+
+1.  Convert external LBA to arena number + pre-map ABA
+2.  Get a lane (and take lane_lock)
+3.  Read map to get the entry for this pre-map ABA
+4.  Enter post-map ABA into RTT[lane]
+5.  If TRIM flag set in map, return zeroes, and end IO (go to step 8)
+6.  If ERROR flag set in map, end IO with EIO (go to step 8)
+7.  Read data from this block
+8.  Remove post-map ABA entry from RTT[lane]
+9.  Release lane (and lane_lock)
+
+Write:
+
+1.  Convert external LBA to Arena number + pre-map ABA
+2.  Get a lane (and take lane_lock)
+3.  Use lane to index into in-memory free list and obtain a new block, next flog
+        index, next sequence number
+4.  Scan the RTT to check if free block is present, and spin/wait if it is.
+5.  Write data to this free block
+6.  Read map to get the existing post-map ABA entry for this pre-map ABA
+7.  Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num]
+8.  Write new post-map ABA into map.
+9.  Write old post-map entry into the free list
+10. Calculate next sequence number and write into the free list entry
+11. Release lane (and lane_lock)
+
+
+4. Error Handling
+=================
+
+An arena would be in an error state if any of the metadata is corrupted
+irrecoverably, either due to a bug or a media error. The following conditions
+indicate an error:
+- Info block checksum does not match (and recovering from the copy also fails)
+- All internal available blocks are not uniquely and entirely addressed by the
+  sum of mapped blocks and free blocks (from the BTT flog).
+- Rebuilding free list from the flog reveals missing/duplicate/impossible
+  entries
+- A map entry is out of bounds
+
+If any of these error conditions are encountered, the arena is put into a read
+only state using a flag in the info block.
+
+
+5. In-kernel usage
+==================
+
+Any block driver that supports byte granularity IO to the storage may register
+with the BTT. It will have to provide the rw_bytes interface in its
+block_device_operations struct:
+
+	int (*rw_bytes)(struct gendisk *, void *, size_t, off_t, int rw);
+
+It may register with the BTT after it adds its own gendisk, using btt_init:
+
+	struct btt *btt_init(struct gendisk *disk, unsigned long long rawsize,
+			u32 lbasize, u8 uuid[], int maxlane);
+
+note that maxlane is the maximum amount of concurrency the driver wishes to
+allow the BTT to use.
+
+The BTT 'disk' appears as a stacked block device that grabs the underlying block
+device in the O_EXCL mode.
+
+When the driver wishes to remove the backing disk, it should similarly call
+btt_fini using the same struct btt* handle that was provided to it by btt_init.
+
+	void btt_fini(struct btt *btt);
+
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 35af6f7f0abd..fc38b49eff7d 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -902,6 +902,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 		} else {
 			nd_mapping->size = nfit_mem->bdw->capacity;
 			nd_mapping->start = nfit_mem->bdw->start_address;
+			ndr_desc->num_lanes = nfit_mem->bdw->windows;
 			blk_valid = 1;
 		}
 
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 5680e8e7a7aa..204ee0796411 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -8,11 +8,11 @@ menuconfig LIBNVDIMM
 	  NFIT, or otherwise can discover NVDIMM resources, a libnvdimm
 	  bus is registered to advertise PMEM (persistent memory)
 	  namespaces (/dev/pmemX) and BLK (sliding mmio window(s))
-	  namespaces (/dev/ndX). A PMEM namespace refers to a memory
-	  resource that may span multiple DIMMs and support DAX (see
-	  CONFIG_DAX).  A BLK namespace refers to an NVDIMM control
-	  region which exposes an mmio register set for windowed
-	  access mode to non-volatile memory.
+	  namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a
+	  memory resource that may span multiple DIMMs and support DAX
+	  (see CONFIG_DAX).  A BLK namespace refers to an NVDIMM control
+	  region which exposes an mmio register set for windowed access
+	  mode to non-volatile memory.
 
 if LIBNVDIMM
 
@@ -20,6 +20,7 @@ config BLK_DEV_PMEM
 	tristate "PMEM: Persistent memory block device support"
 	default LIBNVDIMM
 	depends on HAS_IOMEM
+	select ND_BTT if BTT
 	help
 	  Memory ranges for PMEM are described by either an NFIT
 	  (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a
@@ -33,7 +34,22 @@ config BLK_DEV_PMEM
 
 	  Say Y if you want to use an NVDIMM
 
+config ND_BTT
+	tristate
+
 config BTT
-	def_bool y
+	bool "BTT: Block Translation Table (atomic sector updates)"
+	default y if LIBNVDIMM
+	help
+	  The Block Translation Table (BTT) provides atomic sector
+	  update semantics for persistent memory devices, so that
+	  applications that rely on sector writes not being torn (a
+	  guarantee that typical disks provide) can continue to do so.
+	  The BTT manifests itself as an alternate personality for an
+	  NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX,
+	  ndblkX.Y, etc...), or 'sectored' mode, (pmemXs, ndblkX.Ys,
+	  etc...).
+
+	  Select Y if unsure
 
 endif
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 6085b4bd7312..d2aab6c58492 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -1,8 +1,11 @@
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
+obj-$(CONFIG_ND_BTT) += nd_btt.o
 
 nd_pmem-y := pmem.o
 
+nd_btt-y := btt.o
+
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
new file mode 100644
index 000000000000..7ae38aac2c25
--- /dev/null
+++ b/drivers/nvdimm/btt.c
@@ -0,0 +1,1371 @@
+/*
+ * Block Translation Table
+ * Copyright (c) 2014-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/highmem.h>
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/hdreg.h>
+#include <linux/genhd.h>
+#include <linux/sizes.h>
+#include <linux/ndctl.h>
+#include <linux/fs.h>
+#include <linux/nd.h>
+#include "btt.h"
+#include "nd.h"
+
+enum log_ent_request {
+	LOG_NEW_ENT = 0,
+	LOG_OLD_ENT
+};
+
+static int btt_major;
+
+static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
+		void *buf, size_t n)
+{
+	struct nd_btt *nd_btt = arena->nd_btt;
+	struct nd_namespace_common *ndns = nd_btt->ndns;
+
+	/* arena offsets are 4K from the base of the device */
+	offset += SZ_4K;
+	return nvdimm_read_bytes(ndns, offset, buf, n);
+}
+
+static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
+		void *buf, size_t n)
+{
+	struct nd_btt *nd_btt = arena->nd_btt;
+	struct nd_namespace_common *ndns = nd_btt->ndns;
+
+	/* arena offsets are 4K from the base of the device */
+	offset += SZ_4K;
+	return nvdimm_write_bytes(ndns, offset, buf, n);
+}
+
+static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
+{
+	int ret;
+
+	ret = arena_write_bytes(arena, arena->info2off, super,
+			sizeof(struct btt_sb));
+	if (ret)
+		return ret;
+
+	return arena_write_bytes(arena, arena->infooff, super,
+			sizeof(struct btt_sb));
+}
+
+static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
+{
+	WARN_ON(!super);
+	return arena_read_bytes(arena, arena->infooff, super,
+			sizeof(struct btt_sb));
+}
+
+/*
+ * 'raw' version of btt_map write
+ * Assumptions:
+ *   mapping is in little-endian
+ *   mapping contains 'E' and 'Z' flags as desired
+ */
+static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping)
+{
+	u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
+
+	WARN_ON(lba >= arena->external_nlba);
+	return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE);
+}
+
+static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
+			u32 z_flag, u32 e_flag)
+{
+	u32 ze;
+	__le32 mapping_le;
+
+	/*
+	 * This 'mapping' is supposed to be just the LBA mapping, without
+	 * any flags set, so strip the flag bits.
+	 */
+	mapping &= MAP_LBA_MASK;
+
+	ze = (z_flag << 1) + e_flag;
+	switch (ze) {
+	case 0:
+		/*
+		 * We want to set neither of the Z or E flags, and
+		 * in the actual layout, this means setting the bit
+		 * positions of both to '1' to indicate a 'normal'
+		 * map entry
+		 */
+		mapping |= MAP_ENT_NORMAL;
+		break;
+	case 1:
+		mapping |= (1 << MAP_ERR_SHIFT);
+		break;
+	case 2:
+		mapping |= (1 << MAP_TRIM_SHIFT);
+		break;
+	default:
+		/*
+		 * The case where Z and E are both sent in as '1' could be
+		 * construed as a valid 'normal' case, but we decide not to,
+		 * to avoid confusion
+		 */
+		WARN_ONCE(1, "Invalid use of Z and E flags\n");
+		return -EIO;
+	}
+
+	mapping_le = cpu_to_le32(mapping);
+	return __btt_map_write(arena, lba, mapping_le);
+}
+
+static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
+			int *trim, int *error)
+{
+	int ret;
+	__le32 in;
+	u32 raw_mapping, postmap, ze, z_flag, e_flag;
+	u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
+
+	WARN_ON(lba >= arena->external_nlba);
+
+	ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE);
+	if (ret)
+		return ret;
+
+	raw_mapping = le32_to_cpu(in);
+
+	z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
+	e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
+	ze = (z_flag << 1) + e_flag;
+	postmap = raw_mapping & MAP_LBA_MASK;
+
+	/* Reuse the {z,e}_flag variables for *trim and *error */
+	z_flag = 0;
+	e_flag = 0;
+
+	switch (ze) {
+	case 0:
+		/* Initial state. Return postmap = premap */
+		*mapping = lba;
+		break;
+	case 1:
+		*mapping = postmap;
+		e_flag = 1;
+		break;
+	case 2:
+		*mapping = postmap;
+		z_flag = 1;
+		break;
+	case 3:
+		*mapping = postmap;
+		break;
+	default:
+		return -EIO;
+	}
+
+	if (trim)
+		*trim = z_flag;
+	if (error)
+		*error = e_flag;
+
+	return ret;
+}
+
+static int btt_log_read_pair(struct arena_info *arena, u32 lane,
+			struct log_entry *ent)
+{
+	WARN_ON(!ent);
+	return arena_read_bytes(arena,
+			arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
+			2 * LOG_ENT_SIZE);
+}
+
+static struct dentry *debugfs_root;
+
+static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
+				int idx)
+{
+	char dirname[32];
+	struct dentry *d;
+
+	/* If for some reason, parent bttN was not created, exit */
+	if (!parent)
+		return;
+
+	snprintf(dirname, 32, "arena%d", idx);
+	d = debugfs_create_dir(dirname, parent);
+	if (IS_ERR_OR_NULL(d))
+		return;
+	a->debugfs_dir = d;
+
+	debugfs_create_x64("size", S_IRUGO, d, &a->size);
+	debugfs_create_x64("external_lba_start", S_IRUGO, d,
+				&a->external_lba_start);
+	debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba);
+	debugfs_create_u32("internal_lbasize", S_IRUGO, d,
+				&a->internal_lbasize);
+	debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba);
+	debugfs_create_u32("external_lbasize", S_IRUGO, d,
+				&a->external_lbasize);
+	debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree);
+	debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major);
+	debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor);
+	debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff);
+	debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff);
+	debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff);
+	debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff);
+	debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
+	debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
+	debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
+}
+
+static void btt_debugfs_init(struct btt *btt)
+{
+	int i = 0;
+	struct arena_info *arena;
+
+	btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev),
+						debugfs_root);
+	if (IS_ERR_OR_NULL(btt->debugfs_dir))
+		return;
+
+	list_for_each_entry(arena, &btt->arena_list, list) {
+		arena_debugfs_init(arena, btt->debugfs_dir, i);
+		i++;
+	}
+}
+
+/*
+ * This function accepts two log entries, and uses the
+ * sequence number to find the 'older' entry.
+ * It also updates the sequence number in this old entry to
+ * make it the 'new' one if the mark_flag is set.
+ * Finally, it returns which of the entries was the older one.
+ *
+ * TODO The logic feels a bit kludge-y. make it better..
+ */
+static int btt_log_get_old(struct log_entry *ent)
+{
+	int old;
+
+	/*
+	 * the first ever time this is seen, the entry goes into [0]
+	 * the next time, the following logic works out to put this
+	 * (next) entry into [1]
+	 */
+	if (ent[0].seq == 0) {
+		ent[0].seq = cpu_to_le32(1);
+		return 0;
+	}
+
+	if (ent[0].seq == ent[1].seq)
+		return -EINVAL;
+	if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5)
+		return -EINVAL;
+
+	if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) {
+		if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1)
+			old = 0;
+		else
+			old = 1;
+	} else {
+		if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1)
+			old = 1;
+		else
+			old = 0;
+	}
+
+	return old;
+}
+
+static struct device *to_dev(struct arena_info *arena)
+{
+	return &arena->nd_btt->dev;
+}
+
+/*
+ * This function copies the desired (old/new) log entry into ent if
+ * it is not NULL. It returns the sub-slot number (0 or 1)
+ * where the desired log entry was found. Negative return values
+ * indicate errors.
+ */
+static int btt_log_read(struct arena_info *arena, u32 lane,
+			struct log_entry *ent, int old_flag)
+{
+	int ret;
+	int old_ent, ret_ent;
+	struct log_entry log[2];
+
+	ret = btt_log_read_pair(arena, lane, log);
+	if (ret)
+		return -EIO;
+
+	old_ent = btt_log_get_old(log);
+	if (old_ent < 0 || old_ent > 1) {
+		dev_info(to_dev(arena),
+				"log corruption (%d): lane %d seq [%d, %d]\n",
+			old_ent, lane, log[0].seq, log[1].seq);
+		/* TODO set error state? */
+		return -EIO;
+	}
+
+	ret_ent = (old_flag ? old_ent : (1 - old_ent));
+
+	if (ent != NULL)
+		memcpy(ent, &log[ret_ent], LOG_ENT_SIZE);
+
+	return ret_ent;
+}
+
+/*
+ * This function commits a log entry to media
+ * It does _not_ prepare the freelist entry for the next write
+ * btt_flog_write is the wrapper for updating the freelist elements
+ */
+static int __btt_log_write(struct arena_info *arena, u32 lane,
+			u32 sub, struct log_entry *ent)
+{
+	int ret;
+	/*
+	 * Ignore the padding in log_entry for calculating log_half.
+	 * The entry is 'committed' when we write the sequence number,
+	 * and we want to ensure that that is the last thing written.
+	 * We don't bother writing the padding as that would be extra
+	 * media wear and write amplification
+	 */
+	unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2;
+	u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE);
+	void *src = ent;
+
+	/* split the 16B write into atomic, durable halves */
+	ret = arena_write_bytes(arena, ns_off, src, log_half);
+	if (ret)
+		return ret;
+
+	ns_off += log_half;
+	src += log_half;
+	return arena_write_bytes(arena, ns_off, src, log_half);
+}
+
+static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
+			struct log_entry *ent)
+{
+	int ret;
+
+	ret = __btt_log_write(arena, lane, sub, ent);
+	if (ret)
+		return ret;
+
+	/* prepare the next free entry */
+	arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
+	if (++(arena->freelist[lane].seq) == 4)
+		arena->freelist[lane].seq = 1;
+	arena->freelist[lane].block = le32_to_cpu(ent->old_map);
+
+	return ret;
+}
+
+/*
+ * This function initializes the BTT map to the initial state, which is
+ * all-zeroes, and indicates an identity mapping
+ */
+static int btt_map_init(struct arena_info *arena)
+{
+	int ret = -EINVAL;
+	void *zerobuf;
+	size_t offset = 0;
+	size_t chunk_size = SZ_2M;
+	size_t mapsize = arena->logoff - arena->mapoff;
+
+	zerobuf = kzalloc(chunk_size, GFP_KERNEL);
+	if (!zerobuf)
+		return -ENOMEM;
+
+	while (mapsize) {
+		size_t size = min(mapsize, chunk_size);
+
+		ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
+				size);
+		if (ret)
+			goto free;
+
+		offset += size;
+		mapsize -= size;
+		cond_resched();
+	}
+
+ free:
+	kfree(zerobuf);
+	return ret;
+}
+
+/*
+ * This function initializes the BTT log with 'fake' entries pointing
+ * to the initial reserved set of blocks as being free
+ */
+static int btt_log_init(struct arena_info *arena)
+{
+	int ret;
+	u32 i;
+	struct log_entry log, zerolog;
+
+	memset(&zerolog, 0, sizeof(zerolog));
+
+	for (i = 0; i < arena->nfree; i++) {
+		log.lba = cpu_to_le32(i);
+		log.old_map = cpu_to_le32(arena->external_nlba + i);
+		log.new_map = cpu_to_le32(arena->external_nlba + i);
+		log.seq = cpu_to_le32(LOG_SEQ_INIT);
+		ret = __btt_log_write(arena, i, 0, &log);
+		if (ret)
+			return ret;
+		ret = __btt_log_write(arena, i, 1, &zerolog);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int btt_freelist_init(struct arena_info *arena)
+{
+	int old, new, ret;
+	u32 i, map_entry;
+	struct log_entry log_new, log_old;
+
+	arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
+					GFP_KERNEL);
+	if (!arena->freelist)
+		return -ENOMEM;
+
+	for (i = 0; i < arena->nfree; i++) {
+		old = btt_log_read(arena, i, &log_old, LOG_OLD_ENT);
+		if (old < 0)
+			return old;
+
+		new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT);
+		if (new < 0)
+			return new;
+
+		/* sub points to the next one to be overwritten */
+		arena->freelist[i].sub = 1 - new;
+		arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
+		arena->freelist[i].block = le32_to_cpu(log_new.old_map);
+
+		/* This implies a newly created or untouched flog entry */
+		if (log_new.old_map == log_new.new_map)
+			continue;
+
+		/* Check if map recovery is needed */
+		ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry,
+				NULL, NULL);
+		if (ret)
+			return ret;
+		if ((le32_to_cpu(log_new.new_map) != map_entry) &&
+				(le32_to_cpu(log_new.old_map) == map_entry)) {
+			/*
+			 * Last transaction wrote the flog, but wasn't able
+			 * to complete the map write. So fix up the map.
+			 */
+			ret = btt_map_write(arena, le32_to_cpu(log_new.lba),
+					le32_to_cpu(log_new.new_map), 0, 0);
+			if (ret)
+				return ret;
+		}
+
+	}
+
+	return 0;
+}
+
+static int btt_rtt_init(struct arena_info *arena)
+{
+	arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
+	if (arena->rtt == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int btt_maplocks_init(struct arena_info *arena)
+{
+	u32 i;
+
+	arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock),
+				GFP_KERNEL);
+	if (!arena->map_locks)
+		return -ENOMEM;
+
+	for (i = 0; i < arena->nfree; i++)
+		spin_lock_init(&arena->map_locks[i].lock);
+
+	return 0;
+}
+
+static struct arena_info *alloc_arena(struct btt *btt, size_t size,
+				size_t start, size_t arena_off)
+{
+	struct arena_info *arena;
+	u64 logsize, mapsize, datasize;
+	u64 available = size;
+
+	arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL);
+	if (!arena)
+		return NULL;
+	arena->nd_btt = btt->nd_btt;
+
+	if (!size)
+		return arena;
+
+	arena->size = size;
+	arena->external_lba_start = start;
+	arena->external_lbasize = btt->lbasize;
+	arena->internal_lbasize = roundup(arena->external_lbasize,
+					INT_LBASIZE_ALIGNMENT);
+	arena->nfree = BTT_DEFAULT_NFREE;
+	arena->version_major = 1;
+	arena->version_minor = 1;
+
+	if (available % BTT_PG_SIZE)
+		available -= (available % BTT_PG_SIZE);
+
+	/* Two pages are reserved for the super block and its copy */
+	available -= 2 * BTT_PG_SIZE;
+
+	/* The log takes a fixed amount of space based on nfree */
+	logsize = roundup(2 * arena->nfree * sizeof(struct log_entry),
+				BTT_PG_SIZE);
+	available -= logsize;
+
+	/* Calculate optimal split between map and data area */
+	arena->internal_nlba = div_u64(available - BTT_PG_SIZE,
+			arena->internal_lbasize + MAP_ENT_SIZE);
+	arena->external_nlba = arena->internal_nlba - arena->nfree;
+
+	mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE);
+	datasize = available - mapsize;
+
+	/* 'Absolute' values, relative to start of storage space */
+	arena->infooff = arena_off;
+	arena->dataoff = arena->infooff + BTT_PG_SIZE;
+	arena->mapoff = arena->dataoff + datasize;
+	arena->logoff = arena->mapoff + mapsize;
+	arena->info2off = arena->logoff + logsize;
+	return arena;
+}
+
+static void free_arenas(struct btt *btt)
+{
+	struct arena_info *arena, *next;
+
+	list_for_each_entry_safe(arena, next, &btt->arena_list, list) {
+		list_del(&arena->list);
+		kfree(arena->rtt);
+		kfree(arena->map_locks);
+		kfree(arena->freelist);
+		debugfs_remove_recursive(arena->debugfs_dir);
+		kfree(arena);
+	}
+}
+
+/*
+ * This function checks if the metadata layout is valid and error free
+ */
+static int arena_is_valid(struct arena_info *arena, struct btt_sb *super,
+				u8 *uuid, u32 lbasize)
+{
+	u64 checksum;
+
+	if (memcmp(super->uuid, uuid, 16))
+		return 0;
+
+	checksum = le64_to_cpu(super->checksum);
+	super->checksum = 0;
+	if (checksum != nd_btt_sb_checksum(super))
+		return 0;
+	super->checksum = cpu_to_le64(checksum);
+
+	if (lbasize != le32_to_cpu(super->external_lbasize))
+		return 0;
+
+	/* TODO: figure out action for this */
+	if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0)
+		dev_info(to_dev(arena), "Found arena with an error flag\n");
+
+	return 1;
+}
+
+/*
+ * This function reads an existing valid btt superblock and
+ * populates the corresponding arena_info struct
+ */
+static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super,
+				u64 arena_off)
+{
+	arena->internal_nlba = le32_to_cpu(super->internal_nlba);
+	arena->internal_lbasize = le32_to_cpu(super->internal_lbasize);
+	arena->external_nlba = le32_to_cpu(super->external_nlba);
+	arena->external_lbasize = le32_to_cpu(super->external_lbasize);
+	arena->nfree = le32_to_cpu(super->nfree);
+	arena->version_major = le16_to_cpu(super->version_major);
+	arena->version_minor = le16_to_cpu(super->version_minor);
+
+	arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off +
+			le64_to_cpu(super->nextoff));
+	arena->infooff = arena_off;
+	arena->dataoff = arena_off + le64_to_cpu(super->dataoff);
+	arena->mapoff = arena_off + le64_to_cpu(super->mapoff);
+	arena->logoff = arena_off + le64_to_cpu(super->logoff);
+	arena->info2off = arena_off + le64_to_cpu(super->info2off);
+
+	arena->size = (super->nextoff > 0) ? (le64_to_cpu(super->nextoff)) :
+			(arena->info2off - arena->infooff + BTT_PG_SIZE);
+
+	arena->flags = le32_to_cpu(super->flags);
+}
+
+static int discover_arenas(struct btt *btt)
+{
+	int ret = 0;
+	struct arena_info *arena;
+	struct btt_sb *super;
+	size_t remaining = btt->rawsize;
+	u64 cur_nlba = 0;
+	size_t cur_off = 0;
+	int num_arenas = 0;
+
+	super = kzalloc(sizeof(*super), GFP_KERNEL);
+	if (!super)
+		return -ENOMEM;
+
+	while (remaining) {
+		/* Alloc memory for arena */
+		arena = alloc_arena(btt, 0, 0, 0);
+		if (!arena) {
+			ret = -ENOMEM;
+			goto out_super;
+		}
+
+		arena->infooff = cur_off;
+		ret = btt_info_read(arena, super);
+		if (ret)
+			goto out;
+
+		if (!arena_is_valid(arena, super, btt->nd_btt->uuid,
+				btt->lbasize)) {
+			if (remaining == btt->rawsize) {
+				btt->init_state = INIT_NOTFOUND;
+				dev_info(to_dev(arena), "No existing arenas\n");
+				goto out;
+			} else {
+				dev_info(to_dev(arena),
+						"Found corrupted metadata!\n");
+				ret = -ENODEV;
+				goto out;
+			}
+		}
+
+		arena->external_lba_start = cur_nlba;
+		parse_arena_meta(arena, super, cur_off);
+
+		ret = btt_freelist_init(arena);
+		if (ret)
+			goto out;
+
+		ret = btt_rtt_init(arena);
+		if (ret)
+			goto out;
+
+		ret = btt_maplocks_init(arena);
+		if (ret)
+			goto out;
+
+		list_add_tail(&arena->list, &btt->arena_list);
+
+		remaining -= arena->size;
+		cur_off += arena->size;
+		cur_nlba += arena->external_nlba;
+		num_arenas++;
+
+		if (arena->nextoff == 0)
+			break;
+	}
+	btt->num_arenas = num_arenas;
+	btt->nlba = cur_nlba;
+	btt->init_state = INIT_READY;
+
+	kfree(super);
+	return ret;
+
+ out:
+	kfree(arena);
+	free_arenas(btt);
+ out_super:
+	kfree(super);
+	return ret;
+}
+
+static int create_arenas(struct btt *btt)
+{
+	size_t remaining = btt->rawsize;
+	size_t cur_off = 0;
+
+	while (remaining) {
+		struct arena_info *arena;
+		size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining);
+
+		remaining -= arena_size;
+		if (arena_size < ARENA_MIN_SIZE)
+			break;
+
+		arena = alloc_arena(btt, arena_size, btt->nlba, cur_off);
+		if (!arena) {
+			free_arenas(btt);
+			return -ENOMEM;
+		}
+		btt->nlba += arena->external_nlba;
+		if (remaining >= ARENA_MIN_SIZE)
+			arena->nextoff = arena->size;
+		else
+			arena->nextoff = 0;
+		cur_off += arena_size;
+		list_add_tail(&arena->list, &btt->arena_list);
+	}
+
+	return 0;
+}
+
+/*
+ * This function completes arena initialization by writing
+ * all the metadata.
+ * It is only called for an uninitialized arena when a write
+ * to that arena occurs for the first time.
+ */
+static int btt_arena_write_layout(struct arena_info *arena, u8 *uuid)
+{
+	int ret;
+	struct btt_sb *super;
+
+	ret = btt_map_init(arena);
+	if (ret)
+		return ret;
+
+	ret = btt_log_init(arena);
+	if (ret)
+		return ret;
+
+	super = kzalloc(sizeof(struct btt_sb), GFP_NOIO);
+	if (!super)
+		return -ENOMEM;
+
+	strncpy(super->signature, BTT_SIG, BTT_SIG_LEN);
+	memcpy(super->uuid, uuid, 16);
+	super->flags = cpu_to_le32(arena->flags);
+	super->version_major = cpu_to_le16(arena->version_major);
+	super->version_minor = cpu_to_le16(arena->version_minor);
+	super->external_lbasize = cpu_to_le32(arena->external_lbasize);
+	super->external_nlba = cpu_to_le32(arena->external_nlba);
+	super->internal_lbasize = cpu_to_le32(arena->internal_lbasize);
+	super->internal_nlba = cpu_to_le32(arena->internal_nlba);
+	super->nfree = cpu_to_le32(arena->nfree);
+	super->infosize = cpu_to_le32(sizeof(struct btt_sb));
+	super->nextoff = cpu_to_le64(arena->nextoff);
+	/*
+	 * Subtract arena->infooff (arena start) so numbers are relative
+	 * to 'this' arena
+	 */
+	super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff);
+	super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff);
+	super->logoff = cpu_to_le64(arena->logoff - arena->infooff);
+	super->info2off = cpu_to_le64(arena->info2off - arena->infooff);
+
+	super->flags = 0;
+	super->checksum = cpu_to_le64(nd_btt_sb_checksum(super));
+
+	ret = btt_info_write(arena, super);
+
+	kfree(super);
+	return ret;
+}
+
+/*
+ * This function completes the initialization for the BTT namespace
+ * such that it is ready to accept IOs
+ */
+static int btt_meta_init(struct btt *btt)
+{
+	int ret = 0;
+	struct arena_info *arena;
+
+	mutex_lock(&btt->init_lock);
+	list_for_each_entry(arena, &btt->arena_list, list) {
+		ret = btt_arena_write_layout(arena, btt->nd_btt->uuid);
+		if (ret)
+			goto unlock;
+
+		ret = btt_freelist_init(arena);
+		if (ret)
+			goto unlock;
+
+		ret = btt_rtt_init(arena);
+		if (ret)
+			goto unlock;
+
+		ret = btt_maplocks_init(arena);
+		if (ret)
+			goto unlock;
+	}
+
+	btt->init_state = INIT_READY;
+
+ unlock:
+	mutex_unlock(&btt->init_lock);
+	return ret;
+}
+
+/*
+ * This function calculates the arena in which the given LBA lies
+ * by doing a linear walk. This is acceptable since we expect only
+ * a few arenas. If we have backing devices that get much larger,
+ * we can construct a balanced binary tree of arenas at init time
+ * so that this range search becomes faster.
+ */
+static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap,
+				struct arena_info **arena)
+{
+	struct arena_info *arena_list;
+	__u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size);
+
+	list_for_each_entry(arena_list, &btt->arena_list, list) {
+		if (lba < arena_list->external_nlba) {
+			*arena = arena_list;
+			*premap = lba;
+			return 0;
+		}
+		lba -= arena_list->external_nlba;
+	}
+
+	return -EIO;
+}
+
+/*
+ * The following (lock_map, unlock_map) are mostly just to improve
+ * readability, since they index into an array of locks
+ */
+static void lock_map(struct arena_info *arena, u32 premap)
+		__acquires(&arena->map_locks[idx].lock)
+{
+	u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
+
+	spin_lock(&arena->map_locks[idx].lock);
+}
+
+static void unlock_map(struct arena_info *arena, u32 premap)
+		__releases(&arena->map_locks[idx].lock)
+{
+	u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;
+
+	spin_unlock(&arena->map_locks[idx].lock);
+}
+
+static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
+{
+	return arena->dataoff + ((u64)lba * arena->internal_lbasize);
+}
+
+static int btt_data_read(struct arena_info *arena, struct page *page,
+			unsigned int off, u32 lba, u32 len)
+{
+	int ret;
+	u64 nsoff = to_namespace_offset(arena, lba);
+	void *mem = kmap_atomic(page);
+
+	ret = arena_read_bytes(arena, nsoff, mem + off, len);
+	kunmap_atomic(mem);
+
+	return ret;
+}
+
+static int btt_data_write(struct arena_info *arena, u32 lba,
+			struct page *page, unsigned int off, u32 len)
+{
+	int ret;
+	u64 nsoff = to_namespace_offset(arena, lba);
+	void *mem = kmap_atomic(page);
+
+	ret = arena_write_bytes(arena, nsoff, mem + off, len);
+	kunmap_atomic(mem);
+
+	return ret;
+}
+
+static void zero_fill_data(struct page *page, unsigned int off, u32 len)
+{
+	void *mem = kmap_atomic(page);
+
+	memset(mem + off, 0, len);
+	kunmap_atomic(mem);
+}
+
+static int btt_read_pg(struct btt *btt, struct page *page, unsigned int off,
+			sector_t sector, unsigned int len)
+{
+	int ret = 0;
+	int t_flag, e_flag;
+	struct arena_info *arena = NULL;
+	u32 lane = 0, premap, postmap;
+
+	while (len) {
+		u32 cur_len;
+
+		lane = nd_region_acquire_lane(btt->nd_region);
+
+		ret = lba_to_arena(btt, sector, &premap, &arena);
+		if (ret)
+			goto out_lane;
+
+		cur_len = min(btt->sector_size, len);
+
+		ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag);
+		if (ret)
+			goto out_lane;
+
+		/*
+		 * We loop to make sure that the post map LBA didn't change
+		 * from under us between writing the RTT and doing the actual
+		 * read.
+		 */
+		while (1) {
+			u32 new_map;
+
+			if (t_flag) {
+				zero_fill_data(page, off, cur_len);
+				goto out_lane;
+			}
+
+			if (e_flag) {
+				ret = -EIO;
+				goto out_lane;
+			}
+
+			arena->rtt[lane] = RTT_VALID | postmap;
+			/*
+			 * Barrier to make sure this write is not reordered
+			 * to do the verification map_read before the RTT store
+			 */
+			barrier();
+
+			ret = btt_map_read(arena, premap, &new_map, &t_flag,
+						&e_flag);
+			if (ret)
+				goto out_rtt;
+
+			if (postmap == new_map)
+				break;
+
+			postmap = new_map;
+		}
+
+		ret = btt_data_read(arena, page, off, postmap, cur_len);
+		if (ret)
+			goto out_rtt;
+
+		arena->rtt[lane] = RTT_INVALID;
+		nd_region_release_lane(btt->nd_region, lane);
+
+		len -= cur_len;
+		off += cur_len;
+		sector += btt->sector_size >> SECTOR_SHIFT;
+	}
+
+	return 0;
+
+ out_rtt:
+	arena->rtt[lane] = RTT_INVALID;
+ out_lane:
+	nd_region_release_lane(btt->nd_region, lane);
+	return ret;
+}
+
+static int btt_write_pg(struct btt *btt, sector_t sector, struct page *page,
+		unsigned int off, unsigned int len)
+{
+	int ret = 0;
+	struct arena_info *arena = NULL;
+	u32 premap = 0, old_postmap, new_postmap, lane = 0, i;
+	struct log_entry log;
+	int sub;
+
+	while (len) {
+		u32 cur_len;
+
+		lane = nd_region_acquire_lane(btt->nd_region);
+
+		ret = lba_to_arena(btt, sector, &premap, &arena);
+		if (ret)
+			goto out_lane;
+		cur_len = min(btt->sector_size, len);
+
+		if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) {
+			ret = -EIO;
+			goto out_lane;
+		}
+
+		new_postmap = arena->freelist[lane].block;
+
+		/* Wait if the new block is being read from */
+		for (i = 0; i < arena->nfree; i++)
+			while (arena->rtt[i] == (RTT_VALID | new_postmap))
+				cpu_relax();
+
+
+		if (new_postmap >= arena->internal_nlba) {
+			ret = -EIO;
+			goto out_lane;
+		} else
+			ret = btt_data_write(arena, new_postmap, page,
+						off, cur_len);
+		if (ret)
+			goto out_lane;
+
+		lock_map(arena, premap);
+		ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL);
+		if (ret)
+			goto out_map;
+		if (old_postmap >= arena->internal_nlba) {
+			ret = -EIO;
+			goto out_map;
+		}
+
+		log.lba = cpu_to_le32(premap);
+		log.old_map = cpu_to_le32(old_postmap);
+		log.new_map = cpu_to_le32(new_postmap);
+		log.seq = cpu_to_le32(arena->freelist[lane].seq);
+		sub = arena->freelist[lane].sub;
+		ret = btt_flog_write(arena, lane, sub, &log);
+		if (ret)
+			goto out_map;
+
+		ret = btt_map_write(arena, premap, new_postmap, 0, 0);
+		if (ret)
+			goto out_map;
+
+		unlock_map(arena, premap);
+		nd_region_release_lane(btt->nd_region, lane);
+
+		len -= cur_len;
+		off += cur_len;
+		sector += btt->sector_size >> SECTOR_SHIFT;
+	}
+
+	return 0;
+
+ out_map:
+	unlock_map(arena, premap);
+ out_lane:
+	nd_region_release_lane(btt->nd_region, lane);
+	return ret;
+}
+
+static int btt_do_bvec(struct btt *btt, struct page *page,
+			unsigned int len, unsigned int off, int rw,
+			sector_t sector)
+{
+	int ret;
+
+	if (rw == READ) {
+		ret = btt_read_pg(btt, page, off, sector, len);
+		flush_dcache_page(page);
+	} else {
+		flush_dcache_page(page);
+		ret = btt_write_pg(btt, sector, page, off, len);
+	}
+
+	return ret;
+}
+
+static void btt_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct btt *btt = q->queuedata;
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	int err = 0, rw;
+
+	rw = bio_data_dir(bio);
+	bio_for_each_segment(bvec, bio, iter) {
+		unsigned int len = bvec.bv_len;
+
+		BUG_ON(len > PAGE_SIZE);
+		/* Make sure len is in multiples of sector size. */
+		/* XXX is this right? */
+		BUG_ON(len < btt->sector_size);
+		BUG_ON(len % btt->sector_size);
+
+		err = btt_do_bvec(btt, bvec.bv_page, len, bvec.bv_offset,
+				rw, iter.bi_sector);
+		if (err) {
+			dev_info(&btt->nd_btt->dev,
+					"io error in %s sector %lld, len %d,\n",
+					(rw == READ) ? "READ" : "WRITE",
+					(unsigned long long) iter.bi_sector, len);
+			goto out;
+		}
+	}
+
+out:
+	bio_endio(bio, err);
+}
+
+static int btt_rw_page(struct block_device *bdev, sector_t sector,
+		struct page *page, int rw)
+{
+	struct btt *btt = bdev->bd_disk->private_data;
+
+	btt_do_bvec(btt, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	page_endio(page, rw & WRITE, 0);
+	return 0;
+}
+
+
+static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+	/* some standard values */
+	geo->heads = 1 << 6;
+	geo->sectors = 1 << 5;
+	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+	return 0;
+}
+
+static const struct block_device_operations btt_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		btt_rw_page,
+	.getgeo =		btt_getgeo,
+};
+
+static int btt_blk_init(struct btt *btt)
+{
+	struct nd_btt *nd_btt = btt->nd_btt;
+	struct nd_namespace_common *ndns = nd_btt->ndns;
+
+	/* create a new disk and request queue for btt */
+	btt->btt_queue = blk_alloc_queue(GFP_KERNEL);
+	if (!btt->btt_queue)
+		return -ENOMEM;
+
+	btt->btt_disk = alloc_disk(0);
+	if (!btt->btt_disk) {
+		blk_cleanup_queue(btt->btt_queue);
+		return -ENOMEM;
+	}
+
+	nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
+	btt->btt_disk->driverfs_dev = &btt->nd_btt->dev;
+	btt->btt_disk->major = btt_major;
+	btt->btt_disk->first_minor = 0;
+	btt->btt_disk->fops = &btt_fops;
+	btt->btt_disk->private_data = btt;
+	btt->btt_disk->queue = btt->btt_queue;
+	btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
+
+	blk_queue_make_request(btt->btt_queue, btt_make_request);
+	blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
+	blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
+	blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue);
+	btt->btt_queue->queuedata = btt;
+
+	set_capacity(btt->btt_disk,
+			btt->nlba * btt->sector_size >> SECTOR_SHIFT);
+	add_disk(btt->btt_disk);
+
+	return 0;
+}
+
+static void btt_blk_cleanup(struct btt *btt)
+{
+	del_gendisk(btt->btt_disk);
+	put_disk(btt->btt_disk);
+	blk_cleanup_queue(btt->btt_queue);
+}
+
+/**
+ * btt_init - initialize a block translation table for the given device
+ * @nd_btt:	device with BTT geometry and backing device info
+ * @rawsize:	raw size in bytes of the backing device
+ * @lbasize:	lba size of the backing device
+ * @uuid:	A uuid for the backing device - this is stored on media
+ * @maxlane:	maximum number of parallel requests the device can handle
+ *
+ * Initialize a Block Translation Table on a backing device to provide
+ * single sector power fail atomicity.
+ *
+ * Context:
+ * Might sleep.
+ *
+ * Returns:
+ * Pointer to a new struct btt on success, NULL on failure.
+ */
+static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
+		u32 lbasize, u8 *uuid, struct nd_region *nd_region)
+{
+	int ret;
+	struct btt *btt;
+	struct device *dev = &nd_btt->dev;
+
+	btt = kzalloc(sizeof(struct btt), GFP_KERNEL);
+	if (!btt)
+		return NULL;
+
+	btt->nd_btt = nd_btt;
+	btt->rawsize = rawsize;
+	btt->lbasize = lbasize;
+	btt->sector_size = ((lbasize >= 4096) ? 4096 : 512);
+	INIT_LIST_HEAD(&btt->arena_list);
+	mutex_init(&btt->init_lock);
+	btt->nd_region = nd_region;
+
+	ret = discover_arenas(btt);
+	if (ret) {
+		dev_err(dev, "init: error in arena_discover: %d\n", ret);
+		goto out_free;
+	}
+
+	if (btt->init_state != INIT_READY) {
+		btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
+			((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
+		dev_dbg(dev, "init: %d arenas for %llu rawsize\n",
+				btt->num_arenas, rawsize);
+
+		ret = create_arenas(btt);
+		if (ret) {
+			dev_info(dev, "init: create_arenas: %d\n", ret);
+			goto out_free;
+		}
+
+		ret = btt_meta_init(btt);
+		if (ret) {
+			dev_err(dev, "init: error in meta_init: %d\n", ret);
+			return NULL;
+		}
+	}
+
+	ret = btt_blk_init(btt);
+	if (ret) {
+		dev_err(dev, "init: error in blk_init: %d\n", ret);
+		goto out_free;
+	}
+
+	btt_debugfs_init(btt);
+
+	return btt;
+
+ out_free:
+	kfree(btt);
+	return NULL;
+}
+
+/**
+ * btt_fini - de-initialize a BTT
+ * @btt:	the BTT handle that was generated by btt_init
+ *
+ * De-initialize a Block Translation Table on device removal
+ *
+ * Context:
+ * Might sleep.
+ */
+static void btt_fini(struct btt *btt)
+{
+	if (btt) {
+		btt_blk_cleanup(btt);
+		free_arenas(btt);
+		debugfs_remove_recursive(btt->debugfs_dir);
+		kfree(btt);
+	}
+}
+
+int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
+{
+	struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
+	struct nd_region *nd_region;
+	struct btt *btt;
+	size_t rawsize;
+
+	if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize)
+		return -ENODEV;
+
+	rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K;
+	if (rawsize < ARENA_MIN_SIZE) {
+		return -ENXIO;
+	}
+	nd_region = to_nd_region(nd_btt->dev.parent);
+	btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
+			nd_region);
+	if (!btt)
+		return -ENOMEM;
+	nd_btt->btt = btt;
+
+	return 0;
+}
+EXPORT_SYMBOL(nvdimm_namespace_attach_btt);
+
+int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns)
+{
+	struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
+	struct btt *btt = nd_btt->btt;
+
+	btt_fini(btt);
+	nd_btt->btt = NULL;
+
+	return 0;
+}
+EXPORT_SYMBOL(nvdimm_namespace_detach_btt);
+
+static int __init nd_btt_init(void)
+{
+	int rc;
+
+	BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K);
+
+	btt_major = register_blkdev(0, "btt");
+	if (btt_major < 0)
+		return btt_major;
+
+	debugfs_root = debugfs_create_dir("btt", NULL);
+	if (IS_ERR_OR_NULL(debugfs_root)) {
+		rc = -ENXIO;
+		goto err_debugfs;
+	}
+
+	return 0;
+
+ err_debugfs:
+	unregister_blkdev(btt_major, "btt");
+
+	return rc;
+}
+
+static void __exit nd_btt_exit(void)
+{
+	debugfs_remove_recursive(debugfs_root);
+	unregister_blkdev(btt_major, "btt");
+}
+
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT);
+MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+module_init(nd_btt_init);
+module_exit(nd_btt_exit);
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index e8f6d8e0ddd3..8c95a7792c3e 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -19,6 +19,39 @@
 
 #define BTT_SIG_LEN 16
 #define BTT_SIG "BTT_ARENA_INFO\0"
+#define MAP_ENT_SIZE 4
+#define MAP_TRIM_SHIFT 31
+#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT)
+#define MAP_ERR_SHIFT 30
+#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT)
+#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT)))
+#define MAP_ENT_NORMAL 0xC0000000
+#define LOG_ENT_SIZE sizeof(struct log_entry)
+#define ARENA_MIN_SIZE (1UL << 24)	/* 16 MB */
+#define ARENA_MAX_SIZE (1ULL << 39)	/* 512 GB */
+#define RTT_VALID (1UL << 31)
+#define RTT_INVALID 0
+#define INT_LBASIZE_ALIGNMENT 256
+#define BTT_PG_SIZE 4096
+#define BTT_DEFAULT_NFREE ND_MAX_LANES
+#define LOG_SEQ_INIT 1
+
+#define IB_FLAG_ERROR 0x00000001
+#define IB_FLAG_ERROR_MASK 0x00000001
+
+enum btt_init_state {
+	INIT_UNCHECKED = 0,
+	INIT_NOTFOUND,
+	INIT_READY
+};
+
+struct log_entry {
+	__le32 lba;
+	__le32 old_map;
+	__le32 new_map;
+	__le32 seq;
+	__le64 padding[2];
+};
 
 struct btt_sb {
 	u8 signature[BTT_SIG_LEN];
@@ -42,4 +75,112 @@ struct btt_sb {
 	__le64 checksum;
 };
 
+struct free_entry {
+	u32 block;
+	u8 sub;
+	u8 seq;
+};
+
+struct aligned_lock {
+	union {
+		spinlock_t lock;
+		u8 cacheline_padding[L1_CACHE_BYTES];
+	};
+};
+
+/**
+ * struct arena_info - handle for an arena
+ * @size:		Size in bytes this arena occupies on the raw device.
+ *			This includes arena metadata.
+ * @external_lba_start:	The first external LBA in this arena.
+ * @internal_nlba:	Number of internal blocks available in the arena
+ *			including nfree reserved blocks
+ * @internal_lbasize:	Internal and external lba sizes may be different as
+ *			we can round up 'odd' external lbasizes such as 520B
+ *			to be aligned.
+ * @external_nlba:	Number of blocks contributed by the arena to the number
+ *			reported to upper layers. (internal_nlba - nfree)
+ * @external_lbasize:	LBA size as exposed to upper layers.
+ * @nfree:		A reserve number of 'free' blocks that is used to
+ *			handle incoming writes.
+ * @version_major:	Metadata layout version major.
+ * @version_minor:	Metadata layout version minor.
+ * @nextoff:		Offset in bytes to the start of the next arena.
+ * @infooff:		Offset in bytes to the info block of this arena.
+ * @dataoff:		Offset in bytes to the data area of this arena.
+ * @mapoff:		Offset in bytes to the map area of this arena.
+ * @logoff:		Offset in bytes to the log area of this arena.
+ * @info2off:		Offset in bytes to the backup info block of this arena.
+ * @freelist:		Pointer to in-memory list of free blocks
+ * @rtt:		Pointer to in-memory "Read Tracking Table"
+ * @map_locks:		Spinlocks protecting concurrent map writes
+ * @nd_btt:		Pointer to parent nd_btt structure.
+ * @list:		List head for list of arenas
+ * @debugfs_dir:	Debugfs dentry
+ * @flags:		Arena flags - may signify error states.
+ *
+ * arena_info is a per-arena handle. Once an arena is narrowed down for an
+ * IO, this struct is passed around for the duration of the IO.
+ */
+struct arena_info {
+	u64 size;			/* Total bytes for this arena */
+	u64 external_lba_start;
+	u32 internal_nlba;
+	u32 internal_lbasize;
+	u32 external_nlba;
+	u32 external_lbasize;
+	u32 nfree;
+	u16 version_major;
+	u16 version_minor;
+	/* Byte offsets to the different on-media structures */
+	u64 nextoff;
+	u64 infooff;
+	u64 dataoff;
+	u64 mapoff;
+	u64 logoff;
+	u64 info2off;
+	/* Pointers to other in-memory structures for this arena */
+	struct free_entry *freelist;
+	u32 *rtt;
+	struct aligned_lock *map_locks;
+	struct nd_btt *nd_btt;
+	struct list_head list;
+	struct dentry *debugfs_dir;
+	/* Arena flags */
+	u32 flags;
+};
+
+/**
+ * struct btt - handle for a BTT instance
+ * @btt_disk:		Pointer to the gendisk for BTT device
+ * @btt_queue:		Pointer to the request queue for the BTT device
+ * @arena_list:		Head of the list of arenas
+ * @debugfs_dir:	Debugfs dentry
+ * @nd_btt:		Parent nd_btt struct
+ * @nlba:		Number of logical blocks exposed to the	upper layers
+ *			after removing the amount of space needed by metadata
+ * @rawsize:		Total size in bytes of the available backing device
+ * @lbasize:		LBA size as requested and presented to upper layers.
+ *			This is sector_size + size of any metadata.
+ * @sector_size:	The Linux sector size - 512 or 4096
+ * @lanes:		Per-lane spinlocks
+ * @init_lock:		Mutex used for the BTT initialization
+ * @init_state:		Flag describing the initialization state for the BTT
+ * @num_arenas:		Number of arenas in the BTT instance
+ */
+struct btt {
+	struct gendisk *btt_disk;
+	struct request_queue *btt_queue;
+	struct list_head arena_list;
+	struct dentry *debugfs_dir;
+	struct nd_btt *nd_btt;
+	u64 nlba;
+	unsigned long long rawsize;
+	u32 lbasize;
+	u32 sector_size;
+	struct nd_region *nd_region;
+	struct mutex init_lock;
+	int init_state;
+	int num_arenas;
+};
 #endif
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index effb70a88347..470fbdccd0ac 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -348,7 +348,8 @@ struct device *nd_btt_create(struct nd_region *nd_region)
  */
 u64 nd_btt_sb_checksum(struct btt_sb *btt_sb)
 {
-	u64 sum, sum_save;
+	u64 sum;
+	__le64 sum_save;
 
 	sum_save = btt_sb->checksum;
 	btt_sb->checksum = 0;
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 2c50a0719f8d..4aa647c8d644 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -76,6 +76,30 @@ static bool is_namespace_io(struct device *dev)
 	return dev ? dev->type == &namespace_io_device_type : false;
 }
 
+const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
+		char *name)
+{
+	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+	const char *suffix = "";
+
+	if (ndns->claim && is_nd_btt(ndns->claim))
+		suffix = "s";
+
+	if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev))
+		sprintf(name, "pmem%d%s", nd_region->id, suffix);
+	else if (is_namespace_blk(&ndns->dev)) {
+		struct nd_namespace_blk *nsblk;
+
+		nsblk = to_nd_namespace_blk(&ndns->dev);
+		sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, suffix);
+	} else {
+		return NULL;
+	}
+
+	return name;
+}
+EXPORT_SYMBOL(nvdimm_namespace_disk_name);
+
 static ssize_t nstype_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d13eccbb67e9..1b937c235913 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -20,6 +20,12 @@
 #include "label.h"
 
 enum {
+	/*
+	 * Limits the maximum number of block apertures a dimm can
+	 * support and is an input to the geometry/on-disk-format of a
+	 * BTT instance
+	 */
+	ND_MAX_LANES = 256,
 	SECTOR_SHIFT = 9,
 };
 
@@ -75,6 +81,11 @@ static inline struct nd_namespace_index *to_next_namespace_index(
 	for (res = (ndd)->dpa.child, next = res ? res->sibling : NULL; \
 			res; res = next, next = next ? next->sibling : NULL)
 
+struct nd_percpu_lane {
+	int count;
+	spinlock_t lock;
+};
+
 struct nd_region {
 	struct device dev;
 	struct ida ns_ida;
@@ -84,9 +95,10 @@ struct nd_region {
 	u16 ndr_mappings;
 	u64 ndr_size;
 	u64 ndr_start;
-	int id;
+	int id, num_lanes;
 	void *provider_data;
 	struct nd_interleave_set *nd_set;
+	struct nd_percpu_lane __percpu *lane;
 	struct nd_mapping mapping[0];
 };
 
@@ -100,9 +112,11 @@ static inline unsigned nd_inc_seq(unsigned seq)
 	return next[seq & 3];
 }
 
+struct btt;
 struct nd_btt {
 	struct device dev;
 	struct nd_namespace_common *ndns;
+	struct btt *btt;
 	unsigned long lbasize;
 	u8 *uuid;
 	int id;
@@ -157,6 +171,8 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 
 #endif
 struct nd_region *to_nd_region(struct device *dev);
+unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
+void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 int nd_region_to_nstype(struct nd_region *nd_region);
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
 u64 nd_region_interleave_set_cookie(struct nd_region *nd_region);
@@ -172,4 +188,8 @@ struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
 		resource_size_t n);
 resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
 struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev);
+int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
+int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
+const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
+		char *name);
 #endif /* __ND_H__ */
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index d0c6b4bdba69..7346054bccbb 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -160,7 +160,6 @@ static void pmem_detach_disk(struct pmem_device *pmem)
 static int pmem_attach_disk(struct nd_namespace_common *ndns,
 		struct pmem_device *pmem)
 {
-	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
 	struct gendisk *disk;
 
 	pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
@@ -183,7 +182,7 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns,
 	disk->private_data	= pmem;
 	disk->queue		= pmem->pmem_queue;
 	disk->flags		= GENHD_FL_EXT_DEVT;
-	sprintf(disk->disk_name, "pmem%d", nd_region->id);
+	nvdimm_namespace_disk_name(ndns, disk->disk_name);
 	disk->driverfs_dev = &ndns->dev;
 	set_capacity(disk, pmem->size >> 9);
 	pmem->pmem_disk = disk;
@@ -211,17 +210,6 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
 	return 0;
 }
 
-static int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
-{
-	/* TODO */
-	return -ENXIO;
-}
-
-static void nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns)
-{
-	/* TODO */
-}
-
 static void pmem_free(struct pmem_device *pmem)
 {
 	iounmap(pmem->virt_addr);
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 2a5f3f53d79d..eb8aebcd4800 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -10,6 +10,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  */
+#include <linux/cpumask.h>
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/nd.h>
@@ -18,10 +19,21 @@
 static int nd_region_probe(struct device *dev)
 {
 	int err;
+	static unsigned long once;
 	struct nd_region_namespaces *num_ns;
 	struct nd_region *nd_region = to_nd_region(dev);
 	int rc = nd_region_register_namespaces(nd_region, &err);
 
+	if (nd_region->num_lanes > num_online_cpus()
+			&& nd_region->num_lanes < num_possible_cpus()
+			&& !test_and_set_bit(0, &once)) {
+		dev_info(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n",
+				num_online_cpus(), nd_region->num_lanes,
+				num_possible_cpus());
+		dev_info(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n",
+				nd_region->num_lanes);
+	}
+
 	num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL);
 	if (!num_ns)
 		return -ENOMEM;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 4fd92389fa7e..fe8ec21fe3d7 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -32,6 +32,7 @@ static void nd_region_release(struct device *dev)
 
 		put_device(&nvdimm->dev);
 	}
+	free_percpu(nd_region->lane);
 	ida_simple_remove(&region_ida, nd_region->id);
 	kfree(nd_region);
 }
@@ -531,13 +532,66 @@ void *nd_region_provider_data(struct nd_region *nd_region)
 }
 EXPORT_SYMBOL_GPL(nd_region_provider_data);
 
+/**
+ * nd_region_acquire_lane - allocate and lock a lane
+ * @nd_region: region id and number of lanes possible
+ *
+ * A lane correlates to a BLK-data-window and/or a log slot in the BTT.
+ * We optimize for the common case where there are 256 lanes, one
+ * per-cpu.  For larger systems we need to lock to share lanes.  For now
+ * this implementation assumes the cost of maintaining an allocator for
+ * free lanes is on the order of the lock hold time, so it implements a
+ * static lane = cpu % num_lanes mapping.
+ *
+ * In the case of a BTT instance on top of a BLK namespace a lane may be
+ * acquired recursively.  We lock on the first instance.
+ *
+ * In the case of a BTT instance on top of PMEM, we only acquire a lane
+ * for the BTT metadata updates.
+ */
+unsigned int nd_region_acquire_lane(struct nd_region *nd_region)
+{
+	unsigned int cpu, lane;
+
+	cpu = get_cpu();
+	if (nd_region->num_lanes < nr_cpu_ids) {
+		struct nd_percpu_lane *ndl_lock, *ndl_count;
+
+		lane = cpu % nd_region->num_lanes;
+		ndl_count = per_cpu_ptr(nd_region->lane, cpu);
+		ndl_lock = per_cpu_ptr(nd_region->lane, lane);
+		if (ndl_count->count++ == 0)
+			spin_lock(&ndl_lock->lock);
+	} else
+		lane = cpu;
+
+	return lane;
+}
+EXPORT_SYMBOL(nd_region_acquire_lane);
+
+void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
+{
+	if (nd_region->num_lanes < nr_cpu_ids) {
+		unsigned int cpu = get_cpu();
+		struct nd_percpu_lane *ndl_lock, *ndl_count;
+
+		ndl_count = per_cpu_ptr(nd_region->lane, cpu);
+		ndl_lock = per_cpu_ptr(nd_region->lane, lane);
+		if (--ndl_count->count == 0)
+			spin_unlock(&ndl_lock->lock);
+		put_cpu();
+	}
+	put_cpu();
+}
+EXPORT_SYMBOL(nd_region_release_lane);
+
 static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc, struct device_type *dev_type,
 		const char *caller)
 {
 	struct nd_region *nd_region;
 	struct device *dev;
-	u16 i;
+	unsigned int i;
 
 	for (i = 0; i < ndr_desc->num_mappings; i++) {
 		struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
@@ -557,9 +611,19 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	if (!nd_region)
 		return NULL;
 	nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL);
-	if (nd_region->id < 0) {
-		kfree(nd_region);
-		return NULL;
+	if (nd_region->id < 0)
+		goto err_id;
+
+	nd_region->lane = alloc_percpu(struct nd_percpu_lane);
+	if (!nd_region->lane)
+		goto err_percpu;
+
+        for (i = 0; i < nr_cpu_ids; i++) {
+		struct nd_percpu_lane *ndl;
+
+		ndl = per_cpu_ptr(nd_region->lane, i);
+		spin_lock_init(&ndl->lock);
+		ndl->count = 0;
 	}
 
 	memcpy(nd_region->mapping, ndr_desc->nd_mapping,
@@ -573,6 +637,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_region->ndr_mappings = ndr_desc->num_mappings;
 	nd_region->provider_data = ndr_desc->provider_data;
 	nd_region->nd_set = ndr_desc->nd_set;
+	nd_region->num_lanes = ndr_desc->num_lanes;
 	ida_init(&nd_region->ns_ida);
 	ida_init(&nd_region->btt_ida);
 	dev = &nd_region->dev;
@@ -585,11 +650,18 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_device_register(dev);
 
 	return nd_region;
+
+ err_percpu:
+	ida_simple_remove(&region_ida, nd_region->id);
+ err_id:
+	kfree(nd_region);
+	return NULL;
 }
 
 struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc)
 {
+	ndr_desc->num_lanes = ND_MAX_LANES;
 	return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type,
 			__func__);
 }
@@ -600,6 +672,7 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
 {
 	if (ndr_desc->num_mappings > 1)
 		return NULL;
+	ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES);
 	return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type,
 			__func__);
 }
@@ -608,6 +681,7 @@ EXPORT_SYMBOL_GPL(nvdimm_blk_region_create);
 struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc)
 {
+	ndr_desc->num_lanes = ND_MAX_LANES;
 	return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type,
 			__func__);
 }
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index a59dca17b3aa..531d99dfac68 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -85,6 +85,7 @@ struct nd_region_desc {
 	const struct attribute_group **attr_groups;
 	struct nd_interleave_set *nd_set;
 	void *provider_data;
+	int num_lanes;
 };
 
 struct nvdimm_bus;
-- 
cgit v1.2.3


From 047fc8a1f9a6330eacc80374dff087e20dc2304b Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Thu, 25 Jun 2015 04:21:02 -0400
Subject: libnvdimm, nfit, nd_blk: driver for BLK-mode access persistent memory

The libnvdimm implementation handles allocating dimm address space (DPA)
between PMEM and BLK mode interfaces.  After DPA has been allocated from
a BLK-region to a BLK-namespace the nd_blk driver attaches to handle I/O
as a struct bio based block device. Unlike PMEM, BLK is required to
handle platform specific details like mmio register formats and memory
controller interleave.  For this reason the libnvdimm generic nd_blk
driver calls back into the bus provider to carry out the I/O.

This initial implementation handles the BLK interface defined by the
ACPI 6 NFIT [1] and the NVDIMM DSM Interface Example [2] composed from
DCR (dimm control region), BDW (block data window), IDT (interleave
descriptor) NFIT structures and the hardware register format.
[1]: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
[2]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf

Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c             | 449 ++++++++++++++++++++++++++++++++++++++--
 drivers/acpi/nfit.h             |  49 +++++
 drivers/nvdimm/Kconfig          |  13 ++
 drivers/nvdimm/Makefile         |   3 +
 drivers/nvdimm/blk.c            | 245 ++++++++++++++++++++++
 drivers/nvdimm/dimm_devs.c      |   9 +
 drivers/nvdimm/namespace_devs.c |  65 +++++-
 drivers/nvdimm/nd-core.h        |   3 +-
 drivers/nvdimm/nd.h             |  13 +-
 drivers/nvdimm/region.c         |   8 +-
 drivers/nvdimm/region_devs.c    |  91 +++++++-
 include/linux/libnvdimm.h       |  27 ++-
 12 files changed, 941 insertions(+), 34 deletions(-)
 create mode 100644 drivers/nvdimm/blk.c

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index fc38b49eff7d..2aa6aa97b40c 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -13,12 +13,20 @@
 #include <linux/list_sort.h>
 #include <linux/libnvdimm.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/ndctl.h>
 #include <linux/list.h>
 #include <linux/acpi.h>
 #include <linux/sort.h>
+#include <linux/io.h>
 #include "nfit.h"
 
+/*
+ * For readq() and writeq() on 32-bit builds, the hi-lo, lo-hi order is
+ * irrelevant.
+ */
+#include <asm-generic/io-64-nonatomic-hi-lo.h>
+
 static bool force_enable_dimms;
 module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status");
@@ -72,7 +80,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 
 		if (!adev)
 			return -ENOTTY;
-		dimm_name = dev_name(&adev->dev);
+		dimm_name = nvdimm_name(nvdimm);
 		cmd_name = nvdimm_cmd_name(cmd);
 		dsm_mask = nfit_mem->dsm_mask;
 		desc = nd_cmd_dimm_desc(cmd);
@@ -279,6 +287,23 @@ static bool add_bdw(struct acpi_nfit_desc *acpi_desc,
 	return true;
 }
 
+static bool add_idt(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_interleave *idt)
+{
+	struct device *dev = acpi_desc->dev;
+	struct nfit_idt *nfit_idt = devm_kzalloc(dev, sizeof(*nfit_idt),
+			GFP_KERNEL);
+
+	if (!nfit_idt)
+		return false;
+	INIT_LIST_HEAD(&nfit_idt->list);
+	nfit_idt->idt = idt;
+	list_add_tail(&nfit_idt->list, &acpi_desc->idts);
+	dev_dbg(dev, "%s: idt index: %d num_lines: %d\n", __func__,
+			idt->interleave_index, idt->line_count);
+	return true;
+}
+
 static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table,
 		const void *end)
 {
@@ -307,9 +332,9 @@ static void *add_table(struct acpi_nfit_desc *acpi_desc, void *table,
 		if (!add_bdw(acpi_desc, table))
 			return err;
 		break;
-	/* TODO */
 	case ACPI_NFIT_TYPE_INTERLEAVE:
-		dev_dbg(dev, "%s: idt\n", __func__);
+		if (!add_idt(acpi_desc, table))
+			return err;
 		break;
 	case ACPI_NFIT_TYPE_FLUSH_ADDRESS:
 		dev_dbg(dev, "%s: flush\n", __func__);
@@ -362,8 +387,11 @@ static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc,
 		struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa)
 {
 	u16 dcr = __to_nfit_memdev(nfit_mem)->region_index;
+	struct nfit_memdev *nfit_memdev;
 	struct nfit_dcr *nfit_dcr;
 	struct nfit_bdw *nfit_bdw;
+	struct nfit_idt *nfit_idt;
+	u16 idt_idx, range_index;
 
 	list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) {
 		if (nfit_dcr->dcr->region_index != dcr)
@@ -396,6 +424,26 @@ static int nfit_mem_add(struct acpi_nfit_desc *acpi_desc,
 		return 0;
 
 	nfit_mem_find_spa_bdw(acpi_desc, nfit_mem);
+
+	if (!nfit_mem->spa_bdw)
+		return 0;
+
+	range_index = nfit_mem->spa_bdw->range_index;
+	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+		if (nfit_memdev->memdev->range_index != range_index ||
+				nfit_memdev->memdev->region_index != dcr)
+			continue;
+		nfit_mem->memdev_bdw = nfit_memdev->memdev;
+		idt_idx = nfit_memdev->memdev->interleave_index;
+		list_for_each_entry(nfit_idt, &acpi_desc->idts, list) {
+			if (nfit_idt->idt->interleave_index != idt_idx)
+				continue;
+			nfit_mem->idt_bdw = nfit_idt->idt;
+			break;
+		}
+		break;
+	}
+
 	return 0;
 }
 
@@ -439,9 +487,19 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
 		}
 
 		if (type == NFIT_SPA_DCR) {
+			struct nfit_idt *nfit_idt;
+			u16 idt_idx;
+
 			/* multiple dimms may share a SPA when interleaved */
 			nfit_mem->spa_dcr = spa;
 			nfit_mem->memdev_dcr = nfit_memdev->memdev;
+			idt_idx = nfit_memdev->memdev->interleave_index;
+			list_for_each_entry(nfit_idt, &acpi_desc->idts, list) {
+				if (nfit_idt->idt->interleave_index != idt_idx)
+					continue;
+				nfit_mem->idt_dcr = nfit_idt->idt;
+				break;
+			}
 		} else {
 			/*
 			 * A single dimm may belong to multiple SPA-PM
@@ -871,6 +929,359 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 	return 0;
 }
 
+static u64 to_interleave_offset(u64 offset, struct nfit_blk_mmio *mmio)
+{
+	struct acpi_nfit_interleave *idt = mmio->idt;
+	u32 sub_line_offset, line_index, line_offset;
+	u64 line_no, table_skip_count, table_offset;
+
+	line_no = div_u64_rem(offset, mmio->line_size, &sub_line_offset);
+	table_skip_count = div_u64_rem(line_no, mmio->num_lines, &line_index);
+	line_offset = idt->line_offset[line_index]
+		* mmio->line_size;
+	table_offset = table_skip_count * mmio->table_size;
+
+	return mmio->base_offset + line_offset + table_offset + sub_line_offset;
+}
+
+static u64 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw)
+{
+	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
+	u64 offset = nfit_blk->stat_offset + mmio->size * bw;
+
+	if (mmio->num_lines)
+		offset = to_interleave_offset(offset, mmio);
+
+	return readq(mmio->base + offset);
+}
+
+static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
+		resource_size_t dpa, unsigned int len, unsigned int write)
+{
+	u64 cmd, offset;
+	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
+
+	enum {
+		BCW_OFFSET_MASK = (1ULL << 48)-1,
+		BCW_LEN_SHIFT = 48,
+		BCW_LEN_MASK = (1ULL << 8) - 1,
+		BCW_CMD_SHIFT = 56,
+	};
+
+	cmd = (dpa >> L1_CACHE_SHIFT) & BCW_OFFSET_MASK;
+	len = len >> L1_CACHE_SHIFT;
+	cmd |= ((u64) len & BCW_LEN_MASK) << BCW_LEN_SHIFT;
+	cmd |= ((u64) write) << BCW_CMD_SHIFT;
+
+	offset = nfit_blk->cmd_offset + mmio->size * bw;
+	if (mmio->num_lines)
+		offset = to_interleave_offset(offset, mmio);
+
+	writeq(cmd, mmio->base + offset);
+	/* FIXME: conditionally perform read-back if mandated by firmware */
+}
+
+static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
+		resource_size_t dpa, void *iobuf, size_t len, int rw,
+		unsigned int lane)
+{
+	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
+	unsigned int copied = 0;
+	u64 base_offset;
+	int rc;
+
+	base_offset = nfit_blk->bdw_offset + dpa % L1_CACHE_BYTES
+		+ lane * mmio->size;
+	/* TODO: non-temporal access, flush hints, cache management etc... */
+	write_blk_ctl(nfit_blk, lane, dpa, len, rw);
+	while (len) {
+		unsigned int c;
+		u64 offset;
+
+		if (mmio->num_lines) {
+			u32 line_offset;
+
+			offset = to_interleave_offset(base_offset + copied,
+					mmio);
+			div_u64_rem(offset, mmio->line_size, &line_offset);
+			c = min_t(size_t, len, mmio->line_size - line_offset);
+		} else {
+			offset = base_offset + nfit_blk->bdw_offset;
+			c = len;
+		}
+
+		if (rw)
+			memcpy(mmio->aperture + offset, iobuf + copied, c);
+		else
+			memcpy(iobuf + copied, mmio->aperture + offset, c);
+
+		copied += c;
+		len -= c;
+	}
+	rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
+	return rc;
+}
+
+static int acpi_nfit_blk_region_do_io(struct nd_blk_region *ndbr,
+		resource_size_t dpa, void *iobuf, u64 len, int rw)
+{
+	struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
+	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
+	struct nd_region *nd_region = nfit_blk->nd_region;
+	unsigned int lane, copied = 0;
+	int rc = 0;
+
+	lane = nd_region_acquire_lane(nd_region);
+	while (len) {
+		u64 c = min(len, mmio->size);
+
+		rc = acpi_nfit_blk_single_io(nfit_blk, dpa + copied,
+				iobuf + copied, c, rw, lane);
+		if (rc)
+			break;
+
+		copied += c;
+		len -= c;
+	}
+	nd_region_release_lane(nd_region, lane);
+
+	return rc;
+}
+
+static void nfit_spa_mapping_release(struct kref *kref)
+{
+	struct nfit_spa_mapping *spa_map = to_spa_map(kref);
+	struct acpi_nfit_system_address *spa = spa_map->spa;
+	struct acpi_nfit_desc *acpi_desc = spa_map->acpi_desc;
+
+	WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
+	dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index);
+	iounmap(spa_map->iomem);
+	release_mem_region(spa->address, spa->length);
+	list_del(&spa_map->list);
+	kfree(spa_map);
+}
+
+static struct nfit_spa_mapping *find_spa_mapping(
+		struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_system_address *spa)
+{
+	struct nfit_spa_mapping *spa_map;
+
+	WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
+	list_for_each_entry(spa_map, &acpi_desc->spa_maps, list)
+		if (spa_map->spa == spa)
+			return spa_map;
+
+	return NULL;
+}
+
+static void nfit_spa_unmap(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_system_address *spa)
+{
+	struct nfit_spa_mapping *spa_map;
+
+	mutex_lock(&acpi_desc->spa_map_mutex);
+	spa_map = find_spa_mapping(acpi_desc, spa);
+
+	if (spa_map)
+		kref_put(&spa_map->kref, nfit_spa_mapping_release);
+	mutex_unlock(&acpi_desc->spa_map_mutex);
+}
+
+static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_system_address *spa)
+{
+	resource_size_t start = spa->address;
+	resource_size_t n = spa->length;
+	struct nfit_spa_mapping *spa_map;
+	struct resource *res;
+
+	WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex));
+
+	spa_map = find_spa_mapping(acpi_desc, spa);
+	if (spa_map) {
+		kref_get(&spa_map->kref);
+		return spa_map->iomem;
+	}
+
+	spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL);
+	if (!spa_map)
+		return NULL;
+
+	INIT_LIST_HEAD(&spa_map->list);
+	spa_map->spa = spa;
+	kref_init(&spa_map->kref);
+	spa_map->acpi_desc = acpi_desc;
+
+	res = request_mem_region(start, n, dev_name(acpi_desc->dev));
+	if (!res)
+		goto err_mem;
+
+	/* TODO: cacheability based on the spa type */
+	spa_map->iomem = ioremap_nocache(start, n);
+	if (!spa_map->iomem)
+		goto err_map;
+
+	list_add_tail(&spa_map->list, &acpi_desc->spa_maps);
+	return spa_map->iomem;
+
+ err_map:
+	release_mem_region(start, n);
+ err_mem:
+	kfree(spa_map);
+	return NULL;
+}
+
+/**
+ * nfit_spa_map - interleave-aware managed-mappings of acpi_nfit_system_address ranges
+ * @nvdimm_bus: NFIT-bus that provided the spa table entry
+ * @nfit_spa: spa table to map
+ *
+ * In the case where block-data-window apertures and
+ * dimm-control-regions are interleaved they will end up sharing a
+ * single request_mem_region() + ioremap() for the address range.  In
+ * the style of devm nfit_spa_map() mappings are automatically dropped
+ * when all region devices referencing the same mapping are disabled /
+ * unbound.
+ */
+static void __iomem *nfit_spa_map(struct acpi_nfit_desc *acpi_desc,
+		struct acpi_nfit_system_address *spa)
+{
+	void __iomem *iomem;
+
+	mutex_lock(&acpi_desc->spa_map_mutex);
+	iomem = __nfit_spa_map(acpi_desc, spa);
+	mutex_unlock(&acpi_desc->spa_map_mutex);
+
+	return iomem;
+}
+
+static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio,
+		struct acpi_nfit_interleave *idt, u16 interleave_ways)
+{
+	if (idt) {
+		mmio->num_lines = idt->line_count;
+		mmio->line_size = idt->line_size;
+		if (interleave_ways == 0)
+			return -ENXIO;
+		mmio->table_size = mmio->num_lines * interleave_ways
+			* mmio->line_size;
+	}
+
+	return 0;
+}
+
+static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
+		struct device *dev)
+{
+	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+	struct nd_blk_region *ndbr = to_nd_blk_region(dev);
+	struct nfit_blk_mmio *mmio;
+	struct nfit_blk *nfit_blk;
+	struct nfit_mem *nfit_mem;
+	struct nvdimm *nvdimm;
+	int rc;
+
+	nvdimm = nd_blk_region_to_dimm(ndbr);
+	nfit_mem = nvdimm_provider_data(nvdimm);
+	if (!nfit_mem || !nfit_mem->dcr || !nfit_mem->bdw) {
+		dev_dbg(dev, "%s: missing%s%s%s\n", __func__,
+				nfit_mem ? "" : " nfit_mem",
+				nfit_mem->dcr ? "" : " dcr",
+				nfit_mem->bdw ? "" : " bdw");
+		return -ENXIO;
+	}
+
+	nfit_blk = devm_kzalloc(dev, sizeof(*nfit_blk), GFP_KERNEL);
+	if (!nfit_blk)
+		return -ENOMEM;
+	nd_blk_region_set_provider_data(ndbr, nfit_blk);
+	nfit_blk->nd_region = to_nd_region(dev);
+
+	/* map block aperture memory */
+	nfit_blk->bdw_offset = nfit_mem->bdw->offset;
+	mmio = &nfit_blk->mmio[BDW];
+	mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw);
+	if (!mmio->base) {
+		dev_dbg(dev, "%s: %s failed to map bdw\n", __func__,
+				nvdimm_name(nvdimm));
+		return -ENOMEM;
+	}
+	mmio->size = nfit_mem->bdw->size;
+	mmio->base_offset = nfit_mem->memdev_bdw->region_offset;
+	mmio->idt = nfit_mem->idt_bdw;
+	mmio->spa = nfit_mem->spa_bdw;
+	rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_bdw,
+			nfit_mem->memdev_bdw->interleave_ways);
+	if (rc) {
+		dev_dbg(dev, "%s: %s failed to init bdw interleave\n",
+				__func__, nvdimm_name(nvdimm));
+		return rc;
+	}
+
+	/* map block control memory */
+	nfit_blk->cmd_offset = nfit_mem->dcr->command_offset;
+	nfit_blk->stat_offset = nfit_mem->dcr->status_offset;
+	mmio = &nfit_blk->mmio[DCR];
+	mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr);
+	if (!mmio->base) {
+		dev_dbg(dev, "%s: %s failed to map dcr\n", __func__,
+				nvdimm_name(nvdimm));
+		return -ENOMEM;
+	}
+	mmio->size = nfit_mem->dcr->window_size;
+	mmio->base_offset = nfit_mem->memdev_dcr->region_offset;
+	mmio->idt = nfit_mem->idt_dcr;
+	mmio->spa = nfit_mem->spa_dcr;
+	rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_dcr,
+			nfit_mem->memdev_dcr->interleave_ways);
+	if (rc) {
+		dev_dbg(dev, "%s: %s failed to init dcr interleave\n",
+				__func__, nvdimm_name(nvdimm));
+		return rc;
+	}
+
+	if (mmio->line_size == 0)
+		return 0;
+
+	if ((u32) nfit_blk->cmd_offset % mmio->line_size
+			+ 8 > mmio->line_size) {
+		dev_dbg(dev, "cmd_offset crosses interleave boundary\n");
+		return -ENXIO;
+	} else if ((u32) nfit_blk->stat_offset % mmio->line_size
+			+ 8 > mmio->line_size) {
+		dev_dbg(dev, "stat_offset crosses interleave boundary\n");
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
+		struct device *dev)
+{
+	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
+	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
+	struct nd_blk_region *ndbr = to_nd_blk_region(dev);
+	struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
+	int i;
+
+	if (!nfit_blk)
+		return; /* never enabled */
+
+	/* auto-free BLK spa mappings */
+	for (i = 0; i < 2; i++) {
+		struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i];
+
+		if (mmio->base)
+			nfit_spa_unmap(acpi_desc, mmio->spa);
+	}
+	nd_blk_region_set_provider_data(ndbr, NULL);
+	/* devm will free nfit_blk */
+}
+
 static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 		struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
 		struct acpi_nfit_memory_map *memdev,
@@ -878,6 +1289,7 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 {
 	struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc,
 			memdev->device_handle);
+	struct nd_blk_region_desc *ndbr_desc;
 	struct nfit_mem *nfit_mem;
 	int blk_valid = 0;
 
@@ -908,6 +1320,10 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 
 		ndr_desc->nd_mapping = nd_mapping;
 		ndr_desc->num_mappings = blk_valid;
+		ndbr_desc = to_blk_region_desc(ndr_desc);
+		ndbr_desc->enable = acpi_nfit_blk_region_enable;
+		ndbr_desc->disable = acpi_nfit_blk_region_disable;
+		ndbr_desc->do_io = acpi_nfit_blk_region_do_io;
 		if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc))
 			return -ENOMEM;
 		break;
@@ -921,8 +1337,9 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 {
 	static struct nd_mapping nd_mappings[ND_MAX_MAPPINGS];
 	struct acpi_nfit_system_address *spa = nfit_spa->spa;
+	struct nd_blk_region_desc ndbr_desc;
+	struct nd_region_desc *ndr_desc;
 	struct nfit_memdev *nfit_memdev;
-	struct nd_region_desc ndr_desc;
 	struct nvdimm_bus *nvdimm_bus;
 	struct resource res;
 	int count = 0, rc;
@@ -935,12 +1352,13 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 
 	memset(&res, 0, sizeof(res));
 	memset(&nd_mappings, 0, sizeof(nd_mappings));
-	memset(&ndr_desc, 0, sizeof(ndr_desc));
+	memset(&ndbr_desc, 0, sizeof(ndbr_desc));
 	res.start = spa->address;
 	res.end = res.start + spa->length - 1;
-	ndr_desc.res = &res;
-	ndr_desc.provider_data = nfit_spa;
-	ndr_desc.attr_groups = acpi_nfit_region_attribute_groups;
+	ndr_desc = &ndbr_desc.ndr_desc;
+	ndr_desc->res = &res;
+	ndr_desc->provider_data = nfit_spa;
+	ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
 	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
 		struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
 		struct nd_mapping *nd_mapping;
@@ -953,24 +1371,24 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 			return -ENXIO;
 		}
 		nd_mapping = &nd_mappings[count++];
-		rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, &ndr_desc,
+		rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc,
 				memdev, spa);
 		if (rc)
 			return rc;
 	}
 
-	ndr_desc.nd_mapping = nd_mappings;
-	ndr_desc.num_mappings = count;
-	rc = acpi_nfit_init_interleave_set(acpi_desc, &ndr_desc, spa);
+	ndr_desc->nd_mapping = nd_mappings;
+	ndr_desc->num_mappings = count;
+	rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa);
 	if (rc)
 		return rc;
 
 	nvdimm_bus = acpi_desc->nvdimm_bus;
 	if (nfit_spa_type(spa) == NFIT_SPA_PM) {
-		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
+		if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
 			return -ENOMEM;
 	} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
-		if (!nvdimm_volatile_region_create(nvdimm_bus, &ndr_desc))
+		if (!nvdimm_volatile_region_create(nvdimm_bus, ndr_desc))
 			return -ENOMEM;
 	}
 	return 0;
@@ -996,11 +1414,14 @@ static int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 	u8 *data;
 	int rc;
 
+	INIT_LIST_HEAD(&acpi_desc->spa_maps);
 	INIT_LIST_HEAD(&acpi_desc->spas);
 	INIT_LIST_HEAD(&acpi_desc->dcrs);
 	INIT_LIST_HEAD(&acpi_desc->bdws);
+	INIT_LIST_HEAD(&acpi_desc->idts);
 	INIT_LIST_HEAD(&acpi_desc->memdevs);
 	INIT_LIST_HEAD(&acpi_desc->dimms);
+	mutex_init(&acpi_desc->spa_map_mutex);
 
 	data = (u8 *) acpi_desc->nfit;
 	end = data + sz;
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index b76e33629098..7bd38b7baf39 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -52,6 +52,11 @@ struct nfit_bdw {
 	struct list_head list;
 };
 
+struct nfit_idt {
+	struct acpi_nfit_interleave *idt;
+	struct list_head list;
+};
+
 struct nfit_memdev {
 	struct acpi_nfit_memory_map *memdev;
 	struct list_head list;
@@ -62,10 +67,13 @@ struct nfit_mem {
 	struct nvdimm *nvdimm;
 	struct acpi_nfit_memory_map *memdev_dcr;
 	struct acpi_nfit_memory_map *memdev_pmem;
+	struct acpi_nfit_memory_map *memdev_bdw;
 	struct acpi_nfit_control_region *dcr;
 	struct acpi_nfit_data_region *bdw;
 	struct acpi_nfit_system_address *spa_dcr;
 	struct acpi_nfit_system_address *spa_bdw;
+	struct acpi_nfit_interleave *idt_dcr;
+	struct acpi_nfit_interleave *idt_bdw;
 	struct list_head list;
 	struct acpi_device *adev;
 	unsigned long dsm_mask;
@@ -74,16 +82,57 @@ struct nfit_mem {
 struct acpi_nfit_desc {
 	struct nvdimm_bus_descriptor nd_desc;
 	struct acpi_table_nfit *nfit;
+	struct mutex spa_map_mutex;
+	struct list_head spa_maps;
 	struct list_head memdevs;
 	struct list_head dimms;
 	struct list_head spas;
 	struct list_head dcrs;
 	struct list_head bdws;
+	struct list_head idts;
 	struct nvdimm_bus *nvdimm_bus;
 	struct device *dev;
 	unsigned long dimm_dsm_force_en;
 };
 
+enum nd_blk_mmio_selector {
+	BDW,
+	DCR,
+};
+
+struct nfit_blk {
+	struct nfit_blk_mmio {
+		union {
+			void __iomem *base;
+			void *aperture;
+		};
+		u64 size;
+		u64 base_offset;
+		u32 line_size;
+		u32 num_lines;
+		u32 table_size;
+		struct acpi_nfit_interleave *idt;
+		struct acpi_nfit_system_address *spa;
+	} mmio[2];
+	struct nd_region *nd_region;
+	u64 bdw_offset; /* post interleave offset */
+	u64 stat_offset;
+	u64 cmd_offset;
+};
+
+struct nfit_spa_mapping {
+	struct acpi_nfit_desc *acpi_desc;
+	struct acpi_nfit_system_address *spa;
+	struct list_head list;
+	struct kref kref;
+	void __iomem *iomem;
+};
+
+static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref)
+{
+	return container_of(kref, struct nfit_spa_mapping, kref);
+}
+
 static inline struct acpi_nfit_memory_map *__to_nfit_memdev(
 		struct nfit_mem *nfit_mem)
 {
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 204ee0796411..72226acb5c0f 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -34,6 +34,19 @@ config BLK_DEV_PMEM
 
 	  Say Y if you want to use an NVDIMM
 
+config ND_BLK
+	tristate "BLK: Block data window (aperture) device support"
+	default LIBNVDIMM
+	select ND_BTT if BTT
+	help
+	  Support NVDIMMs, or other devices, that implement a BLK-mode
+	  access capability.  BLK-mode access uses memory-mapped-i/o
+	  apertures to access persistent media.
+
+	  Say Y if your platform firmware emits an ACPI.NFIT table
+	  (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode
+	  capabilities.
+
 config ND_BTT
 	tristate
 
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index d2aab6c58492..594bb97c867a 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -1,11 +1,14 @@
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
 obj-$(CONFIG_ND_BTT) += nd_btt.o
+obj-$(CONFIG_ND_BLK) += nd_blk.o
 
 nd_pmem-y := pmem.o
 
 nd_btt-y := btt.o
 
+nd_blk-y := blk.o
+
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
new file mode 100644
index 000000000000..9ac0c266c15c
--- /dev/null
+++ b/drivers/nvdimm/blk.c
@@ -0,0 +1,245 @@
+/*
+ * NVDIMM Block Window Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/nd.h>
+#include <linux/sizes.h>
+#include "nd.h"
+
+struct nd_blk_device {
+	struct request_queue *queue;
+	struct gendisk *disk;
+	struct nd_namespace_blk *nsblk;
+	struct nd_blk_region *ndbr;
+	size_t disk_size;
+};
+
+static int nd_blk_major;
+
+static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
+				resource_size_t ns_offset, unsigned int len)
+{
+	int i;
+
+	for (i = 0; i < nsblk->num_resources; i++) {
+		if (ns_offset < resource_size(nsblk->res[i])) {
+			if (ns_offset + len > resource_size(nsblk->res[i])) {
+				dev_WARN_ONCE(&nsblk->common.dev, 1,
+					"illegal request\n");
+				return SIZE_MAX;
+			}
+			return nsblk->res[i]->start + ns_offset;
+		}
+		ns_offset -= resource_size(nsblk->res[i]);
+	}
+
+	dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n");
+	return SIZE_MAX;
+}
+
+static void nd_blk_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct block_device *bdev = bio->bi_bdev;
+	struct gendisk *disk = bdev->bd_disk;
+	struct nd_namespace_blk *nsblk;
+	struct nd_blk_device *blk_dev;
+	struct nd_blk_region *ndbr;
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	int err = 0, rw;
+
+	blk_dev = disk->private_data;
+	nsblk = blk_dev->nsblk;
+	ndbr = blk_dev->ndbr;
+	rw = bio_data_dir(bio);
+	bio_for_each_segment(bvec, bio, iter) {
+		unsigned int len = bvec.bv_len;
+		resource_size_t	dev_offset;
+		void *iobuf;
+
+		BUG_ON(len > PAGE_SIZE);
+
+		dev_offset = to_dev_offset(nsblk,
+				iter.bi_sector << SECTOR_SHIFT, len);
+		if (dev_offset == SIZE_MAX) {
+			err = -EIO;
+			goto out;
+		}
+
+		iobuf = kmap_atomic(bvec.bv_page);
+		err = ndbr->do_io(ndbr, dev_offset, iobuf + bvec.bv_offset,
+				len, rw);
+		kunmap_atomic(iobuf);
+		if (err)
+			goto out;
+	}
+
+ out:
+	bio_endio(bio, err);
+}
+
+static int nd_blk_rw_bytes(struct nd_namespace_common *ndns,
+		resource_size_t offset, void *iobuf, size_t n, int rw)
+{
+	struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim);
+	struct nd_namespace_blk *nsblk = blk_dev->nsblk;
+	struct nd_blk_region *ndbr = blk_dev->ndbr;
+	resource_size_t	dev_offset;
+
+	dev_offset = to_dev_offset(nsblk, offset, n);
+
+	if (unlikely(offset + n > blk_dev->disk_size)) {
+		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+		return -EFAULT;
+	}
+
+	if (dev_offset == SIZE_MAX)
+		return -EIO;
+
+	return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw);
+}
+
+static const struct block_device_operations nd_blk_fops = {
+	.owner = THIS_MODULE,
+};
+
+static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
+		struct nd_blk_device *blk_dev)
+{
+	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev);
+	struct gendisk *disk;
+
+	blk_dev->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!blk_dev->queue)
+		return -ENOMEM;
+
+	blk_queue_make_request(blk_dev->queue, nd_blk_make_request);
+	blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX);
+	blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY);
+	blk_queue_logical_block_size(blk_dev->queue, nsblk->lbasize);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue);
+
+	disk = blk_dev->disk = alloc_disk(0);
+	if (!disk) {
+		blk_cleanup_queue(blk_dev->queue);
+		return -ENOMEM;
+	}
+
+	disk->driverfs_dev	= &ndns->dev;
+	disk->major		= nd_blk_major;
+	disk->first_minor	= 0;
+	disk->fops		= &nd_blk_fops;
+	disk->private_data	= blk_dev;
+	disk->queue		= blk_dev->queue;
+	disk->flags		= GENHD_FL_EXT_DEVT;
+	nvdimm_namespace_disk_name(ndns, disk->disk_name);
+	set_capacity(disk, blk_dev->disk_size >> SECTOR_SHIFT);
+	add_disk(disk);
+
+	return 0;
+}
+
+static int nd_blk_probe(struct device *dev)
+{
+	struct nd_namespace_common *ndns;
+	struct nd_blk_device *blk_dev;
+	int rc;
+
+	ndns = nvdimm_namespace_common_probe(dev);
+	if (IS_ERR(ndns))
+		return PTR_ERR(ndns);
+
+	blk_dev = kzalloc(sizeof(*blk_dev), GFP_KERNEL);
+	if (!blk_dev)
+		return -ENOMEM;
+
+	blk_dev->disk_size = nvdimm_namespace_capacity(ndns);
+	blk_dev->ndbr = to_nd_blk_region(dev->parent);
+	blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev);
+	dev_set_drvdata(dev, blk_dev);
+
+	ndns->rw_bytes = nd_blk_rw_bytes;
+	if (is_nd_btt(dev))
+		rc = nvdimm_namespace_attach_btt(ndns);
+	else if (nd_btt_probe(ndns, blk_dev) == 0) {
+		/* we'll come back as btt-blk */
+		rc = -ENXIO;
+	} else
+		rc = nd_blk_attach_disk(ndns, blk_dev);
+	if (rc)
+		kfree(blk_dev);
+	return rc;
+}
+
+static void nd_blk_detach_disk(struct nd_blk_device *blk_dev)
+{
+	del_gendisk(blk_dev->disk);
+	put_disk(blk_dev->disk);
+	blk_cleanup_queue(blk_dev->queue);
+}
+
+static int nd_blk_remove(struct device *dev)
+{
+	struct nd_blk_device *blk_dev = dev_get_drvdata(dev);
+
+	if (is_nd_btt(dev))
+		nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
+	else
+		nd_blk_detach_disk(blk_dev);
+	kfree(blk_dev);
+
+	return 0;
+}
+
+static struct nd_device_driver nd_blk_driver = {
+	.probe = nd_blk_probe,
+	.remove = nd_blk_remove,
+	.drv = {
+		.name = "nd_blk",
+	},
+	.type = ND_DRIVER_NAMESPACE_BLK,
+};
+
+static int __init nd_blk_init(void)
+{
+	int rc;
+
+	rc = register_blkdev(0, "nd_blk");
+	if (rc < 0)
+		return rc;
+
+	nd_blk_major = rc;
+	rc = nd_driver_register(&nd_blk_driver);
+
+	if (rc < 0)
+		unregister_blkdev(nd_blk_major, "nd_blk");
+
+	return rc;
+}
+
+static void __exit nd_blk_exit(void)
+{
+	driver_unregister(&nd_blk_driver.drv);
+	unregister_blkdev(nd_blk_major, "nd_blk");
+}
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK);
+module_init(nd_blk_init);
+module_exit(nd_blk_exit);
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 83b179ed6d61..c05eb807d674 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -209,6 +209,15 @@ struct nvdimm *to_nvdimm(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(to_nvdimm);
 
+struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr)
+{
+	struct nd_region *nd_region = &ndbr->nd_region;
+	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+
+	return nd_mapping->nvdimm;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm);
+
 struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping)
 {
 	struct nvdimm *nvdimm = nd_mapping->nvdimm;
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 4aa647c8d644..1ce1e70de44a 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -173,6 +173,65 @@ static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
 	return size;
 }
 
+static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
+{
+	struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
+	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	struct nd_label_id label_id;
+	struct resource *res;
+	int count, i;
+
+	if (!nsblk->uuid || !nsblk->lbasize || !ndd)
+		return false;
+
+	count = 0;
+	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+	for_each_dpa_resource(ndd, res) {
+		if (strcmp(res->name, label_id.id) != 0)
+			continue;
+		/*
+		 * Resources with unacknoweldged adjustments indicate a
+		 * failure to update labels
+		 */
+		if (res->flags & DPA_RESOURCE_ADJUSTED)
+			return false;
+		count++;
+	}
+
+	/* These values match after a successful label update */
+	if (count != nsblk->num_resources)
+		return false;
+
+	for (i = 0; i < nsblk->num_resources; i++) {
+		struct resource *found = NULL;
+
+		for_each_dpa_resource(ndd, res)
+			if (res == nsblk->res[i]) {
+				found = res;
+				break;
+			}
+		/* stale resource */
+		if (!found)
+			return false;
+	}
+
+	return true;
+}
+
+resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
+{
+	resource_size_t size;
+
+	nvdimm_bus_lock(&nsblk->common.dev);
+	size = __nd_namespace_blk_validate(nsblk);
+	nvdimm_bus_unlock(&nsblk->common.dev);
+
+	return size;
+}
+EXPORT_SYMBOL(nd_namespace_blk_validate);
+
+
 static int nd_namespace_label_update(struct nd_region *nd_region,
 		struct device *dev)
 {
@@ -1224,7 +1283,11 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
 			return ERR_PTR(-ENODEV);
 		}
 	} else if (is_namespace_blk(&ndns->dev)) {
-		return ERR_PTR(-ENODEV); /* TODO */
+		struct nd_namespace_blk *nsblk;
+
+		nsblk = to_nd_namespace_blk(&ndns->dev);
+		if (!nd_namespace_blk_validate(nsblk))
+			return ERR_PTR(-ENODEV);
 	}
 
 	return ndns;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 5e6413964776..e1970c71ad1c 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -43,9 +43,8 @@ struct nvdimm {
 };
 
 bool is_nvdimm(struct device *dev);
-bool is_nd_blk(struct device *dev);
 bool is_nd_pmem(struct device *dev);
-struct nd_btt;
+bool is_nd_blk(struct device *dev);
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 1b937c235913..f153f43ca3d6 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -102,6 +102,15 @@ struct nd_region {
 	struct nd_mapping mapping[0];
 };
 
+struct nd_blk_region {
+	int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+	void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+	int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
+			void *iobuf, u64 len, int rw);
+	void *blk_provider_data;
+	struct nd_region nd_region;
+};
+
 /*
  * Lookup next in the repeating sequence of 01, 10, and 11.
  */
@@ -171,8 +180,6 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 
 #endif
 struct nd_region *to_nd_region(struct device *dev);
-unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
-void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 int nd_region_to_nstype(struct nd_region *nd_region);
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
 u64 nd_region_interleave_set_cookie(struct nd_region *nd_region);
@@ -192,4 +199,6 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
 int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
 const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 		char *name);
+int nd_blk_region_init(struct nd_region *nd_region);
+resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
 #endif /* __ND_H__ */
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index eb8aebcd4800..f28f78ccff19 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -18,11 +18,10 @@
 
 static int nd_region_probe(struct device *dev)
 {
-	int err;
+	int err, rc;
 	static unsigned long once;
 	struct nd_region_namespaces *num_ns;
 	struct nd_region *nd_region = to_nd_region(dev);
-	int rc = nd_region_register_namespaces(nd_region, &err);
 
 	if (nd_region->num_lanes > num_online_cpus()
 			&& nd_region->num_lanes < num_possible_cpus()
@@ -34,6 +33,11 @@ static int nd_region_probe(struct device *dev)
 				nd_region->num_lanes);
 	}
 
+	rc = nd_blk_region_init(nd_region);
+	if (rc)
+		return rc;
+
+	rc = nd_region_register_namespaces(nd_region, &err);
 	num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL);
 	if (!num_ns)
 		return -ENOMEM;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index fe8ec21fe3d7..2cfb3f74bcbf 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -11,6 +11,7 @@
  * General Public License for more details.
  */
 #include <linux/scatterlist.h>
+#include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/sort.h>
@@ -34,7 +35,10 @@ static void nd_region_release(struct device *dev)
 	}
 	free_percpu(nd_region->lane);
 	ida_simple_remove(&region_ida, nd_region->id);
-	kfree(nd_region);
+	if (is_nd_blk(dev))
+		kfree(to_nd_blk_region(dev));
+	else
+		kfree(nd_region);
 }
 
 static struct device_type nd_blk_device_type = {
@@ -71,6 +75,33 @@ struct nd_region *to_nd_region(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(to_nd_region);
 
+struct nd_blk_region *to_nd_blk_region(struct device *dev)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	WARN_ON(!is_nd_blk(dev));
+	return container_of(nd_region, struct nd_blk_region, nd_region);
+}
+EXPORT_SYMBOL_GPL(to_nd_blk_region);
+
+void *nd_region_provider_data(struct nd_region *nd_region)
+{
+	return nd_region->provider_data;
+}
+EXPORT_SYMBOL_GPL(nd_region_provider_data);
+
+void *nd_blk_region_provider_data(struct nd_blk_region *ndbr)
+{
+	return ndbr->blk_provider_data;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_provider_data);
+
+void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data)
+{
+	ndbr->blk_provider_data = data;
+}
+EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
+
 /**
  * nd_region_to_nstype() - region to an integer namespace type
  * @nd_region: region-device to interrogate
@@ -365,7 +396,8 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region)
 /*
  * Upon successful probe/remove, take/release a reference on the
  * associated interleave set (if present), and plant new btt + namespace
- * seeds.
+ * seeds.  Also, on the removal of a BLK region, notify the provider to
+ * disable the region.
  */
 static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 		struct device *dev, bool probe)
@@ -385,8 +417,14 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 			nd_mapping->labels = NULL;
 			put_ndd(ndd);
 			nd_mapping->ndd = NULL;
-			atomic_dec(&nvdimm->busy);
+			if (ndd)
+				atomic_dec(&nvdimm->busy);
 		}
+
+		if (is_nd_pmem(dev))
+			return;
+
+		to_nd_blk_region(dev)->disable(nvdimm_bus, dev);
 	}
 	if (dev->parent && is_nd_blk(dev->parent) && probe) {
 		nd_region = to_nd_region(dev->parent);
@@ -526,11 +564,21 @@ struct attribute_group nd_mapping_attribute_group = {
 };
 EXPORT_SYMBOL_GPL(nd_mapping_attribute_group);
 
-void *nd_region_provider_data(struct nd_region *nd_region)
+int nd_blk_region_init(struct nd_region *nd_region)
 {
-	return nd_region->provider_data;
+	struct device *dev = &nd_region->dev;
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+
+	if (!is_nd_blk(dev))
+		return 0;
+
+	if (nd_region->ndr_mappings < 1) {
+		dev_err(dev, "invalid BLK region\n");
+		return -ENXIO;
+	}
+
+	return to_nd_blk_region(dev)->enable(nvdimm_bus, dev);
 }
-EXPORT_SYMBOL_GPL(nd_region_provider_data);
 
 /**
  * nd_region_acquire_lane - allocate and lock a lane
@@ -591,6 +639,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 {
 	struct nd_region *nd_region;
 	struct device *dev;
+	void *region_buf;
 	unsigned int i;
 
 	for (i = 0; i < ndr_desc->num_mappings; i++) {
@@ -605,10 +654,30 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 		}
 	}
 
-	nd_region = kzalloc(sizeof(struct nd_region)
-			+ sizeof(struct nd_mapping) * ndr_desc->num_mappings,
-			GFP_KERNEL);
-	if (!nd_region)
+	if (dev_type == &nd_blk_device_type) {
+		struct nd_blk_region_desc *ndbr_desc;
+		struct nd_blk_region *ndbr;
+
+		ndbr_desc = to_blk_region_desc(ndr_desc);
+		ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping)
+				* ndr_desc->num_mappings,
+				GFP_KERNEL);
+		if (ndbr) {
+			nd_region = &ndbr->nd_region;
+			ndbr->enable = ndbr_desc->enable;
+			ndbr->disable = ndbr_desc->disable;
+			ndbr->do_io = ndbr_desc->do_io;
+		}
+		region_buf = ndbr;
+	} else {
+		nd_region = kzalloc(sizeof(struct nd_region)
+				+ sizeof(struct nd_mapping)
+				* ndr_desc->num_mappings,
+				GFP_KERNEL);
+		region_buf = nd_region;
+	}
+
+	if (!region_buf)
 		return NULL;
 	nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL);
 	if (nd_region->id < 0)
@@ -654,7 +723,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
  err_percpu:
 	ida_simple_remove(&region_ida, nd_region->id);
  err_id:
-	kfree(nd_region);
+	kfree(region_buf);
 	return NULL;
 }
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 531d99dfac68..7fc1b25bdb5d 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -14,6 +14,7 @@
  */
 #ifndef __LIBNVDIMM_H__
 #define __LIBNVDIMM_H__
+#include <linux/kernel.h>
 #include <linux/sizes.h>
 #include <linux/types.h>
 
@@ -89,8 +90,24 @@ struct nd_region_desc {
 };
 
 struct nvdimm_bus;
-struct device;
 struct module;
+struct device;
+struct nd_blk_region;
+struct nd_blk_region_desc {
+	int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+	void (*disable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
+	int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
+			void *iobuf, u64 len, int rw);
+	struct nd_region_desc ndr_desc;
+};
+
+static inline struct nd_blk_region_desc *to_blk_region_desc(
+		struct nd_region_desc *ndr_desc)
+{
+	return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc);
+
+}
+
 struct nvdimm_bus *__nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc, struct module *module);
 #define nvdimm_bus_register(parent, desc) \
@@ -99,10 +116,10 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
 struct nvdimm_bus *to_nvdimm_bus(struct device *dev);
 struct nvdimm *to_nvdimm(struct device *dev);
 struct nd_region *to_nd_region(struct device *dev);
+struct nd_blk_region *to_nd_blk_region(struct device *dev);
 struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
 const char *nvdimm_name(struct nvdimm *nvdimm);
 void *nvdimm_provider_data(struct nvdimm *nvdimm);
-void *nd_region_provider_data(struct nd_region *nd_region);
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 		const struct attribute_group **groups, unsigned long flags,
 		unsigned long *dsm_mask);
@@ -120,5 +137,11 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc);
 struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc);
+void *nd_region_provider_data(struct nd_region *nd_region);
+void *nd_blk_region_provider_data(struct nd_blk_region *ndbr);
+void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data);
+struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr);
+unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
+void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
 #endif /* __LIBNVDIMM_H__ */
-- 
cgit v1.2.3


From 581388209405902b56d055f644b4dd124a206112 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 23 Jun 2015 20:08:34 -0400
Subject: libnvdimm, nfit: handle unarmed dimms, mark namespaces read-only

Upon detection of an unarmed dimm in a region, arrange for descendant
BTT, PMEM, or BLK instances to be read-only.  A dimm is primarily marked
"unarmed" via flags passed by platform firmware (NFIT).

The flags in the NFIT memory device sub-structure indicate the state of
the data on the nvdimm relative to its energy source or last "flush to
persistence".  For the most part there is nothing the driver can do but
advertise the state of these flags in sysfs and emit a message if
firmware indicates that the contents of the device may be corrupted.
However, for the case of ACPI_NFIT_MEM_ARMED, the driver can arrange for
the block devices incorporating that nvdimm to be marked read-only.
This is a safe default as the data is still available and new writes are
held off until the administrator either forces read-write mode, or the
energy source becomes armed.

A 'read_only' attribute is added to REGION devices to allow for
overriding the default read-only policy of all descendant block devices.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c              | 31 +++++++++++++++++++++++++++++++
 drivers/acpi/nfit.h              |  3 +++
 drivers/nvdimm/blk.c             |  2 ++
 drivers/nvdimm/btt.c             | 10 ++++++++--
 drivers/nvdimm/bus.c             | 18 ++++++++++++++++++
 drivers/nvdimm/nd.h              |  3 ++-
 drivers/nvdimm/pmem.c            |  2 ++
 drivers/nvdimm/region_devs.c     | 29 +++++++++++++++++++++++++++++
 include/linux/libnvdimm.h        |  2 ++
 tools/testing/nvdimm/test/nfit.c |  3 +++
 10 files changed, 100 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 07d630e9f4ae..1f6f1b1a54f4 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -668,6 +668,20 @@ static ssize_t serial_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(serial);
 
+static ssize_t flags_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u16 flags = to_nfit_memdev(dev)->flags;
+
+	return sprintf(buf, "%s%s%s%s%s\n",
+			flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "",
+			flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "",
+			flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "",
+			flags & ACPI_NFIT_MEM_ARMED ? "arm " : "",
+			flags & ACPI_NFIT_MEM_HEALTH_OBSERVED ? "smart " : "");
+}
+static DEVICE_ATTR_RO(flags);
+
 static struct attribute *acpi_nfit_dimm_attributes[] = {
 	&dev_attr_handle.attr,
 	&dev_attr_phys_id.attr,
@@ -676,6 +690,7 @@ static struct attribute *acpi_nfit_dimm_attributes[] = {
 	&dev_attr_format.attr,
 	&dev_attr_serial.attr,
 	&dev_attr_rev_id.attr,
+	&dev_attr_flags.attr,
 	NULL,
 };
 
@@ -768,6 +783,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		struct nvdimm *nvdimm;
 		unsigned long flags = 0;
 		u32 device_handle;
+		u16 mem_flags;
 		int rc;
 
 		device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
@@ -785,6 +801,10 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		if (nfit_mem->bdw && nfit_mem->memdev_pmem)
 			flags |= NDD_ALIASING;
 
+		mem_flags = __to_nfit_memdev(nfit_mem)->flags;
+		if (mem_flags & ACPI_NFIT_MEM_ARMED)
+			flags |= NDD_UNARMED;
+
 		rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle);
 		if (rc)
 			continue;
@@ -797,6 +817,17 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 
 		nfit_mem->nvdimm = nvdimm;
 		dimm_count++;
+
+		if ((mem_flags & ACPI_NFIT_MEM_FAILED_MASK) == 0)
+			continue;
+
+		dev_info(acpi_desc->dev, "%s: failed: %s%s%s%s\n",
+				nvdimm_name(nvdimm),
+			mem_flags & ACPI_NFIT_MEM_SAVE_FAILED ? "save " : "",
+			mem_flags & ACPI_NFIT_MEM_RESTORE_FAILED ? "restore " : "",
+			mem_flags & ACPI_NFIT_MEM_FLUSH_FAILED ? "flush " : "",
+			mem_flags & ACPI_NFIT_MEM_ARMED ? "arm " : "");
+
 	}
 
 	return nvdimm_bus_check_dimm_count(acpi_desc->nvdimm_bus, dimm_count);
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index c62fffea8423..81f2e8c5a79c 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -22,6 +22,9 @@
 
 #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
 #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
+#define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
+		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
+		| ACPI_NFIT_MEM_ARMED)
 
 enum nfit_uuids {
 	NFIT_SPA_VOLATILE,
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 96ef38ceeceb..4f97b248c236 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -232,6 +232,7 @@ static int nd_blk_rw_bytes(struct nd_namespace_common *ndns,
 
 static const struct block_device_operations nd_blk_fops = {
 	.owner = THIS_MODULE,
+	.revalidate_disk = nvdimm_revalidate_disk,
 };
 
 static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
@@ -283,6 +284,7 @@ static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
 	}
 
 	set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
+	revalidate_disk(disk);
 	return 0;
 }
 
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index c02065aed03d..411c7b2bb37a 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1245,6 +1245,7 @@ static const struct block_device_operations btt_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		btt_rw_page,
 	.getgeo =		btt_getgeo,
+	.revalidate_disk =	nvdimm_revalidate_disk,
 };
 
 static int btt_blk_init(struct btt *btt)
@@ -1292,6 +1293,7 @@ static int btt_blk_init(struct btt *btt)
 		}
 	}
 	set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
+	revalidate_disk(btt->btt_disk);
 
 	return 0;
 }
@@ -1346,7 +1348,11 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
 		goto out_free;
 	}
 
-	if (btt->init_state != INIT_READY) {
+	if (btt->init_state != INIT_READY && nd_region->ro) {
+		dev_info(dev, "%s is read-only, unable to init btt metadata\n",
+				dev_name(&nd_region->dev));
+		goto out_free;
+	} else if (btt->init_state != INIT_READY) {
 		btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
 			((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
 		dev_dbg(dev, "init: %d arenas for %llu rawsize\n",
@@ -1361,7 +1367,7 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
 		ret = btt_meta_init(btt);
 		if (ret) {
 			dev_err(dev, "init: error in meta_init: %d\n", ret);
-			return NULL;
+			goto out_free;
 		}
 	}
 
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index dd12f38397db..ec59f1f26d95 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -227,6 +227,24 @@ int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner,
 }
 EXPORT_SYMBOL(__nd_driver_register);
 
+int nvdimm_revalidate_disk(struct gendisk *disk)
+{
+	struct device *dev = disk->driverfs_dev;
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	const char *pol = nd_region->ro ? "only" : "write";
+
+	if (nd_region->ro == get_disk_ro(disk))
+		return 0;
+
+	dev_info(dev, "%s read-%s, marking %s read-%s\n",
+			dev_name(&nd_region->dev), pol, disk->disk_name, pol);
+	set_disk_ro(disk, nd_region->ro);
+
+	return 0;
+
+}
+EXPORT_SYMBOL(nvdimm_revalidate_disk);
+
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 4614b00542d1..48b09a210689 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -97,7 +97,7 @@ struct nd_region {
 	u16 ndr_mappings;
 	u64 ndr_size;
 	u64 ndr_start;
-	int id, num_lanes;
+	int id, num_lanes, ro;
 	void *provider_data;
 	struct nd_interleave_set *nd_set;
 	struct nd_percpu_lane __percpu *lane;
@@ -189,6 +189,7 @@ u64 nd_region_interleave_set_cookie(struct nd_region *nd_region);
 void nvdimm_bus_lock(struct device *dev);
 void nvdimm_bus_unlock(struct device *dev);
 bool is_nvdimm_bus_locked(struct device *dev);
+int nvdimm_revalidate_disk(struct gendisk *disk);
 void nvdimm_drvdata_release(struct kref *kref);
 void put_ndd(struct nvdimm_drvdata *ndd);
 int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index a9709db0704c..42b766f33e59 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -104,6 +104,7 @@ static const struct block_device_operations pmem_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		pmem_rw_page,
 	.direct_access =	pmem_direct_access,
+	.revalidate_disk =	nvdimm_revalidate_disk,
 };
 
 static struct pmem_device *pmem_alloc(struct device *dev,
@@ -178,6 +179,7 @@ static int pmem_attach_disk(struct nd_namespace_common *ndns,
 	pmem->pmem_disk = disk;
 
 	add_disk(disk);
+	revalidate_disk(disk);
 
 	return 0;
 }
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 2cfb3f74bcbf..482ee3e4e04a 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -345,11 +345,35 @@ static ssize_t btt_seed_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(btt_seed);
 
+static ssize_t read_only_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	return sprintf(buf, "%d\n", nd_region->ro);
+}
+
+static ssize_t read_only_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	bool ro;
+	int rc = strtobool(buf, &ro);
+	struct nd_region *nd_region = to_nd_region(dev);
+
+	if (rc)
+		return rc;
+
+	nd_region->ro = ro;
+	return len;
+}
+static DEVICE_ATTR_RW(read_only);
+
 static struct attribute *nd_region_attributes[] = {
 	&dev_attr_size.attr,
 	&dev_attr_nstype.attr,
 	&dev_attr_mappings.attr,
 	&dev_attr_btt_seed.attr,
+	&dev_attr_read_only.attr,
 	&dev_attr_set_cookie.attr,
 	&dev_attr_available_size.attr,
 	&dev_attr_namespace_seed.attr,
@@ -641,6 +665,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	struct device *dev;
 	void *region_buf;
 	unsigned int i;
+	int ro = 0;
 
 	for (i = 0; i < ndr_desc->num_mappings; i++) {
 		struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i];
@@ -652,6 +677,9 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 			return NULL;
 		}
+
+		if (nvdimm->flags & NDD_UNARMED)
+			ro = 1;
 	}
 
 	if (dev_type == &nd_blk_device_type) {
@@ -707,6 +735,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_region->provider_data = ndr_desc->provider_data;
 	nd_region->nd_set = ndr_desc->nd_set;
 	nd_region->num_lanes = ndr_desc->num_lanes;
+	nd_region->ro = ro;
 	ida_init(&nd_region->ns_ida);
 	ida_init(&nd_region->btt_ida);
 	dev = &nd_region->dev;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 7fc1b25bdb5d..dc799a29ed1a 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -21,6 +21,8 @@
 enum {
 	/* when a dimm supports both PMEM and BLK access a label is required */
 	NDD_ALIASING = 1 << 0,
+	/* unarmed memory devices may not persist writes */
+	NDD_UNARMED = 1 << 1,
 
 	/* need to set a limit somewhere, but yes, this is likely overkill */
 	ND_IOCTL_MAX_BUFLEN = SZ_4M,
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 7a4a5a5edbe4..4b69b8368de0 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -874,6 +874,9 @@ static void nfit_test1_setup(struct nfit_test *t)
 	memdev->address = 0;
 	memdev->interleave_index = 0;
 	memdev->interleave_ways = 1;
+	memdev->flags = ACPI_NFIT_MEM_SAVE_FAILED | ACPI_NFIT_MEM_RESTORE_FAILED
+		| ACPI_NFIT_MEM_FLUSH_FAILED | ACPI_NFIT_MEM_HEALTH_OBSERVED
+		| ACPI_NFIT_MEM_ARMED;
 
 	offset += sizeof(*memdev);
 	/* dcr-descriptor0 */
-- 
cgit v1.2.3


From 99759869faf15471cfce251bc138848d8af7d162 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hp.com>
Date: Fri, 19 Jun 2015 17:14:15 -0600
Subject: acpi: Add acpi_map_pxm_to_online_node()

The kernel initializes CPU & memory's NUMA topology from ACPI
SRAT table.  Some other ACPI tables, such as NFIT and DMAR, also
contain proximity IDs for their device's NUMA topology.  This
information can be used to improve performance of these devices.

This patch introduces acpi_map_pxm_to_online_node(), which is
similar to acpi_map_pxm_to_node(), but always returns an online
node.  When the mapped node from a given proximity ID is offline,
it looks up the node distance table and returns the nearest
online node.

ACPI device drivers, which are called after the NUMA initialization
has completed in the kernel, can call this interface to obtain their
device NUMA topology from ACPI tables.  Such drivers do not have to
deal with offline nodes.  A node may be offline when a device
proximity ID is unique, SRAT memory entry does not exist, or NUMA is
disabled, ex. "numa=off" on x86.

This patch also moves the pxm range check from acpi_get_node() to
acpi_map_pxm_to_node().

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/numa.c  | 50 +++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/acpi.h |  5 +++++
 2 files changed, 52 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 1333cbdc3ea2..acaa3b4ea504 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -29,6 +29,8 @@
 #include <linux/errno.h>
 #include <linux/acpi.h>
 #include <linux/numa.h>
+#include <linux/nodemask.h>
+#include <linux/topology.h>
 
 #define PREFIX "ACPI: "
 
@@ -70,7 +72,12 @@ static void __acpi_map_pxm_to_node(int pxm, int node)
 
 int acpi_map_pxm_to_node(int pxm)
 {
-	int node = pxm_to_node_map[pxm];
+	int node;
+
+	if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
+		return NUMA_NO_NODE;
+
+	node = pxm_to_node_map[pxm];
 
 	if (node == NUMA_NO_NODE) {
 		if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
@@ -83,6 +90,45 @@ int acpi_map_pxm_to_node(int pxm)
 	return node;
 }
 
+/**
+ * acpi_map_pxm_to_online_node - Map proximity ID to online node
+ * @pxm: ACPI proximity ID
+ *
+ * This is similar to acpi_map_pxm_to_node(), but always returns an online
+ * node.  When the mapped node from a given proximity ID is offline, it
+ * looks up the node distance table and returns the nearest online node.
+ *
+ * ACPI device drivers, which are called after the NUMA initialization has
+ * completed in the kernel, can call this interface to obtain their device
+ * NUMA topology from ACPI tables.  Such drivers do not have to deal with
+ * offline nodes.  A node may be offline when a device proximity ID is
+ * unique, SRAT memory entry does not exist, or NUMA is disabled, ex.
+ * "numa=off" on x86.
+ */
+int acpi_map_pxm_to_online_node(int pxm)
+{
+	int node, n, dist, min_dist;
+
+	node = acpi_map_pxm_to_node(pxm);
+
+	if (node == NUMA_NO_NODE)
+		node = 0;
+
+	if (!node_online(node)) {
+		min_dist = INT_MAX;
+		for_each_online_node(n) {
+			dist = node_distance(node, n);
+			if (dist < min_dist) {
+				min_dist = dist;
+				node = n;
+			}
+		}
+	}
+
+	return node;
+}
+EXPORT_SYMBOL(acpi_map_pxm_to_online_node);
+
 static void __init
 acpi_table_print_srat_entry(struct acpi_subtable_header *header)
 {
@@ -328,8 +374,6 @@ int acpi_get_node(acpi_handle handle)
 	int pxm;
 
 	pxm = acpi_get_pxm(handle);
-	if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
-		return NUMA_NO_NODE;
 
 	return acpi_map_pxm_to_node(pxm);
 }
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e4da5e35e29c..1b3bbb11d11c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -289,8 +289,13 @@ extern void acpi_dmi_osi_linux(int enable, const struct dmi_system_id *d);
 extern void acpi_osi_setup(char *str);
 
 #ifdef CONFIG_ACPI_NUMA
+int acpi_map_pxm_to_online_node(int pxm);
 int acpi_get_node(acpi_handle handle);
 #else
+static inline int acpi_map_pxm_to_online_node(int pxm)
+{
+	return 0;
+}
 static inline int acpi_get_node(acpi_handle handle)
 {
 	return 0;
-- 
cgit v1.2.3


From 41d7a6d637e1440f5410cb43c25a3c41255540c5 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hp.com>
Date: Fri, 19 Jun 2015 12:18:33 -0600
Subject: libnvdimm: Set numa_node to NVDIMM devices

ACPI NFIT table has System Physical Address Range Structure entries that
describe a proximity ID of each range when ACPI_NFIT_PROXIMITY_VALID is
set in the flags.

Change acpi_nfit_register_region() to map a proximity ID to its node ID,
and set it to a new numa_node field of nd_region_desc, which is then
conveyed to the nd_region device.

The device core arranges for btt and namespace devices to inherit their
node from their parent region.

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
[djbw: move set_dev_node() from region.c to bus.c]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/kernel/pmem.c       | 1 +
 drivers/acpi/nfit.c          | 6 ++++++
 drivers/nvdimm/bus.c         | 6 ++++++
 drivers/nvdimm/nd.h          | 2 +-
 drivers/nvdimm/region_devs.c | 1 +
 include/linux/libnvdimm.h    | 1 +
 6 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 0f4ef472ab9e..64f90f53bb85 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -67,6 +67,7 @@ static __init int register_e820_pmem(void)
 		memset(&ndr_desc, 0, sizeof(ndr_desc));
 		ndr_desc.res = &res;
 		ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
+		ndr_desc.numa_node = NUMA_NO_NODE;
 		if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
 			goto err;
 	}
diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 1f6f1b1a54f4..d96c8fe974dd 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -1392,6 +1392,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 	ndr_desc->res = &res;
 	ndr_desc->provider_data = nfit_spa;
 	ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
+	if (spa->flags & ACPI_NFIT_PROXIMITY_VALID)
+		ndr_desc->numa_node = acpi_map_pxm_to_online_node(
+						spa->proximity_domain);
+	else
+		ndr_desc->numa_node = NUMA_NO_NODE;
+
 	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
 		struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
 		struct nd_mapping *nd_mapping;
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index ec59f1f26d95..205344643852 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -48,6 +48,12 @@ static int to_nd_device_type(struct device *dev)
 
 static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 {
+	/*
+	 * Ensure that region devices always have their numa node set as
+	 * early as possible.
+	 */
+	if (is_nd_pmem(dev) || is_nd_blk(dev))
+		set_dev_node(dev, to_nd_region(dev)->numa_node);
 	return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT,
 			to_nd_device_type(dev));
 }
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 48b09a210689..c41f53e74277 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -97,7 +97,7 @@ struct nd_region {
 	u16 ndr_mappings;
 	u64 ndr_size;
 	u64 ndr_start;
-	int id, num_lanes, ro;
+	int id, num_lanes, ro, numa_node;
 	void *provider_data;
 	struct nd_interleave_set *nd_set;
 	struct nd_percpu_lane __percpu *lane;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 482ee3e4e04a..a5233422f9dc 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -736,6 +736,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	nd_region->nd_set = ndr_desc->nd_set;
 	nd_region->num_lanes = ndr_desc->num_lanes;
 	nd_region->ro = ro;
+	nd_region->numa_node = ndr_desc->numa_node;
 	ida_init(&nd_region->ns_ida);
 	ida_init(&nd_region->btt_ida);
 	dev = &nd_region->dev;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index dc799a29ed1a..30b3deaafd51 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -89,6 +89,7 @@ struct nd_region_desc {
 	struct nd_interleave_set *nd_set;
 	void *provider_data;
 	int num_lanes;
+	int numa_node;
 };
 
 struct nvdimm_bus;
-- 
cgit v1.2.3


From 74ae66c3b14ffa94c8d2dea201cdf8e6203d13d5 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hp.com>
Date: Fri, 19 Jun 2015 12:18:34 -0600
Subject: libnvdimm: Add sysfs numa_node to NVDIMM devices

Add support of sysfs 'numa_node' to I/O-related NVDIMM devices
under /sys/bus/nd/devices, regionN, namespaceN.0, and bttN.x.

An example of numa_node values on a 2-socket system with a single
NVDIMM range on each socket is shown below.
  /sys/bus/nd/devices
  |-- btt0.0/numa_node:0
  |-- btt1.0/numa_node:1
  |-- btt1.1/numa_node:1
  |-- namespace0.0/numa_node:0
  |-- namespace1.0/numa_node:1
  |-- region0/numa_node:0
  |-- region1/numa_node:1

These numa_node files are then linked under the block class of
their device names.
  /sys/class/block/pmem0/device/numa_node:0
  /sys/class/block/pmem1s/device/numa_node:1

This enables numactl(8) to accept 'block:' and 'file:' paths of
pmem and btt devices as shown in the examples below.
  numactl --preferred block:pmem0 --show
  numactl --preferred file:/dev/pmem1s --show

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c             |  1 +
 drivers/nvdimm/btt_devs.c       |  1 +
 drivers/nvdimm/bus.c            | 30 ++++++++++++++++++++++++++++++
 drivers/nvdimm/namespace_devs.c |  1 +
 include/linux/libnvdimm.h       |  1 +
 5 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index d96c8fe974dd..2161fa178c8d 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -873,6 +873,7 @@ static const struct attribute_group *acpi_nfit_region_attribute_groups[] = {
 	&nd_region_attribute_group,
 	&nd_mapping_attribute_group,
 	&nd_device_attribute_group,
+	&nd_numa_attribute_group,
 	&acpi_nfit_region_attribute_group,
 	NULL,
 };
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 661aacedc140..6ac8c0fea3ec 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -293,6 +293,7 @@ static struct attribute_group nd_btt_attribute_group = {
 static const struct attribute_group *nd_btt_attribute_groups[] = {
 	&nd_btt_attribute_group,
 	&nd_device_attribute_group,
+	&nd_numa_attribute_group,
 	NULL,
 };
 
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 205344643852..8eb22c0ca7ce 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -280,6 +280,36 @@ struct attribute_group nd_device_attribute_group = {
 };
 EXPORT_SYMBOL_GPL(nd_device_attribute_group);
 
+static ssize_t numa_node_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", dev_to_node(dev));
+}
+static DEVICE_ATTR_RO(numa_node);
+
+static struct attribute *nd_numa_attributes[] = {
+	&dev_attr_numa_node.attr,
+	NULL,
+};
+
+static umode_t nd_numa_attr_visible(struct kobject *kobj, struct attribute *a,
+		int n)
+{
+	if (!IS_ENABLED(CONFIG_NUMA))
+		return 0;
+
+	return a->mode;
+}
+
+/**
+ * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus
+ */
+struct attribute_group nd_numa_attribute_group = {
+	.attrs = nd_numa_attributes,
+	.is_visible = nd_numa_attr_visible,
+};
+EXPORT_SYMBOL_GPL(nd_numa_attribute_group);
+
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus)
 {
 	dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id);
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 27d69bd3b4d6..fef0dd80d4ad 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1228,6 +1228,7 @@ static struct attribute_group nd_namespace_attribute_group = {
 static const struct attribute_group *nd_namespace_attribute_groups[] = {
 	&nd_device_attribute_group,
 	&nd_namespace_attribute_group,
+	&nd_numa_attribute_group,
 	NULL,
 };
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 30b3deaafd51..75e3af01ee32 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -38,6 +38,7 @@ enum {
 extern struct attribute_group nvdimm_bus_attribute_group;
 extern struct attribute_group nvdimm_attribute_group;
 extern struct attribute_group nd_device_attribute_group;
+extern struct attribute_group nd_numa_attribute_group;
 extern struct attribute_group nd_region_attribute_group;
 extern struct attribute_group nd_mapping_attribute_group;
 
-- 
cgit v1.2.3


From 61031952f4c89dba1065f7a5b9419badb112554c Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Thu, 25 Jun 2015 03:08:39 -0400
Subject: arch, x86: pmem api for ensuring durability of persistent memory
 updates

Based on an original patch by Ross Zwisler [1].

Writes to persistent memory have the potential to be posted to cpu
cache, cpu write buffers, and platform write buffers (memory controller)
before being committed to persistent media.  Provide apis,
memcpy_to_pmem(), wmb_pmem(), and memremap_pmem(), to write data to
pmem and assert that it is durable in PMEM (a persistent linear address
range).  A '__pmem' attribute is added so sparse can track proper usage
of pointers to pmem.

This continues the status quo of pmem being x86 only for 4.2, but
reworks to ioremap, and wider implementation of memremap() will enable
other archs in 4.3.

[1]: https://lists.01.org/pipermail/linux-nvdimm/2015-May/000932.html

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
[djbw: various reworks]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/x86/Kconfig                  |   1 +
 arch/x86/include/asm/cacheflush.h |  72 ++++++++++++++++++
 arch/x86/include/asm/io.h         |   6 ++
 drivers/nvdimm/pmem.c             |  33 ++++----
 include/linux/compiler.h          |   2 +
 include/linux/pmem.h              | 153 ++++++++++++++++++++++++++++++++++++++
 lib/Kconfig                       |   3 +
 7 files changed, 257 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/pmem.h

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1a2cbf641667..62564ddf7f78 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86
 	select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_PMEM_API
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select HAVE_AOUT if X86_32
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 47c8e32f621a..ec23bb753a3e 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,6 +4,7 @@
 /* Caches aren't brain-dead on the intel. */
 #include <asm-generic/cacheflush.h>
 #include <asm/special_insns.h>
+#include <asm/uaccess.h>
 
 /*
  * The set_memory_* API can be used to change various attributes of a virtual
@@ -104,4 +105,75 @@ static inline int rodata_test(void)
 }
 #endif
 
+#ifdef ARCH_HAS_NOCACHE_UACCESS
+
+/**
+ * arch_memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Copy data to persistent memory media via non-temporal stores so that
+ * a subsequent arch_wmb_pmem() can flush cpu and memory controller
+ * write buffers to guarantee durability.
+ */
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+		size_t n)
+{
+	int unwritten;
+
+	/*
+	 * We are copying between two kernel buffers, if
+	 * __copy_from_user_inatomic_nocache() returns an error (page
+	 * fault) we would have already reported a general protection fault
+	 * before the WARN+BUG.
+	 */
+	unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
+			(void __user *) src, n);
+	if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+				__func__, dst, src, unwritten))
+		BUG();
+}
+
+/**
+ * arch_wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of arch_memcpy_to_pmem() operations this drains data
+ * from cpu write buffers and any platform (memory controller) buffers
+ * to ensure that written data is durable on persistent memory media.
+ */
+static inline void arch_wmb_pmem(void)
+{
+	/*
+	 * wmb() to 'sfence' all previous writes such that they are
+	 * architecturally visible to 'pcommit'.  Note, that we've
+	 * already arranged for pmem writes to avoid the cache via
+	 * arch_memcpy_to_pmem().
+	 */
+	wmb();
+	pcommit_sfence();
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * We require that wmb() be an 'sfence', that is only guaranteed on
+	 * 64-bit builds
+	 */
+	return static_cpu_has(X86_FEATURE_PCOMMIT);
+#else
+	return false;
+#endif
+}
+#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
+extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
+extern void arch_wmb_pmem(void);
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+	return false;
+}
+#endif
+
 #endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 34a5b93704d3..c60c3f3b0183 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -247,6 +247,12 @@ static inline void flush_write_buffers(void)
 #endif
 }
 
+static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
+	unsigned long size)
+{
+	return (void __force __pmem *) ioremap_cache(offset, size);
+}
+
 #endif /* __KERNEL__ */
 
 extern void native_io_delay(void);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 42b766f33e59..ade9eb917a4d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/slab.h>
+#include <linux/pmem.h>
 #include <linux/nd.h>
 #include "nd.h"
 
@@ -32,7 +33,7 @@ struct pmem_device {
 
 	/* One contiguous memory region per device */
 	phys_addr_t		phys_addr;
-	void			*virt_addr;
+	void __pmem		*virt_addr;
 	size_t			size;
 };
 
@@ -44,13 +45,14 @@ static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 {
 	void *mem = kmap_atomic(page);
 	size_t pmem_off = sector << 9;
+	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
 
 	if (rw == READ) {
-		memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+		memcpy_from_pmem(mem + off, pmem_addr, len);
 		flush_dcache_page(page);
 	} else {
 		flush_dcache_page(page);
-		memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+		memcpy_to_pmem(pmem_addr, mem + off, len);
 	}
 
 	kunmap_atomic(mem);
@@ -71,6 +73,10 @@ static void pmem_make_request(struct request_queue *q, struct bio *bio)
 				bio_data_dir(bio), iter.bi_sector);
 	if (do_acct)
 		nd_iostat_end(bio, start);
+
+	if (bio_data_dir(bio))
+		wmb_pmem();
+
 	bio_endio(bio, 0);
 }
 
@@ -94,7 +100,8 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector,
 	if (!pmem)
 		return -ENODEV;
 
-	*kaddr = pmem->virt_addr + offset;
+	/* FIXME convert DAX to comprehend that this mapping has a lifetime */
+	*kaddr = (void __force *) pmem->virt_addr + offset;
 	*pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
 
 	return pmem->size - offset;
@@ -118,6 +125,8 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
+	if (!arch_has_pmem_api())
+		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
 	if (!request_mem_region(pmem->phys_addr, pmem->size, dev_name(dev))) {
 		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
@@ -126,11 +135,7 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 		return ERR_PTR(-EBUSY);
 	}
 
-	/*
-	 * Map the memory as non-cachable, as we can't write back the contents
-	 * of the CPU caches in case of a crash.
-	 */
-	pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+	pmem->virt_addr = memremap_pmem(pmem->phys_addr, pmem->size);
 	if (!pmem->virt_addr) {
 		release_mem_region(pmem->phys_addr, pmem->size);
 		kfree(pmem);
@@ -195,16 +200,18 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
 	}
 
 	if (rw == READ)
-		memcpy(buf, pmem->virt_addr + offset, size);
-	else
-		memcpy(pmem->virt_addr + offset, buf, size);
+		memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
+	else {
+		memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
+		wmb_pmem();
+	}
 
 	return 0;
 }
 
 static void pmem_free(struct pmem_device *pmem)
 {
-	iounmap(pmem->virt_addr);
+	memunmap_pmem(pmem->virt_addr);
 	release_mem_region(pmem->phys_addr, pmem->size);
 	kfree(pmem);
 }
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 867722591be2..9a528d945498 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -21,6 +21,7 @@
 # define __rcu		__attribute__((noderef, address_space(4)))
 #else
 # define __rcu
+# define __pmem		__attribute__((noderef, address_space(5)))
 #endif
 extern void __chk_user_ptr(const volatile void __user *);
 extern void __chk_io_ptr(const volatile void __iomem *);
@@ -42,6 +43,7 @@ extern void __chk_io_ptr(const volatile void __iomem *);
 # define __cond_lock(x,c) (c)
 # define __percpu
 # define __rcu
+# define __pmem
 #endif
 
 /* Indirect macros required for expanded argument pasting, eg. __LINE__. */
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
new file mode 100644
index 000000000000..f6481a0b1d4f
--- /dev/null
+++ b/include/linux/pmem.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __PMEM_H__
+#define __PMEM_H__
+
+#include <linux/io.h>
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+#include <asm/cacheflush.h>
+#else
+static inline void arch_wmb_pmem(void)
+{
+	BUG();
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+	return false;
+}
+
+static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
+		unsigned long size)
+{
+	return NULL;
+}
+
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+		size_t n)
+{
+	BUG();
+}
+#endif
+
+/*
+ * Architectures that define ARCH_HAS_PMEM_API must provide
+ * implementations for arch_memremap_pmem(), arch_memcpy_to_pmem(),
+ * arch_wmb_pmem(), and __arch_has_wmb_pmem().
+ */
+
+static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
+{
+	memcpy(dst, (void __force const *) src, size);
+}
+
+static inline void memunmap_pmem(void __pmem *addr)
+{
+	iounmap((void __force __iomem *) addr);
+}
+
+/**
+ * arch_has_wmb_pmem - true if wmb_pmem() ensures durability
+ *
+ * For a given cpu implementation within an architecture it is possible
+ * that wmb_pmem() resolves to a nop.  In the case this returns
+ * false, pmem api users are unable to ensure durability and may want to
+ * fall back to a different data consistency model, or otherwise notify
+ * the user.
+ */
+static inline bool arch_has_wmb_pmem(void)
+{
+	if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API))
+		return __arch_has_wmb_pmem();
+	return false;
+}
+
+static inline bool arch_has_pmem_api(void)
+{
+	return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem();
+}
+
+/*
+ * These defaults seek to offer decent performance and minimize the
+ * window between i/o completion and writes being durable on media.
+ * However, it is undefined / architecture specific whether
+ * default_memremap_pmem + default_memcpy_to_pmem is sufficient for
+ * making data durable relative to i/o completion.
+ */
+static void default_memcpy_to_pmem(void __pmem *dst, const void *src,
+		size_t size)
+{
+	memcpy((void __force *) dst, src, size);
+}
+
+static void __pmem *default_memremap_pmem(resource_size_t offset,
+		unsigned long size)
+{
+	/* TODO: convert to ioremap_wt() */
+	return (void __pmem __force *)ioremap_nocache(offset, size);
+}
+
+/**
+ * memremap_pmem - map physical persistent memory for pmem api
+ * @offset: physical address of persistent memory
+ * @size: size of the mapping
+ *
+ * Establish a mapping of the architecture specific memory type expected
+ * by memcpy_to_pmem() and wmb_pmem().  For example, it may be
+ * the case that an uncacheable or writethrough mapping is sufficient,
+ * or a writeback mapping provided memcpy_to_pmem() and
+ * wmb_pmem() arrange for the data to be written through the
+ * cache to persistent media.
+ */
+static inline void __pmem *memremap_pmem(resource_size_t offset,
+		unsigned long size)
+{
+	if (arch_has_pmem_api())
+		return arch_memremap_pmem(offset, size);
+	return default_memremap_pmem(offset, size);
+}
+
+/**
+ * memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Perform a memory copy that results in the destination of the copy
+ * being effectively evicted from, or never written to, the processor
+ * cache hierarchy after the copy completes.  After memcpy_to_pmem()
+ * data may still reside in cpu or platform buffers, so this operation
+ * must be followed by a wmb_pmem().
+ */
+static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n)
+{
+	if (arch_has_pmem_api())
+		arch_memcpy_to_pmem(dst, src, n);
+	else
+		default_memcpy_to_pmem(dst, src, n);
+}
+
+/**
+ * wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of memcpy_to_pmem() operations this drains data from
+ * cpu write buffers and any platform (memory controller) buffers to
+ * ensure that written data is durable on persistent memory media.
+ */
+static inline void wmb_pmem(void)
+{
+	if (arch_has_pmem_api())
+		arch_wmb_pmem();
+}
+#endif /* __PMEM_H__ */
diff --git a/lib/Kconfig b/lib/Kconfig
index 601965a948e8..d27c13a91c28 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -522,4 +522,7 @@ source "lib/fonts/Kconfig"
 config ARCH_HAS_SG_CHAIN
 	def_bool n
 
+config ARCH_HAS_PMEM_API
+	bool
+
 endmenu
-- 
cgit v1.2.3


From 304adf8a8fff972f633bf52b3d160682d3f3d5d2 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Thu, 4 Jun 2015 12:13:26 +0800
Subject: genirq: Introduce helper irq_desc_get_irq()

Introduce helper irq_desc_get_irq() to retrieve the irq number from
the irq descriptor.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1433391238-19471-17-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdesc.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 66c41b40ca50..a72bfd94ea10 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -100,6 +100,11 @@ static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
 #endif
 }
 
+static inline unsigned int irq_desc_get_irq(struct irq_desc *desc)
+{
+	return desc->irq_data.irq;
+}
+
 static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc)
 {
 	return &desc->irq_data;
-- 
cgit v1.2.3


From bbc9d21fc0071c245c19077155ea371092ff0db8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 23 Jun 2015 15:01:30 +0200
Subject: genirq: Implement
 irq_set_handler_locked()/irq_set_chip_handler_name_locked()

The main use case for the exisiting __irq_set_*_locked() inlines is to
replace the handler [,chip and name] of an interrupt from a region
which has the irq descriptor lock held, e.g. from the irq_set_type()
callback. The first argument is the irq number, so the functions need
so perform a pointless lookup of the interrupt descriptor for those
cases which have the irq_data pointer handy.

Provide new functions which take an irq_data pointer instead of the
interrupt number, so the lookup of the interrupt descriptor can be
avoided.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jiang Liu <jiang.liu@linux.intel.com>

Conflicts:
	include/linux/irqdesc.h
---
 include/linux/irqdesc.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index a72bfd94ea10..624a668e61f1 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -190,6 +190,47 @@ __irq_set_chip_handler_name_locked(unsigned int irq, struct irq_chip *chip,
 	desc->name = name;
 }
 
+/**
+ * irq_set_handler_locked - Set irq handler from a locked region
+ * @data:	Pointer to the irq_data structure which identifies the irq
+ * @handler:	Flow control handler function for this interrupt
+ *
+ * Sets the handler in the irq descriptor associated to @data.
+ *
+ * Must be called with irq_desc locked and valid parameters. Typical
+ * call site is the irq_set_type() callback.
+ */
+static inline void irq_set_handler_locked(struct irq_data *data,
+					  irq_flow_handler_t handler)
+{
+	struct irq_desc *desc = irq_data_to_desc(data);
+
+	desc->handle_irq = handler;
+}
+
+/**
+ * irq_set_chip_handler_name_locked - Set chip, handler and name from a locked region
+ * @data:	Pointer to the irq_data structure for which the chip is set
+ * @chip:	Pointer to the new irq chip
+ * @handler:	Flow control handler function for this interrupt
+ * @name:	Name of the interrupt
+ *
+ * Replace the irq chip at the proper hierarchy level in @data and
+ * sets the handler and name in the associated irq descriptor.
+ *
+ * Must be called with irq_desc locked and valid parameters.
+ */
+static inline void
+irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip,
+				 irq_flow_handler_t handler, const char *name)
+{
+	struct irq_desc *desc = irq_data_to_desc(data);
+
+	desc->handle_irq = handler;
+	desc->name = name;
+	data->chip = chip;
+}
+
 static inline int irq_balancing_disabled(unsigned int irq)
 {
 	struct irq_desc *desc;
-- 
cgit v1.2.3


From 6c5a0d891543873aefc3aaf846c1e7afe0982ff9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 27 Jun 2015 11:45:46 -0400
Subject: NFSv4.2: LAYOUTSTATS is optional to implement

Make it so, by checking the return value for NFS4ERR_MOTSUPP and
caching the information as a server capability.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42proc.c        | 10 ++++++++--
 fs/nfs/nfs4proc.c         |  3 ++-
 fs/nfs/pnfs.c             |  3 +++
 include/linux/nfs_fs_sb.h |  1 +
 4 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 06c74cd93875..f486b80f927a 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -189,9 +189,15 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return;
 
-	/* well, we don't care about errors at all! */
-	if (task->tk_status)
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+	default:
 		dprintk("%s server returns %d\n", __func__, task->tk_status);
+	}
 }
 
 static void
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 643ce3a91b22..8eab42407d39 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8635,7 +8635,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_ATOMIC_OPEN_V1
 		| NFS_CAP_ALLOCATE
 		| NFS_CAP_DEALLOCATE
-		| NFS_CAP_SEEK,
+		| NFS_CAP_SEEK
+		| NFS_CAP_LAYOUTSTATS,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 40bacebb5b97..0ba9a02c9566 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2266,6 +2266,9 @@ pnfs_report_layoutstat(struct inode *inode)
 	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
 		goto out;
 
+	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
+		goto out;
+
 	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
 		goto out;
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 5e1273d4de14..a2ea1491d3df 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -237,5 +237,6 @@ struct nfs_server {
 #define NFS_CAP_SEEK		(1U << 19)
 #define NFS_CAP_ALLOCATE	(1U << 20)
 #define NFS_CAP_DEALLOCATE	(1U << 21)
+#define NFS_CAP_LAYOUTSTATS	(1U << 22)
 
 #endif
-- 
cgit v1.2.3


From cf2fde7b39e9446e2af015215d7fb695781af0c1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 26 Jun 2015 06:44:38 +0930
Subject: param: fix module param locks when !CONFIG_SYSFS.

As Dan Streetman points out, the entire point of locking for is to
stop sysfs accesses, so they're elided entirely in the !SYSFS case.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h |  2 ++
 kernel/module.c        |  9 ++++++++-
 kernel/params.c        | 18 ++++++++++++++----
 3 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 6ba0e87fa804..46efa1c9de60 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -240,7 +240,9 @@ struct module {
 	unsigned int num_syms;
 
 	/* Kernel parameters. */
+#ifdef CONFIG_SYSFS
 	struct mutex param_lock;
+#endif
 	struct kernel_param *kp;
 	unsigned int num_kp;
 
diff --git a/kernel/module.c b/kernel/module.c
index 8ec33ce202a6..b4994adf7187 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1820,6 +1820,10 @@ static void mod_sysfs_fini(struct module *mod)
 	mod_kobject_put(mod);
 }
 
+static void init_param_lock(struct module *mod)
+{
+	mutex_init(&mod->param_lock);
+}
 #else /* !CONFIG_SYSFS */
 
 static int mod_sysfs_setup(struct module *mod,
@@ -1842,6 +1846,9 @@ static void del_usage_links(struct module *mod)
 {
 }
 
+static void init_param_lock(struct module *mod)
+{
+}
 #endif /* CONFIG_SYSFS */
 
 static void mod_sysfs_teardown(struct module *mod)
@@ -3442,7 +3449,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	if (err)
 		goto unlink_mod;
 
-	mutex_init(&mod->param_lock);
+	init_param_lock(mod);
 
 	/* Now we've got everything in the final locations, we can
 	 * find optional sections. */
diff --git a/kernel/params.c b/kernel/params.c
index 8890d0b8dffc..faa461c16f12 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,12 +25,22 @@
 #include <linux/slab.h>
 #include <linux/ctype.h>
 
+#ifdef CONFIG_SYSFS
 /* Protects all built-in parameters, modules use their own param_lock */
 static DEFINE_MUTEX(param_lock);
 
 /* Use the module's mutex, or if built-in use the built-in mutex */
 #define KPARAM_MUTEX(mod)	((mod) ? &(mod)->param_lock : &param_lock)
-#define KPARAM_IS_LOCKED(mod)	mutex_is_locked(KPARAM_MUTEX(mod))
+
+static inline void check_kparam_locked(struct module *mod)
+{
+	BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod)));
+}
+#else
+static inline void check_kparam_locked(struct module *mod)
+{
+}
+#endif /* !CONFIG_SYSFS */
 
 /* This just allows us to keep track of which parameters are kmalloced. */
 struct kmalloced_param {
@@ -459,7 +469,7 @@ static int param_array(struct module *mod,
 		/* nul-terminate and parse */
 		save = val[len];
 		((char *)val)[len] = '\0';
-		BUG_ON(!KPARAM_IS_LOCKED(mod));
+		check_kparam_locked(mod);
 		ret = set(val, &kp);
 
 		if (ret != 0)
@@ -496,7 +506,7 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
 		if (i)
 			buffer[off++] = ',';
 		p.arg = arr->elem + arr->elemsize * i;
-		BUG_ON(!KPARAM_IS_LOCKED(p.mod));
+		check_kparam_locked(p.mod);
 		ret = arr->ops->get(buffer + off, &p);
 		if (ret < 0)
 			return ret;
@@ -616,6 +626,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
 #define __modinit __init
 #endif
 
+#ifdef CONFIG_SYSFS
 void kernel_param_lock(struct module *mod)
 {
 	mutex_lock(KPARAM_MUTEX(mod));
@@ -626,7 +637,6 @@ void kernel_param_unlock(struct module *mod)
 	mutex_unlock(KPARAM_MUTEX(mod));
 }
 
-#ifdef CONFIG_SYSFS
 EXPORT_SYMBOL(kernel_param_lock);
 EXPORT_SYMBOL(kernel_param_unlock);
 
-- 
cgit v1.2.3


From ef90174f821041313d42d99c1c8b35a3af64a910 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Theou <jtheou@adeneo-embedded.us>
Date: Tue, 9 Jun 2015 09:55:02 -0700
Subject: watchdog: watchdog_core: Add watchdog registration deferral mechanism

Currently, watchdog subsystem require the misc subsystem to
register a watchdog. This may not be the case in case of an
early registration of a watchdog, which can be required when
the watchdog cannot be disabled.

This patch introduces a deferral mechanism to remove this requirement.

Signed-off-by: Jean-Baptiste Theou <jtheou@adeneo-embedded.us>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 Documentation/watchdog/watchdog-kernel-api.txt |   7 ++
 drivers/watchdog/watchdog_core.c               | 118 +++++++++++++++++++++----
 include/linux/watchdog.h                       |   3 +
 3 files changed, 110 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/watchdog/watchdog-kernel-api.txt b/Documentation/watchdog/watchdog-kernel-api.txt
index a0438f3957ca..d8b0d3367706 100644
--- a/Documentation/watchdog/watchdog-kernel-api.txt
+++ b/Documentation/watchdog/watchdog-kernel-api.txt
@@ -36,6 +36,10 @@ The watchdog_unregister_device routine deregisters a registered watchdog timer
 device. The parameter of this routine is the pointer to the registered
 watchdog_device structure.
 
+The watchdog subsystem includes an registration deferral mechanism,
+which allows you to register an watchdog as early as you wish during
+the boot process.
+
 The watchdog device structure looks like this:
 
 struct watchdog_device {
@@ -52,6 +56,7 @@ struct watchdog_device {
 	void *driver_data;
 	struct mutex lock;
 	unsigned long status;
+	struct list_head deferred;
 };
 
 It contains following fields:
@@ -80,6 +85,8 @@ It contains following fields:
   information about the status of the device (Like: is the watchdog timer
   running/active, is the nowayout bit set, is the device opened via
   the /dev/watchdog interface or not, ...).
+* deferred: entry in wtd_deferred_reg_list which is used to
+  register early initialized watchdogs.
 
 The list of watchdog operations is defined as:
 
diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
index cec9b559647d..1a8059455413 100644
--- a/drivers/watchdog/watchdog_core.c
+++ b/drivers/watchdog/watchdog_core.c
@@ -43,6 +43,45 @@
 static DEFINE_IDA(watchdog_ida);
 static struct class *watchdog_class;
 
+/*
+ * Deferred Registration infrastructure.
+ *
+ * Sometimes watchdog drivers needs to be loaded as soon as possible,
+ * for example when it's impossible to disable it. To do so,
+ * raising the initcall level of the watchdog driver is a solution.
+ * But in such case, the miscdev is maybe not ready (subsys_initcall), and
+ * watchdog_core need miscdev to register the watchdog as a char device.
+ *
+ * The deferred registration infrastructure offer a way for the watchdog
+ * subsystem to register a watchdog properly, even before miscdev is ready.
+ */
+
+static DEFINE_MUTEX(wtd_deferred_reg_mutex);
+static LIST_HEAD(wtd_deferred_reg_list);
+static bool wtd_deferred_reg_done;
+
+static int watchdog_deferred_registration_add(struct watchdog_device *wdd)
+{
+	list_add_tail(&wdd->deferred,
+		      &wtd_deferred_reg_list);
+	return 0;
+}
+
+static void watchdog_deferred_registration_del(struct watchdog_device *wdd)
+{
+	struct list_head *p, *n;
+	struct watchdog_device *wdd_tmp;
+
+	list_for_each_safe(p, n, &wtd_deferred_reg_list) {
+		wdd_tmp = list_entry(p, struct watchdog_device,
+				     deferred);
+		if (wdd_tmp == wdd) {
+			list_del(&wdd_tmp->deferred);
+			break;
+		}
+	}
+}
+
 static void watchdog_check_min_max_timeout(struct watchdog_device *wdd)
 {
 	/*
@@ -98,17 +137,7 @@ int watchdog_init_timeout(struct watchdog_device *wdd,
 }
 EXPORT_SYMBOL_GPL(watchdog_init_timeout);
 
-/**
- * watchdog_register_device() - register a watchdog device
- * @wdd: watchdog device
- *
- * Register a watchdog device with the kernel so that the
- * watchdog timer can be accessed from userspace.
- *
- * A zero is returned on success and a negative errno code for
- * failure.
- */
-int watchdog_register_device(struct watchdog_device *wdd)
+static int __watchdog_register_device(struct watchdog_device *wdd)
 {
 	int ret, id, devno;
 
@@ -164,16 +193,33 @@ int watchdog_register_device(struct watchdog_device *wdd)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(watchdog_register_device);
 
 /**
- * watchdog_unregister_device() - unregister a watchdog device
- * @wdd: watchdog device to unregister
+ * watchdog_register_device() - register a watchdog device
+ * @wdd: watchdog device
  *
- * Unregister a watchdog device that was previously successfully
- * registered with watchdog_register_device().
+ * Register a watchdog device with the kernel so that the
+ * watchdog timer can be accessed from userspace.
+ *
+ * A zero is returned on success and a negative errno code for
+ * failure.
  */
-void watchdog_unregister_device(struct watchdog_device *wdd)
+
+int watchdog_register_device(struct watchdog_device *wdd)
+{
+	int ret;
+
+	mutex_lock(&wtd_deferred_reg_mutex);
+	if (wtd_deferred_reg_done)
+		ret = __watchdog_register_device(wdd);
+	else
+		ret = watchdog_deferred_registration_add(wdd);
+	mutex_unlock(&wtd_deferred_reg_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(watchdog_register_device);
+
+static void __watchdog_unregister_device(struct watchdog_device *wdd)
 {
 	int ret;
 	int devno;
@@ -189,8 +235,43 @@ void watchdog_unregister_device(struct watchdog_device *wdd)
 	ida_simple_remove(&watchdog_ida, wdd->id);
 	wdd->dev = NULL;
 }
+
+/**
+ * watchdog_unregister_device() - unregister a watchdog device
+ * @wdd: watchdog device to unregister
+ *
+ * Unregister a watchdog device that was previously successfully
+ * registered with watchdog_register_device().
+ */
+
+void watchdog_unregister_device(struct watchdog_device *wdd)
+{
+	mutex_lock(&wtd_deferred_reg_mutex);
+	if (wtd_deferred_reg_done)
+		__watchdog_unregister_device(wdd);
+	else
+		watchdog_deferred_registration_del(wdd);
+	mutex_unlock(&wtd_deferred_reg_mutex);
+}
+
 EXPORT_SYMBOL_GPL(watchdog_unregister_device);
 
+static int __init watchdog_deferred_registration(void)
+{
+	mutex_lock(&wtd_deferred_reg_mutex);
+	wtd_deferred_reg_done = true;
+	while (!list_empty(&wtd_deferred_reg_list)) {
+		struct watchdog_device *wdd;
+
+		wdd = list_first_entry(&wtd_deferred_reg_list,
+				       struct watchdog_device, deferred);
+		list_del(&wdd->deferred);
+		__watchdog_register_device(wdd);
+	}
+	mutex_unlock(&wtd_deferred_reg_mutex);
+	return 0;
+}
+
 static int __init watchdog_init(void)
 {
 	int err;
@@ -207,6 +288,7 @@ static int __init watchdog_init(void)
 		return err;
 	}
 
+	watchdog_deferred_registration();
 	return 0;
 }
 
@@ -217,7 +299,7 @@ static void __exit watchdog_exit(void)
 	ida_destroy(&watchdog_ida);
 }
 
-subsys_initcall(watchdog_init);
+subsys_initcall_sync(watchdog_init);
 module_exit(watchdog_exit);
 
 MODULE_AUTHOR("Alan Cox <alan@lxorguk.ukuu.org.uk>");
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index a746bf5216f8..f47feada5b42 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -65,6 +65,8 @@ struct watchdog_ops {
  * @driver-data:Pointer to the drivers private data.
  * @lock:	Lock for watchdog core internal use only.
  * @status:	Field that contains the devices internal status bits.
+ * @deferred: entry in wtd_deferred_reg_list which is used to
+ *			   register early initialized watchdogs.
  *
  * The watchdog_device structure contains all information about a
  * watchdog timer device.
@@ -95,6 +97,7 @@ struct watchdog_device {
 #define WDOG_ALLOW_RELEASE	2	/* Did we receive the magic char ? */
 #define WDOG_NO_WAY_OUT		3	/* Is 'nowayout' feature set ? */
 #define WDOG_UNREGISTERED	4	/* Has the device been unregistered */
+	struct list_head deferred;
 };
 
 #define WATCHDOG_NOWAYOUT		IS_BUILTIN(CONFIG_WATCHDOG_NOWAYOUT)
-- 
cgit v1.2.3


From a9730fca9946f3697410479e0ef1bd759ba00a77 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Mon, 29 Jun 2015 09:28:08 -0500
Subject: Fix kmalloc slab creation sequence

This patch restores the slab creation sequence that was broken by commit
4066c33d0308f8 and also reverts the portions that introduced the
KMALLOC_LOOP_XXX macros. Those can never really work since the slab creation
is much more complex than just going from a minimum to a maximum number.

The latest upstream kernel boots cleanly on my machine with a 64 bit x86
configuration under KVM using either SLAB or SLUB.

Fixes: 4066c33d0308f8 ("support the slub_debug boot option")
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 22 ----------------------
 mm/slab_common.c     | 32 ++++++++++++++++----------------
 2 files changed, 16 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9de2fdc8b5e4..a99f0e5243e1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -153,30 +153,8 @@ size_t ksize(const void *);
 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
 #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
-/*
- * The KMALLOC_LOOP_LOW is the definition for the for loop index start number
- * to create the kmalloc_caches object in create_kmalloc_caches(). The first
- * and the second are 96 and 192. You can see that in the kmalloc_index(), if
- * the KMALLOC_MIN_SIZE <= 32, then return 1 (96). If KMALLOC_MIN_SIZE <= 64,
- * then return 2 (192). If the KMALLOC_MIN_SIZE is bigger than 64, we don't
- * need to initialize 96 and 192. Go directly to start the KMALLOC_SHIFT_LOW.
- */
-#if KMALLOC_MIN_SIZE <= 32
-#define KMALLOC_LOOP_LOW 1
-#elif KMALLOC_MIN_SIZE <= 64
-#define KMALLOC_LOOP_LOW 2
-#else
-#define KMALLOC_LOOP_LOW KMALLOC_SHIFT_LOW
-#endif
-
 #else
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-/*
- * The KMALLOC_MIN_SIZE of slub/slab/slob is 2^3/2^5/2^3. So, even slab is used.
- * The KMALLOC_MIN_SIZE <= 32. The kmalloc-96 and kmalloc-192 should also be
- * initialized.
- */
-#define KMALLOC_LOOP_LOW 1
 #endif
 
 /*
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9f8d71f78404..983b78694c46 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -855,6 +855,12 @@ void __init setup_kmalloc_cache_index_table(void)
 	}
 }
 
+static void new_kmalloc_cache(int idx, unsigned long flags)
+{
+	kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
+					kmalloc_info[idx].size, flags);
+}
+
 /*
  * Create the kmalloc array. Some of the regular kmalloc arrays
  * may already have been created because they were needed to
@@ -864,25 +870,19 @@ void __init create_kmalloc_caches(unsigned long flags)
 {
 	int i;
 
-	for (i = KMALLOC_LOOP_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
-		if (!kmalloc_caches[i]) {
-			kmalloc_caches[i] = create_kmalloc_cache(
-						kmalloc_info[i].name,
-						kmalloc_info[i].size,
-						flags);
-		}
+	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+		if (!kmalloc_caches[i])
+			new_kmalloc_cache(i, flags);
 
 		/*
-		 * "i == 2" is the "kmalloc-192" case which is the last special
-		 * case for initialization and it's the point to jump to
-		 * allocate the minimize size of the object. In slab allocator,
-		 * the KMALLOC_SHIFT_LOW = 5. So, it needs to skip 2^3 and 2^4
-		 * and go straight to allocate 2^5. If the ARCH_DMA_MINALIGN is
-		 * defined, it may be larger than 2^5 and here is also the
-		 * trick to skip the empty gap.
+		 * Caches that are not of the two-to-the-power-of size.
+		 * These have to be created immediately after the
+		 * earlier power of two caches
 		 */
-		if (i == 2)
-			i = (KMALLOC_SHIFT_LOW - 1);
+		if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
+			new_kmalloc_cache(1, flags);
+		if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
+			new_kmalloc_cache(2, flags);
 	}
 
 	/* Kmalloc array is now usable */
-- 
cgit v1.2.3


From 31f02455455d405320e2f749696bef4e02903b35 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 30 Jun 2015 12:07:17 -0400
Subject: sparse: fix misplaced __pmem definition

Move the definition of __pmem outside of CONFIG_SPARSE_RCU_POINTER to fix:

drivers/nvdimm/pmem.c:198:17: sparse: too many arguments for function __builtin_expect
drivers/nvdimm/pmem.c:36:33: sparse: expected ; at end of declaration
drivers/nvdimm/pmem.c:48:21: sparse: void declaration

...due to __pmem failing to be defined in some configurations when
CONFIG_SPARSE_RCU_POINTER=y.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 26fc8bc77f85..d8fbd500330e 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -17,11 +17,11 @@
 # define __release(x)	__context__(x,-1)
 # define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
 # define __percpu	__attribute__((noderef, address_space(3)))
+# define __pmem		__attribute__((noderef, address_space(5)))
 #ifdef CONFIG_SPARSE_RCU_POINTER
 # define __rcu		__attribute__((noderef, address_space(4)))
 #else
 # define __rcu
-# define __pmem		__attribute__((noderef, address_space(5)))
 #endif
 extern void __chk_user_ptr(const volatile void __user *);
 extern void __chk_io_ptr(const volatile void __iomem *);
-- 
cgit v1.2.3


From 8e7a7f8619f1f93736d9bb7e31caf4721bdc739d Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Tue, 30 Jun 2015 14:56:41 -0700
Subject: memblock: introduce a for_each_reserved_mem_region iterator

Struct page initialisation had been identified as one of the reasons why
large machines take a long time to boot. Patches were posted a long time ago
to defer initialisation until they were first used.  This was rejected on
the grounds it should not be necessary to hurt the fast paths. This series
reuses much of the work from that time but defers the initialisation of
memory to kswapd so that one thread per node initialises memory local to
that node.

After applying the series and setting the appropriate Kconfig variable I
see this in the boot log on a 64G machine

[    7.383764] kswapd 0 initialised deferred memory in 188ms
[    7.404253] kswapd 1 initialised deferred memory in 208ms
[    7.411044] kswapd 3 initialised deferred memory in 216ms
[    7.411551] kswapd 2 initialised deferred memory in 216ms

On a 1TB machine, I see

[    8.406511] kswapd 3 initialised deferred memory in 1116ms
[    8.428518] kswapd 1 initialised deferred memory in 1140ms
[    8.435977] kswapd 0 initialised deferred memory in 1148ms
[    8.437416] kswapd 2 initialised deferred memory in 1148ms

Once booted the machine appears to work as normal. Boot times were measured
from the time shutdown was called until ssh was available again.  In the
64G case, the boot time savings are negligible. On the 1TB machine, the
savings were 16 seconds.

Nate Zimmer said:

: On an older 8 TB box with lots and lots of cpus the boot time, as
: measure from grub to login prompt, the boot time improved from 1484
: seconds to exactly 1000 seconds.

Waiman Long said:

: I ran a bootup timing test on a 12-TB 16-socket IvyBridge-EX system.  From
: grub menu to ssh login, the bootup time was 453s before the patch and 265s
: after the patch - a saving of 188s (42%).

Daniel Blueman said:

: On a 7TB, 1728-core NumaConnect system with 108 NUMA nodes, we're seeing
: stock 4.0 boot in 7136s.  This drops to 2159s, or a 70% reduction with
: this patchset.  Non-temporal PMD init (https://lkml.org/lkml/2015/4/23/350)
: drops this to 1045s.

This patch (of 13):

As part of initializing struct page's in 2MiB chunks, we noticed that at
the end of free_all_bootmem(), there was nothing which had forced the
reserved/allocated 4KiB pages to be initialized.

This helper function will be used for that expansion.

Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Nate Zimmer <nzimmer@sgi.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Nate Zimmer <nzimmer@sgi.com>
Tested-by: Waiman Long <waiman.long@hp.com>
Tested-by: Daniel J Blueman <daniel@numascale.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 18 ++++++++++++++++++
 mm/memblock.c            | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 0215ffd63069..cc4b01972060 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -101,6 +101,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 			  struct memblock_type *type_b, phys_addr_t *out_start,
 			  phys_addr_t *out_end, int *out_nid);
 
+void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
+			       phys_addr_t *out_end);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
@@ -142,6 +145,21 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 	     __next_mem_range_rev(&i, nid, flags, type_a, type_b,	\
 				  p_start, p_end, p_nid))
 
+/**
+ * for_each_reserved_mem_region - iterate over all reserved memblock areas
+ * @i: u64 used as loop variable
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over reserved areas of memblock. Available as soon as memblock
+ * is initialized.
+ */
+#define for_each_reserved_mem_region(i, p_start, p_end)			\
+	for (i = 0UL,							\
+	     __next_reserved_mem_region(&i, p_start, p_end);		\
+	     i != (u64)ULLONG_MAX;					\
+	     __next_reserved_mem_region(&i, p_start, p_end))
+
 #ifdef CONFIG_MOVABLE_NODE
 static inline bool memblock_is_hotpluggable(struct memblock_region *m)
 {
diff --git a/mm/memblock.c b/mm/memblock.c
index 1b444c730846..114df622853f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -819,6 +819,38 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
 }
 
 
+/**
+ * __next_reserved_mem_region - next function for for_each_reserved_region()
+ * @idx: pointer to u64 loop variable
+ * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL
+ *
+ * Iterate over all reserved memory regions.
+ */
+void __init_memblock __next_reserved_mem_region(u64 *idx,
+					   phys_addr_t *out_start,
+					   phys_addr_t *out_end)
+{
+	struct memblock_type *rsv = &memblock.reserved;
+
+	if (*idx >= 0 && *idx < rsv->cnt) {
+		struct memblock_region *r = &rsv->regions[*idx];
+		phys_addr_t base = r->base;
+		phys_addr_t size = r->size;
+
+		if (out_start)
+			*out_start = base;
+		if (out_end)
+			*out_end = base + size - 1;
+
+		*idx += 1;
+		return;
+	}
+
+	/* signal end of iteration */
+	*idx = ULLONG_MAX;
+}
+
 /**
  * __next__mem_range - next function for for_each_free_mem_range() etc.
  * @idx: pointer to u64 loop variable
-- 
cgit v1.2.3


From 92923ca3aacef63c92dc297a75ad0c6dfe4eab37 Mon Sep 17 00:00:00 2001
From: Nathan Zimmer <nzimmer@sgi.com>
Date: Tue, 30 Jun 2015 14:56:48 -0700
Subject: mm: meminit: only set page reserved in the memblock region

Currently each page struct is set as reserved upon initialization.  This
patch leaves the reserved bit clear and only sets the reserved bit when it
is known the memory was allocated by the bootmem allocator.  This makes it
easier to distinguish between uninitialised struct pages and reserved
struct pages in later patches.

Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Nathan Zimmer <nzimmer@sgi.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Nate Zimmer <nzimmer@sgi.com>
Tested-by: Waiman Long <waiman.long@hp.com>
Tested-by: Daniel J Blueman <daniel@numascale.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 ++
 mm/nobootmem.c     |  3 +++
 mm/page_alloc.c    | 17 ++++++++++++++++-
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 99959a34f4f1..d662af2d0d01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1635,6 +1635,8 @@ extern void free_highmem_page(struct page *page);
 extern void adjust_managed_page_count(struct page *page, long count);
 extern void mem_init_print_info(const char *str);
 
+extern void reserve_bootmem_region(unsigned long start, unsigned long end);
+
 /* Free the reserved page into the buddy system, so it gets managed. */
 static inline void __free_reserved_page(struct page *page)
 {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5258386fa1be..4af8f88c2bd1 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -130,6 +130,9 @@ static unsigned long __init free_low_memory_core_early(void)
 
 	memblock_clear_hotplug(0, -1);
 
+	for_each_reserved_mem_region(i, &start, &end)
+		reserve_bootmem_region(start, end);
+
 	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
 				NULL)
 		count += __free_memory_core(start, end);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bc5da2cdfc84..39c8d56a4056 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -774,7 +774,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 	init_page_count(page);
 	page_mapcount_reset(page);
 	page_cpupid_reset_last(page);
-	SetPageReserved(page);
 
 	/*
 	 * Mark the block movable so that blocks are reserved for
@@ -809,6 +808,22 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
 	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
 }
 
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ */
+void reserve_bootmem_region(unsigned long start, unsigned long end)
+{
+	unsigned long start_pfn = PFN_DOWN(start);
+	unsigned long end_pfn = PFN_UP(end);
+
+	for (; start_pfn < end_pfn; start_pfn++)
+		if (pfn_valid(start_pfn))
+			SetPageReserved(pfn_to_page(start_pfn));
+}
+
 static bool free_pages_prepare(struct page *page, unsigned int order)
 {
 	bool compound = PageCompound(page);
-- 
cgit v1.2.3


From 8a942fdea560d4ac0e9d9fabcd5201ad20e0c382 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 30 Jun 2015 14:56:55 -0700
Subject: mm: meminit: make __early_pfn_to_nid SMP-safe and introduce
 meminit_pfn_in_nid

__early_pfn_to_nid() use static variables to cache recent lookups as
memblock lookups are very expensive but it assumes that memory
initialisation is single-threaded.  Parallel initialisation of struct
pages will break that assumption so this patch makes __early_pfn_to_nid()
SMP-safe by requiring the caller to cache recent search information.
early_pfn_to_nid() keeps the same interface but is only safe to use early
in boot due to the use of a global static variable.  meminit_pfn_in_nid()
is an SMP-safe version that callers must maintain their own state for.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Nate Zimmer <nzimmer@sgi.com>
Tested-by: Waiman Long <waiman.long@hp.com>
Tested-by: Daniel J Blueman <daniel@numascale.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Nate Zimmer <nzimmer@sgi.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/numa.c    | 19 +++++++------------
 include/linux/mm.h     |  6 ++++--
 include/linux/mmzone.h | 16 +++++++++++++++-
 mm/page_alloc.c        | 40 +++++++++++++++++++++++++---------------
 4 files changed, 51 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c
index ea21d4cad540..aa19b7ac8222 100644
--- a/arch/ia64/mm/numa.c
+++ b/arch/ia64/mm/numa.c
@@ -58,27 +58,22 @@ paddr_to_nid(unsigned long paddr)
  * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where
  * the section resides.
  */
-int __meminit __early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+					struct mminit_pfnnid_cache *state)
 {
 	int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec;
-	/*
-	 * NOTE: The following SMP-unsafe globals are only used early in boot
-	 * when the kernel is running single-threaded.
-	 */
-	static int __meminitdata last_ssec, last_esec;
-	static int __meminitdata last_nid;
 
-	if (section >= last_ssec && section < last_esec)
-		return last_nid;
+	if (section >= state->last_start && section < state->last_end)
+		return state->last_nid;
 
 	for (i = 0; i < num_node_memblks; i++) {
 		ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT;
 		esec = (node_memblk[i].start_paddr + node_memblk[i].size +
 			((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT;
 		if (section >= ssec && section < esec) {
-			last_ssec = ssec;
-			last_esec = esec;
-			last_nid = node_memblk[i].nid;
+			state->last_start = ssec;
+			state->last_end = esec;
+			state->last_nid = node_memblk[i].nid;
 			return node_memblk[i].nid;
 		}
 	}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d662af2d0d01..2e872f92dbac 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1726,7 +1726,8 @@ extern void sparse_memory_present_with_active_regions(int nid);
 
 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
     !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
-static inline int __early_pfn_to_nid(unsigned long pfn)
+static inline int __early_pfn_to_nid(unsigned long pfn,
+					struct mminit_pfnnid_cache *state)
 {
 	return 0;
 }
@@ -1734,7 +1735,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn)
 /* please see mm/page_alloc.c */
 extern int __meminit early_pfn_to_nid(unsigned long pfn);
 /* there is a per-arch backend function. */
-extern int __meminit __early_pfn_to_nid(unsigned long pfn);
+extern int __meminit __early_pfn_to_nid(unsigned long pfn,
+					struct mminit_pfnnid_cache *state);
 #endif
 
 extern void set_dma_reserve(unsigned long new_dma_reserve);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 54d74f6eb233..b2473d822549 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1216,10 +1216,24 @@ void sparse_init(void);
 #define sparse_index_init(_sec, _nid)  do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
+/*
+ * During memory init memblocks map pfns to nids. The search is expensive and
+ * this caches recent lookups. The implementation of __early_pfn_to_nid
+ * may treat start/end as pfns or sections.
+ */
+struct mminit_pfnnid_cache {
+	unsigned long last_start;
+	unsigned long last_end;
+	int last_nid;
+};
+
 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
 bool early_pfn_in_nid(unsigned long pfn, int nid);
+bool meminit_pfn_in_nid(unsigned long pfn, int node,
+			struct mminit_pfnnid_cache *state);
 #else
-#define early_pfn_in_nid(pfn, nid)	(1)
+#define early_pfn_in_nid(pfn, nid)		(1)
+#define meminit_pfn_in_nid(pfn, nid, state)	(1)
 #endif
 
 #ifndef early_pfn_valid
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2ee4ecad083..ffdb2308848d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4551,39 +4551,41 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
 /*
  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
  */
-int __meminit __early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+					struct mminit_pfnnid_cache *state)
 {
 	unsigned long start_pfn, end_pfn;
 	int nid;
-	/*
-	 * NOTE: The following SMP-unsafe globals are only used early in boot
-	 * when the kernel is running single-threaded.
-	 */
-	static unsigned long __meminitdata last_start_pfn, last_end_pfn;
-	static int __meminitdata last_nid;
 
-	if (last_start_pfn <= pfn && pfn < last_end_pfn)
-		return last_nid;
+	if (state->last_start <= pfn && pfn < state->last_end)
+		return state->last_nid;
 
 	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
 	if (nid != -1) {
-		last_start_pfn = start_pfn;
-		last_end_pfn = end_pfn;
-		last_nid = nid;
+		state->last_start = start_pfn;
+		state->last_end = end_pfn;
+		state->last_nid = nid;
 	}
 
 	return nid;
 }
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+/* Only safe to use early in boot when initialisation is single-threaded */
 int __meminit early_pfn_to_nid(unsigned long pfn)
 {
 	int nid;
 
-	nid = __early_pfn_to_nid(pfn);
+	/* The system will behave unpredictably otherwise */
+	BUG_ON(system_state != SYSTEM_BOOTING);
+
+	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
 	if (nid >= 0)
 		return nid;
 	/* just returns 0 */
@@ -4591,15 +4593,23 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
 }
 
 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
-bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+					struct mminit_pfnnid_cache *state)
 {
 	int nid;
 
-	nid = __early_pfn_to_nid(pfn);
+	nid = __early_pfn_to_nid(pfn, state);
 	if (nid >= 0 && nid != node)
 		return false;
 	return true;
 }
+
+/* Only safe to use early in boot when initialisation is single-threaded */
+bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+	return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
+}
+
 #endif
 
 /**
-- 
cgit v1.2.3


From 75a592a47129dcfc1aec40e7d3cdf239a767d441 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 30 Jun 2015 14:56:59 -0700
Subject: mm: meminit: inline some helper functions

early_pfn_in_nid() and meminit_pfn_in_nid() are small functions that are
unnecessarily visible outside memory initialisation.  As well as
unnecessary visibility, it's unnecessary function call overhead when
initialising pages.  This patch moves the helpers inline.

[akpm@linux-foundation.org: fix build]
[mhocko@suse.cz: fix build]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Nate Zimmer <nzimmer@sgi.com>
Tested-by: Waiman Long <waiman.long@hp.com>
Tested-by: Daniel J Blueman <daniel@numascale.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Nate Zimmer <nzimmer@sgi.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  9 -----
 mm/page_alloc.c        | 89 +++++++++++++++++++++++++++++---------------------
 2 files changed, 52 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b2473d822549..1e05dc7449cd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1227,15 +1227,6 @@ struct mminit_pfnnid_cache {
 	int last_nid;
 };
 
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-bool early_pfn_in_nid(unsigned long pfn, int nid);
-bool meminit_pfn_in_nid(unsigned long pfn, int node,
-			struct mminit_pfnnid_cache *state);
-#else
-#define early_pfn_in_nid(pfn, nid)		(1)
-#define meminit_pfn_in_nid(pfn, nid, state)	(1)
-#endif
-
 #ifndef early_pfn_valid
 #define early_pfn_valid(pfn)	(1)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ffdb2308848d..12a81870815f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -899,6 +899,58 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
 	__free_pages(page, order);
 }
 
+#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
+	defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+	int nid;
+
+	/* The system will behave unpredictably otherwise */
+	BUG_ON(system_state != SYSTEM_BOOTING);
+
+	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+	if (nid >= 0)
+		return nid;
+	/* just returns 0 */
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+					struct mminit_pfnnid_cache *state)
+{
+	int nid;
+
+	nid = __early_pfn_to_nid(pfn, state);
+	if (nid >= 0 && nid != node)
+		return false;
+	return true;
+}
+
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+	return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
+}
+
+#else
+
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+	return true;
+}
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+					struct mminit_pfnnid_cache *state)
+{
+	return true;
+}
+#endif
+
+
 #ifdef CONFIG_CMA
 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
 void __init init_cma_reserved_pageblock(struct page *page)
@@ -4575,43 +4627,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
 }
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 
-static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
-
-/* Only safe to use early in boot when initialisation is single-threaded */
-int __meminit early_pfn_to_nid(unsigned long pfn)
-{
-	int nid;
-
-	/* The system will behave unpredictably otherwise */
-	BUG_ON(system_state != SYSTEM_BOOTING);
-
-	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-	if (nid >= 0)
-		return nid;
-	/* just returns 0 */
-	return 0;
-}
-
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
-					struct mminit_pfnnid_cache *state)
-{
-	int nid;
-
-	nid = __early_pfn_to_nid(pfn, state);
-	if (nid >= 0 && nid != node)
-		return false;
-	return true;
-}
-
-/* Only safe to use early in boot when initialisation is single-threaded */
-bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
-	return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
-#endif
-
 /**
  * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
-- 
cgit v1.2.3


From 3a80a7fa7989fbb6aa56bb6ad31811b62cf99e60 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 30 Jun 2015 14:57:02 -0700
Subject: mm: meminit: initialise a subset of struct pages if
 CONFIG_DEFERRED_STRUCT_PAGE_INIT is set

This patch initalises all low memory struct pages and 2G of the highest
zone on each node during memory initialisation if
CONFIG_DEFERRED_STRUCT_PAGE_INIT is set.  That config option cannot be set
but will be available in a later patch.  Parallel initialisation of struct
page depends on some features from memory hotplug and it is necessary to
alter alter section annotations.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Nate Zimmer <nzimmer@sgi.com>
Tested-by: Waiman Long <waiman.long@hp.com>
Tested-by: Daniel J Blueman <daniel@numascale.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Robin Holt <robinmholt@gmail.com>
Cc: Nate Zimmer <nzimmer@sgi.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/base/node.c    |  6 +++-
 include/linux/mmzone.h |  8 ++++++
 mm/Kconfig             | 18 ++++++++++++
 mm/internal.h          | 18 ++++++++++++
 mm/page_alloc.c        | 78 ++++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 124 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index a2aa65b4215d..31df474d72f4 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -359,12 +359,16 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 #define page_initialized(page)  (page->lru.next)
 
-static int get_nid_for_pfn(unsigned long pfn)
+static int __init_refok get_nid_for_pfn(unsigned long pfn)
 {
 	struct page *page;
 
 	if (!pfn_valid_within(pfn))
 		return -1;
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+	if (system_state == SYSTEM_BOOTING)
+		return early_pfn_to_nid(pfn);
+#endif
 	page = pfn_to_page(pfn);
 	if (!page_initialized(page))
 		return -1;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1e05dc7449cd..754c25966a0a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -762,6 +762,14 @@ typedef struct pglist_data {
 	/* Number of pages migrated during the rate limiting time interval */
 	unsigned long numabalancing_migrate_nr_pages;
 #endif
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+	/*
+	 * If memory initialisation on large machines is deferred then this
+	 * is the first PFN that needs to be initialised.
+	 */
+	unsigned long first_deferred_pfn;
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/mm/Kconfig b/mm/Kconfig
index c180af880ed5..e79de2bd12cd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -636,3 +636,21 @@ config MAX_STACK_SIZE_MB
 	  changed to a smaller value in which case that is used.
 
 	  A sane initial value is 80 MB.
+
+# For architectures that support deferred memory initialisation
+config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
+	bool
+
+config DEFERRED_STRUCT_PAGE_INIT
+	bool "Defer initialisation of struct pages to kswapd"
+	default n
+	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
+	depends on MEMORY_HOTPLUG
+	help
+	  Ordinarily all struct pages are initialised during early boot in a
+	  single thread. On very large machines this can take a considerable
+	  amount of time. If this option is set, large machines will bring up
+	  a subset of memmap at boot and then initialise the rest in parallel
+	  when kswapd starts. This has a potential performance impact on
+	  processes running early in the lifetime of the systemm until kswapd
+	  finishes the initialisation.
diff --git a/mm/internal.h b/mm/internal.h
index 58e9022e3757..88ac7be741ca 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -387,6 +387,24 @@ static inline void mminit_verify_zonelist(void)
 }
 #endif /* CONFIG_DEBUG_MEMORY_INIT */
 
+/*
+ * Deferred struct page initialisation requires init functions that are freed
+ * before kswapd is available. Reuse the memory hotplug section annotation
+ * to mark the required code.
+ *
+ * __defermem_init is code that always exists but is annotated __meminit to
+ * 	avoid section warnings.
+ * __defer_init code gets marked __meminit when deferring struct page
+ *	initialistion but is otherwise in the init section.
+ */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+#define __defermem_init __meminit
+#define __defer_init    __meminit
+#else
+#define __defermem_init
+#define __defer_init __init
+#endif
+
 /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
 #if defined(CONFIG_SPARSEMEM)
 extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 12a81870815f..7af45b2e8870 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -235,6 +235,64 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+	pgdat->first_deferred_pfn = ULONG_MAX;
+}
+
+/* Returns true if the struct page for the pfn is uninitialised */
+static inline bool __defermem_init early_page_uninitialised(unsigned long pfn)
+{
+	int nid = early_pfn_to_nid(pfn);
+
+	if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
+		return true;
+
+	return false;
+}
+
+/*
+ * Returns false when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ */
+static inline bool update_defer_init(pg_data_t *pgdat,
+				unsigned long pfn, unsigned long zone_end,
+				unsigned long *nr_initialised)
+{
+	/* Always populate low zones for address-contrained allocations */
+	if (zone_end < pgdat_end_pfn(pgdat))
+		return true;
+
+	/* Initialise at least 2G of the highest zone */
+	(*nr_initialised)++;
+	if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+		pgdat->first_deferred_pfn = pfn;
+		return false;
+	}
+
+	return true;
+}
+#else
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+}
+
+static inline bool early_page_uninitialised(unsigned long pfn)
+{
+	return false;
+}
+
+static inline bool update_defer_init(pg_data_t *pgdat,
+				unsigned long pfn, unsigned long zone_end,
+				unsigned long *nr_initialised)
+{
+	return true;
+}
+#endif
+
+
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 	if (unlikely(page_group_by_mobility_disabled &&
@@ -878,8 +936,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
-							unsigned int order)
+static void __defer_init __free_pages_boot_core(struct page *page,
+					unsigned long pfn, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
 	struct page *p = page;
@@ -951,6 +1009,14 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
 #endif
 
 
+void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn,
+							unsigned int order)
+{
+	if (early_page_uninitialised(pfn))
+		return;
+	return __free_pages_boot_core(page, pfn, order);
+}
+
 #ifdef CONFIG_CMA
 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
 void __init init_cma_reserved_pageblock(struct page *page)
@@ -4325,14 +4391,16 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn, enum memmap_context context)
 {
+	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long end_pfn = start_pfn + size;
 	unsigned long pfn;
 	struct zone *z;
+	unsigned long nr_initialised = 0;
 
 	if (highest_memmap_pfn < end_pfn - 1)
 		highest_memmap_pfn = end_pfn - 1;
 
-	z = &NODE_DATA(nid)->node_zones[zone];
+	z = &pgdat->node_zones[zone];
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		/*
 		 * There can be holes in boot-time mem_map[]s
@@ -4344,6 +4412,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 				continue;
 			if (!early_pfn_in_nid(pfn, nid))
 				continue;
+			if (!update_defer_init(pgdat, pfn, end_pfn,
+						&nr_initialised))
+				break;
 		}
 		__init_single_pfn(pfn, zone, nid);
 	}
@@ -5144,6 +5215,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	/* pg_data_t should be reset to zero when it's allocated */
 	WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
 
+	reset_deferred_meminit(pgdat);
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-- 
cgit v1.2.3


From 0e1cc95b4cc7293bb7b39175035e7f7e45c90977 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 30 Jun 2015 14:57:27 -0700
Subject: mm: meminit: finish initialisation of struct pages before basic setup

Waiman Long reported that 24TB machines hit OOM during basic setup when
struct page initialisation was deferred.  One approach is to initialise
memory on demand but it interferes with page allocator paths.  This patch
creates dedicated threads to initialise memory before basic setup.  It
then blocks on a rw_semaphore until completion as a wait_queue and counter
is overkill.  This may be slower to boot but it's simplier overall and
also gets rid of a section mangling which existed so kswapd could do the
initialisation.

[akpm@linux-foundation.org: include rwsem.h, use DECLARE_RWSEM, fix comment, remove unneeded cast]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Waiman Long <waiman.long@hp.com
Cc: Nathan Zimmer <nzimmer@sgi.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Scott Norton <scott.norton@hp.com>
Tested-by: Daniel J Blueman <daniel@numascale.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h |  8 ++++++++
 init/main.c         |  2 ++
 mm/internal.h       | 24 ------------------------
 mm/page_alloc.c     | 46 +++++++++++++++++++++++++++++++++++++---------
 mm/vmscan.c         |  6 ++----
 5 files changed, 49 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 6ba7cf23748f..ad35f300b9a4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -384,6 +384,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+void page_alloc_init_late(void);
+#else
+static inline void page_alloc_init_late(void)
+{
+}
+#endif
+
 /*
  * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
  * GFP flags are used before interrupts are enabled. Once interrupts are
diff --git a/init/main.c b/init/main.c
index c599aea23bb1..c5d5626289ce 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1004,6 +1004,8 @@ static noinline void __init kernel_init_freeable(void)
 	smp_init();
 	sched_init_smp();
 
+	page_alloc_init_late();
+
 	do_basic_setup();
 
 	/* Open the /dev/console on the rootfs, this should never fail */
diff --git a/mm/internal.h b/mm/internal.h
index a48cbefde8ca..36b23f1e2ca6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -379,30 +379,6 @@ static inline void mminit_verify_zonelist(void)
 }
 #endif /* CONFIG_DEBUG_MEMORY_INIT */
 
-/*
- * Deferred struct page initialisation requires init functions that are freed
- * before kswapd is available. Reuse the memory hotplug section annotation
- * to mark the required code.
- *
- * __defermem_init is code that always exists but is annotated __meminit to
- * 	avoid section warnings.
- * __defer_init code gets marked __meminit when deferring struct page
- *	initialistion but is otherwise in the init section.
- */
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-#define __defermem_init __meminit
-#define __defer_init    __meminit
-
-void deferred_init_memmap(int nid);
-#else
-#define __defermem_init
-#define __defer_init __init
-
-static inline void deferred_init_memmap(int nid)
-{
-}
-#endif
-
 /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
 #if defined(CONFIG_SPARSEMEM)
 extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a38e39b30d1..506eac8b38af 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
+#include <linux/rwsem.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
@@ -61,6 +62,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
+#include <linux/kthread.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -242,7 +244,7 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat)
 }
 
 /* Returns true if the struct page for the pfn is uninitialised */
-static inline bool __defermem_init early_page_uninitialised(unsigned long pfn)
+static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 {
 	int nid = early_pfn_to_nid(pfn);
 
@@ -958,7 +960,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
-static void __defer_init __free_pages_boot_core(struct page *page,
+static void __init __free_pages_boot_core(struct page *page,
 					unsigned long pfn, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
@@ -1031,7 +1033,7 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
 #endif
 
 
-void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn,
+void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
 							unsigned int order)
 {
 	if (early_page_uninitialised(pfn))
@@ -1040,7 +1042,7 @@ void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn,
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __defermem_init deferred_free_range(struct page *page,
+static void __init deferred_free_range(struct page *page,
 					unsigned long pfn, int nr_pages)
 {
 	int i;
@@ -1060,20 +1062,30 @@ static void __defermem_init deferred_free_range(struct page *page,
 		__free_pages_boot_core(page, pfn, 0);
 }
 
+static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+
 /* Initialise remaining memory on a node */
-void __defermem_init deferred_init_memmap(int nid)
+static int __init deferred_init_memmap(void *data)
 {
+	pg_data_t *pgdat = data;
+	int nid = pgdat->node_id;
 	struct mminit_pfnnid_cache nid_init_state = { };
 	unsigned long start = jiffies;
 	unsigned long nr_pages = 0;
 	unsigned long walk_start, walk_end;
 	int i, zid;
 	struct zone *zone;
-	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long first_init_pfn = pgdat->first_deferred_pfn;
+	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
-	if (first_init_pfn == ULONG_MAX)
-		return;
+	if (first_init_pfn == ULONG_MAX) {
+		up_read(&pgdat_init_rwsem);
+		return 0;
+	}
+
+	/* Bind memory initialisation thread to a local node if possible */
+	if (!cpumask_empty(cpumask))
+		set_cpus_allowed_ptr(current, cpumask);
 
 	/* Sanity check boundaries */
 	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
@@ -1165,8 +1177,24 @@ free_range:
 	/* Sanity check that the next zone really is unpopulated */
 	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
 
-	pr_info("kswapd %d initialised %lu pages in %ums\n", nid, nr_pages,
+	pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
 					jiffies_to_msecs(jiffies - start));
+	up_read(&pgdat_init_rwsem);
+	return 0;
+}
+
+void __init page_alloc_init_late(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY) {
+		down_read(&pgdat_init_rwsem);
+		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+	}
+
+	/* Block until all are initialised */
+	down_write(&pgdat_init_rwsem);
+	up_write(&pgdat_init_rwsem);
 }
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f4a487110764..e61445dce04e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3386,7 +3386,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  * If there are applications that are active memory-allocators
  * (most normal use), this basically shouldn't matter.
  */
-static int __defermem_init kswapd(void *p)
+static int kswapd(void *p)
 {
 	unsigned long order, new_order;
 	unsigned balanced_order;
@@ -3421,8 +3421,6 @@ static int __defermem_init kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
-	deferred_init_memmap(pgdat->node_id);
-
 	order = new_order = 0;
 	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
@@ -3578,7 +3576,7 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action,
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
  */
-int __defermem_init kswapd_run(int nid)
+int kswapd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
 	int ret = 0;
-- 
cgit v1.2.3


From 5375b708f2547f70cd2bee2fd8663ab7035f9551 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Tue, 30 Jun 2015 14:57:46 -0700
Subject: kernel/panic/kexec: fix "crash_kexec_post_notifiers" option issue in
 oops path

Commit f06e5153f4ae2e ("kernel/panic.c: add "crash_kexec_post_notifiers"
option for kdump after panic_notifers") introduced
"crash_kexec_post_notifiers" kernel boot option, which toggles wheather
panic() calls crash_kexec() before panic_notifiers and dump kmsg or after.

The problem is that the commit overlooks panic_on_oops kernel boot option.
 If it is enabled, crash_kexec() is called directly without going through
panic() in oops path.

To fix this issue, this patch adds a check to "crash_kexec_post_notifiers"
in the condition of kexec_should_crash().

Also, put a comment in kexec_should_crash() to explain not obvious things
on this patch.

Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Acked-by: Baoquan He <bhe@redhat.com>
Tested-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h |  3 +++
 kernel/kexec.c         | 11 +++++++++++
 kernel/panic.c         |  2 +-
 3 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5acf5b70866d..0dfa4e31563d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -439,6 +439,9 @@ extern int panic_on_unrecovered_nmi;
 extern int panic_on_io_nmi;
 extern int panic_on_warn;
 extern int sysctl_panic_on_stackoverflow;
+
+extern bool crash_kexec_post_notifiers;
+
 /*
  * Only to be used by arch init code. If the user over-wrote the default
  * CONFIG_PANIC_TIMEOUT, honor it.
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7a36fdcca5bf..a785c1015e25 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -84,6 +84,17 @@ struct resource crashk_low_res = {
 
 int kexec_should_crash(struct task_struct *p)
 {
+	/*
+	 * If crash_kexec_post_notifiers is enabled, don't run
+	 * crash_kexec() here yet, which must be run after panic
+	 * notifiers in panic().
+	 */
+	if (crash_kexec_post_notifiers)
+		return 0;
+	/*
+	 * There are 4 panic() calls in do_exit() path, each of which
+	 * corresponds to each of these 4 conditions.
+	 */
 	if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
 		return 1;
 	return 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index 774614f72cbd..04e91ff7560b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,7 +32,7 @@ static unsigned long tainted_mask;
 static int pause_on_oops;
 static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
-static bool crash_kexec_post_notifiers;
+bool crash_kexec_post_notifiers;
 int panic_on_warn __read_mostly;
 
 int panic_timeout = CONFIG_PANIC_TIMEOUT;
-- 
cgit v1.2.3


From 2a1bf8f93b33992bb0457512b28d046e279bbd7e Mon Sep 17 00:00:00 2001
From: Dave Gordon <david.s.gordon@intel.com>
Date: Tue, 30 Jun 2015 14:58:54 -0700
Subject: lib/scatterlist: mark input buffer parameters as 'const'

The 'buf' parameter of sg(p)copy_from_buffer() can and should be
const-qualified, although because of the shared implementation of
_to_buffer() and _from_buffer(), we have to cast this away internally.

This means that callers who have a 'const' buffer containing the data to
be copied to the sg-list no longer have to cast away the const-ness
themselves.  It also enables improved coverage by code analysis tools.

Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/scatterlist.h | 4 ++--
 lib/scatterlist.c           | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 50a8486c524b..505d0481df1e 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -266,12 +266,12 @@ int sg_alloc_table_from_pages(struct sg_table *sgt,
 	gfp_t gfp_mask);
 
 size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
-			   void *buf, size_t buflen);
+			   const void *buf, size_t buflen);
 size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
 			 void *buf, size_t buflen);
 
 size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
-			    void *buf, size_t buflen, off_t skip);
+			    const void *buf, size_t buflen, off_t skip);
 size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents,
 			  void *buf, size_t buflen, off_t skip);
 
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 965c36e7a5a4..317b62c6da3c 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -701,9 +701,9 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
  *
  **/
 size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
-			   void *buf, size_t buflen)
+			   const void *buf, size_t buflen)
 {
-	return sg_copy_buffer(sgl, nents, buf, buflen, 0, false);
+	return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false);
 }
 EXPORT_SYMBOL(sg_copy_from_buffer);
 
@@ -736,9 +736,9 @@ EXPORT_SYMBOL(sg_copy_to_buffer);
  *
  **/
 size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents,
-			    void *buf, size_t buflen, off_t skip)
+			    const void *buf, size_t buflen, off_t skip)
 {
-	return sg_copy_buffer(sgl, nents, buf, buflen, skip, false);
+	return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false);
 }
 EXPORT_SYMBOL(sg_pcopy_from_buffer);
 
-- 
cgit v1.2.3


From 386ecb1216f9e38947ce6a2af22e5e1e47256a97 Mon Sep 17 00:00:00 2001
From: Dave Gordon <david.s.gordon@intel.com>
Date: Tue, 30 Jun 2015 14:58:57 -0700
Subject: drivers/scsi/scsi_debug.c: resolve sg buffer const-ness issue

do_device_access() takes a separate parameter to indicate the direction of
data transfer, which it used to use to select the appropriate function out
of sg_pcopy_{to,from}_buffer().  However these two functions now have

So this patch makes it bypass these wrappers and call the underlying
function sg_copy_buffer() directly; this has the same calling style as
do_device_access() i.e.  a separate direction-of-transfer parameter and no
pointers-to-const, so skipping the wrappers not only eliminates the
warning, it also make the code simpler :)

[akpm@linux-foundation.org: fix very broken build]
Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/scsi/scsi_debug.c   | 12 ++++--------
 include/linux/scatterlist.h |  3 +++
 lib/scatterlist.c           |  6 +++---
 3 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 1f8e2dc9c616..30268bb2ddb6 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -2363,17 +2363,13 @@ do_device_access(struct scsi_cmnd *scmd, u64 lba, u32 num, bool do_write)
 	u64 block, rest = 0;
 	struct scsi_data_buffer *sdb;
 	enum dma_data_direction dir;
-	size_t (*func)(struct scatterlist *, unsigned int, void *, size_t,
-		       off_t);
 
 	if (do_write) {
 		sdb = scsi_out(scmd);
 		dir = DMA_TO_DEVICE;
-		func = sg_pcopy_to_buffer;
 	} else {
 		sdb = scsi_in(scmd);
 		dir = DMA_FROM_DEVICE;
-		func = sg_pcopy_from_buffer;
 	}
 
 	if (!sdb->length)
@@ -2385,16 +2381,16 @@ do_device_access(struct scsi_cmnd *scmd, u64 lba, u32 num, bool do_write)
 	if (block + num > sdebug_store_sectors)
 		rest = block + num - sdebug_store_sectors;
 
-	ret = func(sdb->table.sgl, sdb->table.nents,
+	ret = sg_copy_buffer(sdb->table.sgl, sdb->table.nents,
 		   fake_storep + (block * scsi_debug_sector_size),
-		   (num - rest) * scsi_debug_sector_size, 0);
+		   (num - rest) * scsi_debug_sector_size, 0, do_write);
 	if (ret != (num - rest) * scsi_debug_sector_size)
 		return ret;
 
 	if (rest) {
-		ret += func(sdb->table.sgl, sdb->table.nents,
+		ret += sg_copy_buffer(sdb->table.sgl, sdb->table.nents,
 			    fake_storep, rest * scsi_debug_sector_size,
-			    (num - rest) * scsi_debug_sector_size);
+			    (num - rest) * scsi_debug_sector_size, do_write);
 	}
 
 	return ret;
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 505d0481df1e..9b1ef0c820a7 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -265,6 +265,9 @@ int sg_alloc_table_from_pages(struct sg_table *sgt,
 	unsigned long offset, unsigned long size,
 	gfp_t gfp_mask);
 
+size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
+		      size_t buflen, off_t skip, bool to_buffer);
+
 size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
 			   const void *buf, size_t buflen);
 size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 317b62c6da3c..d105a9f56878 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -650,9 +650,8 @@ EXPORT_SYMBOL(sg_miter_stop);
  * Returns the number of copied bytes.
  *
  **/
-static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
-			     void *buf, size_t buflen, off_t skip,
-			     bool to_buffer)
+size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
+		      size_t buflen, off_t skip, bool to_buffer)
 {
 	unsigned int offset = 0;
 	struct sg_mapping_iter miter;
@@ -689,6 +688,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
 	local_irq_restore(flags);
 	return offset;
 }
+EXPORT_SYMBOL(sg_copy_buffer);
 
 /**
  * sg_copy_from_buffer - Copy from a linear buffer to an SG list
-- 
cgit v1.2.3


From 0030edf296db8a7afb13573eb12977b7d399cd40 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Date: Tue, 30 Jun 2015 15:00:03 -0700
Subject: genalloc: rename dev_get_gen_pool() to gen_pool_get()

To be consistent with other genalloc interface namings, rename
dev_get_gen_pool() to gen_pool_get().  The original omitted "dev_" prefix
is removed, since it points to argument type of the function, and so it
does not bring any useful information.

[akpm@linux-foundation.org: update arch/arm/mach-socfpga/pm.c]
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: Philipp Zabel <p.zabel@pengutronix.de>
Cc: Shawn Guo <shawn.guo@linaro.org>
Cc: Sascha Hauer <kernel@pengutronix.de>
Cc: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Jaroslav Kysela <perex@perex.cz>
Cc: Mark Brown <broonie@kernel.org>
Cc: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: Alan Tull <atull@opensource.altera.com>
Cc: Dinh Nguyen <dinguyen@opensource.altera.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-at91/pm.c                   | 2 +-
 arch/arm/mach-imx/pm-imx5.c               | 2 +-
 arch/arm/mach-imx/pm-imx6.c               | 2 +-
 arch/arm/mach-socfpga/pm.c                | 2 +-
 drivers/media/platform/coda/coda-common.c | 2 +-
 include/linux/genalloc.h                  | 2 +-
 lib/genalloc.c                            | 8 ++++----
 7 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 1e184767c3be..e24df77abd79 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void)
 		return;
 	}
 
-	sram_pool = dev_get_gen_pool(&pdev->dev);
+	sram_pool = gen_pool_get(&pdev->dev);
 	if (!sram_pool) {
 		pr_warn("%s: sram pool unavailable!\n", __func__);
 		return;
diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c
index 0309ccda36a9..1885676c23c0 100644
--- a/arch/arm/mach-imx/pm-imx5.c
+++ b/arch/arm/mach-imx/pm-imx5.c
@@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram(
 		goto put_node;
 	}
 
-	ocram_pool = dev_get_gen_pool(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c
index b01650d94f91..93ecf559d06d 100644
--- a/arch/arm/mach-imx/pm-imx6.c
+++ b/arch/arm/mach-imx/pm-imx6.c
@@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata)
 		goto put_node;
 	}
 
-	ocram_pool = dev_get_gen_pool(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c
index 1ed89fc2b7a8..6a4199f2bffb 100644
--- a/arch/arm/mach-socfpga/pm.c
+++ b/arch/arm/mach-socfpga/pm.c
@@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void)
 		goto put_node;
 	}
 
-	ocram_pool = dev_get_gen_pool(&pdev->dev);
+	ocram_pool = gen_pool_get(&pdev->dev);
 	if (!ocram_pool) {
 		pr_warn("%s: ocram pool unavailable!\n", __func__);
 		ret = -ENODEV;
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c
index 6d6e0ca91fb4..6e640c0f0b35 100644
--- a/drivers/media/platform/coda/coda-common.c
+++ b/drivers/media/platform/coda/coda-common.c
@@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev)
 	/* Get IRAM pool from device tree or platform data */
 	pool = of_get_named_gen_pool(np, "iram", 0);
 	if (!pool && pdata)
-		pool = dev_get_gen_pool(pdata->iram_dev);
+		pool = gen_pool_get(pdata->iram_dev);
 	if (!pool) {
 		dev_err(&pdev->dev, "iram pool not available\n");
 		return -ENOMEM;
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 1ccaab44abcc..015d17068615 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -119,7 +119,7 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
 
 extern struct gen_pool *devm_gen_pool_create(struct device *dev,
 		int min_alloc_order, int nid);
-extern struct gen_pool *dev_get_gen_pool(struct device *dev);
+extern struct gen_pool *gen_pool_get(struct device *dev);
 
 bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
 			size_t size);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index d214866eeea2..948e92cd9794 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -602,12 +602,12 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
 EXPORT_SYMBOL(devm_gen_pool_create);
 
 /**
- * dev_get_gen_pool - Obtain the gen_pool (if any) for a device
+ * gen_pool_get - Obtain the gen_pool (if any) for a device
  * @dev: device to retrieve the gen_pool from
  *
  * Returns the gen_pool for the device if one is present, or NULL.
  */
-struct gen_pool *dev_get_gen_pool(struct device *dev)
+struct gen_pool *gen_pool_get(struct device *dev)
 {
 	struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL,
 					NULL);
@@ -616,7 +616,7 @@ struct gen_pool *dev_get_gen_pool(struct device *dev)
 		return NULL;
 	return *p;
 }
-EXPORT_SYMBOL_GPL(dev_get_gen_pool);
+EXPORT_SYMBOL_GPL(gen_pool_get);
 
 #ifdef CONFIG_OF
 /**
@@ -642,7 +642,7 @@ struct gen_pool *of_get_named_gen_pool(struct device_node *np,
 	of_node_put(np_pool);
 	if (!pdev)
 		return NULL;
-	return dev_get_gen_pool(&pdev->dev);
+	return gen_pool_get(&pdev->dev);
 }
 EXPORT_SYMBOL_GPL(of_get_named_gen_pool);
 #endif /* CONFIG_OF */
-- 
cgit v1.2.3


From abdd4a7025282fbe3737e1bcb5f51afc8d8ea1b8 Mon Sep 17 00:00:00 2001
From: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Date: Tue, 30 Jun 2015 15:00:07 -0700
Subject: genalloc: rename of_get_named_gen_pool() to of_gen_pool_get()

To be consistent with other kernel interface namings, rename
of_get_named_gen_pool() to of_gen_pool_get().  In the original function
name "_named" suffix references to a device tree property, which contains
a phandle to a device and the corresponding device driver is assumed to
register a gen_pool object.

Due to a weak relation and to avoid any confusion (e.g.  in future
possible scenario if gen_pool objects are named) the suffix is removed.

[sfr@canb.auug.org.au: crypto/marvell/cesa - fix up for of_get_named_gen_pool() rename]
Signed-off-by: Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>
Cc: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: Philipp Zabel <p.zabel@pengutronix.de>
Cc: Shawn Guo <shawn.guo@linaro.org>
Cc: Sascha Hauer <kernel@pengutronix.de>
Cc: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Boris BREZILLON <boris.brezillon@free-electrons.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/crypto/marvell/cesa.c             | 5 ++---
 drivers/dma/mmp_tdma.c                    | 2 +-
 drivers/media/platform/coda/coda-common.c | 2 +-
 include/linux/genalloc.h                  | 4 ++--
 lib/genalloc.c                            | 6 +++---
 sound/core/memalloc.c                     | 2 +-
 6 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/crypto/marvell/cesa.c b/drivers/crypto/marvell/cesa.c
index a432633bced4..1c6f98dd88f4 100644
--- a/drivers/crypto/marvell/cesa.c
+++ b/drivers/crypto/marvell/cesa.c
@@ -321,9 +321,8 @@ static int mv_cesa_get_sram(struct platform_device *pdev, int idx)
 	const char *res_name = "sram";
 	struct resource *res;
 
-	engine->pool = of_get_named_gen_pool(cesa->dev->of_node,
-					     "marvell,crypto-srams",
-					     idx);
+	engine->pool = of_gen_pool_get(cesa->dev->of_node,
+				       "marvell,crypto-srams", idx);
 	if (engine->pool) {
 		engine->sram = gen_pool_dma_alloc(engine->pool,
 						  cesa->sram_size,
diff --git a/drivers/dma/mmp_tdma.c b/drivers/dma/mmp_tdma.c
index 449e785def17..e683761e0f8f 100644
--- a/drivers/dma/mmp_tdma.c
+++ b/drivers/dma/mmp_tdma.c
@@ -657,7 +657,7 @@ static int mmp_tdma_probe(struct platform_device *pdev)
 	INIT_LIST_HEAD(&tdev->device.channels);
 
 	if (pdev->dev.of_node)
-		pool = of_get_named_gen_pool(pdev->dev.of_node, "asram", 0);
+		pool = of_gen_pool_get(pdev->dev.of_node, "asram", 0);
 	else
 		pool = sram_get_gpool("asram");
 	if (!pool) {
diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c
index 6e640c0f0b35..58f65486de33 100644
--- a/drivers/media/platform/coda/coda-common.c
+++ b/drivers/media/platform/coda/coda-common.c
@@ -2155,7 +2155,7 @@ static int coda_probe(struct platform_device *pdev)
 	}
 
 	/* Get IRAM pool from device tree or platform data */
-	pool = of_get_named_gen_pool(np, "iram", 0);
+	pool = of_gen_pool_get(np, "iram", 0);
 	if (!pool && pdata)
 		pool = gen_pool_get(pdata->iram_dev);
 	if (!pool) {
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 015d17068615..5383bb1394a1 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -125,10 +125,10 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
 			size_t size);
 
 #ifdef CONFIG_OF
-extern struct gen_pool *of_get_named_gen_pool(struct device_node *np,
+extern struct gen_pool *of_gen_pool_get(struct device_node *np,
 	const char *propname, int index);
 #else
-static inline struct gen_pool *of_get_named_gen_pool(struct device_node *np,
+static inline struct gen_pool *of_gen_pool_get(struct device_node *np,
 	const char *propname, int index)
 {
 	return NULL;
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 948e92cd9794..daf0afb6d979 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -620,7 +620,7 @@ EXPORT_SYMBOL_GPL(gen_pool_get);
 
 #ifdef CONFIG_OF
 /**
- * of_get_named_gen_pool - find a pool by phandle property
+ * of_gen_pool_get - find a pool by phandle property
  * @np: device node
  * @propname: property name containing phandle(s)
  * @index: index into the phandle array
@@ -629,7 +629,7 @@ EXPORT_SYMBOL_GPL(gen_pool_get);
  * address of the device tree node pointed at by the phandle property,
  * or NULL if not found.
  */
-struct gen_pool *of_get_named_gen_pool(struct device_node *np,
+struct gen_pool *of_gen_pool_get(struct device_node *np,
 	const char *propname, int index)
 {
 	struct platform_device *pdev;
@@ -644,5 +644,5 @@ struct gen_pool *of_get_named_gen_pool(struct device_node *np,
 		return NULL;
 	return gen_pool_get(&pdev->dev);
 }
-EXPORT_SYMBOL_GPL(of_get_named_gen_pool);
+EXPORT_SYMBOL_GPL(of_gen_pool_get);
 #endif /* CONFIG_OF */
diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c
index 082509eb805d..f05cb6a8cbe0 100644
--- a/sound/core/memalloc.c
+++ b/sound/core/memalloc.c
@@ -124,7 +124,7 @@ static void snd_malloc_dev_iram(struct snd_dma_buffer *dmab, size_t size)
 	dmab->addr = 0;
 
 	if (dev->of_node)
-		pool = of_get_named_gen_pool(dev->of_node, "iram", 0);
+		pool = of_gen_pool_get(dev->of_node, "iram", 0);
 
 	if (!pool)
 		return;
-- 
cgit v1.2.3


From 8a81252b774b53e628a8a0fe18e2b8fc236d92cc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Jun 2015 15:54:08 +0200
Subject: fs/file.c: don't acquire files->file_lock in fd_install()

Mateusz Guzik reported :

 Currently obtaining a new file descriptor results in locking fdtable
 twice - once in order to reserve a slot and second time to fill it.

Holding the spinlock in __fd_install() is needed in case a resize is
done, or to prevent a resize.

Mateusz provided an RFC patch and a micro benchmark :
  http://people.redhat.com/~mguzik/pipebench.c

A resize is an unlikely operation in a process lifetime,
as table size is at least doubled at every resize.

We can use RCU instead of the spinlock.

__fd_install() must wait if a resize is in progress.

The resize must block new __fd_install() callers from starting,
and wait that ongoing install are finished (synchronize_sched())

resize should be attempted by a single thread to not waste resources.

rcu_sched variant is used, as __fd_install() and expand_fdtable() run
from process context.

It gives us a ~30% speedup using pipebench on a dual Intel(R) Xeon(R)
CPU E5-2696 v2 @ 2.50GHz

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Mateusz Guzik <mguzik@redhat.com>
Acked-by: Mateusz Guzik <mguzik@redhat.com>
Tested-by: Mateusz Guzik <mguzik@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting |  4 +++
 fs/file.c                         | 67 ++++++++++++++++++++++++++++-----------
 include/linux/fdtable.h           |  3 ++
 3 files changed, 55 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 3eae250254d5..ec5456113072 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -500,3 +500,7 @@ in your dentry operations instead.
 	dentry,  it does not get nameidata at all and it gets called only when cookie
 	is non-NULL.  Note that link body isn't available anymore, so if you need it,
 	store it as cookie.
+--
+[mandatory]
+	__fd_install() & fd_install() can now sleep. Callers should not
+	hold a spinlock	or other resources that do not allow a schedule.
diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..3d2eb4c542a4 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)
 
 	spin_unlock(&files->file_lock);
 	new_fdt = alloc_fdtable(nr);
+
+	/* make sure all __fd_install() have seen resize_in_progress
+	 * or have finished their rcu_read_lock_sched() section.
+	 */
+	if (atomic_read(&files->count) > 1)
+		synchronize_sched();
+
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
 		__free_fdtable(new_fdt);
 		return -EMFILE;
 	}
-	/*
-	 * Check again since another task may have expanded the fd table while
-	 * we dropped the lock
-	 */
 	cur_fdt = files_fdtable(files);
-	if (nr >= cur_fdt->max_fds) {
-		/* Continue as planned */
-		copy_fdtable(new_fdt, cur_fdt);
-		rcu_assign_pointer(files->fdt, new_fdt);
-		if (cur_fdt != &files->fdtab)
-			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
-	} else {
-		/* Somebody else expanded, so undo our attempt */
-		__free_fdtable(new_fdt);
-	}
+	BUG_ON(nr < cur_fdt->max_fds);
+	copy_fdtable(new_fdt, cur_fdt);
+	rcu_assign_pointer(files->fdt, new_fdt);
+	if (cur_fdt != &files->fdtab)
+		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+	/* coupled with smp_rmb() in __fd_install() */
+	smp_wmb();
 	return 1;
 }
 
@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
  * The files->file_lock should be held on entry, and will be held on exit.
  */
 static int expand_files(struct files_struct *files, int nr)
+	__releases(files->file_lock)
+	__acquires(files->file_lock)
 {
 	struct fdtable *fdt;
+	int expanded = 0;
 
+repeat:
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
 	if (nr < fdt->max_fds)
-		return 0;
+		return expanded;
 
 	/* Can we expand? */
 	if (nr >= sysctl_nr_open)
 		return -EMFILE;
 
+	if (unlikely(files->resize_in_progress)) {
+		spin_unlock(&files->file_lock);
+		expanded = 1;
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		spin_lock(&files->file_lock);
+		goto repeat;
+	}
+
 	/* All good, so we try */
-	return expand_fdtable(files, nr);
+	files->resize_in_progress = true;
+	expanded = expand_fdtable(files, nr);
+	files->resize_in_progress = false;
+
+	wake_up_all(&files->resize_wait);
+	return expanded;
 }
 
 static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->resize_in_progress = false;
+	init_waitqueue_head(&newf->resize_wait);
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
+
+	might_sleep();
+	rcu_read_lock_sched();
+
+	while (unlikely(files->resize_in_progress)) {
+		rcu_read_unlock_sched();
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		rcu_read_lock_sched();
+	}
+	/* coupled with smp_wmb() in expand_fdtable() */
+	smp_rmb();
+	fdt = rcu_dereference_sched(files->fdt);
 	BUG_ON(fdt->fd[fd] != NULL);
 	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock_sched();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..fbb88740634a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,9 @@ struct files_struct {
    * read mostly part
    */
 	atomic_t count;
+	bool resize_in_progress;
+	wait_queue_head_t resize_wait;
+
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
   /*
-- 
cgit v1.2.3


From fbabfd0f4ee2e8847bf56edf481249ad1bb8c44d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 9 May 2015 15:54:49 -0500
Subject: fs: Add helper functions for permanently empty directories.

To ensure it is safe to mount proc and sysfs I need to check if
filesystems that are mounted on top of them are mounted on truly empty
directories.  Given that some directories can gain entries over time,
knowing that a directory is empty right now is insufficient.

Therefore add supporting infrastructure for permantently empty
directories that proc and sysfs can use when they create mount points
for filesystems and fs_fully_visible can use to test for permanently
empty directories to ensure that nothing will be gained by mounting a
fresh copy of proc or sysfs.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/libfs.c         | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 98 insertions(+)

(limited to 'include/linux')

diff --git a/fs/libfs.c b/fs/libfs.c
index cb1fb4b9b637..02813592e121 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1093,3 +1093,99 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
 	return -EINVAL;
 }
 EXPORT_SYMBOL(simple_nosetlease);
+
+
+/*
+ * Operations for a permanently empty directory.
+ */
+static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	return ERR_PTR(-ENOENT);
+}
+
+static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	return -EPERM;
+}
+
+static int empty_dir_setxattr(struct dentry *dentry, const char *name,
+			      const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name,
+				  void *value, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static int empty_dir_removexattr(struct dentry *dentry, const char *name)
+{
+	return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct inode_operations empty_dir_inode_operations = {
+	.lookup		= empty_dir_lookup,
+	.permission	= generic_permission,
+	.setattr	= empty_dir_setattr,
+	.getattr	= empty_dir_getattr,
+	.setxattr	= empty_dir_setxattr,
+	.getxattr	= empty_dir_getxattr,
+	.removexattr	= empty_dir_removexattr,
+	.listxattr	= empty_dir_listxattr,
+};
+
+static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	/* An empty directory has two entries . and .. at offsets 0 and 1 */
+	return generic_file_llseek_size(file, offset, whence, 2, 2);
+}
+
+static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+	dir_emit_dots(file, ctx);
+	return 0;
+}
+
+static const struct file_operations empty_dir_operations = {
+	.llseek		= empty_dir_llseek,
+	.read		= generic_read_dir,
+	.iterate	= empty_dir_readdir,
+	.fsync		= noop_fsync,
+};
+
+
+void make_empty_dir_inode(struct inode *inode)
+{
+	set_nlink(inode, 2);
+	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	inode->i_uid = GLOBAL_ROOT_UID;
+	inode->i_gid = GLOBAL_ROOT_GID;
+	inode->i_rdev = 0;
+	inode->i_size = 2;
+	inode->i_blkbits = PAGE_SHIFT;
+	inode->i_blocks = 0;
+
+	inode->i_op = &empty_dir_inode_operations;
+	inode->i_fop = &empty_dir_operations;
+}
+
+bool is_empty_dir_inode(struct inode *inode)
+{
+	return (inode->i_fop == &empty_dir_operations) &&
+		(inode->i_op == &empty_dir_inode_operations);
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2d24eeb8e59c..571aab91bfc0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2780,6 +2780,8 @@ extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned in
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
 extern const struct file_operations simple_dir_operations;
 extern const struct inode_operations simple_dir_inode_operations;
+extern void make_empty_dir_inode(struct inode *inode);
+extern bool is_empty_dir_inode(struct inode *inode);
 struct tree_descr { char *name; const struct file_operations *ops; int mode; };
 struct dentry *d_alloc_name(struct dentry *, const char *);
 extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *);
-- 
cgit v1.2.3


From f9bd6733d3f11e24f3949becf277507d422ee1eb Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 9 May 2015 22:09:14 -0500
Subject: sysctl: Allow creating permanently empty directories that serve as
 mountpoints.

Add a magic sysctl table sysctl_mount_point that when used to
create a directory forces that directory to be permanently empty.

Update the code to use make_empty_dir_inode when accessing permanently
empty directories.

Update the code to not allow adding to permanently empty directories.

Update /proc/sys/fs/binfmt_misc to be a permanently empty directory.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/proc_sysctl.c  | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/sysctl.h |  3 +++
 kernel/sysctl.c        |  8 +-------
 3 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fea2561d773b..fdda62e6115e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -19,6 +19,28 @@ static const struct inode_operations proc_sys_inode_operations;
 static const struct file_operations proc_sys_dir_file_operations;
 static const struct inode_operations proc_sys_dir_operations;
 
+/* Support for permanently empty directories */
+
+struct ctl_table sysctl_mount_point[] = {
+	{ }
+};
+
+static bool is_empty_dir(struct ctl_table_header *head)
+{
+	return head->ctl_table[0].child == sysctl_mount_point;
+}
+
+static void set_empty_dir(struct ctl_dir *dir)
+{
+	dir->header.ctl_table[0].child = sysctl_mount_point;
+}
+
+static void clear_empty_dir(struct ctl_dir *dir)
+
+{
+	dir->header.ctl_table[0].child = NULL;
+}
+
 void proc_sys_poll_notify(struct ctl_table_poll *poll)
 {
 	if (!poll)
@@ -187,6 +209,17 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
 	struct ctl_table *entry;
 	int err;
 
+	/* Is this a permanently empty directory? */
+	if (is_empty_dir(&dir->header))
+		return -EROFS;
+
+	/* Am I creating a permanently empty directory? */
+	if (header->ctl_table == sysctl_mount_point) {
+		if (!RB_EMPTY_ROOT(&dir->root))
+			return -EINVAL;
+		set_empty_dir(dir);
+	}
+
 	dir->header.nreg++;
 	header->parent = dir;
 	err = insert_links(header);
@@ -202,6 +235,8 @@ fail:
 	erase_header(header);
 	put_links(header);
 fail_links:
+	if (header->ctl_table == sysctl_mount_point)
+		clear_empty_dir(dir);
 	header->parent = NULL;
 	drop_sysctl_table(&dir->header);
 	return err;
@@ -419,6 +454,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 		inode->i_mode |= S_IFDIR;
 		inode->i_op = &proc_sys_dir_operations;
 		inode->i_fop = &proc_sys_dir_file_operations;
+		if (is_empty_dir(head))
+			make_empty_dir_inode(inode);
 	}
 out:
 	return inode;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 795d5fea5697..fa7bc29925c9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -188,6 +188,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 void unregister_sysctl_table(struct ctl_table_header * table);
 
 extern int sysctl_init(void);
+
+extern struct ctl_table sysctl_mount_point[];
+
 #else /* CONFIG_SYSCTL */
 static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2082b1a88fb9..c3eee4c6d6c1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1531,12 +1531,6 @@ static struct ctl_table vm_table[] = {
 	{ }
 };
 
-#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
-static struct ctl_table binfmt_misc_table[] = {
-	{ }
-};
-#endif
-
 static struct ctl_table fs_table[] = {
 	{
 		.procname	= "inode-nr",
@@ -1690,7 +1684,7 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "binfmt_misc",
 		.mode		= 0555,
-		.child		= binfmt_misc_table,
+		.child		= sysctl_mount_point,
 	},
 #endif
 	{
-- 
cgit v1.2.3


From ea015218f2f7ace2dad9cedd21ed95bdba2886d7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 13 May 2015 16:09:29 -0500
Subject: kernfs: Add support for always empty directories.

Add a new function kernfs_create_empty_dir that can be used to create
directory that can not be modified.

Update the code to use make_empty_dir_inode when reporting a
permanently empty directory to the vfs.

Update the code to not allow adding to permanently empty directories.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/kernfs/dir.c        | 38 +++++++++++++++++++++++++++++++++++++-
 fs/kernfs/inode.c      |  2 ++
 include/linux/kernfs.h |  3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index f131fc23ffc4..47dc636d80ed 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -585,6 +585,9 @@ int kernfs_add_one(struct kernfs_node *kn)
 		goto out_unlock;
 
 	ret = -ENOENT;
+	if (parent->flags & KERNFS_EMPTY_DIR)
+		goto out_unlock;
+
 	if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
 		goto out_unlock;
 
@@ -776,6 +779,38 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
 	return ERR_PTR(rc);
 }
 
+/**
+ * kernfs_create_empty_dir - create an always empty directory
+ * @parent: parent in which to create a new directory
+ * @name: name of the new directory
+ *
+ * Returns the created node on success, ERR_PTR() value on failure.
+ */
+struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
+					    const char *name)
+{
+	struct kernfs_node *kn;
+	int rc;
+
+	/* allocate */
+	kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	kn->flags |= KERNFS_EMPTY_DIR;
+	kn->dir.root = parent->dir.root;
+	kn->ns = NULL;
+	kn->priv = NULL;
+
+	/* link in */
+	rc = kernfs_add_one(kn);
+	if (!rc)
+		return kn;
+
+	kernfs_put(kn);
+	return ERR_PTR(rc);
+}
+
 static struct dentry *kernfs_iop_lookup(struct inode *dir,
 					struct dentry *dentry,
 					unsigned int flags)
@@ -1247,7 +1282,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 	mutex_lock(&kernfs_mutex);
 
 	error = -ENOENT;
-	if (!kernfs_active(kn) || !kernfs_active(new_parent))
+	if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
+	    (new_parent->flags & KERNFS_EMPTY_DIR))
 		goto out;
 
 	error = 0;
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 2da8493a380b..756dd56aaf60 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -296,6 +296,8 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
 	case KERNFS_DIR:
 		inode->i_op = &kernfs_dir_iops;
 		inode->i_fop = &kernfs_dir_fops;
+		if (kn->flags & KERNFS_EMPTY_DIR)
+			make_empty_dir_inode(inode);
 		break;
 	case KERNFS_FILE:
 		inode->i_size = kn->attr.size;
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 71ecdab1671b..29d1896c3ba5 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -45,6 +45,7 @@ enum kernfs_node_flag {
 	KERNFS_LOCKDEP		= 0x0100,
 	KERNFS_SUICIDAL		= 0x0400,
 	KERNFS_SUICIDED		= 0x0800,
+	KERNFS_EMPTY_DIR	= 0x1000,
 };
 
 /* @flags for kernfs_create_root() */
@@ -285,6 +286,8 @@ void kernfs_destroy_root(struct kernfs_root *root);
 struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
 					 const char *name, umode_t mode,
 					 void *priv, const void *ns);
+struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
+					    const char *name);
 struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 					 const char *name,
 					 umode_t mode, loff_t size,
-- 
cgit v1.2.3


From 87d2846fcf88113fae2341da1ca9a71f0d916f2c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 13 May 2015 16:31:40 -0500
Subject: sysfs: Add support for permanently empty directories to serve as
 mount points.

Add two functions sysfs_create_mount_point and
sysfs_remove_mount_point that hang a permanently empty directory off
of a kobject or remove a permanently emptpy directory hanging from a
kobject.  Export these new functions so modular filesystems can use
them.

Cc: stable@vger.kernel.org
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/sysfs/dir.c        | 34 ++++++++++++++++++++++++++++++++++
 include/linux/sysfs.h | 15 +++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0b45ff42f374..94374e435025 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -121,3 +121,37 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
 
 	return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
 }
+
+/**
+ * sysfs_create_mount_point - create an always empty directory
+ * @parent_kobj:  kobject that will contain this always empty directory
+ * @name: The name of the always empty directory to add
+ */
+int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name)
+{
+	struct kernfs_node *kn, *parent = parent_kobj->sd;
+
+	kn = kernfs_create_empty_dir(parent, name);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, name);
+		return PTR_ERR(kn);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sysfs_create_mount_point);
+
+/**
+ *	sysfs_remove_mount_point - remove an always empty directory.
+ *	@parent_kobj: kobject that will contain this always empty directory
+ *	@name: The name of the always empty directory to remove
+ *
+ */
+void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name)
+{
+	struct kernfs_node *parent = parent_kobj->sd;
+
+	kernfs_remove_by_name_ns(parent, name, NULL);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 99382c0df17e..9f65758311a4 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -210,6 +210,10 @@ int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
 int __must_check sysfs_move_dir_ns(struct kobject *kobj,
 				   struct kobject *new_parent_kobj,
 				   const void *new_ns);
+int __must_check sysfs_create_mount_point(struct kobject *parent_kobj,
+					  const char *name);
+void sysfs_remove_mount_point(struct kobject *parent_kobj,
+			      const char *name);
 
 int __must_check sysfs_create_file_ns(struct kobject *kobj,
 				      const struct attribute *attr,
@@ -298,6 +302,17 @@ static inline int sysfs_move_dir_ns(struct kobject *kobj,
 	return 0;
 }
 
+static inline int sysfs_create_mount_point(struct kobject *parent_kobj,
+					   const char *name)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_mount_point(struct kobject *parent_kobj,
+					    const char *name)
+{
+}
+
 static inline int sysfs_create_file_ns(struct kobject *kobj,
 				       const struct attribute *attr,
 				       const void *ns)
-- 
cgit v1.2.3


From bd7ade3cd9b0850264306f5c2b79024a417b6396 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Thu, 2 Jul 2015 01:32:44 -0400
Subject: bufferhead: Add _gfp version for sb_getblk()

sb_getblk() is used during ext4 (and possibly other FSes) writeback
paths. Sometimes such path require allocating memory and guaranteeing
that such allocation won't block. Currently, however, there is no way
to provide user flags for sb_getblk which could lead to deadlocks.

This patch implements a sb_getblk_gfp with the only difference it can
accept user-provided GFP flags.

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 include/linux/buffer_head.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 73b45225a7ca..e6797ded700e 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -317,6 +317,13 @@ sb_getblk(struct super_block *sb, sector_t block)
 	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
 }
 
+
+static inline struct buffer_head *
+sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
+{
+	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
+}
+
 static inline struct buffer_head *
 sb_find_get_block(struct super_block *sb, sector_t block)
 {
-- 
cgit v1.2.3


From ec110bc7cc48d7806c9b65094e6afb19452d458f Mon Sep 17 00:00:00 2001
From: Allen Hubbe <Allen.Hubbe@emc.com>
Date: Thu, 7 May 2015 06:45:21 -0400
Subject: NTB: Move files in preparation for NTB abstraction

This patch only moves files to their new locations, before applying the
next two patches adding the NTB Abstraction layer.  Splitting this patch
from the next is intended make distinct which code is changed only due
to moving the files, versus which are substantial code changes in adding
the NTB Abstraction layer.

Signed-off-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/net/Kconfig                 |    4 +-
 drivers/net/ntb_netdev.c            |    6 +-
 drivers/ntb/Makefile                |    4 +-
 drivers/ntb/hw/intel/ntb_hw_intel.c | 1894 ++++++++++++++++++++++++++++++++++
 drivers/ntb/hw/intel/ntb_hw_intel.h |  385 +++++++
 drivers/ntb/ntb_hw.c                | 1895 -----------------------------------
 drivers/ntb/ntb_hw.h                |  256 -----
 drivers/ntb/ntb_regs.h              |  177 ----
 drivers/ntb/ntb_transport.c         |   14 +-
 include/linux/ntb.h                 |   88 --
 include/linux/ntb_transport.h       |   88 ++
 11 files changed, 2381 insertions(+), 2430 deletions(-)
 create mode 100644 drivers/ntb/hw/intel/ntb_hw_intel.c
 create mode 100644 drivers/ntb/hw/intel/ntb_hw_intel.h
 delete mode 100644 drivers/ntb/ntb_hw.c
 delete mode 100644 drivers/ntb/ntb_hw.h
 delete mode 100644 drivers/ntb/ntb_regs.h
 delete mode 100644 include/linux/ntb.h
 create mode 100644 include/linux/ntb_transport.h

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index df51d6025a90..bda3cde62bb3 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -203,8 +203,8 @@ config NET_POLL_CONTROLLER
 	def_bool NETPOLL
 
 config NTB_NETDEV
-	tristate "Virtual Ethernet over NTB"
-	depends on NTB
+	tristate "Virtual Ethernet over NTB Transport"
+	depends on NTB_TRANSPORT
 
 config RIONET
 	tristate "RapidIO Ethernet over messaging driver support"
diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c
index 5a7e6397440a..6d3bfa62f5ec 100644
--- a/drivers/net/ntb_netdev.c
+++ b/drivers/net/ntb_netdev.c
@@ -49,7 +49,7 @@
 #include <linux/ethtool.h>
 #include <linux/module.h>
 #include <linux/pci.h>
-#include <linux/ntb.h>
+#include <linux/ntb_transport.h>
 
 #define NTB_NETDEV_VER	"0.7"
 
@@ -410,13 +410,13 @@ static int __init ntb_netdev_init_module(void)
 	rc = ntb_register_client_dev(KBUILD_MODNAME);
 	if (rc)
 		return rc;
-	return ntb_register_client(&ntb_netdev_client);
+	return ntb_transport_register_client(&ntb_netdev_client);
 }
 module_init(ntb_netdev_init_module);
 
 static void __exit ntb_netdev_exit_module(void)
 {
-	ntb_unregister_client(&ntb_netdev_client);
+	ntb_transport_unregister_client(&ntb_netdev_client);
 	ntb_unregister_client_dev(KBUILD_MODNAME);
 }
 module_exit(ntb_netdev_exit_module);
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 15cb59fd354e..545b10a131af 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,3 +1,3 @@
-obj-$(CONFIG_NTB) += ntb.o
+obj-$(CONFIG_NTB) += ntb_hw_intel.o
 
-ntb-objs := ntb_hw.o ntb_transport.o
+ntb_hw_intel-objs := hw/intel/ntb_hw_intel.o ntb_transport.o
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c b/drivers/ntb/hw/intel/ntb_hw_intel.c
new file mode 100644
index 000000000000..044534a995f1
--- /dev/null
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.c
@@ -0,0 +1,1894 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include "ntb_hw_intel.h"
+
+#define NTB_NAME	"Intel(R) PCI-E Non-Transparent Bridge Driver"
+#define NTB_VER		"1.0"
+
+MODULE_DESCRIPTION(NTB_NAME);
+MODULE_VERSION(NTB_VER);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+enum {
+	NTB_CONN_TRANSPARENT = 0,
+	NTB_CONN_B2B,
+	NTB_CONN_RP,
+};
+
+enum {
+	NTB_DEV_USD = 0,
+	NTB_DEV_DSD,
+};
+
+enum {
+	SNB_HW = 0,
+	BWD_HW,
+};
+
+static struct dentry *debugfs_dir;
+
+#define BWD_LINK_RECOVERY_TIME	500
+
+/* Translate memory window 0,1,2 to BAR 2,4,5 */
+#define MW_TO_BAR(mw)	(mw == 0 ? 2 : (mw == 1 ? 4 : 5))
+
+static const struct pci_device_id ntb_pci_tbl[] = {
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
+	{0}
+};
+MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
+
+static int is_ntb_xeon(struct ntb_device *ndev)
+{
+	switch (ndev->pdev->device) {
+	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+		return 1;
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static int is_ntb_atom(struct ntb_device *ndev)
+{
+	switch (ndev->pdev->device) {
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_BWD:
+		return 1;
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static void ntb_set_errata_flags(struct ntb_device *ndev)
+{
+	switch (ndev->pdev->device) {
+	/*
+	 * this workaround applies to all platform up to IvyBridge
+	 * Haswell has splitbar support and use a different workaround
+	 */
+	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+		ndev->wa_flags |= WA_SNB_ERR;
+		break;
+	}
+}
+
+/**
+ * ntb_register_event_callback() - register event callback
+ * @ndev: pointer to ntb_device instance
+ * @func: callback function to register
+ *
+ * This function registers a callback for any HW driver events such as link
+ * up/down, power management notices and etc.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_register_event_callback(struct ntb_device *ndev,
+				void (*func)(void *handle,
+					     enum ntb_hw_event event))
+{
+	if (ndev->event_cb)
+		return -EINVAL;
+
+	ndev->event_cb = func;
+
+	return 0;
+}
+
+/**
+ * ntb_unregister_event_callback() - unregisters the event callback
+ * @ndev: pointer to ntb_device instance
+ *
+ * This function unregisters the existing callback from transport
+ */
+void ntb_unregister_event_callback(struct ntb_device *ndev)
+{
+	ndev->event_cb = NULL;
+}
+
+static void ntb_irq_work(unsigned long data)
+{
+	struct ntb_db_cb *db_cb = (struct ntb_db_cb *)data;
+	int rc;
+
+	rc = db_cb->callback(db_cb->data, db_cb->db_num);
+	if (rc)
+		tasklet_schedule(&db_cb->irq_work);
+	else {
+		struct ntb_device *ndev = db_cb->ndev;
+		unsigned long mask;
+
+		mask = readw(ndev->reg_ofs.ldb_mask);
+		clear_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
+		writew(mask, ndev->reg_ofs.ldb_mask);
+	}
+}
+
+/**
+ * ntb_register_db_callback() - register a callback for doorbell interrupt
+ * @ndev: pointer to ntb_device instance
+ * @idx: doorbell index to register callback, zero based
+ * @data: pointer to be returned to caller with every callback
+ * @func: callback function to register
+ *
+ * This function registers a callback function for the doorbell interrupt
+ * on the primary side. The function will unmask the doorbell as well to
+ * allow interrupt.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
+			     void *data, int (*func)(void *data, int db_num))
+{
+	unsigned long mask;
+
+	if (idx >= ndev->max_cbs || ndev->db_cb[idx].callback) {
+		dev_warn(&ndev->pdev->dev, "Invalid Index.\n");
+		return -EINVAL;
+	}
+
+	ndev->db_cb[idx].callback = func;
+	ndev->db_cb[idx].data = data;
+	ndev->db_cb[idx].ndev = ndev;
+
+	tasklet_init(&ndev->db_cb[idx].irq_work, ntb_irq_work,
+		     (unsigned long) &ndev->db_cb[idx]);
+
+	/* unmask interrupt */
+	mask = readw(ndev->reg_ofs.ldb_mask);
+	clear_bit(idx * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.ldb_mask);
+
+	return 0;
+}
+
+/**
+ * ntb_unregister_db_callback() - unregister a callback for doorbell interrupt
+ * @ndev: pointer to ntb_device instance
+ * @idx: doorbell index to register callback, zero based
+ *
+ * This function unregisters a callback function for the doorbell interrupt
+ * on the primary side. The function will also mask the said doorbell.
+ */
+void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx)
+{
+	unsigned long mask;
+
+	if (idx >= ndev->max_cbs || !ndev->db_cb[idx].callback)
+		return;
+
+	mask = readw(ndev->reg_ofs.ldb_mask);
+	set_bit(idx * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.ldb_mask);
+
+	tasklet_disable(&ndev->db_cb[idx].irq_work);
+
+	ndev->db_cb[idx].callback = NULL;
+}
+
+/**
+ * ntb_find_transport() - find the transport pointer
+ * @transport: pointer to pci device
+ *
+ * Given the pci device pointer, return the transport pointer passed in when
+ * the transport attached when it was inited.
+ *
+ * RETURNS: pointer to transport.
+ */
+void *ntb_find_transport(struct pci_dev *pdev)
+{
+	struct ntb_device *ndev = pci_get_drvdata(pdev);
+	return ndev->ntb_transport;
+}
+
+/**
+ * ntb_register_transport() - Register NTB transport with NTB HW driver
+ * @transport: transport identifier
+ *
+ * This function allows a transport to reserve the hardware driver for
+ * NTB usage.
+ *
+ * RETURNS: pointer to ntb_device, NULL on error.
+ */
+struct ntb_device *ntb_register_transport(struct pci_dev *pdev, void *transport)
+{
+	struct ntb_device *ndev = pci_get_drvdata(pdev);
+
+	if (ndev->ntb_transport)
+		return NULL;
+
+	ndev->ntb_transport = transport;
+	return ndev;
+}
+
+/**
+ * ntb_unregister_transport() - Unregister the transport with the NTB HW driver
+ * @ndev - ntb_device of the transport to be freed
+ *
+ * This function unregisters the transport from the HW driver and performs any
+ * necessary cleanups.
+ */
+void ntb_unregister_transport(struct ntb_device *ndev)
+{
+	int i;
+
+	if (!ndev->ntb_transport)
+		return;
+
+	for (i = 0; i < ndev->max_cbs; i++)
+		ntb_unregister_db_callback(ndev, i);
+
+	ntb_unregister_event_callback(ndev);
+	ndev->ntb_transport = NULL;
+}
+
+/**
+ * ntb_write_local_spad() - write to the secondary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register. This writes over the data mirrored to the local scratchpad register
+ * by the remote system.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+{
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	dev_dbg(&ndev->pdev->dev, "Writing %x to local scratch pad index %d\n",
+		val, idx);
+	writel(val, ndev->reg_ofs.spad_read + idx * 4);
+
+	return 0;
+}
+
+/**
+ * ntb_read_local_spad() - read from the primary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the 32bit scratchpad register on
+ * the primary (internal) side.  This allows the local system to read data
+ * written and mirrored to the scratchpad register by the remote system.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+{
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	*val = readl(ndev->reg_ofs.spad_write + idx * 4);
+	dev_dbg(&ndev->pdev->dev,
+		"Reading %x from local scratch pad index %d\n", *val, idx);
+
+	return 0;
+}
+
+/**
+ * ntb_write_remote_spad() - write to the secondary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register. The register resides on the secondary (external) side.  This allows
+ * the local system to write data to be mirrored to the remote systems
+ * scratchpad register.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+{
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	dev_dbg(&ndev->pdev->dev, "Writing %x to remote scratch pad index %d\n",
+		val, idx);
+	writel(val, ndev->reg_ofs.spad_write + idx * 4);
+
+	return 0;
+}
+
+/**
+ * ntb_read_remote_spad() - read from the primary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the 32bit scratchpad register on
+ * the primary (internal) side.  This alloows the local system to read the data
+ * it wrote to be mirrored on the remote system.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+{
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	*val = readl(ndev->reg_ofs.spad_read + idx * 4);
+	dev_dbg(&ndev->pdev->dev,
+		"Reading %x from remote scratch pad index %d\n", *val, idx);
+
+	return 0;
+}
+
+/**
+ * ntb_get_mw_base() - get addr for the NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the base address of the memory window specified.
+ *
+ * RETURNS: address, or NULL on error.
+ */
+resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
+{
+	if (mw >= ntb_max_mw(ndev))
+		return 0;
+
+	return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
+}
+
+/**
+ * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the base virtual address of the memory window
+ * specified.
+ *
+ * RETURNS: pointer to virtual address, or NULL on error.
+ */
+void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw)
+{
+	if (mw >= ntb_max_mw(ndev))
+		return NULL;
+
+	return ndev->mw[mw].vbase;
+}
+
+/**
+ * ntb_get_mw_size() - return size of NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the physical size of the memory window specified
+ *
+ * RETURNS: the size of the memory window or zero on error
+ */
+u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw)
+{
+	if (mw >= ntb_max_mw(ndev))
+		return 0;
+
+	return ndev->mw[mw].bar_sz;
+}
+
+/**
+ * ntb_set_mw_addr - set the memory window address
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ * @addr: base address for data
+ *
+ * This function sets the base physical address of the memory window.  This
+ * memory address is where data from the remote system will be transfered into
+ * or out of depending on how the transport is configured.
+ */
+void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr)
+{
+	if (mw >= ntb_max_mw(ndev))
+		return;
+
+	dev_dbg(&ndev->pdev->dev, "Writing addr %Lx to BAR %d\n", addr,
+		MW_TO_BAR(mw));
+
+	ndev->mw[mw].phys_addr = addr;
+
+	switch (MW_TO_BAR(mw)) {
+	case NTB_BAR_23:
+		writeq(addr, ndev->reg_ofs.bar2_xlat);
+		break;
+	case NTB_BAR_4:
+		if (ndev->split_bar)
+			writel(addr, ndev->reg_ofs.bar4_xlat);
+		else
+			writeq(addr, ndev->reg_ofs.bar4_xlat);
+		break;
+	case NTB_BAR_5:
+		writel(addr, ndev->reg_ofs.bar5_xlat);
+		break;
+	}
+}
+
+/**
+ * ntb_ring_doorbell() - Set the doorbell on the secondary/external side
+ * @ndev: pointer to ntb_device instance
+ * @db: doorbell to ring
+ *
+ * This function allows triggering of a doorbell on the secondary/external
+ * side that will initiate an interrupt on the remote host
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int db)
+{
+	dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
+
+	if (ndev->hw_type == BWD_HW)
+		writeq((u64) 1 << db, ndev->reg_ofs.rdb);
+	else
+		writew(((1 << ndev->bits_per_vector) - 1) <<
+		       (db * ndev->bits_per_vector), ndev->reg_ofs.rdb);
+}
+
+static void bwd_recover_link(struct ntb_device *ndev)
+{
+	u32 status;
+
+	/* Driver resets the NTB ModPhy lanes - magic! */
+	writeb(0xe0, ndev->reg_base + BWD_MODPHY_PCSREG6);
+	writeb(0x40, ndev->reg_base + BWD_MODPHY_PCSREG4);
+	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG4);
+	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG6);
+
+	/* Driver waits 100ms to allow the NTB ModPhy to settle */
+	msleep(100);
+
+	/* Clear AER Errors, write to clear */
+	status = readl(ndev->reg_base + BWD_ERRCORSTS_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "ERRCORSTS = %x\n", status);
+	status &= PCI_ERR_COR_REP_ROLL;
+	writel(status, ndev->reg_base + BWD_ERRCORSTS_OFFSET);
+
+	/* Clear unexpected electrical idle event in LTSSM, write to clear */
+	status = readl(ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "LTSSMERRSTS0 = %x\n", status);
+	status |= BWD_LTSSMERRSTS0_UNEXPECTEDEI;
+	writel(status, ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
+
+	/* Clear DeSkew Buffer error, write to clear */
+	status = readl(ndev->reg_base + BWD_DESKEWSTS_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "DESKEWSTS = %x\n", status);
+	status |= BWD_DESKEWSTS_DBERR;
+	writel(status, ndev->reg_base + BWD_DESKEWSTS_OFFSET);
+
+	status = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "IBSTERRRCRVSTS0 = %x\n", status);
+	status &= BWD_IBIST_ERR_OFLOW;
+	writel(status, ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
+
+	/* Releases the NTB state machine to allow the link to retrain */
+	status = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "LTSSMSTATEJMP = %x\n", status);
+	status &= ~BWD_LTSSMSTATEJMP_FORCEDETECT;
+	writel(status, ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
+}
+
+static void ntb_link_event(struct ntb_device *ndev, int link_state)
+{
+	unsigned int event;
+
+	if (ndev->link_status == link_state)
+		return;
+
+	if (link_state == NTB_LINK_UP) {
+		u16 status;
+
+		dev_info(&ndev->pdev->dev, "Link Up\n");
+		ndev->link_status = NTB_LINK_UP;
+		event = NTB_EVENT_HW_LINK_UP;
+
+		if (is_ntb_atom(ndev) ||
+		    ndev->conn_type == NTB_CONN_TRANSPARENT)
+			status = readw(ndev->reg_ofs.lnk_stat);
+		else {
+			int rc = pci_read_config_word(ndev->pdev,
+						      SNB_LINK_STATUS_OFFSET,
+						      &status);
+			if (rc)
+				return;
+		}
+
+		ndev->link_width = (status & NTB_LINK_WIDTH_MASK) >> 4;
+		ndev->link_speed = (status & NTB_LINK_SPEED_MASK);
+		dev_info(&ndev->pdev->dev, "Link Width %d, Link Speed %d\n",
+			 ndev->link_width, ndev->link_speed);
+	} else {
+		dev_info(&ndev->pdev->dev, "Link Down\n");
+		ndev->link_status = NTB_LINK_DOWN;
+		event = NTB_EVENT_HW_LINK_DOWN;
+		/* Don't modify link width/speed, we need it in link recovery */
+	}
+
+	/* notify the upper layer if we have an event change */
+	if (ndev->event_cb)
+		ndev->event_cb(ndev->ntb_transport, event);
+}
+
+static int ntb_link_status(struct ntb_device *ndev)
+{
+	int link_state;
+
+	if (is_ntb_atom(ndev)) {
+		u32 ntb_cntl;
+
+		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
+		if (ntb_cntl & BWD_CNTL_LINK_DOWN)
+			link_state = NTB_LINK_DOWN;
+		else
+			link_state = NTB_LINK_UP;
+	} else {
+		u16 status;
+		int rc;
+
+		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
+					  &status);
+		if (rc)
+			return rc;
+
+		if (status & NTB_LINK_STATUS_ACTIVE)
+			link_state = NTB_LINK_UP;
+		else
+			link_state = NTB_LINK_DOWN;
+	}
+
+	ntb_link_event(ndev, link_state);
+
+	return 0;
+}
+
+static void bwd_link_recovery(struct work_struct *work)
+{
+	struct ntb_device *ndev = container_of(work, struct ntb_device,
+					       lr_timer.work);
+	u32 status32;
+
+	bwd_recover_link(ndev);
+	/* There is a potential race between the 2 NTB devices recovering at the
+	 * same time.  If the times are the same, the link will not recover and
+	 * the driver will be stuck in this loop forever.  Add a random interval
+	 * to the recovery time to prevent this race.
+	 */
+	msleep(BWD_LINK_RECOVERY_TIME + prandom_u32() % BWD_LINK_RECOVERY_TIME);
+
+	status32 = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
+	if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT)
+		goto retry;
+
+	status32 = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
+	if (status32 & BWD_IBIST_ERR_OFLOW)
+		goto retry;
+
+	status32 = readl(ndev->reg_ofs.lnk_cntl);
+	if (!(status32 & BWD_CNTL_LINK_DOWN)) {
+		unsigned char speed, width;
+		u16 status16;
+
+		status16 = readw(ndev->reg_ofs.lnk_stat);
+		width = (status16 & NTB_LINK_WIDTH_MASK) >> 4;
+		speed = (status16 & NTB_LINK_SPEED_MASK);
+		if (ndev->link_width != width || ndev->link_speed != speed)
+			goto retry;
+	}
+
+	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+	return;
+
+retry:
+	schedule_delayed_work(&ndev->lr_timer, NTB_HB_TIMEOUT);
+}
+
+/* BWD doesn't have link status interrupt, poll on that platform */
+static void bwd_link_poll(struct work_struct *work)
+{
+	struct ntb_device *ndev = container_of(work, struct ntb_device,
+					       hb_timer.work);
+	unsigned long ts = jiffies;
+
+	/* If we haven't gotten an interrupt in a while, check the BWD link
+	 * status bit
+	 */
+	if (ts > ndev->last_ts + NTB_HB_TIMEOUT) {
+		int rc = ntb_link_status(ndev);
+		if (rc)
+			dev_err(&ndev->pdev->dev,
+				"Error determining link status\n");
+
+		/* Check to see if a link error is the cause of the link down */
+		if (ndev->link_status == NTB_LINK_DOWN) {
+			u32 status32 = readl(ndev->reg_base +
+					     BWD_LTSSMSTATEJMP_OFFSET);
+			if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT) {
+				schedule_delayed_work(&ndev->lr_timer, 0);
+				return;
+			}
+		}
+	}
+
+	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+}
+
+static int ntb_xeon_setup(struct ntb_device *ndev)
+{
+	switch (ndev->conn_type) {
+	case NTB_CONN_B2B:
+		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
+		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
+		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
+		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
+		if (ndev->split_bar)
+			ndev->reg_ofs.bar5_xlat =
+				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
+		ndev->limits.max_spads = SNB_MAX_B2B_SPADS;
+
+		/* There is a Xeon hardware errata related to writes to
+		 * SDOORBELL or B2BDOORBELL in conjunction with inbound access
+		 * to NTB MMIO Space, which may hang the system.  To workaround
+		 * this use the second memory window to access the interrupt and
+		 * scratch pad registers on the remote system.
+		 */
+		if (ndev->wa_flags & WA_SNB_ERR) {
+			if (!ndev->mw[ndev->limits.max_mw - 1].bar_sz)
+				return -EINVAL;
+
+			ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
+			ndev->reg_ofs.spad_write =
+				ndev->mw[ndev->limits.max_mw - 1].vbase +
+				SNB_SPAD_OFFSET;
+			ndev->reg_ofs.rdb =
+				ndev->mw[ndev->limits.max_mw - 1].vbase +
+				SNB_PDOORBELL_OFFSET;
+
+			/* Set the Limit register to 4k, the minimum size, to
+			 * prevent an illegal access
+			 */
+			writeq(ndev->mw[1].bar_sz + 0x1000, ndev->reg_base +
+			       SNB_PBAR4LMT_OFFSET);
+			/* HW errata on the Limit registers.  They can only be
+			 * written when the base register is 4GB aligned and
+			 * < 32bit.  This should already be the case based on
+			 * the driver defaults, but write the Limit registers
+			 * first just in case.
+			 */
+
+			ndev->limits.max_mw = SNB_ERRATA_MAX_MW;
+		} else {
+			/* HW Errata on bit 14 of b2bdoorbell register.  Writes
+			 * will not be mirrored to the remote system.  Shrink
+			 * the number of bits by one, since bit 14 is the last
+			 * bit.
+			 */
+			ndev->limits.max_db_bits = SNB_MAX_DB_BITS - 1;
+			ndev->reg_ofs.spad_write = ndev->reg_base +
+						   SNB_B2B_SPAD_OFFSET;
+			ndev->reg_ofs.rdb = ndev->reg_base +
+					    SNB_B2B_DOORBELL_OFFSET;
+
+			/* Disable the Limit register, just incase it is set to
+			 * something silly. A 64bit write should handle it
+			 * regardless of whether it has a split BAR or not.
+			 */
+			writeq(0, ndev->reg_base + SNB_PBAR4LMT_OFFSET);
+			/* HW errata on the Limit registers.  They can only be
+			 * written when the base register is 4GB aligned and
+			 * < 32bit.  This should already be the case based on
+			 * the driver defaults, but write the Limit registers
+			 * first just in case.
+			 */
+			if (ndev->split_bar)
+				ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+			else
+				ndev->limits.max_mw = SNB_MAX_MW;
+		}
+
+		/* The Xeon errata workaround requires setting SBAR Base
+		 * addresses to known values, so that the PBAR XLAT can be
+		 * pointed at SBAR0 of the remote system.
+		 */
+		if (ndev->dev_type == NTB_DEV_USD) {
+			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
+			       SNB_PBAR2XLAT_OFFSET);
+			if (ndev->wa_flags & WA_SNB_ERR)
+				writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
+				       SNB_PBAR4XLAT_OFFSET);
+			else {
+				if (ndev->split_bar) {
+					writel(SNB_MBAR4_DSD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR4XLAT_OFFSET);
+					writel(SNB_MBAR5_DSD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR5XLAT_OFFSET);
+				} else
+					writeq(SNB_MBAR4_DSD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR4XLAT_OFFSET);
+
+				/* B2B_XLAT_OFFSET is a 64bit register, but can
+				 * only take 32bit writes
+				 */
+				writel(SNB_MBAR01_DSD_ADDR & 0xffffffff,
+				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
+				writel(SNB_MBAR01_DSD_ADDR >> 32,
+				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
+			}
+
+			writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
+			       SNB_SBAR0BASE_OFFSET);
+			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
+			       SNB_SBAR2BASE_OFFSET);
+			if (ndev->split_bar) {
+				writel(SNB_MBAR4_USD_ADDR, ndev->reg_base +
+				       SNB_SBAR4BASE_OFFSET);
+				writel(SNB_MBAR5_USD_ADDR, ndev->reg_base +
+				       SNB_SBAR5BASE_OFFSET);
+			} else
+				writeq(SNB_MBAR4_USD_ADDR, ndev->reg_base +
+				       SNB_SBAR4BASE_OFFSET);
+		} else {
+			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
+			       SNB_PBAR2XLAT_OFFSET);
+			if (ndev->wa_flags & WA_SNB_ERR)
+				writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
+				       SNB_PBAR4XLAT_OFFSET);
+			else {
+				if (ndev->split_bar) {
+					writel(SNB_MBAR4_USD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR4XLAT_OFFSET);
+					writel(SNB_MBAR5_USD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR5XLAT_OFFSET);
+				} else
+					writeq(SNB_MBAR4_USD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR4XLAT_OFFSET);
+
+				/*
+				 * B2B_XLAT_OFFSET is a 64bit register, but can
+				 * only take 32bit writes
+				 */
+				writel(SNB_MBAR01_USD_ADDR & 0xffffffff,
+				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
+				writel(SNB_MBAR01_USD_ADDR >> 32,
+				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
+			}
+			writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
+			       SNB_SBAR0BASE_OFFSET);
+			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
+			       SNB_SBAR2BASE_OFFSET);
+			if (ndev->split_bar) {
+				writel(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
+				       SNB_SBAR4BASE_OFFSET);
+				writel(SNB_MBAR5_DSD_ADDR, ndev->reg_base +
+				       SNB_SBAR5BASE_OFFSET);
+			} else
+				writeq(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
+				       SNB_SBAR4BASE_OFFSET);
+
+		}
+		break;
+	case NTB_CONN_RP:
+		if (ndev->wa_flags & WA_SNB_ERR) {
+			dev_err(&ndev->pdev->dev,
+				"NTB-RP disabled due to hardware errata.\n");
+			return -EINVAL;
+		}
+
+		/* Scratch pads need to have exclusive access from the primary
+		 * or secondary side.  Halve the num spads so that each side can
+		 * have an equal amount.
+		 */
+		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
+		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
+		/* Note: The SDOORBELL is the cause of the errata.  You REALLY
+		 * don't want to touch it.
+		 */
+		ndev->reg_ofs.rdb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
+		/* Offset the start of the spads to correspond to whether it is
+		 * primary or secondary
+		 */
+		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET +
+					   ndev->limits.max_spads * 4;
+		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
+		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
+		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
+		if (ndev->split_bar) {
+			ndev->reg_ofs.bar5_xlat =
+				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
+			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+		} else
+			ndev->limits.max_mw = SNB_MAX_MW;
+		break;
+	case NTB_CONN_TRANSPARENT:
+		if (ndev->wa_flags & WA_SNB_ERR) {
+			dev_err(&ndev->pdev->dev,
+				"NTB-TRANSPARENT disabled due to hardware errata.\n");
+			return -EINVAL;
+		}
+
+		/* Scratch pads need to have exclusive access from the primary
+		 * or secondary side.  Halve the num spads so that each side can
+		 * have an equal amount.
+		 */
+		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
+		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
+		ndev->reg_ofs.rdb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_SDBMSK_OFFSET;
+		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET;
+		/* Offset the start of the spads to correspond to whether it is
+		 * primary or secondary
+		 */
+		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET +
+					  ndev->limits.max_spads * 4;
+		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_PBAR2XLAT_OFFSET;
+		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_PBAR4XLAT_OFFSET;
+
+		if (ndev->split_bar) {
+			ndev->reg_ofs.bar5_xlat =
+				ndev->reg_base + SNB_PBAR5XLAT_OFFSET;
+			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+		} else
+			ndev->limits.max_mw = SNB_MAX_MW;
+		break;
+	default:
+		/*
+		 * we should never hit this. the detect function should've
+		 * take cared of everything.
+		 */
+		return -EINVAL;
+	}
+
+	ndev->reg_ofs.lnk_cntl = ndev->reg_base + SNB_NTBCNTL_OFFSET;
+	ndev->reg_ofs.lnk_stat = ndev->reg_base + SNB_SLINK_STATUS_OFFSET;
+	ndev->reg_ofs.spci_cmd = ndev->reg_base + SNB_PCICMD_OFFSET;
+
+	ndev->limits.msix_cnt = SNB_MSIX_CNT;
+	ndev->bits_per_vector = SNB_DB_BITS_PER_VEC;
+
+	return 0;
+}
+
+static int ntb_bwd_setup(struct ntb_device *ndev)
+{
+	int rc;
+	u32 val;
+
+	ndev->hw_type = BWD_HW;
+
+	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &val);
+	if (rc)
+		return rc;
+
+	switch ((val & BWD_PPD_CONN_TYPE) >> 8) {
+	case NTB_CONN_B2B:
+		ndev->conn_type = NTB_CONN_B2B;
+		break;
+	case NTB_CONN_RP:
+	default:
+		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
+		return -EINVAL;
+	}
+
+	if (val & BWD_PPD_DEV_TYPE)
+		ndev->dev_type = NTB_DEV_DSD;
+	else
+		ndev->dev_type = NTB_DEV_USD;
+
+	/* Initiate PCI-E link training */
+	rc = pci_write_config_dword(ndev->pdev, NTB_PPD_OFFSET,
+				    val | BWD_PPD_INIT_LINK);
+	if (rc)
+		return rc;
+
+	ndev->reg_ofs.ldb = ndev->reg_base + BWD_PDOORBELL_OFFSET;
+	ndev->reg_ofs.ldb_mask = ndev->reg_base + BWD_PDBMSK_OFFSET;
+	ndev->reg_ofs.rdb = ndev->reg_base + BWD_B2B_DOORBELL_OFFSET;
+	ndev->reg_ofs.bar2_xlat = ndev->reg_base + BWD_SBAR2XLAT_OFFSET;
+	ndev->reg_ofs.bar4_xlat = ndev->reg_base + BWD_SBAR4XLAT_OFFSET;
+	ndev->reg_ofs.lnk_cntl = ndev->reg_base + BWD_NTBCNTL_OFFSET;
+	ndev->reg_ofs.lnk_stat = ndev->reg_base + BWD_LINK_STATUS_OFFSET;
+	ndev->reg_ofs.spad_read = ndev->reg_base + BWD_SPAD_OFFSET;
+	ndev->reg_ofs.spad_write = ndev->reg_base + BWD_B2B_SPAD_OFFSET;
+	ndev->reg_ofs.spci_cmd = ndev->reg_base + BWD_PCICMD_OFFSET;
+	ndev->limits.max_mw = BWD_MAX_MW;
+	ndev->limits.max_spads = BWD_MAX_SPADS;
+	ndev->limits.max_db_bits = BWD_MAX_DB_BITS;
+	ndev->limits.msix_cnt = BWD_MSIX_CNT;
+	ndev->bits_per_vector = BWD_DB_BITS_PER_VEC;
+
+	/* Since bwd doesn't have a link interrupt, setup a poll timer */
+	INIT_DELAYED_WORK(&ndev->hb_timer, bwd_link_poll);
+	INIT_DELAYED_WORK(&ndev->lr_timer, bwd_link_recovery);
+	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+
+	return 0;
+}
+
+static int ntb_device_setup(struct ntb_device *ndev)
+{
+	int rc;
+
+	if (is_ntb_xeon(ndev))
+		rc = ntb_xeon_setup(ndev);
+	else if (is_ntb_atom(ndev))
+		rc = ntb_bwd_setup(ndev);
+	else
+		rc = -ENODEV;
+
+	if (rc)
+		return rc;
+
+	if (ndev->conn_type == NTB_CONN_B2B)
+		/* Enable Bus Master and Memory Space on the secondary side */
+		writew(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
+		       ndev->reg_ofs.spci_cmd);
+
+	return 0;
+}
+
+static void ntb_device_free(struct ntb_device *ndev)
+{
+	if (is_ntb_atom(ndev)) {
+		cancel_delayed_work_sync(&ndev->hb_timer);
+		cancel_delayed_work_sync(&ndev->lr_timer);
+	}
+}
+
+static irqreturn_t bwd_callback_msix_irq(int irq, void *data)
+{
+	struct ntb_db_cb *db_cb = data;
+	struct ntb_device *ndev = db_cb->ndev;
+	unsigned long mask;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
+		db_cb->db_num);
+
+	mask = readw(ndev->reg_ofs.ldb_mask);
+	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.ldb_mask);
+
+	tasklet_schedule(&db_cb->irq_work);
+
+	/* No need to check for the specific HB irq, any interrupt means
+	 * we're connected.
+	 */
+	ndev->last_ts = jiffies;
+
+	writeq((u64) 1 << db_cb->db_num, ndev->reg_ofs.ldb);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t xeon_callback_msix_irq(int irq, void *data)
+{
+	struct ntb_db_cb *db_cb = data;
+	struct ntb_device *ndev = db_cb->ndev;
+	unsigned long mask;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
+		db_cb->db_num);
+
+	mask = readw(ndev->reg_ofs.ldb_mask);
+	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.ldb_mask);
+
+	tasklet_schedule(&db_cb->irq_work);
+
+	/* On Sandybridge, there are 16 bits in the interrupt register
+	 * but only 4 vectors.  So, 5 bits are assigned to the first 3
+	 * vectors, with the 4th having a single bit for link
+	 * interrupts.
+	 */
+	writew(((1 << ndev->bits_per_vector) - 1) <<
+	       (db_cb->db_num * ndev->bits_per_vector), ndev->reg_ofs.ldb);
+
+	return IRQ_HANDLED;
+}
+
+/* Since we do not have a HW doorbell in BWD, this is only used in JF/JT */
+static irqreturn_t xeon_event_msix_irq(int irq, void *dev)
+{
+	struct ntb_device *ndev = dev;
+	int rc;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for Events\n", irq);
+
+	rc = ntb_link_status(ndev);
+	if (rc)
+		dev_err(&ndev->pdev->dev, "Error determining link status\n");
+
+	/* bit 15 is always the link bit */
+	writew(1 << SNB_LINK_DB, ndev->reg_ofs.ldb);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t ntb_interrupt(int irq, void *dev)
+{
+	struct ntb_device *ndev = dev;
+	unsigned int i = 0;
+
+	if (is_ntb_atom(ndev)) {
+		u64 ldb = readq(ndev->reg_ofs.ldb);
+
+		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %Lx\n", irq, ldb);
+
+		while (ldb) {
+			i = __ffs(ldb);
+			ldb &= ldb - 1;
+			bwd_callback_msix_irq(irq, &ndev->db_cb[i]);
+		}
+	} else {
+		u16 ldb = readw(ndev->reg_ofs.ldb);
+
+		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %x\n", irq, ldb);
+
+		if (ldb & SNB_DB_HW_LINK) {
+			xeon_event_msix_irq(irq, dev);
+			ldb &= ~SNB_DB_HW_LINK;
+		}
+
+		while (ldb) {
+			i = __ffs(ldb);
+			ldb &= ldb - 1;
+			xeon_callback_msix_irq(irq, &ndev->db_cb[i]);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int ntb_setup_snb_msix(struct ntb_device *ndev, int msix_entries)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	struct msix_entry *msix;
+	int rc, i;
+
+	if (msix_entries < ndev->limits.msix_cnt)
+		return -ENOSPC;
+
+	rc = pci_enable_msix_exact(pdev, ndev->msix_entries, msix_entries);
+	if (rc < 0)
+		return rc;
+
+	for (i = 0; i < msix_entries; i++) {
+		msix = &ndev->msix_entries[i];
+		WARN_ON(!msix->vector);
+
+		if (i == msix_entries - 1) {
+			rc = request_irq(msix->vector,
+					 xeon_event_msix_irq, 0,
+					 "ntb-event-msix", ndev);
+			if (rc)
+				goto err;
+		} else {
+			rc = request_irq(msix->vector,
+					 xeon_callback_msix_irq, 0,
+					 "ntb-callback-msix",
+					 &ndev->db_cb[i]);
+			if (rc)
+				goto err;
+		}
+	}
+
+	ndev->num_msix = msix_entries;
+	ndev->max_cbs = msix_entries - 1;
+
+	return 0;
+
+err:
+	while (--i >= 0) {
+		/* Code never reaches here for entry nr 'ndev->num_msix - 1' */
+		msix = &ndev->msix_entries[i];
+		free_irq(msix->vector, &ndev->db_cb[i]);
+	}
+
+	pci_disable_msix(pdev);
+	ndev->num_msix = 0;
+
+	return rc;
+}
+
+static int ntb_setup_bwd_msix(struct ntb_device *ndev, int msix_entries)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	struct msix_entry *msix;
+	int rc, i;
+
+	msix_entries = pci_enable_msix_range(pdev, ndev->msix_entries,
+					     1, msix_entries);
+	if (msix_entries < 0)
+		return msix_entries;
+
+	for (i = 0; i < msix_entries; i++) {
+		msix = &ndev->msix_entries[i];
+		WARN_ON(!msix->vector);
+
+		rc = request_irq(msix->vector, bwd_callback_msix_irq, 0,
+				 "ntb-callback-msix", &ndev->db_cb[i]);
+		if (rc)
+			goto err;
+	}
+
+	ndev->num_msix = msix_entries;
+	ndev->max_cbs = msix_entries;
+
+	return 0;
+
+err:
+	while (--i >= 0)
+		free_irq(msix->vector, &ndev->db_cb[i]);
+
+	pci_disable_msix(pdev);
+	ndev->num_msix = 0;
+
+	return rc;
+}
+
+static int ntb_setup_msix(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int msix_entries;
+	int rc, i;
+
+	msix_entries = pci_msix_vec_count(pdev);
+	if (msix_entries < 0) {
+		rc = msix_entries;
+		goto err;
+	} else if (msix_entries > ndev->limits.msix_cnt) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	ndev->msix_entries = kmalloc(sizeof(struct msix_entry) * msix_entries,
+				     GFP_KERNEL);
+	if (!ndev->msix_entries) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < msix_entries; i++)
+		ndev->msix_entries[i].entry = i;
+
+	if (is_ntb_atom(ndev))
+		rc = ntb_setup_bwd_msix(ndev, msix_entries);
+	else
+		rc = ntb_setup_snb_msix(ndev, msix_entries);
+	if (rc)
+		goto err1;
+
+	return 0;
+
+err1:
+	kfree(ndev->msix_entries);
+err:
+	dev_err(&pdev->dev, "Error allocating MSI-X interrupt\n");
+	return rc;
+}
+
+static int ntb_setup_msi(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int rc;
+
+	rc = pci_enable_msi(pdev);
+	if (rc)
+		return rc;
+
+	rc = request_irq(pdev->irq, ntb_interrupt, 0, "ntb-msi", ndev);
+	if (rc) {
+		pci_disable_msi(pdev);
+		dev_err(&pdev->dev, "Error allocating MSI interrupt\n");
+		return rc;
+	}
+
+	return 0;
+}
+
+static int ntb_setup_intx(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int rc;
+
+	/* Verify intx is enabled */
+	pci_intx(pdev, 1);
+
+	rc = request_irq(pdev->irq, ntb_interrupt, IRQF_SHARED, "ntb-intx",
+			 ndev);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int ntb_setup_interrupts(struct ntb_device *ndev)
+{
+	int rc;
+
+	/* On BWD, disable all interrupts.  On SNB, disable all but Link
+	 * Interrupt.  The rest will be unmasked as callbacks are registered.
+	 */
+	if (is_ntb_atom(ndev))
+		writeq(~0, ndev->reg_ofs.ldb_mask);
+	else {
+		u16 var = 1 << SNB_LINK_DB;
+		writew(~var, ndev->reg_ofs.ldb_mask);
+	}
+
+	rc = ntb_setup_msix(ndev);
+	if (!rc)
+		goto done;
+
+	ndev->bits_per_vector = 1;
+	ndev->max_cbs = ndev->limits.max_db_bits;
+
+	rc = ntb_setup_msi(ndev);
+	if (!rc)
+		goto done;
+
+	rc = ntb_setup_intx(ndev);
+	if (rc) {
+		dev_err(&ndev->pdev->dev, "no usable interrupts\n");
+		return rc;
+	}
+
+done:
+	return 0;
+}
+
+static void ntb_free_interrupts(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+
+	/* mask interrupts */
+	if (is_ntb_atom(ndev))
+		writeq(~0, ndev->reg_ofs.ldb_mask);
+	else
+		writew(~0, ndev->reg_ofs.ldb_mask);
+
+	if (ndev->num_msix) {
+		struct msix_entry *msix;
+		u32 i;
+
+		for (i = 0; i < ndev->num_msix; i++) {
+			msix = &ndev->msix_entries[i];
+			if (is_ntb_xeon(ndev) && i == ndev->num_msix - 1)
+				free_irq(msix->vector, ndev);
+			else
+				free_irq(msix->vector, &ndev->db_cb[i]);
+		}
+		pci_disable_msix(pdev);
+		kfree(ndev->msix_entries);
+	} else {
+		free_irq(pdev->irq, ndev);
+
+		if (pci_dev_msi_enabled(pdev))
+			pci_disable_msi(pdev);
+	}
+}
+
+static int ntb_create_callbacks(struct ntb_device *ndev)
+{
+	int i;
+
+	/* Chicken-egg issue.  We won't know how many callbacks are necessary
+	 * until we see how many MSI-X vectors we get, but these pointers need
+	 * to be passed into the MSI-X register function.  So, we allocate the
+	 * max, knowing that they might not all be used, to work around this.
+	 */
+	ndev->db_cb = kcalloc(ndev->limits.max_db_bits,
+			      sizeof(struct ntb_db_cb),
+			      GFP_KERNEL);
+	if (!ndev->db_cb)
+		return -ENOMEM;
+
+	for (i = 0; i < ndev->limits.max_db_bits; i++) {
+		ndev->db_cb[i].db_num = i;
+		ndev->db_cb[i].ndev = ndev;
+	}
+
+	return 0;
+}
+
+static void ntb_free_callbacks(struct ntb_device *ndev)
+{
+	int i;
+
+	for (i = 0; i < ndev->limits.max_db_bits; i++)
+		ntb_unregister_db_callback(ndev, i);
+
+	kfree(ndev->db_cb);
+}
+
+static ssize_t ntb_debugfs_read(struct file *filp, char __user *ubuf,
+				size_t count, loff_t *offp)
+{
+	struct ntb_device *ndev;
+	char *buf;
+	ssize_t ret, offset, out_count;
+
+	out_count = 500;
+
+	buf = kmalloc(out_count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	ndev = filp->private_data;
+	offset = 0;
+	offset += snprintf(buf + offset, out_count - offset,
+			   "NTB Device Information:\n");
+	offset += snprintf(buf + offset, out_count - offset,
+			   "Connection Type - \t\t%s\n",
+			   ndev->conn_type == NTB_CONN_TRANSPARENT ?
+			   "Transparent" : (ndev->conn_type == NTB_CONN_B2B) ?
+			   "Back to back" : "Root Port");
+	offset += snprintf(buf + offset, out_count - offset,
+			   "Device Type - \t\t\t%s\n",
+			   ndev->dev_type == NTB_DEV_USD ?
+			   "DSD/USP" : "USD/DSP");
+	offset += snprintf(buf + offset, out_count - offset,
+			   "Max Number of Callbacks - \t%u\n",
+			   ntb_max_cbs(ndev));
+	offset += snprintf(buf + offset, out_count - offset,
+			   "Link Status - \t\t\t%s\n",
+			   ntb_hw_link_status(ndev) ? "Up" : "Down");
+	if (ntb_hw_link_status(ndev)) {
+		offset += snprintf(buf + offset, out_count - offset,
+				   "Link Speed - \t\t\tPCI-E Gen %u\n",
+				   ndev->link_speed);
+		offset += snprintf(buf + offset, out_count - offset,
+				   "Link Width - \t\t\tx%u\n",
+				   ndev->link_width);
+	}
+
+	if (is_ntb_xeon(ndev)) {
+		u32 status32;
+		u16 status16;
+		int rc;
+
+		offset += snprintf(buf + offset, out_count - offset,
+				   "\nNTB Device Statistics:\n");
+		offset += snprintf(buf + offset, out_count - offset,
+				   "Upstream Memory Miss - \t%u\n",
+				   readw(ndev->reg_base +
+					 SNB_USMEMMISS_OFFSET));
+
+		offset += snprintf(buf + offset, out_count - offset,
+				   "\nNTB Hardware Errors:\n");
+
+		rc = pci_read_config_word(ndev->pdev, SNB_DEVSTS_OFFSET,
+					  &status16);
+		if (!rc)
+			offset += snprintf(buf + offset, out_count - offset,
+					   "DEVSTS - \t%#06x\n", status16);
+
+		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
+					  &status16);
+		if (!rc)
+			offset += snprintf(buf + offset, out_count - offset,
+					   "LNKSTS - \t%#06x\n", status16);
+
+		rc = pci_read_config_dword(ndev->pdev, SNB_UNCERRSTS_OFFSET,
+					   &status32);
+		if (!rc)
+			offset += snprintf(buf + offset, out_count - offset,
+					   "UNCERRSTS - \t%#010x\n", status32);
+
+		rc = pci_read_config_dword(ndev->pdev, SNB_CORERRSTS_OFFSET,
+					   &status32);
+		if (!rc)
+			offset += snprintf(buf + offset, out_count - offset,
+					   "CORERRSTS - \t%#010x\n", status32);
+	}
+
+	if (offset > out_count)
+		offset = out_count;
+
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, offset);
+	kfree(buf);
+	return ret;
+}
+
+static const struct file_operations ntb_debugfs_info = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = ntb_debugfs_read,
+};
+
+static void ntb_setup_debugfs(struct ntb_device *ndev)
+{
+	if (!debugfs_initialized())
+		return;
+
+	if (!debugfs_dir)
+		debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+
+	ndev->debugfs_dir = debugfs_create_dir(pci_name(ndev->pdev),
+					       debugfs_dir);
+	if (ndev->debugfs_dir)
+		ndev->debugfs_info = debugfs_create_file("info", S_IRUSR,
+							 ndev->debugfs_dir,
+							 ndev,
+							 &ntb_debugfs_info);
+}
+
+static void ntb_free_debugfs(struct ntb_device *ndev)
+{
+	debugfs_remove_recursive(ndev->debugfs_dir);
+
+	if (debugfs_dir && simple_empty(debugfs_dir)) {
+		debugfs_remove_recursive(debugfs_dir);
+		debugfs_dir = NULL;
+	}
+}
+
+static void ntb_hw_link_up(struct ntb_device *ndev)
+{
+	if (ndev->conn_type == NTB_CONN_TRANSPARENT)
+		ntb_link_event(ndev, NTB_LINK_UP);
+	else {
+		u32 ntb_cntl;
+
+		/* Let's bring the NTB link up */
+		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
+		ntb_cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
+		ntb_cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP;
+		ntb_cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
+		if (ndev->split_bar)
+			ntb_cntl |= NTB_CNTL_P2S_BAR5_SNOOP |
+				    NTB_CNTL_S2P_BAR5_SNOOP;
+
+		writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+	}
+}
+
+static void ntb_hw_link_down(struct ntb_device *ndev)
+{
+	u32 ntb_cntl;
+
+	if (ndev->conn_type == NTB_CONN_TRANSPARENT) {
+		ntb_link_event(ndev, NTB_LINK_DOWN);
+		return;
+	}
+
+	/* Bring NTB link down */
+	ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
+	ntb_cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP);
+	ntb_cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP);
+	if (ndev->split_bar)
+		ntb_cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP |
+			      NTB_CNTL_S2P_BAR5_SNOOP);
+	ntb_cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
+	writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+}
+
+static void ntb_max_mw_detect(struct ntb_device *ndev)
+{
+	if (ndev->split_bar)
+		ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+	else
+		ndev->limits.max_mw = SNB_MAX_MW;
+}
+
+static int ntb_xeon_detect(struct ntb_device *ndev)
+{
+	int rc, bars_mask;
+	u32 bars;
+	u8 ppd;
+
+	ndev->hw_type = SNB_HW;
+
+	rc = pci_read_config_byte(ndev->pdev, NTB_PPD_OFFSET, &ppd);
+	if (rc)
+		return -EIO;
+
+	if (ppd & SNB_PPD_DEV_TYPE)
+		ndev->dev_type = NTB_DEV_USD;
+	else
+		ndev->dev_type = NTB_DEV_DSD;
+
+	ndev->split_bar = (ppd & SNB_PPD_SPLIT_BAR) ? 1 : 0;
+
+	switch (ppd & SNB_PPD_CONN_TYPE) {
+	case NTB_CONN_B2B:
+		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
+		ndev->conn_type = NTB_CONN_B2B;
+		break;
+	case NTB_CONN_RP:
+		dev_info(&ndev->pdev->dev, "Conn Type = RP\n");
+		ndev->conn_type = NTB_CONN_RP;
+		break;
+	case NTB_CONN_TRANSPARENT:
+		dev_info(&ndev->pdev->dev, "Conn Type = TRANSPARENT\n");
+		ndev->conn_type = NTB_CONN_TRANSPARENT;
+		/*
+		 * This mode is default to USD/DSP. HW does not report
+		 * properly in transparent mode as it has no knowledge of
+		 * NTB. We will just force correct here.
+		 */
+		ndev->dev_type = NTB_DEV_USD;
+
+		/*
+		 * This is a way for transparent BAR to figure out if we
+		 * are doing split BAR or not. There is no way for the hw
+		 * on the transparent side to know and set the PPD.
+		 */
+		bars_mask = pci_select_bars(ndev->pdev, IORESOURCE_MEM);
+		bars = hweight32(bars_mask);
+		if (bars == (HSX_SPLITBAR_MAX_MW + 1))
+			ndev->split_bar = 1;
+
+		break;
+	default:
+		dev_err(&ndev->pdev->dev, "Unknown PPD %x\n", ppd);
+		return -ENODEV;
+	}
+
+	ntb_max_mw_detect(ndev);
+
+	return 0;
+}
+
+static int ntb_atom_detect(struct ntb_device *ndev)
+{
+	int rc;
+	u32 ppd;
+
+	ndev->hw_type = BWD_HW;
+	ndev->limits.max_mw = BWD_MAX_MW;
+
+	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &ppd);
+	if (rc)
+		return rc;
+
+	switch ((ppd & BWD_PPD_CONN_TYPE) >> 8) {
+	case NTB_CONN_B2B:
+		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
+		ndev->conn_type = NTB_CONN_B2B;
+		break;
+	case NTB_CONN_RP:
+	default:
+		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
+		return -EINVAL;
+	}
+
+	if (ppd & BWD_PPD_DEV_TYPE)
+		ndev->dev_type = NTB_DEV_DSD;
+	else
+		ndev->dev_type = NTB_DEV_USD;
+
+	return 0;
+}
+
+static int ntb_device_detect(struct ntb_device *ndev)
+{
+	int rc;
+
+	if (is_ntb_xeon(ndev))
+		rc = ntb_xeon_detect(ndev);
+	else if (is_ntb_atom(ndev))
+		rc = ntb_atom_detect(ndev);
+	else
+		rc = -ENODEV;
+
+	dev_info(&ndev->pdev->dev, "Device Type = %s\n",
+		 ndev->dev_type == NTB_DEV_USD ? "USD/DSP" : "DSD/USP");
+
+	return 0;
+}
+
+static int ntb_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct ntb_device *ndev;
+	int rc, i;
+
+	ndev = kzalloc(sizeof(struct ntb_device), GFP_KERNEL);
+	if (!ndev)
+		return -ENOMEM;
+
+	ndev->pdev = pdev;
+
+	ntb_set_errata_flags(ndev);
+
+	ndev->link_status = NTB_LINK_DOWN;
+	pci_set_drvdata(pdev, ndev);
+	ntb_setup_debugfs(ndev);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		goto err;
+
+	pci_set_master(ndev->pdev);
+
+	rc = ntb_device_detect(ndev);
+	if (rc)
+		goto err;
+
+	ndev->mw = kcalloc(ndev->limits.max_mw, sizeof(struct ntb_mw),
+			   GFP_KERNEL);
+	if (!ndev->mw) {
+		rc = -ENOMEM;
+		goto err1;
+	}
+
+	if (ndev->split_bar)
+		rc = pci_request_selected_regions(pdev, NTB_SPLITBAR_MASK,
+						  KBUILD_MODNAME);
+	else
+		rc = pci_request_selected_regions(pdev, NTB_BAR_MASK,
+						  KBUILD_MODNAME);
+
+	if (rc)
+		goto err2;
+
+	ndev->reg_base = pci_ioremap_bar(pdev, NTB_BAR_MMIO);
+	if (!ndev->reg_base) {
+		dev_warn(&pdev->dev, "Cannot remap BAR 0\n");
+		rc = -EIO;
+		goto err3;
+	}
+
+	for (i = 0; i < ndev->limits.max_mw; i++) {
+		ndev->mw[i].bar_sz = pci_resource_len(pdev, MW_TO_BAR(i));
+
+		/*
+		 * with the errata we need to steal last of the memory
+		 * windows for workarounds and they point to MMIO registers.
+		 */
+		if ((ndev->wa_flags & WA_SNB_ERR) &&
+		    (i == (ndev->limits.max_mw - 1))) {
+			ndev->mw[i].vbase =
+				ioremap_nocache(pci_resource_start(pdev,
+							MW_TO_BAR(i)),
+						ndev->mw[i].bar_sz);
+		} else {
+			ndev->mw[i].vbase =
+				ioremap_wc(pci_resource_start(pdev,
+							MW_TO_BAR(i)),
+					   ndev->mw[i].bar_sz);
+		}
+
+		dev_info(&pdev->dev, "MW %d size %llu\n", i,
+			 (unsigned long long) ndev->mw[i].bar_sz);
+		if (!ndev->mw[i].vbase) {
+			dev_warn(&pdev->dev, "Cannot remap BAR %d\n",
+				 MW_TO_BAR(i));
+			rc = -EIO;
+			goto err4;
+		}
+	}
+
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err4;
+
+		dev_warn(&pdev->dev, "Cannot DMA highmem\n");
+	}
+
+	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err4;
+
+		dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n");
+	}
+
+	rc = ntb_device_setup(ndev);
+	if (rc)
+		goto err4;
+
+	rc = ntb_create_callbacks(ndev);
+	if (rc)
+		goto err5;
+
+	rc = ntb_setup_interrupts(ndev);
+	if (rc)
+		goto err6;
+
+	/* The scratchpad registers keep the values between rmmod/insmod,
+	 * blast them now
+	 */
+	for (i = 0; i < ndev->limits.max_spads; i++) {
+		ntb_write_local_spad(ndev, i, 0);
+		ntb_write_remote_spad(ndev, i, 0);
+	}
+
+	rc = ntb_transport_init(pdev);
+	if (rc)
+		goto err7;
+
+	ntb_hw_link_up(ndev);
+
+	return 0;
+
+err7:
+	ntb_free_interrupts(ndev);
+err6:
+	ntb_free_callbacks(ndev);
+err5:
+	ntb_device_free(ndev);
+err4:
+	for (i--; i >= 0; i--)
+		iounmap(ndev->mw[i].vbase);
+	iounmap(ndev->reg_base);
+err3:
+	if (ndev->split_bar)
+		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
+	else
+		pci_release_selected_regions(pdev, NTB_BAR_MASK);
+err2:
+	kfree(ndev->mw);
+err1:
+	pci_disable_device(pdev);
+err:
+	ntb_free_debugfs(ndev);
+	kfree(ndev);
+
+	dev_err(&pdev->dev, "Error loading %s module\n", KBUILD_MODNAME);
+	return rc;
+}
+
+static void ntb_pci_remove(struct pci_dev *pdev)
+{
+	struct ntb_device *ndev = pci_get_drvdata(pdev);
+	int i;
+
+	ntb_hw_link_down(ndev);
+
+	ntb_transport_free(ndev->ntb_transport);
+
+	ntb_free_interrupts(ndev);
+	ntb_free_callbacks(ndev);
+	ntb_device_free(ndev);
+
+	/* need to reset max_mw limits so we can unmap properly */
+	if (ndev->hw_type == SNB_HW)
+		ntb_max_mw_detect(ndev);
+
+	for (i = 0; i < ndev->limits.max_mw; i++)
+		iounmap(ndev->mw[i].vbase);
+
+	kfree(ndev->mw);
+	iounmap(ndev->reg_base);
+	if (ndev->split_bar)
+		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
+	else
+		pci_release_selected_regions(pdev, NTB_BAR_MASK);
+	pci_disable_device(pdev);
+	ntb_free_debugfs(ndev);
+	kfree(ndev);
+}
+
+static struct pci_driver ntb_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = ntb_pci_tbl,
+	.probe = ntb_pci_probe,
+	.remove = ntb_pci_remove,
+};
+
+module_pci_driver(ntb_pci_driver);
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h
new file mode 100644
index 000000000000..935610454f70
--- /dev/null
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.h
@@ -0,0 +1,385 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+#include <linux/ntb_transport.h>
+
+#define NTB_LINK_STATUS_ACTIVE	0x2000
+#define NTB_LINK_SPEED_MASK	0x000f
+#define NTB_LINK_WIDTH_MASK	0x03f0
+
+#define SNB_MSIX_CNT		4
+#define SNB_MAX_B2B_SPADS	16
+#define SNB_MAX_COMPAT_SPADS	16
+/* Reserve the uppermost bit for link interrupt */
+#define SNB_MAX_DB_BITS		15
+#define SNB_LINK_DB		15
+#define SNB_DB_BITS_PER_VEC	5
+#define HSX_SPLITBAR_MAX_MW	3
+#define SNB_MAX_MW		2
+#define SNB_ERRATA_MAX_MW	1
+
+#define SNB_DB_HW_LINK		0x8000
+
+#define SNB_UNCERRSTS_OFFSET	0x014C
+#define SNB_CORERRSTS_OFFSET	0x0158
+#define SNB_LINK_STATUS_OFFSET	0x01A2
+#define SNB_PCICMD_OFFSET	0x0504
+#define SNB_DEVCTRL_OFFSET	0x0598
+#define SNB_DEVSTS_OFFSET	0x059A
+#define SNB_SLINK_STATUS_OFFSET	0x05A2
+
+#define SNB_PBAR2LMT_OFFSET	0x0000
+#define SNB_PBAR4LMT_OFFSET	0x0008
+#define SNB_PBAR5LMT_OFFSET	0x000C
+#define SNB_PBAR2XLAT_OFFSET	0x0010
+#define SNB_PBAR4XLAT_OFFSET	0x0018
+#define SNB_PBAR5XLAT_OFFSET	0x001C
+#define SNB_SBAR2LMT_OFFSET	0x0020
+#define SNB_SBAR4LMT_OFFSET	0x0028
+#define SNB_SBAR5LMT_OFFSET	0x002C
+#define SNB_SBAR2XLAT_OFFSET	0x0030
+#define SNB_SBAR4XLAT_OFFSET	0x0038
+#define SNB_SBAR5XLAT_OFFSET	0x003C
+#define SNB_SBAR0BASE_OFFSET	0x0040
+#define SNB_SBAR2BASE_OFFSET	0x0048
+#define SNB_SBAR4BASE_OFFSET	0x0050
+#define SNB_SBAR5BASE_OFFSET	0x0054
+#define SNB_NTBCNTL_OFFSET	0x0058
+#define SNB_SBDF_OFFSET		0x005C
+#define SNB_PDOORBELL_OFFSET	0x0060
+#define SNB_PDBMSK_OFFSET	0x0062
+#define SNB_SDOORBELL_OFFSET	0x0064
+#define SNB_SDBMSK_OFFSET	0x0066
+#define SNB_USMEMMISS_OFFSET	0x0070
+#define SNB_SPAD_OFFSET		0x0080
+#define SNB_SPADSEMA4_OFFSET	0x00c0
+#define SNB_WCCNTRL_OFFSET	0x00e0
+#define SNB_B2B_SPAD_OFFSET	0x0100
+#define SNB_B2B_DOORBELL_OFFSET	0x0140
+#define SNB_B2B_XLAT_OFFSETL	0x0144
+#define SNB_B2B_XLAT_OFFSETU	0x0148
+
+/*
+ * The addresses are setup so the 32bit BARs can function. Thus
+ * the addresses are all in 32bit space
+ */
+#define SNB_MBAR01_USD_ADDR	0x000000002100000CULL
+#define SNB_MBAR23_USD_ADDR	0x000000004100000CULL
+#define SNB_MBAR4_USD_ADDR	0x000000008100000CULL
+#define SNB_MBAR5_USD_ADDR	0x00000000A100000CULL
+#define SNB_MBAR01_DSD_ADDR	0x000000002000000CULL
+#define SNB_MBAR23_DSD_ADDR	0x000000004000000CULL
+#define SNB_MBAR4_DSD_ADDR	0x000000008000000CULL
+#define SNB_MBAR5_DSD_ADDR	0x00000000A000000CULL
+
+#define BWD_MSIX_CNT		34
+#define BWD_MAX_SPADS		16
+#define BWD_MAX_DB_BITS		34
+#define BWD_DB_BITS_PER_VEC	1
+#define BWD_MAX_MW		2
+
+#define BWD_PCICMD_OFFSET	0xb004
+#define BWD_MBAR23_OFFSET	0xb018
+#define BWD_MBAR45_OFFSET	0xb020
+#define BWD_DEVCTRL_OFFSET	0xb048
+#define BWD_LINK_STATUS_OFFSET	0xb052
+#define BWD_ERRCORSTS_OFFSET	0xb110
+
+#define BWD_SBAR2XLAT_OFFSET	0x0008
+#define BWD_SBAR4XLAT_OFFSET	0x0010
+#define BWD_PDOORBELL_OFFSET	0x0020
+#define BWD_PDBMSK_OFFSET	0x0028
+#define BWD_NTBCNTL_OFFSET	0x0060
+#define BWD_EBDF_OFFSET		0x0064
+#define BWD_SPAD_OFFSET		0x0080
+#define BWD_SPADSEMA_OFFSET	0x00c0
+#define BWD_STKYSPAD_OFFSET	0x00c4
+#define BWD_PBAR2XLAT_OFFSET	0x8008
+#define BWD_PBAR4XLAT_OFFSET	0x8010
+#define BWD_B2B_DOORBELL_OFFSET	0x8020
+#define BWD_B2B_SPAD_OFFSET	0x8080
+#define BWD_B2B_SPADSEMA_OFFSET	0x80c0
+#define BWD_B2B_STKYSPAD_OFFSET	0x80c4
+
+#define BWD_MODPHY_PCSREG4	0x1c004
+#define BWD_MODPHY_PCSREG6	0x1c006
+
+#define BWD_IP_BASE		0xC000
+#define BWD_DESKEWSTS_OFFSET	(BWD_IP_BASE + 0x3024)
+#define BWD_LTSSMERRSTS0_OFFSET (BWD_IP_BASE + 0x3180)
+#define BWD_LTSSMSTATEJMP_OFFSET	(BWD_IP_BASE + 0x3040)
+#define BWD_IBSTERRRCRVSTS0_OFFSET	(BWD_IP_BASE + 0x3324)
+
+#define BWD_DESKEWSTS_DBERR	(1 << 15)
+#define BWD_LTSSMERRSTS0_UNEXPECTEDEI	(1 << 20)
+#define BWD_LTSSMSTATEJMP_FORCEDETECT	(1 << 2)
+#define BWD_IBIST_ERR_OFLOW	0x7FFF7FFF
+
+#define NTB_CNTL_CFG_LOCK		(1 << 0)
+#define NTB_CNTL_LINK_DISABLE		(1 << 1)
+#define NTB_CNTL_S2P_BAR23_SNOOP	(1 << 2)
+#define NTB_CNTL_P2S_BAR23_SNOOP	(1 << 4)
+#define NTB_CNTL_S2P_BAR4_SNOOP	(1 << 6)
+#define NTB_CNTL_P2S_BAR4_SNOOP	(1 << 8)
+#define NTB_CNTL_S2P_BAR5_SNOOP	(1 << 12)
+#define NTB_CNTL_P2S_BAR5_SNOOP	(1 << 14)
+#define BWD_CNTL_LINK_DOWN		(1 << 16)
+
+#define NTB_PPD_OFFSET		0x00D4
+#define SNB_PPD_CONN_TYPE	0x0003
+#define SNB_PPD_DEV_TYPE	0x0010
+#define SNB_PPD_SPLIT_BAR	(1 << 6)
+#define BWD_PPD_INIT_LINK	0x0008
+#define BWD_PPD_CONN_TYPE	0x0300
+#define BWD_PPD_DEV_TYPE	0x1000
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF		0x3725
+#define PCI_DEVICE_ID_INTEL_NTB_PS_JSF		0x3726
+#define PCI_DEVICE_ID_INTEL_NTB_SS_JSF		0x3727
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB		0x3C0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_SNB		0x3C0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_SNB		0x3C0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_IVT		0x0E0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_IVT		0x0E0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_IVT		0x0E0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_HSX		0x2F0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_HSX		0x2F0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_HSX		0x2F0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD		0x0C4E
+
+#ifndef readq
+static inline u64 readq(void __iomem *addr)
+{
+	return readl(addr) | (((u64) readl(addr + 4)) << 32LL);
+}
+#endif
+
+#ifndef writeq
+static inline void writeq(u64 val, void __iomem *addr)
+{
+	writel(val & 0xffffffff, addr);
+	writel(val >> 32, addr + 4);
+}
+#endif
+
+#define NTB_BAR_MMIO		0
+#define NTB_BAR_23		2
+#define NTB_BAR_4		4
+#define NTB_BAR_5		5
+
+#define NTB_BAR_MASK		((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
+				 (1 << NTB_BAR_4))
+#define NTB_SPLITBAR_MASK	((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
+				 (1 << NTB_BAR_4) | (1 << NTB_BAR_5))
+
+#define NTB_HB_TIMEOUT		msecs_to_jiffies(1000)
+
+enum ntb_hw_event {
+	NTB_EVENT_SW_EVENT0 = 0,
+	NTB_EVENT_SW_EVENT1,
+	NTB_EVENT_SW_EVENT2,
+	NTB_EVENT_HW_ERROR,
+	NTB_EVENT_HW_LINK_UP,
+	NTB_EVENT_HW_LINK_DOWN,
+};
+
+struct ntb_mw {
+	dma_addr_t phys_addr;
+	void __iomem *vbase;
+	resource_size_t bar_sz;
+};
+
+struct ntb_db_cb {
+	int (*callback)(void *data, int db_num);
+	unsigned int db_num;
+	void *data;
+	struct ntb_device *ndev;
+	struct tasklet_struct irq_work;
+};
+
+#define WA_SNB_ERR	0x00000001
+
+struct ntb_device {
+	struct pci_dev *pdev;
+	struct msix_entry *msix_entries;
+	void __iomem *reg_base;
+	struct ntb_mw *mw;
+	struct {
+		unsigned char max_mw;
+		unsigned char max_spads;
+		unsigned char max_db_bits;
+		unsigned char msix_cnt;
+	} limits;
+	struct {
+		void __iomem *ldb;
+		void __iomem *ldb_mask;
+		void __iomem *rdb;
+		void __iomem *bar2_xlat;
+		void __iomem *bar4_xlat;
+		void __iomem *bar5_xlat;
+		void __iomem *spad_write;
+		void __iomem *spad_read;
+		void __iomem *lnk_cntl;
+		void __iomem *lnk_stat;
+		void __iomem *spci_cmd;
+	} reg_ofs;
+	struct ntb_transport *ntb_transport;
+	void (*event_cb)(void *handle, enum ntb_hw_event event);
+
+	struct ntb_db_cb *db_cb;
+	unsigned char hw_type;
+	unsigned char conn_type;
+	unsigned char dev_type;
+	unsigned char num_msix;
+	unsigned char bits_per_vector;
+	unsigned char max_cbs;
+	unsigned char link_width;
+	unsigned char link_speed;
+	unsigned char link_status;
+	unsigned char split_bar;
+
+	struct delayed_work hb_timer;
+	unsigned long last_ts;
+
+	struct delayed_work lr_timer;
+
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_info;
+
+	unsigned int wa_flags;
+};
+
+/**
+ * ntb_max_cbs() - return the max callbacks
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the maximum number of callbacks
+ *
+ * RETURNS: the maximum number of callbacks
+ */
+static inline unsigned char ntb_max_cbs(struct ntb_device *ndev)
+{
+	return ndev->max_cbs;
+}
+
+/**
+ * ntb_max_mw() - return the max number of memory windows
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the maximum number of memory windows
+ *
+ * RETURNS: the maximum number of memory windows
+ */
+static inline unsigned char ntb_max_mw(struct ntb_device *ndev)
+{
+	return ndev->limits.max_mw;
+}
+
+/**
+ * ntb_hw_link_status() - return the hardware link status
+ * @ndev: pointer to ntb_device instance
+ *
+ * Returns true if the hardware is connected to the remote system
+ *
+ * RETURNS: true or false based on the hardware link state
+ */
+static inline bool ntb_hw_link_status(struct ntb_device *ndev)
+{
+	return ndev->link_status == NTB_LINK_UP;
+}
+
+/**
+ * ntb_query_pdev() - return the pci_dev pointer
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the pci_dev pointer for the NTB hardware device
+ *
+ * RETURNS: a pointer to the ntb pci_dev
+ */
+static inline struct pci_dev *ntb_query_pdev(struct ntb_device *ndev)
+{
+	return ndev->pdev;
+}
+
+/**
+ * ntb_query_debugfs() - return the debugfs pointer
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the debugfs directory pointer for the NTB
+ * hardware device
+ *
+ * RETURNS: a pointer to the debugfs directory
+ */
+static inline struct dentry *ntb_query_debugfs(struct ntb_device *ndev)
+{
+	return ndev->debugfs_dir;
+}
+
+struct ntb_device *ntb_register_transport(struct pci_dev *pdev,
+					  void *transport);
+void ntb_unregister_transport(struct ntb_device *ndev);
+void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr);
+int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
+			     void *data, int (*db_cb_func)(void *data,
+							   int db_num));
+void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx);
+int ntb_register_event_callback(struct ntb_device *ndev,
+				void (*event_cb_func)(void *handle,
+						      enum ntb_hw_event event));
+void ntb_unregister_event_callback(struct ntb_device *ndev);
+int ntb_get_max_spads(struct ntb_device *ndev);
+int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
+int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
+int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
+int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
+resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
+void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
+u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
+void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int idx);
+void *ntb_find_transport(struct pci_dev *pdev);
+
+int ntb_transport_init(struct pci_dev *pdev);
+void ntb_transport_free(void *transport);
diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c
deleted file mode 100644
index 3f6738612f45..000000000000
--- a/drivers/ntb/ntb_hw.c
+++ /dev/null
@@ -1,1895 +0,0 @@
-/*
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- *   redistributing this file, you may do so under either license.
- *
- *   GPL LICENSE SUMMARY
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   BSD LICENSE
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copy
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel PCIe NTB Linux driver
- *
- * Contact Information:
- * Jon Mason <jon.mason@intel.com>
- */
-#include <linux/debugfs.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include "ntb_hw.h"
-#include "ntb_regs.h"
-
-#define NTB_NAME	"Intel(R) PCI-E Non-Transparent Bridge Driver"
-#define NTB_VER		"1.0"
-
-MODULE_DESCRIPTION(NTB_NAME);
-MODULE_VERSION(NTB_VER);
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Intel Corporation");
-
-enum {
-	NTB_CONN_TRANSPARENT = 0,
-	NTB_CONN_B2B,
-	NTB_CONN_RP,
-};
-
-enum {
-	NTB_DEV_USD = 0,
-	NTB_DEV_DSD,
-};
-
-enum {
-	SNB_HW = 0,
-	BWD_HW,
-};
-
-static struct dentry *debugfs_dir;
-
-#define BWD_LINK_RECOVERY_TIME	500
-
-/* Translate memory window 0,1,2 to BAR 2,4,5 */
-#define MW_TO_BAR(mw)	(mw == 0 ? 2 : (mw == 1 ? 4 : 5))
-
-static const struct pci_device_id ntb_pci_tbl[] = {
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
-	{0}
-};
-MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
-
-static int is_ntb_xeon(struct ntb_device *ndev)
-{
-	switch (ndev->pdev->device) {
-	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
-		return 1;
-	default:
-		return 0;
-	}
-
-	return 0;
-}
-
-static int is_ntb_atom(struct ntb_device *ndev)
-{
-	switch (ndev->pdev->device) {
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_BWD:
-		return 1;
-	default:
-		return 0;
-	}
-
-	return 0;
-}
-
-static void ntb_set_errata_flags(struct ntb_device *ndev)
-{
-	switch (ndev->pdev->device) {
-	/*
-	 * this workaround applies to all platform up to IvyBridge
-	 * Haswell has splitbar support and use a different workaround
-	 */
-	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
-		ndev->wa_flags |= WA_SNB_ERR;
-		break;
-	}
-}
-
-/**
- * ntb_register_event_callback() - register event callback
- * @ndev: pointer to ntb_device instance
- * @func: callback function to register
- *
- * This function registers a callback for any HW driver events such as link
- * up/down, power management notices and etc.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_register_event_callback(struct ntb_device *ndev,
-				void (*func)(void *handle,
-					     enum ntb_hw_event event))
-{
-	if (ndev->event_cb)
-		return -EINVAL;
-
-	ndev->event_cb = func;
-
-	return 0;
-}
-
-/**
- * ntb_unregister_event_callback() - unregisters the event callback
- * @ndev: pointer to ntb_device instance
- *
- * This function unregisters the existing callback from transport
- */
-void ntb_unregister_event_callback(struct ntb_device *ndev)
-{
-	ndev->event_cb = NULL;
-}
-
-static void ntb_irq_work(unsigned long data)
-{
-	struct ntb_db_cb *db_cb = (struct ntb_db_cb *)data;
-	int rc;
-
-	rc = db_cb->callback(db_cb->data, db_cb->db_num);
-	if (rc)
-		tasklet_schedule(&db_cb->irq_work);
-	else {
-		struct ntb_device *ndev = db_cb->ndev;
-		unsigned long mask;
-
-		mask = readw(ndev->reg_ofs.ldb_mask);
-		clear_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-		writew(mask, ndev->reg_ofs.ldb_mask);
-	}
-}
-
-/**
- * ntb_register_db_callback() - register a callback for doorbell interrupt
- * @ndev: pointer to ntb_device instance
- * @idx: doorbell index to register callback, zero based
- * @data: pointer to be returned to caller with every callback
- * @func: callback function to register
- *
- * This function registers a callback function for the doorbell interrupt
- * on the primary side. The function will unmask the doorbell as well to
- * allow interrupt.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
-			     void *data, int (*func)(void *data, int db_num))
-{
-	unsigned long mask;
-
-	if (idx >= ndev->max_cbs || ndev->db_cb[idx].callback) {
-		dev_warn(&ndev->pdev->dev, "Invalid Index.\n");
-		return -EINVAL;
-	}
-
-	ndev->db_cb[idx].callback = func;
-	ndev->db_cb[idx].data = data;
-	ndev->db_cb[idx].ndev = ndev;
-
-	tasklet_init(&ndev->db_cb[idx].irq_work, ntb_irq_work,
-		     (unsigned long) &ndev->db_cb[idx]);
-
-	/* unmask interrupt */
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	clear_bit(idx * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
-
-	return 0;
-}
-
-/**
- * ntb_unregister_db_callback() - unregister a callback for doorbell interrupt
- * @ndev: pointer to ntb_device instance
- * @idx: doorbell index to register callback, zero based
- *
- * This function unregisters a callback function for the doorbell interrupt
- * on the primary side. The function will also mask the said doorbell.
- */
-void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx)
-{
-	unsigned long mask;
-
-	if (idx >= ndev->max_cbs || !ndev->db_cb[idx].callback)
-		return;
-
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(idx * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
-
-	tasklet_disable(&ndev->db_cb[idx].irq_work);
-
-	ndev->db_cb[idx].callback = NULL;
-}
-
-/**
- * ntb_find_transport() - find the transport pointer
- * @transport: pointer to pci device
- *
- * Given the pci device pointer, return the transport pointer passed in when
- * the transport attached when it was inited.
- *
- * RETURNS: pointer to transport.
- */
-void *ntb_find_transport(struct pci_dev *pdev)
-{
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-	return ndev->ntb_transport;
-}
-
-/**
- * ntb_register_transport() - Register NTB transport with NTB HW driver
- * @transport: transport identifier
- *
- * This function allows a transport to reserve the hardware driver for
- * NTB usage.
- *
- * RETURNS: pointer to ntb_device, NULL on error.
- */
-struct ntb_device *ntb_register_transport(struct pci_dev *pdev, void *transport)
-{
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-
-	if (ndev->ntb_transport)
-		return NULL;
-
-	ndev->ntb_transport = transport;
-	return ndev;
-}
-
-/**
- * ntb_unregister_transport() - Unregister the transport with the NTB HW driver
- * @ndev - ntb_device of the transport to be freed
- *
- * This function unregisters the transport from the HW driver and performs any
- * necessary cleanups.
- */
-void ntb_unregister_transport(struct ntb_device *ndev)
-{
-	int i;
-
-	if (!ndev->ntb_transport)
-		return;
-
-	for (i = 0; i < ndev->max_cbs; i++)
-		ntb_unregister_db_callback(ndev, i);
-
-	ntb_unregister_event_callback(ndev);
-	ndev->ntb_transport = NULL;
-}
-
-/**
- * ntb_write_local_spad() - write to the secondary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. This writes over the data mirrored to the local scratchpad register
- * by the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
-{
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
-
-	dev_dbg(&ndev->pdev->dev, "Writing %x to local scratch pad index %d\n",
-		val, idx);
-	writel(val, ndev->reg_ofs.spad_read + idx * 4);
-
-	return 0;
-}
-
-/**
- * ntb_read_local_spad() - read from the primary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.  This allows the local system to read data
- * written and mirrored to the scratchpad register by the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
-{
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
-
-	*val = readl(ndev->reg_ofs.spad_write + idx * 4);
-	dev_dbg(&ndev->pdev->dev,
-		"Reading %x from local scratch pad index %d\n", *val, idx);
-
-	return 0;
-}
-
-/**
- * ntb_write_remote_spad() - write to the secondary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. The register resides on the secondary (external) side.  This allows
- * the local system to write data to be mirrored to the remote systems
- * scratchpad register.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
-{
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
-
-	dev_dbg(&ndev->pdev->dev, "Writing %x to remote scratch pad index %d\n",
-		val, idx);
-	writel(val, ndev->reg_ofs.spad_write + idx * 4);
-
-	return 0;
-}
-
-/**
- * ntb_read_remote_spad() - read from the primary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.  This alloows the local system to read the data
- * it wrote to be mirrored on the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
-{
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
-
-	*val = readl(ndev->reg_ofs.spad_read + idx * 4);
-	dev_dbg(&ndev->pdev->dev,
-		"Reading %x from remote scratch pad index %d\n", *val, idx);
-
-	return 0;
-}
-
-/**
- * ntb_get_mw_base() - get addr for the NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the base address of the memory window specified.
- *
- * RETURNS: address, or NULL on error.
- */
-resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
-{
-	if (mw >= ntb_max_mw(ndev))
-		return 0;
-
-	return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
-}
-
-/**
- * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the base virtual address of the memory window
- * specified.
- *
- * RETURNS: pointer to virtual address, or NULL on error.
- */
-void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw)
-{
-	if (mw >= ntb_max_mw(ndev))
-		return NULL;
-
-	return ndev->mw[mw].vbase;
-}
-
-/**
- * ntb_get_mw_size() - return size of NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the physical size of the memory window specified
- *
- * RETURNS: the size of the memory window or zero on error
- */
-u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw)
-{
-	if (mw >= ntb_max_mw(ndev))
-		return 0;
-
-	return ndev->mw[mw].bar_sz;
-}
-
-/**
- * ntb_set_mw_addr - set the memory window address
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- * @addr: base address for data
- *
- * This function sets the base physical address of the memory window.  This
- * memory address is where data from the remote system will be transfered into
- * or out of depending on how the transport is configured.
- */
-void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr)
-{
-	if (mw >= ntb_max_mw(ndev))
-		return;
-
-	dev_dbg(&ndev->pdev->dev, "Writing addr %Lx to BAR %d\n", addr,
-		MW_TO_BAR(mw));
-
-	ndev->mw[mw].phys_addr = addr;
-
-	switch (MW_TO_BAR(mw)) {
-	case NTB_BAR_23:
-		writeq(addr, ndev->reg_ofs.bar2_xlat);
-		break;
-	case NTB_BAR_4:
-		if (ndev->split_bar)
-			writel(addr, ndev->reg_ofs.bar4_xlat);
-		else
-			writeq(addr, ndev->reg_ofs.bar4_xlat);
-		break;
-	case NTB_BAR_5:
-		writel(addr, ndev->reg_ofs.bar5_xlat);
-		break;
-	}
-}
-
-/**
- * ntb_ring_doorbell() - Set the doorbell on the secondary/external side
- * @ndev: pointer to ntb_device instance
- * @db: doorbell to ring
- *
- * This function allows triggering of a doorbell on the secondary/external
- * side that will initiate an interrupt on the remote host
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int db)
-{
-	dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
-
-	if (ndev->hw_type == BWD_HW)
-		writeq((u64) 1 << db, ndev->reg_ofs.rdb);
-	else
-		writew(((1 << ndev->bits_per_vector) - 1) <<
-		       (db * ndev->bits_per_vector), ndev->reg_ofs.rdb);
-}
-
-static void bwd_recover_link(struct ntb_device *ndev)
-{
-	u32 status;
-
-	/* Driver resets the NTB ModPhy lanes - magic! */
-	writeb(0xe0, ndev->reg_base + BWD_MODPHY_PCSREG6);
-	writeb(0x40, ndev->reg_base + BWD_MODPHY_PCSREG4);
-	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG4);
-	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG6);
-
-	/* Driver waits 100ms to allow the NTB ModPhy to settle */
-	msleep(100);
-
-	/* Clear AER Errors, write to clear */
-	status = readl(ndev->reg_base + BWD_ERRCORSTS_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "ERRCORSTS = %x\n", status);
-	status &= PCI_ERR_COR_REP_ROLL;
-	writel(status, ndev->reg_base + BWD_ERRCORSTS_OFFSET);
-
-	/* Clear unexpected electrical idle event in LTSSM, write to clear */
-	status = readl(ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "LTSSMERRSTS0 = %x\n", status);
-	status |= BWD_LTSSMERRSTS0_UNEXPECTEDEI;
-	writel(status, ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
-
-	/* Clear DeSkew Buffer error, write to clear */
-	status = readl(ndev->reg_base + BWD_DESKEWSTS_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "DESKEWSTS = %x\n", status);
-	status |= BWD_DESKEWSTS_DBERR;
-	writel(status, ndev->reg_base + BWD_DESKEWSTS_OFFSET);
-
-	status = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "IBSTERRRCRVSTS0 = %x\n", status);
-	status &= BWD_IBIST_ERR_OFLOW;
-	writel(status, ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-
-	/* Releases the NTB state machine to allow the link to retrain */
-	status = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "LTSSMSTATEJMP = %x\n", status);
-	status &= ~BWD_LTSSMSTATEJMP_FORCEDETECT;
-	writel(status, ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-}
-
-static void ntb_link_event(struct ntb_device *ndev, int link_state)
-{
-	unsigned int event;
-
-	if (ndev->link_status == link_state)
-		return;
-
-	if (link_state == NTB_LINK_UP) {
-		u16 status;
-
-		dev_info(&ndev->pdev->dev, "Link Up\n");
-		ndev->link_status = NTB_LINK_UP;
-		event = NTB_EVENT_HW_LINK_UP;
-
-		if (is_ntb_atom(ndev) ||
-		    ndev->conn_type == NTB_CONN_TRANSPARENT)
-			status = readw(ndev->reg_ofs.lnk_stat);
-		else {
-			int rc = pci_read_config_word(ndev->pdev,
-						      SNB_LINK_STATUS_OFFSET,
-						      &status);
-			if (rc)
-				return;
-		}
-
-		ndev->link_width = (status & NTB_LINK_WIDTH_MASK) >> 4;
-		ndev->link_speed = (status & NTB_LINK_SPEED_MASK);
-		dev_info(&ndev->pdev->dev, "Link Width %d, Link Speed %d\n",
-			 ndev->link_width, ndev->link_speed);
-	} else {
-		dev_info(&ndev->pdev->dev, "Link Down\n");
-		ndev->link_status = NTB_LINK_DOWN;
-		event = NTB_EVENT_HW_LINK_DOWN;
-		/* Don't modify link width/speed, we need it in link recovery */
-	}
-
-	/* notify the upper layer if we have an event change */
-	if (ndev->event_cb)
-		ndev->event_cb(ndev->ntb_transport, event);
-}
-
-static int ntb_link_status(struct ntb_device *ndev)
-{
-	int link_state;
-
-	if (is_ntb_atom(ndev)) {
-		u32 ntb_cntl;
-
-		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-		if (ntb_cntl & BWD_CNTL_LINK_DOWN)
-			link_state = NTB_LINK_DOWN;
-		else
-			link_state = NTB_LINK_UP;
-	} else {
-		u16 status;
-		int rc;
-
-		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
-					  &status);
-		if (rc)
-			return rc;
-
-		if (status & NTB_LINK_STATUS_ACTIVE)
-			link_state = NTB_LINK_UP;
-		else
-			link_state = NTB_LINK_DOWN;
-	}
-
-	ntb_link_event(ndev, link_state);
-
-	return 0;
-}
-
-static void bwd_link_recovery(struct work_struct *work)
-{
-	struct ntb_device *ndev = container_of(work, struct ntb_device,
-					       lr_timer.work);
-	u32 status32;
-
-	bwd_recover_link(ndev);
-	/* There is a potential race between the 2 NTB devices recovering at the
-	 * same time.  If the times are the same, the link will not recover and
-	 * the driver will be stuck in this loop forever.  Add a random interval
-	 * to the recovery time to prevent this race.
-	 */
-	msleep(BWD_LINK_RECOVERY_TIME + prandom_u32() % BWD_LINK_RECOVERY_TIME);
-
-	status32 = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-	if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT)
-		goto retry;
-
-	status32 = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-	if (status32 & BWD_IBIST_ERR_OFLOW)
-		goto retry;
-
-	status32 = readl(ndev->reg_ofs.lnk_cntl);
-	if (!(status32 & BWD_CNTL_LINK_DOWN)) {
-		unsigned char speed, width;
-		u16 status16;
-
-		status16 = readw(ndev->reg_ofs.lnk_stat);
-		width = (status16 & NTB_LINK_WIDTH_MASK) >> 4;
-		speed = (status16 & NTB_LINK_SPEED_MASK);
-		if (ndev->link_width != width || ndev->link_speed != speed)
-			goto retry;
-	}
-
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
-	return;
-
-retry:
-	schedule_delayed_work(&ndev->lr_timer, NTB_HB_TIMEOUT);
-}
-
-/* BWD doesn't have link status interrupt, poll on that platform */
-static void bwd_link_poll(struct work_struct *work)
-{
-	struct ntb_device *ndev = container_of(work, struct ntb_device,
-					       hb_timer.work);
-	unsigned long ts = jiffies;
-
-	/* If we haven't gotten an interrupt in a while, check the BWD link
-	 * status bit
-	 */
-	if (ts > ndev->last_ts + NTB_HB_TIMEOUT) {
-		int rc = ntb_link_status(ndev);
-		if (rc)
-			dev_err(&ndev->pdev->dev,
-				"Error determining link status\n");
-
-		/* Check to see if a link error is the cause of the link down */
-		if (ndev->link_status == NTB_LINK_DOWN) {
-			u32 status32 = readl(ndev->reg_base +
-					     BWD_LTSSMSTATEJMP_OFFSET);
-			if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT) {
-				schedule_delayed_work(&ndev->lr_timer, 0);
-				return;
-			}
-		}
-	}
-
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
-}
-
-static int ntb_xeon_setup(struct ntb_device *ndev)
-{
-	switch (ndev->conn_type) {
-	case NTB_CONN_B2B:
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
-		if (ndev->split_bar)
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
-		ndev->limits.max_spads = SNB_MAX_B2B_SPADS;
-
-		/* There is a Xeon hardware errata related to writes to
-		 * SDOORBELL or B2BDOORBELL in conjunction with inbound access
-		 * to NTB MMIO Space, which may hang the system.  To workaround
-		 * this use the second memory window to access the interrupt and
-		 * scratch pad registers on the remote system.
-		 */
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			if (!ndev->mw[ndev->limits.max_mw - 1].bar_sz)
-				return -EINVAL;
-
-			ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-			ndev->reg_ofs.spad_write =
-				ndev->mw[ndev->limits.max_mw - 1].vbase +
-				SNB_SPAD_OFFSET;
-			ndev->reg_ofs.rdb =
-				ndev->mw[ndev->limits.max_mw - 1].vbase +
-				SNB_PDOORBELL_OFFSET;
-
-			/* Set the Limit register to 4k, the minimum size, to
-			 * prevent an illegal access
-			 */
-			writeq(ndev->mw[1].bar_sz + 0x1000, ndev->reg_base +
-			       SNB_PBAR4LMT_OFFSET);
-			/* HW errata on the Limit registers.  They can only be
-			 * written when the base register is 4GB aligned and
-			 * < 32bit.  This should already be the case based on
-			 * the driver defaults, but write the Limit registers
-			 * first just in case.
-			 */
-
-			ndev->limits.max_mw = SNB_ERRATA_MAX_MW;
-		} else {
-			/* HW Errata on bit 14 of b2bdoorbell register.  Writes
-			 * will not be mirrored to the remote system.  Shrink
-			 * the number of bits by one, since bit 14 is the last
-			 * bit.
-			 */
-			ndev->limits.max_db_bits = SNB_MAX_DB_BITS - 1;
-			ndev->reg_ofs.spad_write = ndev->reg_base +
-						   SNB_B2B_SPAD_OFFSET;
-			ndev->reg_ofs.rdb = ndev->reg_base +
-					    SNB_B2B_DOORBELL_OFFSET;
-
-			/* Disable the Limit register, just incase it is set to
-			 * something silly. A 64bit write should handle it
-			 * regardless of whether it has a split BAR or not.
-			 */
-			writeq(0, ndev->reg_base + SNB_PBAR4LMT_OFFSET);
-			/* HW errata on the Limit registers.  They can only be
-			 * written when the base register is 4GB aligned and
-			 * < 32bit.  This should already be the case based on
-			 * the driver defaults, but write the Limit registers
-			 * first just in case.
-			 */
-			if (ndev->split_bar)
-				ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-			else
-				ndev->limits.max_mw = SNB_MAX_MW;
-		}
-
-		/* The Xeon errata workaround requires setting SBAR Base
-		 * addresses to known values, so that the PBAR XLAT can be
-		 * pointed at SBAR0 of the remote system.
-		 */
-		if (ndev->dev_type == NTB_DEV_USD) {
-			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
-			       SNB_PBAR2XLAT_OFFSET);
-			if (ndev->wa_flags & WA_SNB_ERR)
-				writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
-				       SNB_PBAR4XLAT_OFFSET);
-			else {
-				if (ndev->split_bar) {
-					writel(SNB_MBAR4_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-					writel(SNB_MBAR5_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR5XLAT_OFFSET);
-				} else
-					writeq(SNB_MBAR4_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-
-				/* B2B_XLAT_OFFSET is a 64bit register, but can
-				 * only take 32bit writes
-				 */
-				writel(SNB_MBAR01_DSD_ADDR & 0xffffffff,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
-				writel(SNB_MBAR01_DSD_ADDR >> 32,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
-			}
-
-			writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
-			       SNB_SBAR0BASE_OFFSET);
-			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
-			       SNB_SBAR2BASE_OFFSET);
-			if (ndev->split_bar) {
-				writel(SNB_MBAR4_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-				writel(SNB_MBAR5_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR5BASE_OFFSET);
-			} else
-				writeq(SNB_MBAR4_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-		} else {
-			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
-			       SNB_PBAR2XLAT_OFFSET);
-			if (ndev->wa_flags & WA_SNB_ERR)
-				writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
-				       SNB_PBAR4XLAT_OFFSET);
-			else {
-				if (ndev->split_bar) {
-					writel(SNB_MBAR4_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-					writel(SNB_MBAR5_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR5XLAT_OFFSET);
-				} else
-					writeq(SNB_MBAR4_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-
-				/*
-				 * B2B_XLAT_OFFSET is a 64bit register, but can
-				 * only take 32bit writes
-				 */
-				writel(SNB_MBAR01_USD_ADDR & 0xffffffff,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
-				writel(SNB_MBAR01_USD_ADDR >> 32,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
-			}
-			writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
-			       SNB_SBAR0BASE_OFFSET);
-			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
-			       SNB_SBAR2BASE_OFFSET);
-			if (ndev->split_bar) {
-				writel(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-				writel(SNB_MBAR5_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR5BASE_OFFSET);
-			} else
-				writeq(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-
-		}
-		break;
-	case NTB_CONN_RP:
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			dev_err(&ndev->pdev->dev,
-				"NTB-RP disabled due to hardware errata.\n");
-			return -EINVAL;
-		}
-
-		/* Scratch pads need to have exclusive access from the primary
-		 * or secondary side.  Halve the num spads so that each side can
-		 * have an equal amount.
-		 */
-		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
-		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-		/* Note: The SDOORBELL is the cause of the errata.  You REALLY
-		 * don't want to touch it.
-		 */
-		ndev->reg_ofs.rdb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
-		/* Offset the start of the spads to correspond to whether it is
-		 * primary or secondary
-		 */
-		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET +
-					   ndev->limits.max_spads * 4;
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
-		if (ndev->split_bar) {
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
-			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-		} else
-			ndev->limits.max_mw = SNB_MAX_MW;
-		break;
-	case NTB_CONN_TRANSPARENT:
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			dev_err(&ndev->pdev->dev,
-				"NTB-TRANSPARENT disabled due to hardware errata.\n");
-			return -EINVAL;
-		}
-
-		/* Scratch pads need to have exclusive access from the primary
-		 * or secondary side.  Halve the num spads so that each side can
-		 * have an equal amount.
-		 */
-		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
-		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-		ndev->reg_ofs.rdb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_SDBMSK_OFFSET;
-		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET;
-		/* Offset the start of the spads to correspond to whether it is
-		 * primary or secondary
-		 */
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET +
-					  ndev->limits.max_spads * 4;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_PBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_PBAR4XLAT_OFFSET;
-
-		if (ndev->split_bar) {
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_PBAR5XLAT_OFFSET;
-			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-		} else
-			ndev->limits.max_mw = SNB_MAX_MW;
-		break;
-	default:
-		/*
-		 * we should never hit this. the detect function should've
-		 * take cared of everything.
-		 */
-		return -EINVAL;
-	}
-
-	ndev->reg_ofs.lnk_cntl = ndev->reg_base + SNB_NTBCNTL_OFFSET;
-	ndev->reg_ofs.lnk_stat = ndev->reg_base + SNB_SLINK_STATUS_OFFSET;
-	ndev->reg_ofs.spci_cmd = ndev->reg_base + SNB_PCICMD_OFFSET;
-
-	ndev->limits.msix_cnt = SNB_MSIX_CNT;
-	ndev->bits_per_vector = SNB_DB_BITS_PER_VEC;
-
-	return 0;
-}
-
-static int ntb_bwd_setup(struct ntb_device *ndev)
-{
-	int rc;
-	u32 val;
-
-	ndev->hw_type = BWD_HW;
-
-	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &val);
-	if (rc)
-		return rc;
-
-	switch ((val & BWD_PPD_CONN_TYPE) >> 8) {
-	case NTB_CONN_B2B:
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-	default:
-		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
-		return -EINVAL;
-	}
-
-	if (val & BWD_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_DSD;
-	else
-		ndev->dev_type = NTB_DEV_USD;
-
-	/* Initiate PCI-E link training */
-	rc = pci_write_config_dword(ndev->pdev, NTB_PPD_OFFSET,
-				    val | BWD_PPD_INIT_LINK);
-	if (rc)
-		return rc;
-
-	ndev->reg_ofs.ldb = ndev->reg_base + BWD_PDOORBELL_OFFSET;
-	ndev->reg_ofs.ldb_mask = ndev->reg_base + BWD_PDBMSK_OFFSET;
-	ndev->reg_ofs.rdb = ndev->reg_base + BWD_B2B_DOORBELL_OFFSET;
-	ndev->reg_ofs.bar2_xlat = ndev->reg_base + BWD_SBAR2XLAT_OFFSET;
-	ndev->reg_ofs.bar4_xlat = ndev->reg_base + BWD_SBAR4XLAT_OFFSET;
-	ndev->reg_ofs.lnk_cntl = ndev->reg_base + BWD_NTBCNTL_OFFSET;
-	ndev->reg_ofs.lnk_stat = ndev->reg_base + BWD_LINK_STATUS_OFFSET;
-	ndev->reg_ofs.spad_read = ndev->reg_base + BWD_SPAD_OFFSET;
-	ndev->reg_ofs.spad_write = ndev->reg_base + BWD_B2B_SPAD_OFFSET;
-	ndev->reg_ofs.spci_cmd = ndev->reg_base + BWD_PCICMD_OFFSET;
-	ndev->limits.max_mw = BWD_MAX_MW;
-	ndev->limits.max_spads = BWD_MAX_SPADS;
-	ndev->limits.max_db_bits = BWD_MAX_DB_BITS;
-	ndev->limits.msix_cnt = BWD_MSIX_CNT;
-	ndev->bits_per_vector = BWD_DB_BITS_PER_VEC;
-
-	/* Since bwd doesn't have a link interrupt, setup a poll timer */
-	INIT_DELAYED_WORK(&ndev->hb_timer, bwd_link_poll);
-	INIT_DELAYED_WORK(&ndev->lr_timer, bwd_link_recovery);
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
-
-	return 0;
-}
-
-static int ntb_device_setup(struct ntb_device *ndev)
-{
-	int rc;
-
-	if (is_ntb_xeon(ndev))
-		rc = ntb_xeon_setup(ndev);
-	else if (is_ntb_atom(ndev))
-		rc = ntb_bwd_setup(ndev);
-	else
-		rc = -ENODEV;
-
-	if (rc)
-		return rc;
-
-	if (ndev->conn_type == NTB_CONN_B2B)
-		/* Enable Bus Master and Memory Space on the secondary side */
-		writew(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
-		       ndev->reg_ofs.spci_cmd);
-
-	return 0;
-}
-
-static void ntb_device_free(struct ntb_device *ndev)
-{
-	if (is_ntb_atom(ndev)) {
-		cancel_delayed_work_sync(&ndev->hb_timer);
-		cancel_delayed_work_sync(&ndev->lr_timer);
-	}
-}
-
-static irqreturn_t bwd_callback_msix_irq(int irq, void *data)
-{
-	struct ntb_db_cb *db_cb = data;
-	struct ntb_device *ndev = db_cb->ndev;
-	unsigned long mask;
-
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
-		db_cb->db_num);
-
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
-
-	tasklet_schedule(&db_cb->irq_work);
-
-	/* No need to check for the specific HB irq, any interrupt means
-	 * we're connected.
-	 */
-	ndev->last_ts = jiffies;
-
-	writeq((u64) 1 << db_cb->db_num, ndev->reg_ofs.ldb);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t xeon_callback_msix_irq(int irq, void *data)
-{
-	struct ntb_db_cb *db_cb = data;
-	struct ntb_device *ndev = db_cb->ndev;
-	unsigned long mask;
-
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
-		db_cb->db_num);
-
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
-
-	tasklet_schedule(&db_cb->irq_work);
-
-	/* On Sandybridge, there are 16 bits in the interrupt register
-	 * but only 4 vectors.  So, 5 bits are assigned to the first 3
-	 * vectors, with the 4th having a single bit for link
-	 * interrupts.
-	 */
-	writew(((1 << ndev->bits_per_vector) - 1) <<
-	       (db_cb->db_num * ndev->bits_per_vector), ndev->reg_ofs.ldb);
-
-	return IRQ_HANDLED;
-}
-
-/* Since we do not have a HW doorbell in BWD, this is only used in JF/JT */
-static irqreturn_t xeon_event_msix_irq(int irq, void *dev)
-{
-	struct ntb_device *ndev = dev;
-	int rc;
-
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for Events\n", irq);
-
-	rc = ntb_link_status(ndev);
-	if (rc)
-		dev_err(&ndev->pdev->dev, "Error determining link status\n");
-
-	/* bit 15 is always the link bit */
-	writew(1 << SNB_LINK_DB, ndev->reg_ofs.ldb);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t ntb_interrupt(int irq, void *dev)
-{
-	struct ntb_device *ndev = dev;
-	unsigned int i = 0;
-
-	if (is_ntb_atom(ndev)) {
-		u64 ldb = readq(ndev->reg_ofs.ldb);
-
-		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %Lx\n", irq, ldb);
-
-		while (ldb) {
-			i = __ffs(ldb);
-			ldb &= ldb - 1;
-			bwd_callback_msix_irq(irq, &ndev->db_cb[i]);
-		}
-	} else {
-		u16 ldb = readw(ndev->reg_ofs.ldb);
-
-		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %x\n", irq, ldb);
-
-		if (ldb & SNB_DB_HW_LINK) {
-			xeon_event_msix_irq(irq, dev);
-			ldb &= ~SNB_DB_HW_LINK;
-		}
-
-		while (ldb) {
-			i = __ffs(ldb);
-			ldb &= ldb - 1;
-			xeon_callback_msix_irq(irq, &ndev->db_cb[i]);
-		}
-	}
-
-	return IRQ_HANDLED;
-}
-
-static int ntb_setup_snb_msix(struct ntb_device *ndev, int msix_entries)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	struct msix_entry *msix;
-	int rc, i;
-
-	if (msix_entries < ndev->limits.msix_cnt)
-		return -ENOSPC;
-
-	rc = pci_enable_msix_exact(pdev, ndev->msix_entries, msix_entries);
-	if (rc < 0)
-		return rc;
-
-	for (i = 0; i < msix_entries; i++) {
-		msix = &ndev->msix_entries[i];
-		WARN_ON(!msix->vector);
-
-		if (i == msix_entries - 1) {
-			rc = request_irq(msix->vector,
-					 xeon_event_msix_irq, 0,
-					 "ntb-event-msix", ndev);
-			if (rc)
-				goto err;
-		} else {
-			rc = request_irq(msix->vector,
-					 xeon_callback_msix_irq, 0,
-					 "ntb-callback-msix",
-					 &ndev->db_cb[i]);
-			if (rc)
-				goto err;
-		}
-	}
-
-	ndev->num_msix = msix_entries;
-	ndev->max_cbs = msix_entries - 1;
-
-	return 0;
-
-err:
-	while (--i >= 0) {
-		/* Code never reaches here for entry nr 'ndev->num_msix - 1' */
-		msix = &ndev->msix_entries[i];
-		free_irq(msix->vector, &ndev->db_cb[i]);
-	}
-
-	pci_disable_msix(pdev);
-	ndev->num_msix = 0;
-
-	return rc;
-}
-
-static int ntb_setup_bwd_msix(struct ntb_device *ndev, int msix_entries)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	struct msix_entry *msix;
-	int rc, i;
-
-	msix_entries = pci_enable_msix_range(pdev, ndev->msix_entries,
-					     1, msix_entries);
-	if (msix_entries < 0)
-		return msix_entries;
-
-	for (i = 0; i < msix_entries; i++) {
-		msix = &ndev->msix_entries[i];
-		WARN_ON(!msix->vector);
-
-		rc = request_irq(msix->vector, bwd_callback_msix_irq, 0,
-				 "ntb-callback-msix", &ndev->db_cb[i]);
-		if (rc)
-			goto err;
-	}
-
-	ndev->num_msix = msix_entries;
-	ndev->max_cbs = msix_entries;
-
-	return 0;
-
-err:
-	while (--i >= 0)
-		free_irq(msix->vector, &ndev->db_cb[i]);
-
-	pci_disable_msix(pdev);
-	ndev->num_msix = 0;
-
-	return rc;
-}
-
-static int ntb_setup_msix(struct ntb_device *ndev)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	int msix_entries;
-	int rc, i;
-
-	msix_entries = pci_msix_vec_count(pdev);
-	if (msix_entries < 0) {
-		rc = msix_entries;
-		goto err;
-	} else if (msix_entries > ndev->limits.msix_cnt) {
-		rc = -EINVAL;
-		goto err;
-	}
-
-	ndev->msix_entries = kmalloc(sizeof(struct msix_entry) * msix_entries,
-				     GFP_KERNEL);
-	if (!ndev->msix_entries) {
-		rc = -ENOMEM;
-		goto err;
-	}
-
-	for (i = 0; i < msix_entries; i++)
-		ndev->msix_entries[i].entry = i;
-
-	if (is_ntb_atom(ndev))
-		rc = ntb_setup_bwd_msix(ndev, msix_entries);
-	else
-		rc = ntb_setup_snb_msix(ndev, msix_entries);
-	if (rc)
-		goto err1;
-
-	return 0;
-
-err1:
-	kfree(ndev->msix_entries);
-err:
-	dev_err(&pdev->dev, "Error allocating MSI-X interrupt\n");
-	return rc;
-}
-
-static int ntb_setup_msi(struct ntb_device *ndev)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	int rc;
-
-	rc = pci_enable_msi(pdev);
-	if (rc)
-		return rc;
-
-	rc = request_irq(pdev->irq, ntb_interrupt, 0, "ntb-msi", ndev);
-	if (rc) {
-		pci_disable_msi(pdev);
-		dev_err(&pdev->dev, "Error allocating MSI interrupt\n");
-		return rc;
-	}
-
-	return 0;
-}
-
-static int ntb_setup_intx(struct ntb_device *ndev)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	int rc;
-
-	/* Verify intx is enabled */
-	pci_intx(pdev, 1);
-
-	rc = request_irq(pdev->irq, ntb_interrupt, IRQF_SHARED, "ntb-intx",
-			 ndev);
-	if (rc)
-		return rc;
-
-	return 0;
-}
-
-static int ntb_setup_interrupts(struct ntb_device *ndev)
-{
-	int rc;
-
-	/* On BWD, disable all interrupts.  On SNB, disable all but Link
-	 * Interrupt.  The rest will be unmasked as callbacks are registered.
-	 */
-	if (is_ntb_atom(ndev))
-		writeq(~0, ndev->reg_ofs.ldb_mask);
-	else {
-		u16 var = 1 << SNB_LINK_DB;
-		writew(~var, ndev->reg_ofs.ldb_mask);
-	}
-
-	rc = ntb_setup_msix(ndev);
-	if (!rc)
-		goto done;
-
-	ndev->bits_per_vector = 1;
-	ndev->max_cbs = ndev->limits.max_db_bits;
-
-	rc = ntb_setup_msi(ndev);
-	if (!rc)
-		goto done;
-
-	rc = ntb_setup_intx(ndev);
-	if (rc) {
-		dev_err(&ndev->pdev->dev, "no usable interrupts\n");
-		return rc;
-	}
-
-done:
-	return 0;
-}
-
-static void ntb_free_interrupts(struct ntb_device *ndev)
-{
-	struct pci_dev *pdev = ndev->pdev;
-
-	/* mask interrupts */
-	if (is_ntb_atom(ndev))
-		writeq(~0, ndev->reg_ofs.ldb_mask);
-	else
-		writew(~0, ndev->reg_ofs.ldb_mask);
-
-	if (ndev->num_msix) {
-		struct msix_entry *msix;
-		u32 i;
-
-		for (i = 0; i < ndev->num_msix; i++) {
-			msix = &ndev->msix_entries[i];
-			if (is_ntb_xeon(ndev) && i == ndev->num_msix - 1)
-				free_irq(msix->vector, ndev);
-			else
-				free_irq(msix->vector, &ndev->db_cb[i]);
-		}
-		pci_disable_msix(pdev);
-		kfree(ndev->msix_entries);
-	} else {
-		free_irq(pdev->irq, ndev);
-
-		if (pci_dev_msi_enabled(pdev))
-			pci_disable_msi(pdev);
-	}
-}
-
-static int ntb_create_callbacks(struct ntb_device *ndev)
-{
-	int i;
-
-	/* Chicken-egg issue.  We won't know how many callbacks are necessary
-	 * until we see how many MSI-X vectors we get, but these pointers need
-	 * to be passed into the MSI-X register function.  So, we allocate the
-	 * max, knowing that they might not all be used, to work around this.
-	 */
-	ndev->db_cb = kcalloc(ndev->limits.max_db_bits,
-			      sizeof(struct ntb_db_cb),
-			      GFP_KERNEL);
-	if (!ndev->db_cb)
-		return -ENOMEM;
-
-	for (i = 0; i < ndev->limits.max_db_bits; i++) {
-		ndev->db_cb[i].db_num = i;
-		ndev->db_cb[i].ndev = ndev;
-	}
-
-	return 0;
-}
-
-static void ntb_free_callbacks(struct ntb_device *ndev)
-{
-	int i;
-
-	for (i = 0; i < ndev->limits.max_db_bits; i++)
-		ntb_unregister_db_callback(ndev, i);
-
-	kfree(ndev->db_cb);
-}
-
-static ssize_t ntb_debugfs_read(struct file *filp, char __user *ubuf,
-				size_t count, loff_t *offp)
-{
-	struct ntb_device *ndev;
-	char *buf;
-	ssize_t ret, offset, out_count;
-
-	out_count = 500;
-
-	buf = kmalloc(out_count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	ndev = filp->private_data;
-	offset = 0;
-	offset += snprintf(buf + offset, out_count - offset,
-			   "NTB Device Information:\n");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Connection Type - \t\t%s\n",
-			   ndev->conn_type == NTB_CONN_TRANSPARENT ?
-			   "Transparent" : (ndev->conn_type == NTB_CONN_B2B) ?
-			   "Back to back" : "Root Port");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Device Type - \t\t\t%s\n",
-			   ndev->dev_type == NTB_DEV_USD ?
-			   "DSD/USP" : "USD/DSP");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Max Number of Callbacks - \t%u\n",
-			   ntb_max_cbs(ndev));
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Link Status - \t\t\t%s\n",
-			   ntb_hw_link_status(ndev) ? "Up" : "Down");
-	if (ntb_hw_link_status(ndev)) {
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Link Speed - \t\t\tPCI-E Gen %u\n",
-				   ndev->link_speed);
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Link Width - \t\t\tx%u\n",
-				   ndev->link_width);
-	}
-
-	if (is_ntb_xeon(ndev)) {
-		u32 status32;
-		u16 status16;
-		int rc;
-
-		offset += snprintf(buf + offset, out_count - offset,
-				   "\nNTB Device Statistics:\n");
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Upstream Memory Miss - \t%u\n",
-				   readw(ndev->reg_base +
-					 SNB_USMEMMISS_OFFSET));
-
-		offset += snprintf(buf + offset, out_count - offset,
-				   "\nNTB Hardware Errors:\n");
-
-		rc = pci_read_config_word(ndev->pdev, SNB_DEVSTS_OFFSET,
-					  &status16);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "DEVSTS - \t%#06x\n", status16);
-
-		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
-					  &status16);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "LNKSTS - \t%#06x\n", status16);
-
-		rc = pci_read_config_dword(ndev->pdev, SNB_UNCERRSTS_OFFSET,
-					   &status32);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "UNCERRSTS - \t%#010x\n", status32);
-
-		rc = pci_read_config_dword(ndev->pdev, SNB_CORERRSTS_OFFSET,
-					   &status32);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "CORERRSTS - \t%#010x\n", status32);
-	}
-
-	if (offset > out_count)
-		offset = out_count;
-
-	ret = simple_read_from_buffer(ubuf, count, offp, buf, offset);
-	kfree(buf);
-	return ret;
-}
-
-static const struct file_operations ntb_debugfs_info = {
-	.owner = THIS_MODULE,
-	.open = simple_open,
-	.read = ntb_debugfs_read,
-};
-
-static void ntb_setup_debugfs(struct ntb_device *ndev)
-{
-	if (!debugfs_initialized())
-		return;
-
-	if (!debugfs_dir)
-		debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
-
-	ndev->debugfs_dir = debugfs_create_dir(pci_name(ndev->pdev),
-					       debugfs_dir);
-	if (ndev->debugfs_dir)
-		ndev->debugfs_info = debugfs_create_file("info", S_IRUSR,
-							 ndev->debugfs_dir,
-							 ndev,
-							 &ntb_debugfs_info);
-}
-
-static void ntb_free_debugfs(struct ntb_device *ndev)
-{
-	debugfs_remove_recursive(ndev->debugfs_dir);
-
-	if (debugfs_dir && simple_empty(debugfs_dir)) {
-		debugfs_remove_recursive(debugfs_dir);
-		debugfs_dir = NULL;
-	}
-}
-
-static void ntb_hw_link_up(struct ntb_device *ndev)
-{
-	if (ndev->conn_type == NTB_CONN_TRANSPARENT)
-		ntb_link_event(ndev, NTB_LINK_UP);
-	else {
-		u32 ntb_cntl;
-
-		/* Let's bring the NTB link up */
-		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-		ntb_cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
-		ntb_cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP;
-		ntb_cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
-		if (ndev->split_bar)
-			ntb_cntl |= NTB_CNTL_P2S_BAR5_SNOOP |
-				    NTB_CNTL_S2P_BAR5_SNOOP;
-
-		writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
-	}
-}
-
-static void ntb_hw_link_down(struct ntb_device *ndev)
-{
-	u32 ntb_cntl;
-
-	if (ndev->conn_type == NTB_CONN_TRANSPARENT) {
-		ntb_link_event(ndev, NTB_LINK_DOWN);
-		return;
-	}
-
-	/* Bring NTB link down */
-	ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-	ntb_cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP);
-	ntb_cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP);
-	if (ndev->split_bar)
-		ntb_cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP |
-			      NTB_CNTL_S2P_BAR5_SNOOP);
-	ntb_cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
-	writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
-}
-
-static void ntb_max_mw_detect(struct ntb_device *ndev)
-{
-	if (ndev->split_bar)
-		ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-	else
-		ndev->limits.max_mw = SNB_MAX_MW;
-}
-
-static int ntb_xeon_detect(struct ntb_device *ndev)
-{
-	int rc, bars_mask;
-	u32 bars;
-	u8 ppd;
-
-	ndev->hw_type = SNB_HW;
-
-	rc = pci_read_config_byte(ndev->pdev, NTB_PPD_OFFSET, &ppd);
-	if (rc)
-		return -EIO;
-
-	if (ppd & SNB_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_USD;
-	else
-		ndev->dev_type = NTB_DEV_DSD;
-
-	ndev->split_bar = (ppd & SNB_PPD_SPLIT_BAR) ? 1 : 0;
-
-	switch (ppd & SNB_PPD_CONN_TYPE) {
-	case NTB_CONN_B2B:
-		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-		dev_info(&ndev->pdev->dev, "Conn Type = RP\n");
-		ndev->conn_type = NTB_CONN_RP;
-		break;
-	case NTB_CONN_TRANSPARENT:
-		dev_info(&ndev->pdev->dev, "Conn Type = TRANSPARENT\n");
-		ndev->conn_type = NTB_CONN_TRANSPARENT;
-		/*
-		 * This mode is default to USD/DSP. HW does not report
-		 * properly in transparent mode as it has no knowledge of
-		 * NTB. We will just force correct here.
-		 */
-		ndev->dev_type = NTB_DEV_USD;
-
-		/*
-		 * This is a way for transparent BAR to figure out if we
-		 * are doing split BAR or not. There is no way for the hw
-		 * on the transparent side to know and set the PPD.
-		 */
-		bars_mask = pci_select_bars(ndev->pdev, IORESOURCE_MEM);
-		bars = hweight32(bars_mask);
-		if (bars == (HSX_SPLITBAR_MAX_MW + 1))
-			ndev->split_bar = 1;
-
-		break;
-	default:
-		dev_err(&ndev->pdev->dev, "Unknown PPD %x\n", ppd);
-		return -ENODEV;
-	}
-
-	ntb_max_mw_detect(ndev);
-
-	return 0;
-}
-
-static int ntb_atom_detect(struct ntb_device *ndev)
-{
-	int rc;
-	u32 ppd;
-
-	ndev->hw_type = BWD_HW;
-	ndev->limits.max_mw = BWD_MAX_MW;
-
-	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &ppd);
-	if (rc)
-		return rc;
-
-	switch ((ppd & BWD_PPD_CONN_TYPE) >> 8) {
-	case NTB_CONN_B2B:
-		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-	default:
-		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
-		return -EINVAL;
-	}
-
-	if (ppd & BWD_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_DSD;
-	else
-		ndev->dev_type = NTB_DEV_USD;
-
-	return 0;
-}
-
-static int ntb_device_detect(struct ntb_device *ndev)
-{
-	int rc;
-
-	if (is_ntb_xeon(ndev))
-		rc = ntb_xeon_detect(ndev);
-	else if (is_ntb_atom(ndev))
-		rc = ntb_atom_detect(ndev);
-	else
-		rc = -ENODEV;
-
-	dev_info(&ndev->pdev->dev, "Device Type = %s\n",
-		 ndev->dev_type == NTB_DEV_USD ? "USD/DSP" : "DSD/USP");
-
-	return 0;
-}
-
-static int ntb_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
-	struct ntb_device *ndev;
-	int rc, i;
-
-	ndev = kzalloc(sizeof(struct ntb_device), GFP_KERNEL);
-	if (!ndev)
-		return -ENOMEM;
-
-	ndev->pdev = pdev;
-
-	ntb_set_errata_flags(ndev);
-
-	ndev->link_status = NTB_LINK_DOWN;
-	pci_set_drvdata(pdev, ndev);
-	ntb_setup_debugfs(ndev);
-
-	rc = pci_enable_device(pdev);
-	if (rc)
-		goto err;
-
-	pci_set_master(ndev->pdev);
-
-	rc = ntb_device_detect(ndev);
-	if (rc)
-		goto err;
-
-	ndev->mw = kcalloc(ndev->limits.max_mw, sizeof(struct ntb_mw),
-			   GFP_KERNEL);
-	if (!ndev->mw) {
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	if (ndev->split_bar)
-		rc = pci_request_selected_regions(pdev, NTB_SPLITBAR_MASK,
-						  KBUILD_MODNAME);
-	else
-		rc = pci_request_selected_regions(pdev, NTB_BAR_MASK,
-						  KBUILD_MODNAME);
-
-	if (rc)
-		goto err2;
-
-	ndev->reg_base = pci_ioremap_bar(pdev, NTB_BAR_MMIO);
-	if (!ndev->reg_base) {
-		dev_warn(&pdev->dev, "Cannot remap BAR 0\n");
-		rc = -EIO;
-		goto err3;
-	}
-
-	for (i = 0; i < ndev->limits.max_mw; i++) {
-		ndev->mw[i].bar_sz = pci_resource_len(pdev, MW_TO_BAR(i));
-
-		/*
-		 * with the errata we need to steal last of the memory
-		 * windows for workarounds and they point to MMIO registers.
-		 */
-		if ((ndev->wa_flags & WA_SNB_ERR) &&
-		    (i == (ndev->limits.max_mw - 1))) {
-			ndev->mw[i].vbase =
-				ioremap_nocache(pci_resource_start(pdev,
-							MW_TO_BAR(i)),
-						ndev->mw[i].bar_sz);
-		} else {
-			ndev->mw[i].vbase =
-				ioremap_wc(pci_resource_start(pdev,
-							MW_TO_BAR(i)),
-					   ndev->mw[i].bar_sz);
-		}
-
-		dev_info(&pdev->dev, "MW %d size %llu\n", i,
-			 (unsigned long long) ndev->mw[i].bar_sz);
-		if (!ndev->mw[i].vbase) {
-			dev_warn(&pdev->dev, "Cannot remap BAR %d\n",
-				 MW_TO_BAR(i));
-			rc = -EIO;
-			goto err4;
-		}
-	}
-
-	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-		if (rc)
-			goto err4;
-
-		dev_warn(&pdev->dev, "Cannot DMA highmem\n");
-	}
-
-	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-		if (rc)
-			goto err4;
-
-		dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n");
-	}
-
-	rc = ntb_device_setup(ndev);
-	if (rc)
-		goto err4;
-
-	rc = ntb_create_callbacks(ndev);
-	if (rc)
-		goto err5;
-
-	rc = ntb_setup_interrupts(ndev);
-	if (rc)
-		goto err6;
-
-	/* The scratchpad registers keep the values between rmmod/insmod,
-	 * blast them now
-	 */
-	for (i = 0; i < ndev->limits.max_spads; i++) {
-		ntb_write_local_spad(ndev, i, 0);
-		ntb_write_remote_spad(ndev, i, 0);
-	}
-
-	rc = ntb_transport_init(pdev);
-	if (rc)
-		goto err7;
-
-	ntb_hw_link_up(ndev);
-
-	return 0;
-
-err7:
-	ntb_free_interrupts(ndev);
-err6:
-	ntb_free_callbacks(ndev);
-err5:
-	ntb_device_free(ndev);
-err4:
-	for (i--; i >= 0; i--)
-		iounmap(ndev->mw[i].vbase);
-	iounmap(ndev->reg_base);
-err3:
-	if (ndev->split_bar)
-		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
-	else
-		pci_release_selected_regions(pdev, NTB_BAR_MASK);
-err2:
-	kfree(ndev->mw);
-err1:
-	pci_disable_device(pdev);
-err:
-	ntb_free_debugfs(ndev);
-	kfree(ndev);
-
-	dev_err(&pdev->dev, "Error loading %s module\n", KBUILD_MODNAME);
-	return rc;
-}
-
-static void ntb_pci_remove(struct pci_dev *pdev)
-{
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-	int i;
-
-	ntb_hw_link_down(ndev);
-
-	ntb_transport_free(ndev->ntb_transport);
-
-	ntb_free_interrupts(ndev);
-	ntb_free_callbacks(ndev);
-	ntb_device_free(ndev);
-
-	/* need to reset max_mw limits so we can unmap properly */
-	if (ndev->hw_type == SNB_HW)
-		ntb_max_mw_detect(ndev);
-
-	for (i = 0; i < ndev->limits.max_mw; i++)
-		iounmap(ndev->mw[i].vbase);
-
-	kfree(ndev->mw);
-	iounmap(ndev->reg_base);
-	if (ndev->split_bar)
-		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
-	else
-		pci_release_selected_regions(pdev, NTB_BAR_MASK);
-	pci_disable_device(pdev);
-	ntb_free_debugfs(ndev);
-	kfree(ndev);
-}
-
-static struct pci_driver ntb_pci_driver = {
-	.name = KBUILD_MODNAME,
-	.id_table = ntb_pci_tbl,
-	.probe = ntb_pci_probe,
-	.remove = ntb_pci_remove,
-};
-
-module_pci_driver(ntb_pci_driver);
diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h
deleted file mode 100644
index 96de5fc95f90..000000000000
--- a/drivers/ntb/ntb_hw.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- *   redistributing this file, you may do so under either license.
- *
- *   GPL LICENSE SUMMARY
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   BSD LICENSE
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copy
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel PCIe NTB Linux driver
- *
- * Contact Information:
- * Jon Mason <jon.mason@intel.com>
- */
-#include <linux/ntb.h>
-
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF		0x3725
-#define PCI_DEVICE_ID_INTEL_NTB_PS_JSF		0x3726
-#define PCI_DEVICE_ID_INTEL_NTB_SS_JSF		0x3727
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB		0x3C0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_SNB		0x3C0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_SNB		0x3C0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_IVT		0x0E0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_IVT		0x0E0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_IVT		0x0E0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_HSX		0x2F0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_HSX		0x2F0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_HSX		0x2F0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD		0x0C4E
-
-#ifndef readq
-static inline u64 readq(void __iomem *addr)
-{
-	return readl(addr) | (((u64) readl(addr + 4)) << 32LL);
-}
-#endif
-
-#ifndef writeq
-static inline void writeq(u64 val, void __iomem *addr)
-{
-	writel(val & 0xffffffff, addr);
-	writel(val >> 32, addr + 4);
-}
-#endif
-
-#define NTB_BAR_MMIO		0
-#define NTB_BAR_23		2
-#define NTB_BAR_4		4
-#define NTB_BAR_5		5
-
-#define NTB_BAR_MASK		((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
-				 (1 << NTB_BAR_4))
-#define NTB_SPLITBAR_MASK	((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
-				 (1 << NTB_BAR_4) | (1 << NTB_BAR_5))
-
-#define NTB_HB_TIMEOUT		msecs_to_jiffies(1000)
-
-enum ntb_hw_event {
-	NTB_EVENT_SW_EVENT0 = 0,
-	NTB_EVENT_SW_EVENT1,
-	NTB_EVENT_SW_EVENT2,
-	NTB_EVENT_HW_ERROR,
-	NTB_EVENT_HW_LINK_UP,
-	NTB_EVENT_HW_LINK_DOWN,
-};
-
-struct ntb_mw {
-	dma_addr_t phys_addr;
-	void __iomem *vbase;
-	resource_size_t bar_sz;
-};
-
-struct ntb_db_cb {
-	int (*callback)(void *data, int db_num);
-	unsigned int db_num;
-	void *data;
-	struct ntb_device *ndev;
-	struct tasklet_struct irq_work;
-};
-
-#define WA_SNB_ERR	0x00000001
-
-struct ntb_device {
-	struct pci_dev *pdev;
-	struct msix_entry *msix_entries;
-	void __iomem *reg_base;
-	struct ntb_mw *mw;
-	struct {
-		unsigned char max_mw;
-		unsigned char max_spads;
-		unsigned char max_db_bits;
-		unsigned char msix_cnt;
-	} limits;
-	struct {
-		void __iomem *ldb;
-		void __iomem *ldb_mask;
-		void __iomem *rdb;
-		void __iomem *bar2_xlat;
-		void __iomem *bar4_xlat;
-		void __iomem *bar5_xlat;
-		void __iomem *spad_write;
-		void __iomem *spad_read;
-		void __iomem *lnk_cntl;
-		void __iomem *lnk_stat;
-		void __iomem *spci_cmd;
-	} reg_ofs;
-	struct ntb_transport *ntb_transport;
-	void (*event_cb)(void *handle, enum ntb_hw_event event);
-
-	struct ntb_db_cb *db_cb;
-	unsigned char hw_type;
-	unsigned char conn_type;
-	unsigned char dev_type;
-	unsigned char num_msix;
-	unsigned char bits_per_vector;
-	unsigned char max_cbs;
-	unsigned char link_width;
-	unsigned char link_speed;
-	unsigned char link_status;
-	unsigned char split_bar;
-
-	struct delayed_work hb_timer;
-	unsigned long last_ts;
-
-	struct delayed_work lr_timer;
-
-	struct dentry *debugfs_dir;
-	struct dentry *debugfs_info;
-
-	unsigned int wa_flags;
-};
-
-/**
- * ntb_max_cbs() - return the max callbacks
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the maximum number of callbacks
- *
- * RETURNS: the maximum number of callbacks
- */
-static inline unsigned char ntb_max_cbs(struct ntb_device *ndev)
-{
-	return ndev->max_cbs;
-}
-
-/**
- * ntb_max_mw() - return the max number of memory windows
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the maximum number of memory windows
- *
- * RETURNS: the maximum number of memory windows
- */
-static inline unsigned char ntb_max_mw(struct ntb_device *ndev)
-{
-	return ndev->limits.max_mw;
-}
-
-/**
- * ntb_hw_link_status() - return the hardware link status
- * @ndev: pointer to ntb_device instance
- *
- * Returns true if the hardware is connected to the remote system
- *
- * RETURNS: true or false based on the hardware link state
- */
-static inline bool ntb_hw_link_status(struct ntb_device *ndev)
-{
-	return ndev->link_status == NTB_LINK_UP;
-}
-
-/**
- * ntb_query_pdev() - return the pci_dev pointer
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the pci_dev pointer for the NTB hardware device
- *
- * RETURNS: a pointer to the ntb pci_dev
- */
-static inline struct pci_dev *ntb_query_pdev(struct ntb_device *ndev)
-{
-	return ndev->pdev;
-}
-
-/**
- * ntb_query_debugfs() - return the debugfs pointer
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the debugfs directory pointer for the NTB
- * hardware device
- *
- * RETURNS: a pointer to the debugfs directory
- */
-static inline struct dentry *ntb_query_debugfs(struct ntb_device *ndev)
-{
-	return ndev->debugfs_dir;
-}
-
-struct ntb_device *ntb_register_transport(struct pci_dev *pdev,
-					  void *transport);
-void ntb_unregister_transport(struct ntb_device *ndev);
-void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr);
-int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
-			     void *data, int (*db_cb_func)(void *data,
-							   int db_num));
-void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx);
-int ntb_register_event_callback(struct ntb_device *ndev,
-				void (*event_cb_func)(void *handle,
-						      enum ntb_hw_event event));
-void ntb_unregister_event_callback(struct ntb_device *ndev);
-int ntb_get_max_spads(struct ntb_device *ndev);
-int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
-int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
-int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
-int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
-resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
-void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
-u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
-void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int idx);
-void *ntb_find_transport(struct pci_dev *pdev);
-
-int ntb_transport_init(struct pci_dev *pdev);
-void ntb_transport_free(void *transport);
diff --git a/drivers/ntb/ntb_regs.h b/drivers/ntb/ntb_regs.h
deleted file mode 100644
index f028ff81fd77..000000000000
--- a/drivers/ntb/ntb_regs.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- *   redistributing this file, you may do so under either license.
- *
- *   GPL LICENSE SUMMARY
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   BSD LICENSE
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copy
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel PCIe NTB Linux driver
- *
- * Contact Information:
- * Jon Mason <jon.mason@intel.com>
- */
-
-#define NTB_LINK_STATUS_ACTIVE	0x2000
-#define NTB_LINK_SPEED_MASK	0x000f
-#define NTB_LINK_WIDTH_MASK	0x03f0
-
-#define SNB_MSIX_CNT		4
-#define SNB_MAX_B2B_SPADS	16
-#define SNB_MAX_COMPAT_SPADS	16
-/* Reserve the uppermost bit for link interrupt */
-#define SNB_MAX_DB_BITS		15
-#define SNB_LINK_DB		15
-#define SNB_DB_BITS_PER_VEC	5
-#define HSX_SPLITBAR_MAX_MW	3
-#define SNB_MAX_MW		2
-#define SNB_ERRATA_MAX_MW	1
-
-#define SNB_DB_HW_LINK		0x8000
-
-#define SNB_UNCERRSTS_OFFSET	0x014C
-#define SNB_CORERRSTS_OFFSET	0x0158
-#define SNB_LINK_STATUS_OFFSET	0x01A2
-#define SNB_PCICMD_OFFSET	0x0504
-#define SNB_DEVCTRL_OFFSET	0x0598
-#define SNB_DEVSTS_OFFSET	0x059A
-#define SNB_SLINK_STATUS_OFFSET	0x05A2
-
-#define SNB_PBAR2LMT_OFFSET	0x0000
-#define SNB_PBAR4LMT_OFFSET	0x0008
-#define SNB_PBAR5LMT_OFFSET	0x000C
-#define SNB_PBAR2XLAT_OFFSET	0x0010
-#define SNB_PBAR4XLAT_OFFSET	0x0018
-#define SNB_PBAR5XLAT_OFFSET	0x001C
-#define SNB_SBAR2LMT_OFFSET	0x0020
-#define SNB_SBAR4LMT_OFFSET	0x0028
-#define SNB_SBAR5LMT_OFFSET	0x002C
-#define SNB_SBAR2XLAT_OFFSET	0x0030
-#define SNB_SBAR4XLAT_OFFSET	0x0038
-#define SNB_SBAR5XLAT_OFFSET	0x003C
-#define SNB_SBAR0BASE_OFFSET	0x0040
-#define SNB_SBAR2BASE_OFFSET	0x0048
-#define SNB_SBAR4BASE_OFFSET	0x0050
-#define SNB_SBAR5BASE_OFFSET	0x0054
-#define SNB_NTBCNTL_OFFSET	0x0058
-#define SNB_SBDF_OFFSET		0x005C
-#define SNB_PDOORBELL_OFFSET	0x0060
-#define SNB_PDBMSK_OFFSET	0x0062
-#define SNB_SDOORBELL_OFFSET	0x0064
-#define SNB_SDBMSK_OFFSET	0x0066
-#define SNB_USMEMMISS_OFFSET	0x0070
-#define SNB_SPAD_OFFSET		0x0080
-#define SNB_SPADSEMA4_OFFSET	0x00c0
-#define SNB_WCCNTRL_OFFSET	0x00e0
-#define SNB_B2B_SPAD_OFFSET	0x0100
-#define SNB_B2B_DOORBELL_OFFSET	0x0140
-#define SNB_B2B_XLAT_OFFSETL	0x0144
-#define SNB_B2B_XLAT_OFFSETU	0x0148
-
-/*
- * The addresses are setup so the 32bit BARs can function. Thus
- * the addresses are all in 32bit space
- */
-#define SNB_MBAR01_USD_ADDR	0x000000002100000CULL
-#define SNB_MBAR23_USD_ADDR	0x000000004100000CULL
-#define SNB_MBAR4_USD_ADDR	0x000000008100000CULL
-#define SNB_MBAR5_USD_ADDR	0x00000000A100000CULL
-#define SNB_MBAR01_DSD_ADDR	0x000000002000000CULL
-#define SNB_MBAR23_DSD_ADDR	0x000000004000000CULL
-#define SNB_MBAR4_DSD_ADDR	0x000000008000000CULL
-#define SNB_MBAR5_DSD_ADDR	0x00000000A000000CULL
-
-#define BWD_MSIX_CNT		34
-#define BWD_MAX_SPADS		16
-#define BWD_MAX_DB_BITS		34
-#define BWD_DB_BITS_PER_VEC	1
-#define BWD_MAX_MW		2
-
-#define BWD_PCICMD_OFFSET	0xb004
-#define BWD_MBAR23_OFFSET	0xb018
-#define BWD_MBAR45_OFFSET	0xb020
-#define BWD_DEVCTRL_OFFSET	0xb048
-#define BWD_LINK_STATUS_OFFSET	0xb052
-#define BWD_ERRCORSTS_OFFSET	0xb110
-
-#define BWD_SBAR2XLAT_OFFSET	0x0008
-#define BWD_SBAR4XLAT_OFFSET	0x0010
-#define BWD_PDOORBELL_OFFSET	0x0020
-#define BWD_PDBMSK_OFFSET	0x0028
-#define BWD_NTBCNTL_OFFSET	0x0060
-#define BWD_EBDF_OFFSET		0x0064
-#define BWD_SPAD_OFFSET		0x0080
-#define BWD_SPADSEMA_OFFSET	0x00c0
-#define BWD_STKYSPAD_OFFSET	0x00c4
-#define BWD_PBAR2XLAT_OFFSET	0x8008
-#define BWD_PBAR4XLAT_OFFSET	0x8010
-#define BWD_B2B_DOORBELL_OFFSET	0x8020
-#define BWD_B2B_SPAD_OFFSET	0x8080
-#define BWD_B2B_SPADSEMA_OFFSET	0x80c0
-#define BWD_B2B_STKYSPAD_OFFSET	0x80c4
-
-#define BWD_MODPHY_PCSREG4	0x1c004
-#define BWD_MODPHY_PCSREG6	0x1c006
-
-#define BWD_IP_BASE		0xC000
-#define BWD_DESKEWSTS_OFFSET	(BWD_IP_BASE + 0x3024)
-#define BWD_LTSSMERRSTS0_OFFSET (BWD_IP_BASE + 0x3180)
-#define BWD_LTSSMSTATEJMP_OFFSET	(BWD_IP_BASE + 0x3040)
-#define BWD_IBSTERRRCRVSTS0_OFFSET	(BWD_IP_BASE + 0x3324)
-
-#define BWD_DESKEWSTS_DBERR	(1 << 15)
-#define BWD_LTSSMERRSTS0_UNEXPECTEDEI	(1 << 20)
-#define BWD_LTSSMSTATEJMP_FORCEDETECT	(1 << 2)
-#define BWD_IBIST_ERR_OFLOW	0x7FFF7FFF
-
-#define NTB_CNTL_CFG_LOCK		(1 << 0)
-#define NTB_CNTL_LINK_DISABLE		(1 << 1)
-#define NTB_CNTL_S2P_BAR23_SNOOP	(1 << 2)
-#define NTB_CNTL_P2S_BAR23_SNOOP	(1 << 4)
-#define NTB_CNTL_S2P_BAR4_SNOOP	(1 << 6)
-#define NTB_CNTL_P2S_BAR4_SNOOP	(1 << 8)
-#define NTB_CNTL_S2P_BAR5_SNOOP	(1 << 12)
-#define NTB_CNTL_P2S_BAR5_SNOOP	(1 << 14)
-#define BWD_CNTL_LINK_DOWN		(1 << 16)
-
-#define NTB_PPD_OFFSET		0x00D4
-#define SNB_PPD_CONN_TYPE	0x0003
-#define SNB_PPD_DEV_TYPE	0x0010
-#define SNB_PPD_SPLIT_BAR	(1 << 6)
-#define BWD_PPD_INIT_LINK	0x0008
-#define BWD_PPD_CONN_TYPE	0x0300
-#define BWD_PPD_DEV_TYPE	0x1000
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index e9bf2f47b61a..c5f26cda9f97 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -56,7 +56,7 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include "ntb_hw.h"
+#include "hw/intel/ntb_hw_intel.h"
 
 #define NTB_TRANSPORT_VERSION	3
 
@@ -360,14 +360,14 @@ err:
 EXPORT_SYMBOL_GPL(ntb_register_client_dev);
 
 /**
- * ntb_register_client - Register NTB client driver
+ * ntb_transport_register_client - Register NTB client driver
  * @drv: NTB client driver to be registered
  *
  * Register an NTB client driver with the NTB transport layer
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
-int ntb_register_client(struct ntb_client *drv)
+int ntb_transport_register_client(struct ntb_client *drv)
 {
 	drv->driver.bus = &ntb_bus_type;
 
@@ -376,21 +376,21 @@ int ntb_register_client(struct ntb_client *drv)
 
 	return driver_register(&drv->driver);
 }
-EXPORT_SYMBOL_GPL(ntb_register_client);
+EXPORT_SYMBOL_GPL(ntb_transport_register_client);
 
 /**
- * ntb_unregister_client - Unregister NTB client driver
+ * ntb_transport_unregister_client - Unregister NTB client driver
  * @drv: NTB client driver to be unregistered
  *
  * Unregister an NTB client driver with the NTB transport layer
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
-void ntb_unregister_client(struct ntb_client *drv)
+void ntb_transport_unregister_client(struct ntb_client *drv)
 {
 	driver_unregister(&drv->driver);
 }
-EXPORT_SYMBOL_GPL(ntb_unregister_client);
+EXPORT_SYMBOL_GPL(ntb_transport_unregister_client);
 
 static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
 			    loff_t *offp)
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
deleted file mode 100644
index 9ac1a62fc6f5..000000000000
--- a/include/linux/ntb.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- *   redistributing this file, you may do so under either license.
- *
- *   GPL LICENSE SUMMARY
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   BSD LICENSE
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copy
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel PCIe NTB Linux driver
- *
- * Contact Information:
- * Jon Mason <jon.mason@intel.com>
- */
-
-struct ntb_transport_qp;
-
-struct ntb_client {
-	struct device_driver driver;
-	int (*probe)(struct pci_dev *pdev);
-	void (*remove)(struct pci_dev *pdev);
-};
-
-enum {
-	NTB_LINK_DOWN = 0,
-	NTB_LINK_UP,
-};
-
-int ntb_register_client(struct ntb_client *drvr);
-void ntb_unregister_client(struct ntb_client *drvr);
-int ntb_register_client_dev(char *device_name);
-void ntb_unregister_client_dev(char *device_name);
-
-struct ntb_queue_handlers {
-	void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
-			   void *data, int len);
-	void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
-			   void *data, int len);
-	void (*event_handler)(void *data, int status);
-};
-
-unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
-unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
-struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, struct pci_dev *pdev,
-			   const struct ntb_queue_handlers *handlers);
-void ntb_transport_free_queue(struct ntb_transport_qp *qp);
-int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
-			     unsigned int len);
-int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
-			     unsigned int len);
-void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
-void ntb_transport_link_up(struct ntb_transport_qp *qp);
-void ntb_transport_link_down(struct ntb_transport_qp *qp);
-bool ntb_transport_link_query(struct ntb_transport_qp *qp);
diff --git a/include/linux/ntb_transport.h b/include/linux/ntb_transport.h
new file mode 100644
index 000000000000..f78b64cc09d5
--- /dev/null
+++ b/include/linux/ntb_transport.h
@@ -0,0 +1,88 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+
+struct ntb_transport_qp;
+
+struct ntb_client {
+	struct device_driver driver;
+	int (*probe)(struct pci_dev *pdev);
+	void (*remove)(struct pci_dev *pdev);
+};
+
+enum {
+	NTB_LINK_DOWN = 0,
+	NTB_LINK_UP,
+};
+
+int ntb_transport_register_client(struct ntb_client *drvr);
+void ntb_transport_unregister_client(struct ntb_client *drvr);
+int ntb_register_client_dev(char *device_name);
+void ntb_unregister_client_dev(char *device_name);
+
+struct ntb_queue_handlers {
+	void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+			   void *data, int len);
+	void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+			   void *data, int len);
+	void (*event_handler)(void *data, int status);
+};
+
+unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
+struct ntb_transport_qp *
+ntb_transport_create_queue(void *data, struct pci_dev *pdev,
+			   const struct ntb_queue_handlers *handlers);
+void ntb_transport_free_queue(struct ntb_transport_qp *qp);
+int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+			     unsigned int len);
+int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+			     unsigned int len);
+void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
+void ntb_transport_link_up(struct ntb_transport_qp *qp);
+void ntb_transport_link_down(struct ntb_transport_qp *qp);
+bool ntb_transport_link_query(struct ntb_transport_qp *qp);
-- 
cgit v1.2.3


From a13f35e8714009145e32ebe2bf25b84e1376e314 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jul 2015 08:44:34 -0600
Subject: writeback: don't embed root bdi_writeback_congested in bdi_writeback

52ebea749aae ("writeback: make backing_dev_info host cgroup-specific
bdi_writebacks") made bdi (backing_dev_info) host per-cgroup wb's
(bdi_writeback's).  As the congested state needs to be per-wb and
referenced from blkcg side and multiple wbs, the patch made all
non-root cong's (bdi_writeback_congested's) reference counted and
indexed on bdi.

When a bdi is destroyed, cgwb_bdi_destroy() tries to drain all
non-root cong's; however, this can hang indefinitely because wb's can
also be referenced from blkcg_gq's which are destroyed after bdi
destruction is complete.

To fix the bug, bdi destruction will be updated to not wait for cong's
to drain, which naturally means that cong's may outlive the associated
bdi.  This is fine for non-root cong's but is problematic for the root
cong's which are embedded in their bdi's as they may end up getting
dereferenced after the containing bdi's are freed.

This patch makes root cong's behave the same as non-root cong's.  They
are no longer embedded in their bdi's but allocated separately during
bdi initialization, indexed and reference counted the same way.

* As cong handling is the same for all wb's, wb->congested
  initialization is moved into wb_init().

* When !CONFIG_CGROUP_WRITEBACK, there was no indexing or refcnting.
  bdi->wb_congested is now a pointer pointing to the root cong
  allocated during bdi init and minimal refcnting operations are
  implemented.

* The above makes root wb init paths diverge depending on
  CONFIG_CGROUP_WRITEBACK.  root wb init is moved to cgwb_bdi_init().

This patch in itself shouldn't cause any consequential behavior
differences but prepares for the actual fix.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jon Christopherson <jon@jons.org>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=100681
Tested-by: Jon Christopherson <jon@jons.org>

Added <linux/slab.h> include to backing-dev.h for kfree() definition.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h |  5 ++-
 include/linux/backing-dev.h      |  6 ++-
 mm/backing-dev.c                 | 87 +++++++++++++++++++++-------------------
 3 files changed, 54 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a48d90e3bcbb..a23209b43842 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -50,10 +50,10 @@ enum wb_stat_item {
  */
 struct bdi_writeback_congested {
 	unsigned long state;		/* WB_[a]sync_congested flags */
+	atomic_t refcnt;		/* nr of attached wb's and blkg */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct backing_dev_info *bdi;	/* the associated bdi */
-	atomic_t refcnt;		/* nr of attached wb's and blkg */
 	int blkcg_id;			/* ID of the associated blkcg */
 	struct rb_node rb_node;		/* on bdi->cgwb_congestion_tree */
 #endif
@@ -150,11 +150,12 @@ struct backing_dev_info {
 	atomic_long_t tot_write_bandwidth;
 
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
-	struct bdi_writeback_congested wb_congested; /* its congested state */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
 	atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
+#else
+	struct bdi_writeback_congested *wb_congested;
 #endif
 	wait_queue_head_t wb_waitq;
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0e6d4828a77a..0fe9df983ab7 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -15,6 +15,7 @@
 #include <linux/writeback.h>
 #include <linux/blk-cgroup.h>
 #include <linux/backing-dev-defs.h>
+#include <linux/slab.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
@@ -465,11 +466,14 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 static inline struct bdi_writeback_congested *
 wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
 {
-	return bdi->wb.congested;
+	atomic_inc(&bdi->wb_congested->refcnt);
+	return bdi->wb_congested;
 }
 
 static inline void wb_congested_put(struct bdi_writeback_congested *congested)
 {
+	if (atomic_dec_and_test(&congested->refcnt))
+		kfree(congested);
 }
 
 static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7756da31b02b..51cc461e7256 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -287,7 +287,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
 #define INIT_BW		(100 << (20 - PAGE_SHIFT))
 
 static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
-		   gfp_t gfp)
+		   int blkcg_id, gfp_t gfp)
 {
 	int i, err;
 
@@ -311,21 +311,29 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 	INIT_LIST_HEAD(&wb->work_list);
 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
 
+	wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
+	if (!wb->congested)
+		return -ENOMEM;
+
 	err = fprop_local_init_percpu(&wb->completions, gfp);
 	if (err)
-		return err;
+		goto out_put_cong;
 
 	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&wb->stat[i], 0, gfp);
-		if (err) {
-			while (--i)
-				percpu_counter_destroy(&wb->stat[i]);
-			fprop_local_destroy_percpu(&wb->completions);
-			return err;
-		}
+		if (err)
+			goto out_destroy_stat;
 	}
 
 	return 0;
+
+out_destroy_stat:
+	while (--i)
+		percpu_counter_destroy(&wb->stat[i]);
+	fprop_local_destroy_percpu(&wb->completions);
+out_put_cong:
+	wb_congested_put(wb->congested);
+	return err;
 }
 
 /*
@@ -361,6 +369,7 @@ static void wb_exit(struct bdi_writeback *wb)
 		percpu_counter_destroy(&wb->stat[i]);
 
 	fprop_local_destroy_percpu(&wb->completions);
+	wb_congested_put(wb->congested);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -392,9 +401,6 @@ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
 	struct bdi_writeback_congested *new_congested = NULL, *congested;
 	struct rb_node **node, *parent;
 	unsigned long flags;
-
-	if (blkcg_id == 1)
-		return &bdi->wb_congested;
 retry:
 	spin_lock_irqsave(&cgwb_lock, flags);
 
@@ -453,9 +459,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
 	struct backing_dev_info *bdi = congested->bdi;
 	unsigned long flags;
 
-	if (congested->blkcg_id == 1)
-		return;
-
 	local_irq_save(flags);
 	if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
 		local_irq_restore(flags);
@@ -480,7 +483,6 @@ static void cgwb_release_workfn(struct work_struct *work)
 
 	css_put(wb->memcg_css);
 	css_put(wb->blkcg_css);
-	wb_congested_put(wb->congested);
 
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
@@ -541,7 +543,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	if (!wb)
 		return -ENOMEM;
 
-	ret = wb_init(wb, bdi, gfp);
+	ret = wb_init(wb, bdi, blkcg_css->id, gfp);
 	if (ret)
 		goto err_free;
 
@@ -553,12 +555,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	if (ret)
 		goto err_ref_exit;
 
-	wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
-	if (!wb->congested) {
-		ret = -ENOMEM;
-		goto err_fprop_exit;
-	}
-
 	wb->memcg_css = memcg_css;
 	wb->blkcg_css = blkcg_css;
 	INIT_WORK(&wb->release_work, cgwb_release_workfn);
@@ -588,12 +584,10 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	if (ret) {
 		if (ret == -EEXIST)
 			ret = 0;
-		goto err_put_congested;
+		goto err_fprop_exit;
 	}
 	goto out_put;
 
-err_put_congested:
-	wb_congested_put(wb->congested);
 err_fprop_exit:
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 err_ref_exit:
@@ -662,14 +656,20 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 	return wb;
 }
 
-static void cgwb_bdi_init(struct backing_dev_info *bdi)
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
 {
-	bdi->wb.memcg_css = mem_cgroup_root_css;
-	bdi->wb.blkcg_css = blkcg_root_css;
-	bdi->wb_congested.blkcg_id = 1;
+	int ret;
+
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
 	atomic_set(&bdi->usage_cnt, 1);
+
+	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+	if (!ret) {
+		bdi->wb.memcg_css = mem_cgroup_root_css;
+		bdi->wb.blkcg_css = blkcg_root_css;
+	}
+	return ret;
 }
 
 static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
@@ -732,15 +732,28 @@ void wb_blkcg_offline(struct blkcg *blkcg)
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
-static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+	int err;
+
+	bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
+	if (!bdi->wb_congested)
+		return -ENOMEM;
+
+	err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+	if (err) {
+		kfree(bdi->wb_congested);
+		return err;
+	}
+	return 0;
+}
+
 static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-	int err;
-
 	bdi->dev = NULL;
 
 	bdi->min_ratio = 0;
@@ -749,15 +762,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	init_waitqueue_head(&bdi->wb_waitq);
 
-	err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
-	if (err)
-		return err;
-
-	bdi->wb_congested.state = 0;
-	bdi->wb.congested = &bdi->wb_congested;
-
-	cgwb_bdi_init(bdi);
-	return 0;
+	return cgwb_bdi_init(bdi);
 }
 EXPORT_SYMBOL(bdi_init);
 
-- 
cgit v1.2.3


From 91e20b5040c67c51aad88cf87db4305c5bd7f79d Mon Sep 17 00:00:00 2001
From: Joel Porquet <joel@porquet.org>
Date: Thu, 2 Jul 2015 15:32:00 -0400
Subject: irqchip: Move IRQCHIP_DECLARE macro to include/linux/irqchip.h

At the moment the IRQCHIP_DECLARE macro is only declared locally in
drivers/irqchip/irqchip.h. It prevents from using it directly in arch/*
directories whenever irqchip drivers only exist there, which happens in a few
cases (e.g. arc, arm, microblaze and mips).

This patch makes the macro to be globally defined, i.e. in
include/linux/irqchip.h, and thus usable for arch-specific declarations of
irqchip drivers. In this way, it is very similar to what clocksource does (ie
CLOCKSOURCE_OF_DECLARE is defined in include/linux/clocksource.h).

For now, this patch only moves the declaration of the macro
IRQCHIP_DECLARE to the global header 'include/linux/irqchip.h' and make
'drivers/irqchip/irqchip.h' include 'include/linux/irqchip.h'. Later, other
patches will get rid of 'drivers/irqchip/irqchip.h' and modify all the impacted
irqchip drivers.

Signed-off-by: Joel Porquet <joel@porquet.org>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1435865565-14114-1-git-send-email-joel@porquet.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/irqchip.h | 19 +------------------
 include/linux/irqchip.h   | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irqchip.h b/drivers/irqchip/irqchip.h
index 0f6486d4f1b0..0f67ae32464f 100644
--- a/drivers/irqchip/irqchip.h
+++ b/drivers/irqchip/irqchip.h
@@ -8,21 +8,4 @@
  * warranty of any kind, whether express or implied.
  */
 
-#ifndef _IRQCHIP_H
-#define _IRQCHIP_H
-
-#include <linux/of.h>
-
-/*
- * This macro must be used by the different irqchip drivers to declare
- * the association between their DT compatible string and their
- * initialization function.
- *
- * @name: name that must be unique accross all IRQCHIP_DECLARE of the
- * same file.
- * @compstr: compatible string of the irqchip driver
- * @fn: initialization function
- */
-#define IRQCHIP_DECLARE(name, compat, fn) OF_DECLARE_2(irqchip, name, compat, fn)
-
-#endif
+#include <linux/irqchip.h>
diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index 14d79131f53d..638887376e58 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -11,6 +11,20 @@
 #ifndef _LINUX_IRQCHIP_H
 #define _LINUX_IRQCHIP_H
 
+#include <linux/of.h>
+
+/*
+ * This macro must be used by the different irqchip drivers to declare
+ * the association between their DT compatible string and their
+ * initialization function.
+ *
+ * @name: name that must be unique accross all IRQCHIP_DECLARE of the
+ * same file.
+ * @compstr: compatible string of the irqchip driver
+ * @fn: initialization function
+ */
+#define IRQCHIP_DECLARE(name, compat, fn) OF_DECLARE_2(irqchip, name, compat, fn)
+
 #ifdef CONFIG_IRQCHIP
 void irqchip_init(void);
 #else
-- 
cgit v1.2.3


From 2ecd9d29abb171d6e97a4f3eb29d7456a11401b7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 3 Jul 2015 18:53:58 +0200
Subject: sched, preempt_notifier: separate notifier registration from
 static_key inc/dec

Commit 1cde2930e154 ("sched/preempt: Add static_key() to preempt_notifiers")
had two problems.  First, the preempt-notifier API needs to sleep with the
addition of the static_key, we do however need to hold off preemption
while modifying the preempt notifier list, otherwise a preemption could
observe an inconsistent list state.  KVM correctly registers and
unregisters preempt notifiers with preemption disabled, so the sleep
caused dmesg splats.

Second, KVM registers and unregisters preemption notifiers very often
(in vcpu_load/vcpu_put).  With a single uniprocessor guest the static key
would move between 0 and 1 continuously, hitting the slow path on every
userspace exit.

To fix this, wrap the static_key inc/dec in a new API, and call it from
KVM.

Fixes: 1cde2930e154 ("sched/preempt: Add static_key() to preempt_notifiers")
Reported-by: Pontus Fuchs <pontus.fuchs@gmail.com>
Reported-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/preempt.h |  2 ++
 kernel/sched/core.c     | 17 +++++++++++++++--
 virt/kvm/kvm_main.c     |  3 +++
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 0f1534acaf60..84991f185173 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -293,6 +293,8 @@ struct preempt_notifier {
 	struct preempt_ops *ops;
 };
 
+void preempt_notifier_inc(void);
+void preempt_notifier_dec(void);
 void preempt_notifier_register(struct preempt_notifier *notifier);
 void preempt_notifier_unregister(struct preempt_notifier *notifier);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b803e1b8ab0c..552710ab19e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2320,13 +2320,27 @@ void wake_up_new_task(struct task_struct *p)
 
 static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
 
+void preempt_notifier_inc(void)
+{
+	static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+	static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
-	static_key_slow_inc(&preempt_notifier_key);
+	if (!static_key_false(&preempt_notifier_key))
+		WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2340,7 +2354,6 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
 void preempt_notifier_unregister(struct preempt_notifier *notifier)
 {
 	hlist_del(&notifier->link);
-	static_key_slow_dec(&preempt_notifier_key);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 848af90b8091..8b8a44453670 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -553,6 +553,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	list_add(&kvm->vm_list, &vm_list);
 	spin_unlock(&kvm_lock);
 
+	preempt_notifier_inc();
+
 	return kvm;
 
 out_err:
@@ -620,6 +622,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	cleanup_srcu_struct(&kvm->irq_srcu);
 	cleanup_srcu_struct(&kvm->srcu);
 	kvm_arch_free_vm(kvm);
+	preempt_notifier_dec();
 	hardware_disable_all();
 	mmdrop(mm);
 }
-- 
cgit v1.2.3


From f6db8347993256b58bd4746b0c4c5b935c32210d Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Thu, 25 Jun 2015 23:53:37 +0530
Subject: sched/stat: Simplify the sched_info accounting dependency

Both CONFIG_SCHEDSTATS=y and CONFIG_TASK_DELAY_ACCT=y track task
sched_info, which results in ugly #if clauses.

Simplify the code by introducing a synthethic CONFIG_SCHED_INFO
switch, selected by both.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: a.p.zijlstra@chello.nl
Cc: ricklind@us.ibm.com
Link: http://lkml.kernel.org/r/8d19eef800811a94b0f91bcbeb27430a884d7433.1435255405.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 6 +++---
 init/Kconfig          | 1 +
 kernel/sched/core.c   | 2 +-
 kernel/sched/stats.h  | 4 ++--
 lib/Kconfig.debug     | 5 +++++
 5 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6633e83e608a..9bf4bc0e3b8b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -849,7 +849,7 @@ extern struct user_struct root_user;
 struct backing_dev_info;
 struct reclaim_state;
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 struct sched_info {
 	/* cumulative counters */
 	unsigned long pcount;	      /* # of times run on this cpu */
@@ -859,7 +859,7 @@ struct sched_info {
 	unsigned long long last_arrival,/* when we last ran on a cpu */
 			   last_queued;	/* when we were last queued to run */
 };
-#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
+#endif /* CONFIG_SCHED_INFO */
 
 #ifdef CONFIG_TASK_DELAY_ACCT
 struct task_delay_info {
@@ -1408,7 +1408,7 @@ struct task_struct {
 	int rcu_tasks_idle_cpu;
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 	struct sched_info sched_info;
 #endif
 
diff --git a/init/Kconfig b/init/Kconfig
index b999fa381bf9..12cb556ad160 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -435,6 +435,7 @@ config TASKSTATS
 config TASK_DELAY_ACCT
 	bool "Enable per-task delay accounting"
 	depends on TASKSTATS
+	select SCHED_INFO
 	help
 	  Collect information on time spent by a task waiting for system
 	  resources like cpu, synchronous block I/O completion and swapping
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c86935a7f1f8..abb8785ed1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1975,7 +1975,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	set_task_cpu(p, cpu);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 077ebbd5e10f..b0fbc7632de5 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define schedstat_set(var, val)	do { } while (0)
 #endif
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 static inline void sched_info_reset_dequeued(struct task_struct *t)
 {
 	t->sched_info.last_queued = 0;
@@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq,
 #define sched_info_depart(rq, t)		do { } while (0)
 #define sched_info_arrive(rq, next)		do { } while (0)
 #define sched_info_switch(rq, t, next)		do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+#endif /* CONFIG_SCHED_INFO */
 
 /*
  * The following are functions that support scheduler-internal time accounting.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b908048f8d6a..e2894b23efb6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -841,9 +841,14 @@ config SCHED_DEBUG
 	  that can help debug the scheduler. The runtime overhead of this
 	  option is minimal.
 
+config SCHED_INFO
+	bool
+	default n
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
+	select SCHED_INFO
 	help
 	  If you say Y here, additional code will be inserted into the
 	  scheduler and related routines to collect statistics about
-- 
cgit v1.2.3


From 6b55c9654fccf69ae7ace23ca101dc37b903181b Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 25 Jun 2015 22:51:41 +0530
Subject: sched/debug: Move print_cfs_rq() declaration to kernel/sched/sched.h

Currently print_cfs_rq() is declared in include/linux/sched.h.
However it's not used outside kernel/sched. Hence move the
declaration to kernel/sched/sched.h

Also some functions are only available for CONFIG_SCHED_DEBUG=y.
Hence move the declarations to within the #ifdef.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Iulia Manda <iulia.manda21@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1435252903-1081-2-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 kernel/sched/sched.h  | 5 +++++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9bf4bc0e3b8b..3505352dd296 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -191,8 +191,6 @@ struct task_group;
 #ifdef CONFIG_SCHED_DEBUG
 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 #endif
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aea7c1f393cb..7d5895258fe3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1668,9 +1668,14 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+#ifdef	CONFIG_SCHED_DEBUG
 extern void print_cfs_stats(struct seq_file *m, int cpu);
 extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+#endif
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq);
-- 
cgit v1.2.3


From a1bd3baeb2f18b2b3d0f98ce5fdaa725149b950b Mon Sep 17 00:00:00 2001
From: Allen Hubbe <Allen.Hubbe@emc.com>
Date: Thu, 9 Apr 2015 10:33:20 -0400
Subject: NTB: Add NTB hardware abstraction layer

Abstract the NTB device behind a programming interface, so that it can
support different hardware and client drivers.

Signed-off-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 Documentation/ntb.txt |  32 ++
 MAINTAINERS           |   4 +-
 drivers/ntb/Makefile  |   1 +
 drivers/ntb/ntb.c     | 251 +++++++++++++
 include/linux/ntb.h   | 984 ++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 1271 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ntb.txt
 create mode 100644 drivers/ntb/ntb.c
 create mode 100644 include/linux/ntb.h

(limited to 'include/linux')

diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt
new file mode 100644
index 000000000000..9d46dc9712a8
--- /dev/null
+++ b/Documentation/ntb.txt
@@ -0,0 +1,32 @@
+# NTB Drivers
+
+NTB (Non-Transparent Bridge) is a type of PCI-Express bridge chip that connects
+the separate memory systems of two computers to the same PCI-Express fabric.
+Existing NTB hardware supports a common feature set, including scratchpad
+registers, doorbell registers, and memory translation windows.  Scratchpad
+registers are read-and-writable registers that are accessible from either side
+of the device, so that peers can exchange a small amount of information at a
+fixed address.  Doorbell registers provide a way for peers to send interrupt
+events.  Memory windows allow translated read and write access to the peer
+memory.
+
+## NTB Core Driver (ntb)
+
+The NTB core driver defines an api wrapping the common feature set, and allows
+clients interested in NTB features to discover NTB the devices supported by
+hardware drivers.  The term "client" is used here to mean an upper layer
+component making use of the NTB api.  The term "driver," or "hardware driver,"
+is used here to mean a driver for a specific vendor and model of NTB hardware.
+
+## NTB Client Drivers
+
+NTB client drivers should register with the NTB core driver.  After
+registering, the client probe and remove functions will be called appropriately
+as ntb hardware, or hardware drivers, are inserted and removed.  The
+registration uses the Linux Device framework, so it should feel familiar to
+anyone who has written a pci driver.
+
+## NTB Hardware Drivers
+
+NTB hardware drivers should register devices with the NTB core driver.  After
+registering, clients probe and remove functions will be called.
diff --git a/MAINTAINERS b/MAINTAINERS
index 663bc8ed1860..e2fc9eec2e16 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6995,15 +6995,17 @@ F:	drivers/power/bq27x00_battery.c
 F:	drivers/power/isp1704_charger.c
 F:	drivers/power/rx51_battery.c
 
-NTB DRIVER
+NTB DRIVER CORE
 M:	Jon Mason <jdmason@kudzu.us>
 M:	Dave Jiang <dave.jiang@intel.com>
+M:	Allen Hubbe <Allen.Hubbe@emc.com>
 S:	Supported
 W:	https://github.com/jonmason/ntb/wiki
 T:	git git://github.com/jonmason/ntb.git
 F:	drivers/ntb/
 F:	drivers/net/ntb_netdev.c
 F:	include/linux/ntb.h
+F:	include/linux/ntb_transport.h
 
 NTFS FILESYSTEM
 M:	Anton Altaparmakov <anton@tuxera.com>
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 545b10a131af..712e953a8fda 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,3 +1,4 @@
+obj-$(CONFIG_NTB) += ntb.o
 obj-$(CONFIG_NTB) += ntb_hw_intel.o
 
 ntb_hw_intel-objs := hw/intel/ntb_hw_intel.o ntb_transport.o
diff --git a/drivers/ntb/ntb.c b/drivers/ntb/ntb.c
new file mode 100644
index 000000000000..23435f2a5486
--- /dev/null
+++ b/drivers/ntb/ntb.c
@@ -0,0 +1,251 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Allen Hubbe <Allen.Hubbe@emc.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <linux/ntb.h>
+#include <linux/pci.h>
+
+#define DRIVER_NAME			"ntb"
+#define DRIVER_DESCRIPTION		"PCIe NTB Driver Framework"
+
+#define DRIVER_LICENSE			"Dual BSD/GPL"
+#define DRIVER_VERSION			"1.0"
+#define DRIVER_RELDATE			"24 March 2015"
+#define DRIVER_AUTHOR			"Allen Hubbe <Allen.Hubbe@emc.com>"
+
+MODULE_LICENSE(DRIVER_LICENSE);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
+
+static struct bus_type ntb_bus;
+static void ntb_dev_release(struct device *dev);
+
+int __ntb_register_client(struct ntb_client *client, struct module *mod,
+			  const char *mod_name)
+{
+	if (!client)
+		return -EINVAL;
+	if (!ntb_client_ops_is_valid(&client->ops))
+		return -EINVAL;
+
+	memset(&client->drv, 0, sizeof(client->drv));
+	client->drv.bus = &ntb_bus;
+	client->drv.name = mod_name;
+	client->drv.owner = mod;
+
+	return driver_register(&client->drv);
+}
+EXPORT_SYMBOL(__ntb_register_client);
+
+void ntb_unregister_client(struct ntb_client *client)
+{
+	driver_unregister(&client->drv);
+}
+EXPORT_SYMBOL(ntb_unregister_client);
+
+int ntb_register_device(struct ntb_dev *ntb)
+{
+	if (!ntb)
+		return -EINVAL;
+	if (!ntb->pdev)
+		return -EINVAL;
+	if (!ntb->ops)
+		return -EINVAL;
+	if (!ntb_dev_ops_is_valid(ntb->ops))
+		return -EINVAL;
+
+	init_completion(&ntb->released);
+
+	memset(&ntb->dev, 0, sizeof(ntb->dev));
+	ntb->dev.bus = &ntb_bus;
+	ntb->dev.parent = &ntb->pdev->dev;
+	ntb->dev.release = ntb_dev_release;
+	dev_set_name(&ntb->dev, pci_name(ntb->pdev));
+
+	ntb->ctx = NULL;
+	ntb->ctx_ops = NULL;
+	spin_lock_init(&ntb->ctx_lock);
+
+	return device_register(&ntb->dev);
+}
+EXPORT_SYMBOL(ntb_register_device);
+
+void ntb_unregister_device(struct ntb_dev *ntb)
+{
+	device_unregister(&ntb->dev);
+	wait_for_completion(&ntb->released);
+}
+EXPORT_SYMBOL(ntb_unregister_device);
+
+int ntb_set_ctx(struct ntb_dev *ntb, void *ctx,
+		const struct ntb_ctx_ops *ctx_ops)
+{
+	unsigned long irqflags;
+
+	if (!ntb_ctx_ops_is_valid(ctx_ops))
+		return -EINVAL;
+	if (ntb->ctx_ops)
+		return -EINVAL;
+
+	spin_lock_irqsave(&ntb->ctx_lock, irqflags);
+	{
+		ntb->ctx = ctx;
+		ntb->ctx_ops = ctx_ops;
+	}
+	spin_unlock_irqrestore(&ntb->ctx_lock, irqflags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_set_ctx);
+
+void ntb_clear_ctx(struct ntb_dev *ntb)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&ntb->ctx_lock, irqflags);
+	{
+		ntb->ctx_ops = NULL;
+		ntb->ctx = NULL;
+	}
+	spin_unlock_irqrestore(&ntb->ctx_lock, irqflags);
+}
+EXPORT_SYMBOL(ntb_clear_ctx);
+
+void ntb_link_event(struct ntb_dev *ntb)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&ntb->ctx_lock, irqflags);
+	{
+		if (ntb->ctx_ops && ntb->ctx_ops->link_event)
+			ntb->ctx_ops->link_event(ntb->ctx);
+	}
+	spin_unlock_irqrestore(&ntb->ctx_lock, irqflags);
+}
+EXPORT_SYMBOL(ntb_link_event);
+
+void ntb_db_event(struct ntb_dev *ntb, int vector)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&ntb->ctx_lock, irqflags);
+	{
+		if (ntb->ctx_ops && ntb->ctx_ops->db_event)
+			ntb->ctx_ops->db_event(ntb->ctx, vector);
+	}
+	spin_unlock_irqrestore(&ntb->ctx_lock, irqflags);
+}
+EXPORT_SYMBOL(ntb_db_event);
+
+static int ntb_probe(struct device *dev)
+{
+	struct ntb_dev *ntb;
+	struct ntb_client *client;
+	int rc;
+
+	get_device(dev);
+	ntb = dev_ntb(dev);
+	client = drv_ntb_client(dev->driver);
+
+	rc = client->ops.probe(client, ntb);
+	if (rc)
+		put_device(dev);
+
+	return rc;
+}
+
+static int ntb_remove(struct device *dev)
+{
+	struct ntb_dev *ntb;
+	struct ntb_client *client;
+
+	if (dev->driver) {
+		ntb = dev_ntb(dev);
+		client = drv_ntb_client(dev->driver);
+
+		client->ops.remove(client, ntb);
+		put_device(dev);
+	}
+
+	return 0;
+}
+
+static void ntb_dev_release(struct device *dev)
+{
+	struct ntb_dev *ntb = dev_ntb(dev);
+
+	complete(&ntb->released);
+}
+
+static struct bus_type ntb_bus = {
+	.name = "ntb",
+	.probe = ntb_probe,
+	.remove = ntb_remove,
+};
+
+static int __init ntb_driver_init(void)
+{
+	return bus_register(&ntb_bus);
+}
+module_init(ntb_driver_init);
+
+static void __exit ntb_driver_exit(void)
+{
+	bus_unregister(&ntb_bus);
+}
+module_exit(ntb_driver_exit);
+
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
new file mode 100644
index 000000000000..b02f72bb8e32
--- /dev/null
+++ b/include/linux/ntb.h
@@ -0,0 +1,984 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Allen Hubbe <Allen.Hubbe@emc.com>
+ */
+
+#ifndef _NTB_H_
+#define _NTB_H_
+
+#include <linux/completion.h>
+#include <linux/device.h>
+
+struct ntb_client;
+struct ntb_dev;
+struct pci_dev;
+
+/**
+ * enum ntb_topo - NTB connection topology
+ * @NTB_TOPO_NONE:	Topology is unknown or invalid.
+ * @NTB_TOPO_PRI:	On primary side of local ntb.
+ * @NTB_TOPO_SEC:	On secondary side of remote ntb.
+ * @NTB_TOPO_B2B_USD:	On primary side of local ntb upstream of remote ntb.
+ * @NTB_TOPO_B2B_DSD:	On primary side of local ntb downstream of remote ntb.
+ */
+enum ntb_topo {
+	NTB_TOPO_NONE = -1,
+	NTB_TOPO_PRI,
+	NTB_TOPO_SEC,
+	NTB_TOPO_B2B_USD,
+	NTB_TOPO_B2B_DSD,
+};
+
+static inline int ntb_topo_is_b2b(enum ntb_topo topo)
+{
+	switch ((int)topo) {
+	case NTB_TOPO_B2B_USD:
+	case NTB_TOPO_B2B_DSD:
+		return 1;
+	}
+	return 0;
+}
+
+static inline char *ntb_topo_string(enum ntb_topo topo)
+{
+	switch (topo) {
+	case NTB_TOPO_NONE:	return "NTB_TOPO_NONE";
+	case NTB_TOPO_PRI:	return "NTB_TOPO_PRI";
+	case NTB_TOPO_SEC:	return "NTB_TOPO_SEC";
+	case NTB_TOPO_B2B_USD:	return "NTB_TOPO_B2B_USD";
+	case NTB_TOPO_B2B_DSD:	return "NTB_TOPO_B2B_DSD";
+	}
+	return "NTB_TOPO_INVALID";
+}
+
+/**
+ * enum ntb_speed - NTB link training speed
+ * @NTB_SPEED_AUTO:	Request the max supported speed.
+ * @NTB_SPEED_NONE:	Link is not trained to any speed.
+ * @NTB_SPEED_GEN1:	Link is trained to gen1 speed.
+ * @NTB_SPEED_GEN2:	Link is trained to gen2 speed.
+ * @NTB_SPEED_GEN3:	Link is trained to gen3 speed.
+ */
+enum ntb_speed {
+	NTB_SPEED_AUTO = -1,
+	NTB_SPEED_NONE = 0,
+	NTB_SPEED_GEN1 = 1,
+	NTB_SPEED_GEN2 = 2,
+	NTB_SPEED_GEN3 = 3,
+};
+
+/**
+ * enum ntb_width - NTB link training width
+ * @NTB_WIDTH_AUTO:	Request the max supported width.
+ * @NTB_WIDTH_NONE:	Link is not trained to any width.
+ * @NTB_WIDTH_1:	Link is trained to 1 lane width.
+ * @NTB_WIDTH_2:	Link is trained to 2 lane width.
+ * @NTB_WIDTH_4:	Link is trained to 4 lane width.
+ * @NTB_WIDTH_8:	Link is trained to 8 lane width.
+ * @NTB_WIDTH_12:	Link is trained to 12 lane width.
+ * @NTB_WIDTH_16:	Link is trained to 16 lane width.
+ * @NTB_WIDTH_32:	Link is trained to 32 lane width.
+ */
+enum ntb_width {
+	NTB_WIDTH_AUTO = -1,
+	NTB_WIDTH_NONE = 0,
+	NTB_WIDTH_1 = 1,
+	NTB_WIDTH_2 = 2,
+	NTB_WIDTH_4 = 4,
+	NTB_WIDTH_8 = 8,
+	NTB_WIDTH_12 = 12,
+	NTB_WIDTH_16 = 16,
+	NTB_WIDTH_32 = 32,
+};
+
+/**
+ * struct ntb_client_ops - ntb client operations
+ * @probe:		Notify client of a new device.
+ * @remove:		Notify client to remove a device.
+ */
+struct ntb_client_ops {
+	int (*probe)(struct ntb_client *client, struct ntb_dev *ntb);
+	void (*remove)(struct ntb_client *client, struct ntb_dev *ntb);
+};
+
+static inline int ntb_client_ops_is_valid(const struct ntb_client_ops *ops)
+{
+	/* commented callbacks are not required: */
+	return
+		ops->probe			&&
+		ops->remove			&&
+		1;
+}
+
+/**
+ * struct ntb_ctx_ops - ntb driver context operations
+ * @link_event:		See ntb_link_event().
+ * @db_event:		See ntb_db_event().
+ */
+struct ntb_ctx_ops {
+	void (*link_event)(void *ctx);
+	void (*db_event)(void *ctx, int db_vector);
+};
+
+static inline int ntb_ctx_ops_is_valid(const struct ntb_ctx_ops *ops)
+{
+	/* commented callbacks are not required: */
+	return
+		/* ops->link_event		&& */
+		/* ops->db_event		&& */
+		1;
+}
+
+/**
+ * struct ntb_ctx_ops - ntb device operations
+ * @mw_count:		See ntb_mw_count().
+ * @mw_get_range:	See ntb_mw_get_range().
+ * @mw_set_trans:	See ntb_mw_set_trans().
+ * @mw_clear_trans:	See ntb_mw_clear_trans().
+ * @link_is_up:		See ntb_link_is_up().
+ * @link_enable:	See ntb_link_enable().
+ * @link_disable:	See ntb_link_disable().
+ * @db_is_unsafe:	See ntb_db_is_unsafe().
+ * @db_valid_mask:	See ntb_db_valid_mask().
+ * @db_vector_count:	See ntb_db_vector_count().
+ * @db_vector_mask:	See ntb_db_vector_mask().
+ * @db_read:		See ntb_db_read().
+ * @db_set:		See ntb_db_set().
+ * @db_clear:		See ntb_db_clear().
+ * @db_read_mask:	See ntb_db_read_mask().
+ * @db_set_mask:	See ntb_db_set_mask().
+ * @db_clear_mask:	See ntb_db_clear_mask().
+ * @peer_db_addr:	See ntb_peer_db_addr().
+ * @peer_db_read:	See ntb_peer_db_read().
+ * @peer_db_set:	See ntb_peer_db_set().
+ * @peer_db_clear:	See ntb_peer_db_clear().
+ * @peer_db_read_mask:	See ntb_peer_db_read_mask().
+ * @peer_db_set_mask:	See ntb_peer_db_set_mask().
+ * @peer_db_clear_mask:	See ntb_peer_db_clear_mask().
+ * @spad_is_unsafe:	See ntb_spad_is_unsafe().
+ * @spad_count:		See ntb_spad_count().
+ * @spad_read:		See ntb_spad_read().
+ * @spad_write:		See ntb_spad_write().
+ * @peer_spad_addr:	See ntb_peer_spad_addr().
+ * @peer_spad_read:	See ntb_peer_spad_read().
+ * @peer_spad_write:	See ntb_peer_spad_write().
+ */
+struct ntb_dev_ops {
+	int (*mw_count)(struct ntb_dev *ntb);
+	int (*mw_get_range)(struct ntb_dev *ntb, int idx,
+			    phys_addr_t *base, resource_size_t *size,
+			resource_size_t *align, resource_size_t *align_size);
+	int (*mw_set_trans)(struct ntb_dev *ntb, int idx,
+			    dma_addr_t addr, resource_size_t size);
+	int (*mw_clear_trans)(struct ntb_dev *ntb, int idx);
+
+	int (*link_is_up)(struct ntb_dev *ntb,
+			  enum ntb_speed *speed, enum ntb_width *width);
+	int (*link_enable)(struct ntb_dev *ntb,
+			   enum ntb_speed max_speed, enum ntb_width max_width);
+	int (*link_disable)(struct ntb_dev *ntb);
+
+	int (*db_is_unsafe)(struct ntb_dev *ntb);
+	u64 (*db_valid_mask)(struct ntb_dev *ntb);
+	int (*db_vector_count)(struct ntb_dev *ntb);
+	u64 (*db_vector_mask)(struct ntb_dev *ntb, int db_vector);
+
+	u64 (*db_read)(struct ntb_dev *ntb);
+	int (*db_set)(struct ntb_dev *ntb, u64 db_bits);
+	int (*db_clear)(struct ntb_dev *ntb, u64 db_bits);
+
+	u64 (*db_read_mask)(struct ntb_dev *ntb);
+	int (*db_set_mask)(struct ntb_dev *ntb, u64 db_bits);
+	int (*db_clear_mask)(struct ntb_dev *ntb, u64 db_bits);
+
+	int (*peer_db_addr)(struct ntb_dev *ntb,
+			    phys_addr_t *db_addr, resource_size_t *db_size);
+	u64 (*peer_db_read)(struct ntb_dev *ntb);
+	int (*peer_db_set)(struct ntb_dev *ntb, u64 db_bits);
+	int (*peer_db_clear)(struct ntb_dev *ntb, u64 db_bits);
+
+	u64 (*peer_db_read_mask)(struct ntb_dev *ntb);
+	int (*peer_db_set_mask)(struct ntb_dev *ntb, u64 db_bits);
+	int (*peer_db_clear_mask)(struct ntb_dev *ntb, u64 db_bits);
+
+	int (*spad_is_unsafe)(struct ntb_dev *ntb);
+	int (*spad_count)(struct ntb_dev *ntb);
+
+	u32 (*spad_read)(struct ntb_dev *ntb, int idx);
+	int (*spad_write)(struct ntb_dev *ntb, int idx, u32 val);
+
+	int (*peer_spad_addr)(struct ntb_dev *ntb, int idx,
+			      phys_addr_t *spad_addr);
+	u32 (*peer_spad_read)(struct ntb_dev *ntb, int idx);
+	int (*peer_spad_write)(struct ntb_dev *ntb, int idx, u32 val);
+};
+
+static inline int ntb_dev_ops_is_valid(const struct ntb_dev_ops *ops)
+{
+	/* commented callbacks are not required: */
+	return
+		ops->mw_count				&&
+		ops->mw_get_range			&&
+		ops->mw_set_trans			&&
+		/* ops->mw_clear_trans			&& */
+		ops->link_is_up				&&
+		ops->link_enable			&&
+		ops->link_disable			&&
+		/* ops->db_is_unsafe			&& */
+		ops->db_valid_mask			&&
+
+		/* both set, or both unset */
+		(!ops->db_vector_count == !ops->db_vector_mask) &&
+
+		ops->db_read				&&
+		/* ops->db_set				&& */
+		ops->db_clear				&&
+		/* ops->db_read_mask			&& */
+		ops->db_set_mask			&&
+		ops->db_clear_mask			&&
+		ops->peer_db_addr			&&
+		/* ops->peer_db_read			&& */
+		ops->peer_db_set			&&
+		/* ops->peer_db_clear			&& */
+		/* ops->peer_db_read_mask		&& */
+		/* ops->peer_db_set_mask		&& */
+		/* ops->peer_db_clear_mask		&& */
+		/* ops->spad_is_unsafe			&& */
+		ops->spad_count				&&
+		ops->spad_read				&&
+		ops->spad_write				&&
+		ops->peer_spad_addr			&&
+		/* ops->peer_spad_read			&& */
+		ops->peer_spad_write			&&
+		1;
+}
+
+/**
+ * struct ntb_client - client interested in ntb devices
+ * @drv:		Linux driver object.
+ * @ops:		See &ntb_client_ops.
+ */
+struct ntb_client {
+	struct device_driver		drv;
+	const struct ntb_client_ops	ops;
+};
+
+#define drv_ntb_client(__drv) container_of((__drv), struct ntb_client, drv)
+
+/**
+ * struct ntb_device - ntb device
+ * @dev:		Linux device object.
+ * @pdev:		Pci device entry of the ntb.
+ * @topo:		Detected topology of the ntb.
+ * @ops:		See &ntb_dev_ops.
+ * @ctx:		See &ntb_ctx_ops.
+ * @ctx_ops:		See &ntb_ctx_ops.
+ */
+struct ntb_dev {
+	struct device			dev;
+	struct pci_dev			*pdev;
+	enum ntb_topo			topo;
+	const struct ntb_dev_ops	*ops;
+	void				*ctx;
+	const struct ntb_ctx_ops	*ctx_ops;
+
+	/* private: */
+
+	/* synchronize setting, clearing, and calling ctx_ops */
+	spinlock_t			ctx_lock;
+	/* block unregister until device is fully released */
+	struct completion		released;
+};
+
+#define dev_ntb(__dev) container_of((__dev), struct ntb_dev, dev)
+
+/**
+ * ntb_register_client() - register a client for interest in ntb devices
+ * @client:	Client context.
+ *
+ * The client will be added to the list of clients interested in ntb devices.
+ * The client will be notified of any ntb devices that are not already
+ * associated with a client, or if ntb devices are registered later.
+ *
+ * Return: Zero if the client is registered, otherwise an error number.
+ */
+#define ntb_register_client(client) \
+	__ntb_register_client((client), THIS_MODULE, KBUILD_MODNAME)
+
+int __ntb_register_client(struct ntb_client *client, struct module *mod,
+			  const char *mod_name);
+
+/**
+ * ntb_unregister_client() - unregister a client for interest in ntb devices
+ * @client:	Client context.
+ *
+ * The client will be removed from the list of clients interested in ntb
+ * devices.  If any ntb devices are associated with the client, the client will
+ * be notified to remove those devices.
+ */
+void ntb_unregister_client(struct ntb_client *client);
+
+#define module_ntb_client(__ntb_client) \
+	module_driver(__ntb_client, ntb_register_client, \
+			ntb_unregister_client)
+
+/**
+ * ntb_register_device() - register a ntb device
+ * @ntb:	NTB device context.
+ *
+ * The device will be added to the list of ntb devices.  If any clients are
+ * interested in ntb devices, each client will be notified of the ntb device,
+ * until at most one client accepts the device.
+ *
+ * Return: Zero if the device is registered, otherwise an error number.
+ */
+int ntb_register_device(struct ntb_dev *ntb);
+
+/**
+ * ntb_register_device() - unregister a ntb device
+ * @ntb:	NTB device context.
+ *
+ * The device will be removed from the list of ntb devices.  If the ntb device
+ * is associated with a client, the client will be notified to remove the
+ * device.
+ */
+void ntb_unregister_device(struct ntb_dev *ntb);
+
+/**
+ * ntb_set_ctx() - associate a driver context with an ntb device
+ * @ntb:	NTB device context.
+ * @ctx:	Driver context.
+ * @ctx_ops:	Driver context operations.
+ *
+ * Associate a driver context and operations with a ntb device.  The context is
+ * provided by the client driver, and the driver may associate a different
+ * context with each ntb device.
+ *
+ * Return: Zero if the context is associated, otherwise an error number.
+ */
+int ntb_set_ctx(struct ntb_dev *ntb, void *ctx,
+		const struct ntb_ctx_ops *ctx_ops);
+
+/**
+ * ntb_clear_ctx() - disassociate any driver context from an ntb device
+ * @ntb:	NTB device context.
+ *
+ * Clear any association that may exist between a driver context and the ntb
+ * device.
+ */
+void ntb_clear_ctx(struct ntb_dev *ntb);
+
+/**
+ * ntb_link_event() - notify driver context of a change in link status
+ * @ntb:	NTB device context.
+ *
+ * Notify the driver context that the link status may have changed.  The driver
+ * should call ntb_link_is_up() to get the current status.
+ */
+void ntb_link_event(struct ntb_dev *ntb);
+
+/**
+ * ntb_db_event() - notify driver context of a doorbell event
+ * @ntb:	NTB device context.
+ * @vector:	Interrupt vector number.
+ *
+ * Notify the driver context of a doorbell event.  If hardware supports
+ * multiple interrupt vectors for doorbells, the vector number indicates which
+ * vector received the interrupt.  The vector number is relative to the first
+ * vector used for doorbells, starting at zero, and must be less than
+ ** ntb_db_vector_count().  The driver may call ntb_db_read() to check which
+ * doorbell bits need service, and ntb_db_vector_mask() to determine which of
+ * those bits are associated with the vector number.
+ */
+void ntb_db_event(struct ntb_dev *ntb, int vector);
+
+/**
+ * ntb_mw_count() - get the number of memory windows
+ * @ntb:	NTB device context.
+ *
+ * Hardware and topology may support a different number of memory windows.
+ *
+ * Return: the number of memory windows.
+ */
+static inline int ntb_mw_count(struct ntb_dev *ntb)
+{
+	return ntb->ops->mw_count(ntb);
+}
+
+/**
+ * ntb_mw_get_range() - get the range of a memory window
+ * @ntb:	NTB device context.
+ * @idx:	Memory window number.
+ * @base:	OUT - the base address for mapping the memory window
+ * @size:	OUT - the size for mapping the memory window
+ * @align:	OUT - the base alignment for translating the memory window
+ * @align_size:	OUT - the size alignment for translating the memory window
+ *
+ * Get the range of a memory window.  NULL may be given for any output
+ * parameter if the value is not needed.  The base and size may be used for
+ * mapping the memory window, to access the peer memory.  The alignment and
+ * size may be used for translating the memory window, for the peer to access
+ * memory on the local system.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_mw_get_range(struct ntb_dev *ntb, int idx,
+				   phys_addr_t *base, resource_size_t *size,
+		resource_size_t *align, resource_size_t *align_size)
+{
+	return ntb->ops->mw_get_range(ntb, idx, base, size,
+			align, align_size);
+}
+
+/**
+ * ntb_mw_set_trans() - set the translation of a memory window
+ * @ntb:	NTB device context.
+ * @idx:	Memory window number.
+ * @addr:	The dma address local memory to expose to the peer.
+ * @size:	The size of the local memory to expose to the peer.
+ *
+ * Set the translation of a memory window.  The peer may access local memory
+ * through the window starting at the address, up to the size.  The address
+ * must be aligned to the alignment specified by ntb_mw_get_range().  The size
+ * must be aligned to the size alignment specified by ntb_mw_get_range().
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
+				   dma_addr_t addr, resource_size_t size)
+{
+	return ntb->ops->mw_set_trans(ntb, idx, addr, size);
+}
+
+/**
+ * ntb_mw_clear_trans() - clear the translation of a memory window
+ * @ntb:	NTB device context.
+ * @idx:	Memory window number.
+ *
+ * Clear the translation of a memory window.  The peer may no longer access
+ * local memory through the window.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_mw_clear_trans(struct ntb_dev *ntb, int idx)
+{
+	if (!ntb->ops->mw_clear_trans)
+		return ntb->ops->mw_set_trans(ntb, idx, 0, 0);
+
+	return ntb->ops->mw_clear_trans(ntb, idx);
+}
+
+/**
+ * ntb_link_is_up() - get the current ntb link state
+ * @ntb:	NTB device context.
+ * @speed:	OUT - The link speed expressed as PCIe generation number.
+ * @width:	OUT - The link width expressed as the number of PCIe lanes.
+ *
+ * Set the translation of a memory window.  The peer may access local memory
+ * through the window starting at the address, up to the size.  The address
+ * must be aligned to the alignment specified by ntb_mw_get_range().  The size
+ * must be aligned to the size alignment specified by ntb_mw_get_range().
+ *
+ * Return: One if the link is up, zero if the link is down, otherwise a
+ *		negative value indicating the error number.
+ */
+static inline int ntb_link_is_up(struct ntb_dev *ntb,
+				 enum ntb_speed *speed, enum ntb_width *width)
+{
+	return ntb->ops->link_is_up(ntb, speed, width);
+}
+
+/**
+ * ntb_link_enable() - enable the link on the secondary side of the ntb
+ * @ntb:	NTB device context.
+ * @max_speed:	The maximum link speed expressed as PCIe generation number.
+ * @max_width:	The maximum link width expressed as the number of PCIe lanes.
+ *
+ * Enable the link on the secondary side of the ntb.  This can only be done
+ * from the primary side of the ntb in primary or b2b topology.  The ntb device
+ * should train the link to its maximum speed and width, or the requested speed
+ * and width, whichever is smaller, if supported.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_link_enable(struct ntb_dev *ntb,
+				  enum ntb_speed max_speed,
+				  enum ntb_width max_width)
+{
+	return ntb->ops->link_enable(ntb, max_speed, max_width);
+}
+
+/**
+ * ntb_link_disable() - disable the link on the secondary side of the ntb
+ * @ntb:	NTB device context.
+ *
+ * Disable the link on the secondary side of the ntb.  This can only be
+ * done from the primary side of the ntb in primary or b2b topology.  The ntb
+ * device should disable the link.  Returning from this call must indicate that
+ * a barrier has passed, though with no more writes may pass in either
+ * direction across the link, except if this call returns an error number.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_link_disable(struct ntb_dev *ntb)
+{
+	return ntb->ops->link_disable(ntb);
+}
+
+/**
+ * ntb_db_is_unsafe() - check if it is safe to use hardware doorbell
+ * @ntb:	NTB device context.
+ *
+ * It is possible for some ntb hardware to be affected by errata.  Hardware
+ * drivers can advise clients to avoid using doorbells.  Clients may ignore
+ * this advice, though caution is recommended.
+ *
+ * Return: Zero if it is safe to use doorbells, or One if it is not safe.
+ */
+static inline int ntb_db_is_unsafe(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->db_is_unsafe)
+		return 0;
+
+	return ntb->ops->db_is_unsafe(ntb);
+}
+
+/**
+ * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb
+ * @ntb:	NTB device context.
+ *
+ * Hardware may support different number or arrangement of doorbell bits.
+ *
+ * Return: A mask of doorbell bits supported by the ntb.
+ */
+static inline u64 ntb_db_valid_mask(struct ntb_dev *ntb)
+{
+	return ntb->ops->db_valid_mask(ntb);
+}
+
+/**
+ * ntb_db_vector_count() - get the number of doorbell interrupt vectors
+ * @ntb:	NTB device context.
+ *
+ * Hardware may support different number of interrupt vectors.
+ *
+ * Return: The number of doorbell interrupt vectors.
+ */
+static inline int ntb_db_vector_count(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->db_vector_count)
+		return 1;
+
+	return ntb->ops->db_vector_count(ntb);
+}
+
+/**
+ * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector
+ * @ntb:	NTB device context.
+ * @vector:	Doorbell vector number.
+ *
+ * Each interrupt vector may have a different number or arrangement of bits.
+ *
+ * Return: A mask of doorbell bits serviced by a vector.
+ */
+static inline u64 ntb_db_vector_mask(struct ntb_dev *ntb, int vector)
+{
+	if (!ntb->ops->db_vector_mask)
+		return ntb_db_valid_mask(ntb);
+
+	return ntb->ops->db_vector_mask(ntb, vector);
+}
+
+/**
+ * ntb_db_read() - read the local doorbell register
+ * @ntb:	NTB device context.
+ *
+ * Read the local doorbell register, and return the bits that are set.
+ *
+ * Return: The bits currently set in the local doorbell register.
+ */
+static inline u64 ntb_db_read(struct ntb_dev *ntb)
+{
+	return ntb->ops->db_read(ntb);
+}
+
+/**
+ * ntb_db_set() - set bits in the local doorbell register
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to set.
+ *
+ * Set bits in the local doorbell register, which may generate a local doorbell
+ * interrupt.  Bits that were already set must remain set.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_db_set(struct ntb_dev *ntb, u64 db_bits)
+{
+	if (!ntb->ops->db_set)
+		return -EINVAL;
+
+	return ntb->ops->db_set(ntb, db_bits);
+}
+
+/**
+ * ntb_db_clear() - clear bits in the local doorbell register
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to clear.
+ *
+ * Clear bits in the local doorbell register, arming the bits for the next
+ * doorbell.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+	return ntb->ops->db_clear(ntb, db_bits);
+}
+
+/**
+ * ntb_db_read_mask() - read the local doorbell mask
+ * @ntb:	NTB device context.
+ *
+ * Read the local doorbell mask register, and return the bits that are set.
+ *
+ * This is unusual, though hardware is likely to support it.
+ *
+ * Return: The bits currently set in the local doorbell mask register.
+ */
+static inline u64 ntb_db_read_mask(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->db_read_mask)
+		return 0;
+
+	return ntb->ops->db_read_mask(ntb);
+}
+
+/**
+ * ntb_db_set_mask() - set bits in the local doorbell mask
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell mask bits to set.
+ *
+ * Set bits in the local doorbell mask register, preventing doorbell interrupts
+ * from being generated for those doorbell bits.  Bits that were already set
+ * must remain set.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	return ntb->ops->db_set_mask(ntb, db_bits);
+}
+
+/**
+ * ntb_db_clear_mask() - clear bits in the local doorbell mask
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to clear.
+ *
+ * Clear bits in the local doorbell mask register, allowing doorbell interrupts
+ * from being generated for those doorbell bits.  If a doorbell bit is already
+ * set at the time the mask is cleared, and the corresponding mask bit is
+ * changed from set to clear, then the ntb driver must ensure that
+ * ntb_db_event() is called.  If the hardware does not generate the interrupt
+ * on clearing the mask bit, then the driver must call ntb_db_event() anyway.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	return ntb->ops->db_clear_mask(ntb, db_bits);
+}
+
+/**
+ * ntb_peer_db_addr() - address and size of the peer doorbell register
+ * @ntb:	NTB device context.
+ * @db_addr:	OUT - The address of the peer doorbell register.
+ * @db_size:	OUT - The number of bytes to write the peer doorbell register.
+ *
+ * Return the address of the peer doorbell register.  This may be used, for
+ * example, by drivers that offload memory copy operations to a dma engine.
+ * The drivers may wish to ring the peer doorbell at the completion of memory
+ * copy operations.  For efficiency, and to simplify ordering of operations
+ * between the dma memory copies and the ringing doorbell, the driver may
+ * append one additional dma memory copy with the doorbell register as the
+ * destination, after the memory copy operations.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_addr(struct ntb_dev *ntb,
+				   phys_addr_t *db_addr,
+				   resource_size_t *db_size)
+{
+	return ntb->ops->peer_db_addr(ntb, db_addr, db_size);
+}
+
+/**
+ * ntb_peer_db_read() - read the peer doorbell register
+ * @ntb:	NTB device context.
+ *
+ * Read the peer doorbell register, and return the bits that are set.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: The bits currently set in the peer doorbell register.
+ */
+static inline u64 ntb_peer_db_read(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->peer_db_read)
+		return 0;
+
+	return ntb->ops->peer_db_read(ntb);
+}
+
+/**
+ * ntb_peer_db_set() - set bits in the peer doorbell register
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to set.
+ *
+ * Set bits in the peer doorbell register, which may generate a peer doorbell
+ * interrupt.  Bits that were already set must remain set.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
+{
+	return ntb->ops->peer_db_set(ntb, db_bits);
+}
+
+/**
+ * ntb_peer_db_clear() - clear bits in the local doorbell register
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to clear.
+ *
+ * Clear bits in the peer doorbell register, arming the bits for the next
+ * doorbell.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+	if (!ntb->ops->db_clear)
+		return -EINVAL;
+
+	return ntb->ops->peer_db_clear(ntb, db_bits);
+}
+
+/**
+ * ntb_peer_db_read_mask() - read the peer doorbell mask
+ * @ntb:	NTB device context.
+ *
+ * Read the peer doorbell mask register, and return the bits that are set.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: The bits currently set in the peer doorbell mask register.
+ */
+static inline u64 ntb_peer_db_read_mask(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->db_read_mask)
+		return 0;
+
+	return ntb->ops->peer_db_read_mask(ntb);
+}
+
+/**
+ * ntb_peer_db_set_mask() - set bits in the peer doorbell mask
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell mask bits to set.
+ *
+ * Set bits in the peer doorbell mask register, preventing doorbell interrupts
+ * from being generated for those doorbell bits.  Bits that were already set
+ * must remain set.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	if (!ntb->ops->db_set_mask)
+		return -EINVAL;
+
+	return ntb->ops->peer_db_set_mask(ntb, db_bits);
+}
+
+/**
+ * ntb_peer_db_clear_mask() - clear bits in the peer doorbell mask
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to clear.
+ *
+ * Clear bits in the peer doorbell mask register, allowing doorbell interrupts
+ * from being generated for those doorbell bits.  If the hardware does not
+ * generate the interrupt on clearing the mask bit, then the driver should not
+ * implement this function!
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	if (!ntb->ops->db_clear_mask)
+		return -EINVAL;
+
+	return ntb->ops->peer_db_clear_mask(ntb, db_bits);
+}
+
+/**
+ * ntb_spad_is_unsafe() - check if it is safe to use the hardware scratchpads
+ * @ntb:	NTB device context.
+ *
+ * It is possible for some ntb hardware to be affected by errata.  Hardware
+ * drivers can advise clients to avoid using scratchpads.  Clients may ignore
+ * this advice, though caution is recommended.
+ *
+ * Return: Zero if it is safe to use scratchpads, or One if it is not safe.
+ */
+static inline int ntb_spad_is_unsafe(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->spad_is_unsafe)
+		return 0;
+
+	return ntb->ops->spad_is_unsafe(ntb);
+}
+
+/**
+ * ntb_mw_count() - get the number of scratchpads
+ * @ntb:	NTB device context.
+ *
+ * Hardware and topology may support a different number of scratchpads.
+ *
+ * Return: the number of scratchpads.
+ */
+static inline int ntb_spad_count(struct ntb_dev *ntb)
+{
+	return ntb->ops->spad_count(ntb);
+}
+
+/**
+ * ntb_spad_read() - read the local scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ *
+ * Read the local scratchpad register, and return the value.
+ *
+ * Return: The value of the local scratchpad register.
+ */
+static inline u32 ntb_spad_read(struct ntb_dev *ntb, int idx)
+{
+	return ntb->ops->spad_read(ntb, idx);
+}
+
+/**
+ * ntb_spad_write() - write the local scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ * @val:	Scratchpad value.
+ *
+ * Write the value to the local scratchpad register.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val)
+{
+	return ntb->ops->spad_write(ntb, idx, val);
+}
+
+/**
+ * ntb_peer_spad_addr() - address of the peer scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ * @spad_addr:	OUT - The address of the peer scratchpad register.
+ *
+ * Return the address of the peer doorbell register.  This may be used, for
+ * example, by drivers that offload memory copy operations to a dma engine.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_spad_addr(struct ntb_dev *ntb, int idx,
+				     phys_addr_t *spad_addr)
+{
+	return ntb->ops->peer_spad_addr(ntb, idx, spad_addr);
+}
+
+/**
+ * ntb_peer_spad_read() - read the peer scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ *
+ * Read the peer scratchpad register, and return the value.
+ *
+ * Return: The value of the local scratchpad register.
+ */
+static inline u32 ntb_peer_spad_read(struct ntb_dev *ntb, int idx)
+{
+	return ntb->ops->peer_spad_read(ntb, idx);
+}
+
+/**
+ * ntb_peer_spad_write() - write the peer scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ * @val:	Scratchpad value.
+ *
+ * Write the value to the peer scratchpad register.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_spad_write(struct ntb_dev *ntb, int idx, u32 val)
+{
+	return ntb->ops->peer_spad_write(ntb, idx, val);
+}
+
+#endif
-- 
cgit v1.2.3


From e26a5843f7f5014ae4460030ca4de029a3ac35d3 Mon Sep 17 00:00:00 2001
From: Allen Hubbe <Allen.Hubbe@emc.com>
Date: Thu, 9 Apr 2015 10:33:20 -0400
Subject: NTB: Split ntb_hw_intel and ntb_transport drivers

Change ntb_hw_intel to use the new NTB hardware abstraction layer.

Split ntb_transport into its own driver.  Change it to use the new NTB
hardware abstraction layer.

Signed-off-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 Documentation/ntb.txt               |   26 +
 MAINTAINERS                         |    8 +
 drivers/net/ntb_netdev.c            |   54 +-
 drivers/ntb/Kconfig                 |   37 +-
 drivers/ntb/Makefile                |    6 +-
 drivers/ntb/hw/Kconfig              |    1 +
 drivers/ntb/hw/Makefile             |    1 +
 drivers/ntb/hw/intel/Kconfig        |    7 +
 drivers/ntb/hw/intel/Makefile       |    1 +
 drivers/ntb/hw/intel/ntb_hw_intel.c | 3075 +++++++++++++++++++----------------
 drivers/ntb/hw/intel/ntb_hw_intel.h |  607 ++++---
 drivers/ntb/ntb_transport.c         |  936 ++++++-----
 include/linux/ntb_transport.h       |   25 +-
 13 files changed, 2589 insertions(+), 2195 deletions(-)
 create mode 100644 drivers/ntb/hw/Kconfig
 create mode 100644 drivers/ntb/hw/Makefile
 create mode 100644 drivers/ntb/hw/intel/Kconfig
 create mode 100644 drivers/ntb/hw/intel/Makefile

(limited to 'include/linux')

diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt
index 9d46dc9712a8..725ba1e6c127 100644
--- a/Documentation/ntb.txt
+++ b/Documentation/ntb.txt
@@ -26,7 +26,33 @@ as ntb hardware, or hardware drivers, are inserted and removed.  The
 registration uses the Linux Device framework, so it should feel familiar to
 anyone who has written a pci driver.
 
+### NTB Transport Client (ntb\_transport) and NTB Netdev (ntb\_netdev)
+
+The primary client for NTB is the Transport client, used in tandem with NTB
+Netdev.  These drivers function together to create a logical link to the peer,
+across the ntb, to exchange packets of network data.  The Transport client
+establishes a logical link to the peer, and creates queue pairs to exchange
+messages and data.  The NTB Netdev then creates an ethernet device using a
+Transport queue pair.  Network data is copied between socket buffers and the
+Transport queue pair buffer.  The Transport client may be used for other things
+besides Netdev, however no other applications have yet been written.
+
 ## NTB Hardware Drivers
 
 NTB hardware drivers should register devices with the NTB core driver.  After
 registering, clients probe and remove functions will be called.
+
+### NTB Intel Hardware Driver (ntb\_hw\_intel)
+
+The Intel hardware driver supports NTB on Xeon and Atom CPUs.
+
+Module Parameters:
+
+* b2b\_mw\_idx - If the peer ntb is to be accessed via a memory window, then use
+	this memory window to access the peer ntb.  A value of zero or positive
+	starts from the first mw idx, and a negative value starts from the last
+	mw idx.  Both sides MUST set the same value here!  The default value is
+	`-1`.
+* b2b\_mw\_share - If the peer ntb is to be accessed via a memory window, and if
+	the memory window is large enough, still allow the client to use the
+	second half of the memory window for address translation to the peer.
diff --git a/MAINTAINERS b/MAINTAINERS
index e2fc9eec2e16..a682805d6d56 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7007,6 +7007,14 @@ F:	drivers/net/ntb_netdev.c
 F:	include/linux/ntb.h
 F:	include/linux/ntb_transport.h
 
+NTB INTEL DRIVER
+M:	Jon Mason <jdmason@kudzu.us>
+M:	Dave Jiang <dave.jiang@intel.com>
+S:	Supported
+W:	https://github.com/jonmason/ntb/wiki
+T:	git git://github.com/jonmason/ntb.git
+F:	drivers/ntb/hw/intel/
+
 NTFS FILESYSTEM
 M:	Anton Altaparmakov <anton@tuxera.com>
 L:	linux-ntfs-dev@lists.sourceforge.net
diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c
index 6d3bfa62f5ec..3cc316cb7e6b 100644
--- a/drivers/net/ntb_netdev.c
+++ b/drivers/net/ntb_netdev.c
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -40,7 +42,7 @@
  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * Intel PCIe NTB Network Linux driver
+ * PCIe NTB Network Linux driver
  *
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
@@ -49,6 +51,7 @@
 #include <linux/ethtool.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/ntb.h>
 #include <linux/ntb_transport.h>
 
 #define NTB_NETDEV_VER	"0.7"
@@ -70,26 +73,19 @@ struct ntb_netdev {
 
 static LIST_HEAD(dev_list);
 
-static void ntb_netdev_event_handler(void *data, int status)
+static void ntb_netdev_event_handler(void *data, int link_is_up)
 {
 	struct net_device *ndev = data;
 	struct ntb_netdev *dev = netdev_priv(ndev);
 
-	netdev_dbg(ndev, "Event %x, Link %x\n", status,
+	netdev_dbg(ndev, "Event %x, Link %x\n", link_is_up,
 		   ntb_transport_link_query(dev->qp));
 
-	switch (status) {
-	case NTB_LINK_DOWN:
+	if (link_is_up) {
+		if (ntb_transport_link_query(dev->qp))
+			netif_carrier_on(ndev);
+	} else {
 		netif_carrier_off(ndev);
-		break;
-	case NTB_LINK_UP:
-		if (!ntb_transport_link_query(dev->qp))
-			return;
-
-		netif_carrier_on(ndev);
-		break;
-	default:
-		netdev_warn(ndev, "Unsupported event type %d\n", status);
 	}
 }
 
@@ -160,8 +156,6 @@ static netdev_tx_t ntb_netdev_start_xmit(struct sk_buff *skb,
 	struct ntb_netdev *dev = netdev_priv(ndev);
 	int rc;
 
-	netdev_dbg(ndev, "%s: skb len %d\n", __func__, skb->len);
-
 	rc = ntb_transport_tx_enqueue(dev->qp, skb, skb->data, skb->len);
 	if (rc)
 		goto err;
@@ -322,20 +316,26 @@ static const struct ntb_queue_handlers ntb_netdev_handlers = {
 	.event_handler = ntb_netdev_event_handler,
 };
 
-static int ntb_netdev_probe(struct pci_dev *pdev)
+static int ntb_netdev_probe(struct device *client_dev)
 {
+	struct ntb_dev *ntb;
 	struct net_device *ndev;
+	struct pci_dev *pdev;
 	struct ntb_netdev *dev;
 	int rc;
 
-	ndev = alloc_etherdev(sizeof(struct ntb_netdev));
+	ntb = dev_ntb(client_dev->parent);
+	pdev = ntb->pdev;
+	if (!pdev)
+		return -ENODEV;
+
+	ndev = alloc_etherdev(sizeof(*dev));
 	if (!ndev)
 		return -ENOMEM;
 
 	dev = netdev_priv(ndev);
 	dev->ndev = ndev;
 	dev->pdev = pdev;
-	BUG_ON(!dev->pdev);
 	ndev->features = NETIF_F_HIGHDMA;
 
 	ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
@@ -349,7 +349,8 @@ static int ntb_netdev_probe(struct pci_dev *pdev)
 	ndev->netdev_ops = &ntb_netdev_ops;
 	ndev->ethtool_ops = &ntb_ethtool_ops;
 
-	dev->qp = ntb_transport_create_queue(ndev, pdev, &ntb_netdev_handlers);
+	dev->qp = ntb_transport_create_queue(ndev, client_dev,
+					     &ntb_netdev_handlers);
 	if (!dev->qp) {
 		rc = -EIO;
 		goto err;
@@ -372,12 +373,17 @@ err:
 	return rc;
 }
 
-static void ntb_netdev_remove(struct pci_dev *pdev)
+static void ntb_netdev_remove(struct device *client_dev)
 {
+	struct ntb_dev *ntb;
 	struct net_device *ndev;
+	struct pci_dev *pdev;
 	struct ntb_netdev *dev;
 	bool found = false;
 
+	ntb = dev_ntb(client_dev->parent);
+	pdev = ntb->pdev;
+
 	list_for_each_entry(dev, &dev_list, list) {
 		if (dev->pdev == pdev) {
 			found = true;
@@ -396,7 +402,7 @@ static void ntb_netdev_remove(struct pci_dev *pdev)
 	free_netdev(ndev);
 }
 
-static struct ntb_client ntb_netdev_client = {
+static struct ntb_transport_client ntb_netdev_client = {
 	.driver.name = KBUILD_MODNAME,
 	.driver.owner = THIS_MODULE,
 	.probe = ntb_netdev_probe,
@@ -407,7 +413,7 @@ static int __init ntb_netdev_init_module(void)
 {
 	int rc;
 
-	rc = ntb_register_client_dev(KBUILD_MODNAME);
+	rc = ntb_transport_register_client_dev(KBUILD_MODNAME);
 	if (rc)
 		return rc;
 	return ntb_transport_register_client(&ntb_netdev_client);
@@ -417,6 +423,6 @@ module_init(ntb_netdev_init_module);
 static void __exit ntb_netdev_exit_module(void)
 {
 	ntb_transport_unregister_client(&ntb_netdev_client);
-	ntb_unregister_client_dev(KBUILD_MODNAME);
+	ntb_transport_unregister_client_dev(KBUILD_MODNAME);
 }
 module_exit(ntb_netdev_exit_module);
diff --git a/drivers/ntb/Kconfig b/drivers/ntb/Kconfig
index f69df793dbe2..53b042429673 100644
--- a/drivers/ntb/Kconfig
+++ b/drivers/ntb/Kconfig
@@ -1,13 +1,26 @@
-config NTB
-       tristate "Intel Non-Transparent Bridge support"
-       depends on PCI
-       depends on X86
-       help
-        The PCI-E Non-transparent bridge hardware is a point-to-point PCI-E bus
-        connecting 2 systems.  When configured, writes to the device's PCI
-        mapped memory will be mirrored to a buffer on the remote system.  The
-        ntb Linux driver uses this point-to-point communication as a method to
-        transfer data from one system to the other.
-
-        If unsure, say N.
+menuconfig NTB
+	tristate "Non-Transparent Bridge support"
+	depends on PCI
+	help
+	 The PCI-E Non-transparent bridge hardware is a point-to-point PCI-E bus
+	 connecting 2 systems.  When configured, writes to the device's PCI
+	 mapped memory will be mirrored to a buffer on the remote system.  The
+	 ntb Linux driver uses this point-to-point communication as a method to
+	 transfer data from one system to the other.
 
+	 If unsure, say N.
+
+if NTB
+
+source "drivers/ntb/hw/Kconfig"
+
+config NTB_TRANSPORT
+	tristate "NTB Transport Client"
+	help
+	 This is a transport driver that enables connected systems to exchange
+	 messages over the ntb hardware.  The transport exposes a queue pair api
+	 to client drivers.
+
+	 If unsure, say N.
+
+endif # NTB
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 712e953a8fda..b9fa663ecfec 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,4 +1,2 @@
-obj-$(CONFIG_NTB) += ntb.o
-obj-$(CONFIG_NTB) += ntb_hw_intel.o
-
-ntb_hw_intel-objs := hw/intel/ntb_hw_intel.o ntb_transport.o
+obj-$(CONFIG_NTB) += ntb.o hw/
+obj-$(CONFIG_NTB_TRANSPORT) += ntb_transport.o
diff --git a/drivers/ntb/hw/Kconfig b/drivers/ntb/hw/Kconfig
new file mode 100644
index 000000000000..4d5535c4cddf
--- /dev/null
+++ b/drivers/ntb/hw/Kconfig
@@ -0,0 +1 @@
+source "drivers/ntb/hw/intel/Kconfig"
diff --git a/drivers/ntb/hw/Makefile b/drivers/ntb/hw/Makefile
new file mode 100644
index 000000000000..175d7c92a569
--- /dev/null
+++ b/drivers/ntb/hw/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_NTB_INTEL)	+= intel/
diff --git a/drivers/ntb/hw/intel/Kconfig b/drivers/ntb/hw/intel/Kconfig
new file mode 100644
index 000000000000..91f995e33ac6
--- /dev/null
+++ b/drivers/ntb/hw/intel/Kconfig
@@ -0,0 +1,7 @@
+config NTB_INTEL
+	tristate "Intel Non-Transparent Bridge support"
+	depends on X86_64
+	help
+	 This driver supports Intel NTB on capable Xeon and Atom hardware.
+
+	 If unsure, say N.
diff --git a/drivers/ntb/hw/intel/Makefile b/drivers/ntb/hw/intel/Makefile
new file mode 100644
index 000000000000..1b434568d2ad
--- /dev/null
+++ b/drivers/ntb/hw/intel/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_NTB_INTEL) += ntb_hw_intel.o
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c b/drivers/ntb/hw/intel/ntb_hw_intel.c
index 044534a995f1..686091756ba4 100644
--- a/drivers/ntb/hw/intel/ntb_hw_intel.c
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.c
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -45,6 +47,7 @@
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
  */
+
 #include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/init.h>
@@ -53,99 +56,97 @@
 #include <linux/pci.h>
 #include <linux/random.h>
 #include <linux/slab.h>
+#include <linux/ntb.h>
+
 #include "ntb_hw_intel.h"
 
-#define NTB_NAME	"Intel(R) PCI-E Non-Transparent Bridge Driver"
-#define NTB_VER		"1.0"
+#define NTB_NAME	"ntb_hw_intel"
+#define NTB_DESC	"Intel(R) PCI-E Non-Transparent Bridge Driver"
+#define NTB_VER		"2.0"
 
-MODULE_DESCRIPTION(NTB_NAME);
+MODULE_DESCRIPTION(NTB_DESC);
 MODULE_VERSION(NTB_VER);
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Intel Corporation");
 
-enum {
-	NTB_CONN_TRANSPARENT = 0,
-	NTB_CONN_B2B,
-	NTB_CONN_RP,
-};
-
-enum {
-	NTB_DEV_USD = 0,
-	NTB_DEV_DSD,
-};
-
-enum {
-	SNB_HW = 0,
-	BWD_HW,
-};
-
+#define bar0_off(base, bar) ((base) + ((bar) << 2))
+#define bar2_off(base, bar) bar0_off(base, (bar) - 2)
+
+static int b2b_mw_idx = -1;
+module_param(b2b_mw_idx, int, 0644);
+MODULE_PARM_DESC(b2b_mw_idx, "Use this mw idx to access the peer ntb.  A "
+		 "value of zero or positive starts from first mw idx, and a "
+		 "negative value starts from last mw idx.  Both sides MUST "
+		 "set the same value here!");
+
+static unsigned int b2b_mw_share;
+module_param(b2b_mw_share, uint, 0644);
+MODULE_PARM_DESC(b2b_mw_share, "If the b2b mw is large enough, configure the "
+		 "ntb so that the peer ntb only occupies the first half of "
+		 "the mw, so the second half can still be used as a mw.  Both "
+		 "sides MUST set the same value here!");
+
+static const struct intel_ntb_reg bwd_reg;
+static const struct intel_ntb_alt_reg bwd_pri_reg;
+static const struct intel_ntb_alt_reg bwd_sec_reg;
+static const struct intel_ntb_alt_reg bwd_b2b_reg;
+static const struct intel_ntb_xlat_reg bwd_pri_xlat;
+static const struct intel_ntb_xlat_reg bwd_sec_xlat;
+static const struct intel_ntb_reg snb_reg;
+static const struct intel_ntb_alt_reg snb_pri_reg;
+static const struct intel_ntb_alt_reg snb_sec_reg;
+static const struct intel_ntb_alt_reg snb_b2b_reg;
+static const struct intel_ntb_xlat_reg snb_pri_xlat;
+static const struct intel_ntb_xlat_reg snb_sec_xlat;
+static const struct intel_b2b_addr snb_b2b_usd_addr;
+static const struct intel_b2b_addr snb_b2b_dsd_addr;
+
+static const struct ntb_dev_ops intel_ntb_ops;
+
+static const struct file_operations intel_ntb_debugfs_info;
 static struct dentry *debugfs_dir;
 
-#define BWD_LINK_RECOVERY_TIME	500
-
-/* Translate memory window 0,1,2 to BAR 2,4,5 */
-#define MW_TO_BAR(mw)	(mw == 0 ? 2 : (mw == 1 ? 4 : 5))
-
-static const struct pci_device_id ntb_pci_tbl[] = {
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
-	{0}
-};
-MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
-
-static int is_ntb_xeon(struct ntb_device *ndev)
+#ifndef ioread64
+#ifdef readq
+#define ioread64 readq
+#else
+#define ioread64 _ioread64
+static inline u64 _ioread64(void __iomem *mmio)
 {
-	switch (ndev->pdev->device) {
-	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
-		return 1;
-	default:
-		return 0;
-	}
+	u64 low, high;
 
-	return 0;
+	low = ioread32(mmio);
+	high = ioread32(mmio + sizeof(u32));
+	return low | (high << 32);
+}
+#endif
+#endif
+
+#ifndef iowrite64
+#ifdef writeq
+#define iowrite64 writeq
+#else
+#define iowrite64 _iowrite64
+static inline void _iowrite64(u64 val, void __iomem *mmio)
+{
+	iowrite32(val, mmio);
+	iowrite32(val >> 32, mmio + sizeof(u32));
 }
+#endif
+#endif
 
-static int is_ntb_atom(struct ntb_device *ndev)
+static inline int pdev_is_bwd(struct pci_dev *pdev)
 {
-	switch (ndev->pdev->device) {
+	switch (pdev->device) {
 	case PCI_DEVICE_ID_INTEL_NTB_B2B_BWD:
 		return 1;
-	default:
-		return 0;
 	}
-
 	return 0;
 }
 
-static void ntb_set_errata_flags(struct ntb_device *ndev)
+static inline int pdev_is_snb(struct pci_dev *pdev)
 {
-	switch (ndev->pdev->device) {
-	/*
-	 * this workaround applies to all platform up to IvyBridge
-	 * Haswell has splitbar support and use a different workaround
-	 */
+	switch (pdev->device) {
 	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
 	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
 	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
@@ -158,1737 +159,1957 @@ static void ntb_set_errata_flags(struct ntb_device *ndev)
 	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
 	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
 	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
-		ndev->wa_flags |= WA_SNB_ERR;
-		break;
+		return 1;
 	}
+	return 0;
 }
 
-/**
- * ntb_register_event_callback() - register event callback
- * @ndev: pointer to ntb_device instance
- * @func: callback function to register
- *
- * This function registers a callback for any HW driver events such as link
- * up/down, power management notices and etc.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_register_event_callback(struct ntb_device *ndev,
-				void (*func)(void *handle,
-					     enum ntb_hw_event event))
+static inline void ndev_reset_unsafe_flags(struct intel_ntb_dev *ndev)
 {
-	if (ndev->event_cb)
-		return -EINVAL;
-
-	ndev->event_cb = func;
-
-	return 0;
+	ndev->unsafe_flags = 0;
+	ndev->unsafe_flags_ignore = 0;
+
+	/* Only B2B has a workaround to avoid SDOORBELL */
+	if (ndev->hwerr_flags & NTB_HWERR_SDOORBELL_LOCKUP)
+		if (!ntb_topo_is_b2b(ndev->ntb.topo))
+			ndev->unsafe_flags |= NTB_UNSAFE_DB;
+
+	/* No low level workaround to avoid SB01BASE */
+	if (ndev->hwerr_flags & NTB_HWERR_SB01BASE_LOCKUP) {
+		ndev->unsafe_flags |= NTB_UNSAFE_DB;
+		ndev->unsafe_flags |= NTB_UNSAFE_SPAD;
+	}
 }
 
-/**
- * ntb_unregister_event_callback() - unregisters the event callback
- * @ndev: pointer to ntb_device instance
- *
- * This function unregisters the existing callback from transport
- */
-void ntb_unregister_event_callback(struct ntb_device *ndev)
+static inline int ndev_is_unsafe(struct intel_ntb_dev *ndev,
+				 unsigned long flag)
 {
-	ndev->event_cb = NULL;
+	return !!(flag & ndev->unsafe_flags & ~ndev->unsafe_flags_ignore);
 }
 
-static void ntb_irq_work(unsigned long data)
+static inline int ndev_ignore_unsafe(struct intel_ntb_dev *ndev,
+				     unsigned long flag)
 {
-	struct ntb_db_cb *db_cb = (struct ntb_db_cb *)data;
-	int rc;
+	flag &= ndev->unsafe_flags;
+	ndev->unsafe_flags_ignore |= flag;
 
-	rc = db_cb->callback(db_cb->data, db_cb->db_num);
-	if (rc)
-		tasklet_schedule(&db_cb->irq_work);
-	else {
-		struct ntb_device *ndev = db_cb->ndev;
-		unsigned long mask;
-
-		mask = readw(ndev->reg_ofs.ldb_mask);
-		clear_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-		writew(mask, ndev->reg_ofs.ldb_mask);
-	}
+	return !!flag;
 }
 
-/**
- * ntb_register_db_callback() - register a callback for doorbell interrupt
- * @ndev: pointer to ntb_device instance
- * @idx: doorbell index to register callback, zero based
- * @data: pointer to be returned to caller with every callback
- * @func: callback function to register
- *
- * This function registers a callback function for the doorbell interrupt
- * on the primary side. The function will unmask the doorbell as well to
- * allow interrupt.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
-			     void *data, int (*func)(void *data, int db_num))
+static int ndev_mw_to_bar(struct intel_ntb_dev *ndev, int idx)
 {
-	unsigned long mask;
-
-	if (idx >= ndev->max_cbs || ndev->db_cb[idx].callback) {
-		dev_warn(&ndev->pdev->dev, "Invalid Index.\n");
+	if (idx < 0 || idx > ndev->mw_count)
 		return -EINVAL;
-	}
+	return ndev->reg->mw_bar[idx];
+}
 
-	ndev->db_cb[idx].callback = func;
-	ndev->db_cb[idx].data = data;
-	ndev->db_cb[idx].ndev = ndev;
+static inline int ndev_db_addr(struct intel_ntb_dev *ndev,
+			       phys_addr_t *db_addr, resource_size_t *db_size,
+			       phys_addr_t reg_addr, unsigned long reg)
+{
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
 
-	tasklet_init(&ndev->db_cb[idx].irq_work, ntb_irq_work,
-		     (unsigned long) &ndev->db_cb[idx]);
+	if (db_addr) {
+		*db_addr = reg_addr + reg;
+		dev_dbg(ndev_dev(ndev), "Peer db addr %llx\n", *db_addr);
+	}
 
-	/* unmask interrupt */
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	clear_bit(idx * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
+	if (db_size) {
+		*db_size = ndev->reg->db_size;
+		dev_dbg(ndev_dev(ndev), "Peer db size %llx\n", *db_size);
+	}
 
 	return 0;
 }
 
-/**
- * ntb_unregister_db_callback() - unregister a callback for doorbell interrupt
- * @ndev: pointer to ntb_device instance
- * @idx: doorbell index to register callback, zero based
- *
- * This function unregisters a callback function for the doorbell interrupt
- * on the primary side. The function will also mask the said doorbell.
- */
-void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx)
+static inline u64 ndev_db_read(struct intel_ntb_dev *ndev,
+			       void __iomem *mmio)
 {
-	unsigned long mask;
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
 
-	if (idx >= ndev->max_cbs || !ndev->db_cb[idx].callback)
-		return;
+	return ndev->reg->db_ioread(mmio);
+}
+
+static inline int ndev_db_write(struct intel_ntb_dev *ndev, u64 db_bits,
+				void __iomem *mmio)
+{
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
 
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(idx * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
+	if (db_bits & ~ndev->db_valid_mask)
+		return -EINVAL;
 
-	tasklet_disable(&ndev->db_cb[idx].irq_work);
+	ndev->reg->db_iowrite(db_bits, mmio);
 
-	ndev->db_cb[idx].callback = NULL;
+	return 0;
 }
 
-/**
- * ntb_find_transport() - find the transport pointer
- * @transport: pointer to pci device
- *
- * Given the pci device pointer, return the transport pointer passed in when
- * the transport attached when it was inited.
- *
- * RETURNS: pointer to transport.
- */
-void *ntb_find_transport(struct pci_dev *pdev)
+static inline int ndev_db_set_mask(struct intel_ntb_dev *ndev, u64 db_bits,
+				   void __iomem *mmio)
 {
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-	return ndev->ntb_transport;
-}
+	unsigned long irqflags;
 
-/**
- * ntb_register_transport() - Register NTB transport with NTB HW driver
- * @transport: transport identifier
- *
- * This function allows a transport to reserve the hardware driver for
- * NTB usage.
- *
- * RETURNS: pointer to ntb_device, NULL on error.
- */
-struct ntb_device *ntb_register_transport(struct pci_dev *pdev, void *transport)
-{
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
+
+	if (db_bits & ~ndev->db_valid_mask)
+		return -EINVAL;
 
-	if (ndev->ntb_transport)
-		return NULL;
+	spin_lock_irqsave(&ndev->db_mask_lock, irqflags);
+	{
+		ndev->db_mask |= db_bits;
+		ndev->reg->db_iowrite(ndev->db_mask, mmio);
+	}
+	spin_unlock_irqrestore(&ndev->db_mask_lock, irqflags);
 
-	ndev->ntb_transport = transport;
-	return ndev;
+	return 0;
 }
 
-/**
- * ntb_unregister_transport() - Unregister the transport with the NTB HW driver
- * @ndev - ntb_device of the transport to be freed
- *
- * This function unregisters the transport from the HW driver and performs any
- * necessary cleanups.
- */
-void ntb_unregister_transport(struct ntb_device *ndev)
+static inline int ndev_db_clear_mask(struct intel_ntb_dev *ndev, u64 db_bits,
+				     void __iomem *mmio)
 {
-	int i;
+	unsigned long irqflags;
 
-	if (!ndev->ntb_transport)
-		return;
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
+
+	if (db_bits & ~ndev->db_valid_mask)
+		return -EINVAL;
 
-	for (i = 0; i < ndev->max_cbs; i++)
-		ntb_unregister_db_callback(ndev, i);
+	spin_lock_irqsave(&ndev->db_mask_lock, irqflags);
+	{
+		ndev->db_mask &= ~db_bits;
+		ndev->reg->db_iowrite(ndev->db_mask, mmio);
+	}
+	spin_unlock_irqrestore(&ndev->db_mask_lock, irqflags);
 
-	ntb_unregister_event_callback(ndev);
-	ndev->ntb_transport = NULL;
+	return 0;
 }
 
-/**
- * ntb_write_local_spad() - write to the secondary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. This writes over the data mirrored to the local scratchpad register
- * by the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+static inline int ndev_vec_mask(struct intel_ntb_dev *ndev, int db_vector)
 {
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
+	u64 shift, mask;
 
-	dev_dbg(&ndev->pdev->dev, "Writing %x to local scratch pad index %d\n",
-		val, idx);
-	writel(val, ndev->reg_ofs.spad_read + idx * 4);
+	shift = ndev->db_vec_shift;
+	mask = BIT_ULL(shift) - 1;
 
-	return 0;
+	return mask << (shift * db_vector);
 }
 
-/**
- * ntb_read_local_spad() - read from the primary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.  This allows the local system to read data
- * written and mirrored to the scratchpad register by the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+static inline int ndev_spad_addr(struct intel_ntb_dev *ndev, int idx,
+				 phys_addr_t *spad_addr, phys_addr_t reg_addr,
+				 unsigned long reg)
 {
-	if (idx >= ndev->limits.max_spads)
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_SPAD));
+
+	if (idx < 0 || idx >= ndev->spad_count)
 		return -EINVAL;
 
-	*val = readl(ndev->reg_ofs.spad_write + idx * 4);
-	dev_dbg(&ndev->pdev->dev,
-		"Reading %x from local scratch pad index %d\n", *val, idx);
+	if (spad_addr) {
+		*spad_addr = reg_addr + reg + (idx << 2);
+		dev_dbg(ndev_dev(ndev), "Peer spad addr %llx\n", *spad_addr);
+	}
 
 	return 0;
 }
 
-/**
- * ntb_write_remote_spad() - write to the secondary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. The register resides on the secondary (external) side.  This allows
- * the local system to write data to be mirrored to the remote systems
- * scratchpad register.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+static inline u32 ndev_spad_read(struct intel_ntb_dev *ndev, int idx,
+				 void __iomem *mmio)
 {
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_SPAD));
 
-	dev_dbg(&ndev->pdev->dev, "Writing %x to remote scratch pad index %d\n",
-		val, idx);
-	writel(val, ndev->reg_ofs.spad_write + idx * 4);
+	if (idx < 0 || idx >= ndev->spad_count)
+		return 0;
 
-	return 0;
+	return ioread32(mmio + (idx << 2));
 }
 
-/**
- * ntb_read_remote_spad() - read from the primary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.  This alloows the local system to read the data
- * it wrote to be mirrored on the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+static inline int ndev_spad_write(struct intel_ntb_dev *ndev, int idx, u32 val,
+				  void __iomem *mmio)
 {
-	if (idx >= ndev->limits.max_spads)
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_SPAD));
+
+	if (idx < 0 || idx >= ndev->spad_count)
 		return -EINVAL;
 
-	*val = readl(ndev->reg_ofs.spad_read + idx * 4);
-	dev_dbg(&ndev->pdev->dev,
-		"Reading %x from remote scratch pad index %d\n", *val, idx);
+	iowrite32(val, mmio + (idx << 2));
 
 	return 0;
 }
 
-/**
- * ntb_get_mw_base() - get addr for the NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the base address of the memory window specified.
- *
- * RETURNS: address, or NULL on error.
- */
-resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
+static irqreturn_t ndev_interrupt(struct intel_ntb_dev *ndev, int vec)
 {
-	if (mw >= ntb_max_mw(ndev))
-		return 0;
+	u64 vec_mask;
+
+	vec_mask = ndev_vec_mask(ndev, vec);
+
+	dev_dbg(ndev_dev(ndev), "vec %d vec_mask %llx\n", vec, vec_mask);
 
-	return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
+	ndev->last_ts = jiffies;
+
+	if (vec_mask & ndev->db_link_mask) {
+		if (ndev->reg->poll_link(ndev))
+			ntb_link_event(&ndev->ntb);
+	}
+
+	if (vec_mask & ndev->db_valid_mask)
+		ntb_db_event(&ndev->ntb, vec);
+
+	return IRQ_HANDLED;
 }
 
-/**
- * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the base virtual address of the memory window
- * specified.
- *
- * RETURNS: pointer to virtual address, or NULL on error.
- */
-void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw)
+static irqreturn_t ndev_vec_isr(int irq, void *dev)
 {
-	if (mw >= ntb_max_mw(ndev))
-		return NULL;
+	struct intel_ntb_vec *nvec = dev;
 
-	return ndev->mw[mw].vbase;
+	return ndev_interrupt(nvec->ndev, nvec->num);
 }
 
-/**
- * ntb_get_mw_size() - return size of NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the physical size of the memory window specified
- *
- * RETURNS: the size of the memory window or zero on error
- */
-u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw)
+static irqreturn_t ndev_irq_isr(int irq, void *dev)
 {
-	if (mw >= ntb_max_mw(ndev))
-		return 0;
+	struct intel_ntb_dev *ndev = dev;
 
-	return ndev->mw[mw].bar_sz;
+	return ndev_interrupt(ndev, irq - ndev_pdev(ndev)->irq);
 }
 
-/**
- * ntb_set_mw_addr - set the memory window address
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- * @addr: base address for data
- *
- * This function sets the base physical address of the memory window.  This
- * memory address is where data from the remote system will be transfered into
- * or out of depending on how the transport is configured.
- */
-void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr)
+static int ndev_init_isr(struct intel_ntb_dev *ndev,
+			 int msix_min, int msix_max,
+			 int msix_shift, int total_shift)
 {
-	if (mw >= ntb_max_mw(ndev))
-		return;
+	struct pci_dev *pdev;
+	int rc, i, msix_count;
 
-	dev_dbg(&ndev->pdev->dev, "Writing addr %Lx to BAR %d\n", addr,
-		MW_TO_BAR(mw));
+	pdev = ndev_pdev(ndev);
 
-	ndev->mw[mw].phys_addr = addr;
+	/* Mask all doorbell interrupts */
+	ndev->db_mask = ndev->db_valid_mask;
+	ndev->reg->db_iowrite(ndev->db_mask,
+			      ndev->self_mmio +
+			      ndev->self_reg->db_mask);
 
-	switch (MW_TO_BAR(mw)) {
-	case NTB_BAR_23:
-		writeq(addr, ndev->reg_ofs.bar2_xlat);
-		break;
-	case NTB_BAR_4:
-		if (ndev->split_bar)
-			writel(addr, ndev->reg_ofs.bar4_xlat);
-		else
-			writeq(addr, ndev->reg_ofs.bar4_xlat);
-		break;
-	case NTB_BAR_5:
-		writel(addr, ndev->reg_ofs.bar5_xlat);
-		break;
+	/* Try to set up msix irq */
+
+	ndev->vec = kcalloc(msix_max, sizeof(*ndev->vec), GFP_KERNEL);
+	if (!ndev->vec)
+		goto err_msix_vec_alloc;
+
+	ndev->msix = kcalloc(msix_max, sizeof(*ndev->msix), GFP_KERNEL);
+	if (!ndev->msix)
+		goto err_msix_alloc;
+
+	for (i = 0; i < msix_max; ++i)
+		ndev->msix[i].entry = i;
+
+	msix_count = pci_enable_msix_range(pdev, ndev->msix,
+					   msix_min, msix_max);
+	if (msix_count < 0)
+		goto err_msix_enable;
+
+	for (i = 0; i < msix_count; ++i) {
+		ndev->vec[i].ndev = ndev;
+		ndev->vec[i].num = i;
+		rc = request_irq(ndev->msix[i].vector, ndev_vec_isr, 0,
+				 "ndev_vec_isr", &ndev->vec[i]);
+		if (rc)
+			goto err_msix_request;
 	}
-}
 
-/**
- * ntb_ring_doorbell() - Set the doorbell on the secondary/external side
- * @ndev: pointer to ntb_device instance
- * @db: doorbell to ring
- *
- * This function allows triggering of a doorbell on the secondary/external
- * side that will initiate an interrupt on the remote host
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int db)
-{
-	dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
+	dev_dbg(ndev_dev(ndev), "Using msix interrupts\n");
+	ndev->db_vec_count = msix_count;
+	ndev->db_vec_shift = msix_shift;
+	return 0;
 
-	if (ndev->hw_type == BWD_HW)
-		writeq((u64) 1 << db, ndev->reg_ofs.rdb);
-	else
-		writew(((1 << ndev->bits_per_vector) - 1) <<
-		       (db * ndev->bits_per_vector), ndev->reg_ofs.rdb);
-}
+err_msix_request:
+	while (i-- > 0)
+		free_irq(ndev->msix[i].vector, ndev);
+	pci_disable_msix(pdev);
+err_msix_enable:
+	kfree(ndev->msix);
+err_msix_alloc:
+	kfree(ndev->vec);
+err_msix_vec_alloc:
+	ndev->msix = NULL;
+	ndev->vec = NULL;
 
-static void bwd_recover_link(struct ntb_device *ndev)
-{
-	u32 status;
+	/* Try to set up msi irq */
 
-	/* Driver resets the NTB ModPhy lanes - magic! */
-	writeb(0xe0, ndev->reg_base + BWD_MODPHY_PCSREG6);
-	writeb(0x40, ndev->reg_base + BWD_MODPHY_PCSREG4);
-	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG4);
-	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG6);
+	rc = pci_enable_msi(pdev);
+	if (rc)
+		goto err_msi_enable;
 
-	/* Driver waits 100ms to allow the NTB ModPhy to settle */
-	msleep(100);
+	rc = request_irq(pdev->irq, ndev_irq_isr, 0,
+			 "ndev_irq_isr", ndev);
+	if (rc)
+		goto err_msi_request;
 
-	/* Clear AER Errors, write to clear */
-	status = readl(ndev->reg_base + BWD_ERRCORSTS_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "ERRCORSTS = %x\n", status);
-	status &= PCI_ERR_COR_REP_ROLL;
-	writel(status, ndev->reg_base + BWD_ERRCORSTS_OFFSET);
+	dev_dbg(ndev_dev(ndev), "Using msi interrupts\n");
+	ndev->db_vec_count = 1;
+	ndev->db_vec_shift = total_shift;
+	return 0;
 
-	/* Clear unexpected electrical idle event in LTSSM, write to clear */
-	status = readl(ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "LTSSMERRSTS0 = %x\n", status);
-	status |= BWD_LTSSMERRSTS0_UNEXPECTEDEI;
-	writel(status, ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
+err_msi_request:
+	pci_disable_msi(pdev);
+err_msi_enable:
 
-	/* Clear DeSkew Buffer error, write to clear */
-	status = readl(ndev->reg_base + BWD_DESKEWSTS_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "DESKEWSTS = %x\n", status);
-	status |= BWD_DESKEWSTS_DBERR;
-	writel(status, ndev->reg_base + BWD_DESKEWSTS_OFFSET);
+	/* Try to set up intx irq */
 
-	status = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "IBSTERRRCRVSTS0 = %x\n", status);
-	status &= BWD_IBIST_ERR_OFLOW;
-	writel(status, ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
+	pci_intx(pdev, 1);
 
-	/* Releases the NTB state machine to allow the link to retrain */
-	status = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "LTSSMSTATEJMP = %x\n", status);
-	status &= ~BWD_LTSSMSTATEJMP_FORCEDETECT;
-	writel(status, ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
+	rc = request_irq(pdev->irq, ndev_irq_isr, IRQF_SHARED,
+			 "ndev_irq_isr", ndev);
+	if (rc)
+		goto err_intx_request;
+
+	dev_dbg(ndev_dev(ndev), "Using intx interrupts\n");
+	ndev->db_vec_count = 1;
+	ndev->db_vec_shift = total_shift;
+	return 0;
+
+err_intx_request:
+	return rc;
 }
 
-static void ntb_link_event(struct ntb_device *ndev, int link_state)
+static void ndev_deinit_isr(struct intel_ntb_dev *ndev)
 {
-	unsigned int event;
+	struct pci_dev *pdev;
+	int i;
 
-	if (ndev->link_status == link_state)
-		return;
+	pdev = ndev_pdev(ndev);
 
-	if (link_state == NTB_LINK_UP) {
-		u16 status;
-
-		dev_info(&ndev->pdev->dev, "Link Up\n");
-		ndev->link_status = NTB_LINK_UP;
-		event = NTB_EVENT_HW_LINK_UP;
-
-		if (is_ntb_atom(ndev) ||
-		    ndev->conn_type == NTB_CONN_TRANSPARENT)
-			status = readw(ndev->reg_ofs.lnk_stat);
-		else {
-			int rc = pci_read_config_word(ndev->pdev,
-						      SNB_LINK_STATUS_OFFSET,
-						      &status);
-			if (rc)
-				return;
-		}
+	/* Mask all doorbell interrupts */
+	ndev->db_mask = ndev->db_valid_mask;
+	ndev->reg->db_iowrite(ndev->db_mask,
+			      ndev->self_mmio +
+			      ndev->self_reg->db_mask);
 
-		ndev->link_width = (status & NTB_LINK_WIDTH_MASK) >> 4;
-		ndev->link_speed = (status & NTB_LINK_SPEED_MASK);
-		dev_info(&ndev->pdev->dev, "Link Width %d, Link Speed %d\n",
-			 ndev->link_width, ndev->link_speed);
+	if (ndev->msix) {
+		i = ndev->db_vec_count;
+		while (i--)
+			free_irq(ndev->msix[i].vector, &ndev->vec[i]);
+		pci_disable_msix(pdev);
+		kfree(ndev->msix);
+		kfree(ndev->vec);
 	} else {
-		dev_info(&ndev->pdev->dev, "Link Down\n");
-		ndev->link_status = NTB_LINK_DOWN;
-		event = NTB_EVENT_HW_LINK_DOWN;
-		/* Don't modify link width/speed, we need it in link recovery */
+		free_irq(pdev->irq, ndev);
+		if (pci_dev_msi_enabled(pdev))
+			pci_disable_msi(pdev);
 	}
-
-	/* notify the upper layer if we have an event change */
-	if (ndev->event_cb)
-		ndev->event_cb(ndev->ntb_transport, event);
 }
 
-static int ntb_link_status(struct ntb_device *ndev)
+static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf,
+				 size_t count, loff_t *offp)
 {
-	int link_state;
+	struct intel_ntb_dev *ndev;
+	void __iomem *mmio;
+	char *buf;
+	size_t buf_size;
+	ssize_t ret, off;
+	union { u64 v64; u32 v32; u16 v16; } u;
 
-	if (is_ntb_atom(ndev)) {
-		u32 ntb_cntl;
+	ndev = filp->private_data;
+	mmio = ndev->self_mmio;
 
-		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-		if (ntb_cntl & BWD_CNTL_LINK_DOWN)
-			link_state = NTB_LINK_DOWN;
-		else
-			link_state = NTB_LINK_UP;
-	} else {
-		u16 status;
-		int rc;
+	buf_size = min(count, 0x800ul);
 
-		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
-					  &status);
-		if (rc)
-			return rc;
+	buf = kmalloc(buf_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
 
-		if (status & NTB_LINK_STATUS_ACTIVE)
-			link_state = NTB_LINK_UP;
-		else
-			link_state = NTB_LINK_DOWN;
-	}
+	off = 0;
 
-	ntb_link_event(ndev, link_state);
+	off += scnprintf(buf + off, buf_size - off,
+			 "NTB Device Information:\n");
 
-	return 0;
-}
+	off += scnprintf(buf + off, buf_size - off,
+			 "Connection Topology -\t%s\n",
+			 ntb_topo_string(ndev->ntb.topo));
 
-static void bwd_link_recovery(struct work_struct *work)
-{
-	struct ntb_device *ndev = container_of(work, struct ntb_device,
-					       lr_timer.work);
-	u32 status32;
+	off += scnprintf(buf + off, buf_size - off,
+			 "B2B Offset -\t\t%#lx\n", ndev->b2b_off);
+	off += scnprintf(buf + off, buf_size - off,
+			 "B2B MW Idx -\t\t%d\n", ndev->b2b_idx);
+	off += scnprintf(buf + off, buf_size - off,
+			 "BAR4 Split -\t\t%s\n",
+			 ndev->bar4_split ? "yes" : "no");
 
-	bwd_recover_link(ndev);
-	/* There is a potential race between the 2 NTB devices recovering at the
-	 * same time.  If the times are the same, the link will not recover and
-	 * the driver will be stuck in this loop forever.  Add a random interval
-	 * to the recovery time to prevent this race.
-	 */
-	msleep(BWD_LINK_RECOVERY_TIME + prandom_u32() % BWD_LINK_RECOVERY_TIME);
-
-	status32 = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-	if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT)
-		goto retry;
-
-	status32 = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-	if (status32 & BWD_IBIST_ERR_OFLOW)
-		goto retry;
-
-	status32 = readl(ndev->reg_ofs.lnk_cntl);
-	if (!(status32 & BWD_CNTL_LINK_DOWN)) {
-		unsigned char speed, width;
-		u16 status16;
-
-		status16 = readw(ndev->reg_ofs.lnk_stat);
-		width = (status16 & NTB_LINK_WIDTH_MASK) >> 4;
-		speed = (status16 & NTB_LINK_SPEED_MASK);
-		if (ndev->link_width != width || ndev->link_speed != speed)
-			goto retry;
+	off += scnprintf(buf + off, buf_size - off,
+			 "NTB CTL -\t\t%#06x\n", ndev->ntb_ctl);
+	off += scnprintf(buf + off, buf_size - off,
+			 "LNK STA -\t\t%#06x\n", ndev->lnk_sta);
+
+	if (!ndev->reg->link_is_up(ndev)) {
+		off += scnprintf(buf + off, buf_size - off,
+				 "Link Status -\t\tDown\n");
+	} else {
+		off += scnprintf(buf + off, buf_size - off,
+				 "Link Status -\t\tUp\n");
+		off += scnprintf(buf + off, buf_size - off,
+				 "Link Speed -\t\tPCI-E Gen %u\n",
+				 NTB_LNK_STA_SPEED(ndev->lnk_sta));
+		off += scnprintf(buf + off, buf_size - off,
+				 "Link Width -\t\tx%u\n",
+				 NTB_LNK_STA_WIDTH(ndev->lnk_sta));
 	}
 
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
-	return;
+	off += scnprintf(buf + off, buf_size - off,
+			 "Memory Window Count -\t%u\n", ndev->mw_count);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Scratchpad Count -\t%u\n", ndev->spad_count);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Count -\t%u\n", ndev->db_count);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Vector Count -\t%u\n", ndev->db_vec_count);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Vector Shift -\t%u\n", ndev->db_vec_shift);
+
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Valid Mask -\t%#llx\n", ndev->db_valid_mask);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Link Mask -\t%#llx\n", ndev->db_link_mask);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Mask Cached -\t%#llx\n", ndev->db_mask);
+
+	u.v64 = ndev_db_read(ndev, mmio + ndev->self_reg->db_mask);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Mask -\t\t%#llx\n", u.v64);
+
+	u.v64 = ndev_db_read(ndev, mmio + ndev->self_reg->db_bell);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Bell -\t\t%#llx\n", u.v64);
+
+	off += scnprintf(buf + off, buf_size - off,
+			 "\nNTB Incoming XLAT:\n");
+
+	u.v64 = ioread64(mmio + bar2_off(ndev->xlat_reg->bar2_xlat, 2));
+	off += scnprintf(buf + off, buf_size - off,
+			 "XLAT23 -\t\t%#018llx\n", u.v64);
+
+	u.v64 = ioread64(mmio + bar2_off(ndev->xlat_reg->bar2_xlat, 4));
+	off += scnprintf(buf + off, buf_size - off,
+			 "XLAT45 -\t\t%#018llx\n", u.v64);
+
+	u.v64 = ioread64(mmio + bar2_off(ndev->xlat_reg->bar2_limit, 2));
+	off += scnprintf(buf + off, buf_size - off,
+			 "LMT23 -\t\t\t%#018llx\n", u.v64);
+
+	u.v64 = ioread64(mmio + bar2_off(ndev->xlat_reg->bar2_limit, 4));
+	off += scnprintf(buf + off, buf_size - off,
+			 "LMT45 -\t\t\t%#018llx\n", u.v64);
+
+	if (pdev_is_snb(ndev->ntb.pdev)) {
+		if (ntb_topo_is_b2b(ndev->ntb.topo)) {
+			off += scnprintf(buf + off, buf_size - off,
+					 "\nNTB Outgoing B2B XLAT:\n");
+
+			u.v64 = ioread64(mmio + SNB_PBAR23XLAT_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "B2B XLAT23 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_PBAR45XLAT_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "B2B XLAT45 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_PBAR23LMT_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "B2B LMT23 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_PBAR45LMT_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "B2B LMT45 -\t\t%#018llx\n", u.v64);
+
+			off += scnprintf(buf + off, buf_size - off,
+					 "\nNTB Secondary BAR:\n");
+
+			u.v64 = ioread64(mmio + SNB_SBAR0BASE_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "SBAR01 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_SBAR23BASE_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "SBAR23 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_SBAR45BASE_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "SBAR45 -\t\t%#018llx\n", u.v64);
+		}
+
+		off += scnprintf(buf + off, buf_size - off,
+				 "\nSNB NTB Statistics:\n");
+
+		u.v16 = ioread16(mmio + SNB_USMEMMISS_OFFSET);
+		off += scnprintf(buf + off, buf_size - off,
+				 "Upstream Memory Miss -\t%u\n", u.v16);
+
+		off += scnprintf(buf + off, buf_size - off,
+				 "\nSNB NTB Hardware Errors:\n");
+
+		if (!pci_read_config_word(ndev->ntb.pdev,
+					  SNB_DEVSTS_OFFSET, &u.v16))
+			off += scnprintf(buf + off, buf_size - off,
+					 "DEVSTS -\t\t%#06x\n", u.v16);
+
+		if (!pci_read_config_word(ndev->ntb.pdev,
+					  SNB_LINK_STATUS_OFFSET, &u.v16))
+			off += scnprintf(buf + off, buf_size - off,
+					 "LNKSTS -\t\t%#06x\n", u.v16);
 
-retry:
-	schedule_delayed_work(&ndev->lr_timer, NTB_HB_TIMEOUT);
+		if (!pci_read_config_dword(ndev->ntb.pdev,
+					   SNB_UNCERRSTS_OFFSET, &u.v32))
+			off += scnprintf(buf + off, buf_size - off,
+					 "UNCERRSTS -\t\t%#06x\n", u.v32);
+
+		if (!pci_read_config_dword(ndev->ntb.pdev,
+					   SNB_CORERRSTS_OFFSET, &u.v32))
+			off += scnprintf(buf + off, buf_size - off,
+					 "CORERRSTS -\t\t%#06x\n", u.v32);
+	}
+
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, off);
+	kfree(buf);
+	return ret;
 }
 
-/* BWD doesn't have link status interrupt, poll on that platform */
-static void bwd_link_poll(struct work_struct *work)
+static void ndev_init_debugfs(struct intel_ntb_dev *ndev)
 {
-	struct ntb_device *ndev = container_of(work, struct ntb_device,
-					       hb_timer.work);
-	unsigned long ts = jiffies;
-
-	/* If we haven't gotten an interrupt in a while, check the BWD link
-	 * status bit
-	 */
-	if (ts > ndev->last_ts + NTB_HB_TIMEOUT) {
-		int rc = ntb_link_status(ndev);
-		if (rc)
-			dev_err(&ndev->pdev->dev,
-				"Error determining link status\n");
-
-		/* Check to see if a link error is the cause of the link down */
-		if (ndev->link_status == NTB_LINK_DOWN) {
-			u32 status32 = readl(ndev->reg_base +
-					     BWD_LTSSMSTATEJMP_OFFSET);
-			if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT) {
-				schedule_delayed_work(&ndev->lr_timer, 0);
-				return;
-			}
-		}
+	if (!debugfs_dir) {
+		ndev->debugfs_dir = NULL;
+		ndev->debugfs_info = NULL;
+	} else {
+		ndev->debugfs_dir =
+			debugfs_create_dir(ndev_name(ndev), debugfs_dir);
+		if (!ndev->debugfs_dir)
+			ndev->debugfs_info = NULL;
+		else
+			ndev->debugfs_info =
+				debugfs_create_file("info", S_IRUSR,
+						    ndev->debugfs_dir, ndev,
+						    &intel_ntb_debugfs_info);
 	}
+}
 
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+static void ndev_deinit_debugfs(struct intel_ntb_dev *ndev)
+{
+	debugfs_remove_recursive(ndev->debugfs_dir);
 }
 
-static int ntb_xeon_setup(struct ntb_device *ndev)
+static int intel_ntb_mw_count(struct ntb_dev *ntb)
 {
-	switch (ndev->conn_type) {
-	case NTB_CONN_B2B:
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
-		if (ndev->split_bar)
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
-		ndev->limits.max_spads = SNB_MAX_B2B_SPADS;
+	return ntb_ndev(ntb)->mw_count;
+}
 
-		/* There is a Xeon hardware errata related to writes to
-		 * SDOORBELL or B2BDOORBELL in conjunction with inbound access
-		 * to NTB MMIO Space, which may hang the system.  To workaround
-		 * this use the second memory window to access the interrupt and
-		 * scratch pad registers on the remote system.
-		 */
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			if (!ndev->mw[ndev->limits.max_mw - 1].bar_sz)
-				return -EINVAL;
-
-			ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-			ndev->reg_ofs.spad_write =
-				ndev->mw[ndev->limits.max_mw - 1].vbase +
-				SNB_SPAD_OFFSET;
-			ndev->reg_ofs.rdb =
-				ndev->mw[ndev->limits.max_mw - 1].vbase +
-				SNB_PDOORBELL_OFFSET;
-
-			/* Set the Limit register to 4k, the minimum size, to
-			 * prevent an illegal access
-			 */
-			writeq(ndev->mw[1].bar_sz + 0x1000, ndev->reg_base +
-			       SNB_PBAR4LMT_OFFSET);
-			/* HW errata on the Limit registers.  They can only be
-			 * written when the base register is 4GB aligned and
-			 * < 32bit.  This should already be the case based on
-			 * the driver defaults, but write the Limit registers
-			 * first just in case.
-			 */
-
-			ndev->limits.max_mw = SNB_ERRATA_MAX_MW;
-		} else {
-			/* HW Errata on bit 14 of b2bdoorbell register.  Writes
-			 * will not be mirrored to the remote system.  Shrink
-			 * the number of bits by one, since bit 14 is the last
-			 * bit.
-			 */
-			ndev->limits.max_db_bits = SNB_MAX_DB_BITS - 1;
-			ndev->reg_ofs.spad_write = ndev->reg_base +
-						   SNB_B2B_SPAD_OFFSET;
-			ndev->reg_ofs.rdb = ndev->reg_base +
-					    SNB_B2B_DOORBELL_OFFSET;
-
-			/* Disable the Limit register, just incase it is set to
-			 * something silly. A 64bit write should handle it
-			 * regardless of whether it has a split BAR or not.
-			 */
-			writeq(0, ndev->reg_base + SNB_PBAR4LMT_OFFSET);
-			/* HW errata on the Limit registers.  They can only be
-			 * written when the base register is 4GB aligned and
-			 * < 32bit.  This should already be the case based on
-			 * the driver defaults, but write the Limit registers
-			 * first just in case.
-			 */
-			if (ndev->split_bar)
-				ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-			else
-				ndev->limits.max_mw = SNB_MAX_MW;
-		}
+static int intel_ntb_mw_get_range(struct ntb_dev *ntb, int idx,
+				  phys_addr_t *base,
+				  resource_size_t *size,
+				  resource_size_t *align,
+				  resource_size_t *align_size)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
+	int bar;
 
-		/* The Xeon errata workaround requires setting SBAR Base
-		 * addresses to known values, so that the PBAR XLAT can be
-		 * pointed at SBAR0 of the remote system.
-		 */
-		if (ndev->dev_type == NTB_DEV_USD) {
-			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
-			       SNB_PBAR2XLAT_OFFSET);
-			if (ndev->wa_flags & WA_SNB_ERR)
-				writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
-				       SNB_PBAR4XLAT_OFFSET);
-			else {
-				if (ndev->split_bar) {
-					writel(SNB_MBAR4_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-					writel(SNB_MBAR5_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR5XLAT_OFFSET);
-				} else
-					writeq(SNB_MBAR4_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-
-				/* B2B_XLAT_OFFSET is a 64bit register, but can
-				 * only take 32bit writes
-				 */
-				writel(SNB_MBAR01_DSD_ADDR & 0xffffffff,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
-				writel(SNB_MBAR01_DSD_ADDR >> 32,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
-			}
-
-			writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
-			       SNB_SBAR0BASE_OFFSET);
-			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
-			       SNB_SBAR2BASE_OFFSET);
-			if (ndev->split_bar) {
-				writel(SNB_MBAR4_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-				writel(SNB_MBAR5_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR5BASE_OFFSET);
-			} else
-				writeq(SNB_MBAR4_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-		} else {
-			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
-			       SNB_PBAR2XLAT_OFFSET);
-			if (ndev->wa_flags & WA_SNB_ERR)
-				writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
-				       SNB_PBAR4XLAT_OFFSET);
-			else {
-				if (ndev->split_bar) {
-					writel(SNB_MBAR4_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-					writel(SNB_MBAR5_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR5XLAT_OFFSET);
-				} else
-					writeq(SNB_MBAR4_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-
-				/*
-				 * B2B_XLAT_OFFSET is a 64bit register, but can
-				 * only take 32bit writes
-				 */
-				writel(SNB_MBAR01_USD_ADDR & 0xffffffff,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
-				writel(SNB_MBAR01_USD_ADDR >> 32,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
-			}
-			writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
-			       SNB_SBAR0BASE_OFFSET);
-			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
-			       SNB_SBAR2BASE_OFFSET);
-			if (ndev->split_bar) {
-				writel(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-				writel(SNB_MBAR5_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR5BASE_OFFSET);
-			} else
-				writeq(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
+	if (idx >= ndev->b2b_idx && !ndev->b2b_off)
+		idx += 1;
 
-		}
-		break;
-	case NTB_CONN_RP:
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			dev_err(&ndev->pdev->dev,
-				"NTB-RP disabled due to hardware errata.\n");
-			return -EINVAL;
-		}
+	bar = ndev_mw_to_bar(ndev, idx);
+	if (bar < 0)
+		return bar;
 
-		/* Scratch pads need to have exclusive access from the primary
-		 * or secondary side.  Halve the num spads so that each side can
-		 * have an equal amount.
-		 */
-		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
-		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-		/* Note: The SDOORBELL is the cause of the errata.  You REALLY
-		 * don't want to touch it.
-		 */
-		ndev->reg_ofs.rdb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
-		/* Offset the start of the spads to correspond to whether it is
-		 * primary or secondary
-		 */
-		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET +
-					   ndev->limits.max_spads * 4;
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
-		if (ndev->split_bar) {
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
-			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-		} else
-			ndev->limits.max_mw = SNB_MAX_MW;
-		break;
-	case NTB_CONN_TRANSPARENT:
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			dev_err(&ndev->pdev->dev,
-				"NTB-TRANSPARENT disabled due to hardware errata.\n");
-			return -EINVAL;
-		}
+	if (base)
+		*base = pci_resource_start(ndev->ntb.pdev, bar) +
+			(idx == ndev->b2b_idx ? ndev->b2b_off : 0);
 
-		/* Scratch pads need to have exclusive access from the primary
-		 * or secondary side.  Halve the num spads so that each side can
-		 * have an equal amount.
-		 */
-		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
-		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-		ndev->reg_ofs.rdb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_SDBMSK_OFFSET;
-		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET;
-		/* Offset the start of the spads to correspond to whether it is
-		 * primary or secondary
-		 */
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET +
-					  ndev->limits.max_spads * 4;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_PBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_PBAR4XLAT_OFFSET;
-
-		if (ndev->split_bar) {
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_PBAR5XLAT_OFFSET;
-			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-		} else
-			ndev->limits.max_mw = SNB_MAX_MW;
-		break;
-	default:
-		/*
-		 * we should never hit this. the detect function should've
-		 * take cared of everything.
-		 */
-		return -EINVAL;
-	}
+	if (size)
+		*size = pci_resource_len(ndev->ntb.pdev, bar) -
+			(idx == ndev->b2b_idx ? ndev->b2b_off : 0);
 
-	ndev->reg_ofs.lnk_cntl = ndev->reg_base + SNB_NTBCNTL_OFFSET;
-	ndev->reg_ofs.lnk_stat = ndev->reg_base + SNB_SLINK_STATUS_OFFSET;
-	ndev->reg_ofs.spci_cmd = ndev->reg_base + SNB_PCICMD_OFFSET;
+	if (align)
+		*align = pci_resource_len(ndev->ntb.pdev, bar);
 
-	ndev->limits.msix_cnt = SNB_MSIX_CNT;
-	ndev->bits_per_vector = SNB_DB_BITS_PER_VEC;
+	if (align_size)
+		*align_size = 1;
 
 	return 0;
 }
 
-static int ntb_bwd_setup(struct ntb_device *ndev)
+static int intel_ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
+				  dma_addr_t addr, resource_size_t size)
 {
-	int rc;
-	u32 val;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
+	unsigned long base_reg, xlat_reg, limit_reg;
+	resource_size_t bar_size, mw_size;
+	void __iomem *mmio;
+	u64 base, limit, reg_val;
+	int bar;
 
-	ndev->hw_type = BWD_HW;
+	if (idx >= ndev->b2b_idx && !ndev->b2b_off)
+		idx += 1;
 
-	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &val);
-	if (rc)
-		return rc;
+	bar = ndev_mw_to_bar(ndev, idx);
+	if (bar < 0)
+		return bar;
 
-	switch ((val & BWD_PPD_CONN_TYPE) >> 8) {
-	case NTB_CONN_B2B:
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-	default:
-		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
+	bar_size = pci_resource_len(ndev->ntb.pdev, bar);
+
+	if (idx == ndev->b2b_idx)
+		mw_size = bar_size - ndev->b2b_off;
+	else
+		mw_size = bar_size;
+
+	/* hardware requires that addr is aligned to bar size */
+	if (addr & (bar_size - 1))
 		return -EINVAL;
+
+	/* make sure the range fits in the usable mw size */
+	if (size > mw_size)
+		return -EINVAL;
+
+	mmio = ndev->self_mmio;
+	base_reg = bar0_off(ndev->xlat_reg->bar0_base, bar);
+	xlat_reg = bar2_off(ndev->xlat_reg->bar2_xlat, bar);
+	limit_reg = bar2_off(ndev->xlat_reg->bar2_limit, bar);
+
+	if (bar < 4 || !ndev->bar4_split) {
+		base = ioread64(mmio + base_reg);
+
+		/* Set the limit if supported, if size is not mw_size */
+		if (limit_reg && size != mw_size)
+			limit = base + size;
+		else
+			limit = 0;
+
+		/* set and verify setting the translation address */
+		iowrite64(addr, mmio + xlat_reg);
+		reg_val = ioread64(mmio + xlat_reg);
+		if (reg_val != addr) {
+			iowrite64(0, mmio + xlat_reg);
+			return -EIO;
+		}
+
+		/* set and verify setting the limit */
+		iowrite64(limit, mmio + limit_reg);
+		reg_val = ioread64(mmio + limit_reg);
+		if (reg_val != limit) {
+			iowrite64(base, mmio + limit_reg);
+			iowrite64(0, mmio + xlat_reg);
+			return -EIO;
+		}
+	} else {
+		/* split bar addr range must all be 32 bit */
+		if (addr & (~0ull << 32))
+			return -EINVAL;
+		if ((addr + size) & (~0ull << 32))
+			return -EINVAL;
+
+		base = ioread32(mmio + base_reg);
+
+		/* Set the limit if supported, if size is not mw_size */
+		if (limit_reg && size != mw_size)
+			limit = base + size;
+		else
+			limit = 0;
+
+		/* set and verify setting the translation address */
+		iowrite32(addr, mmio + xlat_reg);
+		reg_val = ioread32(mmio + xlat_reg);
+		if (reg_val != addr) {
+			iowrite32(0, mmio + xlat_reg);
+			return -EIO;
+		}
+
+		/* set and verify setting the limit */
+		iowrite32(limit, mmio + limit_reg);
+		reg_val = ioread32(mmio + limit_reg);
+		if (reg_val != limit) {
+			iowrite32(base, mmio + limit_reg);
+			iowrite32(0, mmio + xlat_reg);
+			return -EIO;
+		}
 	}
 
-	if (val & BWD_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_DSD;
-	else
-		ndev->dev_type = NTB_DEV_USD;
+	return 0;
+}
 
-	/* Initiate PCI-E link training */
-	rc = pci_write_config_dword(ndev->pdev, NTB_PPD_OFFSET,
-				    val | BWD_PPD_INIT_LINK);
-	if (rc)
-		return rc;
+static int intel_ntb_link_is_up(struct ntb_dev *ntb,
+				enum ntb_speed *speed,
+				enum ntb_width *width)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	ndev->reg_ofs.ldb = ndev->reg_base + BWD_PDOORBELL_OFFSET;
-	ndev->reg_ofs.ldb_mask = ndev->reg_base + BWD_PDBMSK_OFFSET;
-	ndev->reg_ofs.rdb = ndev->reg_base + BWD_B2B_DOORBELL_OFFSET;
-	ndev->reg_ofs.bar2_xlat = ndev->reg_base + BWD_SBAR2XLAT_OFFSET;
-	ndev->reg_ofs.bar4_xlat = ndev->reg_base + BWD_SBAR4XLAT_OFFSET;
-	ndev->reg_ofs.lnk_cntl = ndev->reg_base + BWD_NTBCNTL_OFFSET;
-	ndev->reg_ofs.lnk_stat = ndev->reg_base + BWD_LINK_STATUS_OFFSET;
-	ndev->reg_ofs.spad_read = ndev->reg_base + BWD_SPAD_OFFSET;
-	ndev->reg_ofs.spad_write = ndev->reg_base + BWD_B2B_SPAD_OFFSET;
-	ndev->reg_ofs.spci_cmd = ndev->reg_base + BWD_PCICMD_OFFSET;
-	ndev->limits.max_mw = BWD_MAX_MW;
-	ndev->limits.max_spads = BWD_MAX_SPADS;
-	ndev->limits.max_db_bits = BWD_MAX_DB_BITS;
-	ndev->limits.msix_cnt = BWD_MSIX_CNT;
-	ndev->bits_per_vector = BWD_DB_BITS_PER_VEC;
-
-	/* Since bwd doesn't have a link interrupt, setup a poll timer */
-	INIT_DELAYED_WORK(&ndev->hb_timer, bwd_link_poll);
-	INIT_DELAYED_WORK(&ndev->lr_timer, bwd_link_recovery);
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+	if (ndev->reg->link_is_up(ndev)) {
+		if (speed)
+			*speed = NTB_LNK_STA_SPEED(ndev->lnk_sta);
+		if (width)
+			*width = NTB_LNK_STA_WIDTH(ndev->lnk_sta);
+		return 1;
+	} else {
+		/* TODO MAYBE: is it possible to observe the link speed and
+		 * width while link is training? */
+		if (speed)
+			*speed = NTB_SPEED_NONE;
+		if (width)
+			*width = NTB_WIDTH_NONE;
+		return 0;
+	}
+}
+
+static int intel_ntb_link_enable(struct ntb_dev *ntb,
+				 enum ntb_speed max_speed,
+				 enum ntb_width max_width)
+{
+	struct intel_ntb_dev *ndev;
+	u32 ntb_ctl;
+
+	ndev = container_of(ntb, struct intel_ntb_dev, ntb);
+
+	if (ndev->ntb.topo == NTB_TOPO_SEC)
+		return -EINVAL;
+
+	dev_dbg(ndev_dev(ndev),
+		"Enabling link with max_speed %d max_width %d\n",
+		max_speed, max_width);
+	if (max_speed != NTB_SPEED_AUTO)
+		dev_dbg(ndev_dev(ndev), "ignoring max_speed %d\n", max_speed);
+	if (max_width != NTB_WIDTH_AUTO)
+		dev_dbg(ndev_dev(ndev), "ignoring max_width %d\n", max_width);
+
+	ntb_ctl = ioread32(ndev->self_mmio + ndev->reg->ntb_ctl);
+	ntb_ctl &= ~(NTB_CTL_DISABLE | NTB_CTL_CFG_LOCK);
+	ntb_ctl |= NTB_CTL_P2S_BAR2_SNOOP | NTB_CTL_S2P_BAR2_SNOOP;
+	ntb_ctl |= NTB_CTL_P2S_BAR4_SNOOP | NTB_CTL_S2P_BAR4_SNOOP;
+	if (ndev->bar4_split)
+		ntb_ctl |= NTB_CTL_P2S_BAR5_SNOOP | NTB_CTL_S2P_BAR5_SNOOP;
+	iowrite32(ntb_ctl, ndev->self_mmio + ndev->reg->ntb_ctl);
 
 	return 0;
 }
 
-static int ntb_device_setup(struct ntb_device *ndev)
+static int intel_ntb_link_disable(struct ntb_dev *ntb)
 {
-	int rc;
+	struct intel_ntb_dev *ndev;
+	u32 ntb_cntl;
 
-	if (is_ntb_xeon(ndev))
-		rc = ntb_xeon_setup(ndev);
-	else if (is_ntb_atom(ndev))
-		rc = ntb_bwd_setup(ndev);
-	else
-		rc = -ENODEV;
+	ndev = container_of(ntb, struct intel_ntb_dev, ntb);
 
-	if (rc)
-		return rc;
+	if (ndev->ntb.topo == NTB_TOPO_SEC)
+		return -EINVAL;
 
-	if (ndev->conn_type == NTB_CONN_B2B)
-		/* Enable Bus Master and Memory Space on the secondary side */
-		writew(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
-		       ndev->reg_ofs.spci_cmd);
+	dev_dbg(ndev_dev(ndev), "Disabling link\n");
+
+	/* Bring NTB link down */
+	ntb_cntl = ioread32(ndev->self_mmio + ndev->reg->ntb_ctl);
+	ntb_cntl &= ~(NTB_CTL_P2S_BAR2_SNOOP | NTB_CTL_S2P_BAR2_SNOOP);
+	ntb_cntl &= ~(NTB_CTL_P2S_BAR4_SNOOP | NTB_CTL_S2P_BAR4_SNOOP);
+	if (ndev->bar4_split)
+		ntb_cntl &= ~(NTB_CTL_P2S_BAR5_SNOOP | NTB_CTL_S2P_BAR5_SNOOP);
+	ntb_cntl |= NTB_CTL_DISABLE | NTB_CTL_CFG_LOCK;
+	iowrite32(ntb_cntl, ndev->self_mmio + ndev->reg->ntb_ctl);
 
 	return 0;
 }
 
-static void ntb_device_free(struct ntb_device *ndev)
+static int intel_ntb_db_is_unsafe(struct ntb_dev *ntb)
 {
-	if (is_ntb_atom(ndev)) {
-		cancel_delayed_work_sync(&ndev->hb_timer);
-		cancel_delayed_work_sync(&ndev->lr_timer);
-	}
+	return ndev_ignore_unsafe(ntb_ndev(ntb), NTB_UNSAFE_DB);
 }
 
-static irqreturn_t bwd_callback_msix_irq(int irq, void *data)
+static u64 intel_ntb_db_valid_mask(struct ntb_dev *ntb)
 {
-	struct ntb_db_cb *db_cb = data;
-	struct ntb_device *ndev = db_cb->ndev;
-	unsigned long mask;
+	return ntb_ndev(ntb)->db_valid_mask;
+}
 
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
-		db_cb->db_num);
+static int intel_ntb_db_vector_count(struct ntb_dev *ntb)
+{
+	struct intel_ntb_dev *ndev;
 
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
+	ndev = container_of(ntb, struct intel_ntb_dev, ntb);
 
-	tasklet_schedule(&db_cb->irq_work);
+	return ndev->db_vec_count;
+}
 
-	/* No need to check for the specific HB irq, any interrupt means
-	 * we're connected.
-	 */
-	ndev->last_ts = jiffies;
+static u64 intel_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	writeq((u64) 1 << db_cb->db_num, ndev->reg_ofs.ldb);
+	if (db_vector < 0 || db_vector > ndev->db_vec_count)
+		return 0;
 
-	return IRQ_HANDLED;
+	return ndev->db_valid_mask & ndev_vec_mask(ndev, db_vector);
 }
 
-static irqreturn_t xeon_callback_msix_irq(int irq, void *data)
+static u64 intel_ntb_db_read(struct ntb_dev *ntb)
 {
-	struct ntb_db_cb *db_cb = data;
-	struct ntb_device *ndev = db_cb->ndev;
-	unsigned long mask;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
-		db_cb->db_num);
+	return ndev_db_read(ndev,
+			    ndev->self_mmio +
+			    ndev->self_reg->db_bell);
+}
 
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
+static int intel_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	tasklet_schedule(&db_cb->irq_work);
+	return ndev_db_write(ndev, db_bits,
+			     ndev->self_mmio +
+			     ndev->self_reg->db_bell);
+}
 
-	/* On Sandybridge, there are 16 bits in the interrupt register
-	 * but only 4 vectors.  So, 5 bits are assigned to the first 3
-	 * vectors, with the 4th having a single bit for link
-	 * interrupts.
-	 */
-	writew(((1 << ndev->bits_per_vector) - 1) <<
-	       (db_cb->db_num * ndev->bits_per_vector), ndev->reg_ofs.ldb);
+static int intel_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	return IRQ_HANDLED;
+	return ndev_db_set_mask(ndev, db_bits,
+				ndev->self_mmio +
+				ndev->self_reg->db_mask);
 }
 
-/* Since we do not have a HW doorbell in BWD, this is only used in JF/JT */
-static irqreturn_t xeon_event_msix_irq(int irq, void *dev)
+static int intel_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
 {
-	struct ntb_device *ndev = dev;
-	int rc;
-
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for Events\n", irq);
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	rc = ntb_link_status(ndev);
-	if (rc)
-		dev_err(&ndev->pdev->dev, "Error determining link status\n");
+	return ndev_db_clear_mask(ndev, db_bits,
+				  ndev->self_mmio +
+				  ndev->self_reg->db_mask);
+}
 
-	/* bit 15 is always the link bit */
-	writew(1 << SNB_LINK_DB, ndev->reg_ofs.ldb);
+static int intel_ntb_peer_db_addr(struct ntb_dev *ntb,
+				  phys_addr_t *db_addr,
+				  resource_size_t *db_size)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	return IRQ_HANDLED;
+	return ndev_db_addr(ndev, db_addr, db_size, ndev->peer_addr,
+			    ndev->peer_reg->db_bell);
 }
 
-static irqreturn_t ntb_interrupt(int irq, void *dev)
+static int intel_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
 {
-	struct ntb_device *ndev = dev;
-	unsigned int i = 0;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	if (is_ntb_atom(ndev)) {
-		u64 ldb = readq(ndev->reg_ofs.ldb);
+	return ndev_db_write(ndev, db_bits,
+			     ndev->peer_mmio +
+			     ndev->peer_reg->db_bell);
+}
 
-		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %Lx\n", irq, ldb);
+static int intel_ntb_spad_is_unsafe(struct ntb_dev *ntb)
+{
+	return ndev_ignore_unsafe(ntb_ndev(ntb), NTB_UNSAFE_SPAD);
+}
 
-		while (ldb) {
-			i = __ffs(ldb);
-			ldb &= ldb - 1;
-			bwd_callback_msix_irq(irq, &ndev->db_cb[i]);
-		}
-	} else {
-		u16 ldb = readw(ndev->reg_ofs.ldb);
+static int intel_ntb_spad_count(struct ntb_dev *ntb)
+{
+	struct intel_ntb_dev *ndev;
 
-		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %x\n", irq, ldb);
+	ndev = container_of(ntb, struct intel_ntb_dev, ntb);
 
-		if (ldb & SNB_DB_HW_LINK) {
-			xeon_event_msix_irq(irq, dev);
-			ldb &= ~SNB_DB_HW_LINK;
-		}
+	return ndev->spad_count;
+}
 
-		while (ldb) {
-			i = __ffs(ldb);
-			ldb &= ldb - 1;
-			xeon_callback_msix_irq(irq, &ndev->db_cb[i]);
-		}
-	}
+static u32 intel_ntb_spad_read(struct ntb_dev *ntb, int idx)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	return IRQ_HANDLED;
+	return ndev_spad_read(ndev, idx,
+			      ndev->self_mmio +
+			      ndev->self_reg->spad);
 }
 
-static int ntb_setup_snb_msix(struct ntb_device *ndev, int msix_entries)
+static int intel_ntb_spad_write(struct ntb_dev *ntb,
+				int idx, u32 val)
 {
-	struct pci_dev *pdev = ndev->pdev;
-	struct msix_entry *msix;
-	int rc, i;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	if (msix_entries < ndev->limits.msix_cnt)
-		return -ENOSPC;
+	return ndev_spad_write(ndev, idx, val,
+			       ndev->self_mmio +
+			       ndev->self_reg->spad);
+}
 
-	rc = pci_enable_msix_exact(pdev, ndev->msix_entries, msix_entries);
-	if (rc < 0)
-		return rc;
+static int intel_ntb_peer_spad_addr(struct ntb_dev *ntb, int idx,
+				    phys_addr_t *spad_addr)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	for (i = 0; i < msix_entries; i++) {
-		msix = &ndev->msix_entries[i];
-		WARN_ON(!msix->vector);
+	return ndev_spad_addr(ndev, idx, spad_addr, ndev->peer_addr,
+			      ndev->peer_reg->spad);
+}
 
-		if (i == msix_entries - 1) {
-			rc = request_irq(msix->vector,
-					 xeon_event_msix_irq, 0,
-					 "ntb-event-msix", ndev);
-			if (rc)
-				goto err;
-		} else {
-			rc = request_irq(msix->vector,
-					 xeon_callback_msix_irq, 0,
-					 "ntb-callback-msix",
-					 &ndev->db_cb[i]);
-			if (rc)
-				goto err;
-		}
-	}
+static u32 intel_ntb_peer_spad_read(struct ntb_dev *ntb, int idx)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	ndev->num_msix = msix_entries;
-	ndev->max_cbs = msix_entries - 1;
+	return ndev_spad_read(ndev, idx,
+			      ndev->peer_mmio +
+			      ndev->peer_reg->spad);
+}
 
-	return 0;
+static int intel_ntb_peer_spad_write(struct ntb_dev *ntb,
+				     int idx, u32 val)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-err:
-	while (--i >= 0) {
-		/* Code never reaches here for entry nr 'ndev->num_msix - 1' */
-		msix = &ndev->msix_entries[i];
-		free_irq(msix->vector, &ndev->db_cb[i]);
-	}
+	return ndev_spad_write(ndev, idx, val,
+			       ndev->peer_mmio +
+			       ndev->peer_reg->spad);
+}
 
-	pci_disable_msix(pdev);
-	ndev->num_msix = 0;
+/* BWD */
 
-	return rc;
+static u64 bwd_db_ioread(void __iomem *mmio)
+{
+	return ioread64(mmio);
+}
+
+static void bwd_db_iowrite(u64 bits, void __iomem *mmio)
+{
+	iowrite64(bits, mmio);
 }
 
-static int ntb_setup_bwd_msix(struct ntb_device *ndev, int msix_entries)
+static int bwd_poll_link(struct intel_ntb_dev *ndev)
 {
-	struct pci_dev *pdev = ndev->pdev;
-	struct msix_entry *msix;
-	int rc, i;
+	u32 ntb_ctl;
 
-	msix_entries = pci_enable_msix_range(pdev, ndev->msix_entries,
-					     1, msix_entries);
-	if (msix_entries < 0)
-		return msix_entries;
+	ntb_ctl = ioread32(ndev->self_mmio + BWD_NTBCNTL_OFFSET);
 
-	for (i = 0; i < msix_entries; i++) {
-		msix = &ndev->msix_entries[i];
-		WARN_ON(!msix->vector);
+	if (ntb_ctl == ndev->ntb_ctl)
+		return 0;
 
-		rc = request_irq(msix->vector, bwd_callback_msix_irq, 0,
-				 "ntb-callback-msix", &ndev->db_cb[i]);
-		if (rc)
-			goto err;
-	}
+	ndev->ntb_ctl = ntb_ctl;
 
-	ndev->num_msix = msix_entries;
-	ndev->max_cbs = msix_entries;
+	ndev->lnk_sta = ioread32(ndev->self_mmio + BWD_LINK_STATUS_OFFSET);
 
-	return 0;
+	return 1;
+}
 
-err:
-	while (--i >= 0)
-		free_irq(msix->vector, &ndev->db_cb[i]);
+static int bwd_link_is_up(struct intel_ntb_dev *ndev)
+{
+	return BWD_NTB_CTL_ACTIVE(ndev->ntb_ctl);
+}
 
-	pci_disable_msix(pdev);
-	ndev->num_msix = 0;
+static int bwd_link_is_err(struct intel_ntb_dev *ndev)
+{
+	if (ioread32(ndev->self_mmio + BWD_LTSSMSTATEJMP_OFFSET)
+	    & BWD_LTSSMSTATEJMP_FORCEDETECT)
+		return 1;
 
-	return rc;
+	if (ioread32(ndev->self_mmio + BWD_IBSTERRRCRVSTS0_OFFSET)
+	    & BWD_IBIST_ERR_OFLOW)
+		return 1;
+
+	return 0;
 }
 
-static int ntb_setup_msix(struct ntb_device *ndev)
+static inline enum ntb_topo bwd_ppd_topo(struct intel_ntb_dev *ndev, u32 ppd)
 {
-	struct pci_dev *pdev = ndev->pdev;
-	int msix_entries;
-	int rc, i;
+	switch (ppd & BWD_PPD_TOPO_MASK) {
+	case BWD_PPD_TOPO_B2B_USD:
+		dev_dbg(ndev_dev(ndev), "PPD %d B2B USD\n", ppd);
+		return NTB_TOPO_B2B_USD;
+
+	case BWD_PPD_TOPO_B2B_DSD:
+		dev_dbg(ndev_dev(ndev), "PPD %d B2B DSD\n", ppd);
+		return NTB_TOPO_B2B_DSD;
+
+	case BWD_PPD_TOPO_PRI_USD:
+	case BWD_PPD_TOPO_PRI_DSD: /* accept bogus PRI_DSD */
+	case BWD_PPD_TOPO_SEC_USD:
+	case BWD_PPD_TOPO_SEC_DSD: /* accept bogus SEC_DSD */
+		dev_dbg(ndev_dev(ndev), "PPD %d non B2B disabled\n", ppd);
+		return NTB_TOPO_NONE;
+	}
 
-	msix_entries = pci_msix_vec_count(pdev);
-	if (msix_entries < 0) {
-		rc = msix_entries;
-		goto err;
-	} else if (msix_entries > ndev->limits.msix_cnt) {
-		rc = -EINVAL;
-		goto err;
+	dev_dbg(ndev_dev(ndev), "PPD %d invalid\n", ppd);
+	return NTB_TOPO_NONE;
+}
+
+static void bwd_link_hb(struct work_struct *work)
+{
+	struct intel_ntb_dev *ndev = hb_ndev(work);
+	unsigned long poll_ts;
+	void __iomem *mmio;
+	u32 status32;
+
+	poll_ts = ndev->last_ts + BWD_LINK_HB_TIMEOUT;
+
+	/* Delay polling the link status if an interrupt was received,
+	 * unless the cached link status says the link is down.
+	 */
+	if (time_after(poll_ts, jiffies) && bwd_link_is_up(ndev)) {
+		schedule_delayed_work(&ndev->hb_timer, poll_ts - jiffies);
+		return;
 	}
 
-	ndev->msix_entries = kmalloc(sizeof(struct msix_entry) * msix_entries,
-				     GFP_KERNEL);
-	if (!ndev->msix_entries) {
-		rc = -ENOMEM;
-		goto err;
+	if (bwd_poll_link(ndev))
+		ntb_link_event(&ndev->ntb);
+
+	if (bwd_link_is_up(ndev) || !bwd_link_is_err(ndev)) {
+		schedule_delayed_work(&ndev->hb_timer, BWD_LINK_HB_TIMEOUT);
+		return;
 	}
 
-	for (i = 0; i < msix_entries; i++)
-		ndev->msix_entries[i].entry = i;
+	/* Link is down with error: recover the link! */
 
-	if (is_ntb_atom(ndev))
-		rc = ntb_setup_bwd_msix(ndev, msix_entries);
-	else
-		rc = ntb_setup_snb_msix(ndev, msix_entries);
-	if (rc)
-		goto err1;
+	mmio = ndev->self_mmio;
 
-	return 0;
+	/* Driver resets the NTB ModPhy lanes - magic! */
+	iowrite8(0xe0, mmio + BWD_MODPHY_PCSREG6);
+	iowrite8(0x40, mmio + BWD_MODPHY_PCSREG4);
+	iowrite8(0x60, mmio + BWD_MODPHY_PCSREG4);
+	iowrite8(0x60, mmio + BWD_MODPHY_PCSREG6);
 
-err1:
-	kfree(ndev->msix_entries);
-err:
-	dev_err(&pdev->dev, "Error allocating MSI-X interrupt\n");
-	return rc;
+	/* Driver waits 100ms to allow the NTB ModPhy to settle */
+	msleep(100);
+
+	/* Clear AER Errors, write to clear */
+	status32 = ioread32(mmio + BWD_ERRCORSTS_OFFSET);
+	dev_dbg(ndev_dev(ndev), "ERRCORSTS = %x\n", status32);
+	status32 &= PCI_ERR_COR_REP_ROLL;
+	iowrite32(status32, mmio + BWD_ERRCORSTS_OFFSET);
+
+	/* Clear unexpected electrical idle event in LTSSM, write to clear */
+	status32 = ioread32(mmio + BWD_LTSSMERRSTS0_OFFSET);
+	dev_dbg(ndev_dev(ndev), "LTSSMERRSTS0 = %x\n", status32);
+	status32 |= BWD_LTSSMERRSTS0_UNEXPECTEDEI;
+	iowrite32(status32, mmio + BWD_LTSSMERRSTS0_OFFSET);
+
+	/* Clear DeSkew Buffer error, write to clear */
+	status32 = ioread32(mmio + BWD_DESKEWSTS_OFFSET);
+	dev_dbg(ndev_dev(ndev), "DESKEWSTS = %x\n", status32);
+	status32 |= BWD_DESKEWSTS_DBERR;
+	iowrite32(status32, mmio + BWD_DESKEWSTS_OFFSET);
+
+	status32 = ioread32(mmio + BWD_IBSTERRRCRVSTS0_OFFSET);
+	dev_dbg(ndev_dev(ndev), "IBSTERRRCRVSTS0 = %x\n", status32);
+	status32 &= BWD_IBIST_ERR_OFLOW;
+	iowrite32(status32, mmio + BWD_IBSTERRRCRVSTS0_OFFSET);
+
+	/* Releases the NTB state machine to allow the link to retrain */
+	status32 = ioread32(mmio + BWD_LTSSMSTATEJMP_OFFSET);
+	dev_dbg(ndev_dev(ndev), "LTSSMSTATEJMP = %x\n", status32);
+	status32 &= ~BWD_LTSSMSTATEJMP_FORCEDETECT;
+	iowrite32(status32, mmio + BWD_LTSSMSTATEJMP_OFFSET);
+
+	/* There is a potential race between the 2 NTB devices recovering at the
+	 * same time.  If the times are the same, the link will not recover and
+	 * the driver will be stuck in this loop forever.  Add a random interval
+	 * to the recovery time to prevent this race.
+	 */
+	schedule_delayed_work(&ndev->hb_timer, BWD_LINK_RECOVERY_TIME
+			      + prandom_u32() % BWD_LINK_RECOVERY_TIME);
 }
 
-static int ntb_setup_msi(struct ntb_device *ndev)
+static int bwd_init_isr(struct intel_ntb_dev *ndev)
 {
-	struct pci_dev *pdev = ndev->pdev;
 	int rc;
 
-	rc = pci_enable_msi(pdev);
+	rc = ndev_init_isr(ndev, 1, BWD_DB_MSIX_VECTOR_COUNT,
+			   BWD_DB_MSIX_VECTOR_SHIFT, BWD_DB_TOTAL_SHIFT);
 	if (rc)
 		return rc;
 
-	rc = request_irq(pdev->irq, ntb_interrupt, 0, "ntb-msi", ndev);
-	if (rc) {
-		pci_disable_msi(pdev);
-		dev_err(&pdev->dev, "Error allocating MSI interrupt\n");
-		return rc;
-	}
+	/* BWD doesn't have link status interrupt, poll on that platform */
+	ndev->last_ts = jiffies;
+	INIT_DELAYED_WORK(&ndev->hb_timer, bwd_link_hb);
+	schedule_delayed_work(&ndev->hb_timer, BWD_LINK_HB_TIMEOUT);
 
 	return 0;
 }
 
-static int ntb_setup_intx(struct ntb_device *ndev)
+static void bwd_deinit_isr(struct intel_ntb_dev *ndev)
 {
-	struct pci_dev *pdev = ndev->pdev;
-	int rc;
+	cancel_delayed_work_sync(&ndev->hb_timer);
+	ndev_deinit_isr(ndev);
+}
 
-	/* Verify intx is enabled */
-	pci_intx(pdev, 1);
+static int bwd_init_ntb(struct intel_ntb_dev *ndev)
+{
+	ndev->mw_count = BWD_MW_COUNT;
+	ndev->spad_count = BWD_SPAD_COUNT;
+	ndev->db_count = BWD_DB_COUNT;
 
-	rc = request_irq(pdev->irq, ntb_interrupt, IRQF_SHARED, "ntb-intx",
-			 ndev);
-	if (rc)
-		return rc;
+	switch (ndev->ntb.topo) {
+	case NTB_TOPO_B2B_USD:
+	case NTB_TOPO_B2B_DSD:
+		ndev->self_reg = &bwd_pri_reg;
+		ndev->peer_reg = &bwd_b2b_reg;
+		ndev->xlat_reg = &bwd_sec_xlat;
+
+		/* Enable Bus Master and Memory Space on the secondary side */
+		iowrite16(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
+			  ndev->self_mmio + BWD_SPCICMD_OFFSET);
+
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1;
 
 	return 0;
 }
 
-static int ntb_setup_interrupts(struct ntb_device *ndev)
+static int bwd_init_dev(struct intel_ntb_dev *ndev)
 {
+	u32 ppd;
 	int rc;
 
-	/* On BWD, disable all interrupts.  On SNB, disable all but Link
-	 * Interrupt.  The rest will be unmasked as callbacks are registered.
-	 */
-	if (is_ntb_atom(ndev))
-		writeq(~0, ndev->reg_ofs.ldb_mask);
-	else {
-		u16 var = 1 << SNB_LINK_DB;
-		writew(~var, ndev->reg_ofs.ldb_mask);
-	}
-
-	rc = ntb_setup_msix(ndev);
-	if (!rc)
-		goto done;
+	rc = pci_read_config_dword(ndev->ntb.pdev, BWD_PPD_OFFSET, &ppd);
+	if (rc)
+		return -EIO;
 
-	ndev->bits_per_vector = 1;
-	ndev->max_cbs = ndev->limits.max_db_bits;
+	ndev->ntb.topo = bwd_ppd_topo(ndev, ppd);
+	if (ndev->ntb.topo == NTB_TOPO_NONE)
+		return -EINVAL;
 
-	rc = ntb_setup_msi(ndev);
-	if (!rc)
-		goto done;
+	rc = bwd_init_ntb(ndev);
+	if (rc)
+		return rc;
 
-	rc = ntb_setup_intx(ndev);
-	if (rc) {
-		dev_err(&ndev->pdev->dev, "no usable interrupts\n");
+	rc = bwd_init_isr(ndev);
+	if (rc)
 		return rc;
+
+	if (ndev->ntb.topo != NTB_TOPO_SEC) {
+		/* Initiate PCI-E link training */
+		rc = pci_write_config_dword(ndev->ntb.pdev, BWD_PPD_OFFSET,
+					    ppd | BWD_PPD_INIT_LINK);
+		if (rc)
+			return rc;
 	}
 
-done:
 	return 0;
 }
 
-static void ntb_free_interrupts(struct ntb_device *ndev)
+static void bwd_deinit_dev(struct intel_ntb_dev *ndev)
 {
-	struct pci_dev *pdev = ndev->pdev;
+	bwd_deinit_isr(ndev);
+}
 
-	/* mask interrupts */
-	if (is_ntb_atom(ndev))
-		writeq(~0, ndev->reg_ofs.ldb_mask);
-	else
-		writew(~0, ndev->reg_ofs.ldb_mask);
+/* SNB */
 
-	if (ndev->num_msix) {
-		struct msix_entry *msix;
-		u32 i;
+static u64 snb_db_ioread(void __iomem *mmio)
+{
+	return (u64)ioread16(mmio);
+}
 
-		for (i = 0; i < ndev->num_msix; i++) {
-			msix = &ndev->msix_entries[i];
-			if (is_ntb_xeon(ndev) && i == ndev->num_msix - 1)
-				free_irq(msix->vector, ndev);
-			else
-				free_irq(msix->vector, &ndev->db_cb[i]);
-		}
-		pci_disable_msix(pdev);
-		kfree(ndev->msix_entries);
-	} else {
-		free_irq(pdev->irq, ndev);
+static void snb_db_iowrite(u64 bits, void __iomem *mmio)
+{
+	iowrite16((u16)bits, mmio);
+}
 
-		if (pci_dev_msi_enabled(pdev))
-			pci_disable_msi(pdev);
-	}
+static int snb_poll_link(struct intel_ntb_dev *ndev)
+{
+	u16 reg_val;
+	int rc;
+
+	ndev->reg->db_iowrite(ndev->db_link_mask,
+			      ndev->self_mmio +
+			      ndev->self_reg->db_bell);
+
+	rc = pci_read_config_word(ndev->ntb.pdev,
+				  SNB_LINK_STATUS_OFFSET, &reg_val);
+	if (rc)
+		return 0;
+
+	if (reg_val == ndev->lnk_sta)
+		return 0;
+
+	ndev->lnk_sta = reg_val;
+
+	return 1;
 }
 
-static int ntb_create_callbacks(struct ntb_device *ndev)
+static int snb_link_is_up(struct intel_ntb_dev *ndev)
 {
-	int i;
+	return NTB_LNK_STA_ACTIVE(ndev->lnk_sta);
+}
 
-	/* Chicken-egg issue.  We won't know how many callbacks are necessary
-	 * until we see how many MSI-X vectors we get, but these pointers need
-	 * to be passed into the MSI-X register function.  So, we allocate the
-	 * max, knowing that they might not all be used, to work around this.
-	 */
-	ndev->db_cb = kcalloc(ndev->limits.max_db_bits,
-			      sizeof(struct ntb_db_cb),
-			      GFP_KERNEL);
-	if (!ndev->db_cb)
-		return -ENOMEM;
+static inline enum ntb_topo snb_ppd_topo(struct intel_ntb_dev *ndev, u8 ppd)
+{
+	switch (ppd & SNB_PPD_TOPO_MASK) {
+	case SNB_PPD_TOPO_B2B_USD:
+		return NTB_TOPO_B2B_USD;
+
+	case SNB_PPD_TOPO_B2B_DSD:
+		return NTB_TOPO_B2B_DSD;
+
+	case SNB_PPD_TOPO_PRI_USD:
+	case SNB_PPD_TOPO_PRI_DSD: /* accept bogus PRI_DSD */
+		return NTB_TOPO_PRI;
 
-	for (i = 0; i < ndev->limits.max_db_bits; i++) {
-		ndev->db_cb[i].db_num = i;
-		ndev->db_cb[i].ndev = ndev;
+	case SNB_PPD_TOPO_SEC_USD:
+	case SNB_PPD_TOPO_SEC_DSD: /* accept bogus SEC_DSD */
+		return NTB_TOPO_SEC;
 	}
 
-	return 0;
+	return NTB_TOPO_NONE;
 }
 
-static void ntb_free_callbacks(struct ntb_device *ndev)
+static inline int snb_ppd_bar4_split(struct intel_ntb_dev *ndev, u8 ppd)
 {
-	int i;
+	if (ppd & SNB_PPD_SPLIT_BAR_MASK) {
+		dev_dbg(ndev_dev(ndev), "PPD %d split bar\n", ppd);
+		return 1;
+	}
+	return 0;
+}
 
-	for (i = 0; i < ndev->limits.max_db_bits; i++)
-		ntb_unregister_db_callback(ndev, i);
+static int snb_init_isr(struct intel_ntb_dev *ndev)
+{
+	return ndev_init_isr(ndev, SNB_DB_MSIX_VECTOR_COUNT,
+			     SNB_DB_MSIX_VECTOR_COUNT,
+			     SNB_DB_MSIX_VECTOR_SHIFT,
+			     SNB_DB_TOTAL_SHIFT);
+}
 
-	kfree(ndev->db_cb);
+static void snb_deinit_isr(struct intel_ntb_dev *ndev)
+{
+	ndev_deinit_isr(ndev);
 }
 
-static ssize_t ntb_debugfs_read(struct file *filp, char __user *ubuf,
-				size_t count, loff_t *offp)
+static int snb_setup_b2b_mw(struct intel_ntb_dev *ndev,
+			    const struct intel_b2b_addr *addr,
+			    const struct intel_b2b_addr *peer_addr)
 {
-	struct ntb_device *ndev;
-	char *buf;
-	ssize_t ret, offset, out_count;
+	struct pci_dev *pdev;
+	void __iomem *mmio;
+	resource_size_t bar_size;
+	phys_addr_t bar_addr;
+	int b2b_bar;
+	u8 bar_sz;
+
+	pdev = ndev_pdev(ndev);
+	mmio = ndev->self_mmio;
+
+	if (ndev->b2b_idx >= ndev->mw_count) {
+		dev_dbg(ndev_dev(ndev), "not using b2b mw\n");
+		b2b_bar = 0;
+		ndev->b2b_off = 0;
+	} else {
+		b2b_bar = ndev_mw_to_bar(ndev, ndev->b2b_idx);
+		if (b2b_bar < 0)
+			return -EIO;
 
-	out_count = 500;
+		dev_dbg(ndev_dev(ndev), "using b2b mw bar %d\n", b2b_bar);
 
-	buf = kmalloc(out_count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
+		bar_size = pci_resource_len(ndev->ntb.pdev, b2b_bar);
 
-	ndev = filp->private_data;
-	offset = 0;
-	offset += snprintf(buf + offset, out_count - offset,
-			   "NTB Device Information:\n");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Connection Type - \t\t%s\n",
-			   ndev->conn_type == NTB_CONN_TRANSPARENT ?
-			   "Transparent" : (ndev->conn_type == NTB_CONN_B2B) ?
-			   "Back to back" : "Root Port");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Device Type - \t\t\t%s\n",
-			   ndev->dev_type == NTB_DEV_USD ?
-			   "DSD/USP" : "USD/DSP");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Max Number of Callbacks - \t%u\n",
-			   ntb_max_cbs(ndev));
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Link Status - \t\t\t%s\n",
-			   ntb_hw_link_status(ndev) ? "Up" : "Down");
-	if (ntb_hw_link_status(ndev)) {
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Link Speed - \t\t\tPCI-E Gen %u\n",
-				   ndev->link_speed);
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Link Width - \t\t\tx%u\n",
-				   ndev->link_width);
-	}
+		dev_dbg(ndev_dev(ndev), "b2b bar size %#llx\n", bar_size);
 
-	if (is_ntb_xeon(ndev)) {
-		u32 status32;
-		u16 status16;
-		int rc;
-
-		offset += snprintf(buf + offset, out_count - offset,
-				   "\nNTB Device Statistics:\n");
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Upstream Memory Miss - \t%u\n",
-				   readw(ndev->reg_base +
-					 SNB_USMEMMISS_OFFSET));
-
-		offset += snprintf(buf + offset, out_count - offset,
-				   "\nNTB Hardware Errors:\n");
-
-		rc = pci_read_config_word(ndev->pdev, SNB_DEVSTS_OFFSET,
-					  &status16);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "DEVSTS - \t%#06x\n", status16);
-
-		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
-					  &status16);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "LNKSTS - \t%#06x\n", status16);
-
-		rc = pci_read_config_dword(ndev->pdev, SNB_UNCERRSTS_OFFSET,
-					   &status32);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "UNCERRSTS - \t%#010x\n", status32);
-
-		rc = pci_read_config_dword(ndev->pdev, SNB_CORERRSTS_OFFSET,
-					   &status32);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "CORERRSTS - \t%#010x\n", status32);
+		if (b2b_mw_share && SNB_B2B_MIN_SIZE <= bar_size >> 1) {
+			dev_dbg(ndev_dev(ndev),
+				"b2b using first half of bar\n");
+			ndev->b2b_off = bar_size >> 1;
+		} else if (SNB_B2B_MIN_SIZE <= bar_size) {
+			dev_dbg(ndev_dev(ndev),
+				"b2b using whole bar\n");
+			ndev->b2b_off = 0;
+			--ndev->mw_count;
+		} else {
+			dev_dbg(ndev_dev(ndev),
+				"b2b bar size is too small\n");
+			return -EIO;
+		}
 	}
 
-	if (offset > out_count)
-		offset = out_count;
+	/* Reset the secondary bar sizes to match the primary bar sizes,
+	 * except disable or halve the size of the b2b secondary bar.
+	 *
+	 * Note: code for each specific bar size register, because the register
+	 * offsets are not in a consistent order (bar5sz comes after ppd, odd).
+	 */
+	pci_read_config_byte(pdev, SNB_PBAR23SZ_OFFSET, &bar_sz);
+	dev_dbg(ndev_dev(ndev), "PBAR23SZ %#x\n", bar_sz);
+	if (b2b_bar == 2) {
+		if (ndev->b2b_off)
+			bar_sz -= 1;
+		else
+			bar_sz = 0;
+	}
+	pci_write_config_byte(pdev, SNB_SBAR23SZ_OFFSET, bar_sz);
+	pci_read_config_byte(pdev, SNB_SBAR23SZ_OFFSET, &bar_sz);
+	dev_dbg(ndev_dev(ndev), "SBAR23SZ %#x\n", bar_sz);
+
+	if (!ndev->bar4_split) {
+		pci_read_config_byte(pdev, SNB_PBAR45SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "PBAR45SZ %#x\n", bar_sz);
+		if (b2b_bar == 4) {
+			if (ndev->b2b_off)
+				bar_sz -= 1;
+			else
+				bar_sz = 0;
+		}
+		pci_write_config_byte(pdev, SNB_SBAR45SZ_OFFSET, bar_sz);
+		pci_read_config_byte(pdev, SNB_SBAR45SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "SBAR45SZ %#x\n", bar_sz);
+	} else {
+		pci_read_config_byte(pdev, SNB_PBAR4SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "PBAR4SZ %#x\n", bar_sz);
+		if (b2b_bar == 4) {
+			if (ndev->b2b_off)
+				bar_sz -= 1;
+			else
+				bar_sz = 0;
+		}
+		pci_write_config_byte(pdev, SNB_SBAR4SZ_OFFSET, bar_sz);
+		pci_read_config_byte(pdev, SNB_SBAR4SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "SBAR4SZ %#x\n", bar_sz);
+
+		pci_read_config_byte(pdev, SNB_PBAR5SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "PBAR5SZ %#x\n", bar_sz);
+		if (b2b_bar == 5) {
+			if (ndev->b2b_off)
+				bar_sz -= 1;
+			else
+				bar_sz = 0;
+		}
+		pci_write_config_byte(pdev, SNB_SBAR5SZ_OFFSET, bar_sz);
+		pci_read_config_byte(pdev, SNB_SBAR5SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "SBAR5SZ %#x\n", bar_sz);
+	}
 
-	ret = simple_read_from_buffer(ubuf, count, offp, buf, offset);
-	kfree(buf);
-	return ret;
-}
+	/* SBAR01 hit by first part of the b2b bar */
+	if (b2b_bar == 0)
+		bar_addr = addr->bar0_addr;
+	else if (b2b_bar == 2)
+		bar_addr = addr->bar2_addr64;
+	else if (b2b_bar == 4 && !ndev->bar4_split)
+		bar_addr = addr->bar4_addr64;
+	else if (b2b_bar == 4)
+		bar_addr = addr->bar4_addr32;
+	else if (b2b_bar == 5)
+		bar_addr = addr->bar5_addr32;
+	else
+		return -EIO;
 
-static const struct file_operations ntb_debugfs_info = {
-	.owner = THIS_MODULE,
-	.open = simple_open,
-	.read = ntb_debugfs_read,
-};
+	dev_dbg(ndev_dev(ndev), "SBAR01 %#018llx\n", bar_addr);
+	iowrite64(bar_addr, mmio + SNB_SBAR0BASE_OFFSET);
 
-static void ntb_setup_debugfs(struct ntb_device *ndev)
-{
-	if (!debugfs_initialized())
-		return;
+	/* Other SBAR are normally hit by the PBAR xlat, except for b2b bar.
+	 * The b2b bar is either disabled above, or configured half-size, and
+	 * it starts at the PBAR xlat + offset.
+	 */
 
-	if (!debugfs_dir)
-		debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+	bar_addr = addr->bar2_addr64 + (b2b_bar == 2 ? ndev->b2b_off : 0);
+	iowrite64(bar_addr, mmio + SNB_SBAR23BASE_OFFSET);
+	bar_addr = ioread64(mmio + SNB_SBAR23BASE_OFFSET);
+	dev_dbg(ndev_dev(ndev), "SBAR23 %#018llx\n", bar_addr);
+
+	if (!ndev->bar4_split) {
+		bar_addr = addr->bar4_addr64 +
+			(b2b_bar == 4 ? ndev->b2b_off : 0);
+		iowrite64(bar_addr, mmio + SNB_SBAR45BASE_OFFSET);
+		bar_addr = ioread64(mmio + SNB_SBAR45BASE_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR45 %#018llx\n", bar_addr);
+	} else {
+		bar_addr = addr->bar4_addr32 +
+			(b2b_bar == 4 ? ndev->b2b_off : 0);
+		iowrite32(bar_addr, mmio + SNB_SBAR4BASE_OFFSET);
+		bar_addr = ioread32(mmio + SNB_SBAR4BASE_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR4 %#010llx\n", bar_addr);
+
+		bar_addr = addr->bar5_addr32 +
+			(b2b_bar == 5 ? ndev->b2b_off : 0);
+		iowrite32(bar_addr, mmio + SNB_SBAR5BASE_OFFSET);
+		bar_addr = ioread32(mmio + SNB_SBAR5BASE_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR5 %#010llx\n", bar_addr);
+	}
 
-	ndev->debugfs_dir = debugfs_create_dir(pci_name(ndev->pdev),
-					       debugfs_dir);
-	if (ndev->debugfs_dir)
-		ndev->debugfs_info = debugfs_create_file("info", S_IRUSR,
-							 ndev->debugfs_dir,
-							 ndev,
-							 &ntb_debugfs_info);
-}
+	/* setup incoming bar limits == base addrs (zero length windows) */
 
-static void ntb_free_debugfs(struct ntb_device *ndev)
-{
-	debugfs_remove_recursive(ndev->debugfs_dir);
+	bar_addr = addr->bar2_addr64 + (b2b_bar == 2 ? ndev->b2b_off : 0);
+	iowrite64(bar_addr, mmio + SNB_SBAR23LMT_OFFSET);
+	bar_addr = ioread64(mmio + SNB_SBAR23LMT_OFFSET);
+	dev_dbg(ndev_dev(ndev), "SBAR23LMT %#018llx\n", bar_addr);
 
-	if (debugfs_dir && simple_empty(debugfs_dir)) {
-		debugfs_remove_recursive(debugfs_dir);
-		debugfs_dir = NULL;
+	if (!ndev->bar4_split) {
+		bar_addr = addr->bar4_addr64 +
+			(b2b_bar == 4 ? ndev->b2b_off : 0);
+		iowrite64(bar_addr, mmio + SNB_SBAR45LMT_OFFSET);
+		bar_addr = ioread64(mmio + SNB_SBAR45LMT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR45LMT %#018llx\n", bar_addr);
+	} else {
+		bar_addr = addr->bar4_addr32 +
+			(b2b_bar == 4 ? ndev->b2b_off : 0);
+		iowrite32(bar_addr, mmio + SNB_SBAR4LMT_OFFSET);
+		bar_addr = ioread32(mmio + SNB_SBAR4LMT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR4LMT %#010llx\n", bar_addr);
+
+		bar_addr = addr->bar5_addr32 +
+			(b2b_bar == 5 ? ndev->b2b_off : 0);
+		iowrite32(bar_addr, mmio + SNB_SBAR5LMT_OFFSET);
+		bar_addr = ioread32(mmio + SNB_SBAR5LMT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR5LMT %#05llx\n", bar_addr);
 	}
-}
 
-static void ntb_hw_link_up(struct ntb_device *ndev)
-{
-	if (ndev->conn_type == NTB_CONN_TRANSPARENT)
-		ntb_link_event(ndev, NTB_LINK_UP);
-	else {
-		u32 ntb_cntl;
+	/* zero incoming translation addrs */
+	iowrite64(0, mmio + SNB_SBAR23XLAT_OFFSET);
 
-		/* Let's bring the NTB link up */
-		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-		ntb_cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
-		ntb_cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP;
-		ntb_cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
-		if (ndev->split_bar)
-			ntb_cntl |= NTB_CNTL_P2S_BAR5_SNOOP |
-				    NTB_CNTL_S2P_BAR5_SNOOP;
+	if (!ndev->bar4_split) {
+		iowrite64(0, mmio + SNB_SBAR45XLAT_OFFSET);
+	} else {
+		iowrite32(0, mmio + SNB_SBAR4XLAT_OFFSET);
+		iowrite32(0, mmio + SNB_SBAR5XLAT_OFFSET);
+	}
 
-		writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+	/* zero outgoing translation limits (whole bar size windows) */
+	iowrite64(0, mmio + SNB_PBAR23LMT_OFFSET);
+	if (!ndev->bar4_split) {
+		iowrite64(0, mmio + SNB_PBAR45LMT_OFFSET);
+	} else {
+		iowrite32(0, mmio + SNB_PBAR4LMT_OFFSET);
+		iowrite32(0, mmio + SNB_PBAR5LMT_OFFSET);
 	}
-}
 
-static void ntb_hw_link_down(struct ntb_device *ndev)
-{
-	u32 ntb_cntl;
+	/* set outgoing translation offsets */
+	bar_addr = peer_addr->bar2_addr64;
+	iowrite64(bar_addr, mmio + SNB_PBAR23XLAT_OFFSET);
+	bar_addr = ioread64(mmio + SNB_PBAR23XLAT_OFFSET);
+	dev_dbg(ndev_dev(ndev), "PBAR23XLAT %#018llx\n", bar_addr);
+
+	if (!ndev->bar4_split) {
+		bar_addr = peer_addr->bar4_addr64;
+		iowrite64(bar_addr, mmio + SNB_PBAR45XLAT_OFFSET);
+		bar_addr = ioread64(mmio + SNB_PBAR45XLAT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "PBAR45XLAT %#018llx\n", bar_addr);
+	} else {
+		bar_addr = peer_addr->bar4_addr32;
+		iowrite32(bar_addr, mmio + SNB_PBAR4XLAT_OFFSET);
+		bar_addr = ioread32(mmio + SNB_PBAR4XLAT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "PBAR4XLAT %#010llx\n", bar_addr);
+
+		bar_addr = peer_addr->bar5_addr32;
+		iowrite32(bar_addr, mmio + SNB_PBAR5XLAT_OFFSET);
+		bar_addr = ioread32(mmio + SNB_PBAR5XLAT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "PBAR5XLAT %#010llx\n", bar_addr);
+	}
 
-	if (ndev->conn_type == NTB_CONN_TRANSPARENT) {
-		ntb_link_event(ndev, NTB_LINK_DOWN);
-		return;
+	/* set the translation offset for b2b registers */
+	if (b2b_bar == 0)
+		bar_addr = peer_addr->bar0_addr;
+	else if (b2b_bar == 2)
+		bar_addr = peer_addr->bar2_addr64;
+	else if (b2b_bar == 4 && !ndev->bar4_split)
+		bar_addr = peer_addr->bar4_addr64;
+	else if (b2b_bar == 4)
+		bar_addr = peer_addr->bar4_addr32;
+	else if (b2b_bar == 5)
+		bar_addr = peer_addr->bar5_addr32;
+	else
+		return -EIO;
+
+	/* B2B_XLAT_OFFSET is 64bit, but can only take 32bit writes */
+	dev_dbg(ndev_dev(ndev), "B2BXLAT %#018llx\n", bar_addr);
+	iowrite32(bar_addr, mmio + SNB_B2B_XLAT_OFFSETL);
+	iowrite32(bar_addr >> 32, mmio + SNB_B2B_XLAT_OFFSETU);
+
+	if (b2b_bar) {
+		/* map peer ntb mmio config space registers */
+		ndev->peer_mmio = pci_iomap(pdev, b2b_bar,
+					    SNB_B2B_MIN_SIZE);
+		if (!ndev->peer_mmio)
+			return -EIO;
 	}
 
-	/* Bring NTB link down */
-	ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-	ntb_cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP);
-	ntb_cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP);
-	if (ndev->split_bar)
-		ntb_cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP |
-			      NTB_CNTL_S2P_BAR5_SNOOP);
-	ntb_cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
-	writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+	return 0;
 }
 
-static void ntb_max_mw_detect(struct ntb_device *ndev)
+static int snb_init_ntb(struct intel_ntb_dev *ndev)
 {
-	if (ndev->split_bar)
-		ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+	int rc;
+
+	if (ndev->bar4_split)
+		ndev->mw_count = HSX_SPLIT_BAR_MW_COUNT;
 	else
-		ndev->limits.max_mw = SNB_MAX_MW;
-}
+		ndev->mw_count = SNB_MW_COUNT;
 
-static int ntb_xeon_detect(struct ntb_device *ndev)
-{
-	int rc, bars_mask;
-	u32 bars;
-	u8 ppd;
+	ndev->spad_count = SNB_SPAD_COUNT;
+	ndev->db_count = SNB_DB_COUNT;
+	ndev->db_link_mask = SNB_DB_LINK_BIT;
 
-	ndev->hw_type = SNB_HW;
+	switch (ndev->ntb.topo) {
+	case NTB_TOPO_PRI:
+		if (ndev->hwerr_flags & NTB_HWERR_SDOORBELL_LOCKUP) {
+			dev_err(ndev_dev(ndev), "NTB Primary config disabled\n");
+			return -EINVAL;
+		}
+		/* use half the spads for the peer */
+		ndev->spad_count >>= 1;
+		ndev->self_reg = &snb_pri_reg;
+		ndev->peer_reg = &snb_sec_reg;
+		ndev->xlat_reg = &snb_sec_xlat;
+		break;
 
-	rc = pci_read_config_byte(ndev->pdev, NTB_PPD_OFFSET, &ppd);
-	if (rc)
-		return -EIO;
+	case NTB_TOPO_SEC:
+		if (ndev->hwerr_flags & NTB_HWERR_SDOORBELL_LOCKUP) {
+			dev_err(ndev_dev(ndev), "NTB Secondary config disabled\n");
+			return -EINVAL;
+		}
+		/* use half the spads for the peer */
+		ndev->spad_count >>= 1;
+		ndev->self_reg = &snb_sec_reg;
+		ndev->peer_reg = &snb_pri_reg;
+		ndev->xlat_reg = &snb_pri_xlat;
+		break;
 
-	if (ppd & SNB_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_USD;
-	else
-		ndev->dev_type = NTB_DEV_DSD;
+	case NTB_TOPO_B2B_USD:
+	case NTB_TOPO_B2B_DSD:
+		ndev->self_reg = &snb_pri_reg;
+		ndev->peer_reg = &snb_b2b_reg;
+		ndev->xlat_reg = &snb_sec_xlat;
 
-	ndev->split_bar = (ppd & SNB_PPD_SPLIT_BAR) ? 1 : 0;
+		if (ndev->hwerr_flags & NTB_HWERR_SDOORBELL_LOCKUP) {
+			ndev->peer_reg = &snb_pri_reg;
 
-	switch (ppd & SNB_PPD_CONN_TYPE) {
-	case NTB_CONN_B2B:
-		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-		dev_info(&ndev->pdev->dev, "Conn Type = RP\n");
-		ndev->conn_type = NTB_CONN_RP;
-		break;
-	case NTB_CONN_TRANSPARENT:
-		dev_info(&ndev->pdev->dev, "Conn Type = TRANSPARENT\n");
-		ndev->conn_type = NTB_CONN_TRANSPARENT;
-		/*
-		 * This mode is default to USD/DSP. HW does not report
-		 * properly in transparent mode as it has no knowledge of
-		 * NTB. We will just force correct here.
-		 */
-		ndev->dev_type = NTB_DEV_USD;
+			if (b2b_mw_idx < 0)
+				ndev->b2b_idx = b2b_mw_idx + ndev->mw_count;
+			else
+				ndev->b2b_idx = b2b_mw_idx;
 
-		/*
-		 * This is a way for transparent BAR to figure out if we
-		 * are doing split BAR or not. There is no way for the hw
-		 * on the transparent side to know and set the PPD.
-		 */
-		bars_mask = pci_select_bars(ndev->pdev, IORESOURCE_MEM);
-		bars = hweight32(bars_mask);
-		if (bars == (HSX_SPLITBAR_MAX_MW + 1))
-			ndev->split_bar = 1;
+			dev_dbg(ndev_dev(ndev),
+				"setting up b2b mw idx %d means %d\n",
+				b2b_mw_idx, ndev->b2b_idx);
+
+		} else if (ndev->hwerr_flags & NTB_HWERR_B2BDOORBELL_BIT14) {
+			dev_warn(ndev_dev(ndev), "Reduce doorbell count by 1\n");
+			ndev->db_count -= 1;
+		}
+
+		if (ndev->ntb.topo == NTB_TOPO_B2B_USD) {
+			rc = snb_setup_b2b_mw(ndev,
+					      &snb_b2b_dsd_addr,
+					      &snb_b2b_usd_addr);
+		} else {
+			rc = snb_setup_b2b_mw(ndev,
+					      &snb_b2b_usd_addr,
+					      &snb_b2b_dsd_addr);
+		}
+		if (rc)
+			return rc;
+
+		/* Enable Bus Master and Memory Space on the secondary side */
+		iowrite16(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
+			  ndev->self_mmio + SNB_SPCICMD_OFFSET);
 
 		break;
+
 	default:
-		dev_err(&ndev->pdev->dev, "Unknown PPD %x\n", ppd);
-		return -ENODEV;
+		return -EINVAL;
 	}
 
-	ntb_max_mw_detect(ndev);
+	ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1;
+
+	ndev->reg->db_iowrite(ndev->db_valid_mask,
+			      ndev->self_mmio +
+			      ndev->self_reg->db_mask);
 
 	return 0;
 }
 
-static int ntb_atom_detect(struct ntb_device *ndev)
+static int snb_init_dev(struct intel_ntb_dev *ndev)
 {
-	int rc;
-	u32 ppd;
+	struct pci_dev *pdev;
+	u8 ppd;
+	int rc, mem;
+
+	/* There is a Xeon hardware errata related to writes to SDOORBELL or
+	 * B2BDOORBELL in conjunction with inbound access to NTB MMIO Space,
+	 * which may hang the system.  To workaround this use the second memory
+	 * window to access the interrupt and scratch pad registers on the
+	 * remote system.
+	 */
+	ndev->hwerr_flags |= NTB_HWERR_SDOORBELL_LOCKUP;
 
-	ndev->hw_type = BWD_HW;
-	ndev->limits.max_mw = BWD_MAX_MW;
+	/* There is a hardware errata related to accessing any register in
+	 * SB01BASE in the presence of bidirectional traffic crossing the NTB.
+	 */
+	ndev->hwerr_flags |= NTB_HWERR_SB01BASE_LOCKUP;
+
+	/* HW Errata on bit 14 of b2bdoorbell register.  Writes will not be
+	 * mirrored to the remote system.  Shrink the number of bits by one,
+	 * since bit 14 is the last bit.
+	 */
+	ndev->hwerr_flags |= NTB_HWERR_B2BDOORBELL_BIT14;
 
-	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &ppd);
+	ndev->reg = &snb_reg;
+
+	pdev = ndev_pdev(ndev);
+
+	rc = pci_read_config_byte(pdev, SNB_PPD_OFFSET, &ppd);
 	if (rc)
-		return rc;
+		return -EIO;
 
-	switch ((ppd & BWD_PPD_CONN_TYPE) >> 8) {
-	case NTB_CONN_B2B:
-		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-	default:
-		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
+	ndev->ntb.topo = snb_ppd_topo(ndev, ppd);
+	dev_dbg(ndev_dev(ndev), "ppd %#x topo %s\n", ppd,
+		ntb_topo_string(ndev->ntb.topo));
+	if (ndev->ntb.topo == NTB_TOPO_NONE)
 		return -EINVAL;
+
+	if (ndev->ntb.topo != NTB_TOPO_SEC) {
+		ndev->bar4_split = snb_ppd_bar4_split(ndev, ppd);
+		dev_dbg(ndev_dev(ndev), "ppd %#x bar4_split %d\n",
+			ppd, ndev->bar4_split);
+	} else {
+		/* This is a way for transparent BAR to figure out if we are
+		 * doing split BAR or not. There is no way for the hw on the
+		 * transparent side to know and set the PPD.
+		 */
+		mem = pci_select_bars(pdev, IORESOURCE_MEM);
+		ndev->bar4_split = hweight32(mem) ==
+			HSX_SPLIT_BAR_MW_COUNT + 1;
+		dev_dbg(ndev_dev(ndev), "mem %#x bar4_split %d\n",
+			mem, ndev->bar4_split);
 	}
 
-	if (ppd & BWD_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_DSD;
-	else
-		ndev->dev_type = NTB_DEV_USD;
+	rc = snb_init_ntb(ndev);
+	if (rc)
+		return rc;
 
-	return 0;
+	return snb_init_isr(ndev);
+}
+
+static void snb_deinit_dev(struct intel_ntb_dev *ndev)
+{
+	snb_deinit_isr(ndev);
 }
 
-static int ntb_device_detect(struct ntb_device *ndev)
+static int intel_ntb_init_pci(struct intel_ntb_dev *ndev, struct pci_dev *pdev)
 {
 	int rc;
 
-	if (is_ntb_xeon(ndev))
-		rc = ntb_xeon_detect(ndev);
-	else if (is_ntb_atom(ndev))
-		rc = ntb_atom_detect(ndev);
-	else
-		rc = -ENODEV;
+	pci_set_drvdata(pdev, ndev);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		goto err_pci_enable;
+
+	rc = pci_request_regions(pdev, NTB_NAME);
+	if (rc)
+		goto err_pci_regions;
+
+	pci_set_master(pdev);
+
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err_dma_mask;
+		dev_warn(ndev_dev(ndev), "Cannot DMA highmem\n");
+	}
 
-	dev_info(&ndev->pdev->dev, "Device Type = %s\n",
-		 ndev->dev_type == NTB_DEV_USD ? "USD/DSP" : "DSD/USP");
+	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err_dma_mask;
+		dev_warn(ndev_dev(ndev), "Cannot DMA consistent highmem\n");
+	}
+
+	ndev->self_mmio = pci_iomap(pdev, 0, 0);
+	if (!ndev->self_mmio) {
+		rc = -EIO;
+		goto err_mmio;
+	}
+	ndev->peer_mmio = ndev->self_mmio;
 
 	return 0;
+
+err_mmio:
+err_dma_mask:
+	pci_clear_master(pdev);
+	pci_release_regions(pdev);
+err_pci_regions:
+	pci_disable_device(pdev);
+err_pci_enable:
+	pci_set_drvdata(pdev, NULL);
+	return rc;
 }
 
-static int ntb_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+static void intel_ntb_deinit_pci(struct intel_ntb_dev *ndev)
 {
-	struct ntb_device *ndev;
-	int rc, i;
+	struct pci_dev *pdev = ndev_pdev(ndev);
 
-	ndev = kzalloc(sizeof(struct ntb_device), GFP_KERNEL);
-	if (!ndev)
-		return -ENOMEM;
+	if (ndev->peer_mmio && ndev->peer_mmio != ndev->self_mmio)
+		pci_iounmap(pdev, ndev->peer_mmio);
+	pci_iounmap(pdev, ndev->self_mmio);
 
-	ndev->pdev = pdev;
+	pci_clear_master(pdev);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
 
-	ntb_set_errata_flags(ndev);
+static inline void ndev_init_struct(struct intel_ntb_dev *ndev,
+				    struct pci_dev *pdev)
+{
+	ndev->ntb.pdev = pdev;
+	ndev->ntb.topo = NTB_TOPO_NONE;
+	ndev->ntb.ops = &intel_ntb_ops;
 
-	ndev->link_status = NTB_LINK_DOWN;
-	pci_set_drvdata(pdev, ndev);
-	ntb_setup_debugfs(ndev);
+	ndev->b2b_off = 0;
+	ndev->b2b_idx = INT_MAX;
 
-	rc = pci_enable_device(pdev);
-	if (rc)
-		goto err;
+	ndev->bar4_split = 0;
 
-	pci_set_master(ndev->pdev);
+	ndev->mw_count = 0;
+	ndev->spad_count = 0;
+	ndev->db_count = 0;
+	ndev->db_vec_count = 0;
+	ndev->db_vec_shift = 0;
 
-	rc = ntb_device_detect(ndev);
-	if (rc)
-		goto err;
+	ndev->ntb_ctl = 0;
+	ndev->lnk_sta = 0;
 
-	ndev->mw = kcalloc(ndev->limits.max_mw, sizeof(struct ntb_mw),
-			   GFP_KERNEL);
-	if (!ndev->mw) {
-		rc = -ENOMEM;
-		goto err1;
-	}
+	ndev->db_valid_mask = 0;
+	ndev->db_link_mask = 0;
+	ndev->db_mask = 0;
 
-	if (ndev->split_bar)
-		rc = pci_request_selected_regions(pdev, NTB_SPLITBAR_MASK,
-						  KBUILD_MODNAME);
-	else
-		rc = pci_request_selected_regions(pdev, NTB_BAR_MASK,
-						  KBUILD_MODNAME);
+	spin_lock_init(&ndev->db_mask_lock);
+}
 
-	if (rc)
-		goto err2;
+static int intel_ntb_pci_probe(struct pci_dev *pdev,
+			       const struct pci_device_id *id)
+{
+	struct intel_ntb_dev *ndev;
+	int rc;
 
-	ndev->reg_base = pci_ioremap_bar(pdev, NTB_BAR_MMIO);
-	if (!ndev->reg_base) {
-		dev_warn(&pdev->dev, "Cannot remap BAR 0\n");
-		rc = -EIO;
-		goto err3;
-	}
+	if (pdev_is_bwd(pdev)) {
+		ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
+		if (!ndev) {
+			rc = -ENOMEM;
+			goto err_ndev;
+		}
 
-	for (i = 0; i < ndev->limits.max_mw; i++) {
-		ndev->mw[i].bar_sz = pci_resource_len(pdev, MW_TO_BAR(i));
+		ndev_init_struct(ndev, pdev);
 
-		/*
-		 * with the errata we need to steal last of the memory
-		 * windows for workarounds and they point to MMIO registers.
-		 */
-		if ((ndev->wa_flags & WA_SNB_ERR) &&
-		    (i == (ndev->limits.max_mw - 1))) {
-			ndev->mw[i].vbase =
-				ioremap_nocache(pci_resource_start(pdev,
-							MW_TO_BAR(i)),
-						ndev->mw[i].bar_sz);
-		} else {
-			ndev->mw[i].vbase =
-				ioremap_wc(pci_resource_start(pdev,
-							MW_TO_BAR(i)),
-					   ndev->mw[i].bar_sz);
-		}
+		rc = intel_ntb_init_pci(ndev, pdev);
+		if (rc)
+			goto err_init_pci;
+
+		rc = bwd_init_dev(ndev);
+		if (rc)
+			goto err_init_dev;
 
-		dev_info(&pdev->dev, "MW %d size %llu\n", i,
-			 (unsigned long long) ndev->mw[i].bar_sz);
-		if (!ndev->mw[i].vbase) {
-			dev_warn(&pdev->dev, "Cannot remap BAR %d\n",
-				 MW_TO_BAR(i));
-			rc = -EIO;
-			goto err4;
+	} else if (pdev_is_snb(pdev)) {
+		ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
+		if (!ndev) {
+			rc = -ENOMEM;
+			goto err_ndev;
 		}
-	}
 
-	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-		if (rc)
-			goto err4;
+		ndev_init_struct(ndev, pdev);
 
-		dev_warn(&pdev->dev, "Cannot DMA highmem\n");
-	}
+		rc = intel_ntb_init_pci(ndev, pdev);
+		if (rc)
+			goto err_init_pci;
 
-	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		rc = snb_init_dev(ndev);
 		if (rc)
-			goto err4;
+			goto err_init_dev;
 
-		dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n");
+	} else {
+		rc = -EINVAL;
+		goto err_ndev;
 	}
 
-	rc = ntb_device_setup(ndev);
-	if (rc)
-		goto err4;
-
-	rc = ntb_create_callbacks(ndev);
-	if (rc)
-		goto err5;
+	ndev_reset_unsafe_flags(ndev);
 
-	rc = ntb_setup_interrupts(ndev);
-	if (rc)
-		goto err6;
+	ndev->reg->poll_link(ndev);
 
-	/* The scratchpad registers keep the values between rmmod/insmod,
-	 * blast them now
-	 */
-	for (i = 0; i < ndev->limits.max_spads; i++) {
-		ntb_write_local_spad(ndev, i, 0);
-		ntb_write_remote_spad(ndev, i, 0);
-	}
+	ndev_init_debugfs(ndev);
 
-	rc = ntb_transport_init(pdev);
+	rc = ntb_register_device(&ndev->ntb);
 	if (rc)
-		goto err7;
-
-	ntb_hw_link_up(ndev);
+		goto err_register;
 
 	return 0;
 
-err7:
-	ntb_free_interrupts(ndev);
-err6:
-	ntb_free_callbacks(ndev);
-err5:
-	ntb_device_free(ndev);
-err4:
-	for (i--; i >= 0; i--)
-		iounmap(ndev->mw[i].vbase);
-	iounmap(ndev->reg_base);
-err3:
-	if (ndev->split_bar)
-		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
-	else
-		pci_release_selected_regions(pdev, NTB_BAR_MASK);
-err2:
-	kfree(ndev->mw);
-err1:
-	pci_disable_device(pdev);
-err:
-	ntb_free_debugfs(ndev);
+err_register:
+	ndev_deinit_debugfs(ndev);
+	if (pdev_is_bwd(pdev))
+		bwd_deinit_dev(ndev);
+	else if (pdev_is_snb(pdev))
+		snb_deinit_dev(ndev);
+err_init_dev:
+	intel_ntb_deinit_pci(ndev);
+err_init_pci:
 	kfree(ndev);
-
-	dev_err(&pdev->dev, "Error loading %s module\n", KBUILD_MODNAME);
+err_ndev:
 	return rc;
 }
 
-static void ntb_pci_remove(struct pci_dev *pdev)
+static void intel_ntb_pci_remove(struct pci_dev *pdev)
 {
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-	int i;
+	struct intel_ntb_dev *ndev = pci_get_drvdata(pdev);
+
+	ntb_unregister_device(&ndev->ntb);
+	ndev_deinit_debugfs(ndev);
+	if (pdev_is_bwd(pdev))
+		bwd_deinit_dev(ndev);
+	else if (pdev_is_snb(pdev))
+		snb_deinit_dev(ndev);
+	intel_ntb_deinit_pci(ndev);
+	kfree(ndev);
+}
 
-	ntb_hw_link_down(ndev);
+static const struct intel_ntb_reg bwd_reg = {
+	.poll_link		= bwd_poll_link,
+	.link_is_up		= bwd_link_is_up,
+	.db_ioread		= bwd_db_ioread,
+	.db_iowrite		= bwd_db_iowrite,
+	.db_size		= sizeof(u64),
+	.ntb_ctl		= BWD_NTBCNTL_OFFSET,
+	.mw_bar			= {2, 4},
+};
 
-	ntb_transport_free(ndev->ntb_transport);
+static const struct intel_ntb_alt_reg bwd_pri_reg = {
+	.db_bell		= BWD_PDOORBELL_OFFSET,
+	.db_mask		= BWD_PDBMSK_OFFSET,
+	.spad			= BWD_SPAD_OFFSET,
+};
 
-	ntb_free_interrupts(ndev);
-	ntb_free_callbacks(ndev);
-	ntb_device_free(ndev);
+static const struct intel_ntb_alt_reg bwd_b2b_reg = {
+	.db_bell		= BWD_B2B_DOORBELL_OFFSET,
+	.spad			= BWD_B2B_SPAD_OFFSET,
+};
 
-	/* need to reset max_mw limits so we can unmap properly */
-	if (ndev->hw_type == SNB_HW)
-		ntb_max_mw_detect(ndev);
+static const struct intel_ntb_xlat_reg bwd_sec_xlat = {
+	/* FIXME : .bar0_base	= BWD_SBAR0BASE_OFFSET, */
+	/* FIXME : .bar2_limit	= BWD_SBAR2LMT_OFFSET, */
+	.bar2_xlat		= BWD_SBAR2XLAT_OFFSET,
+};
 
-	for (i = 0; i < ndev->limits.max_mw; i++)
-		iounmap(ndev->mw[i].vbase);
+static const struct intel_ntb_reg snb_reg = {
+	.poll_link		= snb_poll_link,
+	.link_is_up		= snb_link_is_up,
+	.db_ioread		= snb_db_ioread,
+	.db_iowrite		= snb_db_iowrite,
+	.db_size		= sizeof(u32),
+	.ntb_ctl		= SNB_NTBCNTL_OFFSET,
+	.mw_bar			= {2, 4, 5},
+};
 
-	kfree(ndev->mw);
-	iounmap(ndev->reg_base);
-	if (ndev->split_bar)
-		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
-	else
-		pci_release_selected_regions(pdev, NTB_BAR_MASK);
-	pci_disable_device(pdev);
-	ntb_free_debugfs(ndev);
-	kfree(ndev);
-}
+static const struct intel_ntb_alt_reg snb_pri_reg = {
+	.db_bell		= SNB_PDOORBELL_OFFSET,
+	.db_mask		= SNB_PDBMSK_OFFSET,
+	.spad			= SNB_SPAD_OFFSET,
+};
+
+static const struct intel_ntb_alt_reg snb_sec_reg = {
+	.db_bell		= SNB_SDOORBELL_OFFSET,
+	.db_mask		= SNB_SDBMSK_OFFSET,
+	/* second half of the scratchpads */
+	.spad			= SNB_SPAD_OFFSET + (SNB_SPAD_COUNT << 1),
+};
 
-static struct pci_driver ntb_pci_driver = {
+static const struct intel_ntb_alt_reg snb_b2b_reg = {
+	.db_bell		= SNB_B2B_DOORBELL_OFFSET,
+	.spad			= SNB_B2B_SPAD_OFFSET,
+};
+
+static const struct intel_ntb_xlat_reg snb_pri_xlat = {
+	/* Note: no primary .bar0_base visible to the secondary side.
+	 *
+	 * The secondary side cannot get the base address stored in primary
+	 * bars.  The base address is necessary to set the limit register to
+	 * any value other than zero, or unlimited.
+	 *
+	 * WITHOUT THE BASE ADDRESS, THE SECONDARY SIDE CANNOT DISABLE the
+	 * window by setting the limit equal to base, nor can it limit the size
+	 * of the memory window by setting the limit to base + size.
+	 */
+	.bar2_limit		= SNB_PBAR23LMT_OFFSET,
+	.bar2_xlat		= SNB_PBAR23XLAT_OFFSET,
+};
+
+static const struct intel_ntb_xlat_reg snb_sec_xlat = {
+	.bar0_base		= SNB_SBAR0BASE_OFFSET,
+	.bar2_limit		= SNB_SBAR23LMT_OFFSET,
+	.bar2_xlat		= SNB_SBAR23XLAT_OFFSET,
+};
+
+static const struct intel_b2b_addr snb_b2b_usd_addr = {
+	.bar2_addr64		= SNB_B2B_BAR2_USD_ADDR64,
+	.bar4_addr64		= SNB_B2B_BAR4_USD_ADDR64,
+	.bar4_addr32		= SNB_B2B_BAR4_USD_ADDR32,
+	.bar5_addr32		= SNB_B2B_BAR5_USD_ADDR32,
+};
+
+static const struct intel_b2b_addr snb_b2b_dsd_addr = {
+	.bar2_addr64		= SNB_B2B_BAR2_DSD_ADDR64,
+	.bar4_addr64		= SNB_B2B_BAR4_DSD_ADDR64,
+	.bar4_addr32		= SNB_B2B_BAR4_DSD_ADDR32,
+	.bar5_addr32		= SNB_B2B_BAR5_DSD_ADDR32,
+};
+
+/* operations for primary side of local ntb */
+static const struct ntb_dev_ops intel_ntb_ops = {
+	.mw_count		= intel_ntb_mw_count,
+	.mw_get_range		= intel_ntb_mw_get_range,
+	.mw_set_trans		= intel_ntb_mw_set_trans,
+	.link_is_up		= intel_ntb_link_is_up,
+	.link_enable		= intel_ntb_link_enable,
+	.link_disable		= intel_ntb_link_disable,
+	.db_is_unsafe		= intel_ntb_db_is_unsafe,
+	.db_valid_mask		= intel_ntb_db_valid_mask,
+	.db_vector_count	= intel_ntb_db_vector_count,
+	.db_vector_mask		= intel_ntb_db_vector_mask,
+	.db_read		= intel_ntb_db_read,
+	.db_clear		= intel_ntb_db_clear,
+	.db_set_mask		= intel_ntb_db_set_mask,
+	.db_clear_mask		= intel_ntb_db_clear_mask,
+	.peer_db_addr		= intel_ntb_peer_db_addr,
+	.peer_db_set		= intel_ntb_peer_db_set,
+	.spad_is_unsafe		= intel_ntb_spad_is_unsafe,
+	.spad_count		= intel_ntb_spad_count,
+	.spad_read		= intel_ntb_spad_read,
+	.spad_write		= intel_ntb_spad_write,
+	.peer_spad_addr		= intel_ntb_peer_spad_addr,
+	.peer_spad_read		= intel_ntb_peer_spad_read,
+	.peer_spad_write	= intel_ntb_peer_spad_write,
+};
+
+static const struct file_operations intel_ntb_debugfs_info = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = ndev_debugfs_read,
+};
+
+static const struct pci_device_id intel_ntb_pci_tbl[] = {
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
+	{0}
+};
+MODULE_DEVICE_TABLE(pci, intel_ntb_pci_tbl);
+
+static struct pci_driver intel_ntb_pci_driver = {
 	.name = KBUILD_MODNAME,
-	.id_table = ntb_pci_tbl,
-	.probe = ntb_pci_probe,
-	.remove = ntb_pci_remove,
+	.id_table = intel_ntb_pci_tbl,
+	.probe = intel_ntb_pci_probe,
+	.remove = intel_ntb_pci_remove,
 };
 
-module_pci_driver(ntb_pci_driver);
+static int __init intel_ntb_pci_driver_init(void)
+{
+	if (debugfs_initialized())
+		debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+
+	return pci_register_driver(&intel_ntb_pci_driver);
+}
+module_init(intel_ntb_pci_driver_init);
+
+static void __exit intel_ntb_pci_driver_exit(void)
+{
+	pci_unregister_driver(&intel_ntb_pci_driver);
+
+	debugfs_remove_recursive(debugfs_dir);
+}
+module_exit(intel_ntb_pci_driver_exit);
+
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h
index 935610454f70..fec689dc95cf 100644
--- a/drivers/ntb/hw/intel/ntb_hw_intel.h
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.h
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -45,341 +47,296 @@
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
  */
-#include <linux/ntb_transport.h>
-
-#define NTB_LINK_STATUS_ACTIVE	0x2000
-#define NTB_LINK_SPEED_MASK	0x000f
-#define NTB_LINK_WIDTH_MASK	0x03f0
-
-#define SNB_MSIX_CNT		4
-#define SNB_MAX_B2B_SPADS	16
-#define SNB_MAX_COMPAT_SPADS	16
-/* Reserve the uppermost bit for link interrupt */
-#define SNB_MAX_DB_BITS		15
-#define SNB_LINK_DB		15
-#define SNB_DB_BITS_PER_VEC	5
-#define HSX_SPLITBAR_MAX_MW	3
-#define SNB_MAX_MW		2
-#define SNB_ERRATA_MAX_MW	1
-
-#define SNB_DB_HW_LINK		0x8000
-
-#define SNB_UNCERRSTS_OFFSET	0x014C
-#define SNB_CORERRSTS_OFFSET	0x0158
-#define SNB_LINK_STATUS_OFFSET	0x01A2
-#define SNB_PCICMD_OFFSET	0x0504
-#define SNB_DEVCTRL_OFFSET	0x0598
-#define SNB_DEVSTS_OFFSET	0x059A
-#define SNB_SLINK_STATUS_OFFSET	0x05A2
-
-#define SNB_PBAR2LMT_OFFSET	0x0000
-#define SNB_PBAR4LMT_OFFSET	0x0008
-#define SNB_PBAR5LMT_OFFSET	0x000C
-#define SNB_PBAR2XLAT_OFFSET	0x0010
-#define SNB_PBAR4XLAT_OFFSET	0x0018
-#define SNB_PBAR5XLAT_OFFSET	0x001C
-#define SNB_SBAR2LMT_OFFSET	0x0020
-#define SNB_SBAR4LMT_OFFSET	0x0028
-#define SNB_SBAR5LMT_OFFSET	0x002C
-#define SNB_SBAR2XLAT_OFFSET	0x0030
-#define SNB_SBAR4XLAT_OFFSET	0x0038
-#define SNB_SBAR5XLAT_OFFSET	0x003C
-#define SNB_SBAR0BASE_OFFSET	0x0040
-#define SNB_SBAR2BASE_OFFSET	0x0048
-#define SNB_SBAR4BASE_OFFSET	0x0050
-#define SNB_SBAR5BASE_OFFSET	0x0054
-#define SNB_NTBCNTL_OFFSET	0x0058
-#define SNB_SBDF_OFFSET		0x005C
-#define SNB_PDOORBELL_OFFSET	0x0060
-#define SNB_PDBMSK_OFFSET	0x0062
-#define SNB_SDOORBELL_OFFSET	0x0064
-#define SNB_SDBMSK_OFFSET	0x0066
-#define SNB_USMEMMISS_OFFSET	0x0070
-#define SNB_SPAD_OFFSET		0x0080
-#define SNB_SPADSEMA4_OFFSET	0x00c0
-#define SNB_WCCNTRL_OFFSET	0x00e0
-#define SNB_B2B_SPAD_OFFSET	0x0100
-#define SNB_B2B_DOORBELL_OFFSET	0x0140
-#define SNB_B2B_XLAT_OFFSETL	0x0144
-#define SNB_B2B_XLAT_OFFSETU	0x0148
 
-/*
- * The addresses are setup so the 32bit BARs can function. Thus
- * the addresses are all in 32bit space
- */
-#define SNB_MBAR01_USD_ADDR	0x000000002100000CULL
-#define SNB_MBAR23_USD_ADDR	0x000000004100000CULL
-#define SNB_MBAR4_USD_ADDR	0x000000008100000CULL
-#define SNB_MBAR5_USD_ADDR	0x00000000A100000CULL
-#define SNB_MBAR01_DSD_ADDR	0x000000002000000CULL
-#define SNB_MBAR23_DSD_ADDR	0x000000004000000CULL
-#define SNB_MBAR4_DSD_ADDR	0x000000008000000CULL
-#define SNB_MBAR5_DSD_ADDR	0x00000000A000000CULL
-
-#define BWD_MSIX_CNT		34
-#define BWD_MAX_SPADS		16
-#define BWD_MAX_DB_BITS		34
-#define BWD_DB_BITS_PER_VEC	1
-#define BWD_MAX_MW		2
-
-#define BWD_PCICMD_OFFSET	0xb004
-#define BWD_MBAR23_OFFSET	0xb018
-#define BWD_MBAR45_OFFSET	0xb020
-#define BWD_DEVCTRL_OFFSET	0xb048
-#define BWD_LINK_STATUS_OFFSET	0xb052
-#define BWD_ERRCORSTS_OFFSET	0xb110
-
-#define BWD_SBAR2XLAT_OFFSET	0x0008
-#define BWD_SBAR4XLAT_OFFSET	0x0010
-#define BWD_PDOORBELL_OFFSET	0x0020
-#define BWD_PDBMSK_OFFSET	0x0028
-#define BWD_NTBCNTL_OFFSET	0x0060
-#define BWD_EBDF_OFFSET		0x0064
-#define BWD_SPAD_OFFSET		0x0080
-#define BWD_SPADSEMA_OFFSET	0x00c0
-#define BWD_STKYSPAD_OFFSET	0x00c4
-#define BWD_PBAR2XLAT_OFFSET	0x8008
-#define BWD_PBAR4XLAT_OFFSET	0x8010
-#define BWD_B2B_DOORBELL_OFFSET	0x8020
-#define BWD_B2B_SPAD_OFFSET	0x8080
-#define BWD_B2B_SPADSEMA_OFFSET	0x80c0
-#define BWD_B2B_STKYSPAD_OFFSET	0x80c4
-
-#define BWD_MODPHY_PCSREG4	0x1c004
-#define BWD_MODPHY_PCSREG6	0x1c006
-
-#define BWD_IP_BASE		0xC000
-#define BWD_DESKEWSTS_OFFSET	(BWD_IP_BASE + 0x3024)
-#define BWD_LTSSMERRSTS0_OFFSET (BWD_IP_BASE + 0x3180)
+#ifndef NTB_HW_INTEL_H
+#define NTB_HW_INTEL_H
+
+#include <linux/ntb.h>
+#include <linux/pci.h>
+
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF	0x3725
+#define PCI_DEVICE_ID_INTEL_NTB_PS_JSF	0x3726
+#define PCI_DEVICE_ID_INTEL_NTB_SS_JSF	0x3727
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB	0x3C0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_SNB	0x3C0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_SNB	0x3C0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_IVT	0x0E0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_IVT	0x0E0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_IVT	0x0E0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_HSX	0x2F0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_HSX	0x2F0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_HSX	0x2F0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD	0x0C4E
+
+/* SNB hardware (and JSF, IVT, HSX) */
+
+#define SNB_PBAR23LMT_OFFSET		0x0000
+#define SNB_PBAR45LMT_OFFSET		0x0008
+#define SNB_PBAR4LMT_OFFSET		0x0008
+#define SNB_PBAR5LMT_OFFSET		0x000c
+#define SNB_PBAR23XLAT_OFFSET		0x0010
+#define SNB_PBAR45XLAT_OFFSET		0x0018
+#define SNB_PBAR4XLAT_OFFSET		0x0018
+#define SNB_PBAR5XLAT_OFFSET		0x001c
+#define SNB_SBAR23LMT_OFFSET		0x0020
+#define SNB_SBAR45LMT_OFFSET		0x0028
+#define SNB_SBAR4LMT_OFFSET		0x0028
+#define SNB_SBAR5LMT_OFFSET		0x002c
+#define SNB_SBAR23XLAT_OFFSET		0x0030
+#define SNB_SBAR45XLAT_OFFSET		0x0038
+#define SNB_SBAR4XLAT_OFFSET		0x0038
+#define SNB_SBAR5XLAT_OFFSET		0x003c
+#define SNB_SBAR0BASE_OFFSET		0x0040
+#define SNB_SBAR23BASE_OFFSET		0x0048
+#define SNB_SBAR45BASE_OFFSET		0x0050
+#define SNB_SBAR4BASE_OFFSET		0x0050
+#define SNB_SBAR5BASE_OFFSET		0x0054
+#define SNB_SBDF_OFFSET			0x005c
+#define SNB_NTBCNTL_OFFSET		0x0058
+#define SNB_PDOORBELL_OFFSET		0x0060
+#define SNB_PDBMSK_OFFSET		0x0062
+#define SNB_SDOORBELL_OFFSET		0x0064
+#define SNB_SDBMSK_OFFSET		0x0066
+#define SNB_USMEMMISS_OFFSET		0x0070
+#define SNB_SPAD_OFFSET			0x0080
+#define SNB_PBAR23SZ_OFFSET		0x00d0
+#define SNB_PBAR45SZ_OFFSET		0x00d1
+#define SNB_PBAR4SZ_OFFSET		0x00d1
+#define SNB_SBAR23SZ_OFFSET		0x00d2
+#define SNB_SBAR45SZ_OFFSET		0x00d3
+#define SNB_SBAR4SZ_OFFSET		0x00d3
+#define SNB_PPD_OFFSET			0x00d4
+#define SNB_PBAR5SZ_OFFSET		0x00d5
+#define SNB_SBAR5SZ_OFFSET		0x00d6
+#define SNB_WCCNTRL_OFFSET		0x00e0
+#define SNB_UNCERRSTS_OFFSET		0x014c
+#define SNB_CORERRSTS_OFFSET		0x0158
+#define SNB_LINK_STATUS_OFFSET		0x01a2
+#define SNB_SPCICMD_OFFSET		0x0504
+#define SNB_DEVCTRL_OFFSET		0x0598
+#define SNB_DEVSTS_OFFSET		0x059a
+#define SNB_SLINK_STATUS_OFFSET		0x05a2
+#define SNB_B2B_SPAD_OFFSET		0x0100
+#define SNB_B2B_DOORBELL_OFFSET		0x0140
+#define SNB_B2B_XLAT_OFFSETL		0x0144
+#define SNB_B2B_XLAT_OFFSETU		0x0148
+#define SNB_PPD_CONN_MASK		0x03
+#define SNB_PPD_CONN_TRANSPARENT	0x00
+#define SNB_PPD_CONN_B2B		0x01
+#define SNB_PPD_CONN_RP			0x02
+#define SNB_PPD_DEV_MASK		0x10
+#define SNB_PPD_DEV_USD			0x00
+#define SNB_PPD_DEV_DSD			0x10
+#define SNB_PPD_SPLIT_BAR_MASK		0x40
+
+#define SNB_PPD_TOPO_MASK	(SNB_PPD_CONN_MASK | SNB_PPD_DEV_MASK)
+#define SNB_PPD_TOPO_PRI_USD	(SNB_PPD_CONN_RP | SNB_PPD_DEV_USD)
+#define SNB_PPD_TOPO_PRI_DSD	(SNB_PPD_CONN_RP | SNB_PPD_DEV_DSD)
+#define SNB_PPD_TOPO_SEC_USD	(SNB_PPD_CONN_TRANSPARENT | SNB_PPD_DEV_USD)
+#define SNB_PPD_TOPO_SEC_DSD	(SNB_PPD_CONN_TRANSPARENT | SNB_PPD_DEV_DSD)
+#define SNB_PPD_TOPO_B2B_USD	(SNB_PPD_CONN_B2B | SNB_PPD_DEV_USD)
+#define SNB_PPD_TOPO_B2B_DSD	(SNB_PPD_CONN_B2B | SNB_PPD_DEV_DSD)
+
+#define SNB_MW_COUNT			2
+#define HSX_SPLIT_BAR_MW_COUNT		3
+#define SNB_DB_COUNT			15
+#define SNB_DB_LINK			15
+#define SNB_DB_LINK_BIT			BIT_ULL(SNB_DB_LINK)
+#define SNB_DB_MSIX_VECTOR_COUNT	4
+#define SNB_DB_MSIX_VECTOR_SHIFT	5
+#define SNB_DB_TOTAL_SHIFT		16
+#define SNB_SPAD_COUNT			16
+
+/* BWD hardware */
+
+#define BWD_SBAR2XLAT_OFFSET		0x0008
+#define BWD_PDOORBELL_OFFSET		0x0020
+#define BWD_PDBMSK_OFFSET		0x0028
+#define BWD_NTBCNTL_OFFSET		0x0060
+#define BWD_SPAD_OFFSET			0x0080
+#define BWD_PPD_OFFSET			0x00d4
+#define BWD_PBAR2XLAT_OFFSET		0x8008
+#define BWD_B2B_DOORBELL_OFFSET		0x8020
+#define BWD_B2B_SPAD_OFFSET		0x8080
+#define BWD_SPCICMD_OFFSET		0xb004
+#define BWD_LINK_STATUS_OFFSET		0xb052
+#define BWD_ERRCORSTS_OFFSET		0xb110
+#define BWD_IP_BASE			0xc000
+#define BWD_DESKEWSTS_OFFSET		(BWD_IP_BASE + 0x3024)
+#define BWD_LTSSMERRSTS0_OFFSET		(BWD_IP_BASE + 0x3180)
 #define BWD_LTSSMSTATEJMP_OFFSET	(BWD_IP_BASE + 0x3040)
 #define BWD_IBSTERRRCRVSTS0_OFFSET	(BWD_IP_BASE + 0x3324)
+#define BWD_MODPHY_PCSREG4		0x1c004
+#define BWD_MODPHY_PCSREG6		0x1c006
+
+#define BWD_PPD_INIT_LINK		0x0008
+#define BWD_PPD_CONN_MASK		0x0300
+#define BWD_PPD_CONN_TRANSPARENT	0x0000
+#define BWD_PPD_CONN_B2B		0x0100
+#define BWD_PPD_CONN_RP			0x0200
+#define BWD_PPD_DEV_MASK		0x1000
+#define BWD_PPD_DEV_USD			0x0000
+#define BWD_PPD_DEV_DSD			0x1000
+#define BWD_PPD_TOPO_MASK	(BWD_PPD_CONN_MASK | BWD_PPD_DEV_MASK)
+#define BWD_PPD_TOPO_PRI_USD	(BWD_PPD_CONN_TRANSPARENT | BWD_PPD_DEV_USD)
+#define BWD_PPD_TOPO_PRI_DSD	(BWD_PPD_CONN_TRANSPARENT | BWD_PPD_DEV_DSD)
+#define BWD_PPD_TOPO_SEC_USD	(BWD_PPD_CONN_RP | BWD_PPD_DEV_USD)
+#define BWD_PPD_TOPO_SEC_DSD	(BWD_PPD_CONN_RP | BWD_PPD_DEV_DSD)
+#define BWD_PPD_TOPO_B2B_USD	(BWD_PPD_CONN_B2B | BWD_PPD_DEV_USD)
+#define BWD_PPD_TOPO_B2B_DSD	(BWD_PPD_CONN_B2B | BWD_PPD_DEV_DSD)
+
+#define BWD_MW_COUNT			2
+#define BWD_DB_COUNT			34
+#define BWD_DB_VALID_MASK		(BIT_ULL(BWD_DB_COUNT) - 1)
+#define BWD_DB_MSIX_VECTOR_COUNT	34
+#define BWD_DB_MSIX_VECTOR_SHIFT	1
+#define BWD_DB_TOTAL_SHIFT		34
+#define BWD_SPAD_COUNT			16
+
+#define BWD_NTB_CTL_DOWN_BIT		BIT(16)
+#define BWD_NTB_CTL_ACTIVE(x)		!(x & BWD_NTB_CTL_DOWN_BIT)
+
+#define BWD_DESKEWSTS_DBERR		BIT(15)
+#define BWD_LTSSMERRSTS0_UNEXPECTEDEI	BIT(20)
+#define BWD_LTSSMSTATEJMP_FORCEDETECT	BIT(2)
+#define BWD_IBIST_ERR_OFLOW		0x7FFF7FFF
+
+#define BWD_LINK_HB_TIMEOUT		msecs_to_jiffies(1000)
+#define BWD_LINK_RECOVERY_TIME		msecs_to_jiffies(500)
+
+/* Ntb control and link status */
+
+#define NTB_CTL_CFG_LOCK		BIT(0)
+#define NTB_CTL_DISABLE			BIT(1)
+#define NTB_CTL_S2P_BAR2_SNOOP		BIT(2)
+#define NTB_CTL_P2S_BAR2_SNOOP		BIT(4)
+#define NTB_CTL_S2P_BAR4_SNOOP		BIT(6)
+#define NTB_CTL_P2S_BAR4_SNOOP		BIT(8)
+#define NTB_CTL_S2P_BAR5_SNOOP		BIT(12)
+#define NTB_CTL_P2S_BAR5_SNOOP		BIT(14)
+
+#define NTB_LNK_STA_ACTIVE_BIT		0x2000
+#define NTB_LNK_STA_SPEED_MASK		0x000f
+#define NTB_LNK_STA_WIDTH_MASK		0x03f0
+#define NTB_LNK_STA_ACTIVE(x)		(!!((x) & NTB_LNK_STA_ACTIVE_BIT))
+#define NTB_LNK_STA_SPEED(x)		((x) & NTB_LNK_STA_SPEED_MASK)
+#define NTB_LNK_STA_WIDTH(x)		(((x) & NTB_LNK_STA_WIDTH_MASK) >> 4)
+
+/* Use the following addresses for translation between b2b ntb devices in case
+ * the hardware default values are not reliable. */
+#define SNB_B2B_BAR0_USD_ADDR		0x1000000000000000ull
+#define SNB_B2B_BAR2_USD_ADDR64		0x2000000000000000ull
+#define SNB_B2B_BAR4_USD_ADDR64		0x4000000000000000ull
+#define SNB_B2B_BAR4_USD_ADDR32		0x20000000u
+#define SNB_B2B_BAR5_USD_ADDR32		0x40000000u
+#define SNB_B2B_BAR0_DSD_ADDR		0x9000000000000000ull
+#define SNB_B2B_BAR2_DSD_ADDR64		0xa000000000000000ull
+#define SNB_B2B_BAR4_DSD_ADDR64		0xc000000000000000ull
+#define SNB_B2B_BAR4_DSD_ADDR32		0xa0000000u
+#define SNB_B2B_BAR5_DSD_ADDR32		0xc0000000u
+
+/* The peer ntb secondary config space is 32KB fixed size */
+#define SNB_B2B_MIN_SIZE		0x8000
+
+/* flags to indicate hardware errata */
+#define NTB_HWERR_SDOORBELL_LOCKUP	BIT_ULL(0)
+#define NTB_HWERR_SB01BASE_LOCKUP	BIT_ULL(1)
+#define NTB_HWERR_B2BDOORBELL_BIT14	BIT_ULL(2)
+
+/* flags to indicate unsafe api */
+#define NTB_UNSAFE_DB			BIT_ULL(0)
+#define NTB_UNSAFE_SPAD			BIT_ULL(1)
+
+struct intel_ntb_dev;
+
+struct intel_ntb_reg {
+	int (*poll_link)(struct intel_ntb_dev *ndev);
+	int (*link_is_up)(struct intel_ntb_dev *ndev);
+	u64 (*db_ioread)(void __iomem *mmio);
+	void (*db_iowrite)(u64 db_bits, void __iomem *mmio);
+	unsigned long			ntb_ctl;
+	resource_size_t			db_size;
+	int				mw_bar[];
+};
 
-#define BWD_DESKEWSTS_DBERR	(1 << 15)
-#define BWD_LTSSMERRSTS0_UNEXPECTEDEI	(1 << 20)
-#define BWD_LTSSMSTATEJMP_FORCEDETECT	(1 << 2)
-#define BWD_IBIST_ERR_OFLOW	0x7FFF7FFF
-
-#define NTB_CNTL_CFG_LOCK		(1 << 0)
-#define NTB_CNTL_LINK_DISABLE		(1 << 1)
-#define NTB_CNTL_S2P_BAR23_SNOOP	(1 << 2)
-#define NTB_CNTL_P2S_BAR23_SNOOP	(1 << 4)
-#define NTB_CNTL_S2P_BAR4_SNOOP	(1 << 6)
-#define NTB_CNTL_P2S_BAR4_SNOOP	(1 << 8)
-#define NTB_CNTL_S2P_BAR5_SNOOP	(1 << 12)
-#define NTB_CNTL_P2S_BAR5_SNOOP	(1 << 14)
-#define BWD_CNTL_LINK_DOWN		(1 << 16)
-
-#define NTB_PPD_OFFSET		0x00D4
-#define SNB_PPD_CONN_TYPE	0x0003
-#define SNB_PPD_DEV_TYPE	0x0010
-#define SNB_PPD_SPLIT_BAR	(1 << 6)
-#define BWD_PPD_INIT_LINK	0x0008
-#define BWD_PPD_CONN_TYPE	0x0300
-#define BWD_PPD_DEV_TYPE	0x1000
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF		0x3725
-#define PCI_DEVICE_ID_INTEL_NTB_PS_JSF		0x3726
-#define PCI_DEVICE_ID_INTEL_NTB_SS_JSF		0x3727
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB		0x3C0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_SNB		0x3C0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_SNB		0x3C0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_IVT		0x0E0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_IVT		0x0E0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_IVT		0x0E0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_HSX		0x2F0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_HSX		0x2F0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_HSX		0x2F0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD		0x0C4E
-
-#ifndef readq
-static inline u64 readq(void __iomem *addr)
-{
-	return readl(addr) | (((u64) readl(addr + 4)) << 32LL);
-}
-#endif
-
-#ifndef writeq
-static inline void writeq(u64 val, void __iomem *addr)
-{
-	writel(val & 0xffffffff, addr);
-	writel(val >> 32, addr + 4);
-}
-#endif
+struct intel_ntb_alt_reg {
+	unsigned long			db_bell;
+	unsigned long			db_mask;
+	unsigned long			spad;
+};
 
-#define NTB_BAR_MMIO		0
-#define NTB_BAR_23		2
-#define NTB_BAR_4		4
-#define NTB_BAR_5		5
-
-#define NTB_BAR_MASK		((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
-				 (1 << NTB_BAR_4))
-#define NTB_SPLITBAR_MASK	((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
-				 (1 << NTB_BAR_4) | (1 << NTB_BAR_5))
-
-#define NTB_HB_TIMEOUT		msecs_to_jiffies(1000)
-
-enum ntb_hw_event {
-	NTB_EVENT_SW_EVENT0 = 0,
-	NTB_EVENT_SW_EVENT1,
-	NTB_EVENT_SW_EVENT2,
-	NTB_EVENT_HW_ERROR,
-	NTB_EVENT_HW_LINK_UP,
-	NTB_EVENT_HW_LINK_DOWN,
+struct intel_ntb_xlat_reg {
+	unsigned long			bar0_base;
+	unsigned long			bar2_xlat;
+	unsigned long			bar2_limit;
 };
 
-struct ntb_mw {
-	dma_addr_t phys_addr;
-	void __iomem *vbase;
-	resource_size_t bar_sz;
+struct intel_b2b_addr {
+	phys_addr_t			bar0_addr;
+	phys_addr_t			bar2_addr64;
+	phys_addr_t			bar4_addr64;
+	phys_addr_t			bar4_addr32;
+	phys_addr_t			bar5_addr32;
 };
 
-struct ntb_db_cb {
-	int (*callback)(void *data, int db_num);
-	unsigned int db_num;
-	void *data;
-	struct ntb_device *ndev;
-	struct tasklet_struct irq_work;
+struct intel_ntb_vec {
+	struct intel_ntb_dev		*ndev;
+	int				num;
 };
 
-#define WA_SNB_ERR	0x00000001
-
-struct ntb_device {
-	struct pci_dev *pdev;
-	struct msix_entry *msix_entries;
-	void __iomem *reg_base;
-	struct ntb_mw *mw;
-	struct {
-		unsigned char max_mw;
-		unsigned char max_spads;
-		unsigned char max_db_bits;
-		unsigned char msix_cnt;
-	} limits;
-	struct {
-		void __iomem *ldb;
-		void __iomem *ldb_mask;
-		void __iomem *rdb;
-		void __iomem *bar2_xlat;
-		void __iomem *bar4_xlat;
-		void __iomem *bar5_xlat;
-		void __iomem *spad_write;
-		void __iomem *spad_read;
-		void __iomem *lnk_cntl;
-		void __iomem *lnk_stat;
-		void __iomem *spci_cmd;
-	} reg_ofs;
-	struct ntb_transport *ntb_transport;
-	void (*event_cb)(void *handle, enum ntb_hw_event event);
-
-	struct ntb_db_cb *db_cb;
-	unsigned char hw_type;
-	unsigned char conn_type;
-	unsigned char dev_type;
-	unsigned char num_msix;
-	unsigned char bits_per_vector;
-	unsigned char max_cbs;
-	unsigned char link_width;
-	unsigned char link_speed;
-	unsigned char link_status;
-	unsigned char split_bar;
-
-	struct delayed_work hb_timer;
-	unsigned long last_ts;
-
-	struct delayed_work lr_timer;
-
-	struct dentry *debugfs_dir;
-	struct dentry *debugfs_info;
-
-	unsigned int wa_flags;
+struct intel_ntb_dev {
+	struct ntb_dev			ntb;
+
+	/* offset of peer bar0 in b2b bar */
+	unsigned long			b2b_off;
+	/* mw idx used to access peer bar0 */
+	unsigned int			b2b_idx;
+
+	/* BAR45 is split into BAR4 and BAR5 */
+	bool				bar4_split;
+
+	u32				ntb_ctl;
+	u32				lnk_sta;
+
+	unsigned char			mw_count;
+	unsigned char			spad_count;
+	unsigned char			db_count;
+	unsigned char			db_vec_count;
+	unsigned char			db_vec_shift;
+
+	u64				db_valid_mask;
+	u64				db_link_mask;
+	u64				db_mask;
+
+	/* synchronize rmw access of db_mask and hw reg */
+	spinlock_t			db_mask_lock;
+
+	struct msix_entry		*msix;
+	struct intel_ntb_vec		*vec;
+
+	const struct intel_ntb_reg	*reg;
+	const struct intel_ntb_alt_reg	*self_reg;
+	const struct intel_ntb_alt_reg	*peer_reg;
+	const struct intel_ntb_xlat_reg	*xlat_reg;
+	void				__iomem *self_mmio;
+	void				__iomem *peer_mmio;
+	phys_addr_t			peer_addr;
+
+	unsigned long			last_ts;
+	struct delayed_work		hb_timer;
+
+	unsigned long			hwerr_flags;
+	unsigned long			unsafe_flags;
+	unsigned long			unsafe_flags_ignore;
+
+	struct dentry			*debugfs_dir;
+	struct dentry			*debugfs_info;
 };
 
-/**
- * ntb_max_cbs() - return the max callbacks
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the maximum number of callbacks
- *
- * RETURNS: the maximum number of callbacks
- */
-static inline unsigned char ntb_max_cbs(struct ntb_device *ndev)
-{
-	return ndev->max_cbs;
-}
-
-/**
- * ntb_max_mw() - return the max number of memory windows
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the maximum number of memory windows
- *
- * RETURNS: the maximum number of memory windows
- */
-static inline unsigned char ntb_max_mw(struct ntb_device *ndev)
-{
-	return ndev->limits.max_mw;
-}
-
-/**
- * ntb_hw_link_status() - return the hardware link status
- * @ndev: pointer to ntb_device instance
- *
- * Returns true if the hardware is connected to the remote system
- *
- * RETURNS: true or false based on the hardware link state
- */
-static inline bool ntb_hw_link_status(struct ntb_device *ndev)
-{
-	return ndev->link_status == NTB_LINK_UP;
-}
-
-/**
- * ntb_query_pdev() - return the pci_dev pointer
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the pci_dev pointer for the NTB hardware device
- *
- * RETURNS: a pointer to the ntb pci_dev
- */
-static inline struct pci_dev *ntb_query_pdev(struct ntb_device *ndev)
-{
-	return ndev->pdev;
-}
-
-/**
- * ntb_query_debugfs() - return the debugfs pointer
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the debugfs directory pointer for the NTB
- * hardware device
- *
- * RETURNS: a pointer to the debugfs directory
- */
-static inline struct dentry *ntb_query_debugfs(struct ntb_device *ndev)
-{
-	return ndev->debugfs_dir;
-}
-
-struct ntb_device *ntb_register_transport(struct pci_dev *pdev,
-					  void *transport);
-void ntb_unregister_transport(struct ntb_device *ndev);
-void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr);
-int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
-			     void *data, int (*db_cb_func)(void *data,
-							   int db_num));
-void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx);
-int ntb_register_event_callback(struct ntb_device *ndev,
-				void (*event_cb_func)(void *handle,
-						      enum ntb_hw_event event));
-void ntb_unregister_event_callback(struct ntb_device *ndev);
-int ntb_get_max_spads(struct ntb_device *ndev);
-int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
-int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
-int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
-int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
-resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
-void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
-u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
-void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int idx);
-void *ntb_find_transport(struct pci_dev *pdev);
-
-int ntb_transport_init(struct pci_dev *pdev);
-void ntb_transport_free(void *transport);
+#define ndev_pdev(ndev) ((ndev)->ntb.pdev)
+#define ndev_name(ndev) pci_name(ndev_pdev(ndev))
+#define ndev_dev(ndev) (&ndev_pdev(ndev)->dev)
+#define ntb_ndev(ntb) container_of(ntb, struct intel_ntb_dev, ntb)
+#define hb_ndev(work) container_of(work, struct intel_ntb_dev, hb_timer.work)
+
+#endif
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index c5f26cda9f97..9faf1c6029af 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -40,7 +42,7 @@
  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * Intel PCIe NTB Linux driver
+ * PCIe NTB Transport Linux driver
  *
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
@@ -56,9 +58,22 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include "hw/intel/ntb_hw_intel.h"
+#include "linux/ntb.h"
+#include "linux/ntb_transport.h"
 
-#define NTB_TRANSPORT_VERSION	3
+#define NTB_TRANSPORT_VERSION	4
+#define NTB_TRANSPORT_VER	"4"
+#define NTB_TRANSPORT_NAME	"ntb_transport"
+#define NTB_TRANSPORT_DESC	"Software Queue-Pair Transport over NTB"
+
+MODULE_DESCRIPTION(NTB_TRANSPORT_DESC);
+MODULE_VERSION(NTB_TRANSPORT_VER);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+static unsigned long max_mw_size;
+module_param(max_mw_size, ulong, 0644);
+MODULE_PARM_DESC(max_mw_size, "Limit size of large memory windows");
 
 static unsigned int transport_mtu = 0x401E;
 module_param(transport_mtu, uint, 0644);
@@ -72,10 +87,12 @@ static unsigned int copy_bytes = 1024;
 module_param(copy_bytes, uint, 0644);
 MODULE_PARM_DESC(copy_bytes, "Threshold under which NTB will use the CPU to copy instead of DMA");
 
+static struct dentry *nt_debugfs_dir;
+
 struct ntb_queue_entry {
 	/* ntb_queue list reference */
 	struct list_head entry;
-	/* pointers to data to be transfered */
+	/* pointers to data to be transferred */
 	void *cb_data;
 	void *buf;
 	unsigned int len;
@@ -94,14 +111,16 @@ struct ntb_rx_info {
 };
 
 struct ntb_transport_qp {
-	struct ntb_transport *transport;
-	struct ntb_device *ndev;
+	struct ntb_transport_ctx *transport;
+	struct ntb_dev *ndev;
 	void *cb_data;
 	struct dma_chan *dma_chan;
 
 	bool client_ready;
-	bool qp_link;
+	bool link_is_up;
+
 	u8 qp_num;	/* Only 64 QP's are allowed.  0-63 */
+	u64 qp_bit;
 
 	struct ntb_rx_info __iomem *rx_info;
 	struct ntb_rx_info *remote_rx_info;
@@ -127,6 +146,7 @@ struct ntb_transport_qp {
 	unsigned int rx_max_entry;
 	unsigned int rx_max_frame;
 	dma_cookie_t last_cookie;
+	struct tasklet_struct rxc_db_work;
 
 	void (*event_handler)(void *data, int status);
 	struct delayed_work link_work;
@@ -153,33 +173,44 @@ struct ntb_transport_qp {
 };
 
 struct ntb_transport_mw {
-	size_t size;
+	phys_addr_t phys_addr;
+	resource_size_t phys_size;
+	resource_size_t xlat_align;
+	resource_size_t xlat_align_size;
+	void __iomem *vbase;
+	size_t xlat_size;
+	size_t buff_size;
 	void *virt_addr;
 	dma_addr_t dma_addr;
 };
 
 struct ntb_transport_client_dev {
 	struct list_head entry;
+	struct ntb_transport_ctx *nt;
 	struct device dev;
 };
 
-struct ntb_transport {
+struct ntb_transport_ctx {
 	struct list_head entry;
 	struct list_head client_devs;
 
-	struct ntb_device *ndev;
-	struct ntb_transport_mw *mw;
-	struct ntb_transport_qp *qps;
-	unsigned int max_qps;
-	unsigned long qp_bitmap;
-	bool transport_link;
+	struct ntb_dev *ndev;
+
+	struct ntb_transport_mw *mw_vec;
+	struct ntb_transport_qp *qp_vec;
+	unsigned int mw_count;
+	unsigned int qp_count;
+	u64 qp_bitmap;
+	u64 qp_bitmap_free;
+
+	bool link_is_up;
 	struct delayed_work link_work;
 	struct work_struct link_cleanup;
 };
 
 enum {
-	DESC_DONE_FLAG = 1 << 0,
-	LINK_DOWN_FLAG = 1 << 1,
+	DESC_DONE_FLAG = BIT(0),
+	LINK_DOWN_FLAG = BIT(1),
 };
 
 struct ntb_payload_header {
@@ -200,68 +231,69 @@ enum {
 	MAX_SPAD,
 };
 
-#define QP_TO_MW(ndev, qp)	((qp) % ntb_max_mw(ndev))
+#define dev_client_dev(__dev) \
+	container_of((__dev), struct ntb_transport_client_dev, dev)
+
+#define drv_client(__drv) \
+	container_of((__drv), struct ntb_transport_client, driver)
+
+#define QP_TO_MW(nt, qp)	((qp) % nt->mw_count)
 #define NTB_QP_DEF_NUM_ENTRIES	100
 #define NTB_LINK_DOWN_TIMEOUT	10
 
-static int ntb_match_bus(struct device *dev, struct device_driver *drv)
+static void ntb_transport_rxc_db(unsigned long data);
+static const struct ntb_ctx_ops ntb_transport_ops;
+static struct ntb_client ntb_transport_client;
+
+static int ntb_transport_bus_match(struct device *dev,
+				   struct device_driver *drv)
 {
 	return !strncmp(dev_name(dev), drv->name, strlen(drv->name));
 }
 
-static int ntb_client_probe(struct device *dev)
+static int ntb_transport_bus_probe(struct device *dev)
 {
-	const struct ntb_client *drv = container_of(dev->driver,
-						    struct ntb_client, driver);
-	struct pci_dev *pdev = container_of(dev->parent, struct pci_dev, dev);
+	const struct ntb_transport_client *client;
 	int rc = -EINVAL;
 
 	get_device(dev);
-	if (drv && drv->probe)
-		rc = drv->probe(pdev);
+
+	client = drv_client(dev->driver);
+	rc = client->probe(dev);
 	if (rc)
 		put_device(dev);
 
 	return rc;
 }
 
-static int ntb_client_remove(struct device *dev)
+static int ntb_transport_bus_remove(struct device *dev)
 {
-	const struct ntb_client *drv = container_of(dev->driver,
-						    struct ntb_client, driver);
-	struct pci_dev *pdev = container_of(dev->parent, struct pci_dev, dev);
+	const struct ntb_transport_client *client;
 
-	if (drv && drv->remove)
-		drv->remove(pdev);
+	client = drv_client(dev->driver);
+	client->remove(dev);
 
 	put_device(dev);
 
 	return 0;
 }
 
-static struct bus_type ntb_bus_type = {
-	.name = "ntb_bus",
-	.match = ntb_match_bus,
-	.probe = ntb_client_probe,
-	.remove = ntb_client_remove,
+static struct bus_type ntb_transport_bus = {
+	.name = "ntb_transport",
+	.match = ntb_transport_bus_match,
+	.probe = ntb_transport_bus_probe,
+	.remove = ntb_transport_bus_remove,
 };
 
 static LIST_HEAD(ntb_transport_list);
 
-static int ntb_bus_init(struct ntb_transport *nt)
+static int ntb_bus_init(struct ntb_transport_ctx *nt)
 {
-	if (list_empty(&ntb_transport_list)) {
-		int rc = bus_register(&ntb_bus_type);
-		if (rc)
-			return rc;
-	}
-
 	list_add(&nt->entry, &ntb_transport_list);
-
 	return 0;
 }
 
-static void ntb_bus_remove(struct ntb_transport *nt)
+static void ntb_bus_remove(struct ntb_transport_ctx *nt)
 {
 	struct ntb_transport_client_dev *client_dev, *cd;
 
@@ -273,29 +305,26 @@ static void ntb_bus_remove(struct ntb_transport *nt)
 	}
 
 	list_del(&nt->entry);
-
-	if (list_empty(&ntb_transport_list))
-		bus_unregister(&ntb_bus_type);
 }
 
-static void ntb_client_release(struct device *dev)
+static void ntb_transport_client_release(struct device *dev)
 {
 	struct ntb_transport_client_dev *client_dev;
-	client_dev = container_of(dev, struct ntb_transport_client_dev, dev);
 
+	client_dev = dev_client_dev(dev);
 	kfree(client_dev);
 }
 
 /**
- * ntb_unregister_client_dev - Unregister NTB client device
+ * ntb_transport_unregister_client_dev - Unregister NTB client device
  * @device_name: Name of NTB client device
  *
  * Unregister an NTB client device with the NTB transport layer
  */
-void ntb_unregister_client_dev(char *device_name)
+void ntb_transport_unregister_client_dev(char *device_name)
 {
 	struct ntb_transport_client_dev *client, *cd;
-	struct ntb_transport *nt;
+	struct ntb_transport_ctx *nt;
 
 	list_for_each_entry(nt, &ntb_transport_list, entry)
 		list_for_each_entry_safe(client, cd, &nt->client_devs, entry)
@@ -305,18 +334,18 @@ void ntb_unregister_client_dev(char *device_name)
 				device_unregister(&client->dev);
 			}
 }
-EXPORT_SYMBOL_GPL(ntb_unregister_client_dev);
+EXPORT_SYMBOL_GPL(ntb_transport_unregister_client_dev);
 
 /**
- * ntb_register_client_dev - Register NTB client device
+ * ntb_transport_register_client_dev - Register NTB client device
  * @device_name: Name of NTB client device
  *
  * Register an NTB client device with the NTB transport layer
  */
-int ntb_register_client_dev(char *device_name)
+int ntb_transport_register_client_dev(char *device_name)
 {
 	struct ntb_transport_client_dev *client_dev;
-	struct ntb_transport *nt;
+	struct ntb_transport_ctx *nt;
 	int rc, i = 0;
 
 	if (list_empty(&ntb_transport_list))
@@ -325,7 +354,7 @@ int ntb_register_client_dev(char *device_name)
 	list_for_each_entry(nt, &ntb_transport_list, entry) {
 		struct device *dev;
 
-		client_dev = kzalloc(sizeof(struct ntb_transport_client_dev),
+		client_dev = kzalloc(sizeof(*client_dev),
 				     GFP_KERNEL);
 		if (!client_dev) {
 			rc = -ENOMEM;
@@ -336,9 +365,9 @@ int ntb_register_client_dev(char *device_name)
 
 		/* setup and register client devices */
 		dev_set_name(dev, "%s%d", device_name, i);
-		dev->bus = &ntb_bus_type;
-		dev->release = ntb_client_release;
-		dev->parent = &ntb_query_pdev(nt->ndev)->dev;
+		dev->bus = &ntb_transport_bus;
+		dev->release = ntb_transport_client_release;
+		dev->parent = &nt->ndev->dev;
 
 		rc = device_register(dev);
 		if (rc) {
@@ -353,11 +382,11 @@ int ntb_register_client_dev(char *device_name)
 	return 0;
 
 err:
-	ntb_unregister_client_dev(device_name);
+	ntb_transport_unregister_client_dev(device_name);
 
 	return rc;
 }
-EXPORT_SYMBOL_GPL(ntb_register_client_dev);
+EXPORT_SYMBOL_GPL(ntb_transport_register_client_dev);
 
 /**
  * ntb_transport_register_client - Register NTB client driver
@@ -367,9 +396,9 @@ EXPORT_SYMBOL_GPL(ntb_register_client_dev);
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
-int ntb_transport_register_client(struct ntb_client *drv)
+int ntb_transport_register_client(struct ntb_transport_client *drv)
 {
-	drv->driver.bus = &ntb_bus_type;
+	drv->driver.bus = &ntb_transport_bus;
 
 	if (list_empty(&ntb_transport_list))
 		return -ENODEV;
@@ -386,7 +415,7 @@ EXPORT_SYMBOL_GPL(ntb_transport_register_client);
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
-void ntb_transport_unregister_client(struct ntb_client *drv)
+void ntb_transport_unregister_client(struct ntb_transport_client *drv)
 {
 	driver_unregister(&drv->driver);
 }
@@ -452,8 +481,8 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
 			       "tx_max_entry - \t%u\n", qp->tx_max_entry);
 
 	out_offset += snprintf(buf + out_offset, out_count - out_offset,
-			       "\nQP Link %s\n", (qp->qp_link == NTB_LINK_UP) ?
-			       "Up" : "Down");
+			       "\nQP Link %s\n",
+			       qp->link_is_up ? "Up" : "Down");
 	if (out_offset > out_count)
 		out_offset = out_count;
 
@@ -497,26 +526,31 @@ out:
 	return entry;
 }
 
-static void ntb_transport_setup_qp_mw(struct ntb_transport *nt,
-				      unsigned int qp_num)
+static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
+				     unsigned int qp_num)
 {
-	struct ntb_transport_qp *qp = &nt->qps[qp_num];
+	struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
+	struct ntb_transport_mw *mw;
 	unsigned int rx_size, num_qps_mw;
-	u8 mw_num, mw_max;
+	unsigned int mw_num, mw_count, qp_count;
 	unsigned int i;
 
-	mw_max = ntb_max_mw(nt->ndev);
-	mw_num = QP_TO_MW(nt->ndev, qp_num);
+	mw_count = nt->mw_count;
+	qp_count = nt->qp_count;
 
-	WARN_ON(nt->mw[mw_num].virt_addr == NULL);
+	mw_num = QP_TO_MW(nt, qp_num);
+	mw = &nt->mw_vec[mw_num];
+
+	if (!mw->virt_addr)
+		return -ENOMEM;
 
-	if (nt->max_qps % mw_max && mw_num + 1 < nt->max_qps / mw_max)
-		num_qps_mw = nt->max_qps / mw_max + 1;
+	if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count)
+		num_qps_mw = qp_count / mw_count + 1;
 	else
-		num_qps_mw = nt->max_qps / mw_max;
+		num_qps_mw = qp_count / mw_count;
 
-	rx_size = (unsigned int) nt->mw[mw_num].size / num_qps_mw;
-	qp->rx_buff = nt->mw[mw_num].virt_addr + qp_num / mw_max * rx_size;
+	rx_size = (unsigned int)mw->xlat_size / num_qps_mw;
+	qp->rx_buff = mw->virt_addr + rx_size * qp_num / mw_count;
 	rx_size -= sizeof(struct ntb_rx_info);
 
 	qp->remote_rx_info = qp->rx_buff + rx_size;
@@ -530,49 +564,63 @@ static void ntb_transport_setup_qp_mw(struct ntb_transport *nt,
 
 	/* setup the hdr offsets with 0's */
 	for (i = 0; i < qp->rx_max_entry; i++) {
-		void *offset = qp->rx_buff + qp->rx_max_frame * (i + 1) -
-			       sizeof(struct ntb_payload_header);
+		void *offset = (qp->rx_buff + qp->rx_max_frame * (i + 1) -
+				sizeof(struct ntb_payload_header));
 		memset(offset, 0, sizeof(struct ntb_payload_header));
 	}
 
 	qp->rx_pkts = 0;
 	qp->tx_pkts = 0;
 	qp->tx_index = 0;
+
+	return 0;
 }
 
-static void ntb_free_mw(struct ntb_transport *nt, int num_mw)
+static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw)
 {
-	struct ntb_transport_mw *mw = &nt->mw[num_mw];
-	struct pci_dev *pdev = ntb_query_pdev(nt->ndev);
+	struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+	struct pci_dev *pdev = nt->ndev->pdev;
 
 	if (!mw->virt_addr)
 		return;
 
-	dma_free_coherent(&pdev->dev, mw->size, mw->virt_addr, mw->dma_addr);
+	ntb_mw_clear_trans(nt->ndev, num_mw);
+	dma_free_coherent(&pdev->dev, mw->buff_size,
+			  mw->virt_addr, mw->dma_addr);
+	mw->xlat_size = 0;
+	mw->buff_size = 0;
 	mw->virt_addr = NULL;
 }
 
-static int ntb_set_mw(struct ntb_transport *nt, int num_mw, unsigned int size)
+static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw,
+		      unsigned int size)
 {
-	struct ntb_transport_mw *mw = &nt->mw[num_mw];
-	struct pci_dev *pdev = ntb_query_pdev(nt->ndev);
+	struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+	struct pci_dev *pdev = nt->ndev->pdev;
+	unsigned int xlat_size, buff_size;
+	int rc;
+
+	xlat_size = round_up(size, mw->xlat_align_size);
+	buff_size = round_up(size, mw->xlat_align);
 
 	/* No need to re-setup */
-	if (mw->size == ALIGN(size, 4096))
+	if (mw->xlat_size == xlat_size)
 		return 0;
 
-	if (mw->size != 0)
+	if (mw->buff_size)
 		ntb_free_mw(nt, num_mw);
 
-	/* Alloc memory for receiving data.  Must be 4k aligned */
-	mw->size = ALIGN(size, 4096);
+	/* Alloc memory for receiving data.  Must be aligned */
+	mw->xlat_size = xlat_size;
+	mw->buff_size = buff_size;
 
-	mw->virt_addr = dma_alloc_coherent(&pdev->dev, mw->size, &mw->dma_addr,
-					   GFP_KERNEL);
+	mw->virt_addr = dma_alloc_coherent(&pdev->dev, buff_size,
+					   &mw->dma_addr, GFP_KERNEL);
 	if (!mw->virt_addr) {
-		mw->size = 0;
-		dev_err(&pdev->dev, "Unable to allocate MW buffer of size %d\n",
-		       (int) mw->size);
+		mw->xlat_size = 0;
+		mw->buff_size = 0;
+		dev_err(&pdev->dev, "Unable to alloc MW buff of size %d\n",
+			buff_size);
 		return -ENOMEM;
 	}
 
@@ -582,34 +630,39 @@ static int ntb_set_mw(struct ntb_transport *nt, int num_mw, unsigned int size)
 	 * is a requirement of the hardware. It is recommended to setup CMA
 	 * for BAR sizes equal or greater than 4MB.
 	 */
-	if (!IS_ALIGNED(mw->dma_addr, mw->size)) {
-		dev_err(&pdev->dev, "DMA memory %pad not aligned to BAR size\n",
+	if (!IS_ALIGNED(mw->dma_addr, mw->xlat_align)) {
+		dev_err(&pdev->dev, "DMA memory %pad is not aligned\n",
 			&mw->dma_addr);
 		ntb_free_mw(nt, num_mw);
 		return -ENOMEM;
 	}
 
 	/* Notify HW the memory location of the receive buffer */
-	ntb_set_mw_addr(nt->ndev, num_mw, mw->dma_addr);
+	rc = ntb_mw_set_trans(nt->ndev, num_mw, mw->dma_addr, mw->xlat_size);
+	if (rc) {
+		dev_err(&pdev->dev, "Unable to set mw%d translation", num_mw);
+		ntb_free_mw(nt, num_mw);
+		return -EIO;
+	}
 
 	return 0;
 }
 
 static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
 {
-	struct ntb_transport *nt = qp->transport;
-	struct pci_dev *pdev = ntb_query_pdev(nt->ndev);
+	struct ntb_transport_ctx *nt = qp->transport;
+	struct pci_dev *pdev = nt->ndev->pdev;
 
-	if (qp->qp_link == NTB_LINK_DOWN) {
+	if (qp->link_is_up) {
 		cancel_delayed_work_sync(&qp->link_work);
 		return;
 	}
 
-	if (qp->event_handler)
-		qp->event_handler(qp->cb_data, NTB_LINK_DOWN);
-
 	dev_info(&pdev->dev, "qp %d: Link Down\n", qp->qp_num);
-	qp->qp_link = NTB_LINK_DOWN;
+	qp->link_is_up = false;
+
+	if (qp->event_handler)
+		qp->event_handler(qp->cb_data, qp->link_is_up);
 }
 
 static void ntb_qp_link_cleanup_work(struct work_struct *work)
@@ -617,11 +670,11 @@ static void ntb_qp_link_cleanup_work(struct work_struct *work)
 	struct ntb_transport_qp *qp = container_of(work,
 						   struct ntb_transport_qp,
 						   link_cleanup);
-	struct ntb_transport *nt = qp->transport;
+	struct ntb_transport_ctx *nt = qp->transport;
 
 	ntb_qp_link_cleanup(qp);
 
-	if (nt->transport_link == NTB_LINK_UP)
+	if (nt->link_is_up)
 		schedule_delayed_work(&qp->link_work,
 				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
 }
@@ -631,180 +684,132 @@ static void ntb_qp_link_down(struct ntb_transport_qp *qp)
 	schedule_work(&qp->link_cleanup);
 }
 
-static void ntb_transport_link_cleanup(struct ntb_transport *nt)
+static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt)
 {
+	struct ntb_transport_qp *qp;
+	u64 qp_bitmap_alloc;
 	int i;
 
+	qp_bitmap_alloc = nt->qp_bitmap & ~nt->qp_bitmap_free;
+
 	/* Pass along the info to any clients */
-	for (i = 0; i < nt->max_qps; i++)
-		if (!test_bit(i, &nt->qp_bitmap))
-			ntb_qp_link_cleanup(&nt->qps[i]);
+	for (i = 0; i < nt->qp_count; i++)
+		if (qp_bitmap_alloc & BIT_ULL(i)) {
+			qp = &nt->qp_vec[i];
+			ntb_qp_link_cleanup(qp);
+			cancel_work_sync(&qp->link_cleanup);
+			cancel_delayed_work_sync(&qp->link_work);
+		}
 
-	if (nt->transport_link == NTB_LINK_DOWN)
+	if (!nt->link_is_up)
 		cancel_delayed_work_sync(&nt->link_work);
-	else
-		nt->transport_link = NTB_LINK_DOWN;
 
 	/* The scratchpad registers keep the values if the remote side
 	 * goes down, blast them now to give them a sane value the next
 	 * time they are accessed
 	 */
 	for (i = 0; i < MAX_SPAD; i++)
-		ntb_write_local_spad(nt->ndev, i, 0);
+		ntb_spad_write(nt->ndev, i, 0);
 }
 
 static void ntb_transport_link_cleanup_work(struct work_struct *work)
 {
-	struct ntb_transport *nt = container_of(work, struct ntb_transport,
-						link_cleanup);
+	struct ntb_transport_ctx *nt =
+		container_of(work, struct ntb_transport_ctx, link_cleanup);
 
 	ntb_transport_link_cleanup(nt);
 }
 
-static void ntb_transport_event_callback(void *data, enum ntb_hw_event event)
+static void ntb_transport_event_callback(void *data)
 {
-	struct ntb_transport *nt = data;
+	struct ntb_transport_ctx *nt = data;
 
-	switch (event) {
-	case NTB_EVENT_HW_LINK_UP:
+	if (ntb_link_is_up(nt->ndev, NULL, NULL) == 1)
 		schedule_delayed_work(&nt->link_work, 0);
-		break;
-	case NTB_EVENT_HW_LINK_DOWN:
+	else
 		schedule_work(&nt->link_cleanup);
-		break;
-	default:
-		BUG();
-	}
 }
 
 static void ntb_transport_link_work(struct work_struct *work)
 {
-	struct ntb_transport *nt = container_of(work, struct ntb_transport,
-						link_work.work);
-	struct ntb_device *ndev = nt->ndev;
-	struct pci_dev *pdev = ntb_query_pdev(ndev);
+	struct ntb_transport_ctx *nt =
+		container_of(work, struct ntb_transport_ctx, link_work.work);
+	struct ntb_dev *ndev = nt->ndev;
+	struct pci_dev *pdev = ndev->pdev;
+	resource_size_t size;
 	u32 val;
-	int rc, i;
+	int rc, i, spad;
 
 	/* send the local info, in the opposite order of the way we read it */
-	for (i = 0; i < ntb_max_mw(ndev); i++) {
-		rc = ntb_write_remote_spad(ndev, MW0_SZ_HIGH + (i * 2),
-					   ntb_get_mw_size(ndev, i) >> 32);
-		if (rc) {
-			dev_err(&pdev->dev, "Error writing %u to remote spad %d\n",
-				(u32)(ntb_get_mw_size(ndev, i) >> 32),
-				MW0_SZ_HIGH + (i * 2));
-			goto out;
-		}
+	for (i = 0; i < nt->mw_count; i++) {
+		size = nt->mw_vec[i].phys_size;
 
-		rc = ntb_write_remote_spad(ndev, MW0_SZ_LOW + (i * 2),
-					   (u32) ntb_get_mw_size(ndev, i));
-		if (rc) {
-			dev_err(&pdev->dev, "Error writing %u to remote spad %d\n",
-				(u32) ntb_get_mw_size(ndev, i),
-				MW0_SZ_LOW + (i * 2));
-			goto out;
-		}
-	}
+		if (max_mw_size && size > max_mw_size)
+			size = max_mw_size;
 
-	rc = ntb_write_remote_spad(ndev, NUM_MWS, ntb_max_mw(ndev));
-	if (rc) {
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			ntb_max_mw(ndev), NUM_MWS);
-		goto out;
-	}
+		spad = MW0_SZ_HIGH + (i * 2);
+		ntb_peer_spad_write(ndev, spad, (u32)(size >> 32));
 
-	rc = ntb_write_remote_spad(ndev, NUM_QPS, nt->max_qps);
-	if (rc) {
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			nt->max_qps, NUM_QPS);
-		goto out;
+		spad = MW0_SZ_LOW + (i * 2);
+		ntb_peer_spad_write(ndev, spad, (u32)size);
 	}
 
-	rc = ntb_write_remote_spad(ndev, VERSION, NTB_TRANSPORT_VERSION);
-	if (rc) {
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			NTB_TRANSPORT_VERSION, VERSION);
-		goto out;
-	}
+	ntb_peer_spad_write(ndev, NUM_MWS, nt->mw_count);
 
-	/* Query the remote side for its info */
-	rc = ntb_read_remote_spad(ndev, VERSION, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading remote spad %d\n", VERSION);
-		goto out;
-	}
+	ntb_peer_spad_write(ndev, NUM_QPS, nt->qp_count);
 
-	if (val != NTB_TRANSPORT_VERSION)
-		goto out;
-	dev_dbg(&pdev->dev, "Remote version = %d\n", val);
+	ntb_peer_spad_write(ndev, VERSION, NTB_TRANSPORT_VERSION);
 
-	rc = ntb_read_remote_spad(ndev, NUM_QPS, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading remote spad %d\n", NUM_QPS);
+	/* Query the remote side for its info */
+	val = ntb_peer_spad_read(ndev, VERSION);
+	dev_dbg(&pdev->dev, "Remote version = %d\n", val);
+	if (val != NTB_TRANSPORT_VERSION)
 		goto out;
-	}
 
-	if (val != nt->max_qps)
-		goto out;
+	val = ntb_peer_spad_read(ndev, NUM_QPS);
 	dev_dbg(&pdev->dev, "Remote max number of qps = %d\n", val);
-
-	rc = ntb_read_remote_spad(ndev, NUM_MWS, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading remote spad %d\n", NUM_MWS);
+	if (val != nt->qp_count)
 		goto out;
-	}
 
-	if (val != ntb_max_mw(ndev))
-		goto out;
+	val = ntb_peer_spad_read(ndev, NUM_MWS);
 	dev_dbg(&pdev->dev, "Remote number of mws = %d\n", val);
+	if (val != nt->mw_count)
+		goto out;
 
-	for (i = 0; i < ntb_max_mw(ndev); i++) {
+	for (i = 0; i < nt->mw_count; i++) {
 		u64 val64;
 
-		rc = ntb_read_remote_spad(ndev, MW0_SZ_HIGH + (i * 2), &val);
-		if (rc) {
-			dev_err(&pdev->dev, "Error reading remote spad %d\n",
-				MW0_SZ_HIGH + (i * 2));
-			goto out1;
-		}
-
-		val64 = (u64) val << 32;
-
-		rc = ntb_read_remote_spad(ndev, MW0_SZ_LOW + (i * 2), &val);
-		if (rc) {
-			dev_err(&pdev->dev, "Error reading remote spad %d\n",
-				MW0_SZ_LOW + (i * 2));
-			goto out1;
-		}
+		val = ntb_peer_spad_read(ndev, MW0_SZ_HIGH + (i * 2));
+		val64 = (u64)val << 32;
 
+		val = ntb_peer_spad_read(ndev, MW0_SZ_LOW + (i * 2));
 		val64 |= val;
 
-		dev_dbg(&pdev->dev, "Remote MW%d size = %llu\n", i, val64);
+		dev_dbg(&pdev->dev, "Remote MW%d size = %#llx\n", i, val64);
 
 		rc = ntb_set_mw(nt, i, val64);
 		if (rc)
 			goto out1;
 	}
 
-	nt->transport_link = NTB_LINK_UP;
+	nt->link_is_up = true;
 
-	for (i = 0; i < nt->max_qps; i++) {
-		struct ntb_transport_qp *qp = &nt->qps[i];
+	for (i = 0; i < nt->qp_count; i++) {
+		struct ntb_transport_qp *qp = &nt->qp_vec[i];
 
 		ntb_transport_setup_qp_mw(nt, i);
 
-		if (qp->client_ready == NTB_LINK_UP)
+		if (qp->client_ready)
 			schedule_delayed_work(&qp->link_work, 0);
 	}
 
 	return;
 
 out1:
-	for (i = 0; i < ntb_max_mw(ndev); i++)
+	for (i = 0; i < nt->mw_count; i++)
 		ntb_free_mw(nt, i);
 out:
-	if (ntb_hw_link_status(ndev))
+	if (ntb_link_is_up(ndev, NULL, NULL) == 1)
 		schedule_delayed_work(&nt->link_work,
 				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
 }
@@ -814,73 +819,73 @@ static void ntb_qp_link_work(struct work_struct *work)
 	struct ntb_transport_qp *qp = container_of(work,
 						   struct ntb_transport_qp,
 						   link_work.work);
-	struct pci_dev *pdev = ntb_query_pdev(qp->ndev);
-	struct ntb_transport *nt = qp->transport;
-	int rc, val;
+	struct pci_dev *pdev = qp->ndev->pdev;
+	struct ntb_transport_ctx *nt = qp->transport;
+	int val;
 
-	WARN_ON(nt->transport_link != NTB_LINK_UP);
+	WARN_ON(!nt->link_is_up);
 
-	rc = ntb_read_local_spad(nt->ndev, QP_LINKS, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading spad %d\n", QP_LINKS);
-		return;
-	}
+	val = ntb_spad_read(nt->ndev, QP_LINKS);
 
-	rc = ntb_write_remote_spad(nt->ndev, QP_LINKS, val | 1 << qp->qp_num);
-	if (rc)
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			val | 1 << qp->qp_num, QP_LINKS);
+	ntb_peer_spad_write(nt->ndev, QP_LINKS, val | BIT(qp->qp_num));
 
 	/* query remote spad for qp ready bits */
-	rc = ntb_read_remote_spad(nt->ndev, QP_LINKS, &val);
-	if (rc)
-		dev_err(&pdev->dev, "Error reading remote spad %d\n", QP_LINKS);
-
+	ntb_peer_spad_read(nt->ndev, QP_LINKS);
 	dev_dbg(&pdev->dev, "Remote QP link status = %x\n", val);
 
 	/* See if the remote side is up */
-	if (1 << qp->qp_num & val) {
-		qp->qp_link = NTB_LINK_UP;
-
+	if (val & BIT(qp->qp_num)) {
 		dev_info(&pdev->dev, "qp %d: Link Up\n", qp->qp_num);
+		qp->link_is_up = true;
+
 		if (qp->event_handler)
-			qp->event_handler(qp->cb_data, NTB_LINK_UP);
-	} else if (nt->transport_link == NTB_LINK_UP)
+			qp->event_handler(qp->cb_data, qp->link_is_up);
+	} else if (nt->link_is_up)
 		schedule_delayed_work(&qp->link_work,
 				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
 }
 
-static int ntb_transport_init_queue(struct ntb_transport *nt,
+static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
 				    unsigned int qp_num)
 {
 	struct ntb_transport_qp *qp;
+	struct ntb_transport_mw *mw;
+	phys_addr_t mw_base;
+	resource_size_t mw_size;
 	unsigned int num_qps_mw, tx_size;
-	u8 mw_num, mw_max;
+	unsigned int mw_num, mw_count, qp_count;
 	u64 qp_offset;
 
-	mw_max = ntb_max_mw(nt->ndev);
-	mw_num = QP_TO_MW(nt->ndev, qp_num);
+	mw_count = nt->mw_count;
+	qp_count = nt->qp_count;
 
-	qp = &nt->qps[qp_num];
+	mw_num = QP_TO_MW(nt, qp_num);
+	mw = &nt->mw_vec[mw_num];
+
+	qp = &nt->qp_vec[qp_num];
 	qp->qp_num = qp_num;
 	qp->transport = nt;
 	qp->ndev = nt->ndev;
-	qp->qp_link = NTB_LINK_DOWN;
-	qp->client_ready = NTB_LINK_DOWN;
+	qp->link_is_up = false;
+	qp->client_ready = false;
 	qp->event_handler = NULL;
 
-	if (nt->max_qps % mw_max && mw_num + 1 < nt->max_qps / mw_max)
-		num_qps_mw = nt->max_qps / mw_max + 1;
+	if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count)
+		num_qps_mw = qp_count / mw_count + 1;
 	else
-		num_qps_mw = nt->max_qps / mw_max;
+		num_qps_mw = qp_count / mw_count;
+
+	mw_base = nt->mw_vec[mw_num].phys_addr;
+	mw_size = nt->mw_vec[mw_num].phys_size;
 
-	tx_size = (unsigned int) ntb_get_mw_size(qp->ndev, mw_num) / num_qps_mw;
-	qp_offset = qp_num / mw_max * tx_size;
-	qp->tx_mw = ntb_get_mw_vbase(nt->ndev, mw_num) + qp_offset;
+	tx_size = (unsigned int)mw_size / num_qps_mw;
+	qp_offset = tx_size * qp_num / mw_count;
+
+	qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset;
 	if (!qp->tx_mw)
 		return -EINVAL;
 
-	qp->tx_mw_phys = ntb_get_mw_base(qp->ndev, mw_num) + qp_offset;
+	qp->tx_mw_phys = mw_base + qp_offset;
 	if (!qp->tx_mw_phys)
 		return -EINVAL;
 
@@ -891,16 +896,19 @@ static int ntb_transport_init_queue(struct ntb_transport *nt,
 	qp->tx_max_frame = min(transport_mtu, tx_size / 2);
 	qp->tx_max_entry = tx_size / qp->tx_max_frame;
 
-	if (ntb_query_debugfs(nt->ndev)) {
+	if (nt_debugfs_dir) {
 		char debugfs_name[4];
 
 		snprintf(debugfs_name, 4, "qp%d", qp_num);
 		qp->debugfs_dir = debugfs_create_dir(debugfs_name,
-						 ntb_query_debugfs(nt->ndev));
+						     nt_debugfs_dir);
 
 		qp->debugfs_stats = debugfs_create_file("stats", S_IRUSR,
 							qp->debugfs_dir, qp,
 							&ntb_qp_debugfs_stats);
+	} else {
+		qp->debugfs_dir = NULL;
+		qp->debugfs_stats = NULL;
 	}
 
 	INIT_DELAYED_WORK(&qp->link_work, ntb_qp_link_work);
@@ -914,46 +922,84 @@ static int ntb_transport_init_queue(struct ntb_transport *nt,
 	INIT_LIST_HEAD(&qp->rx_free_q);
 	INIT_LIST_HEAD(&qp->tx_free_q);
 
+	tasklet_init(&qp->rxc_db_work, ntb_transport_rxc_db,
+		     (unsigned long)qp);
+
 	return 0;
 }
 
-int ntb_transport_init(struct pci_dev *pdev)
+static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 {
-	struct ntb_transport *nt;
+	struct ntb_transport_ctx *nt;
+	struct ntb_transport_mw *mw;
+	unsigned int mw_count, qp_count;
+	u64 qp_bitmap;
 	int rc, i;
 
-	nt = kzalloc(sizeof(struct ntb_transport), GFP_KERNEL);
+	if (ntb_db_is_unsafe(ndev))
+		dev_dbg(&ndev->dev,
+			"doorbell is unsafe, proceed anyway...\n");
+	if (ntb_spad_is_unsafe(ndev))
+		dev_dbg(&ndev->dev,
+			"scratchpad is unsafe, proceed anyway...\n");
+
+	nt = kzalloc(sizeof(*nt), GFP_KERNEL);
 	if (!nt)
 		return -ENOMEM;
 
-	nt->ndev = ntb_register_transport(pdev, nt);
-	if (!nt->ndev) {
-		rc = -EIO;
+	nt->ndev = ndev;
+
+	mw_count = ntb_mw_count(ndev);
+
+	nt->mw_count = mw_count;
+
+	nt->mw_vec = kcalloc(mw_count, sizeof(*nt->mw_vec), GFP_KERNEL);
+	if (!nt->mw_vec) {
+		rc = -ENOMEM;
 		goto err;
 	}
 
-	nt->mw = kcalloc(ntb_max_mw(nt->ndev), sizeof(struct ntb_transport_mw),
-			 GFP_KERNEL);
-	if (!nt->mw) {
-		rc = -ENOMEM;
-		goto err1;
+	for (i = 0; i < mw_count; i++) {
+		mw = &nt->mw_vec[i];
+
+		rc = ntb_mw_get_range(ndev, i, &mw->phys_addr, &mw->phys_size,
+				      &mw->xlat_align, &mw->xlat_align_size);
+		if (rc)
+			goto err1;
+
+		mw->vbase = ioremap(mw->phys_addr, mw->phys_size);
+		if (!mw->vbase) {
+			rc = -ENOMEM;
+			goto err1;
+		}
+
+		mw->buff_size = 0;
+		mw->xlat_size = 0;
+		mw->virt_addr = NULL;
+		mw->dma_addr = 0;
 	}
 
-	if (max_num_clients)
-		nt->max_qps = min(ntb_max_cbs(nt->ndev), max_num_clients);
-	else
-		nt->max_qps = min(ntb_max_cbs(nt->ndev), ntb_max_mw(nt->ndev));
+	qp_bitmap = ntb_db_valid_mask(ndev);
+
+	qp_count = ilog2(qp_bitmap);
+	if (max_num_clients && max_num_clients < qp_count)
+		qp_count = max_num_clients;
+	else if (mw_count < qp_count)
+		qp_count = mw_count;
+
+	qp_bitmap &= BIT_ULL(qp_count) - 1;
+
+	nt->qp_count = qp_count;
+	nt->qp_bitmap = qp_bitmap;
+	nt->qp_bitmap_free = qp_bitmap;
 
-	nt->qps = kcalloc(nt->max_qps, sizeof(struct ntb_transport_qp),
-			  GFP_KERNEL);
-	if (!nt->qps) {
+	nt->qp_vec = kcalloc(qp_count, sizeof(*nt->qp_vec), GFP_KERNEL);
+	if (!nt->qp_vec) {
 		rc = -ENOMEM;
 		goto err2;
 	}
 
-	nt->qp_bitmap = ((u64) 1 << nt->max_qps) - 1;
-
-	for (i = 0; i < nt->max_qps; i++) {
+	for (i = 0; i < qp_count; i++) {
 		rc = ntb_transport_init_queue(nt, i);
 		if (rc)
 			goto err3;
@@ -962,8 +1008,7 @@ int ntb_transport_init(struct pci_dev *pdev)
 	INIT_DELAYED_WORK(&nt->link_work, ntb_transport_link_work);
 	INIT_WORK(&nt->link_cleanup, ntb_transport_link_cleanup_work);
 
-	rc = ntb_register_event_callback(nt->ndev,
-					 ntb_transport_event_callback);
+	rc = ntb_set_ctx(ndev, nt, &ntb_transport_ops);
 	if (rc)
 		goto err3;
 
@@ -972,51 +1017,61 @@ int ntb_transport_init(struct pci_dev *pdev)
 	if (rc)
 		goto err4;
 
-	if (ntb_hw_link_status(nt->ndev))
-		schedule_delayed_work(&nt->link_work, 0);
+	nt->link_is_up = false;
+	ntb_link_enable(ndev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+	ntb_link_event(ndev);
 
 	return 0;
 
 err4:
-	ntb_unregister_event_callback(nt->ndev);
+	ntb_clear_ctx(ndev);
 err3:
-	kfree(nt->qps);
+	kfree(nt->qp_vec);
 err2:
-	kfree(nt->mw);
+	kfree(nt->mw_vec);
 err1:
-	ntb_unregister_transport(nt->ndev);
+	while (i--) {
+		mw = &nt->mw_vec[i];
+		iounmap(mw->vbase);
+	}
 err:
 	kfree(nt);
 	return rc;
 }
 
-void ntb_transport_free(void *transport)
+static void ntb_transport_free(struct ntb_client *self, struct ntb_dev *ndev)
 {
-	struct ntb_transport *nt = transport;
-	struct ntb_device *ndev = nt->ndev;
+	struct ntb_transport_ctx *nt = ndev->ctx;
+	struct ntb_transport_qp *qp;
+	u64 qp_bitmap_alloc;
 	int i;
 
 	ntb_transport_link_cleanup(nt);
+	cancel_work_sync(&nt->link_cleanup);
+	cancel_delayed_work_sync(&nt->link_work);
+
+	qp_bitmap_alloc = nt->qp_bitmap & ~nt->qp_bitmap_free;
 
 	/* verify that all the qp's are freed */
-	for (i = 0; i < nt->max_qps; i++) {
-		if (!test_bit(i, &nt->qp_bitmap))
-			ntb_transport_free_queue(&nt->qps[i]);
-		debugfs_remove_recursive(nt->qps[i].debugfs_dir);
+	for (i = 0; i < nt->qp_count; i++) {
+		qp = &nt->qp_vec[i];
+		if (qp_bitmap_alloc & BIT_ULL(i))
+			ntb_transport_free_queue(qp);
+		debugfs_remove_recursive(qp->debugfs_dir);
 	}
 
-	ntb_bus_remove(nt);
+	ntb_link_disable(ndev);
+	ntb_clear_ctx(ndev);
 
-	cancel_delayed_work_sync(&nt->link_work);
-
-	ntb_unregister_event_callback(ndev);
+	ntb_bus_remove(nt);
 
-	for (i = 0; i < ntb_max_mw(ndev); i++)
+	for (i = nt->mw_count; i--; ) {
 		ntb_free_mw(nt, i);
+		iounmap(nt->mw_vec[i].vbase);
+	}
 
-	kfree(nt->qps);
-	kfree(nt->mw);
-	ntb_unregister_transport(ndev);
+	kfree(nt->qp_vec);
+	kfree(nt->mw_vec);
 	kfree(nt);
 }
 
@@ -1028,15 +1083,13 @@ static void ntb_rx_copy_callback(void *data)
 	unsigned int len = entry->len;
 	struct ntb_payload_header *hdr = entry->rx_hdr;
 
-	/* Ensure that the data is fully copied out before clearing the flag */
-	wmb();
 	hdr->flags = 0;
 
 	iowrite32(entry->index, &qp->rx_info->entry);
 
 	ntb_list_add(&qp->ntb_rx_free_q_lock, &entry->entry, &qp->rx_free_q);
 
-	if (qp->rx_handler && qp->client_ready == NTB_LINK_UP)
+	if (qp->rx_handler && qp->client_ready)
 		qp->rx_handler(qp, qp->cb_data, cb_data, len);
 }
 
@@ -1047,6 +1100,9 @@ static void ntb_memcpy_rx(struct ntb_queue_entry *entry, void *offset)
 
 	memcpy(buf, offset, len);
 
+	/* Ensure that the data is fully copied out before clearing the flag */
+	wmb();
+
 	ntb_rx_copy_callback(entry);
 }
 
@@ -1071,8 +1127,8 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset,
 		goto err_wait;
 
 	device = chan->device;
-	pay_off = (size_t) offset & ~PAGE_MASK;
-	buff_off = (size_t) buf & ~PAGE_MASK;
+	pay_off = (size_t)offset & ~PAGE_MASK;
+	buff_off = (size_t)buf & ~PAGE_MASK;
 
 	if (!is_dma_copy_aligned(device, pay_off, buff_off, len))
 		goto err_wait;
@@ -1138,86 +1194,104 @@ static int ntb_process_rxc(struct ntb_transport_qp *qp)
 	struct ntb_payload_header *hdr;
 	struct ntb_queue_entry *entry;
 	void *offset;
+	int rc;
 
 	offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index;
 	hdr = offset + qp->rx_max_frame - sizeof(struct ntb_payload_header);
 
-	entry = ntb_list_rm(&qp->ntb_rx_pend_q_lock, &qp->rx_pend_q);
-	if (!entry) {
-		dev_dbg(&ntb_query_pdev(qp->ndev)->dev,
-			"no buffer - HDR ver %u, len %d, flags %x\n",
-			hdr->ver, hdr->len, hdr->flags);
-		qp->rx_err_no_buf++;
-		return -ENOMEM;
-	}
+	dev_dbg(&qp->ndev->pdev->dev, "qp %d: RX ver %u len %d flags %x\n",
+		qp->qp_num, hdr->ver, hdr->len, hdr->flags);
 
 	if (!(hdr->flags & DESC_DONE_FLAG)) {
-		ntb_list_add(&qp->ntb_rx_pend_q_lock, &entry->entry,
-			     &qp->rx_pend_q);
+		dev_dbg(&qp->ndev->pdev->dev, "done flag not set\n");
 		qp->rx_ring_empty++;
 		return -EAGAIN;
 	}
 
-	if (hdr->ver != (u32) qp->rx_pkts) {
-		dev_dbg(&ntb_query_pdev(qp->ndev)->dev,
-			"qp %d: version mismatch, expected %llu - got %u\n",
-			qp->qp_num, qp->rx_pkts, hdr->ver);
-		ntb_list_add(&qp->ntb_rx_pend_q_lock, &entry->entry,
-			     &qp->rx_pend_q);
+	if (hdr->flags & LINK_DOWN_FLAG) {
+		dev_dbg(&qp->ndev->pdev->dev, "link down flag set\n");
+		ntb_qp_link_down(qp);
+		hdr->flags = 0;
+		iowrite32(qp->rx_index, &qp->rx_info->entry);
+		return 0;
+	}
+
+	if (hdr->ver != (u32)qp->rx_pkts) {
+		dev_dbg(&qp->ndev->pdev->dev,
+			"version mismatch, expected %llu - got %u\n",
+			qp->rx_pkts, hdr->ver);
 		qp->rx_err_ver++;
 		return -EIO;
 	}
 
-	if (hdr->flags & LINK_DOWN_FLAG) {
-		ntb_qp_link_down(qp);
+	entry = ntb_list_rm(&qp->ntb_rx_pend_q_lock, &qp->rx_pend_q);
+	if (!entry) {
+		dev_dbg(&qp->ndev->pdev->dev, "no receive buffer\n");
+		qp->rx_err_no_buf++;
 
+		rc = -ENOMEM;
 		goto err;
 	}
 
-	dev_dbg(&ntb_query_pdev(qp->ndev)->dev,
-		"rx offset %u, ver %u - %d payload received, buf size %d\n",
-		qp->rx_index, hdr->ver, hdr->len, entry->len);
-
-	qp->rx_bytes += hdr->len;
-	qp->rx_pkts++;
-
 	if (hdr->len > entry->len) {
-		qp->rx_err_oflow++;
-		dev_dbg(&ntb_query_pdev(qp->ndev)->dev,
-			"RX overflow! Wanted %d got %d\n",
+		dev_dbg(&qp->ndev->pdev->dev,
+			"receive buffer overflow! Wanted %d got %d\n",
 			hdr->len, entry->len);
+		qp->rx_err_oflow++;
 
+		rc = -EIO;
 		goto err;
 	}
 
+	dev_dbg(&qp->ndev->pdev->dev,
+		"RX OK index %u ver %u size %d into buf size %d\n",
+		qp->rx_index, hdr->ver, hdr->len, entry->len);
+
+	qp->rx_bytes += hdr->len;
+	qp->rx_pkts++;
+
 	entry->index = qp->rx_index;
 	entry->rx_hdr = hdr;
 
 	ntb_async_rx(entry, offset, hdr->len);
 
-out:
 	qp->rx_index++;
 	qp->rx_index %= qp->rx_max_entry;
 
 	return 0;
 
 err:
-	ntb_list_add(&qp->ntb_rx_pend_q_lock, &entry->entry, &qp->rx_pend_q);
-	/* Ensure that the data is fully copied out before clearing the flag */
-	wmb();
+	/* FIXME: if this syncrhonous update of the rx_index gets ahead of
+	 * asyncrhonous ntb_rx_copy_callback of previous entry, there are three
+	 * scenarios:
+	 *
+	 * 1) The peer might miss this update, but observe the update
+	 * from the memcpy completion callback.  In this case, the buffer will
+	 * not be freed on the peer to be reused for a different packet.  The
+	 * successful rx of a later packet would clear the condition, but the
+	 * condition could persist if several rx fail in a row.
+	 *
+	 * 2) The peer may observe this update before the asyncrhonous copy of
+	 * prior packets is completed.  The peer may overwrite the buffers of
+	 * the prior packets before they are copied.
+	 *
+	 * 3) Both: the peer may observe the update, and then observe the index
+	 * decrement by the asynchronous completion callback.  Who knows what
+	 * badness that will cause.
+	 */
 	hdr->flags = 0;
 	iowrite32(qp->rx_index, &qp->rx_info->entry);
 
-	goto out;
+	return rc;
 }
 
-static int ntb_transport_rxc_db(void *data, int db_num)
+static void ntb_transport_rxc_db(unsigned long data)
 {
-	struct ntb_transport_qp *qp = data;
+	struct ntb_transport_qp *qp = (void *)data;
 	int rc, i;
 
-	dev_dbg(&ntb_query_pdev(qp->ndev)->dev, "%s: doorbell %d received\n",
-		__func__, db_num);
+	dev_dbg(&qp->ndev->pdev->dev, "%s: doorbell %d received\n",
+		__func__, qp->qp_num);
 
 	/* Limit the number of packets processed in a single interrupt to
 	 * provide fairness to others
@@ -1231,7 +1305,21 @@ static int ntb_transport_rxc_db(void *data, int db_num)
 	if (qp->dma_chan)
 		dma_async_issue_pending(qp->dma_chan);
 
-	return i;
+	if (i == qp->rx_max_entry) {
+		/* there is more work to do */
+		tasklet_schedule(&qp->rxc_db_work);
+	} else if (ntb_db_read(qp->ndev) & BIT_ULL(qp->qp_num)) {
+		/* the doorbell bit is set: clear it */
+		ntb_db_clear(qp->ndev, BIT_ULL(qp->qp_num));
+		/* ntb_db_read ensures ntb_db_clear write is committed */
+		ntb_db_read(qp->ndev);
+
+		/* an interrupt may have arrived between finishing
+		 * ntb_process_rxc and clearing the doorbell bit:
+		 * there might be some more work to do.
+		 */
+		tasklet_schedule(&qp->rxc_db_work);
+	}
 }
 
 static void ntb_tx_copy_callback(void *data)
@@ -1240,11 +1328,9 @@ static void ntb_tx_copy_callback(void *data)
 	struct ntb_transport_qp *qp = entry->qp;
 	struct ntb_payload_header __iomem *hdr = entry->tx_hdr;
 
-	/* Ensure that the data is fully copied out before setting the flags */
-	wmb();
 	iowrite32(entry->flags | DESC_DONE_FLAG, &hdr->flags);
 
-	ntb_ring_doorbell(qp->ndev, qp->qp_num);
+	ntb_peer_db_set(qp->ndev, BIT_ULL(qp->qp_num));
 
 	/* The entry length can only be zero if the packet is intended to be a
 	 * "link down" or similar.  Since no payload is being sent in these
@@ -1265,6 +1351,9 @@ static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset)
 {
 	memcpy_toio(offset, entry->buf, entry->len);
 
+	/* Ensure that the data is fully copied out before setting the flags */
+	wmb();
+
 	ntb_tx_copy_callback(entry);
 }
 
@@ -1288,7 +1377,7 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
 	entry->tx_hdr = hdr;
 
 	iowrite32(entry->len, &hdr->len);
-	iowrite32((u32) qp->tx_pkts, &hdr->ver);
+	iowrite32((u32)qp->tx_pkts, &hdr->ver);
 
 	if (!chan)
 		goto err;
@@ -1298,8 +1387,8 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
 
 	device = chan->device;
 	dest = qp->tx_mw_phys + qp->tx_max_frame * qp->tx_index;
-	buff_off = (size_t) buf & ~PAGE_MASK;
-	dest_off = (size_t) dest & ~PAGE_MASK;
+	buff_off = (size_t)buf & ~PAGE_MASK;
+	dest_off = (size_t)dest & ~PAGE_MASK;
 
 	if (!is_dma_copy_aligned(device, buff_off, dest_off, len))
 		goto err;
@@ -1347,9 +1436,6 @@ err:
 static int ntb_process_tx(struct ntb_transport_qp *qp,
 			  struct ntb_queue_entry *entry)
 {
-	dev_dbg(&ntb_query_pdev(qp->ndev)->dev, "%lld - tx %u, entry len %d flags %x buff %p\n",
-		qp->tx_pkts, qp->tx_index, entry->len, entry->flags,
-		entry->buf);
 	if (qp->tx_index == qp->remote_rx_info->entry) {
 		qp->tx_ring_full++;
 		return -EAGAIN;
@@ -1376,14 +1462,14 @@ static int ntb_process_tx(struct ntb_transport_qp *qp,
 
 static void ntb_send_link_down(struct ntb_transport_qp *qp)
 {
-	struct pci_dev *pdev = ntb_query_pdev(qp->ndev);
+	struct pci_dev *pdev = qp->ndev->pdev;
 	struct ntb_queue_entry *entry;
 	int i, rc;
 
-	if (qp->qp_link == NTB_LINK_DOWN)
+	if (!qp->link_is_up)
 		return;
 
-	qp->qp_link = NTB_LINK_DOWN;
+	qp->link_is_up = false;
 	dev_info(&pdev->dev, "qp %d: Link Down\n", qp->qp_num);
 
 	for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
@@ -1422,18 +1508,21 @@ static void ntb_send_link_down(struct ntb_transport_qp *qp)
  * RETURNS: pointer to newly created ntb_queue, NULL on error.
  */
 struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, struct pci_dev *pdev,
+ntb_transport_create_queue(void *data, struct device *client_dev,
 			   const struct ntb_queue_handlers *handlers)
 {
+	struct ntb_dev *ndev;
+	struct pci_dev *pdev;
+	struct ntb_transport_ctx *nt;
 	struct ntb_queue_entry *entry;
 	struct ntb_transport_qp *qp;
-	struct ntb_transport *nt;
+	u64 qp_bit;
 	unsigned int free_queue;
-	int rc, i;
+	int i;
 
-	nt = ntb_find_transport(pdev);
-	if (!nt)
-		goto err;
+	ndev = dev_ntb(client_dev->parent);
+	pdev = ndev->pdev;
+	nt = ndev->ctx;
 
 	free_queue = ffs(nt->qp_bitmap);
 	if (!free_queue)
@@ -1442,9 +1531,11 @@ ntb_transport_create_queue(void *data, struct pci_dev *pdev,
 	/* decrement free_queue to make it zero based */
 	free_queue--;
 
-	clear_bit(free_queue, &nt->qp_bitmap);
+	qp = &nt->qp_vec[free_queue];
+	qp_bit = BIT_ULL(qp->qp_num);
+
+	nt->qp_bitmap_free &= ~qp_bit;
 
-	qp = &nt->qps[free_queue];
 	qp->cb_data = data;
 	qp->rx_handler = handlers->rx_handler;
 	qp->tx_handler = handlers->tx_handler;
@@ -1458,7 +1549,7 @@ ntb_transport_create_queue(void *data, struct pci_dev *pdev,
 	}
 
 	for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
-		entry = kzalloc(sizeof(struct ntb_queue_entry), GFP_ATOMIC);
+		entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 		if (!entry)
 			goto err1;
 
@@ -1468,7 +1559,7 @@ ntb_transport_create_queue(void *data, struct pci_dev *pdev,
 	}
 
 	for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
-		entry = kzalloc(sizeof(struct ntb_queue_entry), GFP_ATOMIC);
+		entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 		if (!entry)
 			goto err2;
 
@@ -1477,10 +1568,8 @@ ntb_transport_create_queue(void *data, struct pci_dev *pdev,
 			     &qp->tx_free_q);
 	}
 
-	rc = ntb_register_db_callback(qp->ndev, free_queue, qp,
-				      ntb_transport_rxc_db);
-	if (rc)
-		goto err2;
+	ntb_db_clear(qp->ndev, qp_bit);
+	ntb_db_clear_mask(qp->ndev, qp_bit);
 
 	dev_info(&pdev->dev, "NTB Transport QP %d created\n", qp->qp_num);
 
@@ -1494,7 +1583,7 @@ err1:
 		kfree(entry);
 	if (qp->dma_chan)
 		dmaengine_put();
-	set_bit(free_queue, &nt->qp_bitmap);
+	nt->qp_bitmap_free |= qp_bit;
 err:
 	return NULL;
 }
@@ -1508,13 +1597,15 @@ EXPORT_SYMBOL_GPL(ntb_transport_create_queue);
  */
 void ntb_transport_free_queue(struct ntb_transport_qp *qp)
 {
+	struct ntb_transport_ctx *nt = qp->transport;
 	struct pci_dev *pdev;
 	struct ntb_queue_entry *entry;
+	u64 qp_bit;
 
 	if (!qp)
 		return;
 
-	pdev = ntb_query_pdev(qp->ndev);
+	pdev = qp->ndev->pdev;
 
 	if (qp->dma_chan) {
 		struct dma_chan *chan = qp->dma_chan;
@@ -1531,10 +1622,18 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
 		dmaengine_put();
 	}
 
-	ntb_unregister_db_callback(qp->ndev, qp->qp_num);
+	qp_bit = BIT_ULL(qp->qp_num);
+
+	ntb_db_set_mask(qp->ndev, qp_bit);
+	tasklet_disable(&qp->rxc_db_work);
 
 	cancel_delayed_work_sync(&qp->link_work);
 
+	qp->cb_data = NULL;
+	qp->rx_handler = NULL;
+	qp->tx_handler = NULL;
+	qp->event_handler = NULL;
+
 	while ((entry = ntb_list_rm(&qp->ntb_rx_free_q_lock, &qp->rx_free_q)))
 		kfree(entry);
 
@@ -1546,7 +1645,7 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
 	while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q)))
 		kfree(entry);
 
-	set_bit(qp->qp_num, &qp->transport->qp_bitmap);
+	nt->qp_bitmap_free |= qp_bit;
 
 	dev_info(&pdev->dev, "NTB Transport QP %d freed\n", qp->qp_num);
 }
@@ -1567,7 +1666,7 @@ void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len)
 	struct ntb_queue_entry *entry;
 	void *buf;
 
-	if (!qp || qp->client_ready == NTB_LINK_UP)
+	if (!qp || qp->client_ready)
 		return NULL;
 
 	entry = ntb_list_rm(&qp->ntb_rx_pend_q_lock, &qp->rx_pend_q);
@@ -1636,7 +1735,7 @@ int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
 	struct ntb_queue_entry *entry;
 	int rc;
 
-	if (!qp || qp->qp_link != NTB_LINK_UP || !len)
+	if (!qp || !qp->link_is_up || !len)
 		return -EINVAL;
 
 	entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
@@ -1670,9 +1769,9 @@ void ntb_transport_link_up(struct ntb_transport_qp *qp)
 	if (!qp)
 		return;
 
-	qp->client_ready = NTB_LINK_UP;
+	qp->client_ready = true;
 
-	if (qp->transport->transport_link == NTB_LINK_UP)
+	if (qp->transport->link_is_up)
 		schedule_delayed_work(&qp->link_work, 0);
 }
 EXPORT_SYMBOL_GPL(ntb_transport_link_up);
@@ -1688,27 +1787,20 @@ EXPORT_SYMBOL_GPL(ntb_transport_link_up);
 void ntb_transport_link_down(struct ntb_transport_qp *qp)
 {
 	struct pci_dev *pdev;
-	int rc, val;
+	int val;
 
 	if (!qp)
 		return;
 
-	pdev = ntb_query_pdev(qp->ndev);
-	qp->client_ready = NTB_LINK_DOWN;
+	pdev = qp->ndev->pdev;
+	qp->client_ready = false;
 
-	rc = ntb_read_local_spad(qp->ndev, QP_LINKS, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading spad %d\n", QP_LINKS);
-		return;
-	}
+	val = ntb_spad_read(qp->ndev, QP_LINKS);
 
-	rc = ntb_write_remote_spad(qp->ndev, QP_LINKS,
-				   val & ~(1 << qp->qp_num));
-	if (rc)
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			val & ~(1 << qp->qp_num), QP_LINKS);
+	ntb_peer_spad_write(qp->ndev, QP_LINKS,
+			    val & ~BIT(qp->qp_num));
 
-	if (qp->qp_link == NTB_LINK_UP)
+	if (qp->link_is_up)
 		ntb_send_link_down(qp);
 	else
 		cancel_delayed_work_sync(&qp->link_work);
@@ -1728,7 +1820,7 @@ bool ntb_transport_link_query(struct ntb_transport_qp *qp)
 	if (!qp)
 		return false;
 
-	return qp->qp_link == NTB_LINK_UP;
+	return qp->link_is_up;
 }
 EXPORT_SYMBOL_GPL(ntb_transport_link_query);
 
@@ -1774,3 +1866,69 @@ unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp)
 	return max;
 }
 EXPORT_SYMBOL_GPL(ntb_transport_max_size);
+
+static void ntb_transport_doorbell_callback(void *data, int vector)
+{
+	struct ntb_transport_ctx *nt = data;
+	struct ntb_transport_qp *qp;
+	u64 db_bits;
+	unsigned int qp_num;
+
+	db_bits = (nt->qp_bitmap & ~nt->qp_bitmap_free &
+		   ntb_db_vector_mask(nt->ndev, vector));
+
+	while (db_bits) {
+		qp_num = __ffs(db_bits);
+		qp = &nt->qp_vec[qp_num];
+
+		tasklet_schedule(&qp->rxc_db_work);
+
+		db_bits &= ~BIT_ULL(qp_num);
+	}
+}
+
+static const struct ntb_ctx_ops ntb_transport_ops = {
+	.link_event = ntb_transport_event_callback,
+	.db_event = ntb_transport_doorbell_callback,
+};
+
+static struct ntb_client ntb_transport_client = {
+	.ops = {
+		.probe = ntb_transport_probe,
+		.remove = ntb_transport_free,
+	},
+};
+
+static int __init ntb_transport_init(void)
+{
+	int rc;
+
+	if (debugfs_initialized())
+		nt_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+
+	rc = bus_register(&ntb_transport_bus);
+	if (rc)
+		goto err_bus;
+
+	rc = ntb_register_client(&ntb_transport_client);
+	if (rc)
+		goto err_client;
+
+	return 0;
+
+err_client:
+	bus_unregister(&ntb_transport_bus);
+err_bus:
+	debugfs_remove_recursive(nt_debugfs_dir);
+	return rc;
+}
+module_init(ntb_transport_init);
+
+static void __exit ntb_transport_exit(void)
+{
+	debugfs_remove_recursive(nt_debugfs_dir);
+
+	ntb_unregister_client(&ntb_transport_client);
+	bus_unregister(&ntb_transport_bus);
+}
+module_exit(ntb_transport_exit);
diff --git a/include/linux/ntb_transport.h b/include/linux/ntb_transport.h
index f78b64cc09d5..2862861366a5 100644
--- a/include/linux/ntb_transport.h
+++ b/include/linux/ntb_transport.h
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -40,7 +42,7 @@
  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * Intel PCIe NTB Linux driver
+ * PCIe NTB Transport Linux driver
  *
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
@@ -48,21 +50,16 @@
 
 struct ntb_transport_qp;
 
-struct ntb_client {
+struct ntb_transport_client {
 	struct device_driver driver;
-	int (*probe)(struct pci_dev *pdev);
-	void (*remove)(struct pci_dev *pdev);
+	int (*probe)(struct device *client_dev);
+	void (*remove)(struct device *client_dev);
 };
 
-enum {
-	NTB_LINK_DOWN = 0,
-	NTB_LINK_UP,
-};
-
-int ntb_transport_register_client(struct ntb_client *drvr);
-void ntb_transport_unregister_client(struct ntb_client *drvr);
-int ntb_register_client_dev(char *device_name);
-void ntb_unregister_client_dev(char *device_name);
+int ntb_transport_register_client(struct ntb_transport_client *drvr);
+void ntb_transport_unregister_client(struct ntb_transport_client *drvr);
+int ntb_transport_register_client_dev(char *device_name);
+void ntb_transport_unregister_client_dev(char *device_name);
 
 struct ntb_queue_handlers {
 	void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
@@ -75,7 +72,7 @@ struct ntb_queue_handlers {
 unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
 unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
 struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, struct pci_dev *pdev,
+ntb_transport_create_queue(void *data, struct device *client_dev,
 			   const struct ntb_queue_handlers *handlers);
 void ntb_transport_free_queue(struct ntb_transport_qp *qp);
 int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
-- 
cgit v1.2.3


From 0fd972a7d91d6e15393c449492a04d94c0b89351 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 1 May 2015 20:13:42 -0400
Subject: module: relocate module_init from init.h to module.h

Modular users will always be users of init functionality, but
users of init functionality are not necessarily always modules.

Hence any functionality like module_init and module_exit would
be more at home in the module.h file.  And module.h should
explicitly include init.h to make the dependency clear.

We've already done all the legwork needed to ensure that this
move does not cause any build regressions due to implicit
header file include assumptions about where module_init lives.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 include/linux/init.h   | 78 ----------------------------------------------
 include/linux/module.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init.h b/include/linux/init.h
index 7c68c36d3fd8..b449f378f995 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -282,68 +282,8 @@ void __init parse_early_param(void);
 void __init parse_early_options(char *cmdline);
 #endif /* __ASSEMBLY__ */
 
-/**
- * module_init() - driver initialization entry point
- * @x: function to be run at kernel boot time or module insertion
- * 
- * module_init() will either be called during do_initcalls() (if
- * builtin) or at module insertion time (if a module).  There can only
- * be one per module.
- */
-#define module_init(x)	__initcall(x);
-
-/**
- * module_exit() - driver exit entry point
- * @x: function to be run when driver is removed
- * 
- * module_exit() will wrap the driver clean-up code
- * with cleanup_module() when used with rmmod when
- * the driver is a module.  If the driver is statically
- * compiled into the kernel, module_exit() has no effect.
- * There can only be one per module.
- */
-#define module_exit(x)	__exitcall(x);
-
 #else /* MODULE */
 
-/*
- * In most cases loadable modules do not need custom
- * initcall levels. There are still some valid cases where
- * a driver may be needed early if built in, and does not
- * matter when built as a loadable module. Like bus
- * snooping debug drivers.
- */
-#define early_initcall(fn)		module_init(fn)
-#define core_initcall(fn)		module_init(fn)
-#define core_initcall_sync(fn)		module_init(fn)
-#define postcore_initcall(fn)		module_init(fn)
-#define postcore_initcall_sync(fn)	module_init(fn)
-#define arch_initcall(fn)		module_init(fn)
-#define subsys_initcall(fn)		module_init(fn)
-#define subsys_initcall_sync(fn)	module_init(fn)
-#define fs_initcall(fn)			module_init(fn)
-#define fs_initcall_sync(fn)		module_init(fn)
-#define rootfs_initcall(fn)		module_init(fn)
-#define device_initcall(fn)		module_init(fn)
-#define device_initcall_sync(fn)	module_init(fn)
-#define late_initcall(fn)		module_init(fn)
-#define late_initcall_sync(fn)		module_init(fn)
-
-#define console_initcall(fn)		module_init(fn)
-#define security_initcall(fn)		module_init(fn)
-
-/* Each module must use one module_init(). */
-#define module_init(initfn)					\
-	static inline initcall_t __inittest(void)		\
-	{ return initfn; }					\
-	int init_module(void) __attribute__((alias(#initfn)));
-
-/* This is only required if you want to be unloadable. */
-#define module_exit(exitfn)					\
-	static inline exitcall_t __exittest(void)		\
-	{ return exitfn; }					\
-	void cleanup_module(void) __attribute__((alias(#exitfn)));
-
 #define __setup_param(str, unique_id, fn)	/* nothing */
 #define __setup(str, func) 			/* nothing */
 #endif
@@ -351,24 +291,6 @@ void __init parse_early_options(char *cmdline);
 /* Data marked not to be saved by software suspend */
 #define __nosavedata __section(.data..nosave)
 
-/* This means "can be init if no module support, otherwise module load
-   may call it." */
-#ifdef CONFIG_MODULES
-#define __init_or_module
-#define __initdata_or_module
-#define __initconst_or_module
-#define __INIT_OR_MODULE	.text
-#define __INITDATA_OR_MODULE	.data
-#define __INITRODATA_OR_MODULE	.section ".rodata","a",%progbits
-#else
-#define __init_or_module __init
-#define __initdata_or_module __initdata
-#define __initconst_or_module __initconst
-#define __INIT_OR_MODULE __INIT
-#define __INITDATA_OR_MODULE __INITDATA
-#define __INITRODATA_OR_MODULE __INITRODATA
-#endif /*CONFIG_MODULES*/
-
 #ifdef MODULE
 #define __exit_p(x) x
 #else
diff --git a/include/linux/module.h b/include/linux/module.h
index d67b1932cc59..3a19c79918e0 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -11,6 +11,7 @@
 #include <linux/compiler.h>
 #include <linux/cache.h>
 #include <linux/kmod.h>
+#include <linux/init.h>
 #include <linux/elf.h>
 #include <linux/stringify.h>
 #include <linux/kobject.h>
@@ -71,6 +72,89 @@ extern struct module_attribute module_uevent;
 extern int init_module(void);
 extern void cleanup_module(void);
 
+#ifndef MODULE
+/**
+ * module_init() - driver initialization entry point
+ * @x: function to be run at kernel boot time or module insertion
+ *
+ * module_init() will either be called during do_initcalls() (if
+ * builtin) or at module insertion time (if a module).  There can only
+ * be one per module.
+ */
+#define module_init(x)	__initcall(x);
+
+/**
+ * module_exit() - driver exit entry point
+ * @x: function to be run when driver is removed
+ *
+ * module_exit() will wrap the driver clean-up code
+ * with cleanup_module() when used with rmmod when
+ * the driver is a module.  If the driver is statically
+ * compiled into the kernel, module_exit() has no effect.
+ * There can only be one per module.
+ */
+#define module_exit(x)	__exitcall(x);
+
+#else /* MODULE */
+
+/*
+ * In most cases loadable modules do not need custom
+ * initcall levels. There are still some valid cases where
+ * a driver may be needed early if built in, and does not
+ * matter when built as a loadable module. Like bus
+ * snooping debug drivers.
+ */
+#define early_initcall(fn)		module_init(fn)
+#define core_initcall(fn)		module_init(fn)
+#define core_initcall_sync(fn)		module_init(fn)
+#define postcore_initcall(fn)		module_init(fn)
+#define postcore_initcall_sync(fn)	module_init(fn)
+#define arch_initcall(fn)		module_init(fn)
+#define subsys_initcall(fn)		module_init(fn)
+#define subsys_initcall_sync(fn)	module_init(fn)
+#define fs_initcall(fn)			module_init(fn)
+#define fs_initcall_sync(fn)		module_init(fn)
+#define rootfs_initcall(fn)		module_init(fn)
+#define device_initcall(fn)		module_init(fn)
+#define device_initcall_sync(fn)	module_init(fn)
+#define late_initcall(fn)		module_init(fn)
+#define late_initcall_sync(fn)		module_init(fn)
+
+#define console_initcall(fn)		module_init(fn)
+#define security_initcall(fn)		module_init(fn)
+
+/* Each module must use one module_init(). */
+#define module_init(initfn)					\
+	static inline initcall_t __inittest(void)		\
+	{ return initfn; }					\
+	int init_module(void) __attribute__((alias(#initfn)));
+
+/* This is only required if you want to be unloadable. */
+#define module_exit(exitfn)					\
+	static inline exitcall_t __exittest(void)		\
+	{ return exitfn; }					\
+	void cleanup_module(void) __attribute__((alias(#exitfn)));
+
+#endif
+
+/* This means "can be init if no module support, otherwise module load
+   may call it." */
+#ifdef CONFIG_MODULES
+#define __init_or_module
+#define __initdata_or_module
+#define __initconst_or_module
+#define __INIT_OR_MODULE	.text
+#define __INITDATA_OR_MODULE	.data
+#define __INITRODATA_OR_MODULE	.section ".rodata","a",%progbits
+#else
+#define __init_or_module __init
+#define __initdata_or_module __initdata
+#define __initconst_or_module __initconst
+#define __INIT_OR_MODULE __INIT
+#define __INITDATA_OR_MODULE __INITDATA
+#define __INITRODATA_OR_MODULE __INITRODATA
+#endif /*CONFIG_MODULES*/
+
 /* Archs provide a method of finding the correct exception table. */
 struct exception_table_entry;
 
-- 
cgit v1.2.3


From 0294112ee3135fbd15eaa70015af8283642dd970 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 4 Jul 2015 03:09:03 +0200
Subject: ACPI / PNP: Reserve ACPI resources at the fs_initcall_sync stage

This effectively reverts the following three commits:

 7bc10388ccdd ACPI / resources: free memory on error in add_region_before()
 0f1b414d1907 ACPI / PNP: Avoid conflicting resource reservations
 b9a5e5e18fbf ACPI / init: Fix the ordering of acpi_reserve_resources()

(commit b9a5e5e18fbf introduced regressions some of which, but not
all, were addressed by commit 0f1b414d1907 and commit 7bc10388ccdd
was a fixup on top of the latter) and causes ACPI fixed hardware
resources to be reserved at the fs_initcall_sync stage of system
initialization.

The story is as follows.  First, a boot regression was reported due
to an apparent resource reservation ordering change after a commit
that shouldn't lead to such changes.  Investigation led to the
conclusion that the problem happened because acpi_reserve_resources()
was executed at the device_initcall() stage of system initialization
which wasn't strictly ordered with respect to driver initialization
(and with respect to the initialization of the pcieport driver in
particular), so a random change causing the device initcalls to be
run in a different order might break things.

The response to that was to attempt to run acpi_reserve_resources()
as soon as we knew that ACPI would be in use (commit b9a5e5e18fbf).
However, that turned out to be too early, because it caused resource
reservations made by the PNP system driver to fail on at least one
system and that failure was addressed by commit 0f1b414d1907.

That fix still turned out to be insufficient, though, because
calling acpi_reserve_resources() before the fs_initcall stage of
system initialization caused a boot regression to happen on the
eCAFE EC-800-H20G/S netbook.  That meant that we only could call
acpi_reserve_resources() at the fs_initcall initialization stage
or later, but then we might just as well call it after the PNP
initalization in which case commit 0f1b414d1907 wouldn't be
necessary any more.

For this reason, the changes made by commit 0f1b414d1907 are reverted
(along with a memory leak fixup on top of that commit), the changes
made by commit b9a5e5e18fbf that went too far are reverted too and
acpi_reserve_resources() is changed into fs_initcall_sync, which
will cause it to be executed after the PNP subsystem initialization
(which is an fs_initcall) and before device initcalls (including
the pcieport driver initialization) which should avoid the initial
issue.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=100581
Link: http://marc.info/?t=143092384600002&r=1&w=2
Link: https://bugzilla.kernel.org/show_bug.cgi?id=99831
Link: http://marc.info/?t=143389402600001&r=1&w=2
Fixes: b9a5e5e18fbf "ACPI / init: Fix the ordering of acpi_reserve_resources()"
Reported-by: Roland Dreier <roland@purestorage.com>
Cc: All applicable <stable@vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/osl.c      |  12 +++-
 drivers/acpi/resource.c | 162 ------------------------------------------------
 drivers/pnp/system.c    |  35 +++--------
 include/linux/acpi.h    |  10 ---
 4 files changed, 18 insertions(+), 201 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index c262e4acd68d..3b8963f21b36 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -175,10 +175,14 @@ static void __init acpi_request_region (struct acpi_generic_address *gas,
 	if (!addr || !length)
 		return;
 
-	acpi_reserve_region(addr, length, gas->space_id, 0, desc);
+	/* Resources are never freed */
+	if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_IO)
+		request_region(addr, length, desc);
+	else if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
+		request_mem_region(addr, length, desc);
 }
 
-static void __init acpi_reserve_resources(void)
+static int __init acpi_reserve_resources(void)
 {
 	acpi_request_region(&acpi_gbl_FADT.xpm1a_event_block, acpi_gbl_FADT.pm1_event_length,
 		"ACPI PM1a_EVT_BLK");
@@ -207,7 +211,10 @@ static void __init acpi_reserve_resources(void)
 	if (!(acpi_gbl_FADT.gpe1_block_length & 0x1))
 		acpi_request_region(&acpi_gbl_FADT.xgpe1_block,
 			       acpi_gbl_FADT.gpe1_block_length, "ACPI GPE1_BLK");
+
+	return 0;
 }
+fs_initcall_sync(acpi_reserve_resources);
 
 void acpi_os_printf(const char *fmt, ...)
 {
@@ -1862,7 +1869,6 @@ acpi_status __init acpi_os_initialize(void)
 
 acpi_status __init acpi_os_initialize1(void)
 {
-	acpi_reserve_resources();
 	kacpid_wq = alloc_workqueue("kacpid", 0, 1);
 	kacpi_notify_wq = alloc_workqueue("kacpi_notify", 0, 1);
 	kacpi_hotplug_wq = alloc_ordered_workqueue("kacpi_hotplug", 0);
diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index 10561ce16ed1..8244f013f210 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -26,7 +26,6 @@
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/ioport.h>
-#include <linux/list.h>
 #include <linux/slab.h>
 
 #ifdef CONFIG_X86
@@ -622,164 +621,3 @@ int acpi_dev_filter_resource_type(struct acpi_resource *ares,
 	return (type & types) ? 0 : 1;
 }
 EXPORT_SYMBOL_GPL(acpi_dev_filter_resource_type);
-
-struct reserved_region {
-	struct list_head node;
-	u64 start;
-	u64 end;
-};
-
-static LIST_HEAD(reserved_io_regions);
-static LIST_HEAD(reserved_mem_regions);
-
-static int request_range(u64 start, u64 end, u8 space_id, unsigned long flags,
-			 char *desc)
-{
-	unsigned int length = end - start + 1;
-	struct resource *res;
-
-	res = space_id == ACPI_ADR_SPACE_SYSTEM_IO ?
-		request_region(start, length, desc) :
-		request_mem_region(start, length, desc);
-	if (!res)
-		return -EIO;
-
-	res->flags &= ~flags;
-	return 0;
-}
-
-static int add_region_before(u64 start, u64 end, u8 space_id,
-			     unsigned long flags, char *desc,
-			     struct list_head *head)
-{
-	struct reserved_region *reg;
-	int error;
-
-	reg = kmalloc(sizeof(*reg), GFP_KERNEL);
-	if (!reg)
-		return -ENOMEM;
-
-	error = request_range(start, end, space_id, flags, desc);
-	if (error) {
-		kfree(reg);
-		return error;
-	}
-
-	reg->start = start;
-	reg->end = end;
-	list_add_tail(&reg->node, head);
-	return 0;
-}
-
-/**
- * acpi_reserve_region - Reserve an I/O or memory region as a system resource.
- * @start: Starting address of the region.
- * @length: Length of the region.
- * @space_id: Identifier of address space to reserve the region from.
- * @flags: Resource flags to clear for the region after requesting it.
- * @desc: Region description (for messages).
- *
- * Reserve an I/O or memory region as a system resource to prevent others from
- * using it.  If the new region overlaps with one of the regions (in the given
- * address space) already reserved by this routine, only the non-overlapping
- * parts of it will be reserved.
- *
- * Returned is either 0 (success) or a negative error code indicating a resource
- * reservation problem.  It is the code of the first encountered error, but the
- * routine doesn't abort until it has attempted to request all of the parts of
- * the new region that don't overlap with other regions reserved previously.
- *
- * The resources requested by this routine are never released.
- */
-int acpi_reserve_region(u64 start, unsigned int length, u8 space_id,
-			unsigned long flags, char *desc)
-{
-	struct list_head *regions;
-	struct reserved_region *reg;
-	u64 end = start + length - 1;
-	int ret = 0, error = 0;
-
-	if (space_id == ACPI_ADR_SPACE_SYSTEM_IO)
-		regions = &reserved_io_regions;
-	else if (space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
-		regions = &reserved_mem_regions;
-	else
-		return -EINVAL;
-
-	if (list_empty(regions))
-		return add_region_before(start, end, space_id, flags, desc, regions);
-
-	list_for_each_entry(reg, regions, node)
-		if (reg->start == end + 1) {
-			/* The new region can be prepended to this one. */
-			ret = request_range(start, end, space_id, flags, desc);
-			if (!ret)
-				reg->start = start;
-
-			return ret;
-		} else if (reg->start > end) {
-			/* No overlap.  Add the new region here and get out. */
-			return add_region_before(start, end, space_id, flags,
-						 desc, &reg->node);
-		} else if (reg->end == start - 1) {
-			goto combine;
-		} else if (reg->end >= start) {
-			goto overlap;
-		}
-
-	/* The new region goes after the last existing one. */
-	return add_region_before(start, end, space_id, flags, desc, regions);
-
- overlap:
-	/*
-	 * The new region overlaps an existing one.
-	 *
-	 * The head part of the new region immediately preceding the existing
-	 * overlapping one can be combined with it right away.
-	 */
-	if (reg->start > start) {
-		error = request_range(start, reg->start - 1, space_id, flags, desc);
-		if (error)
-			ret = error;
-		else
-			reg->start = start;
-	}
-
- combine:
-	/*
-	 * The new region is adjacent to an existing one.  If it extends beyond
-	 * that region all the way to the next one, it is possible to combine
-	 * all three of them.
-	 */
-	while (reg->end < end) {
-		struct reserved_region *next = NULL;
-		u64 a = reg->end + 1, b = end;
-
-		if (!list_is_last(&reg->node, regions)) {
-			next = list_next_entry(reg, node);
-			if (next->start <= end)
-				b = next->start - 1;
-		}
-		error = request_range(a, b, space_id, flags, desc);
-		if (!error) {
-			if (next && next->start == b + 1) {
-				reg->end = next->end;
-				list_del(&next->node);
-				kfree(next);
-			} else {
-				reg->end = end;
-				break;
-			}
-		} else if (next) {
-			if (!ret)
-				ret = error;
-
-			reg = next;
-		} else {
-			break;
-		}
-	}
-
-	return ret ? ret : error;
-}
-EXPORT_SYMBOL_GPL(acpi_reserve_region);
diff --git a/drivers/pnp/system.c b/drivers/pnp/system.c
index 515f33882ab8..49c1720df59a 100644
--- a/drivers/pnp/system.c
+++ b/drivers/pnp/system.c
@@ -7,7 +7,6 @@
  *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  */
 
-#include <linux/acpi.h>
 #include <linux/pnp.h>
 #include <linux/device.h>
 #include <linux/init.h>
@@ -23,41 +22,25 @@ static const struct pnp_device_id pnp_dev_table[] = {
 	{"", 0}
 };
 
-#ifdef CONFIG_ACPI
-static bool __reserve_range(u64 start, unsigned int length, bool io, char *desc)
-{
-	u8 space_id = io ? ACPI_ADR_SPACE_SYSTEM_IO : ACPI_ADR_SPACE_SYSTEM_MEMORY;
-	return !acpi_reserve_region(start, length, space_id, IORESOURCE_BUSY, desc);
-}
-#else
-static bool __reserve_range(u64 start, unsigned int length, bool io, char *desc)
-{
-	struct resource *res;
-
-	res = io ? request_region(start, length, desc) :
-		request_mem_region(start, length, desc);
-	if (res) {
-		res->flags &= ~IORESOURCE_BUSY;
-		return true;
-	}
-	return false;
-}
-#endif
-
 static void reserve_range(struct pnp_dev *dev, struct resource *r, int port)
 {
 	char *regionid;
 	const char *pnpid = dev_name(&dev->dev);
 	resource_size_t start = r->start, end = r->end;
-	bool reserved;
+	struct resource *res;
 
 	regionid = kmalloc(16, GFP_KERNEL);
 	if (!regionid)
 		return;
 
 	snprintf(regionid, 16, "pnp %s", pnpid);
-	reserved = __reserve_range(start, end - start + 1, !!port, regionid);
-	if (!reserved)
+	if (port)
+		res = request_region(start, end - start + 1, regionid);
+	else
+		res = request_mem_region(start, end - start + 1, regionid);
+	if (res)
+		res->flags &= ~IORESOURCE_BUSY;
+	else
 		kfree(regionid);
 
 	/*
@@ -66,7 +49,7 @@ static void reserve_range(struct pnp_dev *dev, struct resource *r, int port)
 	 * have double reservations.
 	 */
 	dev_info(&dev->dev, "%pR %s reserved\n", r,
-		 reserved ? "has been" : "could not be");
+		 res ? "has been" : "could not be");
 }
 
 static void reserve_resources_of_dev(struct pnp_dev *dev)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c471dfc93b71..fedfd1bea3bf 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -309,9 +309,6 @@ int acpi_check_region(resource_size_t start, resource_size_t n,
 
 int acpi_resources_are_enforced(void);
 
-int acpi_reserve_region(u64 start, unsigned int length, u8 space_id,
-			unsigned long flags, char *desc);
-
 #ifdef CONFIG_HIBERNATION
 void __init acpi_no_s4_hw_signature(void);
 #endif
@@ -507,13 +504,6 @@ static inline int acpi_check_region(resource_size_t start, resource_size_t n,
 	return 0;
 }
 
-static inline int acpi_reserve_region(u64 start, unsigned int length,
-				      u8 space_id, unsigned long flags,
-				      char *desc)
-{
-	return -ENXIO;
-}
-
 struct acpi_table_header;
 static inline int acpi_table_parse(char *id,
 				int (*handler)(struct acpi_table_header *))
-- 
cgit v1.2.3


From 26095a01d359827eeccec5459c28ddd976183179 Mon Sep 17 00:00:00 2001
From: "Suthikulpanit, Suravee" <Suravee.Suthikulpanit@amd.com>
Date: Tue, 7 Jul 2015 01:55:20 +0200
Subject: ACPI / scan: Add support for ACPI _CLS device matching

Device drivers typically use ACPI _HIDs/_CIDs listed in struct device_driver
acpi_match_table to match devices. However, for generic drivers, we do not
want to list _HID for all supported devices. Also, certain classes of devices
do not have _CID (e.g. SATA, USB). Instead, we can leverage ACPI _CLS,
which specifies PCI-defined class code (i.e. base-class, subclass and
programming interface). This patch adds support for matching ACPI devices using
the _CLS method.

To support loadable module, current design uses _HID or _CID to match device's
modalias. With the new way of matching with _CLS this would requires modification
to the current ACPI modalias key to include _CLS. This patch appends PCI-defined
class-code to the existing ACPI modalias as following.

    acpi:<HID>:<CID1>:<CID2>:..:<CIDn>:<bbsspp>:
E.g:
    # cat /sys/devices/platform/AMDI0600:00/modalias
    acpi:AMDI0600:010601:

where bb is th base-class code, ss is te sub-class code, and pp is the
programming interface code

Since there would not be _HID/_CID in the ACPI matching table of the driver,
this patch adds a field to acpi_device_id to specify the matching _CLS.

    static const struct acpi_device_id ahci_acpi_match[] = {
        { ACPI_DEVICE_CLASS(PCI_CLASS_STORAGE_SATA_AHCI, 0xffffff) },
        {},
    };

In this case, the corresponded entry in modules.alias file would be:

    alias acpi*:010601:* ahci_platform

Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/scan.c               | 32 ++++++++++++++++++++++++++++++--
 include/linux/acpi.h              | 14 ++++++++++++++
 include/linux/mod_devicetable.h   |  2 ++
 scripts/mod/devicetable-offsets.c |  2 ++
 scripts/mod/file2alias.c          | 32 ++++++++++++++++++++++++++++++--
 5 files changed, 78 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 2649a068671d..ec256352f423 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1019,6 +1019,29 @@ static bool acpi_of_match_device(struct acpi_device *adev,
 	return false;
 }
 
+static bool __acpi_match_device_cls(const struct acpi_device_id *id,
+				    struct acpi_hardware_id *hwid)
+{
+	int i, msk, byte_shift;
+	char buf[3];
+
+	if (!id->cls)
+		return false;
+
+	/* Apply class-code bitmask, before checking each class-code byte */
+	for (i = 1; i <= 3; i++) {
+		byte_shift = 8 * (3 - i);
+		msk = (id->cls_msk >> byte_shift) & 0xFF;
+		if (!msk)
+			continue;
+
+		sprintf(buf, "%02x", (id->cls >> byte_shift) & msk);
+		if (strncmp(buf, &hwid->id[(i - 1) * 2], 2))
+			return false;
+	}
+	return true;
+}
+
 static const struct acpi_device_id *__acpi_match_device(
 	struct acpi_device *device,
 	const struct acpi_device_id *ids,
@@ -1036,9 +1059,12 @@ static const struct acpi_device_id *__acpi_match_device(
 
 	list_for_each_entry(hwid, &device->pnp.ids, list) {
 		/* First, check the ACPI/PNP IDs provided by the caller. */
-		for (id = ids; id->id[0]; id++)
-			if (!strcmp((char *) id->id, hwid->id))
+		for (id = ids; id->id[0] || id->cls; id++) {
+			if (id->id[0] && !strcmp((char *) id->id, hwid->id))
 				return id;
+			else if (id->cls && __acpi_match_device_cls(id, hwid))
+				return id;
+		}
 
 		/*
 		 * Next, check ACPI_DT_NAMESPACE_HID and try to match the
@@ -2101,6 +2127,8 @@ static void acpi_set_pnp_ids(acpi_handle handle, struct acpi_device_pnp *pnp,
 		if (info->valid & ACPI_VALID_UID)
 			pnp->unique_id = kstrdup(info->unique_id.string,
 							GFP_KERNEL);
+		if (info->valid & ACPI_VALID_CLS)
+			acpi_add_id(pnp, info->class_code.string);
 
 		kfree(info);
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c471dfc93b71..bdfdb9f65c23 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -58,6 +58,19 @@ static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
 	acpi_fwnode_handle(adev) : NULL)
 #define ACPI_HANDLE(dev)		acpi_device_handle(ACPI_COMPANION(dev))
 
+/**
+ * ACPI_DEVICE_CLASS - macro used to describe an ACPI device with
+ * the PCI-defined class-code information
+ *
+ * @_cls : the class, subclass, prog-if triple for this device
+ * @_msk : the class mask for this device
+ *
+ * This macro is used to create a struct acpi_device_id that matches a
+ * specific PCI class. The .id and .driver_data fields will be left
+ * initialized with the default value.
+ */
+#define ACPI_DEVICE_CLASS(_cls, _msk)	.cls = (_cls), .cls_msk = (_msk),
+
 static inline bool has_acpi_companion(struct device *dev)
 {
 	return is_acpi_node(dev->fwnode);
@@ -446,6 +459,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *);
 #define ACPI_COMPANION(dev)		(NULL)
 #define ACPI_COMPANION_SET(dev, adev)	do { } while (0)
 #define ACPI_HANDLE(dev)		(NULL)
+#define ACPI_DEVICE_CLASS(_cls, _msk)	.cls = (0), .cls_msk = (0),
 
 struct fwnode_handle;
 
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 8183d6640ca7..34f25b7bf642 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -189,6 +189,8 @@ struct css_device_id {
 struct acpi_device_id {
 	__u8 id[ACPI_ID_LEN];
 	kernel_ulong_t driver_data;
+	__u32 cls;
+	__u32 cls_msk;
 };
 
 #define PNP_ID_LEN	8
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index eff7de1fc82e..e70fcd12eeeb 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -63,6 +63,8 @@ int main(void)
 
 	DEVID(acpi_device_id);
 	DEVID_FIELD(acpi_device_id, id);
+	DEVID_FIELD(acpi_device_id, cls);
+	DEVID_FIELD(acpi_device_id, cls_msk);
 
 	DEVID(pnp_device_id);
 	DEVID_FIELD(pnp_device_id, id);
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 84c86f3cd6cd..5f2088209132 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -523,12 +523,40 @@ static int do_serio_entry(const char *filename,
 }
 ADD_TO_DEVTABLE("serio", serio_device_id, do_serio_entry);
 
-/* looks like: "acpi:ACPI0003 or acpi:PNP0C0B" or "acpi:LNXVIDEO" */
+/* looks like: "acpi:ACPI0003" or "acpi:PNP0C0B" or "acpi:LNXVIDEO" or
+ *             "acpi:bbsspp" (bb=base-class, ss=sub-class, pp=prog-if)
+ *
+ * NOTE: Each driver should use one of the following : _HID, _CIDs
+ *       or _CLS. Also, bb, ss, and pp can be substituted with ??
+ *       as don't care byte.
+ */
 static int do_acpi_entry(const char *filename,
 			void *symval, char *alias)
 {
 	DEF_FIELD_ADDR(symval, acpi_device_id, id);
-	sprintf(alias, "acpi*:%s:*", *id);
+	DEF_FIELD_ADDR(symval, acpi_device_id, cls);
+	DEF_FIELD_ADDR(symval, acpi_device_id, cls_msk);
+
+	if (id && strlen((const char *)*id))
+		sprintf(alias, "acpi*:%s:*", *id);
+	else if (cls) {
+		int i, byte_shift, cnt = 0;
+		unsigned int msk;
+
+		sprintf(&alias[cnt], "acpi*:");
+		cnt = 6;
+		for (i = 1; i <= 3; i++) {
+			byte_shift = 8 * (3-i);
+			msk = (*cls_msk >> byte_shift) & 0xFF;
+			if (msk)
+				sprintf(&alias[cnt], "%02x",
+					(*cls >> byte_shift) & 0xFF);
+			else
+				sprintf(&alias[cnt], "??");
+			cnt += 2;
+		}
+		sprintf(&alias[cnt], ":*");
+	}
 	return 1;
 }
 ADD_TO_DEVTABLE("acpi", acpi_device_id, do_acpi_entry);
-- 
cgit v1.2.3


From f32dd117051185da6e923b35491a44d7debeeea5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jul 2015 16:29:38 +0200
Subject: tick/broadcast: Make idle check independent from mode and config

Currently the broadcast busy check, which prevents the idle code from
going into deep idle, works only in one shot mode.

If NOHZ and HIGHRES are off (config or command line) there is no
sanity check at all, so under certain conditions cpus are allowed to
go into deep idle, where the local timer stops, and are not woken up
again because there is no broadcast timer installed or a hrtimer based
broadcast device is not evaluated.

Move tick_broadcast_oneshot_control() into the common code and provide
proper subfunctions for the various config combinations.

The common check in tick_broadcast_oneshot_control() is for the C3STOP
misfeature flag of the local clock event device. If its not set, idle
can proceed. If set, further checks are necessary.

Provide checks for the trivial cases:

 - If broadcast is disabled in the config, then return busy

 - If oneshot mode (NOHZ/HIGHES) is disabled in the config, return
   busy if the broadcast device is hrtimer based.

 - If oneshot mode is enabled in the config call the original
   tick_broadcast_oneshot_control() function. That function needs
   extra checks which will be implemented in seperate patches.

[ Split out from a larger combo patch ]

Reported-and-tested-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Suzuki Poulose <Suzuki.Poulose@arm.com>
Cc: Lorenzo Pieralisi <Lorenzo.Pieralisi@arm.com>
Cc: Catalin Marinas <Catalin.Marinas@arm.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos
---
 include/linux/tick.h         |  4 ----
 kernel/time/tick-broadcast.c | 26 +++++++++++---------------
 kernel/time/tick-common.c    | 21 +++++++++++++++++++++
 kernel/time/tick-sched.h     | 10 ++++++++++
 4 files changed, 42 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 3741ba1a652c..6916dcb61857 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -67,11 +67,7 @@ extern void tick_broadcast_control(enum tick_broadcast_mode mode);
 static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { }
 #endif /* BROADCAST */
 
-#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
 extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
-#else
-static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; }
-#endif
 
 static inline void tick_broadcast_enable(void)
 {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9877d0b0aefc..ef77b16ad5df 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -685,18 +685,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
 	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 }
 
-/**
- * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
- * @state:	The target state (enter/exit)
- *
- * The system enters/leaves a state, where affected devices might stop
- * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
- *
- * Called with interrupts disabled, so clockevents_lock is not
- * required here because the local clock event device cannot go away
- * under us.
- */
-int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 {
 	struct clock_event_device *bc, *dev;
 	struct tick_device *td;
@@ -717,9 +706,6 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 	td = this_cpu_ptr(&tick_cpu_device);
 	dev = td->evtdev;
 
-	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
-		return 0;
-
 	raw_spin_lock(&tick_broadcast_lock);
 	bc = tick_broadcast_device.evtdev;
 	cpu = smp_processor_id();
@@ -961,6 +947,16 @@ bool tick_broadcast_oneshot_available(void)
 	return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
 }
 
+#else
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+	if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER))
+		return -EBUSY;
+
+	return 0;
+}
 #endif
 
 void __init tick_broadcast_init(void)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 76446cb5dfe1..55e13efff1ab 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -343,6 +343,27 @@ out_bc:
 	tick_install_broadcast_device(newdev);
 }
 
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state:	The target state (enter/exit)
+ *
+ * The system enters/leaves a state, where affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
+ */
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+	if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP))
+		return 0;
+
+	return __tick_broadcast_oneshot_control(state);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Transfer the do_timer job away from a dying cpu.
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 42fdf4958bcc..a4a8d4e9baa1 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu);
 static inline void tick_cancel_sched_timer(int cpu) { }
 #endif
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int
+__tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	return -EBUSY;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 37b64a42067a04a22468c4e52c12af00d72e462b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jul 2015 21:56:34 +0200
Subject: tick/broadcast: Unbreak CONFIG_GENERIC_CLOCKEVENTS=n build

Making tick_broadcast_oneshot_control() independent from
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST broke the build for
CONFIG_GENERIC_CLOCKEVENTS=n because the function is not defined
there.

Provide a proper stub inline.

Fixes: f32dd1170511 'tick/broadcast: Make idle check independent from mode and config'
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/tick.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 6916dcb61857..edbfc9a5293e 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -67,7 +67,14 @@ extern void tick_broadcast_control(enum tick_broadcast_mode mode);
 static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { }
 #endif /* BROADCAST */
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
 extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	return 0;
+}
+#endif
 
 static inline void tick_broadcast_enable(void)
 {
-- 
cgit v1.2.3


From a899418167264c7bac574b1a0f1b2c26c5b0995a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 5 Jul 2015 17:12:30 +0000
Subject: hotplug: Prevent alloc/free of irq descriptors during cpu up/down

When a cpu goes up some architectures (e.g. x86) have to walk the irq
space to set up the vector space for the cpu. While this needs extra
protection at the architecture level we can avoid a few race
conditions by preventing the concurrent allocation/free of irq
descriptors and the associated data.

When a cpu goes down it moves the interrupts which are targeted to
this cpu away by reassigning the affinities. While this happens
interrupts can be allocated and freed, which opens a can of race
conditions in the code which reassignes the affinities because
interrupt descriptors might be freed underneath.

Example:

CPU1				CPU2
cpu_up/down
 irq_desc = irq_to_desc(irq);
				remove_from_radix_tree(desc);
 raw_spin_lock(&desc->lock);
				free(desc);

We could protect the irq descriptors with RCU, but that would require
a full tree change of all accesses to interrupt descriptors. But
fortunately these kind of race conditions are rather limited to a few
things like cpu hotplug. The normal setup/teardown is very well
serialized. So the simpler and obvious solution is:

Prevent allocation and freeing of interrupt descriptors accross cpu
hotplug.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: xiao jin <jin.xiao@intel.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Link: http://lkml.kernel.org/r/20150705171102.063519515@linutronix.de
---
 include/linux/irqdesc.h |  7 ++++++-
 kernel/cpu.c            | 22 +++++++++++++++++++++-
 kernel/irq/internals.h  |  4 ----
 3 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 624a668e61f1..fcea4e48e21f 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -87,7 +87,12 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
-#ifndef CONFIG_SPARSE_IRQ
+#ifdef CONFIG_SPARSE_IRQ
+extern void irq_lock_sparse(void);
+extern void irq_unlock_sparse(void);
+#else
+static inline void irq_lock_sparse(void) { }
+static inline void irq_unlock_sparse(void) { }
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9c9c9fab16cc..6a374544d495 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,7 @@
 #include <linux/suspend.h>
 #include <linux/lockdep.h>
 #include <linux/tick.h>
+#include <linux/irq.h>
 #include <trace/events/power.h>
 
 #include "smpboot.h"
@@ -392,13 +393,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	smpboot_park_threads(cpu);
 
 	/*
-	 * So now all preempt/rcu users must observe !cpu_active().
+	 * Prevent irq alloc/free while the dying cpu reorganizes the
+	 * interrupt affinities.
 	 */
+	irq_lock_sparse();
 
+	/*
+	 * So now all preempt/rcu users must observe !cpu_active().
+	 */
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+		irq_unlock_sparse();
 		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
@@ -415,6 +422,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
 	per_cpu(cpu_dead_idle, cpu) = false;
 
+	/* Interrupts are moved away from the dying cpu, reenable alloc/free */
+	irq_unlock_sparse();
+
 	hotplug_cpu__broadcast_tick_pull(cpu);
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
@@ -517,8 +527,18 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	}
 
+	/*
+	 * Some architectures have to walk the irq descriptors to
+	 * setup the vector space for the cpu which comes online.
+	 * Prevent irq alloc/free across the bringup.
+	 */
+	irq_lock_sparse();
+
 	/* Arch-specific enabling code. */
 	ret = __cpu_up(cpu, idle);
+
+	irq_unlock_sparse();
+
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4834ee828c41..61008b8433ab 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -76,12 +76,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
 
 #ifdef CONFIG_SPARSE_IRQ
 static inline void irq_mark_irq(unsigned int irq) { }
-extern void irq_lock_sparse(void);
-extern void irq_unlock_sparse(void);
 #else
 extern void irq_mark_irq(unsigned int irq);
-static inline void irq_lock_sparse(void) { }
-static inline void irq_unlock_sparse(void) { }
 #endif
 
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-- 
cgit v1.2.3


From 1f6823faa8c563431a94e614d2b63ce16bb6f658 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 9 Jul 2015 10:51:46 +0200
Subject: time: Get rid of do_posix_clock_monotonic_gettime

All users gone. Remove it before we get another one.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timekeeping.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 3aa72e648650..6e191e4e6ab6 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -145,7 +145,6 @@ static inline void getboottime(struct timespec *ts)
 }
 #endif
 
-#define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
 #define ktime_get_real_ts64(ts)	getnstimeofday64(ts)
 
 /*
-- 
cgit v1.2.3


From 757856d2b9568a701df9ea6a4be68effbb9d6f44 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 25 Jun 2015 17:47:45 +0300
Subject: libceph: enable ceph in a non-default network namespace

Grab a reference on a network namespace of the 'rbd map' (in case of
rbd) or 'mount' (in case of ceph) process and use that to open sockets
instead of always using init_net and bailing if network namespace is
anything but init_net.  Be careful to not share struct ceph_client
instances between different namespaces and don't add any code in the
!CONFIG_NET_NS case.

This is based on a patch from Hong Zhiguo <zhiguohong@tencent.com>.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 include/linux/ceph/messenger.h |  3 +++
 net/ceph/ceph_common.c         | 16 ++++++++++------
 net/ceph/messenger.c           | 10 +++++++++-
 3 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index e15499422fdc..37753278987a 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -8,6 +8,7 @@
 #include <linux/radix-tree.h>
 #include <linux/uio.h>
 #include <linux/workqueue.h>
+#include <net/net_namespace.h>
 
 #include <linux/ceph/types.h>
 #include <linux/ceph/buffer.h>
@@ -56,6 +57,7 @@ struct ceph_messenger {
 	struct ceph_entity_addr my_enc_addr;
 
 	atomic_t stopping;
+	possible_net_t net;
 	bool nocrc;
 	bool tcp_nodelay;
 
@@ -267,6 +269,7 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr,
 			u64 required_features,
 			bool nocrc,
 			bool tcp_nodelay);
+extern void ceph_messenger_fini(struct ceph_messenger *msgr);
 
 extern void ceph_con_init(struct ceph_connection *con, void *private,
 			const struct ceph_connection_operations *ops,
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index cb7db320dd27..f30329f72641 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -9,6 +9,7 @@
 #include <keys/ceph-type.h>
 #include <linux/module.h>
 #include <linux/mount.h>
+#include <linux/nsproxy.h>
 #include <linux/parser.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
@@ -16,8 +17,6 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
 #include <linux/vmalloc.h>
-#include <linux/nsproxy.h>
-#include <net/net_namespace.h>
 
 
 #include <linux/ceph/ceph_features.h>
@@ -131,6 +130,13 @@ int ceph_compare_options(struct ceph_options *new_opt,
 	int i;
 	int ret;
 
+	/*
+	 * Don't bother comparing options if network namespaces don't
+	 * match.
+	 */
+	if (!net_eq(current->nsproxy->net_ns, read_pnet(&client->msgr.net)))
+		return -1;
+
 	ret = memcmp(opt1, opt2, ofs);
 	if (ret)
 		return ret;
@@ -335,9 +341,6 @@ ceph_parse_options(char *options, const char *dev_name,
 	int err = -ENOMEM;
 	substring_t argstr[MAX_OPT_ARGS];
 
-	if (current->nsproxy->net_ns != &init_net)
-		return ERR_PTR(-EINVAL);
-
 	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
 	if (!opt)
 		return ERR_PTR(-ENOMEM);
@@ -608,6 +611,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
 fail_monc:
 	ceph_monc_stop(&client->monc);
 fail:
+	ceph_messenger_fini(&client->msgr);
 	kfree(client);
 	return ERR_PTR(err);
 }
@@ -621,8 +625,8 @@ void ceph_destroy_client(struct ceph_client *client)
 
 	/* unmount */
 	ceph_osdc_stop(&client->osdc);
-
 	ceph_monc_stop(&client->monc);
+	ceph_messenger_fini(&client->msgr);
 
 	ceph_debugfs_client_cleanup(client);
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1679f47280e2..5c1f98ea6741 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -6,6 +6,7 @@
 #include <linux/inet.h>
 #include <linux/kthread.h>
 #include <linux/net.h>
+#include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/string.h>
@@ -479,7 +480,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 	int ret;
 
 	BUG_ON(con->sock);
-	ret = sock_create_kern(&init_net, con->peer_addr.in_addr.ss_family,
+	ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
 			       SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (ret)
 		return ret;
@@ -2944,11 +2945,18 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
 	msgr->tcp_nodelay = tcp_nodelay;
 
 	atomic_set(&msgr->stopping, 0);
+	write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
 
 	dout("%s %p\n", __func__, msgr);
 }
 EXPORT_SYMBOL(ceph_messenger_init);
 
+void ceph_messenger_fini(struct ceph_messenger *msgr)
+{
+	put_net(read_pnet(&msgr->net));
+}
+EXPORT_SYMBOL(ceph_messenger_fini);
+
 static void clear_standby(struct ceph_connection *con)
 {
 	/* come back from STANDBY? */
-- 
cgit v1.2.3


From 7876f930d0e78addc6bbdbba0d6c196a0788d545 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2015 16:39:49 -0400
Subject: blkcg: implement all_blkcgs list

Add all_blkcgs list goes through blkcg->all_blkcgs_node and is
protected by blkcg_pol_mutex.  This will be used to fix
blkcg_policy_data allocation bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 17 ++++++++++++-----
 include/linux/blk-cgroup.h |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 05b893de516b..42ff436ffaf4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -46,6 +46,8 @@ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
+static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
+
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
 {
@@ -817,6 +819,10 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 
+	mutex_lock(&blkcg_pol_mutex);
+	list_del(&blkcg->all_blkcgs_node);
+	mutex_unlock(&blkcg_pol_mutex);
+
 	if (blkcg != &blkcg_root) {
 		int i;
 
@@ -833,6 +839,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	struct cgroup_subsys_state *ret;
 	int i;
 
+	mutex_lock(&blkcg_pol_mutex);
+
 	if (!parent_css) {
 		blkcg = &blkcg_root;
 		goto done;
@@ -844,8 +852,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		goto free_blkcg;
 	}
 
-	mutex_lock(&blkcg_pol_mutex);
-
 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -862,7 +868,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		BUG_ON(blkcg->pd[i]);
 		cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
 		if (!cpd) {
-			mutex_unlock(&blkcg_pol_mutex);
 			ret = ERR_PTR(-ENOMEM);
 			goto free_pd_blkcg;
 		}
@@ -871,7 +876,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		pol->cpd_init_fn(blkcg);
 	}
 
-	mutex_unlock(&blkcg_pol_mutex);
 done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
@@ -879,14 +883,17 @@ done:
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&blkcg->cgwb_list);
 #endif
+	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
+
+	mutex_unlock(&blkcg_pol_mutex);
 	return &blkcg->css;
 
 free_pd_blkcg:
 	for (i--; i >= 0; i--)
 		kfree(blkcg->pd[i]);
-
 free_blkcg:
 	kfree(blkcg);
+	mutex_unlock(&blkcg_pol_mutex);
 	return ret;
 }
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 58cfab80dd70..cf3e7bc22ef3 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -47,6 +47,7 @@ struct blkcg {
 
 	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
 
+	struct list_head		all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head		cgwb_list;
 #endif
-- 
cgit v1.2.3


From 06b285bd11257bccc5a1b85a835507e33656aff2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2015 16:39:50 -0400
Subject: blkcg: fix blkcg_policy_data allocation bug

e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg
data") updated per-blkcg policy data to be dynamically allocated.
When a policy is registered, its policy data aren't created.  Instead,
when the policy is activated on a queue, the policy data are allocated
if there are blkg's (blkcg_gq's) which are attached to a given blkcg.
This is buggy.  Consider the following scenario.

1. A blkcg is created.  No blkg's attached yet.

2. The policy is registered.  No policy data is allocated.

3. The policy is activated on a queue.  As the above blkcg doesn't
   have any blkg's, it won't allocate the matching blkcg_policy_data.

4. An IO is issued from the blkcg and blkg is created and the blkcg
   still doesn't have the matching policy data allocated.

With cfq-iosched, this leads to an oops.

It also doesn't free policy data on policy unregistration assuming
that freeing of all policy data on blkcg destruction should take care
of it; however, this also is incorrect.

1. A blkcg has policy data.

2. The policy gets unregistered but the policy data remains.

3. Another policy gets registered on the same slot.

4. Later, the new policy tries to allocate policy data on the previous
   blkcg but the slot is already occupied and gets skipped.  The
   policy ends up operating on the policy data of the previous policy.

There's no reason to manage blkcg_policy_data lazily.  The reason we
do lazy allocation of blkg's is that the number of all possible blkg's
is the product of cgroups and block devices which can reach a
surprising level.  blkcg_policy_data is contrained by the number of
cgroups and shouldn't be a problem.

This patch makes blkcg_policy_data to be allocated for all existing
blkcg's on policy registration and freed on unregistration and removes
blkcg_policy_data handling from policy [de]activation paths.  This
makes that blkcg_policy_data are created and removed with the policy
they belong to and fixes the above described problems.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg data")
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 78 +++++++++++++++++++++++++---------------------
 include/linux/blk-cgroup.h | 10 ++----
 2 files changed, 44 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 42ff436ffaf4..9da02c021ebe 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1048,10 +1048,8 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
-	LIST_HEAD(cpds);
 	struct blkcg_gq *blkg;
 	struct blkg_policy_data *pd, *nd;
-	struct blkcg_policy_data *cpd, *cnd;
 	int cnt = 0, ret;
 
 	if (blkcg_policy_enabled(q, pol))
@@ -1064,10 +1062,7 @@ int blkcg_activate_policy(struct request_queue *q,
 		cnt++;
 	spin_unlock_irq(q->queue_lock);
 
-	/*
-	 * Allocate per-blkg and per-blkcg policy data
-	 * for all existing blkgs.
-	 */
+	/* allocate per-blkg policy data for all existing blkgs */
 	while (cnt--) {
 		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
 		if (!pd) {
@@ -1075,15 +1070,6 @@ int blkcg_activate_policy(struct request_queue *q,
 			goto out_free;
 		}
 		list_add_tail(&pd->alloc_node, &pds);
-
-		if (!pol->cpd_size)
-			continue;
-		cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
-		if (!cpd) {
-			ret = -ENOMEM;
-			goto out_free;
-		}
-		list_add_tail(&cpd->alloc_node, &cpds);
 	}
 
 	/*
@@ -1093,32 +1079,17 @@ int blkcg_activate_policy(struct request_queue *q,
 	spin_lock_irq(q->queue_lock);
 
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		if (WARN_ON(list_empty(&pds)) ||
-		    WARN_ON(pol->cpd_size && list_empty(&cpds))) {
+		if (WARN_ON(list_empty(&pds))) {
 			/* umm... this shouldn't happen, just abort */
 			ret = -ENOMEM;
 			goto out_unlock;
 		}
-		cpd = list_first_entry(&cpds, struct blkcg_policy_data,
-				       alloc_node);
-		list_del_init(&cpd->alloc_node);
 		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
 		list_del_init(&pd->alloc_node);
 
 		/* grab blkcg lock too while installing @pd on @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
-		if (!pol->cpd_size)
-			goto no_cpd;
-		if (!blkg->blkcg->pd[pol->plid]) {
-			/* Per-policy per-blkcg data */
-			blkg->blkcg->pd[pol->plid] = cpd;
-			cpd->plid = pol->plid;
-			pol->cpd_init_fn(blkg->blkcg);
-		} else { /* must free it as it has already been extracted */
-			kfree(cpd);
-		}
-no_cpd:
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
@@ -1135,8 +1106,6 @@ out_free:
 	blk_queue_bypass_end(q);
 	list_for_each_entry_safe(pd, nd, &pds, alloc_node)
 		kfree(pd);
-	list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
-		kfree(cpd);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1191,6 +1160,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
  */
 int blkcg_policy_register(struct blkcg_policy *pol)
 {
+	struct blkcg *blkcg;
 	int i, ret;
 
 	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
@@ -1207,9 +1177,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	if (i >= BLKCG_MAX_POLS)
 		goto err_unlock;
 
-	/* register and update blkgs */
+	/* register @pol */
 	pol->plid = i;
-	blkcg_policy[i] = pol;
+	blkcg_policy[pol->plid] = pol;
+
+	/* allocate and install cpd's */
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			struct blkcg_policy_data *cpd;
+
+			cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+			if (!cpd) {
+				mutex_unlock(&blkcg_pol_mutex);
+				goto err_free_cpds;
+			}
+
+			blkcg->pd[pol->plid] = cpd;
+			cpd->plid = pol->plid;
+			pol->cpd_init_fn(blkcg);
+		}
+	}
+
 	mutex_unlock(&blkcg_pol_mutex);
 
 	/* everything is in place, add intf files for the new policy */
@@ -1219,6 +1207,14 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	mutex_unlock(&blkcg_pol_register_mutex);
 	return 0;
 
+err_free_cpds:
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			kfree(blkcg->pd[pol->plid]);
+			blkcg->pd[pol->plid] = NULL;
+		}
+	}
+	blkcg_policy[pol->plid] = NULL;
 err_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
 	mutex_unlock(&blkcg_pol_register_mutex);
@@ -1234,6 +1230,8 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
  */
 void blkcg_policy_unregister(struct blkcg_policy *pol)
 {
+	struct blkcg *blkcg;
+
 	mutex_lock(&blkcg_pol_register_mutex);
 
 	if (WARN_ON(blkcg_policy[pol->plid] != pol))
@@ -1243,9 +1241,17 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 	if (pol->cftypes)
 		cgroup_rm_cftypes(pol->cftypes);
 
-	/* unregister and update blkgs */
+	/* remove cpds and unregister */
 	mutex_lock(&blkcg_pol_mutex);
+
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			kfree(blkcg->pd[pol->plid]);
+			blkcg->pd[pol->plid] = NULL;
+		}
+	}
 	blkcg_policy[pol->plid] = NULL;
+
 	mutex_unlock(&blkcg_pol_mutex);
 out_unlock:
 	mutex_unlock(&blkcg_pol_register_mutex);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index cf3e7bc22ef3..1b62d768c7df 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -89,18 +89,12 @@ struct blkg_policy_data {
  * Policies that need to keep per-blkcg data which is independent
  * from any request_queue associated to it must specify its size
  * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it. blkcg core allocates
- * policy-specific per-blkcg structures lazily the first time
- * they are actually needed, so it handles them together with
- * blkgs. cpd_init() is invoked to let each policy handle
- * per-blkcg data.
+ * embed a blkcg_policy_data in it.  cpd_init() is invoked to let
+ * each policy handle per-blkcg data.
  */
 struct blkcg_policy_data {
 	/* the policy id this per-policy data belongs to */
 	int				plid;
-
-	/* used during policy activation */
-	struct list_head		alloc_node;
 };
 
 /* association between a blk cgroup and a request queue */
-- 
cgit v1.2.3


From 4a0e3e989d66bb7204b163d9cfaa7fa96d0f2023 Mon Sep 17 00:00:00 2001
From: Enrico Mioso <mrkiko.rs@gmail.com>
Date: Wed, 8 Jul 2015 13:05:57 +0200
Subject: cdc_ncm: Add support for moving NDP to end of NCM frame

NCM specs are not actually mandating a specific position in the frame for
the NDP (Network Datagram Pointer). However, some Huawei devices will
ignore our aggregates if it is not placed after the datagrams it points
to. Add support for doing just this, in a per-device configurable way.
While at it, update NCM subdrivers, disabling this functionality in all of
them, except in huawei_cdc_ncm where it is enabled instead.
We aren't making any distinction between different Huawei NCM devices,
based on what the vendor driver does. Standard NCM devices are left
unaffected: if they are compliant, they should be always usable, still
stay on the safe side.

This change has been tested and working with a Huawei E3131 device (which
works regardless of NDP position), a Huawei E3531 (also working both
ways) and an E3372 (which mandates NDP to be after indexed datagrams).

V1->V2:
- corrected wrong NDP acronym definition
- fixed possible NULL pointer dereference
- patch cleanup
V2->V3:
- Properly account for the NDP size when writing new packets to SKB

Signed-off-by: Enrico Mioso <mrkiko.rs@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_mbim.c       |  2 +-
 drivers/net/usb/cdc_ncm.c        | 61 ++++++++++++++++++++++++++++++++++++----
 drivers/net/usb/huawei_cdc_ncm.c |  7 +++--
 include/linux/usb/cdc_ncm.h      |  7 ++++-
 4 files changed, 67 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c
index e4b7a47a825c..efc18e05af0a 100644
--- a/drivers/net/usb/cdc_mbim.c
+++ b/drivers/net/usb/cdc_mbim.c
@@ -158,7 +158,7 @@ static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf)
 	if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting))
 		goto err;
 
-	ret = cdc_ncm_bind_common(dev, intf, data_altsetting);
+	ret = cdc_ncm_bind_common(dev, intf, data_altsetting, 0);
 	if (ret)
 		goto err;
 
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 8067b8fbb0ee..1991e4a24657 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -684,10 +684,12 @@ static void cdc_ncm_free(struct cdc_ncm_ctx *ctx)
 		ctx->tx_curr_skb = NULL;
 	}
 
+	kfree(ctx->delayed_ndp16);
+
 	kfree(ctx);
 }
 
-int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting)
+int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags)
 {
 	const struct usb_cdc_union_desc *union_desc = NULL;
 	struct cdc_ncm_ctx *ctx;
@@ -855,6 +857,17 @@ advance:
 	/* finish setting up the device specific data */
 	cdc_ncm_setup(dev);
 
+	/* Device-specific flags */
+	ctx->drvflags = drvflags;
+
+	/* Allocate the delayed NDP if needed. */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) {
+		ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL);
+		if (!ctx->delayed_ndp16)
+			goto error2;
+		dev_info(&intf->dev, "NDP will be placed at end of frame for this device.");
+	}
+
 	/* override ethtool_ops */
 	dev->net->ethtool_ops = &cdc_ncm_ethtool_ops;
 
@@ -954,8 +967,11 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
 	if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM)
 		return -ENODEV;
 
-	/* The NCM data altsetting is fixed */
-	ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM);
+	/* The NCM data altsetting is fixed, so we hard-coded it.
+	 * Additionally, generic NCM devices are assumed to accept arbitrarily
+	 * placed NDP.
+	 */
+	ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0);
 
 	/*
 	 * We should get an event when network connection is "connected" or
@@ -986,6 +1002,14 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 	struct usb_cdc_ncm_nth16 *nth16 = (void *)skb->data;
 	size_t ndpoffset = le16_to_cpu(nth16->wNdpIndex);
 
+	/* If NDP should be moved to the end of the NCM package, we can't follow the
+	* NTH16 header as we would normally do. NDP isn't written to the SKB yet, and
+	* the wNdpIndex field in the header is actually not consistent with reality. It will be later.
+	*/
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)
+		if (ctx->delayed_ndp16->dwSignature == sign)
+			return ctx->delayed_ndp16;
+
 	/* follow the chain of NDPs, looking for a match */
 	while (ndpoffset) {
 		ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb->data + ndpoffset);
@@ -995,7 +1019,8 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 	}
 
 	/* align new NDP */
-	cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_max);
+	if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END))
+		cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_max);
 
 	/* verify that there is room for the NDP and the datagram (reserve) */
 	if ((ctx->tx_max - skb->len - reserve) < ctx->max_ndp_size)
@@ -1008,7 +1033,11 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 		nth16->wNdpIndex = cpu_to_le16(skb->len);
 
 	/* push a new empty NDP */
-	ndp16 = (struct usb_cdc_ncm_ndp16 *)memset(skb_put(skb, ctx->max_ndp_size), 0, ctx->max_ndp_size);
+	if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END))
+		ndp16 = (struct usb_cdc_ncm_ndp16 *)memset(skb_put(skb, ctx->max_ndp_size), 0, ctx->max_ndp_size);
+	else
+		ndp16 = ctx->delayed_ndp16;
+
 	ndp16->dwSignature = sign;
 	ndp16->wLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_ndp16) + sizeof(struct usb_cdc_ncm_dpe16));
 	return ndp16;
@@ -1023,6 +1052,15 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 	struct sk_buff *skb_out;
 	u16 n = 0, index, ndplen;
 	u8 ready2send = 0;
+	u32 delayed_ndp_size;
+
+	/* When our NDP gets written in cdc_ncm_ndp(), then skb_out->len gets updated
+	 * accordingly. Otherwise, we should check here.
+	 */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)
+		delayed_ndp_size = ctx->max_ndp_size;
+	else
+		delayed_ndp_size = 0;
 
 	/* if there is a remaining skb, it gets priority */
 	if (skb != NULL) {
@@ -1077,7 +1115,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 		cdc_ncm_align_tail(skb_out,  ctx->tx_modulus, ctx->tx_remainder, ctx->tx_max);
 
 		/* check if we had enough room left for both NDP and frame */
-		if (!ndp16 || skb_out->len + skb->len > ctx->tx_max) {
+		if (!ndp16 || skb_out->len + skb->len + delayed_ndp_size > ctx->tx_max) {
 			if (n == 0) {
 				/* won't fit, MTU problem? */
 				dev_kfree_skb_any(skb);
@@ -1150,6 +1188,17 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 		/* variables will be reset at next call */
 	}
 
+	/* If requested, put NDP at end of frame. */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) {
+		nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data;
+		cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_max);
+		nth16->wNdpIndex = cpu_to_le16(skb_out->len);
+		memcpy(skb_put(skb_out, ctx->max_ndp_size), ctx->delayed_ndp16, ctx->max_ndp_size);
+
+		/* Zero out delayed NDP - signature checking will naturally fail. */
+		ndp16 = memset(ctx->delayed_ndp16, 0, ctx->max_ndp_size);
+	}
+
 	/* If collected data size is less or equal ctx->min_tx_pkt
 	 * bytes, we send buffers as it is. If we get more data, it
 	 * would be more efficient for USB HS mobile device with DMA
diff --git a/drivers/net/usb/huawei_cdc_ncm.c b/drivers/net/usb/huawei_cdc_ncm.c
index 735f7dadb9a0..2680a65cd5e4 100644
--- a/drivers/net/usb/huawei_cdc_ncm.c
+++ b/drivers/net/usb/huawei_cdc_ncm.c
@@ -73,11 +73,14 @@ static int huawei_cdc_ncm_bind(struct usbnet *usbnet_dev,
 	struct usb_driver *subdriver = ERR_PTR(-ENODEV);
 	int ret = -ENODEV;
 	struct huawei_cdc_ncm_state *drvstate = (void *)&usbnet_dev->data;
+	int drvflags = 0;
 
 	/* altsetting should always be 1 for NCM devices - so we hard-coded
-	 * it here
+	 * it here. Some huawei devices will need the NDP part of the NCM package to
+	 * be at the end of the frame.
 	 */
-	ret = cdc_ncm_bind_common(usbnet_dev, intf, 1);
+	drvflags |= CDC_NCM_FLAG_NDP_TO_END;
+	ret = cdc_ncm_bind_common(usbnet_dev, intf, 1, drvflags);
 	if (ret)
 		goto err;
 
diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h
index 7c9b484735c5..1f6526c76ee8 100644
--- a/include/linux/usb/cdc_ncm.h
+++ b/include/linux/usb/cdc_ncm.h
@@ -80,6 +80,9 @@
 #define CDC_NCM_TIMER_INTERVAL_MIN		5UL
 #define CDC_NCM_TIMER_INTERVAL_MAX		(U32_MAX / NSEC_PER_USEC)
 
+/* Driver flags */
+#define CDC_NCM_FLAG_NDP_TO_END	0x02		/* NDP is placed at end of frame */
+
 #define cdc_ncm_comm_intf_is_mbim(x)  ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \
 				       (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE)
 #define cdc_ncm_data_intf_is_mbim(x)  ((x)->desc.bInterfaceProtocol == USB_CDC_MBIM_PROTO_NTB)
@@ -103,9 +106,11 @@ struct cdc_ncm_ctx {
 
 	spinlock_t mtx;
 	atomic_t stop;
+	int drvflags;
 
 	u32 timer_interval;
 	u32 max_ndp_size;
+	struct usb_cdc_ncm_ndp16 *delayed_ndp16;
 
 	u32 tx_timer_pending;
 	u32 tx_curr_frame_num;
@@ -133,7 +138,7 @@ struct cdc_ncm_ctx {
 };
 
 u8 cdc_ncm_select_altsetting(struct usb_interface *intf);
-int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting);
+int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags);
 void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf);
 struct sk_buff *cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign);
 int cdc_ncm_rx_verify_nth16(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in);
-- 
cgit v1.2.3


From 5544eb9b81940647b8fad1f251b37cbe2819ce44 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 7 Jul 2015 15:41:58 +0200
Subject: KVM: count number of assigned devices

If there are no assigned devices, the guest PAT are not providing
any useful information and can be overridden to writeback; VMX
always does this because it has the "IPAT" bit in its extended
page table entries, but SVM does not have anything similar.
Hook into VFIO and legacy device assignment so that they
provide this information to KVM.

Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Tested-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/iommu.c            |  2 ++
 arch/x86/kvm/x86.c              | 18 ++++++++++++++++++
 include/linux/kvm_host.h        | 18 ++++++++++++++++++
 virt/kvm/vfio.c                 |  5 +++++
 5 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2a7f5d782c33..49ec9038ec14 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -604,6 +604,8 @@ struct kvm_arch {
 	bool iommu_noncoherent;
 #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
 	atomic_t noncoherent_dma_count;
+#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
+	atomic_t assigned_device_count;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 7dbced309ddb..5c520ebf6343 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -200,6 +200,7 @@ int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
 			goto out_unmap;
 	}
 
+	kvm_arch_start_assignment(kvm);
 	pci_set_dev_assigned(pdev);
 
 	dev_info(&pdev->dev, "kvm assign device\n");
@@ -224,6 +225,7 @@ int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
 	iommu_detach_device(domain, &pdev->dev);
 
 	pci_clear_dev_assigned(pdev);
+	kvm_arch_end_assignment(kvm);
 
 	dev_info(&pdev->dev, "kvm deassign device\n");
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6bd19c7abc65..0024968b342d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8213,6 +8213,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
 			kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
+void kvm_arch_start_assignment(struct kvm *kvm)
+{
+	atomic_inc(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
+
+void kvm_arch_end_assignment(struct kvm *kvm)
+{
+	atomic_dec(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+
+bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+	return atomic_read(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+
 void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
 {
 	atomic_inc(&kvm->arch.noncoherent_dma_count);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9564fd78c547..05e99b8ef465 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -734,6 +734,24 @@ static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 	return false;
 }
 #endif
+#ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE
+void kvm_arch_start_assignment(struct kvm *kvm);
+void kvm_arch_end_assignment(struct kvm *kvm);
+bool kvm_arch_has_assigned_device(struct kvm *kvm);
+#else
+static inline void kvm_arch_start_assignment(struct kvm *kvm)
+{
+}
+
+static inline void kvm_arch_end_assignment(struct kvm *kvm)
+{
+}
+
+static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+	return false;
+}
+#endif
 
 static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 {
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 620e37f741b8..1dd087da6f31 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -155,6 +155,8 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 		list_add_tail(&kvg->node, &kv->group_list);
 		kvg->vfio_group = vfio_group;
 
+		kvm_arch_start_assignment(dev->kvm);
+
 		mutex_unlock(&kv->lock);
 
 		kvm_vfio_update_coherency(dev);
@@ -190,6 +192,8 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 			break;
 		}
 
+		kvm_arch_end_assignment(dev->kvm);
+
 		mutex_unlock(&kv->lock);
 
 		kvm_vfio_group_put_external_user(vfio_group);
@@ -239,6 +243,7 @@ static void kvm_vfio_destroy(struct kvm_device *dev)
 		kvm_vfio_group_put_external_user(kvg->vfio_group);
 		list_del(&kvg->node);
 		kfree(kvg);
+		kvm_arch_end_assignment(dev->kvm);
 	}
 
 	kvm_vfio_update_coherency(dev);
-- 
cgit v1.2.3


From d3b58c47d330de8c29898fe9746f7530408f8a59 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Fri, 26 Jun 2015 11:58:19 +0200
Subject: can: replace timestamp as unique skb attribute

Commit 514ac99c64b "can: fix multiple delivery of a single CAN frame for
overlapping CAN filters" requires the skb->tstamp to be set to check for
identical CAN skbs.

Without timestamping to be required by user space applications this timestamp
was not generated which lead to commit 36c01245eb8 "can: fix loss of CAN frames
in raw_rcv" - which forces the timestamp to be set in all CAN related skbuffs
by introducing several __net_timestamp() calls.

This forces e.g. out of tree drivers which are not using alloc_can{,fd}_skb()
to add __net_timestamp() after skbuff creation to prevent the frame loss fixed
in mainline Linux.

This patch removes the timestamp dependency and uses an atomic counter to
create an unique identifier together with the skbuff pointer.

Btw: the new skbcnt element introduced in struct can_skb_priv has to be
initialized with zero in out-of-tree drivers which are not using
alloc_can{,fd}_skb() too.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: linux-stable <stable@vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c   |  7 ++-----
 drivers/net/can/slcan.c |  2 +-
 drivers/net/can/vcan.c  |  3 ---
 include/linux/can/skb.h |  2 ++
 net/can/af_can.c        | 12 +++++++-----
 net/can/bcm.c           |  2 ++
 net/can/raw.c           |  7 ++++---
 7 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index e9b1810d319f..aede704605c6 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -440,9 +440,6 @@ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx)
 		struct can_frame *cf = (struct can_frame *)skb->data;
 		u8 dlc = cf->can_dlc;
 
-		if (!(skb->tstamp.tv64))
-			__net_timestamp(skb);
-
 		netif_rx(priv->echo_skb[idx]);
 		priv->echo_skb[idx] = NULL;
 
@@ -578,7 +575,6 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
 	if (unlikely(!skb))
 		return NULL;
 
-	__net_timestamp(skb);
 	skb->protocol = htons(ETH_P_CAN);
 	skb->pkt_type = PACKET_BROADCAST;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -589,6 +585,7 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	*cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
 	memset(*cf, 0, sizeof(struct can_frame));
@@ -607,7 +604,6 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 	if (unlikely(!skb))
 		return NULL;
 
-	__net_timestamp(skb);
 	skb->protocol = htons(ETH_P_CANFD);
 	skb->pkt_type = PACKET_BROADCAST;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -618,6 +614,7 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	*cfd = (struct canfd_frame *)skb_put(skb, sizeof(struct canfd_frame));
 	memset(*cfd, 0, sizeof(struct canfd_frame));
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index f64f5290d6f8..a23a7af8eb9a 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -207,7 +207,6 @@ static void slc_bump(struct slcan *sl)
 	if (!skb)
 		return;
 
-	__net_timestamp(skb);
 	skb->dev = sl->dev;
 	skb->protocol = htons(ETH_P_CAN);
 	skb->pkt_type = PACKET_BROADCAST;
@@ -215,6 +214,7 @@ static void slc_bump(struct slcan *sl)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = sl->dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	memcpy(skb_put(skb, sizeof(struct can_frame)),
 	       &cf, sizeof(struct can_frame));
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index 0ce868de855d..674f367087c5 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -78,9 +78,6 @@ static void vcan_rx(struct sk_buff *skb, struct net_device *dev)
 	skb->dev       = dev;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	if (!(skb->tstamp.tv64))
-		__net_timestamp(skb);
-
 	netif_rx_ni(skb);
 }
 
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index b6a52a4b457a..51bb6532785c 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -27,10 +27,12 @@
 /**
  * struct can_skb_priv - private additional data inside CAN sk_buffs
  * @ifindex:	ifindex of the first interface the CAN frame appeared on
+ * @skbcnt:	atomic counter to have an unique id together with skb pointer
  * @cf:		align to the following CAN frame at skb->data
  */
 struct can_skb_priv {
 	int ifindex;
+	int skbcnt;
 	struct can_frame cf[0];
 };
 
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 7933e62a7318..166d436196c1 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -89,6 +89,8 @@ struct timer_list can_stattimer;   /* timer for statistics update */
 struct s_stats    can_stats;       /* packet statistics */
 struct s_pstats   can_pstats;      /* receive list statistics */
 
+static atomic_t skbcounter = ATOMIC_INIT(0);
+
 /*
  * af_can socket functions
  */
@@ -310,12 +312,8 @@ int can_send(struct sk_buff *skb, int loop)
 		return err;
 	}
 
-	if (newskb) {
-		if (!(newskb->tstamp.tv64))
-			__net_timestamp(newskb);
-
+	if (newskb)
 		netif_rx_ni(newskb);
-	}
 
 	/* update statistics */
 	can_stats.tx_frames++;
@@ -683,6 +681,10 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
 	can_stats.rx_frames++;
 	can_stats.rx_frames_delta++;
 
+	/* create non-zero unique skb identifier together with *skb */
+	while (!(can_skb_prv(skb)->skbcnt))
+		can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter);
+
 	rcu_read_lock();
 
 	/* deliver the packet to sockets listening on all devices */
diff --git a/net/can/bcm.c b/net/can/bcm.c
index b523453585be..a1ba6875c2a2 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -261,6 +261,7 @@ static void bcm_can_tx(struct bcm_op *op)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	memcpy(skb_put(skb, CFSIZ), cf, CFSIZ);
 
@@ -1217,6 +1218,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
 	}
 
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 	skb->dev = dev;
 	can_skb_set_owner(skb, sk);
 	err = can_send(skb, 1); /* send with loopback */
diff --git a/net/can/raw.c b/net/can/raw.c
index 31b9748cbb4e..2e67b1423cd3 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -75,7 +75,7 @@ MODULE_ALIAS("can-proto-1");
  */
 
 struct uniqframe {
-	ktime_t tstamp;
+	int skbcnt;
 	const struct sk_buff *skb;
 	unsigned int join_rx_count;
 };
@@ -133,7 +133,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
 
 	/* eliminate multiple filter matches for the same skb */
 	if (this_cpu_ptr(ro->uniq)->skb == oskb &&
-	    ktime_equal(this_cpu_ptr(ro->uniq)->tstamp, oskb->tstamp)) {
+	    this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) {
 		if (ro->join_filters) {
 			this_cpu_inc(ro->uniq->join_rx_count);
 			/* drop frame until all enabled filters matched */
@@ -144,7 +144,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
 		}
 	} else {
 		this_cpu_ptr(ro->uniq)->skb = oskb;
-		this_cpu_ptr(ro->uniq)->tstamp = oskb->tstamp;
+		this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt;
 		this_cpu_ptr(ro->uniq)->join_rx_count = 1;
 		/* drop first frame to check all enabled filters? */
 		if (ro->join_filters && ro->count > 1)
@@ -749,6 +749,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	err = memcpy_from_msg(skb_put(skb, size), msg, size);
 	if (err < 0)
-- 
cgit v1.2.3


From 29d01b22eaa18d8b46091d3c98c6001c49f78e4a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Sat, 11 Jul 2015 06:43:02 -0400
Subject: locks: new helpers - flock_lock_inode_wait and posix_lock_inode_wait

Allow callers to pass in an inode instead of a filp.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: "J. Bruce Fields" <bfields@fieldses.org>
Tested-by: "J. Bruce Fields" <bfields@fieldses.org>
---
 fs/locks.c         | 50 ++++++++++++++++++++++++++++++++++++++------------
 include/linux/fs.h | 14 ++++++++++++++
 2 files changed, 52 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 4366b7c54e6d..ba268a503c1b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1163,20 +1163,19 @@ int posix_lock_file(struct file *filp, struct file_lock *fl,
 EXPORT_SYMBOL(posix_lock_file);
 
 /**
- * posix_lock_file_wait - Apply a POSIX-style lock to a file
- * @filp: The file to apply the lock to
+ * posix_lock_inode_wait - Apply a POSIX-style lock to a file
+ * @inode: inode of file to which lock request should be applied
  * @fl: The lock to be applied
  *
- * Add a POSIX style lock to a file.
- * We merge adjacent & overlapping locks whenever possible.
- * POSIX locks are sorted by owner task, then by starting address
+ * Variant of posix_lock_file_wait that does not take a filp, and so can be
+ * used after the filp has already been torn down.
  */
-int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int error;
 	might_sleep ();
 	for (;;) {
-		error = posix_lock_file(filp, fl, NULL);
+		error = __posix_lock_file(inode, fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1188,6 +1187,21 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 	}
 	return error;
 }
+EXPORT_SYMBOL(posix_lock_inode_wait);
+
+/**
+ * posix_lock_file_wait - Apply a POSIX-style lock to a file
+ * @filp: The file to apply the lock to
+ * @fl: The lock to be applied
+ *
+ * Add a POSIX style lock to a file.
+ * We merge adjacent & overlapping locks whenever possible.
+ * POSIX locks are sorted by owner task, then by starting address
+ */
+int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	return posix_lock_inode_wait(file_inode(filp), fl);
+}
 EXPORT_SYMBOL(posix_lock_file_wait);
 
 /**
@@ -1850,18 +1864,18 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 }
 
 /**
- * flock_lock_file_wait - Apply a FLOCK-style lock to a file
- * @filp: The file to apply the lock to
+ * flock_lock_inode_wait - Apply a FLOCK-style lock to a file
+ * @inode: inode of the file to apply to
  * @fl: The lock to be applied
  *
- * Add a FLOCK style lock to a file.
+ * Apply a FLOCK style lock request to an inode.
  */
-int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int error;
 	might_sleep();
 	for (;;) {
-		error = flock_lock_inode(file_inode(filp), fl);
+		error = flock_lock_inode(inode, fl);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1873,7 +1887,19 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
 	}
 	return error;
 }
+EXPORT_SYMBOL(flock_lock_inode_wait);
 
+/**
+ * flock_lock_file_wait - Apply a FLOCK-style lock to a file
+ * @filp: The file to apply the lock to
+ * @fl: The lock to be applied
+ *
+ * Add a FLOCK style lock to a file.
+ */
+int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	return flock_lock_inode_wait(file_inode(filp), fl);
+}
 EXPORT_SYMBOL(flock_lock_file_wait);
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0653e560c26..4c990edd1377 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1046,11 +1046,13 @@ extern void locks_remove_file(struct file *);
 extern void locks_release_private(struct file_lock *);
 extern void posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
+extern int posix_lock_inode_wait(struct inode *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern int posix_unblock_lock(struct file_lock *);
 extern int vfs_test_lock(struct file *, struct file_lock *);
 extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
 extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
+extern int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl);
 extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
 extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
 extern void lease_get_mtime(struct inode *, struct timespec *time);
@@ -1137,6 +1139,12 @@ static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
 	return -ENOLCK;
 }
 
+static inline int posix_lock_inode_wait(struct inode *inode,
+					struct file_lock *fl)
+{
+	return -ENOLCK;
+}
+
 static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 {
 	return -ENOLCK;
@@ -1163,6 +1171,12 @@ static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 	return 0;
 }
 
+static inline int flock_lock_inode_wait(struct inode *inode,
+					struct file_lock *request)
+{
+	return -ENOLCK;
+}
+
 static inline int flock_lock_file_wait(struct file *filp,
 				       struct file_lock *request)
 {
-- 
cgit v1.2.3


From ee296d7c5709440f8abd36b5b65c6b3e388538d9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Sat, 11 Jul 2015 06:43:03 -0400
Subject: locks: inline posix_lock_file_wait and flock_lock_file_wait

They just call file_inode and then the corresponding *_inode_file_wait
function. Just make them static inlines instead.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/locks.c         | 28 ----------------------------
 include/linux/fs.h | 32 ++++++++++++++------------------
 2 files changed, 14 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index ba268a503c1b..d3d558ba4da7 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1189,21 +1189,6 @@ int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 }
 EXPORT_SYMBOL(posix_lock_inode_wait);
 
-/**
- * posix_lock_file_wait - Apply a POSIX-style lock to a file
- * @filp: The file to apply the lock to
- * @fl: The lock to be applied
- *
- * Add a POSIX style lock to a file.
- * We merge adjacent & overlapping locks whenever possible.
- * POSIX locks are sorted by owner task, then by starting address
- */
-int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
-{
-	return posix_lock_inode_wait(file_inode(filp), fl);
-}
-EXPORT_SYMBOL(posix_lock_file_wait);
-
 /**
  * locks_mandatory_locked - Check for an active lock
  * @file: the file to check
@@ -1889,19 +1874,6 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 }
 EXPORT_SYMBOL(flock_lock_inode_wait);
 
-/**
- * flock_lock_file_wait - Apply a FLOCK-style lock to a file
- * @filp: The file to apply the lock to
- * @fl: The lock to be applied
- *
- * Add a FLOCK style lock to a file.
- */
-int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
-{
-	return flock_lock_inode_wait(file_inode(filp), fl);
-}
-EXPORT_SYMBOL(flock_lock_file_wait);
-
 /**
  *	sys_flock: - flock() system call.
  *	@fd: the file descriptor to lock.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4c990edd1377..cc008c338f5a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1047,13 +1047,11 @@ extern void locks_release_private(struct file_lock *);
 extern void posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
 extern int posix_lock_inode_wait(struct inode *, struct file_lock *);
-extern int posix_lock_file_wait(struct file *, struct file_lock *);
 extern int posix_unblock_lock(struct file_lock *);
 extern int vfs_test_lock(struct file *, struct file_lock *);
 extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
 extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
 extern int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl);
-extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl);
 extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
 extern void lease_get_mtime(struct inode *, struct timespec *time);
 extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
@@ -1145,11 +1143,6 @@ static inline int posix_lock_inode_wait(struct inode *inode,
 	return -ENOLCK;
 }
 
-static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
-{
-	return -ENOLCK;
-}
-
 static inline int posix_unblock_lock(struct file_lock *waiter)
 {
 	return -ENOENT;
@@ -1177,12 +1170,6 @@ static inline int flock_lock_inode_wait(struct inode *inode,
 	return -ENOLCK;
 }
 
-static inline int flock_lock_file_wait(struct file *filp,
-				       struct file_lock *request)
-{
-	return -ENOLCK;
-}
-
 static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	return 0;
@@ -1216,6 +1203,20 @@ static inline void show_fd_locks(struct seq_file *f,
 			struct file *filp, struct files_struct *files) {}
 #endif /* !CONFIG_FILE_LOCKING */
 
+static inline struct inode *file_inode(const struct file *f)
+{
+	return f->f_inode;
+}
+
+static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	return posix_lock_inode_wait(file_inode(filp), fl);
+}
+
+static inline int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+{
+	return flock_lock_inode_wait(file_inode(filp), fl);
+}
 
 struct fasync_struct {
 	spinlock_t		fa_lock;
@@ -2025,11 +2026,6 @@ extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern int generic_update_time(struct inode *, struct timespec *, int);
 
-static inline struct inode *file_inode(const struct file *f)
-{
-	return f->f_inode;
-}
-
 /* /sys/fs */
 extern struct kobject *fs_kobj;
 
-- 
cgit v1.2.3


From 30bb6fb39e5c08b9db5bc592d6cbc9a5fc5e67a4 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 15 Jun 2015 13:31:33 +0200
Subject: gpio: Remove double "base" in comment

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index cc7ec129b329..c8393cd4d44f 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -45,7 +45,7 @@ struct seq_file;
  * @base: identifies the first GPIO number handled by this chip;
  *	or, if negative during registration, requests dynamic ID allocation.
  *	DEPRECATION: providing anything non-negative and nailing the base
- *	base offset of GPIO chips is deprecated. Please pass -1 as base to
+ *	offset of GPIO chips is deprecated. Please pass -1 as base to
  *	let gpiolib select the chip base in all possible cases. We want to
  *	get rid of the static GPIO number space in the long run.
  * @ngpio: the number of GPIOs handled by this controller; the last GPIO
-- 
cgit v1.2.3


From 2531c8cf56a640cd7d17057df8484e570716a450 Mon Sep 17 00:00:00 2001
From: Dominik Dingel <dingel@linux.vnet.ibm.com>
Date: Fri, 17 Jul 2015 16:23:37 -0700
Subject: mm: hugetlb: allow hugepages_supported to be architecture specific

s390 has a constant hugepage size, by setting HPAGE_SHIFT we also change
e.g. the pageblock_order, which should be independent in respect to
hugepage support.

With this patch every architecture is free to define how to check
for hugepage support.

Signed-off-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 205026175c42..d891f949466a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -460,15 +460,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 	return &mm->page_table_lock;
 }
 
-static inline bool hugepages_supported(void)
-{
-	/*
-	 * Some platform decide whether they support huge pages at boot
-	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
-	 * there is no such support
-	 */
-	return HPAGE_SHIFT != 0;
-}
+#ifndef hugepages_supported
+/*
+ * Some platform decide whether they support huge pages at boot
+ * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
+ * when there is no such support
+ */
+#define hugepages_supported() (HPAGE_SHIFT != 0)
+#endif
 
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
-- 
cgit v1.2.3


From 8db1486065141e619e4855b84e350ef32064f7e1 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Fri, 17 Jul 2015 16:23:42 -0700
Subject: include, lib: add __printf attributes to several function prototypes

Using __printf attributes helps to detect several format string issues
at compile time (even though -Wformat-security is currently disabled in
Makefile).  For example it can detect when formatting a pointer as a
number, like the issue fixed in commit a3fa71c40f18 ("wl18xx: show
rx_frames_per_rates as an array as it really is"), or when the arguments
do not match the format string, c.f.  for example commit 5ce1aca81435
("reiserfs: fix __RASSERT format string").

To prevent similar bugs in the future, add a __printf attribute to every
function prototype which needs one in include/linux/ and lib/.  These
functions were mostly found by using gcc's -Wsuggest-attribute=format
flag.

Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/clkdev.h    |  7 ++++---
 include/linux/compat.h    |  2 +-
 include/linux/configfs.h  |  3 ++-
 include/linux/cpu.h       |  7 ++++---
 include/linux/dcache.h    |  3 ++-
 include/linux/device.h    | 15 +++++++--------
 include/linux/iommu.h     |  2 +-
 include/linux/kernel.h    |  9 +++++----
 include/linux/kobject.h   |  5 +++--
 include/linux/mmiotrace.h |  2 +-
 include/linux/printk.h    |  6 +++---
 lib/kobject.c             |  5 +++--
 12 files changed, 36 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clkdev.h b/include/linux/clkdev.h
index a240b18e86fa..08bffcc466de 100644
--- a/include/linux/clkdev.h
+++ b/include/linux/clkdev.h
@@ -33,18 +33,19 @@ struct clk_lookup {
 	}
 
 struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id,
-	const char *dev_fmt, ...);
+	const char *dev_fmt, ...) __printf(3, 4);
 
 void clkdev_add(struct clk_lookup *cl);
 void clkdev_drop(struct clk_lookup *cl);
 
 struct clk_lookup *clkdev_create(struct clk *clk, const char *con_id,
-	const char *dev_fmt, ...);
+	const char *dev_fmt, ...) __printf(3, 4);
 
 void clkdev_add_table(struct clk_lookup *, size_t);
 int clk_add_alias(const char *, const char *, const char *, struct device *);
 
-int clk_register_clkdev(struct clk *, const char *, const char *, ...);
+int clk_register_clkdev(struct clk *, const char *, const char *, ...)
+	__printf(3, 4);
 int clk_register_clkdevs(struct clk *, struct clk_lookup *, size_t);
 
 #ifdef CONFIG_COMMON_CLK
diff --git a/include/linux/compat.h b/include/linux/compat.h
index ab25814690bc..a76c9172b2eb 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -424,7 +424,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
-extern int compat_printk(const char *fmt, ...);
+extern __printf(1, 2) int compat_printk(const char *fmt, ...);
 extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
 extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set);
 
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
index c9e5c57e4edf..63a36e89d0eb 100644
--- a/include/linux/configfs.h
+++ b/include/linux/configfs.h
@@ -64,7 +64,8 @@ struct config_item {
 	struct dentry		*ci_dentry;
 };
 
-extern int config_item_set_name(struct config_item *, const char *, ...);
+extern __printf(2, 3)
+int config_item_set_name(struct config_item *, const char *, ...);
 
 static inline char *config_item_name(struct config_item * item)
 {
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index c0fb6b1b4712..23c30bdcca86 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -40,9 +40,10 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr);
 extern int cpu_add_dev_attr_group(struct attribute_group *attrs);
 extern void cpu_remove_dev_attr_group(struct attribute_group *attrs);
 
-extern struct device *cpu_device_create(struct device *parent, void *drvdata,
-					const struct attribute_group **groups,
-					const char *fmt, ...);
+extern __printf(4, 5)
+struct device *cpu_device_create(struct device *parent, void *drvdata,
+				 const struct attribute_group **groups,
+				 const char *fmt, ...);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void unregister_cpu(struct cpu *cpu);
 extern ssize_t arch_cpu_probe(const char *, size_t);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d2d50249b7b2..d67ae119cf4e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -327,7 +327,8 @@ static inline unsigned d_count(const struct dentry *dentry)
 /*
  * helper function for dentry_operations.d_dname() members
  */
-extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
+extern __printf(4, 5)
+char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
 extern char *simple_dname(struct dentry *, char *, int);
 
 extern char *__d_path(const struct path *, const struct path *, char *, int);
diff --git a/include/linux/device.h b/include/linux/device.h
index 5a31bf3a4024..a2b4ea70a946 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -637,8 +637,9 @@ extern int devres_release_group(struct device *dev, void *id);
 
 /* managed devm_k.alloc/kfree for device drivers */
 extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp);
-extern char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
-			     va_list ap);
+extern __printf(3, 0)
+char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
+		      va_list ap);
 extern __printf(3, 4)
 char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...);
 static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
@@ -1011,12 +1012,10 @@ extern int __must_check device_reprobe(struct device *dev);
 /*
  * Easy functions for dynamically creating devices on the fly
  */
-extern struct device *device_create_vargs(struct class *cls,
-					  struct device *parent,
-					  dev_t devt,
-					  void *drvdata,
-					  const char *fmt,
-					  va_list vargs);
+extern __printf(5, 0)
+struct device *device_create_vargs(struct class *cls, struct device *parent,
+				   dev_t devt, void *drvdata,
+				   const char *fmt, va_list vargs);
 extern __printf(5, 6)
 struct device *device_create(struct class *cls, struct device *parent,
 			     dev_t devt, void *drvdata,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index dc767f7c3704..f9c1b6d0f2e4 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -258,7 +258,7 @@ extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr,
 				 void *data);
 struct device *iommu_device_create(struct device *parent, void *drvdata,
 				   const struct attribute_group **groups,
-				   const char *fmt, ...);
+				   const char *fmt, ...) __printf(4, 5);
 void iommu_device_destroy(struct device *dev);
 int iommu_device_link(struct device *dev, struct device *link);
 void iommu_device_unlink(struct device *dev, struct device *link);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5f0be58640ea..5582410727cb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -411,7 +411,8 @@ extern __printf(3, 0)
 int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
 extern __printf(2, 3)
 char *kasprintf(gfp_t gfp, const char *fmt, ...);
-extern char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
+extern __printf(2, 0)
+char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
 
 extern __scanf(2, 3)
 int sscanf(const char *, const char *, ...);
@@ -679,10 +680,10 @@ do {									\
 		__ftrace_vprintk(_THIS_IP_, fmt, vargs);		\
 } while (0)
 
-extern int
+extern __printf(2, 0) int
 __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
 
-extern int
+extern __printf(2, 0) int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
 
 extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
@@ -702,7 +703,7 @@ int trace_printk(const char *fmt, ...)
 {
 	return 0;
 }
-static inline int
+static __printf(1, 0) inline int
 ftrace_vprintk(const char *fmt, va_list ap)
 {
 	return 0;
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 2d61b909f414..637f67002c5a 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -80,8 +80,9 @@ struct kobject {
 
 extern __printf(2, 3)
 int kobject_set_name(struct kobject *kobj, const char *name, ...);
-extern int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
-				  va_list vargs);
+extern __printf(2, 0)
+int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
+			   va_list vargs);
 
 static inline const char *kobject_name(const struct kobject *kobj)
 {
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index c5d52780d6a0..3ba327af055c 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -106,6 +106,6 @@ extern void enable_mmiotrace(void);
 extern void disable_mmiotrace(void);
 extern void mmio_trace_rw(struct mmiotrace_rw *rw);
 extern void mmio_trace_mapping(struct mmiotrace_map *map);
-extern int mmio_trace_printk(const char *fmt, va_list args);
+extern __printf(1, 0) int mmio_trace_printk(const char *fmt, va_list args);
 
 #endif /* _LINUX_MMIOTRACE_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 58b1fec40d37..a6298b27ac99 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -122,7 +122,7 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
-typedef int(*printk_func_t)(const char *fmt, va_list args);
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(5, 0)
@@ -166,7 +166,7 @@ char *log_buf_addr_get(void);
 u32 log_buf_len_get(void);
 void log_buf_kexec_setup(void);
 void __init setup_log_buf(int early);
-void dump_stack_set_arch_desc(const char *fmt, ...);
+__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
 void dump_stack_print_info(const char *log_lvl);
 void show_regs_print_info(const char *log_lvl);
 #else
@@ -217,7 +217,7 @@ static inline void setup_log_buf(int early)
 {
 }
 
-static inline void dump_stack_set_arch_desc(const char *fmt, ...)
+static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...)
 {
 }
 
diff --git a/lib/kobject.c b/lib/kobject.c
index 2e3bd01964a9..3e3a5c3cb330 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -337,8 +337,9 @@ error:
 }
 EXPORT_SYMBOL(kobject_init);
 
-static int kobject_add_varg(struct kobject *kobj, struct kobject *parent,
-			    const char *fmt, va_list vargs)
+static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
+					   struct kobject *parent,
+					   const char *fmt, va_list vargs)
 {
 	int retval;
 
-- 
cgit v1.2.3


From da89947b47a3a355f33a75d7672892c147ed880d Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 17 Jul 2015 16:23:50 -0700
Subject: Update Viresh Kumar's email address

Switch to my kernel.org alias instead of a badly named gmail address,
which I rarely use.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .mailmap                                      |  4 +++-
 Documentation/arm/SPEAr/overview.txt          |  2 +-
 MAINTAINERS                                   | 12 ++++++------
 arch/arm/boot/dts/spear1310-evb.dts           |  2 +-
 arch/arm/boot/dts/spear1310.dtsi              |  2 +-
 arch/arm/boot/dts/spear1340-evb.dts           |  2 +-
 arch/arm/boot/dts/spear1340.dtsi              |  2 +-
 arch/arm/boot/dts/spear13xx.dtsi              |  2 +-
 arch/arm/boot/dts/spear300-evb.dts            |  2 +-
 arch/arm/boot/dts/spear300.dtsi               |  2 +-
 arch/arm/boot/dts/spear310-evb.dts            |  2 +-
 arch/arm/boot/dts/spear310.dtsi               |  2 +-
 arch/arm/boot/dts/spear320-evb.dts            |  2 +-
 arch/arm/boot/dts/spear320.dtsi               |  2 +-
 arch/arm/boot/dts/spear3xx.dtsi               |  2 +-
 arch/arm/mach-spear/generic.h                 |  2 +-
 arch/arm/mach-spear/include/mach/irqs.h       |  2 +-
 arch/arm/mach-spear/include/mach/misc_regs.h  |  2 +-
 arch/arm/mach-spear/include/mach/spear.h      |  2 +-
 arch/arm/mach-spear/include/mach/uncompress.h |  2 +-
 arch/arm/mach-spear/pl080.c                   |  2 +-
 arch/arm/mach-spear/pl080.h                   |  2 +-
 arch/arm/mach-spear/restart.c                 |  2 +-
 arch/arm/mach-spear/spear1310.c               |  2 +-
 arch/arm/mach-spear/spear1340.c               |  2 +-
 arch/arm/mach-spear/spear13xx.c               |  2 +-
 arch/arm/mach-spear/spear300.c                |  2 +-
 arch/arm/mach-spear/spear310.c                |  2 +-
 arch/arm/mach-spear/spear320.c                |  2 +-
 arch/arm/mach-spear/spear3xx.c                |  2 +-
 drivers/ata/pata_arasan_cf.c                  |  4 ++--
 drivers/clk/spear/clk-aux-synth.c             |  2 +-
 drivers/clk/spear/clk-frac-synth.c            |  2 +-
 drivers/clk/spear/clk-gpt-synth.c             |  2 +-
 drivers/clk/spear/clk-vco-pll.c               |  2 +-
 drivers/clk/spear/clk.c                       |  2 +-
 drivers/clk/spear/clk.h                       |  2 +-
 drivers/clk/spear/spear1310_clock.c           |  2 +-
 drivers/clk/spear/spear1340_clock.c           |  2 +-
 drivers/clk/spear/spear3xx_clock.c            |  2 +-
 drivers/clk/spear/spear6xx_clock.c            |  2 +-
 drivers/dma/dw/core.c                         |  2 +-
 drivers/irqchip/spear-shirq.c                 |  2 +-
 drivers/mfd/stmpe-i2c.c                       |  2 +-
 drivers/mfd/stmpe-spi.c                       |  4 ++--
 drivers/mmc/host/sdhci-spear.c                |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear.c         |  2 +-
 drivers/pinctrl/spear/pinctrl-spear.h         |  2 +-
 drivers/pinctrl/spear/pinctrl-spear1310.c     |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear1340.c     |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear300.c      |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear310.c      |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear320.c      |  4 ++--
 drivers/pinctrl/spear/pinctrl-spear3xx.c      |  2 +-
 drivers/pinctrl/spear/pinctrl-spear3xx.h      |  2 +-
 drivers/watchdog/sp805_wdt.c                  |  4 ++--
 include/linux/amba/sp810.h                    |  2 +-
 include/linux/pata_arasan_cf_data.h           |  2 +-
 58 files changed, 74 insertions(+), 72 deletions(-)

(limited to 'include/linux')

diff --git a/.mailmap b/.mailmap
index 977f958eedbe..8d8ad17dcf9c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -125,7 +125,9 @@ Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
 Uwe Kleine-König <ukl@pengutronix.de>
 Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
 Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
-Viresh Kumar <viresh.linux@gmail.com> <viresh.kumar@st.com>
+Viresh Kumar <vireshk@kernel.org> <viresh.kumar@st.com>
+Viresh Kumar <vireshk@kernel.org> <viresh.linux@gmail.com>
+Viresh Kumar <vireshk@kernel.org> <viresh.kumar2@arm.com>
 Takashi YOSHII <takashi.yoshii.zj@renesas.com>
 Yusuke Goda <goda.yusuke@renesas.com>
 Gustavo Padovan <gustavo@las.ic.unicamp.br>
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt
index 65610bf52ebf..1b049be6c84f 100644
--- a/Documentation/arm/SPEAr/overview.txt
+++ b/Documentation/arm/SPEAr/overview.txt
@@ -60,4 +60,4 @@ Introduction
   Document Author
   ---------------
 
-  Viresh Kumar <viresh.linux@gmail.com>, (c) 2010-2012 ST Microelectronics
+  Viresh Kumar <vireshk@kernel.org>, (c) 2010-2012 ST Microelectronics
diff --git a/MAINTAINERS b/MAINTAINERS
index 2de150384670..093da8a3f8b2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6090,7 +6090,7 @@ F:	include/linux/ata.h
 F:	include/linux/libata.h
 
 LIBATA PATA ARASAN COMPACT FLASH CONTROLLER
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 L:	linux-ide@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tj/libata.git
 S:	Maintained
@@ -7996,7 +7996,7 @@ S:	Maintained
 F:	drivers/pinctrl/samsung/
 
 PIN CONTROLLER - ST SPEAR
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 L:	spear-devel@list.st.com
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 W:	http://www.st.com/spear
@@ -8895,7 +8895,7 @@ S:	Maintained
 F:	drivers/tty/serial/
 
 SYNOPSYS DESIGNWARE DMAC DRIVER
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 M:	Andy Shevchenko <andriy.shevchenko@linux.intel.com>
 S:	Maintained
 F:	include/linux/dma/dw.h
@@ -9062,7 +9062,7 @@ S:	Maintained
 F:	drivers/mmc/host/sdhci-s3c*
 
 SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) ST SPEAR DRIVER
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 L:	spear-devel@list.st.com
 L:	linux-mmc@vger.kernel.org
 S:	Maintained
@@ -9600,7 +9600,7 @@ S:	Maintained
 F:	include/linux/compiler.h
 
 SPEAR PLATFORM SUPPORT
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 M:	Shiraz Hashim <shiraz.linux.kernel@gmail.com>
 L:	spear-devel@list.st.com
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
@@ -9609,7 +9609,7 @@ S:	Maintained
 F:	arch/arm/mach-spear/
 
 SPEAR CLOCK FRAMEWORK SUPPORT
-M:	Viresh Kumar <viresh.linux@gmail.com>
+M:	Viresh Kumar <vireshk@kernel.org>
 L:	spear-devel@list.st.com
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 W:	http://www.st.com/spear
diff --git a/arch/arm/boot/dts/spear1310-evb.dts b/arch/arm/boot/dts/spear1310-evb.dts
index d42c84b1df8d..e48857249ce7 100644
--- a/arch/arm/boot/dts/spear1310-evb.dts
+++ b/arch/arm/boot/dts/spear1310-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr1310 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear1310.dtsi b/arch/arm/boot/dts/spear1310.dtsi
index 9d342920695a..54bc6d3cf290 100644
--- a/arch/arm/boot/dts/spear1310.dtsi
+++ b/arch/arm/boot/dts/spear1310.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for all SPEAr1310 SoCs
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear1340-evb.dts b/arch/arm/boot/dts/spear1340-evb.dts
index b23e05ed1d60..c611f5606dfe 100644
--- a/arch/arm/boot/dts/spear1340-evb.dts
+++ b/arch/arm/boot/dts/spear1340-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr1340 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear1340.dtsi b/arch/arm/boot/dts/spear1340.dtsi
index 13e1aa33daa2..df2232d767ed 100644
--- a/arch/arm/boot/dts/spear1340.dtsi
+++ b/arch/arm/boot/dts/spear1340.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for all SPEAr1340 SoCs
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear13xx.dtsi b/arch/arm/boot/dts/spear13xx.dtsi
index 40accc87e3a2..14594ce8c18a 100644
--- a/arch/arm/boot/dts/spear13xx.dtsi
+++ b/arch/arm/boot/dts/spear13xx.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for all SPEAr13xx SoCs
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear300-evb.dts b/arch/arm/boot/dts/spear300-evb.dts
index 5de1431653e4..e859e8288bcd 100644
--- a/arch/arm/boot/dts/spear300-evb.dts
+++ b/arch/arm/boot/dts/spear300-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr300 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear300.dtsi b/arch/arm/boot/dts/spear300.dtsi
index f79b3dfaabe6..f4e92e599729 100644
--- a/arch/arm/boot/dts/spear300.dtsi
+++ b/arch/arm/boot/dts/spear300.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr300 SoC
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear310-evb.dts b/arch/arm/boot/dts/spear310-evb.dts
index b09632963d15..070f2c1b7851 100644
--- a/arch/arm/boot/dts/spear310-evb.dts
+++ b/arch/arm/boot/dts/spear310-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr310 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear310.dtsi b/arch/arm/boot/dts/spear310.dtsi
index 95372080eea6..da210b454753 100644
--- a/arch/arm/boot/dts/spear310.dtsi
+++ b/arch/arm/boot/dts/spear310.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr310 SoC
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear320-evb.dts b/arch/arm/boot/dts/spear320-evb.dts
index fdedbb514102..1b1034477923 100644
--- a/arch/arm/boot/dts/spear320-evb.dts
+++ b/arch/arm/boot/dts/spear320-evb.dts
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr320 Evaluation Baord
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear320.dtsi b/arch/arm/boot/dts/spear320.dtsi
index ffea342aeec9..22be6e5edaac 100644
--- a/arch/arm/boot/dts/spear320.dtsi
+++ b/arch/arm/boot/dts/spear320.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for SPEAr320 SoC
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/boot/dts/spear3xx.dtsi b/arch/arm/boot/dts/spear3xx.dtsi
index f0e3fcf8e323..118135d75899 100644
--- a/arch/arm/boot/dts/spear3xx.dtsi
+++ b/arch/arm/boot/dts/spear3xx.dtsi
@@ -1,7 +1,7 @@
 /*
  * DTS file for all SPEAr3xx SoCs
  *
- * Copyright 2012 Viresh Kumar <viresh.linux@gmail.com>
+ * Copyright 2012 Viresh Kumar <vireshk@kernel.org>
  *
  * The code contained herein is licensed under the GNU General Public
  * License. You may obtain a copy of the GNU General Public License
diff --git a/arch/arm/mach-spear/generic.h b/arch/arm/mach-spear/generic.h
index a99d90a4d09c..06640914d9a0 100644
--- a/arch/arm/mach-spear/generic.h
+++ b/arch/arm/mach-spear/generic.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2009-2012 ST Microelectronics
  * Rajeev Kumar <rajeev-dlh.kumar@st.com>
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/include/mach/irqs.h b/arch/arm/mach-spear/include/mach/irqs.h
index 92da0a8c6bce..7058720c5278 100644
--- a/arch/arm/mach-spear/include/mach/irqs.h
+++ b/arch/arm/mach-spear/include/mach/irqs.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2009-2012 ST Microelectronics
  * Rajeev Kumar <rajeev-dlh.kumar@st.com>
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/include/mach/misc_regs.h b/arch/arm/mach-spear/include/mach/misc_regs.h
index 935639ce59ba..cfaf7c665b58 100644
--- a/arch/arm/mach-spear/include/mach/misc_regs.h
+++ b/arch/arm/mach-spear/include/mach/misc_regs.h
@@ -4,7 +4,7 @@
  * Miscellaneous registers definitions for SPEAr3xx machine family
  *
  * Copyright (C) 2009 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/include/mach/spear.h b/arch/arm/mach-spear/include/mach/spear.h
index f2d6a0176575..5ed841ccf8a3 100644
--- a/arch/arm/mach-spear/include/mach/spear.h
+++ b/arch/arm/mach-spear/include/mach/spear.h
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2009,2012 ST Microelectronics
  * Rajeev Kumar<rajeev-dlh.kumar@st.com>
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/include/mach/uncompress.h b/arch/arm/mach-spear/include/mach/uncompress.h
index 51b2dc93e4da..8439b9c12edb 100644
--- a/arch/arm/mach-spear/include/mach/uncompress.h
+++ b/arch/arm/mach-spear/include/mach/uncompress.h
@@ -4,7 +4,7 @@
  * Serial port stubs for kernel decompress status messages
  *
  * Copyright (C) 2009 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/pl080.c b/arch/arm/mach-spear/pl080.c
index cfa1199d0f4a..b4529f3e0ee9 100644
--- a/arch/arm/mach-spear/pl080.c
+++ b/arch/arm/mach-spear/pl080.c
@@ -4,7 +4,7 @@
  * DMAC pl080 definitions for SPEAr platform
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/pl080.h b/arch/arm/mach-spear/pl080.h
index eb6590ded40d..608dec6725ae 100644
--- a/arch/arm/mach-spear/pl080.h
+++ b/arch/arm/mach-spear/pl080.h
@@ -4,7 +4,7 @@
  * DMAC pl080 definitions for SPEAr platform
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/restart.c b/arch/arm/mach-spear/restart.c
index ce5e098c4888..b4342155a783 100644
--- a/arch/arm/mach-spear/restart.c
+++ b/arch/arm/mach-spear/restart.c
@@ -4,7 +4,7 @@
  * SPEAr platform specific restart functions
  *
  * Copyright (C) 2009 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear1310.c b/arch/arm/mach-spear/spear1310.c
index d9ce4d8000f0..cd5d375d91f0 100644
--- a/arch/arm/mach-spear/spear1310.c
+++ b/arch/arm/mach-spear/spear1310.c
@@ -4,7 +4,7 @@
  * SPEAr1310 machine source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear1340.c b/arch/arm/mach-spear/spear1340.c
index 3f3c0f124bd3..94594d5a446c 100644
--- a/arch/arm/mach-spear/spear1340.c
+++ b/arch/arm/mach-spear/spear1340.c
@@ -4,7 +4,7 @@
  * SPEAr1340 machine source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear13xx.c b/arch/arm/mach-spear/spear13xx.c
index 2e463a93468d..b7afce6795f4 100644
--- a/arch/arm/mach-spear/spear13xx.c
+++ b/arch/arm/mach-spear/spear13xx.c
@@ -4,7 +4,7 @@
  * SPEAr13XX machines common source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear300.c b/arch/arm/mach-spear/spear300.c
index b52e48f342f4..5b32edda2276 100644
--- a/arch/arm/mach-spear/spear300.c
+++ b/arch/arm/mach-spear/spear300.c
@@ -4,7 +4,7 @@
  * SPEAr300 machine source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear310.c b/arch/arm/mach-spear/spear310.c
index ed2029db391f..86a44ac7ff67 100644
--- a/arch/arm/mach-spear/spear310.c
+++ b/arch/arm/mach-spear/spear310.c
@@ -4,7 +4,7 @@
  * SPEAr310 machine source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear320.c b/arch/arm/mach-spear/spear320.c
index bf634b32a930..d45d751926c5 100644
--- a/arch/arm/mach-spear/spear320.c
+++ b/arch/arm/mach-spear/spear320.c
@@ -4,7 +4,7 @@
  * SPEAr320 machine source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/arch/arm/mach-spear/spear3xx.c b/arch/arm/mach-spear/spear3xx.c
index bf3b1fd8cb23..23394ac76cf2 100644
--- a/arch/arm/mach-spear/spear3xx.c
+++ b/arch/arm/mach-spear/spear3xx.c
@@ -4,7 +4,7 @@
  * SPEAr3XX machines common source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/ata/pata_arasan_cf.c b/drivers/ata/pata_arasan_cf.c
index a9b0c820f2eb..5d9ee99c2148 100644
--- a/drivers/ata/pata_arasan_cf.c
+++ b/drivers/ata/pata_arasan_cf.c
@@ -4,7 +4,7 @@
  * Arasan Compact Flash host controller source file
  *
  * Copyright (C) 2011 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -968,7 +968,7 @@ static struct platform_driver arasan_cf_driver = {
 
 module_platform_driver(arasan_cf_driver);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("Arasan ATA Compact Flash driver");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:" DRIVER_NAME);
diff --git a/drivers/clk/spear/clk-aux-synth.c b/drivers/clk/spear/clk-aux-synth.c
index bdfb4421c643..f271c350ef94 100644
--- a/drivers/clk/spear/clk-aux-synth.c
+++ b/drivers/clk/spear/clk-aux-synth.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk-frac-synth.c b/drivers/clk/spear/clk-frac-synth.c
index dffd4ce6c8b5..58d678b5b40a 100644
--- a/drivers/clk/spear/clk-frac-synth.c
+++ b/drivers/clk/spear/clk-frac-synth.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk-gpt-synth.c b/drivers/clk/spear/clk-gpt-synth.c
index 1afc18c4effc..1a722e99e76e 100644
--- a/drivers/clk/spear/clk-gpt-synth.c
+++ b/drivers/clk/spear/clk-gpt-synth.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk-vco-pll.c b/drivers/clk/spear/clk-vco-pll.c
index 1b9b65bca51e..5ebddc528145 100644
--- a/drivers/clk/spear/clk-vco-pll.c
+++ b/drivers/clk/spear/clk-vco-pll.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk.c b/drivers/clk/spear/clk.c
index 628b6d5ed3d9..157fe099ea6a 100644
--- a/drivers/clk/spear/clk.c
+++ b/drivers/clk/spear/clk.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/clk.h b/drivers/clk/spear/clk.h
index 931737677dfa..9834944f08b1 100644
--- a/drivers/clk/spear/clk.h
+++ b/drivers/clk/spear/clk.h
@@ -2,7 +2,7 @@
  * Clock framework definitions for SPEAr platform
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/spear1310_clock.c b/drivers/clk/spear/spear1310_clock.c
index 4daa5977793a..222ce108b41a 100644
--- a/drivers/clk/spear/spear1310_clock.c
+++ b/drivers/clk/spear/spear1310_clock.c
@@ -4,7 +4,7 @@
  * SPEAr1310 machine clock framework source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/spear1340_clock.c b/drivers/clk/spear/spear1340_clock.c
index 5a5c6648308d..973c9d3fbcf8 100644
--- a/drivers/clk/spear/spear1340_clock.c
+++ b/drivers/clk/spear/spear1340_clock.c
@@ -4,7 +4,7 @@
  * SPEAr1340 machine clock framework source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/spear3xx_clock.c b/drivers/clk/spear/spear3xx_clock.c
index bb5f387774e2..404a55edd613 100644
--- a/drivers/clk/spear/spear3xx_clock.c
+++ b/drivers/clk/spear/spear3xx_clock.c
@@ -2,7 +2,7 @@
  * SPEAr3xx machines clock framework source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/clk/spear/spear6xx_clock.c b/drivers/clk/spear/spear6xx_clock.c
index 4f649c9cb094..231061fa73a4 100644
--- a/drivers/clk/spear/spear6xx_clock.c
+++ b/drivers/clk/spear/spear6xx_clock.c
@@ -2,7 +2,7 @@
  * SPEAr6xx machines clock framework source file
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 1022c2e1a2b0..cf1c87fa1edd 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -1746,4 +1746,4 @@ EXPORT_SYMBOL_GPL(dw_dma_enable);
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Synopsys DesignWare DMA Controller core driver");
 MODULE_AUTHOR("Haavard Skinnemoen (Atmel)");
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
diff --git a/drivers/irqchip/spear-shirq.c b/drivers/irqchip/spear-shirq.c
index a45121546caf..acb721b31bcf 100644
--- a/drivers/irqchip/spear-shirq.c
+++ b/drivers/irqchip/spear-shirq.c
@@ -2,7 +2,7 @@
  * SPEAr platform shared irq layer source file
  *
  * Copyright (C) 2009-2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * Copyright (C) 2012 ST Microelectronics
  * Shiraz Hashim <shiraz.linux.kernel@gmail.com>
diff --git a/drivers/mfd/stmpe-i2c.c b/drivers/mfd/stmpe-i2c.c
index 5c054031c3f8..e14c8c9d189b 100644
--- a/drivers/mfd/stmpe-i2c.c
+++ b/drivers/mfd/stmpe-i2c.c
@@ -6,7 +6,7 @@
  *
  * License Terms: GNU General Public License, version 2
  * Author: Rabin Vincent <rabin.vincent@stericsson.com> for ST-Ericsson
- * Author: Viresh Kumar <viresh.linux@gmail.com> for ST Microelectronics
+ * Author: Viresh Kumar <vireshk@kernel.org> for ST Microelectronics
  */
 
 #include <linux/i2c.h>
diff --git a/drivers/mfd/stmpe-spi.c b/drivers/mfd/stmpe-spi.c
index a81badbaa917..6fdb30e84a2b 100644
--- a/drivers/mfd/stmpe-spi.c
+++ b/drivers/mfd/stmpe-spi.c
@@ -4,7 +4,7 @@
  * Copyright (C) ST Microelectronics SA 2011
  *
  * License Terms: GNU General Public License, version 2
- * Author: Viresh Kumar <viresh.linux@gmail.com> for ST Microelectronics
+ * Author: Viresh Kumar <vireshk@kernel.org> for ST Microelectronics
  */
 
 #include <linux/spi/spi.h>
@@ -146,4 +146,4 @@ module_exit(stmpe_exit);
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("STMPE MFD SPI Interface Driver");
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c
index df088343d60f..255a896769b8 100644
--- a/drivers/mmc/host/sdhci-spear.c
+++ b/drivers/mmc/host/sdhci-spear.c
@@ -4,7 +4,7 @@
  * Support of SDHCI platform devices for spear soc family
  *
  * Copyright (C) 2010 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * Inspired by sdhci-pltfm.c
  *
@@ -211,5 +211,5 @@ static struct platform_driver sdhci_driver = {
 module_platform_driver(sdhci_driver);
 
 MODULE_DESCRIPTION("SPEAr Secure Digital Host Controller Interface driver");
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/pinctrl/spear/pinctrl-spear.c b/drivers/pinctrl/spear/pinctrl-spear.c
index f87a5eaf75da..0afaf79a4e51 100644
--- a/drivers/pinctrl/spear/pinctrl-spear.c
+++ b/drivers/pinctrl/spear/pinctrl-spear.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * Inspired from:
  * - U300 Pinctl drivers
diff --git a/drivers/pinctrl/spear/pinctrl-spear.h b/drivers/pinctrl/spear/pinctrl-spear.h
index dc8bf85ecb2a..27c2cc8d83ad 100644
--- a/drivers/pinctrl/spear/pinctrl-spear.h
+++ b/drivers/pinctrl/spear/pinctrl-spear.h
@@ -2,7 +2,7 @@
  * Driver header file for the ST Microelectronics SPEAr pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/pinctrl/spear/pinctrl-spear1310.c b/drivers/pinctrl/spear/pinctrl-spear1310.c
index a7bdc537efa7..92611bb757ac 100644
--- a/drivers/pinctrl/spear/pinctrl-spear1310.c
+++ b/drivers/pinctrl/spear/pinctrl-spear1310.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr1310 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -2730,7 +2730,7 @@ static void __exit spear1310_pinctrl_exit(void)
 }
 module_exit(spear1310_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr1310 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear1310_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear1340.c b/drivers/pinctrl/spear/pinctrl-spear1340.c
index f43ec85a0328..f842e9dc40d0 100644
--- a/drivers/pinctrl/spear/pinctrl-spear1340.c
+++ b/drivers/pinctrl/spear/pinctrl-spear1340.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr1340 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -2046,7 +2046,7 @@ static void __exit spear1340_pinctrl_exit(void)
 }
 module_exit(spear1340_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr1340 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear1340_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear300.c b/drivers/pinctrl/spear/pinctrl-spear300.c
index da8990a8eeef..d998a2ccff48 100644
--- a/drivers/pinctrl/spear/pinctrl-spear300.c
+++ b/drivers/pinctrl/spear/pinctrl-spear300.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr300 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -703,7 +703,7 @@ static void __exit spear300_pinctrl_exit(void)
 }
 module_exit(spear300_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr300 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear300_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear310.c b/drivers/pinctrl/spear/pinctrl-spear310.c
index 31ede51e819b..609b18aceb16 100644
--- a/drivers/pinctrl/spear/pinctrl-spear310.c
+++ b/drivers/pinctrl/spear/pinctrl-spear310.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr310 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -426,7 +426,7 @@ static void __exit spear310_pinctrl_exit(void)
 }
 module_exit(spear310_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr310 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear310_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear320.c b/drivers/pinctrl/spear/pinctrl-spear320.c
index 506e40b641e0..c07114431bd4 100644
--- a/drivers/pinctrl/spear/pinctrl-spear320.c
+++ b/drivers/pinctrl/spear/pinctrl-spear320.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr320 pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
@@ -3467,7 +3467,7 @@ static void __exit spear320_pinctrl_exit(void)
 }
 module_exit(spear320_pinctrl_exit);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ST Microelectronics SPEAr320 pinctrl driver");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(of, spear320_pinctrl_of_match);
diff --git a/drivers/pinctrl/spear/pinctrl-spear3xx.c b/drivers/pinctrl/spear/pinctrl-spear3xx.c
index 12ee21af766b..d3119aafe709 100644
--- a/drivers/pinctrl/spear/pinctrl-spear3xx.c
+++ b/drivers/pinctrl/spear/pinctrl-spear3xx.c
@@ -2,7 +2,7 @@
  * Driver for the ST Microelectronics SPEAr3xx pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/pinctrl/spear/pinctrl-spear3xx.h b/drivers/pinctrl/spear/pinctrl-spear3xx.h
index 7860b36053c4..ce19dcf8f08b 100644
--- a/drivers/pinctrl/spear/pinctrl-spear3xx.h
+++ b/drivers/pinctrl/spear/pinctrl-spear3xx.h
@@ -2,7 +2,7 @@
  * Header file for the ST Microelectronics SPEAr3xx pinmux
  *
  * Copyright (C) 2012 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c
index c1b03f4235b9..4e7fec36f5c3 100644
--- a/drivers/watchdog/sp805_wdt.c
+++ b/drivers/watchdog/sp805_wdt.c
@@ -4,7 +4,7 @@
  * Watchdog driver for ARM SP805 watchdog module
  *
  * Copyright (C) 2010 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2 or later. This program is licensed "as is" without any
@@ -303,6 +303,6 @@ static struct amba_driver sp805_wdt_driver = {
 
 module_amba_driver(sp805_wdt_driver);
 
-MODULE_AUTHOR("Viresh Kumar <viresh.linux@gmail.com>");
+MODULE_AUTHOR("Viresh Kumar <vireshk@kernel.org>");
 MODULE_DESCRIPTION("ARM SP805 Watchdog Driver");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/amba/sp810.h b/include/linux/amba/sp810.h
index c7df89f99115..58fe9e8b6fd7 100644
--- a/include/linux/amba/sp810.h
+++ b/include/linux/amba/sp810.h
@@ -2,7 +2,7 @@
  * ARM PrimeXsys System Controller SP810 header file
  *
  * Copyright (C) 2009 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
diff --git a/include/linux/pata_arasan_cf_data.h b/include/linux/pata_arasan_cf_data.h
index 3cc21c9cc1e8..9fade5dd2e86 100644
--- a/include/linux/pata_arasan_cf_data.h
+++ b/include/linux/pata_arasan_cf_data.h
@@ -4,7 +4,7 @@
  * Arasan Compact Flash host controller platform data header file
  *
  * Copyright (C) 2011 ST Microelectronics
- * Viresh Kumar <viresh.linux@gmail.com>
+ * Viresh Kumar <vireshk@kernel.org>
  *
  * This file is licensed under the terms of the GNU General Public
  * License version 2. This program is licensed "as is" without any
-- 
cgit v1.2.3


From e2cfc91120fa01e3458167054af993fb83d7d0ec Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <js1304@gmail.com>
Date: Fri, 17 Jul 2015 16:24:18 -0700
Subject: mm/page_owner: set correct gfp_mask on page_owner

Currently, we set wrong gfp_mask to page_owner info in case of isolated
freepage by compaction and split page.  It causes incorrect mixed
pageblock report that we can get from '/proc/pagetypeinfo'.  This metric
is really useful to measure fragmentation effect so should be accurate.
This patch fixes it by setting correct information.

Without this patch, after kernel build workload is finished, number of
mixed pageblock is 112 among roughly 210 movable pageblocks.

But, with this fix, output shows that mixed pageblock is just 57.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_owner.h | 13 +++++++++++++
 mm/page_alloc.c            |  8 +++++---
 mm/page_owner.c            |  7 +++++++
 3 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index b48c3471c254..cacaabea8a09 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -8,6 +8,7 @@ extern struct page_ext_operations page_owner_ops;
 extern void __reset_page_owner(struct page *page, unsigned int order);
 extern void __set_page_owner(struct page *page,
 			unsigned int order, gfp_t gfp_mask);
+extern gfp_t __get_page_owner_gfp(struct page *page);
 
 static inline void reset_page_owner(struct page *page, unsigned int order)
 {
@@ -25,6 +26,14 @@ static inline void set_page_owner(struct page *page,
 
 	__set_page_owner(page, order, gfp_mask);
 }
+
+static inline gfp_t get_page_owner_gfp(struct page *page)
+{
+	if (likely(!page_owner_inited))
+		return 0;
+
+	return __get_page_owner_gfp(page);
+}
 #else
 static inline void reset_page_owner(struct page *page, unsigned int order)
 {
@@ -33,6 +42,10 @@ static inline void set_page_owner(struct page *page,
 			unsigned int order, gfp_t gfp_mask)
 {
 }
+static inline gfp_t get_page_owner_gfp(struct page *page)
+{
+	return 0;
+}
 
 #endif /* CONFIG_PAGE_OWNER */
 #endif /* __LINUX_PAGE_OWNER_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fbba675a0bd9..ef19f22b2b7d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1948,6 +1948,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
 void split_page(struct page *page, unsigned int order)
 {
 	int i;
+	gfp_t gfp_mask;
 
 	VM_BUG_ON_PAGE(PageCompound(page), page);
 	VM_BUG_ON_PAGE(!page_count(page), page);
@@ -1961,10 +1962,11 @@ void split_page(struct page *page, unsigned int order)
 		split_page(virt_to_page(page[0].shadow), order);
 #endif
 
-	set_page_owner(page, 0, 0);
+	gfp_mask = get_page_owner_gfp(page);
+	set_page_owner(page, 0, gfp_mask);
 	for (i = 1; i < (1 << order); i++) {
 		set_page_refcounted(page + i);
-		set_page_owner(page + i, 0, 0);
+		set_page_owner(page + i, 0, gfp_mask);
 	}
 }
 EXPORT_SYMBOL_GPL(split_page);
@@ -1994,7 +1996,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	set_page_owner(page, order, 0);
+	set_page_owner(page, order, __GFP_MOVABLE);
 
 	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
diff --git a/mm/page_owner.c b/mm/page_owner.c
index bd5f842b56d2..983c3a10fa07 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -76,6 +76,13 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
 	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
 }
 
+gfp_t __get_page_owner_gfp(struct page *page)
+{
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	return page_ext->gfp_mask;
+}
+
 static ssize_t
 print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 		struct page *page, struct page_ext *page_ext)
-- 
cgit v1.2.3


From 0c8c0f03e3a292e031596484275c14cf39c0ab7a Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@sr71.net>
Date: Fri, 17 Jul 2015 12:28:11 +0200
Subject: x86/fpu, sched: Dynamically allocate 'struct fpu'

The FPU rewrite removed the dynamic allocations of 'struct fpu'.
But, this potentially wastes massive amounts of memory (2k per
task on systems that do not have AVX-512 for instance).

Instead of having a separate slab, this patch just appends the
space that we need to the 'task_struct' which we dynamically
allocate already.  This saves from doing an extra slab
allocation at fork().

The only real downside here is that we have to stick everything
and the end of the task_struct.  But, I think the
BUILD_BUG_ON()s I stuck in there should keep that from being too
fragile.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437128892-9831-2-git-send-email-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu/types.h | 72 +++++++++++++++++++++-------------------
 arch/x86/include/asm/processor.h | 10 ++++--
 arch/x86/kernel/fpu/init.c       | 39 ++++++++++++++++++++++
 arch/x86/kernel/process.c        |  2 +-
 fs/proc/kcore.c                  |  4 +--
 include/linux/sched.h            | 12 +++++--
 kernel/fork.c                    |  8 ++++-
 7 files changed, 104 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
 	struct fxregs_state		fxsave;
 	struct swregs_state		soft;
 	struct xregs_state		xsave;
+	u8 __padding[PAGE_SIZE];
 };
 
 /*
@@ -197,40 +198,6 @@ union fpregs_state {
  * state fields:
  */
 struct fpu {
-	/*
-	 * @state:
-	 *
-	 * In-memory copy of all FPU registers that we save/restore
-	 * over context switches. If the task is using the FPU then
-	 * the registers in the FPU are more recent than this state
-	 * copy. If the task context-switches away then they get
-	 * saved here and represent the FPU state.
-	 *
-	 * After context switches there may be a (short) time period
-	 * during which the in-FPU hardware registers are unchanged
-	 * and still perfectly match this state, if the tasks
-	 * scheduled afterwards are not using the FPU.
-	 *
-	 * This is the 'lazy restore' window of optimization, which
-	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
-	 *
-	 * We detect whether a subsequent task uses the FPU via setting
-	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
-	 *
-	 * During this window, if the task gets scheduled again, we
-	 * might be able to skip having to do a restore from this
-	 * memory buffer to the hardware registers - at the cost of
-	 * incurring the overhead of #NM fault traps.
-	 *
-	 * Note that on modern CPUs that support the XSAVEOPT (or other
-	 * optimized XSAVE instructions), we don't use #NM traps anymore,
-	 * as the hardware can track whether FPU registers need saving
-	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
-	 * logic, which unconditionally saves/restores all FPU state
-	 * across context switches. (if FPU state exists.)
-	 */
-	union fpregs_state		state;
-
 	/*
 	 * @last_cpu:
 	 *
@@ -288,6 +255,43 @@ struct fpu {
 	 * deal with bursty apps that only use the FPU for a short time:
 	 */
 	unsigned char			counter;
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state		state;
+	/*
+	 * WARNING: 'state' is dynamically-sized.  Do not put
+	 * anything after it here.
+	 */
 };
 
 #endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..944f1785ed0d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -390,9 +390,6 @@ struct thread_struct {
 #endif
 	unsigned long		gs;
 
-	/* Floating point and extended processor state */
-	struct fpu		fpu;
-
 	/* Save middle states of ptrace breakpoints */
 	struct perf_event	*ptrace_bps[HBP_NUM];
 	/* Debug status used for traps, single steps, etc... */
@@ -418,6 +415,13 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+	/*
+	 * WARNING: 'fpu' is dynamically-sized.  It *MUST* be at
+	 * the end.
+	 */
 };
 
 /*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 32826791e675..deacbfa6b33e 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -136,6 +136,45 @@ static void __init fpu__init_system_generic(void)
 unsigned int xstate_size;
 EXPORT_SYMBOL_GPL(xstate_size);
 
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)	\
+	BUILD_BUG_ON((sizeof(TYPE) -			\
+			offsetof(TYPE, MEMBER) -	\
+			sizeof(((TYPE *)0)->MEMBER)) > 	\
+			0)				\
+
+/*
+ * We append the 'struct fpu' to the task_struct.
+ */
+int __weak arch_task_struct_size(void)
+{
+	int task_size = sizeof(struct task_struct);
+
+	/*
+	 * Subtract off the static size of the register state.
+	 * It potentially has a bunch of padding.
+	 */
+	task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+	/*
+	 * Add back the dynamically-calculated register state
+	 * size.
+	 */
+	task_size += xstate_size;
+
+	/*
+	 * We dynamically size 'struct fpu', so we require that
+	 * it be at the end of 'thread_struct' and that
+	 * 'thread_struct' be at the end of 'task_struct'.  If
+	 * you hit a compile error here, check the structure to
+	 * see if something got added to the end.
+	 */
+	CHECK_MEMBER_AT_END_OF(struct fpu, state);
+	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+	return task_size;
+}
+
 /*
  * Set up the xstate_size based on the legacy FPU context size.
  *
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..975420eac105 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	*dst = *src;
+	memcpy(dst, src, arch_task_struct_size());
 
 	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426321..a0fe99485687 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			     roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(sizeof(struct task_struct), 4);
+			roundup(arch_task_struct_size(), 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= sizeof(struct task_struct);
+	notes[2].datasz	= arch_task_struct_size();
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ae21f1591615..e43a41d892b6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1522,8 +1522,6 @@ struct task_struct {
 /* hung task detection */
 	unsigned long last_switch_count;
 #endif
-/* CPU-specific state of this task */
-	struct thread_struct thread;
 /* filesystem information */
 	struct fs_struct *fs;
 /* open file information */
@@ -1778,8 +1776,18 @@ struct task_struct {
 	unsigned long	task_state_change;
 #endif
 	int pagefault_disabled;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+/*
+ * WARNING: on x86, 'thread_struct' contains a variable-sized
+ * structure.  It *MUST* be at the end of 'task_struct'.
+ *
+ * Do not put anything below here!
+ */
 };
 
+extern int arch_task_struct_size(void);
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..431b67a6098c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,15 +287,21 @@ static void set_max_threads(unsigned int max_threads_suggested)
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
+int __weak arch_task_struct_size(void)
+{
+	return sizeof(struct task_struct);
+}
+
 void __init fork_init(void)
 {
+	int task_struct_size = arch_task_struct_size();
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
+		kmem_cache_create("task_struct", task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
-- 
cgit v1.2.3


From 5aaeb5c01c5b6c0be7b7aadbf3ace9f3a4458c3d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 17 Jul 2015 12:28:12 +0200
Subject: x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and
 use it on x86

Don't burden architectures without dynamic task_struct sizing
with the overhead of dynamic sizing.

Also optimize the x86 code a bit by caching task_struct_size.

Acked-and-Tested-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437128892-9831-3-git-send-email-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/Kconfig               |  4 ++++
 arch/x86/Kconfig           |  1 +
 arch/x86/kernel/fpu/init.c | 17 +++++++++--------
 arch/x86/kernel/process.c  |  2 +-
 fs/proc/kcore.c            |  4 ++--
 include/linux/sched.h      |  6 +++++-
 kernel/fork.c              | 11 +++++------
 7 files changed, 27 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index bec6666a3cc4..8a8ea7110de8 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -221,6 +221,10 @@ config ARCH_TASK_STRUCT_ALLOCATOR
 config ARCH_THREAD_INFO_ALLOCATOR
 	bool
 
+# Select if arch wants to size task_struct dynamically via arch_task_struct_size:
+config ARCH_WANTS_DYNAMIC_TASK_STRUCT
+	bool
+
 config HAVE_REGS_AND_STACK_ACCESS_API
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3dbb7e7909ca..b3a1a5d77d92 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -41,6 +41,7 @@ config X86
 	select ARCH_USE_CMPXCHG_LOCKREF		if X86_64
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_IPC_PARSE_VERSION	if X86_32
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index deacbfa6b33e..0b39173dd971 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
 #include <asm/fpu/internal.h>
 #include <asm/tlbflush.h>
 
+#include <linux/sched.h>
+
 /*
  * Initialize the TS bit in CR0 according to the style of context-switches
  * we are using:
@@ -136,16 +138,14 @@ static void __init fpu__init_system_generic(void)
 unsigned int xstate_size;
 EXPORT_SYMBOL_GPL(xstate_size);
 
-#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER)	\
-	BUILD_BUG_ON((sizeof(TYPE) -			\
-			offsetof(TYPE, MEMBER) -	\
-			sizeof(((TYPE *)0)->MEMBER)) > 	\
-			0)				\
+/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+	BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
 
 /*
- * We append the 'struct fpu' to the task_struct.
+ * We append the 'struct fpu' to the task_struct:
  */
-int __weak arch_task_struct_size(void)
+static void __init fpu__init_task_struct_size(void)
 {
 	int task_size = sizeof(struct task_struct);
 
@@ -172,7 +172,7 @@ int __weak arch_task_struct_size(void)
 	CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
 	CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
 
-	return task_size;
+	arch_task_struct_size = task_size;
 }
 
 /*
@@ -326,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
 	fpu__init_system_generic();
 	fpu__init_system_xstate_size_legacy();
 	fpu__init_system_xstate();
+	fpu__init_task_struct_size();
 
 	fpu__init_system_ctx_switch();
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 975420eac105..397688beed4b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
  */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	memcpy(dst, src, arch_task_struct_size());
+	memcpy(dst, src, arch_task_struct_size);
 
 	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a0fe99485687..92e6726f6e37 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
 			     roundup(sizeof(CORE_STR), 4)) +
 			roundup(sizeof(struct elf_prstatus), 4) +
 			roundup(sizeof(struct elf_prpsinfo), 4) +
-			roundup(arch_task_struct_size(), 4);
+			roundup(arch_task_struct_size, 4);
 	*elf_buflen = PAGE_ALIGN(*elf_buflen);
 	return size + *elf_buflen;
 }
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	/* set up the task structure */
 	notes[2].name	= CORE_STR;
 	notes[2].type	= NT_TASKSTRUCT;
-	notes[2].datasz	= arch_task_struct_size();
+	notes[2].datasz	= arch_task_struct_size;
 	notes[2].data	= current;
 
 	nhdr->p_filesz	+= notesize(&notes[2]);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e43a41d892b6..04b5ada460b4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1786,7 +1786,11 @@ struct task_struct {
  */
 };
 
-extern int arch_task_struct_size(void);
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+extern int arch_task_struct_size __read_mostly;
+#else
+# define arch_task_struct_size (sizeof(struct task_struct))
+#endif
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
diff --git a/kernel/fork.c b/kernel/fork.c
index 431b67a6098c..dbd9b8d7b7cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,21 +287,20 @@ static void set_max_threads(unsigned int max_threads_suggested)
 	max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
 }
 
-int __weak arch_task_struct_size(void)
-{
-	return sizeof(struct task_struct);
-}
+#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
+/* Initialized by the architecture: */
+int arch_task_struct_size __read_mostly;
+#endif
 
 void __init fork_init(void)
 {
-	int task_struct_size = arch_task_struct_size();
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
-		kmem_cache_create("task_struct", task_struct_size,
+		kmem_cache_create("task_struct", arch_task_struct_size,
 			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
 #endif
 
-- 
cgit v1.2.3