From 439e7271dc2b63de379e37971dc2f64d71e24f8a Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Thu, 31 Aug 2017 16:37:41 -0400
Subject: livepatch: introduce shadow variable API

Add exported API for livepatch modules:

  klp_shadow_get()
  klp_shadow_alloc()
  klp_shadow_get_or_alloc()
  klp_shadow_free()
  klp_shadow_free_all()

that implement "shadow" variables, which allow callers to associate new
shadow fields to existing data structures.  This is intended to be used
by livepatch modules seeking to emulate additions to data structure
definitions.

See Documentation/livepatch/shadow-vars.txt for a summary of the new
shadow variable API, including a few common use cases.

See samples/livepatch/livepatch-shadow-* for example modules that
demonstrate shadow variables.

[jkosina@suse.cz: fix __klp_shadow_get_or_alloc() comment as spotted by
 Josh]
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/Makefile |   2 +-
 kernel/livepatch/shadow.c | 277 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 kernel/livepatch/shadow.c

(limited to 'kernel')

diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index 2b8bdb1925da..b36ceda6488e 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_LIVEPATCH) += livepatch.o
 
-livepatch-objs := core.o patch.o transition.o
+livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
new file mode 100644
index 000000000000..67e4360521f3
--- /dev/null
+++ b/kernel/livepatch/shadow.c
@@ -0,0 +1,277 @@
+/*
+ * shadow.c - Shadow Variables
+ *
+ * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: Shadow variable API concurrency notes:
+ *
+ * The shadow variable API provides a simple relationship between an
+ * <obj, id> pair and a pointer value.  It is the responsibility of the
+ * caller to provide any mutual exclusion required of the shadow data.
+ *
+ * Once a shadow variable is attached to its parent object via the
+ * klp_shadow_*alloc() API calls, it is considered live: any subsequent
+ * call to klp_shadow_get() may then return the shadow variable's data
+ * pointer.  Callers of klp_shadow_*alloc() should prepare shadow data
+ * accordingly.
+ *
+ * The klp_shadow_*alloc() API calls may allocate memory for new shadow
+ * variable structures.  Their implementation does not call kmalloc
+ * inside any spinlocks, but API callers should pass GFP flags according
+ * to their specific needs.
+ *
+ * The klp_shadow_hash is an RCU-enabled hashtable and is safe against
+ * concurrent klp_shadow_free() and klp_shadow_get() operations.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/livepatch.h>
+
+static DEFINE_HASHTABLE(klp_shadow_hash, 12);
+
+/*
+ * klp_shadow_lock provides exclusive access to the klp_shadow_hash and
+ * the shadow variables it references.
+ */
+static DEFINE_SPINLOCK(klp_shadow_lock);
+
+/**
+ * struct klp_shadow - shadow variable structure
+ * @node:	klp_shadow_hash hash table node
+ * @rcu_head:	RCU is used to safely free this structure
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	data area
+ */
+struct klp_shadow {
+	struct hlist_node node;
+	struct rcu_head rcu_head;
+	void *obj;
+	unsigned long id;
+	char data[];
+};
+
+/**
+ * klp_shadow_match() - verify a shadow variable matches given <obj, id>
+ * @shadow:	shadow variable to match
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * Return: true if the shadow variable matches.
+ */
+static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj,
+				unsigned long id)
+{
+	return shadow->obj == obj && shadow->id == id;
+}
+
+/**
+ * klp_shadow_get() - retrieve a shadow variable data pointer
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get(void *obj, unsigned long id)
+{
+	struct klp_shadow *shadow;
+
+	rcu_read_lock();
+
+	hash_for_each_possible_rcu(klp_shadow_hash, shadow, node,
+				   (unsigned long)obj) {
+
+		if (klp_shadow_match(shadow, obj, id)) {
+			rcu_read_unlock();
+			return shadow->data;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get);
+
+void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+		       size_t size, gfp_t gfp_flags, bool warn_on_exist)
+{
+	struct klp_shadow *new_shadow;
+	void *shadow_data;
+	unsigned long flags;
+
+	/* Check if the shadow variable already exists */
+	shadow_data = klp_shadow_get(obj, id);
+	if (shadow_data)
+		goto exists;
+
+	/* Allocate a new shadow variable for use inside the lock below */
+	new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
+	if (!new_shadow)
+		return NULL;
+
+	new_shadow->obj = obj;
+	new_shadow->id = id;
+
+	/* Initialize the shadow variable if data provided */
+	if (data)
+		memcpy(new_shadow->data, data, size);
+
+	/* Look for <obj, id> again under the lock */
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+	shadow_data = klp_shadow_get(obj, id);
+	if (unlikely(shadow_data)) {
+		/*
+		 * Shadow variable was found, throw away speculative
+		 * allocation.
+		 */
+		spin_unlock_irqrestore(&klp_shadow_lock, flags);
+		kfree(new_shadow);
+		goto exists;
+	}
+
+	/* No <obj, id> found, so attach the newly allocated one */
+	hash_add_rcu(klp_shadow_hash, &new_shadow->node,
+		     (unsigned long)new_shadow->obj);
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+
+	return new_shadow->data;
+
+exists:
+	if (warn_on_exist) {
+		WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id);
+		return NULL;
+	}
+
+	return shadow_data;
+}
+
+/**
+ * klp_shadow_alloc() - allocate and add a new shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	pointer to data to attach to parent
+ * @size:	size of attached data
+ * @gfp_flags:	GFP mask for allocation
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags
+ * and copies @size bytes from @data into the new shadow variable's own
+ * data space.  If @data is NULL, @size bytes are still allocated, but
+ * no copy is performed.  The new shadow variable is then added to the
+ * global hashtable.
+ *
+ * If an existing <obj, id> shadow variable can be found, this routine
+ * will issue a WARN, exit early and return NULL.
+ *
+ * Return: the shadow variable data element, NULL on duplicate or
+ * failure.
+ */
+void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
+		       size_t size, gfp_t gfp_flags)
+{
+	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_alloc);
+
+/**
+ * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	pointer to data to attach to parent
+ * @size:	size of attached data
+ * @gfp_flags:	GFP mask for allocation
+ *
+ * Returns a pointer to existing shadow data if an <obj, id> shadow
+ * variable is already present.  Otherwise, it creates a new shadow
+ * variable like klp_shadow_alloc().
+ *
+ * This function guarantees that only one shadow variable exists with
+ * the given @id for the given @obj.  It also guarantees that the shadow
+ * variable will be initialized by the given @data only when it did not
+ * exist before.
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+			       size_t size, gfp_t gfp_flags)
+{
+	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
+
+/**
+ * klp_shadow_free() - detach and free a <obj, id> shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * This function releases the memory for this <obj, id> shadow variable
+ * instance, callers should stop referencing it accordingly.
+ */
+void klp_shadow_free(void *obj, unsigned long id)
+{
+	struct klp_shadow *shadow;
+	unsigned long flags;
+
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+
+	/* Delete <obj, id> from hash */
+	hash_for_each_possible(klp_shadow_hash, shadow, node,
+			       (unsigned long)obj) {
+
+		if (klp_shadow_match(shadow, obj, id)) {
+			hash_del_rcu(&shadow->node);
+			kfree_rcu(shadow, rcu_head);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free);
+
+/**
+ * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * @id:		data identifier
+ *
+ * This function releases the memory for all <*, id> shadow variable
+ * instances, callers should stop referencing them accordingly.
+ */
+void klp_shadow_free_all(unsigned long id)
+{
+	struct klp_shadow *shadow;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+
+	/* Delete all <*, id> from hash */
+	hash_for_each(klp_shadow_hash, i, shadow, node) {
+		if (klp_shadow_match(shadow, shadow->obj, id)) {
+			hash_del_rcu(&shadow->node);
+			kfree_rcu(shadow, rcu_head);
+		}
+	}
+
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free_all);
-- 
cgit v1.2.3


From 5d9da759f7587c87252ef98e70bc0b4a89e4d036 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 14 Sep 2017 14:15:36 -0700
Subject: livepatch: __klp_shadow_get_or_alloc() is local to shadow.c

... therefore make it static.

Fixes: 439e7271dc2 ("livepatch: introduce shadow variable API")
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/shadow.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index 67e4360521f3..fdac27588d60 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -113,7 +113,7 @@ void *klp_shadow_get(void *obj, unsigned long id)
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get);
 
-void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
 		       size_t size, gfp_t gfp_flags, bool warn_on_exist)
 {
 	struct klp_shadow *new_shadow;
-- 
cgit v1.2.3


From e454cf5958538666635488030046b6a84a22d447 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Mon, 18 Sep 2017 15:30:55 -0400
Subject: bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE

This is a simple non-recursive delete operation.  It prunes paths
of empty nodes in the tree, but it does not try to further compress
the tree as nodes are removed.

Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..9d58a576b2ae 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,84 @@ out:
 	return ret;
 }
 
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
 {
-	/* TODO */
-	return -ENOSYS;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key;
+	struct lpm_trie_node __rcu **trim;
+	struct lpm_trie_node *node;
+	unsigned long irq_flags;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/* Walk the tree looking for an exact key/length match and keeping
+	 * track of where we could begin trimming the tree.  The trim-point
+	 * is the sub-tree along the walk consisting of only single-child
+	 * intermediate nodes and ending at a leaf node that we want to
+	 * remove.
+	 */
+	trim = &trie->root;
+	node = rcu_dereference_protected(
+		trie->root, lockdep_is_held(&trie->lock));
+	while (node) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		/* If we hit a node that has more than one child or is a valid
+		 * prefix itself, do not remove it. Reset the root of the trim
+		 * path to its descendant on our path.
+		 */
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
+		    (node->child[0] && node->child[1]))
+			trim = &node->child[next_bit];
+		node = rcu_dereference_protected(
+			node->child[next_bit], lockdep_is_held(&trie->lock));
+	}
+
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	trie->n_entries--;
+
+	/* If the node we are removing is not a leaf node, simply mark it
+	 * as intermediate and we are done.
+	 */
+	if (rcu_access_pointer(node->child[0]) ||
+	    rcu_access_pointer(node->child[1])) {
+		node->flags |= LPM_TREE_NODE_FLAG_IM;
+		goto out;
+	}
+
+	/* trim should now point to the slot holding the start of a path from
+	 * zero or more intermediate nodes to our leaf node for deletion.
+	 */
+	while ((node = rcu_dereference_protected(
+			*trim, lockdep_is_held(&trie->lock)))) {
+		RCU_INIT_POINTER(*trim, NULL);
+		trim = rcu_access_pointer(node->child[0]) ?
+			&node->child[0] :
+			&node->child[1];
+		kfree_rcu(node, rcu);
+	}
+
+out:
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
 }
 
 #define LPM_DATA_SIZE_MAX	256
-- 
cgit v1.2.3


From f454322efbf6faee695f517c6b52c4dc03cacd3e Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 22 Aug 2017 02:16:11 +0300
Subject: signal: replace sigset_to_compat() with put_compat_sigset()

There are 4 callers of sigset_to_compat() in the entire kernel.  One is
in sparc compat rt_sigaction(2), the rest are in kernel/signal.c itself.
All are followed by copy_to_user(), and all but the sparc one are under
"if it's big-endian..." ifdefs.

Let's transform sigset_to_compat() into put_compat_sigset() that also
calls copy_to_user().

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/sparc/kernel/sys_sparc32.c |  6 +++---
 include/linux/compat.h          |  3 ++-
 kernel/compat.c                 | 20 ++++++++++++++------
 kernel/signal.c                 | 27 ++++++---------------------
 4 files changed, 25 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index bca44f3e6b86..5e2bec9e41b2 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -159,7 +159,6 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
 {
         struct k_sigaction new_ka, old_ka;
         int ret;
-	compat_sigset_t set32;
 
         /* XXX: Don't preclude handling different sized sigset_t's.  */
         if (sigsetsize != sizeof(compat_sigset_t))
@@ -167,6 +166,7 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
 
         if (act) {
 		u32 u_handler, u_restorer;
+		compat_sigset_t set32;
 
 		new_ka.ka_restorer = restorer;
 		ret = get_user(u_handler, &act->sa_handler);
@@ -183,9 +183,9 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
 	if (!ret && oact) {
-		sigset_to_compat(&set32, &old_ka.sa.sa_mask);
 		ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler);
-		ret |= copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t));
+		ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
+					 sizeof(oact->sa_mask));
 		ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
 		ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer);
 		if (ret)
diff --git a/include/linux/compat.h b/include/linux/compat.h
index a5619de3437d..ab1baa79abe8 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -456,7 +456,8 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
 extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
-extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set);
+extern int put_compat_sigset(compat_sigset_t __user *compat,
+			     const sigset_t *set, unsigned int size);
 
 asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
 		compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes,
diff --git a/kernel/compat.c b/kernel/compat.c
index 772e038d04d9..18dd902c9052 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -497,15 +497,23 @@ sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
 }
 EXPORT_SYMBOL_GPL(sigset_from_compat);
 
-void
-sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+		  unsigned int size)
 {
+	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+	compat_sigset_t v;
 	switch (_NSIG_WORDS) {
-	case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
-	case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
-	case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
-	case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
 	}
+	return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+	return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
 }
 
 #ifdef CONFIG_NUMA
diff --git a/kernel/signal.c b/kernel/signal.c
index 800a18f77732..14ad6bb90dad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2621,13 +2621,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 		if (error)
 			return error;
 	}
-	if (oset) {
-		compat_sigset_t old32;
-		sigset_to_compat(&old32, &old_set);
-		if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
-			return -EFAULT;
-	}
-	return 0;
+	return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
 #else
 	return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
 				  (sigset_t __user *)oset, sigsetsize);
@@ -2669,20 +2663,11 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
 COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
 		compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t set;
 	int err = do_sigpending(&set, sigsetsize);
-	if (!err) {
-		compat_sigset_t set32;
-		sigset_to_compat(&set32, &set);
-		/* we can get here only if sigsetsize <= sizeof(set) */
-		if (copy_to_user(uset, &set32, sigsetsize))
-			err = -EFAULT;
-	}
+	if (!err)
+		err = put_compat_sigset(uset, &set, sigsetsize);
 	return err;
-#else
-	return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
-#endif
 }
 #endif
 
@@ -3451,7 +3436,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 		compat_size_t, sigsetsize)
 {
 	struct k_sigaction new_ka, old_ka;
-	compat_sigset_t mask;
 #ifdef __ARCH_HAS_SA_RESTORER
 	compat_uptr_t restorer;
 #endif
@@ -3463,6 +3447,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 
 	if (act) {
 		compat_uptr_t handler;
+		compat_sigset_t mask;
 		ret = get_user(handler, &act->sa_handler);
 		new_ka.sa.sa_handler = compat_ptr(handler);
 #ifdef __ARCH_HAS_SA_RESTORER
@@ -3478,10 +3463,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 	if (!ret && oact) {
-		sigset_to_compat(&mask, &old_ka.sa.sa_mask);
 		ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 
 			       &oact->sa_handler);
-		ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+		ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
+					 sizeof(oact->sa_mask));
 		ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
 #ifdef __ARCH_HAS_SA_RESTORER
 		ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
-- 
cgit v1.2.3


From 1681634b8c70353d8ca211b9b3443889a16dafeb Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 22 Aug 2017 02:16:29 +0300
Subject: signal: simplify compat_sigpending()

Remove "if it's big-endian..." ifdef in compat_sigpending(),
use the endian-agnostic variant.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/signal.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 14ad6bb90dad..f59c05fc374a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3330,15 +3330,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t set;
 	int err = do_sigpending(&set, sizeof(set.sig[0]));
 	if (!err)
 		err = put_user(set.sig[0], set32);
 	return err;
-#else
-	return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
-#endif
 }
 #endif
 
-- 
cgit v1.2.3


From 176826af03666758c656dd27f098cc889b71638b Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 22 Aug 2017 02:16:43 +0300
Subject: signal: lift sigset size check out of do_sigpending()

As sigsetsize argument of do_sigpending() is not used anywhere else in
that function after the check, remove this argument and move the check
out of do_sigpending() into rt_sigpending() and its compat analog.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/signal.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f59c05fc374a..9fbc574ced10 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2629,11 +2629,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 }
 #endif
 
-static int do_sigpending(void *set, unsigned long sigsetsize)
+static int do_sigpending(sigset_t *set)
 {
-	if (sigsetsize > sizeof(sigset_t))
-		return -EINVAL;
-
 	spin_lock_irq(&current->sighand->siglock);
 	sigorsets(set, &current->pending.signal,
 		  &current->signal->shared_pending.signal);
@@ -2653,7 +2650,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
 SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
 {
 	sigset_t set;
-	int err = do_sigpending(&set, sigsetsize);
+	int err;
+
+	if (sigsetsize > sizeof(*uset))
+		return -EINVAL;
+
+	err = do_sigpending(&set);
 	if (!err && copy_to_user(uset, &set, sigsetsize))
 		err = -EFAULT;
 	return err;
@@ -2664,7 +2666,12 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
 		compat_size_t, sigsetsize)
 {
 	sigset_t set;
-	int err = do_sigpending(&set, sigsetsize);
+	int err;
+
+	if (sigsetsize > sizeof(*uset))
+		return -EINVAL;
+
+	err = do_sigpending(&set);
 	if (!err)
 		err = put_compat_sigset(uset, &set, sigsetsize);
 	return err;
@@ -3331,7 +3338,7 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
 COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
 {
 	sigset_t set;
-	int err = do_sigpending(&set, sizeof(set.sig[0]));
+	int err = do_sigpending(&set);
 	if (!err)
 		err = put_user(set.sig[0], set32);
 	return err;
-- 
cgit v1.2.3


From b8e8e1aa9f14110da180569908bbe538c9e9dc63 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 3 Sep 2017 20:42:54 -0400
Subject: get rid of {get,put}_compat_itimerspec()

no users left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/compat.h |  5 -----
 kernel/compat.c        | 18 ------------------
 2 files changed, 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index ab1baa79abe8..21d30be5c0a5 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -443,11 +443,6 @@ static inline int compat_timespec_compare(struct compat_timespec *lhs,
 	return lhs->tv_nsec - rhs->tv_nsec;
 }
 
-extern int get_compat_itimerspec(struct itimerspec *dst,
-				 const struct compat_itimerspec __user *src);
-extern int put_compat_itimerspec(struct compat_itimerspec __user *dst,
-				 const struct itimerspec *src);
-
 asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
 		struct timezone __user *tz);
 asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
diff --git a/kernel/compat.c b/kernel/compat.c
index 18dd902c9052..d43b18031116 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t,  pid, unsigned int, len,
 	return ret;
 }
 
-int get_compat_itimerspec(struct itimerspec *dst,
-			  const struct compat_itimerspec __user *src)
-{
-	if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
-	    __compat_get_timespec(&dst->it_value, &src->it_value))
-		return -EFAULT;
-	return 0;
-}
-
-int put_compat_itimerspec(struct compat_itimerspec __user *dst,
-			  const struct itimerspec *src)
-{
-	if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
-	    __compat_put_timespec(&src->it_value, &dst->it_value))
-		return -EFAULT;
-	return 0;
-}
-
 int get_compat_itimerspec64(struct itimerspec64 *its,
 			const struct compat_itimerspec __user *uits)
 {
-- 
cgit v1.2.3


From 3968cf623892d710e651070243fd16af312a9797 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 3 Sep 2017 21:45:17 -0400
Subject: get_compat_sigset()

similar to put_compat_sigset()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/sparc/kernel/sys_sparc32.c |  4 +---
 fs/eventpoll.c                  |  4 +---
 fs/select.c                     |  8 ++------
 fs/signalfd.c                   |  4 +---
 include/linux/compat.h          |  2 +-
 kernel/compat.c                 | 23 ++++++++++++++++-------
 kernel/signal.c                 | 27 ++++-----------------------
 virt/kvm/kvm_main.c             |  7 ++-----
 8 files changed, 28 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index 5e2bec9e41b2..34ece61ee970 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -166,13 +166,11 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
 
         if (act) {
 		u32 u_handler, u_restorer;
-		compat_sigset_t set32;
 
 		new_ka.ka_restorer = restorer;
 		ret = get_user(u_handler, &act->sa_handler);
 		new_ka.sa.sa_handler =  compat_ptr(u_handler);
-		ret |= copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t));
-		sigset_from_compat(&new_ka.sa.sa_mask, &set32);
+		ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
 		ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
 		ret |= get_user(u_restorer, &act->sa_restorer);
 		new_ka.sa.sa_restorer = compat_ptr(u_restorer);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..396a3c075fd4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2259,7 +2259,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 			compat_size_t, sigsetsize)
 {
 	long err;
-	compat_sigset_t csigmask;
 	sigset_t ksigmask, sigsaved;
 
 	/*
@@ -2269,9 +2268,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &csigmask);
 		sigsaved = current->blocked;
 		set_current_blocked(&ksigmask);
 	}
diff --git a/fs/select.c b/fs/select.c
index 20a7d061904f..9c980162c9fe 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1301,7 +1301,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
 	compat_size_t sigsetsize)
 {
-	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
@@ -1318,9 +1317,8 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
 
 		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
@@ -1369,7 +1367,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
 	unsigned int,  nfds, struct compat_timespec __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
-	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
@@ -1386,9 +1383,8 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
 
 		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d2c434112f42..9de5beeb771d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -312,15 +312,13 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
 		     compat_size_t, sigsetsize,
 		     int, flags)
 {
-	compat_sigset_t ss32;
 	sigset_t tmp;
 	sigset_t __user *ksigmask;
 
 	if (sigsetsize != sizeof(compat_sigset_t))
 		return -EINVAL;
-	if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+	if (get_compat_sigset(&tmp, sigmask))
 		return -EFAULT;
-	sigset_from_compat(&tmp, &ss32);
 	ksigmask = compat_alloc_user_space(sizeof(sigset_t));
 	if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
 		return -EFAULT;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 21d30be5c0a5..57cb6ecafa86 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -450,7 +450,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
-extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
+extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat);
 extern int put_compat_sigset(compat_sigset_t __user *compat,
 			     const sigset_t *set, unsigned int size);
 
diff --git a/kernel/compat.c b/kernel/compat.c
index d43b18031116..a46a4a40bb8b 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -467,17 +467,26 @@ Efault:
 	return -EFAULT;
 }
 
-void
-sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
+int
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
 {
+#ifdef __BIG_ENDIAN
+	compat_sigset_t v;
+	if (copy_from_user(&v, compat, sizeof(compat_sigset_t)))
+		return -EFAULT;
 	switch (_NSIG_WORDS) {
-	case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
-	case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
-	case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
-	case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+	case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+	case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+	case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+	case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
 	}
+#else
+	if (copy_from_user(set, compat, sizeof(compat_sigset_t)))
+		return -EFAULT;
+#endif
+	return 0;
 }
-EXPORT_SYMBOL_GPL(sigset_from_compat);
+EXPORT_SYMBOL_GPL(get_compat_sigset);
 
 int
 put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
diff --git a/kernel/signal.c b/kernel/signal.c
index 9fbc574ced10..36a523640894 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2600,7 +2600,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
 COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 		compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t old_set = current->blocked;
 
 	/* XXX: Don't preclude handling different sized sigset_t's.  */
@@ -2608,13 +2607,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 		return -EINVAL;
 
 	if (nset) {
-		compat_sigset_t new32;
 		sigset_t new_set;
 		int error;
-		if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+		if (get_compat_sigset(&new_set, nset))
 			return -EFAULT;
-
-		sigset_from_compat(&new_set, &new32);
 		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
 
 		error = sigprocmask(how, &new_set, NULL);
@@ -2622,10 +2618,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 			return error;
 	}
 	return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
-#else
-	return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
-				  (sigset_t __user *)oset, sigsetsize);
-#endif
 }
 #endif
 
@@ -2908,7 +2900,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 		struct compat_siginfo __user *, uinfo,
 		struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
 {
-	compat_sigset_t s32;
 	sigset_t s;
 	struct timespec t;
 	siginfo_t info;
@@ -2917,9 +2908,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
 
-	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+	if (get_compat_sigset(&s, uthese))
 		return -EFAULT;
-	sigset_from_compat(&s, &s32);
 
 	if (uts) {
 		if (compat_get_timespec(&t, uts))
@@ -3450,18 +3440,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 
 	if (act) {
 		compat_uptr_t handler;
-		compat_sigset_t mask;
 		ret = get_user(handler, &act->sa_handler);
 		new_ka.sa.sa_handler = compat_ptr(handler);
 #ifdef __ARCH_HAS_SA_RESTORER
 		ret |= get_user(restorer, &act->sa_restorer);
 		new_ka.sa.sa_restorer = compat_ptr(restorer);
 #endif
-		ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+		ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
 		ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
 		if (ret)
 			return -EFAULT;
-		sigset_from_compat(&new_ka.sa.sa_mask, &mask);
 	}
 
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
@@ -3649,22 +3637,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t newset;
-	compat_sigset_t newset32;
 
 	/* XXX: Don't preclude handling different sized sigset_t's.  */
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
 
-	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+	if (get_compat_sigset(&newset, unewset))
 		return -EFAULT;
-	sigset_from_compat(&newset, &newset32);
 	return sigsuspend(&newset);
-#else
-	/* on little-endian bitmaps don't care about granularity */
-	return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
-#endif
 }
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9deb5a245b83..99bfe50a0589 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2724,7 +2724,6 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
 	case KVM_SET_SIGNAL_MASK: {
 		struct kvm_signal_mask __user *sigmask_arg = argp;
 		struct kvm_signal_mask kvm_sigmask;
-		compat_sigset_t csigset;
 		sigset_t sigset;
 
 		if (argp) {
@@ -2733,13 +2732,11 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
 					   sizeof(kvm_sigmask)))
 				goto out;
 			r = -EINVAL;
-			if (kvm_sigmask.len != sizeof(csigset))
+			if (kvm_sigmask.len != sizeof(compat_sigset_t))
 				goto out;
 			r = -EFAULT;
-			if (copy_from_user(&csigset, sigmask_arg->sigset,
-					   sizeof(csigset)))
+			if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
 				goto out;
-			sigset_from_compat(&sigset, &csigset);
 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
 		} else
 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
-- 
cgit v1.2.3


From abca5fc535a3ee0f36fb6d4468a453eaae769921 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Sep 2017 18:17:46 -0400
Subject: sched_rr_get_interval(): move compat to native, get rid of set_fs()

switch to using timespec64 internally, while we are at it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/compat.c     | 16 ----------------
 kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------
 2 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index a46a4a40bb8b..d1cee656a7ed 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -562,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
 }
 #endif
 
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
-		       compat_pid_t, pid,
-		       struct compat_timespec __user *, interval)
-{
-	struct timespec t;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
-	set_fs(old_fs);
-	if (compat_put_timespec(&t, interval))
-		return -EFAULT;
-	return ret;
-}
-
 /*
  * Allocate user-space memory for the duration of a single system call,
  * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..e74f0a5a8647 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
 #include <linux/init_task.h>
 #include <linux/context_tracking.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/compat.h>
 
 #include <linux/blkdev.h>
 #include <linux/kprobes.h>
@@ -5098,13 +5099,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
  * an error code.
  */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
-		struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
 {
 	struct task_struct *p;
 	unsigned int time_slice;
 	struct rq_flags rf;
-	struct timespec t;
 	struct rq *rq;
 	int retval;
 
@@ -5128,15 +5127,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 	task_rq_unlock(rq, p, &rf);
 
 	rcu_read_unlock();
-	jiffies_to_timespec(time_slice, &t);
-	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
-	return retval;
+	jiffies_to_timespec64(time_slice, t);
+	return 0;
 
 out_unlock:
 	rcu_read_unlock();
 	return retval;
 }
 
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct timespec64 t;
+	int retval = sched_rr_get_interval(pid, &t);
+
+	if (retval == 0)
+		retval = put_timespec64(&t, interval);
+
+	return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+		       compat_pid_t, pid,
+		       struct compat_timespec __user *, interval)
+{
+	struct timespec64 t;
+	int retval = sched_rr_get_interval(pid, &t);
+
+	if (retval == 0)
+		retval = compat_put_timespec64(&t, interval);
+	return retval;
+}
+#endif
+
 void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
-- 
cgit v1.2.3


From cfb766da54d98ceb145e1bb0bd11c559569dcbfc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 08:12:04 -0700
Subject: sched/cputime: Expose cputime_adjust()

Will be used by basic cgroup resource stat reporting later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/sched/cputime.h | 3 ++-
 kernel/sched/cputime.c        | 5 ++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 4c5b9735c1ae..9251044335c5 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -53,7 +53,8 @@ static inline void task_cputime_scaled(struct task_struct *t,
 
 extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
-
+extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+			   u64 *ut, u64 *st);
 
 /*
  * Thread group CPU time accounting.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf97c53..8839b6e8a104 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -585,9 +585,8 @@ drop_precision:
  *
  * Assuming that rtime_i+1 >= rtime_i.
  */
-static void cputime_adjust(struct task_cputime *curr,
-			   struct prev_cputime *prev,
-			   u64 *ut, u64 *st)
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+		    u64 *ut, u64 *st)
 {
 	u64 rtime, stime, utime;
 	unsigned long flags;
-- 
cgit v1.2.3


From d2cc5ed6949085cfba30ec5228816cf6eb1d02b9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 08:12:04 -0700
Subject: cpuacct: Introduce cgroup_account_cputime[_field]()

Introduce cgroup_account_cputime[_field]() which wrap cpuacct_charge()
and cgroup_account_field().  This doesn't introduce any functional
changes and will be used to add cgroup basic resource accounting.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
---
 include/linux/cgroup.h   | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/sched/cpuacct.h   | 17 -----------------
 kernel/sched/cputime.c   |  2 +-
 kernel/sched/deadline.c  |  2 +-
 kernel/sched/fair.c      |  2 +-
 kernel/sched/rt.c        |  2 +-
 kernel/sched/sched.h     |  2 +-
 kernel/sched/stop_task.c |  2 +-
 8 files changed, 44 insertions(+), 23 deletions(-)
 delete mode 100644 kernel/sched/cpuacct.h

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d023ac5e377f..6cd579329310 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,7 @@
 #include <linux/nsproxy.h>
 #include <linux/user_namespace.h>
 #include <linux/refcount.h>
+#include <linux/kernel_stat.h>
 
 #include <linux/cgroup-defs.h>
 
@@ -688,6 +689,43 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
 	char *buf, size_t buflen) {}
 #endif /* !CONFIG_CGROUPS */
 
+/*
+ * Basic resource stats.
+ */
+#ifdef CONFIG_CGROUPS
+
+#ifdef CONFIG_CGROUP_CPUACCT
+void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_account_field(struct task_struct *tsk, int index,
+					 u64 val) {}
+#endif
+
+static inline void cgroup_account_cputime(struct task_struct *task,
+					  u64 delta_exec)
+{
+	cpuacct_charge(task, delta_exec);
+}
+
+static inline void cgroup_account_cputime_field(struct task_struct *task,
+						enum cpu_usage_stat index,
+						u64 delta_exec)
+{
+	cpuacct_account_field(task, index, delta_exec);
+}
+
+#else	/* CONFIG_CGROUPS */
+
+static inline void cgroup_account_cputime(struct task_struct *task,
+					  u64 delta_exec) {}
+static inline void cgroup_account_cputime_field(struct task_struct *task,
+						enum cpu_usage_stat index,
+						u64 delta_exec) {}
+
+#endif	/* CONFIG_CGROUPS */
+
 /*
  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
  * definition in cgroup-defs.h.
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
deleted file mode 100644
index ba72807c73d4..000000000000
--- a/kernel/sched/cpuacct.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifdef CONFIG_CGROUP_CPUACCT
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
-
-#else
-
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-}
-
-static inline void
-cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
-{
-}
-
-#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8839b6e8a104..e01b699bbd5b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
 	 */
 	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
-	cpuacct_account_field(p, index, tmp);
+	cgroup_account_cputime_field(p, index, tmp);
 }
 
 /*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0191ec7667c3..abd913c1b99e 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1143,7 +1143,7 @@ static void update_curr_dl(struct rq *rq)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..0ae69af95b8b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -851,7 +851,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		struct task_struct *curtask = task_of(curr);
 
 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
-		cpuacct_charge(curtask, delta_exec);
+		cgroup_account_cputime(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0af5ca9e3e3f..fdc2c5d1f82e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -979,7 +979,7 @@ static void update_curr_rt(struct rq *rq)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14db76cd496f..f0b98f978843 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -29,6 +29,7 @@
 #include <linux/irq_work.h>
 #include <linux/tick.h>
 #include <linux/slab.h>
+#include <linux/cgroup.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
@@ -36,7 +37,6 @@
 
 #include "cpupri.h"
 #include "cpudeadline.h"
-#include "cpuacct.h"
 
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 9f69fb630853..ec0bb5ab9024 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -71,7 +71,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 }
 
 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
-- 
cgit v1.2.3


From 041cd640b2f3c5607171c59d8712b503659d21f7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 08:12:05 -0700
Subject: cgroup: Implement cgroup2 basic CPU usage accounting

In cgroup1, while cpuacct isn't actually controlling any resources, it
is a separate controller due to combination of two factors -
1. enabling cpu controller has significant side effects, and 2. we
have to pick one of the hierarchies to account CPU usages on.  cpuacct
controller is effectively used to designate a hierarchy to track CPU
usages on.

cgroup2's unified hierarchy removes the second reason and we can
account basic CPU usages by default.  While we can use cpuacct for
this purpose, both its interface and implementation leave a lot to be
desired - it collects and exposes two sources of truth which don't
agree with each other and some of the exposed statistics don't make
much sense.  Also, it propagates all the way up the hierarchy on each
accounting event which is unnecessary.

This patch adds basic resource accounting mechanism to cgroup2's
unified hierarchy and accounts CPU usages using it.

* All accountings are done per-cpu and don't propagate immediately.
  It just bumps the per-cgroup per-cpu counters and links to the
  parent's updated list if not already on it.

* On a read, the per-cpu counters are collected into the global ones
  and then propagated upwards.  Only the per-cpu counters which have
  changed since the last read are propagated.

* CPU usage stats are collected and shown in "cgroup.stat" with "cpu."
  prefix.  Total usage is collected from scheduling events.  User/sys
  breakdown is sourced from tick sampling and adjusted to the usage
  using cputime_adjust().

This keeps the accounting side hot path O(1) and per-cpu and the read
side O(nr_updated_since_last_read).

v2: Minor changes and documentation updates as suggested by Waiman and
    Roman.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Roman Gushchin <guro@fb.com>
---
 Documentation/cgroup-v2.txt     |   9 ++
 include/linux/cgroup-defs.h     |  57 +++++++
 include/linux/cgroup.h          |  22 +++
 kernel/cgroup/Makefile          |   2 +-
 kernel/cgroup/cgroup-internal.h |   8 +
 kernel/cgroup/cgroup.c          |  24 ++-
 kernel/cgroup/stat.c            | 334 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 453 insertions(+), 3 deletions(-)
 create mode 100644 kernel/cgroup/stat.c

(limited to 'kernel')

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index dc44785dc0fa..3f8216912df0 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -886,6 +886,15 @@ All cgroup core files are prefixed with "cgroup."
 		A dying cgroup can consume system resources not exceeding
 		limits, which were active at the moment of cgroup deletion.
 
+	  cpu.usage_usec
+		CPU time consumed in the subtree.
+
+	  cpu.user_usec
+		User CPU time consumed in the subtree.
+
+	  cpu.system_usec
+		System CPU time consumed in the subtree.
+
 
 Controllers
 ===========
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ade4a78a54c2..3e55bbd31ad1 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
 
@@ -254,6 +255,57 @@ struct css_set {
 	struct rcu_head rcu_head;
 };
 
+/*
+ * cgroup basic resource usage statistics.  Accounting is done per-cpu in
+ * cgroup_cpu_stat which is then lazily propagated up the hierarchy on
+ * reads.
+ *
+ * When a stat gets updated, the cgroup_cpu_stat and its ancestors are
+ * linked into the updated tree.  On the following read, propagation only
+ * considers and consumes the updated tree.  This makes reading O(the
+ * number of descendants which have been active since last read) instead of
+ * O(the total number of descendants).
+ *
+ * This is important because there can be a lot of (draining) cgroups which
+ * aren't active and stat may be read frequently.  The combination can
+ * become very expensive.  By propagating selectively, increasing reading
+ * frequency decreases the cost of each read.
+ */
+struct cgroup_cpu_stat {
+	/*
+	 * ->sync protects all the current counters.  These are the only
+	 * fields which get updated in the hot path.
+	 */
+	struct u64_stats_sync sync;
+	struct task_cputime cputime;
+
+	/*
+	 * Snapshots at the last reading.  These are used to calculate the
+	 * deltas to propagate to the global counters.
+	 */
+	struct task_cputime last_cputime;
+
+	/*
+	 * Child cgroups with stat updates on this cpu since the last read
+	 * are linked on the parent's ->updated_children through
+	 * ->updated_next.
+	 *
+	 * In addition to being more compact, singly-linked list pointing
+	 * to the cgroup makes it unnecessary for each per-cpu struct to
+	 * point back to the associated cgroup.
+	 *
+	 * Protected by per-cpu cgroup_cpu_stat_lock.
+	 */
+	struct cgroup *updated_children;	/* terminated by self cgroup */
+	struct cgroup *updated_next;		/* NULL iff not on the list */
+};
+
+struct cgroup_stat {
+	/* per-cpu statistics are collected into the folowing global counters */
+	struct task_cputime cputime;
+	struct prev_cputime prev_cputime;
+};
+
 struct cgroup {
 	/* self css with NULL ->ss, points back to this cgroup */
 	struct cgroup_subsys_state self;
@@ -353,6 +405,11 @@ struct cgroup {
 	 */
 	struct cgroup *dom_cgrp;
 
+	/* cgroup basic resource statistics */
+	struct cgroup_cpu_stat __percpu *cpu_stat;
+	struct cgroup_stat pending_stat;	/* pending from children */
+	struct cgroup_stat stat;
+
 	/*
 	 * list of pidlists, up to two for each namespace (one for procs, one
 	 * for tasks); created on demand.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6cd579329310..328a70ce0e23 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -703,17 +703,39 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index,
 					 u64 val) {}
 #endif
 
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix);
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec);
+
 static inline void cgroup_account_cputime(struct task_struct *task,
 					  u64 delta_exec)
 {
+	struct cgroup *cgrp;
+
 	cpuacct_charge(task, delta_exec);
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	if (cgroup_parent(cgrp))
+		__cgroup_account_cputime(cgrp, delta_exec);
+	rcu_read_unlock();
 }
 
 static inline void cgroup_account_cputime_field(struct task_struct *task,
 						enum cpu_usage_stat index,
 						u64 delta_exec)
 {
+	struct cgroup *cgrp;
+
 	cpuacct_account_field(task, index, delta_exec);
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	if (cgroup_parent(cgrp))
+		__cgroup_account_cputime_field(cgrp, index, delta_exec);
+	rcu_read_unlock();
 }
 
 #else	/* CONFIG_CGROUPS */
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ce693ccb8c58..0acee616e06c 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,4 +1,4 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5151ff256c29..fa642c99586a 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -199,6 +199,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
+/*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_boot(void);
+
 /*
  * namespace.c
  */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..d036625556c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
 
+	cgroup_stat_show_cputime(seq, "cpu.");
+
 	return 0;
 }
 
@@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work)
 			 */
 			cgroup_put(cgroup_parent(cgrp));
 			kernfs_put(cgrp->kn);
+			if (cgroup_on_dfl(cgrp))
+				cgroup_stat_exit(cgrp);
 			kfree(cgrp);
 		} else {
 			/*
@@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		trace_cgroup_release(cgrp);
 
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_flush(cgrp);
+
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
 		     tcgrp = cgroup_parent(tcgrp))
 			tcgrp->nr_dying_descendants--;
@@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (ret)
 		goto out_free_cgrp;
 
+	if (cgroup_on_dfl(parent)) {
+		ret = cgroup_stat_init(cgrp);
+		if (ret)
+			goto out_cancel_ref;
+	}
+
 	/*
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
 	 * a half-baked cgroup.
@@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		ret = -ENOMEM;
-		goto out_cancel_ref;
+		goto out_stat_exit;
 	}
 
 	init_cgroup_housekeeping(cgrp);
@@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 
 	return cgrp;
 
+out_stat_exit:
+	if (cgroup_on_dfl(parent))
+		cgroup_stat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5148,6 +5166,8 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+	cgroup_stat_boot();
+
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
 	 * avoid it at the cost of forcing all readers into the slow path.
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..9cce79e89320
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+	return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated.  Put it on the parent's matching
+ * cpu_stat->updated_children list.  See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+	struct cgroup *parent;
+	unsigned long flags;
+
+	/*
+	 * Speculative already-on-list test.  This may race leading to
+	 * temporary inaccuracies, which is fine.
+	 *
+	 * Because @parent's updated_children is terminated with @parent
+	 * instead of NULL, we can tell whether @cgrp is on the list by
+	 * testing the next pointer for NULL.
+	 */
+	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+		return;
+
+	raw_spin_lock_irqsave(cpu_lock, flags);
+
+	/* put @cgrp and all ancestors on the corresponding updated lists */
+	for (parent = cgroup_parent(cgrp); parent;
+	     cgrp = parent, parent = cgroup_parent(cgrp)) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+		/*
+		 * Both additions and removals are bottom-up.  If a cgroup
+		 * is already in the tree, all ancestors are.
+		 */
+		if (cstat->updated_next)
+			break;
+
+		cstat->updated_next = pcstat->updated_children;
+		pcstat->updated_children = cgrp;
+	}
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated cpu_stat tree on @cpu from @root.  %NULL @pos starts
+ * the traversal and %NULL return indicates the end.  During traversal,
+ * each returned cgroup is unlinked from the tree.  Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+						  struct cgroup *root, int cpu)
+{
+	struct cgroup_cpu_stat *cstat;
+	struct cgroup *parent;
+
+	if (pos == root)
+		return NULL;
+
+	/*
+	 * We're gonna walk down to the first leaf and visit/remove it.  We
+	 * can pick whatever unvisited node as the starting point.
+	 */
+	if (!pos)
+		pos = root;
+	else
+		pos = cgroup_parent(pos);
+
+	/* walk down to the first leaf */
+	while (true) {
+		cstat = cgroup_cpu_stat(pos, cpu);
+		if (cstat->updated_children == pos)
+			break;
+		pos = cstat->updated_children;
+	}
+
+	/*
+	 * Unlink @pos from the tree.  As the updated_children list is
+	 * singly linked, we have to walk it to find the removal point.
+	 * However, due to the way we traverse, @pos will be the first
+	 * child in most cases. The only exception is @root.
+	 */
+	parent = cgroup_parent(pos);
+	if (parent && cstat->updated_next) {
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+		struct cgroup_cpu_stat *ncstat;
+		struct cgroup **nextp;
+
+		nextp = &pcstat->updated_children;
+		while (true) {
+			ncstat = cgroup_cpu_stat(*nextp, cpu);
+			if (*nextp == pos)
+				break;
+
+			WARN_ON_ONCE(*nextp == parent);
+			nextp = &ncstat->updated_next;
+		}
+
+		*nextp = cstat->updated_next;
+		cstat->updated_next = NULL;
+	}
+
+	return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+				   struct cgroup_stat *src_stat)
+{
+	dst_stat->cputime.utime += src_stat->cputime.utime;
+	dst_stat->cputime.stime += src_stat->cputime.stime;
+	dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+	struct task_cputime *last_cputime = &cstat->last_cputime;
+	struct task_cputime cputime;
+	struct cgroup_stat delta;
+	unsigned seq;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	/* fetch the current per-cpu values */
+	do {
+		seq = __u64_stats_fetch_begin(&cstat->sync);
+		cputime = cstat->cputime;
+	} while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+	/* accumulate the deltas to propgate */
+	delta.cputime.utime = cputime.utime - last_cputime->utime;
+	delta.cputime.stime = cputime.stime - last_cputime->stime;
+	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+					 last_cputime->sum_exec_runtime;
+	*last_cputime = cputime;
+
+	/* transfer the pending stat into delta */
+	cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+	memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+	/* propagate delta into the global stat and the parent's pending */
+	cgroup_stat_accumulate(&cgrp->stat, &delta);
+	if (parent)
+		cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+	int cpu;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	for_each_possible_cpu(cpu) {
+		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+		struct cgroup *pos = NULL;
+
+		raw_spin_lock_irq(cpu_lock);
+		while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+			cgroup_cpu_stat_flush_one(pos, cpu);
+		raw_spin_unlock_irq(cpu_lock);
+	}
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards.  After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_stat_mutex);
+	cgroup_stat_flush_locked(cgrp);
+	mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = get_cpu_ptr(cgrp->cpu_stat);
+	u64_stats_update_begin(&cstat->sync);
+	return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+					struct cgroup_cpu_stat *cstat)
+{
+	u64_stats_update_end(&cstat->sync);
+	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+	put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+	cstat->cputime.sum_exec_runtime += delta_exec;
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+	switch (index) {
+	case CPUTIME_USER:
+	case CPUTIME_NICE:
+		cstat->cputime.utime += delta_exec;
+		break;
+	case CPUTIME_SYSTEM:
+	case CPUTIME_IRQ:
+	case CPUTIME_SOFTIRQ:
+		cstat->cputime.stime += delta_exec;
+		break;
+	default:
+		break;
+	}
+
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	u64 usage, utime, stime;
+
+	if (!cgroup_parent(cgrp))
+		return;
+
+	mutex_lock(&cgroup_stat_mutex);
+
+	cgroup_stat_flush_locked(cgrp);
+
+	usage = cgrp->stat.cputime.sum_exec_runtime;
+	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+		       &utime, &stime);
+
+	mutex_unlock(&cgroup_stat_mutex);
+
+	do_div(usage, NSEC_PER_USEC);
+	do_div(utime, NSEC_PER_USEC);
+	do_div(stime, NSEC_PER_USEC);
+
+	seq_printf(seq, "%susage_usec %llu\n"
+		   "%suser_usec %llu\n"
+		   "%ssystem_usec %llu\n",
+		   prefix, usage, prefix, utime, prefix, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+	int cpu;
+
+	/* the root cgrp has cpu_stat preallocated */
+	if (!cgrp->cpu_stat) {
+		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+		if (!cgrp->cpu_stat)
+			return -ENOMEM;
+	}
+
+	/* ->updated_children list is self terminated */
+	for_each_possible_cpu(cpu)
+		cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+	prev_cputime_init(&cgrp->stat.prev_cputime);
+
+	return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+	int cpu;
+
+	cgroup_stat_flush(cgrp);
+
+	/* sanity check */
+	for_each_possible_cpu(cpu) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+		if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+		    WARN_ON_ONCE(cstat->updated_next))
+			return;
+	}
+
+	free_percpu(cgrp->cpu_stat);
+	cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+	BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
-- 
cgit v1.2.3


From 38683148828165ea0b66ace93a9fedc2d3281e27 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 13:50:20 -0700
Subject: cgroup: statically initialize init_css_set->dfl_cgrp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Like other csets, init_css_set's dfl_cgrp is initialized when the cset
gets linked.  init_css_set gets linked in cgroup_init().  This has
been fine till now but the recently added basic CPU usage accounting
may end up accessing dfl_cgrp of init before cgroup_init() leading to
the following oops.

  SELinux:  Initializing.
  BUG: unable to handle kernel NULL pointer dereference at 00000000000000b0
  IP: account_system_index_time+0x60/0x90
  PGD 0 P4D 0
  Oops: 0000 [#1] SMP
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc2-00003-g041cd64 #10
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
  +1.9.3-20161025_171302-gandalf 04/01/2014
  task: ffffffff81e10480 task.stack: ffffffff81e00000
  RIP: 0010:account_system_index_time+0x60/0x90
  RSP: 0000:ffff880011e03cb8 EFLAGS: 00010002
  RAX: ffffffff81ef8800 RBX: ffffffff81e10480 RCX: 0000000000000003
  RDX: 0000000000000000 RSI: 00000000000f4240 RDI: 0000000000000000
  RBP: ffff880011e03cc0 R08: 0000000000010000 R09: 0000000000000000
  R10: 0000000000000020 R11: 0000003b9aca0000 R12: 000000000001c100
  R13: 0000000000000000 R14: ffffffff81e10480 R15: ffffffff81e03cd8
  FS:  0000000000000000(0000) GS:ffff880011e00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00000000000000b0 CR3: 0000000001e09000 CR4: 00000000000006b0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   <IRQ>
   account_system_time+0x45/0x60
   account_process_tick+0x5a/0x140
   update_process_times+0x22/0x60
   tick_periodic+0x2b/0x90
   tick_handle_periodic+0x25/0x70
   timer_interrupt+0x15/0x20
   __handle_irq_event_percpu+0x7e/0x1b0
   handle_irq_event_percpu+0x23/0x60
   handle_irq_event+0x42/0x70
   handle_level_irq+0x83/0x100
   handle_irq+0x6f/0x110
   do_IRQ+0x46/0xd0
   common_interrupt+0x9d/0x9d

Fix it by statically initializing init_css_set.dfl_cgrp so that init's
default cgroup is accessible from the get-go.

Fixes: 041cd640b2f3 ("cgroup: Implement cgroup2 basic CPU usage accounting")
Reported-by: “kbuild-all@01.org” <kbuild-all@01.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d036625556c9..7975b20f1fd1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -649,6 +649,14 @@ struct css_set init_css_set = {
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+
+	/*
+	 * The following field is re-initialized when this cset gets linked
+	 * in cgroup_init().  However, let's initialize the field
+	 * statically too so that the default cgroup can be accessed safely
+	 * early during boot.
+	 */
+	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
-- 
cgit v1.2.3


From 8157a7faf94135386bf04b1cf94e6efd3fb94702 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 14:27:54 -0700
Subject: sched/cputime: Add dummy cputime_adjust() implementation for
 CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cfb766da54d9 ("sched/cputime: Expose cputime_adjust()") made
cputime_adjust() public for cgroup basic cpu stat support; however,
the commit forgot to add a dummy implementaiton for
CONFIG_VIRT_CPU_ACCOUNTING_NATIVE leading to compiler errors on some
s390 configurations.

Fix it by adding the missing dummy implementation.

Reported-by: “kbuild-all@01.org” <kbuild-all@01.org>
Fixes: cfb766da54d9 ("sched/cputime: Expose cputime_adjust()")
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/cputime.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e01b699bbd5b..5498f20d2475 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -447,6 +447,13 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+		    u64 *ut, u64 *st)
+{
+	*ut = curr->utime;
+	*st = curr->stime;
+}
+
 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 {
 	*ut = p->utime;
-- 
cgit v1.2.3


From b5d7388f9db78f19e6af7b56a469ca8d1860329d Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Thu, 21 Sep 2017 18:43:29 -0400
Subject: bpf: Optimize lpm trie delete

Before the delete operator was added, this datastructure maintained
an invariant that intermediate nodes were only present when necessary
to build the tree.  This patch updates the delete operation to reinstate
that invariant by removing unnecessary intermediate nodes after a node is
removed and thus keeping the tree structure at a minimal size.

Suggested-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/lpm_trie.c | 71 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9d58a576b2ae..34d8a690ea05 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -394,8 +394,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct bpf_lpm_trie_key *key = _key;
-	struct lpm_trie_node __rcu **trim;
-	struct lpm_trie_node *node;
+	struct lpm_trie_node __rcu **trim, **trim2;
+	struct lpm_trie_node *node, *parent;
 	unsigned long irq_flags;
 	unsigned int next_bit;
 	size_t matchlen = 0;
@@ -407,31 +407,26 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Walk the tree looking for an exact key/length match and keeping
-	 * track of where we could begin trimming the tree.  The trim-point
-	 * is the sub-tree along the walk consisting of only single-child
-	 * intermediate nodes and ending at a leaf node that we want to
-	 * remove.
+	 * track of the path we traverse.  We will need to know the node
+	 * we wish to delete, and the slot that points to the node we want
+	 * to delete.  We may also need to know the nodes parent and the
+	 * slot that contains it.
 	 */
 	trim = &trie->root;
-	node = rcu_dereference_protected(
-		trie->root, lockdep_is_held(&trie->lock));
-	while (node) {
+	trim2 = trim;
+	parent = NULL;
+	while ((node = rcu_dereference_protected(
+		       *trim, lockdep_is_held(&trie->lock)))) {
 		matchlen = longest_prefix_match(trie, node, key);
 
 		if (node->prefixlen != matchlen ||
 		    node->prefixlen == key->prefixlen)
 			break;
 
+		parent = node;
+		trim2 = trim;
 		next_bit = extract_bit(key->data, node->prefixlen);
-		/* If we hit a node that has more than one child or is a valid
-		 * prefix itself, do not remove it. Reset the root of the trim
-		 * path to its descendant on our path.
-		 */
-		if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
-		    (node->child[0] && node->child[1]))
-			trim = &node->child[next_bit];
-		node = rcu_dereference_protected(
-			node->child[next_bit], lockdep_is_held(&trie->lock));
+		trim = &node->child[next_bit];
 	}
 
 	if (!node || node->prefixlen != key->prefixlen ||
@@ -442,27 +437,47 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 
 	trie->n_entries--;
 
-	/* If the node we are removing is not a leaf node, simply mark it
+	/* If the node we are removing has two children, simply mark it
 	 * as intermediate and we are done.
 	 */
-	if (rcu_access_pointer(node->child[0]) ||
+	if (rcu_access_pointer(node->child[0]) &&
 	    rcu_access_pointer(node->child[1])) {
 		node->flags |= LPM_TREE_NODE_FLAG_IM;
 		goto out;
 	}
 
-	/* trim should now point to the slot holding the start of a path from
-	 * zero or more intermediate nodes to our leaf node for deletion.
+	/* If the parent of the node we are about to delete is an intermediate
+	 * node, and the deleted node doesn't have any children, we can delete
+	 * the intermediate parent as well and promote its other child
+	 * up the tree.  Doing this maintains the invariant that all
+	 * intermediate nodes have exactly 2 children and that there are no
+	 * unnecessary intermediate nodes in the tree.
 	 */
-	while ((node = rcu_dereference_protected(
-			*trim, lockdep_is_held(&trie->lock)))) {
-		RCU_INIT_POINTER(*trim, NULL);
-		trim = rcu_access_pointer(node->child[0]) ?
-			&node->child[0] :
-			&node->child[1];
+	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+	    !node->child[0] && !node->child[1]) {
+		if (node == rcu_access_pointer(parent->child[0]))
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[1]));
+		else
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[0]));
+		kfree_rcu(parent, rcu);
 		kfree_rcu(node, rcu);
+		goto out;
 	}
 
+	/* The node we are removing has either zero or one child. If there
+	 * is a child, move it into the removed node's slot then delete
+	 * the node.  Otherwise just clear the slot and delete the node.
+	 */
+	if (node->child[0])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+	else if (node->child[1])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+	else
+		RCU_INIT_POINTER(*trim, NULL);
+	kfree_rcu(node, rcu);
+
 out:
 	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 
-- 
cgit v1.2.3


From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 14 Sep 2017 14:02:04 -0700
Subject: kthread: add a mechanism to store cgroup info

kthread usually runs jobs on behalf of other threads. The jobs should be
charged to cgroup of original threads. But the jobs run in a kthread,
where we lose the cgroup context of original threads. The patch adds a
machanism to record cgroup info of original threads in kthread context.
Later we can retrieve the cgroup info and attach the cgroup info to jobs.

Since this mechanism is only required by kthread, we store the cgroup
info in kthread data instead of generic task_struct.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/kthread.h | 11 +++++++++
 kernel/kthread.c        | 66 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 82e197eeac91..bd4369c83dfb 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -3,6 +3,7 @@
 /* Simple interface for creating and stopping kernel threads without mess. */
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/cgroup.h>
 
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
+#ifdef CONFIG_CGROUPS
+void kthread_associate_blkcg(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *kthread_blkcg(void);
+#else
+static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
+static inline struct cgroup_subsys_state *kthread_blkcg(void)
+{
+	return NULL;
+}
+#endif
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c19edf82427..b011ea08967f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
 #include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
-#include <linux/cgroup.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
 	void *data;
 	struct completion parked;
 	struct completion exited;
+#ifdef CONFIG_CGROUPS
+	struct cgroup_subsys_state *blkcg_css;
+#endif
 };
 
 enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 
 void free_kthread_struct(struct task_struct *k)
 {
+	struct kthread *kthread;
+
 	/*
 	 * Can be NULL if this kthread was created by kernel_thread()
 	 * or if kmalloc() in kthread() failed.
 	 */
-	kfree(to_kthread(k));
+	kthread = to_kthread(k);
+#ifdef CONFIG_CGROUPS
+	WARN_ON_ONCE(kthread && kthread->blkcg_css);
+#endif
+	kfree(kthread);
 }
 
 /**
@@ -216,6 +224,9 @@ static int kthread(void *_create)
 	self->data = data;
 	init_completion(&self->exited);
 	init_completion(&self->parked);
+#ifdef CONFIG_CGROUPS
+	self->blkcg_css = NULL;
+#endif
 	current->vfork_done = &self->exited;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -1154,3 +1165,54 @@ void kthread_destroy_worker(struct kthread_worker *worker)
 	kfree(worker);
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+	struct kthread *kthread;
+
+	if (!(current->flags & PF_KTHREAD))
+		return;
+	kthread = to_kthread(current);
+	if (!kthread)
+		return;
+
+	if (kthread->blkcg_css) {
+		css_put(kthread->blkcg_css);
+		kthread->blkcg_css = NULL;
+	}
+	if (css) {
+		css_get(css);
+		kthread->blkcg_css = css;
+	}
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+	struct kthread *kthread;
+
+	if (current->flags & PF_KTHREAD) {
+		kthread = to_kthread(current);
+		if (kthread)
+			return kthread->blkcg_css;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(kthread_blkcg);
+#endif
-- 
cgit v1.2.3


From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 26 Sep 2017 11:02:12 -0700
Subject: block: fix a build error

The code is only for blkcg not for all cgroups

Fixes: d4478e92d618 ("block/loop: make loop cgroup aware")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c    | 2 +-
 include/linux/kthread.h | 2 +-
 kernel/kthread.c        | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fd4eff5f5b76..bc8e61506968 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1692,7 +1692,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	/* always use the first bio's css */
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 	if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) {
 		cmd->css = cmd->rq->bio->bi_css;
 		css_get(cmd->css);
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index bd4369c83dfb..fb201842c635 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -199,7 +199,7 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 void kthread_associate_blkcg(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *kthread_blkcg(void);
 #else
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b011ea08967f..f87cd8b4eb2a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -46,7 +46,7 @@ struct kthread {
 	void *data;
 	struct completion parked;
 	struct completion exited;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 	struct cgroup_subsys_state *blkcg_css;
 #endif
 };
@@ -83,7 +83,7 @@ void free_kthread_struct(struct task_struct *k)
 	 * or if kmalloc() in kthread() failed.
 	 */
 	kthread = to_kthread(k);
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 	WARN_ON_ONCE(kthread && kthread->blkcg_css);
 #endif
 	kfree(kthread);
@@ -224,7 +224,7 @@ static int kthread(void *_create)
 	self->data = data;
 	init_completion(&self->exited);
 	init_completion(&self->parked);
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 	self->blkcg_css = NULL;
 #endif
 	current->vfork_done = &self->exited;
@@ -1166,7 +1166,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
 
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 /**
  * kthread_associate_blkcg - associate blkcg to current kthread
  * @css: the cgroup info
-- 
cgit v1.2.3


From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 25 Sep 2017 02:25:50 +0200
Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers

Just do the rename into bpf_compute_data_pointers() as we'll add
one more pointer here to recompute.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  9 ++++++---
 kernel/bpf/sockmap.c   |  4 ++--
 net/bpf/test_run.c     |  2 +-
 net/core/filter.c      | 14 +++++++-------
 net/core/lwt_bpf.c     |  2 +-
 net/sched/act_bpf.c    |  4 ++--
 net/sched/cls_bpf.c    |  4 ++--
 7 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d29e58fde364..052bab3d62e7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -496,10 +496,13 @@ struct xdp_buff {
 	void *data_hard_start;
 };
 
-/* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf, act_bpf and lwt programs
+/* Compute the linear packet data range [data, data_end) which
+ * will be accessed by various program types (cls_bpf, act_bpf,
+ * lwt, ...). Subsystems allowing direct data access must (!)
+ * ensure that cb[] area can be written to when BPF program is
+ * invoked (otherwise cb[] save/restore is necessary).
  */
-static inline void bpf_compute_data_end(struct sk_buff *skb)
+static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 {
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..a298d6666698 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 
 	skb_orphan(skb);
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 
@@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6be41a44d688..df672517b4fd 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	if (is_l2)
 		__skb_push(skb, ETH_HLEN);
 	if (is_direct_pkt_access)
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 	retval = bpf_test_run(prog, skb, repeat, &duration);
 	if (!is_l2)
 		__skb_push(skb, ETH_HLEN);
diff --git a/net/core/filter.c b/net/core/filter.c
index 82edad58d066..c468e7cfad19 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1402,7 +1402,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
 {
 	int err = __bpf_try_make_writable(skb, write_len);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return err;
 }
 
@@ -1962,7 +1962,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
 	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
 	bpf_pull_mac_rcsum(skb);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -1984,7 +1984,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
 	ret = skb_vlan_pop(skb);
 	bpf_pull_mac_rcsum(skb);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2178,7 +2178,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
 	 * need to be verified first.
 	 */
 	ret = bpf_skb_proto_xlat(skb, proto);
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2303,7 +2303,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
 	ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
 		       bpf_skb_net_grow(skb, len_diff_abs);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2394,7 +2394,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
 			skb_gso_reset(skb);
 	}
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2434,7 +2434,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
 		skb_reset_mac_header(skb);
 	}
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return 0;
 }
 
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 1307731ddfe4..e7e626fb87bb 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	 */
 	preempt_disable();
 	rcu_read_lock();
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	ret = bpf_prog_run_save_cb(lwt->prog, skb);
 	rcu_read_unlock();
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c0c707eb2c96..5ef8ce8c83d4 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 	}
 	rcu_read_unlock();
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 520c5027646a..36671b0fb125 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -99,11 +99,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		} else if (at_ingress) {
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
-			bpf_compute_data_end(skb);
+			bpf_compute_data_pointers(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
-			bpf_compute_data_end(skb);
+			bpf_compute_data_pointers(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 		}
 
-- 
cgit v1.2.3


From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 25 Sep 2017 02:25:51 +0200
Subject: bpf: add meta pointer for direct access

This work enables generic transfer of metadata from XDP into skb. The
basic idea is that we can make use of the fact that the resulting skb
must be linear and already comes with a larger headroom for supporting
bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
for adjusting a new pointer called xdp->data_meta. Thus, the packet has
a flexible and programmable room for meta data, followed by the actual
packet data. struct xdp_buff is therefore laid out that we first point
to data_hard_start, then data_meta directly prepended to data followed
by data_end marking the end of packet. bpf_xdp_adjust_head() takes into
account whether we have meta data already prepended and if so, memmove()s
this along with the given offset provided there's enough room.

xdp->data_meta is optional and programs are not required to use it. The
rationale is that when we process the packet in XDP (e.g. as DoS filter),
we can push further meta data along with it for the XDP_PASS case, and
give the guarantee that a clsact ingress BPF program on the same device
can pick this up for further post-processing. Since we work with skb
there, we can also set skb->mark, skb->priority or other skb meta data
out of BPF, thus having this scratch space generic and programmable
allows for more flexibility than defining a direct 1:1 transfer of
potentially new XDP members into skb (it's also more efficient as we
don't need to initialize/handle each of such new members). The facility
also works together with GRO aggregation. The scratch space at the head
of the packet can be multiple of 4 byte up to 32 byte large. Drivers not
yet supporting xdp->data_meta can simply be set up with xdp->data_meta
as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out,
such that the subsequent match against xdp->data for later access is
guaranteed to fail.

The verifier treats xdp->data_meta/xdp->data the same way as we treat
xdp->data/xdp->data_end pointer comparisons. The requirement for doing
the compare against xdp->data is that it hasn't been modified from it's
original address we got from ctx access. It may have a range marking
already from prior successful xdp->data/xdp->data_end pointer comparisons
though.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c      |   1 +
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   |   1 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      |   1 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |   1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |   1 +
 drivers/net/ethernet/qlogic/qede/qede_fp.c         |   1 +
 drivers/net/tun.c                                  |   1 +
 drivers/net/virtio_net.c                           |   2 +
 include/linux/bpf.h                                |   1 +
 include/linux/filter.h                             |  21 +++-
 include/linux/skbuff.h                             |  68 +++++++++++-
 include/uapi/linux/bpf.h                           |  13 ++-
 kernel/bpf/verifier.c                              | 114 ++++++++++++++++-----
 net/bpf/test_run.c                                 |   1 +
 net/core/dev.c                                     |  31 +++++-
 net/core/filter.c                                  |  77 +++++++++++++-
 net/core/skbuff.c                                  |   2 +
 19 files changed, 297 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index d8f0c837b72c..06ce63c00821 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 
 	xdp.data_hard_start = *data_ptr - offset;
 	xdp.data = *data_ptr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = *data_ptr + *len;
 	orig_data = xdp.data;
 	mapping = rx_buf->mapping - bp->rx_dma_offset;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 49b80da51ba7..d68478afccbf 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 
 	xdp.data_hard_start = page_address(page);
 	xdp.data = (void *)cpu_addr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + len;
 	orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 1519dfb851d0..f426762bd83a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      i40e_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d962368d08d0..04bb03bda1cd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      ixgbe_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b97a55c827eb..8f9cb8abc497 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 			xdp.data_hard_start = va - frags[0].page_offset;
 			xdp.data = va;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_end = xdp.data + length;
 			orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f1dd638384d3..30b3f3fbd719 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		return false;
 
 	xdp.data = va + *rx_headroom;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 	xdp.data_hard_start = va;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1c0187f0af51..e3a38be3600a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
 
 	xdp.data_hard_start = hard_start;
 	xdp.data = data + *off;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = data + *off + *len;
 
 	orig_data = xdp.data;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 6fc854b120b0..48ec4c56cddf 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 
 	xdp.data_hard_start = page_address(bd->data);
 	xdp.data = xdp.data_hard_start + *data_offset;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 
 	/* Queues always have a full reset currently, so for the time
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2c36f6ebad79..a6e0bffe3d29 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		xdp.data_hard_start = buf;
 		xdp.data = buf + pad;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index dd14a4547932..fc059f193e7d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 
 		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
 		xdp.data = xdp.data_hard_start + xdp_headroom;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		data = page_address(xdp_page) + offset;
 		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 		xdp.data = data + vi->hdr_len;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + (len - vi->hdr_len);
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..2b672c50f160 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -137,6 +137,7 @@ enum bpf_reg_type {
 	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
 	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
 	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
+	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 052bab3d62e7..911d454af107 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -487,12 +487,14 @@ struct sk_filter {
 
 struct bpf_skb_data_end {
 	struct qdisc_skb_cb qdisc_cb;
+	void *data_meta;
 	void *data_end;
 };
 
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_meta;
 	void *data_hard_start;
 };
 
@@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
 	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
-	cb->data_end = skb->data + skb_headlen(skb);
+	cb->data_meta = skb->data - skb_metadata_len(skb);
+	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 void bpf_warn_invalid_xdp_action(u32 act);
-void bpf_warn_invalid_xdp_redirect(u32 ifindex);
 
 struct sock *do_sk_redirect_map(void);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f9db5539a6fb..19e64bfb1a66 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	unsigned short	_unused;
-	unsigned char	nr_frags;
+	__u8		__unused;
+	__u8		meta_len;
+	__u8		nr_frags;
 	__u8		tx_flags;
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
@@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
 	return 0;
 }
 
+static inline u8 skb_metadata_len(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->meta_len;
+}
+
+static inline void *skb_metadata_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb);
+}
+
+static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
+					  const struct sk_buff *skb_b,
+					  u8 meta_len)
+{
+	const void *a = skb_metadata_end(skb_a);
+	const void *b = skb_metadata_end(skb_b);
+	/* Using more efficient varaiant than plain call to memcmp(). */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	u64 diffs = 0;
+
+	switch (meta_len) {
+#define __it(x, op) (x -= sizeof(u##op))
+#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
+	case 32: diffs |= __it_diff(a, b, 64);
+	case 24: diffs |= __it_diff(a, b, 64);
+	case 16: diffs |= __it_diff(a, b, 64);
+	case  8: diffs |= __it_diff(a, b, 64);
+		break;
+	case 28: diffs |= __it_diff(a, b, 64);
+	case 20: diffs |= __it_diff(a, b, 64);
+	case 12: diffs |= __it_diff(a, b, 64);
+	case  4: diffs |= __it_diff(a, b, 32);
+		break;
+	}
+	return diffs;
+#else
+	return memcmp(a - meta_len, b - meta_len, meta_len);
+#endif
+}
+
+static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
+					const struct sk_buff *skb_b)
+{
+	u8 len_a = skb_metadata_len(skb_a);
+	u8 len_b = skb_metadata_len(skb_b);
+
+	if (!(len_a | len_b))
+		return false;
+
+	return len_a != len_b ?
+	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
+}
+
+static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
+{
+	skb_shinfo(skb)->meta_len = meta_len;
+}
+
+static inline void skb_metadata_clear(struct sk_buff *skb)
+{
+	skb_metadata_set(skb, 0);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c402f98..e43491ac4823 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -582,6 +582,12 @@ union bpf_attr {
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
+ *
+ * int bpf_xdp_adjust_meta(xdp_md, delta)
+ *     Adjust the xdp_md.data_meta by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data_meta
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -638,6 +644,7 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -715,7 +722,7 @@ struct __sk_buff {
 	__u32 data_end;
 	__u32 napi_id;
 
-	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
 	__u32 local_ip4;	/* Stored in network byte order */
@@ -723,6 +730,9 @@ struct __sk_buff {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
 };
 
 struct bpf_tunnel_key {
@@ -783,6 +793,7 @@ enum xdp_action {
 struct xdp_md {
 	__u32 data;
 	__u32 data_end;
+	__u32 data_meta;
 };
 
 enum sk_action {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..f849eca36052 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
 	va_end(args);
 }
 
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -187,6 +193,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
 	[PTR_TO_STACK]		= "fp",
 	[PTR_TO_PACKET]		= "pkt",
+	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
@@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 			verbose("(id=%d", reg->id);
 			if (t != SCALAR_VALUE)
 				verbose(",off=%d", reg->off);
-			if (t == PTR_TO_PACKET)
+			if (type_is_pkt_pointer(t))
 				verbose(",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
 				 t == PTR_TO_MAP_VALUE ||
@@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_known_zero(regs + regno);
 }
 
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+	return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+	return reg_is_pkt_pointer(reg) ||
+	       reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+				    enum bpf_reg_type which)
+{
+	/* The register can already have a range from prior markings.
+	 * This is fine as long as it hasn't been advanced from its
+	 * origin.
+	 */
+	return reg->type == which &&
+	       reg->id == 0 &&
+	       reg->off == 0 &&
+	       tnum_equals_const(reg->var_off, 0);
+}
+
 /* Attempts to improve min/max values based on var_off information */
 static void __update_reg_bounds(struct bpf_reg_state *reg)
 {
@@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
 	case CONST_PTR_TO_MAP:
 		return true;
@@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
-		/* special case, because of NET_IP_ALIGN */
+	case PTR_TO_PACKET_META:
+		/* Special case, because of NET_IP_ALIGN. Given metadata sits
+		 * right in front, treat it the very same way.
+		 */
 		return check_pkt_ptr_alignment(reg, off, size, strict);
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
@@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
-			 * PTR_TO_PACKET[_END].  In the latter case, we know
-			 * the offset is zero.
+			 * PTR_TO_PACKET[_META,_END]. In the latter
+			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
 				mark_reg_unknown(state->regs, value_regno);
@@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		} else {
 			err = check_stack_read(state, off, size, value_regno);
 		}
-	} else if (reg->type == PTR_TO_PACKET) {
+	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
@@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size);
 	case PTR_TO_MAP_VALUE:
 		return check_map_access(env, regno, reg->off, access_size);
@@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET &&
+	if (type_is_pkt_pointer(type) &&
 	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
@@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-		if (type != PTR_TO_PACKET && type != expected_type)
+		if (!type_is_pkt_pointer(type) &&
+		    type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_SIZE ||
 		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (register_is_null(*reg))
 			/* final test in check_stack_boundary() */;
-		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+		else if (!type_is_pkt_pointer(type) &&
+			 type != PTR_TO_MAP_VALUE &&
 			 type != expected_type)
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->key_size);
 		else
@@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->value_size);
 		else
@@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	return count > 1 ? -EINVAL : 0;
 }
 
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
@@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET ||
-		    regs[i].type == PTR_TO_PACKET_END)
+		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(regs, i);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type != PTR_TO_PACKET &&
-		    reg->type != PTR_TO_PACKET_END)
-			continue;
-		__mark_reg_unknown(reg);
+		if (reg_is_pkt_pointer_any(reg))
+			__mark_reg_unknown(reg);
 	}
 }
 
@@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			dst_reg->range = 0;
@@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			if (smin_val < 0)
@@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   enum bpf_reg_type type)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
@@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+		if (regs[i].type == type && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
 			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
 
@@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+		if (reg->type == type && reg->id == dst_reg->id)
 			reg->range = max_t(u16, reg->range, dst_reg->off);
 	}
 }
@@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
@@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			return false;
 		/* Check our ids match any regs they're supposed to */
 		return check_ids(rold->id, rcur->id, idmap);
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET:
-		if (rcur->type != PTR_TO_PACKET)
+		if (rcur->type != rold->type)
 			return false;
 		/* We must have at least as much range as the old ptr
 		 * did, so that any accesses which were safe before are
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index df672517b4fd..a86e6687026e 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 
 	xdp.data_hard_start = data;
 	xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + size;
 
 	retval = bpf_test_run(prog, &xdp, repeat, &duration);
diff --git a/net/core/dev.c b/net/core/dev.c
index 97abddd9039a..e350c768d4b5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3864,8 +3864,8 @@ drop:
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
+	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
-	u32 act = XDP_DROP;
 	void *orig_data;
 	int hlen, off;
 	u32 mac_len;
@@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	if (skb_cloned(skb))
 		return XDP_PASS;
 
-	if (skb_linearize(skb))
-		goto do_drop;
+	/* XDP packets must be linear and must have sufficient headroom
+	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+	 * native XDP provides, thus we need to do it here as well.
+	 */
+	if (skb_is_nonlinear(skb) ||
+	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+		int troom = skb->tail + skb->data_len - skb->end;
+
+		/* In case we have to go down the path and also linearize,
+		 * then lets do the pskb_expand_head() work just once here.
+		 */
+		if (pskb_expand_head(skb,
+				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+			goto do_drop;
+		if (troom > 0 && __skb_linearize(skb))
+			goto do_drop;
+	}
 
 	/* The XDP program wants to see the packet starting at the MAC
 	 * header.
@@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	mac_len = skb->data - skb_mac_header(skb);
 	hlen = skb_headlen(skb) + mac_len;
 	xdp.data = skb->data - mac_len;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + hlen;
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
 	orig_data = xdp.data;
@@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	case XDP_REDIRECT:
 	case XDP_TX:
 		__skb_push(skb, mac_len);
-		/* fall through */
+		break;
 	case XDP_PASS:
+		metalen = xdp.data - xdp.data_meta;
+		if (metalen)
+			skb_metadata_set(skb, metalen);
 		break;
-
 	default:
 		bpf_warn_invalid_xdp_action(act);
 		/* fall through */
@@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 		diffs |= p->vlan_tci ^ skb->vlan_tci;
 		diffs |= skb_metadata_dst_cmp(p, skb);
+		diffs |= skb_metadata_differs(p, skb);
 		if (maclen == ETH_HLEN)
 			diffs |= compare_ether_header(skb_mac_header(p),
 						      skb_mac_header(skb));
diff --git a/net/core/filter.c b/net/core/filter.c
index c468e7cfad19..9b6e7e84aafd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
+{
+	return xdp_data_meta_unsupported(xdp) ? 0 :
+	       xdp->data - xdp->data_meta;
+}
+
 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	unsigned long metalen = xdp_get_metalen(xdp);
+	void *data_start = xdp->data_hard_start + metalen;
 	void *data = xdp->data + offset;
 
-	if (unlikely(data < xdp->data_hard_start ||
+	if (unlikely(data < data_start ||
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
+	if (metalen)
+		memmove(xdp->data_meta + offset,
+			xdp->data_meta, metalen);
+	xdp->data_meta += offset;
 	xdp->data = data;
 
 	return 0;
@@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
+{
+	void *meta = xdp->data_meta + offset;
+	unsigned long metalen = xdp->data - meta;
+
+	if (xdp_data_meta_unsupported(xdp))
+		return -ENOTSUPP;
+	if (unlikely(meta < xdp->data_hard_start ||
+		     meta > xdp->data))
+		return -EINVAL;
+	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
+		     (metalen > 32)))
+		return -EACCES;
+
+	xdp->data_meta = meta;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
+	.func		= bpf_xdp_adjust_meta,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static int __bpf_tx_xdp(struct net_device *dev,
 			struct bpf_map *map,
 			struct xdp_buff *xdp,
@@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_clone_redirect ||
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
-	    func == bpf_xdp_adjust_head)
+	    func == bpf_xdp_adjust_head ||
+	    func == bpf_xdp_adjust_meta)
 		return true;
 
 	return false;
@@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
+	case BPF_FUNC_xdp_adjust_meta:
+		return &bpf_xdp_adjust_meta_proto;
 	case BPF_FUNC_redirect:
 		return &bpf_xdp_redirect_proto;
 	case BPF_FUNC_redirect_map:
@@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 	case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
 	case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		if (size != size_default)
 			return false;
@@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
@@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 		return false;
 	}
 
@@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size,
 	case offsetof(struct xdp_md, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case offsetof(struct xdp_md, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size,
 				   enum bpf_access_type type,
 				   struct bpf_insn_access_aux *info)
 {
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		return false;
+	}
+
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case bpf_ctx_range(struct __sk_buff, mark):
@@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size,
 	}
 
 	switch (off) {
-	case bpf_ctx_range(struct __sk_buff, tc_classid):
-		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
@@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct sk_buff, data));
 		break;
 
+	case offsetof(struct __sk_buff, data_meta):
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_meta);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct bpf_skb_data_end, data_meta);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
+
 	case offsetof(struct __sk_buff, data_end):
 		off  = si->off;
 		off -= offsetof(struct __sk_buff, data_end);
@@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data));
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, data_meta));
+		break;
 	case offsetof(struct xdp_md, data_end):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
 				      si->dst_reg, si->src_reg,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 000ce735fa8d..d98c2e3ce2bf 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 
+	skb_metadata_clear(skb);
+
 	/* It is not generally safe to change skb->truesize.
 	 * For the moment, we really care of rx path, or
 	 * when skb is orphaned (not attached to a socket).
-- 
cgit v1.2.3


From 2b7c6ba945fd3c10ca3e030be402098aff2f89d3 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Tue, 26 Sep 2017 16:35:13 +0100
Subject: bpf/verifier: improve disassembly of BPF_END instructions

print_bpf_insn() was treating all BPF_ALU[64] the same, but BPF_END has a
 different structure: it has a size in insn->imm (even if it's BPF_X) and
 uses the BPF_SRC (X or K) to indicate which endianness to use.  So it
 needs different code to print it.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f849eca36052..e8d7bb8e6b98 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -332,26 +332,40 @@ static const char *const bpf_jmp_string[16] = {
 	[BPF_EXIT >> 4] = "exit",
 };
 
+static void print_bpf_end_insn(const struct bpf_verifier_env *env,
+			       const struct bpf_insn *insn)
+{
+	verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+		insn->imm, insn->dst_reg);
+}
+
 static void print_bpf_insn(const struct bpf_verifier_env *env,
 			   const struct bpf_insn *insn)
 {
 	u8 class = BPF_CLASS(insn->code);
 
 	if (class == BPF_ALU || class == BPF_ALU64) {
-		if (BPF_SRC(insn->code) == BPF_X)
+		if (BPF_OP(insn->code) == BPF_END) {
+			if (class == BPF_ALU64)
+				verbose("BUG_alu64_%02x\n", insn->code);
+			else
+				print_bpf_end_insn(env, insn);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
 			verbose("(%02x) %sr%d %s %sr%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->src_reg);
-		else
+		} else {
 			verbose("(%02x) %sr%d %s %s%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->imm);
+		}
 	} else if (class == BPF_STX) {
 		if (BPF_MODE(insn->code) == BPF_MEM)
 			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
-- 
cgit v1.2.3


From 73c864b38383f4abc9b559025cd663f4a81afa89 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Tue, 26 Sep 2017 16:35:29 +0100
Subject: bpf/verifier: improve disassembly of BPF_NEG instructions

BPF_NEG takes only one operand, unlike the bulk of BPF_ALU[64] which are
 compound-assignments.  So give it its own format in print_bpf_insn().

Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e8d7bb8e6b98..4cf9b72c59a0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -351,6 +351,11 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
 				verbose("BUG_alu64_%02x\n", insn->code);
 			else
 				print_bpf_end_insn(env, insn);
+		} else if (BPF_OP(insn->code) == BPF_NEG) {
+			verbose("(%02x) r%d = %s-r%d\n",
+				insn->code, insn->dst_reg,
+				class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
 			verbose("(%02x) %sr%d %s %sr%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
-- 
cgit v1.2.3


From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:52 -0700
Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info

The patch adds name and load_time to struct bpf_prog_aux.  They
are also exported to bpf_prog_info.

The bpf_prog's name is passed by userspace during BPF_PROG_LOAD.
The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes
and the name stored in the kernel is always \0 terminated.

The kernel will reject name that contains characters other than
isalnum() and '_'.  It will also reject name that is not null
terminated.

The existing 'user->uid' of the bpf_prog_aux is also exported to
the bpf_prog_info as created_by_uid.

The existing 'used_maps' of the bpf_prog_aux is exported to
the newly added members 'nr_map_ids' and 'map_ids' of
the bpf_prog_info.  On the input, nr_map_ids tells how
big the userspace's map_ids buffer is.  On the output,
nr_map_ids tells the exact user_map_cnt and it will only
copy up to the userspace's map_ids buffer is allowed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h |  8 ++++++++
 kernel/bpf/syscall.c     | 51 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2b672c50f160..33ccc474fb04 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,8 @@ struct bpf_prog_aux {
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
+	u64 load_time; /* ns since boottime */
+	u8 name[BPF_OBJ_NAME_LEN];
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e43491ac4823..bd6348269bf5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -175,6 +175,8 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+#define BPF_OBJ_NAME_LEN 16U
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -210,6 +212,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
+		__u8		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -812,6 +815,11 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25d074920a00..45970df3f820 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
 #include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+	const char *end = src + BPF_OBJ_NAME_LEN;
+
+	/* Copy all isalnum() and '_' char */
+	while (src < end && *src) {
+		if (!isalnum(*src) && *src != '_')
+			return -EINVAL;
+		*dst++ = *src++;
+	}
+
+	/* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+	if (src == end)
+		return -EINVAL;
+
+	/* '\0' terminates dst */
+	*dst = 0;
+
+	return 0;
+}
+
 #define BPF_MAP_CREATE_LAST_FIELD numa_node
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
@@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define	BPF_PROG_LOAD_LAST_FIELD prog_name
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_prog;
 
+	prog->aux->load_time = ktime_get_boot_ns();
+	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+	if (err)
+		goto free_prog;
+
 	/* run eBPF verifier */
 	err = bpf_check(&prog, attr);
 	if (err < 0)
@@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 
 	info.type = prog->type;
 	info.id = prog->aux->id;
+	info.load_time = prog->aux->load_time;
+	info.created_by_uid = from_kuid_munged(current_user_ns(),
+					       prog->aux->user->uid);
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
+	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+	ulen = info.nr_map_ids;
+	info.nr_map_ids = prog->aux->used_map_cnt;
+	ulen = min_t(u32, info.nr_map_ids, ulen);
+	if (ulen) {
+		u32 *user_map_ids = (u32 *)info.map_ids;
+		u32 i;
+
+		for (i = 0; i < ulen; i++)
+			if (put_user(prog->aux->used_maps[i]->id,
+				     &user_map_ids[i]))
+				return -EFAULT;
+	}
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
-- 
cgit v1.2.3


From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:53 -0700
Subject: bpf: Add map_name to bpf_map_info

This patch allows userspace to specify a name for a map
during BPF_MAP_CREATE.

The map's name can later be exported to user space
via BPF_OBJ_GET_INFO_BY_FD.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 1 +
 include/uapi/linux/bpf.h | 2 ++
 kernel/bpf/syscall.c     | 7 ++++++-
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 33ccc474fb04..252f4bc9eb25 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,7 @@ struct bpf_map {
 	struct work_struct work;
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
+	u8 name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bd6348269bf5..6d2137b4cf38 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -190,6 +190,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
+		__u8	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -829,6 +830,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 45970df3f820..11a7f82a55d1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+#define BPF_MAP_CREATE_LAST_FIELD map_name
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	err = bpf_obj_name_cpy(map->name, attr->map_name);
+	if (err)
+		goto free_map_nouncharge;
+
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
@@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.value_size = map->value_size;
 	info.max_entries = map->max_entries;
 	info.map_flags = map->map_flags;
+	memcpy(info.name, map->name, sizeof(map->name));
 
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.2.3


From a1f7164c7b8b0d46f63bfb4ca0bb5971c760b921 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 09:00:18 -0700
Subject: sched: Misc preps for cgroup unified hierarchy interface

Make the following changes in preparation for the cpu controller
interface implementation for cgroup2.  This patch doesn't cause any
functional differences.

* s/cpu_stats_show()/cpu_cfs_stat_show()/

* s/cpu_files/cpu_legacy_files/

v2: Dropped cpuacct changes as it won't be used by cpu controller
    interface anymore.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/sched/core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..6815fa424a7a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6599,7 +6599,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -6639,7 +6639,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -6660,7 +6660,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stat_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -6686,7 +6686,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-	.legacy_cftypes	= cpu_files,
+	.legacy_cftypes	= cpu_legacy_files,
 	.early_init	= true,
 };
 
-- 
cgit v1.2.3


From 0d5936344f30aba0f6ddb92b030cb6a05168efe6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 09:00:19 -0700
Subject: sched: Implement interface for cgroup unified hierarchy

There are a couple interface issues which can be addressed in cgroup2
interface.

* Stats from cpuacct being reported separately from the cpu stats.

* Use of different time units.  Writable control knobs use
  microseconds, some stat fields use nanoseconds while other cpuacct
  stat fields use centiseconds.

* Control knobs which can't be used in the root cgroup still show up
  in the root.

* Control knob names and semantics aren't consistent with other
  controllers.

This patchset implements cpu controller's interface on cgroup2 which
adheres to the controller file conventions described in
Documentation/cgroups/cgroup-v2.txt.  Overall, the following changes
are made.

* cpuacct is implictly enabled and disabled by cpu and its information
  is reported through "cpu.stat" which now uses microseconds for all
  time durations.  All time duration fields now have "_usec" appended
  to them for clarity.

  Note that cpuacct.usage_percpu is currently not included in
  "cpu.stat".  If this information is actually called for, it will be
  added later.

* "cpu.shares" is replaced with "cpu.weight" and operates on the
  standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
  The weight is scaled to scheduler weight so that 100 maps to 1024
  and the ratio relationship is preserved - if weight is W and its
  scaled value is S, W / 100 == S / 1024.  While the mapped range is a
  bit smaller than the orignal scheduler weight range, the dead zones
  on both sides are relatively small and covers wider range than the
  nice value mappings.  This file doesn't make sense in the root
  cgroup and isn't created on root.

* "cpu.weight.nice" is added. When read, it reads back the nice value
  which is closest to the current "cpu.weight".  When written, it sets
  "cpu.weight" to the weight value which matches the nice value.  This
  makes it easy to configure cgroups when they're competing against
  threads in threaded subtrees.

* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
  which contains both quota and period.

v4: - Use cgroup2 basic usage stat as the information source instead
      of cpuacct.

v3: - Added "cpu.weight.nice" to allow using nice values when
      configuring the weight.  The feature is requested by PeterZ.
    - Merge the patch to enable threaded support on cpu and cpuacct.
    - Dropped the bits about getting rid of cpuacct from patch
      description as there is a pretty strong case for making cpuacct
      an implicit controller so that basic cpu usage stats are always
      available.
    - Documentation updated accordingly.  "cpu.rt.max" section is
      dropped for now.

v2: - cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED
      for CFS bandwidth stats and also using raw division for u64.
      Use CONFIG_CFS_BANDWITH and do_div() instead.  "cpu.rt.max" is
      not included yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 Documentation/cgroup-v2.txt |  36 ++++------
 kernel/sched/core.c         | 171 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 3f8216912df0..0bbdc720dd7c 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -902,10 +902,6 @@ Controllers
 CPU
 ---
 
-.. note::
-
-	The interface for the cpu controller hasn't been merged yet
-
 The "cpu" controllers regulates distribution of CPU cycles.  This
 controller implements weight and absolute bandwidth limit models for
 normal scheduling policy and absolute bandwidth allocation model for
@@ -935,6 +931,18 @@ All time durations are in microseconds.
 
 	The weight in the range [1, 10000].
 
+  cpu.weight.nice
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "0".
+
+	The nice value is in the range [-20, 19].
+
+	This interface file is an alternative interface for
+	"cpu.weight" and allows reading and setting weight using the
+	same values used by nice(2).  Because the range is smaller and
+	granularity is coarser for the nice values, the read value is
+	the closest approximation of the current weight.
+
   cpu.max
 	A read-write two value file which exists on non-root cgroups.
 	The default is "max 100000".
@@ -947,26 +955,6 @@ All time durations are in microseconds.
 	$PERIOD duration.  "max" for $MAX indicates no limit.  If only
 	one number is written, $MAX is updated.
 
-  cpu.rt.max
-	.. note::
-
-	   The semantics of this file is still under discussion and the
-	   interface hasn't been merged yet
-
-	A read-write two value file which exists on all cgroups.
-	The default is "0 100000".
-
-	The maximum realtime runtime allocation.  Over-committing
-	configurations are disallowed and process migrations are
-	rejected if not enough bandwidth is available.  It's in the
-	following format::
-
-	  $MAX $PERIOD
-
-	which indicates that the group may consume upto $MAX in each
-	$PERIOD duration.  If only one number is written, $MAX is
-	updated.
-
 
 Memory
 ------
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6815fa424a7a..ad255162a830 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6678,6 +6678,175 @@ static struct cftype cpu_legacy_files[] = {
 	{ }	/* Terminate */
 };
 
+static int cpu_stat_show(struct seq_file *sf, void *v)
+{
+	cgroup_stat_show_cputime(sf, "");
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(seq_css(sf));
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cft, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively.  While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+				    struct cftype *cft)
+{
+	unsigned long weight = scale_load_down(css_tg(css)->shares);
+	int last_delta = INT_MAX;
+	int prio, delta;
+
+	/* find the closest nice value to the current weight */
+	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+		delta = abs(sched_prio_to_weight[prio] - weight);
+		if (delta >= last_delta)
+			break;
+		last_delta = delta;
+	}
+
+	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cft, s64 nice)
+{
+	unsigned long weight;
+
+	if (nice < MIN_NICE || nice > MAX_NICE)
+		return -ERANGE;
+
+	weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
+
+	if (!sscanf(buf, "%s %llu", tok, periodp))
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stat_show,
+	},
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+	{
+		.name = "weight.nice",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_weight_nice_read_s64,
+		.write_s64 = cpu_weight_nice_write_s64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_online	= cpu_cgroup_css_online,
@@ -6687,7 +6856,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.legacy_cftypes	= cpu_legacy_files,
+	.dfl_cftypes	= cpu_files,
 	.early_init	= true,
+	.threaded	= true,
 };
 
 #endif	/* CONFIG_CGROUP_SCHED */
-- 
cgit v1.2.3


From 721e08dad17e226ef68819d0a23dc53c25fe8ea5 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 29 Sep 2017 10:52:17 -0700
Subject: bpf: Fix compiler warning on info.map_ids for 32bit platform

This patch uses u64_to_user_ptr() to cast info.map_ids to a userspace ptr.
It also tags the user_map_ids with '__user' for sparse check.

Fixes: cb4d2b3f03d8 ("bpf: Add name, load_time, uid and map_ids to bpf_prog_info")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 11a7f82a55d1..b927da66f653 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1405,7 +1405,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.nr_map_ids = prog->aux->used_map_cnt;
 	ulen = min_t(u32, info.nr_map_ids, ulen);
 	if (ulen) {
-		u32 *user_map_ids = (u32 *)info.map_ids;
+		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
 		u32 i;
 
 		for (i = 0; i < ulen; i++)
-- 
cgit v1.2.3


From 7813dd6fc75fb375d4caf002e7f80a826fc3153a Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 26 Sep 2017 15:12:40 -0700
Subject: PM / OPP: Move the OPP directory out of power/

The drivers/base/power/ directory is special and contains code related
to power management core like system suspend/resume, hibernation, etc.
It was fine to keep the OPP code inside it when we had just one file for
it, but it is growing now and already has a directory for itself.

Lets move it directly under drivers/ directory, just like cpufreq and
cpuidle.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS                      |    2 +-
 drivers/Kconfig                  |    2 +
 drivers/Makefile                 |    1 +
 drivers/base/power/Makefile      |    1 -
 drivers/base/power/opp/Makefile  |    4 -
 drivers/base/power/opp/core.c    | 1747 --------------------------------------
 drivers/base/power/opp/cpu.c     |  236 -----
 drivers/base/power/opp/debugfs.c |  249 ------
 drivers/base/power/opp/of.c      |  633 --------------
 drivers/base/power/opp/opp.h     |  222 -----
 drivers/opp/Kconfig              |   13 +
 drivers/opp/Makefile             |    4 +
 drivers/opp/core.c               | 1747 ++++++++++++++++++++++++++++++++++++++
 drivers/opp/cpu.c                |  236 +++++
 drivers/opp/debugfs.c            |  249 ++++++
 drivers/opp/of.c                 |  633 ++++++++++++++
 drivers/opp/opp.h                |  222 +++++
 kernel/power/Kconfig             |   14 -
 18 files changed, 3108 insertions(+), 3107 deletions(-)
 delete mode 100644 drivers/base/power/opp/Makefile
 delete mode 100644 drivers/base/power/opp/core.c
 delete mode 100644 drivers/base/power/opp/cpu.c
 delete mode 100644 drivers/base/power/opp/debugfs.c
 delete mode 100644 drivers/base/power/opp/of.c
 delete mode 100644 drivers/base/power/opp/opp.h
 create mode 100644 drivers/opp/Kconfig
 create mode 100644 drivers/opp/Makefile
 create mode 100644 drivers/opp/core.c
 create mode 100644 drivers/opp/cpu.c
 create mode 100644 drivers/opp/debugfs.c
 create mode 100644 drivers/opp/of.c
 create mode 100644 drivers/opp/opp.h

(limited to 'kernel')

diff --git a/MAINTAINERS b/MAINTAINERS
index 65b0c88d5ee0..7c8c649fc68b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10043,7 +10043,7 @@ M:	Stephen Boyd <sboyd@codeaurora.org>
 L:	linux-pm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git
-F:	drivers/base/power/opp/
+F:	drivers/opp/
 F:	include/linux/pm_opp.h
 F:	Documentation/power/opp.txt
 F:	Documentation/devicetree/bindings/opp/
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 505c676fa9c7..9e264d410c23 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -208,4 +208,6 @@ source "drivers/tee/Kconfig"
 
 source "drivers/mux/Kconfig"
 
+source "drivers/opp/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index d90fdc413648..dd718a3007e9 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -125,6 +125,7 @@ obj-$(CONFIG_ACCESSIBILITY)	+= accessibility/
 obj-$(CONFIG_ISDN)		+= isdn/
 obj-$(CONFIG_EDAC)		+= edac/
 obj-$(CONFIG_EISA)		+= eisa/
+obj-$(CONFIG_PM_OPP)		+= opp/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
 obj-y				+= mmc/
diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 5998c53280f5..73a1cffc0a5f 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,7 +1,6 @@
 obj-$(CONFIG_PM)	+= sysfs.o generic_ops.o common.o qos.o runtime.o wakeirq.o
 obj-$(CONFIG_PM_SLEEP)	+= main.o wakeup.o
 obj-$(CONFIG_PM_TRACE_RTC)	+= trace.o
-obj-$(CONFIG_PM_OPP)	+= opp/
 obj-$(CONFIG_PM_GENERIC_DOMAINS)	+=  domain.o domain_governor.o
 obj-$(CONFIG_HAVE_CLK)	+= clock_ops.o
 
diff --git a/drivers/base/power/opp/Makefile b/drivers/base/power/opp/Makefile
deleted file mode 100644
index e70ceb406fe9..000000000000
--- a/drivers/base/power/opp/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-ccflags-$(CONFIG_DEBUG_DRIVER)	:= -DDEBUG
-obj-y				+= core.o cpu.o
-obj-$(CONFIG_OF)		+= of.o
-obj-$(CONFIG_DEBUG_FS)		+= debugfs.o
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
deleted file mode 100644
index a6de32530693..000000000000
--- a/drivers/base/power/opp/core.c
+++ /dev/null
@@ -1,1747 +0,0 @@
-/*
- * Generic OPP Interface
- *
- * Copyright (C) 2009-2010 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/clk.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/export.h>
-#include <linux/regulator/consumer.h>
-
-#include "opp.h"
-
-/*
- * The root of the list of all opp-tables. All opp_table structures branch off
- * from here, with each opp_table containing the list of opps it supports in
- * various states of availability.
- */
-LIST_HEAD(opp_tables);
-/* Lock to allow exclusive modification to the device and opp lists */
-DEFINE_MUTEX(opp_table_lock);
-
-static void dev_pm_opp_get(struct dev_pm_opp *opp);
-
-static struct opp_device *_find_opp_dev(const struct device *dev,
-					struct opp_table *opp_table)
-{
-	struct opp_device *opp_dev;
-
-	list_for_each_entry(opp_dev, &opp_table->dev_list, node)
-		if (opp_dev->dev == dev)
-			return opp_dev;
-
-	return NULL;
-}
-
-static struct opp_table *_find_opp_table_unlocked(struct device *dev)
-{
-	struct opp_table *opp_table;
-
-	list_for_each_entry(opp_table, &opp_tables, node) {
-		if (_find_opp_dev(dev, opp_table)) {
-			_get_opp_table_kref(opp_table);
-
-			return opp_table;
-		}
-	}
-
-	return ERR_PTR(-ENODEV);
-}
-
-/**
- * _find_opp_table() - find opp_table struct using device pointer
- * @dev:	device pointer used to lookup OPP table
- *
- * Search OPP table for one containing matching device.
- *
- * Return: pointer to 'struct opp_table' if found, otherwise -ENODEV or
- * -EINVAL based on type of error.
- *
- * The callers must call dev_pm_opp_put_opp_table() after the table is used.
- */
-struct opp_table *_find_opp_table(struct device *dev)
-{
-	struct opp_table *opp_table;
-
-	if (IS_ERR_OR_NULL(dev)) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return ERR_PTR(-EINVAL);
-	}
-
-	mutex_lock(&opp_table_lock);
-	opp_table = _find_opp_table_unlocked(dev);
-	mutex_unlock(&opp_table_lock);
-
-	return opp_table;
-}
-
-/**
- * dev_pm_opp_get_voltage() - Gets the voltage corresponding to an opp
- * @opp:	opp for which voltage has to be returned for
- *
- * Return: voltage in micro volt corresponding to the opp, else
- * return 0
- *
- * This is useful only for devices with single power supply.
- */
-unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
-{
-	if (IS_ERR_OR_NULL(opp)) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return 0;
-	}
-
-	return opp->supplies[0].u_volt;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
-
-/**
- * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
- * @opp:	opp for which frequency has to be returned for
- *
- * Return: frequency in hertz corresponding to the opp, else
- * return 0
- */
-unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
-{
-	if (IS_ERR_OR_NULL(opp) || !opp->available) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return 0;
-	}
-
-	return opp->rate;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
-
-/**
- * dev_pm_opp_is_turbo() - Returns if opp is turbo OPP or not
- * @opp: opp for which turbo mode is being verified
- *
- * Turbo OPPs are not for normal use, and can be enabled (under certain
- * conditions) for short duration of times to finish high throughput work
- * quickly. Running on them for longer times may overheat the chip.
- *
- * Return: true if opp is turbo opp, else false.
- */
-bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
-{
-	if (IS_ERR_OR_NULL(opp) || !opp->available) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return false;
-	}
-
-	return opp->turbo;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo);
-
-/**
- * dev_pm_opp_get_max_clock_latency() - Get max clock latency in nanoseconds
- * @dev:	device for which we do this operation
- *
- * Return: This function returns the max clock latency in nanoseconds.
- */
-unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
-{
-	struct opp_table *opp_table;
-	unsigned long clock_latency_ns;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return 0;
-
-	clock_latency_ns = opp_table->clock_latency_ns_max;
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return clock_latency_ns;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
-
-/**
- * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
- * @dev: device for which we do this operation
- *
- * Return: This function returns the max voltage latency in nanoseconds.
- */
-unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *opp;
-	struct regulator *reg;
-	unsigned long latency_ns = 0;
-	int ret, i, count;
-	struct {
-		unsigned long min;
-		unsigned long max;
-	} *uV;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return 0;
-
-	count = opp_table->regulator_count;
-
-	/* Regulator may not be required for the device */
-	if (!count)
-		goto put_opp_table;
-
-	uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL);
-	if (!uV)
-		goto put_opp_table;
-
-	mutex_lock(&opp_table->lock);
-
-	for (i = 0; i < count; i++) {
-		uV[i].min = ~0;
-		uV[i].max = 0;
-
-		list_for_each_entry(opp, &opp_table->opp_list, node) {
-			if (!opp->available)
-				continue;
-
-			if (opp->supplies[i].u_volt_min < uV[i].min)
-				uV[i].min = opp->supplies[i].u_volt_min;
-			if (opp->supplies[i].u_volt_max > uV[i].max)
-				uV[i].max = opp->supplies[i].u_volt_max;
-		}
-	}
-
-	mutex_unlock(&opp_table->lock);
-
-	/*
-	 * The caller needs to ensure that opp_table (and hence the regulator)
-	 * isn't freed, while we are executing this routine.
-	 */
-	for (i = 0; i < count; i++) {
-		reg = opp_table->regulators[i];
-		ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max);
-		if (ret > 0)
-			latency_ns += ret * 1000;
-	}
-
-	kfree(uV);
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return latency_ns;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency);
-
-/**
- * dev_pm_opp_get_max_transition_latency() - Get max transition latency in
- *					     nanoseconds
- * @dev: device for which we do this operation
- *
- * Return: This function returns the max transition latency, in nanoseconds, to
- * switch from one OPP to other.
- */
-unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
-{
-	return dev_pm_opp_get_max_volt_latency(dev) +
-		dev_pm_opp_get_max_clock_latency(dev);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency);
-
-/**
- * dev_pm_opp_get_suspend_opp_freq() - Get frequency of suspend opp in Hz
- * @dev:	device for which we do this operation
- *
- * Return: This function returns the frequency of the OPP marked as suspend_opp
- * if one is available, else returns 0;
- */
-unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev)
-{
-	struct opp_table *opp_table;
-	unsigned long freq = 0;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return 0;
-
-	if (opp_table->suspend_opp && opp_table->suspend_opp->available)
-		freq = dev_pm_opp_get_freq(opp_table->suspend_opp);
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return freq;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp_freq);
-
-/**
- * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table
- * @dev:	device for which we do this operation
- *
- * Return: This function returns the number of available opps if there are any,
- * else returns 0 if none or the corresponding error value.
- */
-int dev_pm_opp_get_opp_count(struct device *dev)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *temp_opp;
-	int count = 0;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		count = PTR_ERR(opp_table);
-		dev_err(dev, "%s: OPP table not found (%d)\n",
-			__func__, count);
-		return count;
-	}
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available)
-			count++;
-	}
-
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return count;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
-
-/**
- * dev_pm_opp_find_freq_exact() - search for an exact frequency
- * @dev:		device for which we do this operation
- * @freq:		frequency to search for
- * @available:		true/false - match for available opp
- *
- * Return: Searches for exact match in the opp table and returns pointer to the
- * matching opp if found, else returns ERR_PTR in case of error and should
- * be handled using IS_ERR. Error return values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * Note: available is a modifier for the search. if available=true, then the
- * match is for exact matching frequency and is available in the stored OPP
- * table. if false, the match is for exact frequency which is not available.
- *
- * This provides a mechanism to enable an opp which is not available currently
- * or the opposite as well.
- *
- * The callers are required to call dev_pm_opp_put() for the returned OPP after
- * use.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
-					      unsigned long freq,
-					      bool available)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		int r = PTR_ERR(opp_table);
-
-		dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r);
-		return ERR_PTR(r);
-	}
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available == available &&
-				temp_opp->rate == freq) {
-			opp = temp_opp;
-
-			/* Increment the reference count of OPP */
-			dev_pm_opp_get(opp);
-			break;
-		}
-	}
-
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
-
-static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table,
-						   unsigned long *freq)
-{
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available && temp_opp->rate >= *freq) {
-			opp = temp_opp;
-			*freq = opp->rate;
-
-			/* Increment the reference count of OPP */
-			dev_pm_opp_get(opp);
-			break;
-		}
-	}
-
-	mutex_unlock(&opp_table->lock);
-
-	return opp;
-}
-
-/**
- * dev_pm_opp_find_freq_ceil() - Search for an rounded ceil freq
- * @dev:	device for which we do this operation
- * @freq:	Start frequency
- *
- * Search for the matching ceil *available* OPP from a starting freq
- * for a device.
- *
- * Return: matching *opp and refreshes *freq accordingly, else returns
- * ERR_PTR in case of error and should be handled using IS_ERR. Error return
- * values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * The callers are required to call dev_pm_opp_put() for the returned OPP after
- * use.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
-					     unsigned long *freq)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *opp;
-
-	if (!dev || !freq) {
-		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
-		return ERR_PTR(-EINVAL);
-	}
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return ERR_CAST(opp_table);
-
-	opp = _find_freq_ceil(opp_table, freq);
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
-
-/**
- * dev_pm_opp_find_freq_floor() - Search for a rounded floor freq
- * @dev:	device for which we do this operation
- * @freq:	Start frequency
- *
- * Search for the matching floor *available* OPP from a starting freq
- * for a device.
- *
- * Return: matching *opp and refreshes *freq accordingly, else returns
- * ERR_PTR in case of error and should be handled using IS_ERR. Error return
- * values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * The callers are required to call dev_pm_opp_put() for the returned OPP after
- * use.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
-					      unsigned long *freq)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	if (!dev || !freq) {
-		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
-		return ERR_PTR(-EINVAL);
-	}
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return ERR_CAST(opp_table);
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available) {
-			/* go to the next node, before choosing prev */
-			if (temp_opp->rate > *freq)
-				break;
-			else
-				opp = temp_opp;
-		}
-	}
-
-	/* Increment the reference count of OPP */
-	if (!IS_ERR(opp))
-		dev_pm_opp_get(opp);
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-
-	if (!IS_ERR(opp))
-		*freq = opp->rate;
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
-
-static int _set_opp_voltage(struct device *dev, struct regulator *reg,
-			    struct dev_pm_opp_supply *supply)
-{
-	int ret;
-
-	/* Regulator not available for device */
-	if (IS_ERR(reg)) {
-		dev_dbg(dev, "%s: regulator not available: %ld\n", __func__,
-			PTR_ERR(reg));
-		return 0;
-	}
-
-	dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__,
-		supply->u_volt_min, supply->u_volt, supply->u_volt_max);
-
-	ret = regulator_set_voltage_triplet(reg, supply->u_volt_min,
-					    supply->u_volt, supply->u_volt_max);
-	if (ret)
-		dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
-			__func__, supply->u_volt_min, supply->u_volt,
-			supply->u_volt_max, ret);
-
-	return ret;
-}
-
-static inline int
-_generic_set_opp_clk_only(struct device *dev, struct clk *clk,
-			  unsigned long old_freq, unsigned long freq)
-{
-	int ret;
-
-	ret = clk_set_rate(clk, freq);
-	if (ret) {
-		dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
-			ret);
-	}
-
-	return ret;
-}
-
-static int _generic_set_opp_regulator(const struct opp_table *opp_table,
-				      struct device *dev,
-				      unsigned long old_freq,
-				      unsigned long freq,
-				      struct dev_pm_opp_supply *old_supply,
-				      struct dev_pm_opp_supply *new_supply)
-{
-	struct regulator *reg = opp_table->regulators[0];
-	int ret;
-
-	/* This function only supports single regulator per device */
-	if (WARN_ON(opp_table->regulator_count > 1)) {
-		dev_err(dev, "multiple regulators are not supported\n");
-		return -EINVAL;
-	}
-
-	/* Scaling up? Scale voltage before frequency */
-	if (freq > old_freq) {
-		ret = _set_opp_voltage(dev, reg, new_supply);
-		if (ret)
-			goto restore_voltage;
-	}
-
-	/* Change frequency */
-	ret = _generic_set_opp_clk_only(dev, opp_table->clk, old_freq, freq);
-	if (ret)
-		goto restore_voltage;
-
-	/* Scaling down? Scale voltage after frequency */
-	if (freq < old_freq) {
-		ret = _set_opp_voltage(dev, reg, new_supply);
-		if (ret)
-			goto restore_freq;
-	}
-
-	return 0;
-
-restore_freq:
-	if (_generic_set_opp_clk_only(dev, opp_table->clk, freq, old_freq))
-		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
-			__func__, old_freq);
-restore_voltage:
-	/* This shouldn't harm even if the voltages weren't updated earlier */
-	if (old_supply)
-		_set_opp_voltage(dev, reg, old_supply);
-
-	return ret;
-}
-
-/**
- * dev_pm_opp_set_rate() - Configure new OPP based on frequency
- * @dev:	 device for which we do this operation
- * @target_freq: frequency to achieve
- *
- * This configures the power-supplies and clock source to the levels specified
- * by the OPP corresponding to the target_freq.
- */
-int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
-{
-	struct opp_table *opp_table;
-	unsigned long freq, old_freq;
-	struct dev_pm_opp *old_opp, *opp;
-	struct clk *clk;
-	int ret, size;
-
-	if (unlikely(!target_freq)) {
-		dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
-			target_freq);
-		return -EINVAL;
-	}
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		dev_err(dev, "%s: device opp doesn't exist\n", __func__);
-		return PTR_ERR(opp_table);
-	}
-
-	clk = opp_table->clk;
-	if (IS_ERR(clk)) {
-		dev_err(dev, "%s: No clock available for the device\n",
-			__func__);
-		ret = PTR_ERR(clk);
-		goto put_opp_table;
-	}
-
-	freq = clk_round_rate(clk, target_freq);
-	if ((long)freq <= 0)
-		freq = target_freq;
-
-	old_freq = clk_get_rate(clk);
-
-	/* Return early if nothing to do */
-	if (old_freq == freq) {
-		dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n",
-			__func__, freq);
-		ret = 0;
-		goto put_opp_table;
-	}
-
-	old_opp = _find_freq_ceil(opp_table, &old_freq);
-	if (IS_ERR(old_opp)) {
-		dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n",
-			__func__, old_freq, PTR_ERR(old_opp));
-	}
-
-	opp = _find_freq_ceil(opp_table, &freq);
-	if (IS_ERR(opp)) {
-		ret = PTR_ERR(opp);
-		dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n",
-			__func__, freq, ret);
-		goto put_old_opp;
-	}
-
-	dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__,
-		old_freq, freq);
-
-	/* Only frequency scaling */
-	if (!opp_table->regulators) {
-		ret = _generic_set_opp_clk_only(dev, clk, old_freq, freq);
-	} else if (!opp_table->set_opp) {
-		ret = _generic_set_opp_regulator(opp_table, dev, old_freq, freq,
-						 IS_ERR(old_opp) ? NULL : old_opp->supplies,
-						 opp->supplies);
-	} else {
-		struct dev_pm_set_opp_data *data;
-
-		data = opp_table->set_opp_data;
-		data->regulators = opp_table->regulators;
-		data->regulator_count = opp_table->regulator_count;
-		data->clk = clk;
-		data->dev = dev;
-
-		data->old_opp.rate = old_freq;
-		size = sizeof(*opp->supplies) * opp_table->regulator_count;
-		if (IS_ERR(old_opp))
-			memset(data->old_opp.supplies, 0, size);
-		else
-			memcpy(data->old_opp.supplies, old_opp->supplies, size);
-
-		data->new_opp.rate = freq;
-		memcpy(data->new_opp.supplies, opp->supplies, size);
-
-		ret = opp_table->set_opp(data);
-	}
-
-	dev_pm_opp_put(opp);
-put_old_opp:
-	if (!IS_ERR(old_opp))
-		dev_pm_opp_put(old_opp);
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
-
-/* OPP-dev Helpers */
-static void _remove_opp_dev(struct opp_device *opp_dev,
-			    struct opp_table *opp_table)
-{
-	opp_debug_unregister(opp_dev, opp_table);
-	list_del(&opp_dev->node);
-	kfree(opp_dev);
-}
-
-struct opp_device *_add_opp_dev(const struct device *dev,
-				struct opp_table *opp_table)
-{
-	struct opp_device *opp_dev;
-	int ret;
-
-	opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL);
-	if (!opp_dev)
-		return NULL;
-
-	/* Initialize opp-dev */
-	opp_dev->dev = dev;
-	list_add(&opp_dev->node, &opp_table->dev_list);
-
-	/* Create debugfs entries for the opp_table */
-	ret = opp_debug_register(opp_dev, opp_table);
-	if (ret)
-		dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
-			__func__, ret);
-
-	return opp_dev;
-}
-
-static struct opp_table *_allocate_opp_table(struct device *dev)
-{
-	struct opp_table *opp_table;
-	struct opp_device *opp_dev;
-	int ret;
-
-	/*
-	 * Allocate a new OPP table. In the infrequent case where a new
-	 * device is needed to be added, we pay this penalty.
-	 */
-	opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL);
-	if (!opp_table)
-		return NULL;
-
-	INIT_LIST_HEAD(&opp_table->dev_list);
-
-	opp_dev = _add_opp_dev(dev, opp_table);
-	if (!opp_dev) {
-		kfree(opp_table);
-		return NULL;
-	}
-
-	_of_init_opp_table(opp_table, dev);
-
-	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, NULL);
-	if (IS_ERR(opp_table->clk)) {
-		ret = PTR_ERR(opp_table->clk);
-		if (ret != -EPROBE_DEFER)
-			dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__,
-				ret);
-	}
-
-	BLOCKING_INIT_NOTIFIER_HEAD(&opp_table->head);
-	INIT_LIST_HEAD(&opp_table->opp_list);
-	mutex_init(&opp_table->lock);
-	kref_init(&opp_table->kref);
-
-	/* Secure the device table modification */
-	list_add(&opp_table->node, &opp_tables);
-	return opp_table;
-}
-
-void _get_opp_table_kref(struct opp_table *opp_table)
-{
-	kref_get(&opp_table->kref);
-}
-
-struct opp_table *dev_pm_opp_get_opp_table(struct device *dev)
-{
-	struct opp_table *opp_table;
-
-	/* Hold our table modification lock here */
-	mutex_lock(&opp_table_lock);
-
-	opp_table = _find_opp_table_unlocked(dev);
-	if (!IS_ERR(opp_table))
-		goto unlock;
-
-	opp_table = _allocate_opp_table(dev);
-
-unlock:
-	mutex_unlock(&opp_table_lock);
-
-	return opp_table;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_table);
-
-static void _opp_table_kref_release(struct kref *kref)
-{
-	struct opp_table *opp_table = container_of(kref, struct opp_table, kref);
-	struct opp_device *opp_dev;
-
-	/* Release clk */
-	if (!IS_ERR(opp_table->clk))
-		clk_put(opp_table->clk);
-
-	opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device,
-				   node);
-
-	_remove_opp_dev(opp_dev, opp_table);
-
-	/* dev_list must be empty now */
-	WARN_ON(!list_empty(&opp_table->dev_list));
-
-	mutex_destroy(&opp_table->lock);
-	list_del(&opp_table->node);
-	kfree(opp_table);
-
-	mutex_unlock(&opp_table_lock);
-}
-
-void dev_pm_opp_put_opp_table(struct opp_table *opp_table)
-{
-	kref_put_mutex(&opp_table->kref, _opp_table_kref_release,
-		       &opp_table_lock);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_opp_table);
-
-void _opp_free(struct dev_pm_opp *opp)
-{
-	kfree(opp);
-}
-
-static void _opp_kref_release(struct kref *kref)
-{
-	struct dev_pm_opp *opp = container_of(kref, struct dev_pm_opp, kref);
-	struct opp_table *opp_table = opp->opp_table;
-
-	/*
-	 * Notify the changes in the availability of the operable
-	 * frequency/voltage list.
-	 */
-	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_REMOVE, opp);
-	opp_debug_remove_one(opp);
-	list_del(&opp->node);
-	kfree(opp);
-
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-}
-
-static void dev_pm_opp_get(struct dev_pm_opp *opp)
-{
-	kref_get(&opp->kref);
-}
-
-void dev_pm_opp_put(struct dev_pm_opp *opp)
-{
-	kref_put_mutex(&opp->kref, _opp_kref_release, &opp->opp_table->lock);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put);
-
-/**
- * dev_pm_opp_remove()  - Remove an OPP from OPP table
- * @dev:	device for which we do this operation
- * @freq:	OPP to remove with matching 'freq'
- *
- * This function removes an opp from the opp table.
- */
-void dev_pm_opp_remove(struct device *dev, unsigned long freq)
-{
-	struct dev_pm_opp *opp;
-	struct opp_table *opp_table;
-	bool found = false;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return;
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(opp, &opp_table->opp_list, node) {
-		if (opp->rate == freq) {
-			found = true;
-			break;
-		}
-	}
-
-	mutex_unlock(&opp_table->lock);
-
-	if (found) {
-		dev_pm_opp_put(opp);
-	} else {
-		dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n",
-			 __func__, freq);
-	}
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
-
-struct dev_pm_opp *_opp_allocate(struct opp_table *table)
-{
-	struct dev_pm_opp *opp;
-	int count, supply_size;
-
-	/* Allocate space for at least one supply */
-	count = table->regulator_count ? table->regulator_count : 1;
-	supply_size = sizeof(*opp->supplies) * count;
-
-	/* allocate new OPP node and supplies structures */
-	opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL);
-	if (!opp)
-		return NULL;
-
-	/* Put the supplies at the end of the OPP structure as an empty array */
-	opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
-	INIT_LIST_HEAD(&opp->node);
-
-	return opp;
-}
-
-static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
-					 struct opp_table *opp_table)
-{
-	struct regulator *reg;
-	int i;
-
-	for (i = 0; i < opp_table->regulator_count; i++) {
-		reg = opp_table->regulators[i];
-
-		if (!regulator_is_supported_voltage(reg,
-					opp->supplies[i].u_volt_min,
-					opp->supplies[i].u_volt_max)) {
-			pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
-				__func__, opp->supplies[i].u_volt_min,
-				opp->supplies[i].u_volt_max);
-			return false;
-		}
-	}
-
-	return true;
-}
-
-/*
- * Returns:
- * 0: On success. And appropriate error message for duplicate OPPs.
- * -EBUSY: For OPP with same freq/volt and is available. The callers of
- *  _opp_add() must return 0 if they receive -EBUSY from it. This is to make
- *  sure we don't print error messages unnecessarily if different parts of
- *  kernel try to initialize the OPP table.
- * -EEXIST: For OPP with same freq but different volt or is unavailable. This
- *  should be considered an error by the callers of _opp_add().
- */
-int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
-	     struct opp_table *opp_table)
-{
-	struct dev_pm_opp *opp;
-	struct list_head *head;
-	int ret;
-
-	/*
-	 * Insert new OPP in order of increasing frequency and discard if
-	 * already present.
-	 *
-	 * Need to use &opp_table->opp_list in the condition part of the 'for'
-	 * loop, don't replace it with head otherwise it will become an infinite
-	 * loop.
-	 */
-	mutex_lock(&opp_table->lock);
-	head = &opp_table->opp_list;
-
-	list_for_each_entry(opp, &opp_table->opp_list, node) {
-		if (new_opp->rate > opp->rate) {
-			head = &opp->node;
-			continue;
-		}
-
-		if (new_opp->rate < opp->rate)
-			break;
-
-		/* Duplicate OPPs */
-		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
-			 __func__, opp->rate, opp->supplies[0].u_volt,
-			 opp->available, new_opp->rate,
-			 new_opp->supplies[0].u_volt, new_opp->available);
-
-		/* Should we compare voltages for all regulators here ? */
-		ret = opp->available &&
-		      new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? -EBUSY : -EEXIST;
-
-		mutex_unlock(&opp_table->lock);
-		return ret;
-	}
-
-	list_add(&new_opp->node, head);
-	mutex_unlock(&opp_table->lock);
-
-	new_opp->opp_table = opp_table;
-	kref_init(&new_opp->kref);
-
-	/* Get a reference to the OPP table */
-	_get_opp_table_kref(opp_table);
-
-	ret = opp_debug_create_one(new_opp, opp_table);
-	if (ret)
-		dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
-			__func__, ret);
-
-	if (!_opp_supported_by_regulators(new_opp, opp_table)) {
-		new_opp->available = false;
-		dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
-			 __func__, new_opp->rate);
-	}
-
-	return 0;
-}
-
-/**
- * _opp_add_v1() - Allocate a OPP based on v1 bindings.
- * @opp_table:	OPP table
- * @dev:	device for which we do this operation
- * @freq:	Frequency in Hz for this OPP
- * @u_volt:	Voltage in uVolts for this OPP
- * @dynamic:	Dynamically added OPPs.
- *
- * This function adds an opp definition to the opp table and returns status.
- * The opp is made available by default and it can be controlled using
- * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
- *
- * NOTE: "dynamic" parameter impacts OPPs added by the dev_pm_opp_of_add_table
- * and freed by dev_pm_opp_of_remove_table.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- */
-int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
-		unsigned long freq, long u_volt, bool dynamic)
-{
-	struct dev_pm_opp *new_opp;
-	unsigned long tol;
-	int ret;
-
-	new_opp = _opp_allocate(opp_table);
-	if (!new_opp)
-		return -ENOMEM;
-
-	/* populate the opp table */
-	new_opp->rate = freq;
-	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
-	new_opp->supplies[0].u_volt = u_volt;
-	new_opp->supplies[0].u_volt_min = u_volt - tol;
-	new_opp->supplies[0].u_volt_max = u_volt + tol;
-	new_opp->available = true;
-	new_opp->dynamic = dynamic;
-
-	ret = _opp_add(dev, new_opp, opp_table);
-	if (ret) {
-		/* Don't return error for duplicate OPPs */
-		if (ret == -EBUSY)
-			ret = 0;
-		goto free_opp;
-	}
-
-	/*
-	 * Notify the changes in the availability of the operable
-	 * frequency/voltage list.
-	 */
-	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp);
-	return 0;
-
-free_opp:
-	_opp_free(new_opp);
-
-	return ret;
-}
-
-/**
- * dev_pm_opp_set_supported_hw() - Set supported platforms
- * @dev: Device for which supported-hw has to be set.
- * @versions: Array of hierarchy of versions to match.
- * @count: Number of elements in the array.
- *
- * This is required only for the V2 bindings, and it enables a platform to
- * specify the hierarchy of versions it supports. OPP layer will then enable
- * OPPs, which are available for those versions, based on its 'opp-supported-hw'
- * property.
- */
-struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
-			const u32 *versions, unsigned int count)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	/* Do we already have a version hierarchy associated with opp_table? */
-	if (opp_table->supported_hw) {
-		dev_err(dev, "%s: Already have supported hardware list\n",
-			__func__);
-		ret = -EBUSY;
-		goto err;
-	}
-
-	opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
-					GFP_KERNEL);
-	if (!opp_table->supported_hw) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	opp_table->supported_hw_count = count;
-
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
-
-/**
- * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
- * @opp_table: OPP table returned by dev_pm_opp_set_supported_hw().
- *
- * This is required only for the V2 bindings, and is called for a matching
- * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
- * will not be freed.
- */
-void dev_pm_opp_put_supported_hw(struct opp_table *opp_table)
-{
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	if (!opp_table->supported_hw) {
-		pr_err("%s: Doesn't have supported hardware list\n",
-		       __func__);
-		return;
-	}
-
-	kfree(opp_table->supported_hw);
-	opp_table->supported_hw = NULL;
-	opp_table->supported_hw_count = 0;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
-
-/**
- * dev_pm_opp_set_prop_name() - Set prop-extn name
- * @dev: Device for which the prop-name has to be set.
- * @name: name to postfix to properties.
- *
- * This is required only for the V2 bindings, and it enables a platform to
- * specify the extn to be used for certain property names. The properties to
- * which the extension will apply are opp-microvolt and opp-microamp. OPP core
- * should postfix the property name with -<name> while looking for them.
- */
-struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	/* Do we already have a prop-name associated with opp_table? */
-	if (opp_table->prop_name) {
-		dev_err(dev, "%s: Already have prop-name %s\n", __func__,
-			opp_table->prop_name);
-		ret = -EBUSY;
-		goto err;
-	}
-
-	opp_table->prop_name = kstrdup(name, GFP_KERNEL);
-	if (!opp_table->prop_name) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
-
-/**
- * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
- * @opp_table: OPP table returned by dev_pm_opp_set_prop_name().
- *
- * This is required only for the V2 bindings, and is called for a matching
- * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
- * will not be freed.
- */
-void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
-{
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	if (!opp_table->prop_name) {
-		pr_err("%s: Doesn't have a prop-name\n", __func__);
-		return;
-	}
-
-	kfree(opp_table->prop_name);
-	opp_table->prop_name = NULL;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
-
-static int _allocate_set_opp_data(struct opp_table *opp_table)
-{
-	struct dev_pm_set_opp_data *data;
-	int len, count = opp_table->regulator_count;
-
-	if (WARN_ON(!count))
-		return -EINVAL;
-
-	/* space for set_opp_data */
-	len = sizeof(*data);
-
-	/* space for old_opp.supplies and new_opp.supplies */
-	len += 2 * sizeof(struct dev_pm_opp_supply) * count;
-
-	data = kzalloc(len, GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	data->old_opp.supplies = (void *)(data + 1);
-	data->new_opp.supplies = data->old_opp.supplies + count;
-
-	opp_table->set_opp_data = data;
-
-	return 0;
-}
-
-static void _free_set_opp_data(struct opp_table *opp_table)
-{
-	kfree(opp_table->set_opp_data);
-	opp_table->set_opp_data = NULL;
-}
-
-/**
- * dev_pm_opp_set_regulators() - Set regulator names for the device
- * @dev: Device for which regulator name is being set.
- * @names: Array of pointers to the names of the regulator.
- * @count: Number of regulators.
- *
- * In order to support OPP switching, OPP layer needs to know the name of the
- * device's regulators, as the core would be required to switch voltages as
- * well.
- *
- * This must be called before any OPPs are initialized for the device.
- */
-struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
-					    const char * const names[],
-					    unsigned int count)
-{
-	struct opp_table *opp_table;
-	struct regulator *reg;
-	int ret, i;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	/* Already have regulators set */
-	if (opp_table->regulators) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	opp_table->regulators = kmalloc_array(count,
-					      sizeof(*opp_table->regulators),
-					      GFP_KERNEL);
-	if (!opp_table->regulators) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	for (i = 0; i < count; i++) {
-		reg = regulator_get_optional(dev, names[i]);
-		if (IS_ERR(reg)) {
-			ret = PTR_ERR(reg);
-			if (ret != -EPROBE_DEFER)
-				dev_err(dev, "%s: no regulator (%s) found: %d\n",
-					__func__, names[i], ret);
-			goto free_regulators;
-		}
-
-		opp_table->regulators[i] = reg;
-	}
-
-	opp_table->regulator_count = count;
-
-	/* Allocate block only once to pass to set_opp() routines */
-	ret = _allocate_set_opp_data(opp_table);
-	if (ret)
-		goto free_regulators;
-
-	return opp_table;
-
-free_regulators:
-	while (i != 0)
-		regulator_put(opp_table->regulators[--i]);
-
-	kfree(opp_table->regulators);
-	opp_table->regulators = NULL;
-	opp_table->regulator_count = 0;
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
-
-/**
- * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
- * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
- */
-void dev_pm_opp_put_regulators(struct opp_table *opp_table)
-{
-	int i;
-
-	if (!opp_table->regulators) {
-		pr_err("%s: Doesn't have regulators set\n", __func__);
-		return;
-	}
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	for (i = opp_table->regulator_count - 1; i >= 0; i--)
-		regulator_put(opp_table->regulators[i]);
-
-	_free_set_opp_data(opp_table);
-
-	kfree(opp_table->regulators);
-	opp_table->regulators = NULL;
-	opp_table->regulator_count = 0;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
-
-/**
- * dev_pm_opp_set_clkname() - Set clk name for the device
- * @dev: Device for which clk name is being set.
- * @name: Clk name.
- *
- * In order to support OPP switching, OPP layer needs to get pointer to the
- * clock for the device. Simple cases work fine without using this routine (i.e.
- * by passing connection-id as NULL), but for a device with multiple clocks
- * available, the OPP core needs to know the exact name of the clk to use.
- *
- * This must be called before any OPPs are initialized for the device.
- */
-struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	/* Already have default clk set, free it */
-	if (!IS_ERR(opp_table->clk))
-		clk_put(opp_table->clk);
-
-	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, name);
-	if (IS_ERR(opp_table->clk)) {
-		ret = PTR_ERR(opp_table->clk);
-		if (ret != -EPROBE_DEFER) {
-			dev_err(dev, "%s: Couldn't find clock: %d\n", __func__,
-				ret);
-		}
-		goto err;
-	}
-
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
-
-/**
- * dev_pm_opp_put_clkname() - Releases resources blocked for clk.
- * @opp_table: OPP table returned from dev_pm_opp_set_clkname().
- */
-void dev_pm_opp_put_clkname(struct opp_table *opp_table)
-{
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	clk_put(opp_table->clk);
-	opp_table->clk = ERR_PTR(-EINVAL);
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
-
-/**
- * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
- * @dev: Device for which the helper is getting registered.
- * @set_opp: Custom set OPP helper.
- *
- * This is useful to support complex platforms (like platforms with multiple
- * regulators per device), instead of the generic OPP set rate helper.
- *
- * This must be called before any OPPs are initialized for the device.
- */
-struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
-			int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	if (!set_opp)
-		return ERR_PTR(-EINVAL);
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	/* Already have custom set_opp helper */
-	if (WARN_ON(opp_table->set_opp)) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	opp_table->set_opp = set_opp;
-
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
-
-/**
- * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for
- *					   set_opp helper
- * @opp_table: OPP table returned from dev_pm_opp_register_set_opp_helper().
- *
- * Release resources blocked for platform specific set_opp helper.
- */
-void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table)
-{
-	if (!opp_table->set_opp) {
-		pr_err("%s: Doesn't have custom set_opp helper set\n",
-		       __func__);
-		return;
-	}
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	opp_table->set_opp = NULL;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper);
-
-/**
- * dev_pm_opp_add()  - Add an OPP table from a table definitions
- * @dev:	device for which we do this operation
- * @freq:	Frequency in Hz for this OPP
- * @u_volt:	Voltage in uVolts for this OPP
- *
- * This function adds an opp definition to the opp table and returns status.
- * The opp is made available by default and it can be controlled using
- * dev_pm_opp_enable/disable functions.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- */
-int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return -ENOMEM;
-
-	ret = _opp_add_v1(opp_table, dev, freq, u_volt, true);
-
-	dev_pm_opp_put_opp_table(opp_table);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_add);
-
-/**
- * _opp_set_availability() - helper to set the availability of an opp
- * @dev:		device for which we do this operation
- * @freq:		OPP frequency to modify availability
- * @availability_req:	availability status requested for this opp
- *
- * Set the availability of an OPP, opp_{enable,disable} share a common logic
- * which is isolated here.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- */
-static int _opp_set_availability(struct device *dev, unsigned long freq,
-				 bool availability_req)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *tmp_opp, *opp = ERR_PTR(-ENODEV);
-	int r = 0;
-
-	/* Find the opp_table */
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		r = PTR_ERR(opp_table);
-		dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
-		return r;
-	}
-
-	mutex_lock(&opp_table->lock);
-
-	/* Do we have the frequency? */
-	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
-		if (tmp_opp->rate == freq) {
-			opp = tmp_opp;
-			break;
-		}
-	}
-
-	if (IS_ERR(opp)) {
-		r = PTR_ERR(opp);
-		goto unlock;
-	}
-
-	/* Is update really needed? */
-	if (opp->available == availability_req)
-		goto unlock;
-
-	opp->available = availability_req;
-
-	dev_pm_opp_get(opp);
-	mutex_unlock(&opp_table->lock);
-
-	/* Notify the change of the OPP availability */
-	if (availability_req)
-		blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE,
-					     opp);
-	else
-		blocking_notifier_call_chain(&opp_table->head,
-					     OPP_EVENT_DISABLE, opp);
-
-	dev_pm_opp_put(opp);
-	goto put_table;
-
-unlock:
-	mutex_unlock(&opp_table->lock);
-put_table:
-	dev_pm_opp_put_opp_table(opp_table);
-	return r;
-}
-
-/**
- * dev_pm_opp_enable() - Enable a specific OPP
- * @dev:	device for which we do this operation
- * @freq:	OPP frequency to enable
- *
- * Enables a provided opp. If the operation is valid, this returns 0, else the
- * corresponding error value. It is meant to be used for users an OPP available
- * after being temporarily made unavailable with dev_pm_opp_disable.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- */
-int dev_pm_opp_enable(struct device *dev, unsigned long freq)
-{
-	return _opp_set_availability(dev, freq, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
-
-/**
- * dev_pm_opp_disable() - Disable a specific OPP
- * @dev:	device for which we do this operation
- * @freq:	OPP frequency to disable
- *
- * Disables a provided opp. If the operation is valid, this returns
- * 0, else the corresponding error value. It is meant to be a temporary
- * control by users to make this OPP not available until the circumstances are
- * right to make it available again (with a call to dev_pm_opp_enable).
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- */
-int dev_pm_opp_disable(struct device *dev, unsigned long freq)
-{
-	return _opp_set_availability(dev, freq, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
-
-/**
- * dev_pm_opp_register_notifier() - Register OPP notifier for the device
- * @dev:	Device for which notifier needs to be registered
- * @nb:		Notifier block to be registered
- *
- * Return: 0 on success or a negative error value.
- */
-int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	ret = blocking_notifier_chain_register(&opp_table->head, nb);
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-EXPORT_SYMBOL(dev_pm_opp_register_notifier);
-
-/**
- * dev_pm_opp_unregister_notifier() - Unregister OPP notifier for the device
- * @dev:	Device for which notifier needs to be unregistered
- * @nb:		Notifier block to be unregistered
- *
- * Return: 0 on success or a negative error value.
- */
-int dev_pm_opp_unregister_notifier(struct device *dev,
-				   struct notifier_block *nb)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	ret = blocking_notifier_chain_unregister(&opp_table->head, nb);
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-EXPORT_SYMBOL(dev_pm_opp_unregister_notifier);
-
-/*
- * Free OPPs either created using static entries present in DT or even the
- * dynamically added entries based on remove_all param.
- */
-void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev,
-			      bool remove_all)
-{
-	struct dev_pm_opp *opp, *tmp;
-
-	/* Find if opp_table manages a single device */
-	if (list_is_singular(&opp_table->dev_list)) {
-		/* Free static OPPs */
-		list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
-			if (remove_all || !opp->dynamic)
-				dev_pm_opp_put(opp);
-		}
-	} else {
-		_remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table);
-	}
-}
-
-void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all)
-{
-	struct opp_table *opp_table;
-
-	/* Check for existing table for 'dev' */
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		int error = PTR_ERR(opp_table);
-
-		if (error != -ENODEV)
-			WARN(1, "%s: opp_table: %d\n",
-			     IS_ERR_OR_NULL(dev) ?
-					"Invalid device" : dev_name(dev),
-			     error);
-		return;
-	}
-
-	_dev_pm_opp_remove_table(opp_table, dev, remove_all);
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-
-/**
- * dev_pm_opp_remove_table() - Free all OPPs associated with the device
- * @dev:	device pointer used to lookup OPP table.
- *
- * Free both OPPs created using static entries present in DT and the
- * dynamically added entries.
- */
-void dev_pm_opp_remove_table(struct device *dev)
-{
-	_dev_pm_opp_find_and_remove_table(dev, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_remove_table);
diff --git a/drivers/base/power/opp/cpu.c b/drivers/base/power/opp/cpu.c
deleted file mode 100644
index 2d87bc1adf38..000000000000
--- a/drivers/base/power/opp/cpu.c
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Generic OPP helper interface for CPU device
- *
- * Copyright (C) 2009-2014 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/cpu.h>
-#include <linux/cpufreq.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-
-#include "opp.h"
-
-#ifdef CONFIG_CPU_FREQ
-
-/**
- * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device
- * @dev:	device for which we do this operation
- * @table:	Cpufreq table returned back to caller
- *
- * Generate a cpufreq table for a provided device- this assumes that the
- * opp table is already initialized and ready for usage.
- *
- * This function allocates required memory for the cpufreq table. It is
- * expected that the caller does the required maintenance such as freeing
- * the table as required.
- *
- * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM
- * if no memory available for the operation (table is not populated), returns 0
- * if successful and table is populated.
- *
- * WARNING: It is  important for the callers to ensure refreshing their copy of
- * the table if any of the mentioned functions have been invoked in the interim.
- */
-int dev_pm_opp_init_cpufreq_table(struct device *dev,
-				  struct cpufreq_frequency_table **table)
-{
-	struct dev_pm_opp *opp;
-	struct cpufreq_frequency_table *freq_table = NULL;
-	int i, max_opps, ret = 0;
-	unsigned long rate;
-
-	max_opps = dev_pm_opp_get_opp_count(dev);
-	if (max_opps <= 0)
-		return max_opps ? max_opps : -ENODATA;
-
-	freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
-	if (!freq_table)
-		return -ENOMEM;
-
-	for (i = 0, rate = 0; i < max_opps; i++, rate++) {
-		/* find next rate */
-		opp = dev_pm_opp_find_freq_ceil(dev, &rate);
-		if (IS_ERR(opp)) {
-			ret = PTR_ERR(opp);
-			goto out;
-		}
-		freq_table[i].driver_data = i;
-		freq_table[i].frequency = rate / 1000;
-
-		/* Is Boost/turbo opp ? */
-		if (dev_pm_opp_is_turbo(opp))
-			freq_table[i].flags = CPUFREQ_BOOST_FREQ;
-
-		dev_pm_opp_put(opp);
-	}
-
-	freq_table[i].driver_data = i;
-	freq_table[i].frequency = CPUFREQ_TABLE_END;
-
-	*table = &freq_table[0];
-
-out:
-	if (ret)
-		kfree(freq_table);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table);
-
-/**
- * dev_pm_opp_free_cpufreq_table() - free the cpufreq table
- * @dev:	device for which we do this operation
- * @table:	table to free
- *
- * Free up the table allocated by dev_pm_opp_init_cpufreq_table
- */
-void dev_pm_opp_free_cpufreq_table(struct device *dev,
-				   struct cpufreq_frequency_table **table)
-{
-	if (!table)
-		return;
-
-	kfree(*table);
-	*table = NULL;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
-#endif	/* CONFIG_CPU_FREQ */
-
-void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of)
-{
-	struct device *cpu_dev;
-	int cpu;
-
-	WARN_ON(cpumask_empty(cpumask));
-
-	for_each_cpu(cpu, cpumask) {
-		cpu_dev = get_cpu_device(cpu);
-		if (!cpu_dev) {
-			pr_err("%s: failed to get cpu%d device\n", __func__,
-			       cpu);
-			continue;
-		}
-
-		if (of)
-			dev_pm_opp_of_remove_table(cpu_dev);
-		else
-			dev_pm_opp_remove_table(cpu_dev);
-	}
-}
-
-/**
- * dev_pm_opp_cpumask_remove_table() - Removes OPP table for @cpumask
- * @cpumask:	cpumask for which OPP table needs to be removed
- *
- * This removes the OPP tables for CPUs present in the @cpumask.
- * This should be used to remove all the OPPs entries associated with
- * the cpus in @cpumask.
- */
-void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask)
-{
-	_dev_pm_opp_cpumask_remove_table(cpumask, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_cpumask_remove_table);
-
-/**
- * dev_pm_opp_set_sharing_cpus() - Mark OPP table as shared by few CPUs
- * @cpu_dev:	CPU device for which we do this operation
- * @cpumask:	cpumask of the CPUs which share the OPP table with @cpu_dev
- *
- * This marks OPP table of the @cpu_dev as shared by the CPUs present in
- * @cpumask.
- *
- * Returns -ENODEV if OPP table isn't already present.
- */
-int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev,
-				const struct cpumask *cpumask)
-{
-	struct opp_device *opp_dev;
-	struct opp_table *opp_table;
-	struct device *dev;
-	int cpu, ret = 0;
-
-	opp_table = _find_opp_table(cpu_dev);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	for_each_cpu(cpu, cpumask) {
-		if (cpu == cpu_dev->id)
-			continue;
-
-		dev = get_cpu_device(cpu);
-		if (!dev) {
-			dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
-				__func__, cpu);
-			continue;
-		}
-
-		opp_dev = _add_opp_dev(dev, opp_table);
-		if (!opp_dev) {
-			dev_err(dev, "%s: failed to add opp-dev for cpu%d device\n",
-				__func__, cpu);
-			continue;
-		}
-
-		/* Mark opp-table as multiple CPUs are sharing it now */
-		opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED;
-	}
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_sharing_cpus);
-
-/**
- * dev_pm_opp_get_sharing_cpus() - Get cpumask of CPUs sharing OPPs with @cpu_dev
- * @cpu_dev:	CPU device for which we do this operation
- * @cpumask:	cpumask to update with information of sharing CPUs
- *
- * This updates the @cpumask with CPUs that are sharing OPPs with @cpu_dev.
- *
- * Returns -ENODEV if OPP table isn't already present and -EINVAL if the OPP
- * table's status is access-unknown.
- */
-int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
-{
-	struct opp_device *opp_dev;
-	struct opp_table *opp_table;
-	int ret = 0;
-
-	opp_table = _find_opp_table(cpu_dev);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	if (opp_table->shared_opp == OPP_TABLE_ACCESS_UNKNOWN) {
-		ret = -EINVAL;
-		goto put_opp_table;
-	}
-
-	cpumask_clear(cpumask);
-
-	if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) {
-		list_for_each_entry(opp_dev, &opp_table->dev_list, node)
-			cpumask_set_cpu(opp_dev->dev->id, cpumask);
-	} else {
-		cpumask_set_cpu(cpu_dev->id, cpumask);
-	}
-
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_sharing_cpus);
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
deleted file mode 100644
index 81cf120fcf43..000000000000
--- a/drivers/base/power/opp/debugfs.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Generic OPP debugfs interface
- *
- * Copyright (C) 2015-2016 Viresh Kumar <viresh.kumar@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/limits.h>
-#include <linux/slab.h>
-
-#include "opp.h"
-
-static struct dentry *rootdir;
-
-static void opp_set_dev_name(const struct device *dev, char *name)
-{
-	if (dev->parent)
-		snprintf(name, NAME_MAX, "%s-%s", dev_name(dev->parent),
-			 dev_name(dev));
-	else
-		snprintf(name, NAME_MAX, "%s", dev_name(dev));
-}
-
-void opp_debug_remove_one(struct dev_pm_opp *opp)
-{
-	debugfs_remove_recursive(opp->dentry);
-}
-
-static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
-				      struct opp_table *opp_table,
-				      struct dentry *pdentry)
-{
-	struct dentry *d;
-	int i;
-	char *name;
-
-	for (i = 0; i < opp_table->regulator_count; i++) {
-		name = kasprintf(GFP_KERNEL, "supply-%d", i);
-
-		/* Create per-opp directory */
-		d = debugfs_create_dir(name, pdentry);
-
-		kfree(name);
-
-		if (!d)
-			return false;
-
-		if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d,
-					  &opp->supplies[i].u_volt))
-			return false;
-
-		if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d,
-					  &opp->supplies[i].u_volt_min))
-			return false;
-
-		if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d,
-					  &opp->supplies[i].u_volt_max))
-			return false;
-
-		if (!debugfs_create_ulong("u_amp", S_IRUGO, d,
-					  &opp->supplies[i].u_amp))
-			return false;
-	}
-
-	return true;
-}
-
-int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
-{
-	struct dentry *pdentry = opp_table->dentry;
-	struct dentry *d;
-	char name[25];	/* 20 chars for 64 bit value + 5 (opp:\0) */
-
-	/* Rate is unique to each OPP, use it to give opp-name */
-	snprintf(name, sizeof(name), "opp:%lu", opp->rate);
-
-	/* Create per-opp directory */
-	d = debugfs_create_dir(name, pdentry);
-	if (!d)
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
-		return -ENOMEM;
-
-	if (!opp_debug_create_supplies(opp, opp_table, d))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
-				  &opp->clock_latency_ns))
-		return -ENOMEM;
-
-	opp->dentry = d;
-	return 0;
-}
-
-static int opp_list_debug_create_dir(struct opp_device *opp_dev,
-				     struct opp_table *opp_table)
-{
-	const struct device *dev = opp_dev->dev;
-	struct dentry *d;
-
-	opp_set_dev_name(dev, opp_table->dentry_name);
-
-	/* Create device specific directory */
-	d = debugfs_create_dir(opp_table->dentry_name, rootdir);
-	if (!d) {
-		dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
-		return -ENOMEM;
-	}
-
-	opp_dev->dentry = d;
-	opp_table->dentry = d;
-
-	return 0;
-}
-
-static int opp_list_debug_create_link(struct opp_device *opp_dev,
-				      struct opp_table *opp_table)
-{
-	const struct device *dev = opp_dev->dev;
-	char name[NAME_MAX];
-	struct dentry *d;
-
-	opp_set_dev_name(opp_dev->dev, name);
-
-	/* Create device specific directory link */
-	d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name);
-	if (!d) {
-		dev_err(dev, "%s: Failed to create link\n", __func__);
-		return -ENOMEM;
-	}
-
-	opp_dev->dentry = d;
-
-	return 0;
-}
-
-/**
- * opp_debug_register - add a device opp node to the debugfs 'opp' directory
- * @opp_dev: opp-dev pointer for device
- * @opp_table: the device-opp being added
- *
- * Dynamically adds device specific directory in debugfs 'opp' directory. If the
- * device-opp is shared with other devices, then links will be created for all
- * devices except the first.
- *
- * Return: 0 on success, otherwise negative error.
- */
-int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
-{
-	if (!rootdir) {
-		pr_debug("%s: Uninitialized rootdir\n", __func__);
-		return -EINVAL;
-	}
-
-	if (opp_table->dentry)
-		return opp_list_debug_create_link(opp_dev, opp_table);
-
-	return opp_list_debug_create_dir(opp_dev, opp_table);
-}
-
-static void opp_migrate_dentry(struct opp_device *opp_dev,
-			       struct opp_table *opp_table)
-{
-	struct opp_device *new_dev;
-	const struct device *dev;
-	struct dentry *dentry;
-
-	/* Look for next opp-dev */
-	list_for_each_entry(new_dev, &opp_table->dev_list, node)
-		if (new_dev != opp_dev)
-			break;
-
-	/* new_dev is guaranteed to be valid here */
-	dev = new_dev->dev;
-	debugfs_remove_recursive(new_dev->dentry);
-
-	opp_set_dev_name(dev, opp_table->dentry_name);
-
-	dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir,
-				opp_table->dentry_name);
-	if (!dentry) {
-		dev_err(dev, "%s: Failed to rename link from: %s to %s\n",
-			__func__, dev_name(opp_dev->dev), dev_name(dev));
-		return;
-	}
-
-	new_dev->dentry = dentry;
-	opp_table->dentry = dentry;
-}
-
-/**
- * opp_debug_unregister - remove a device opp node from debugfs opp directory
- * @opp_dev: opp-dev pointer for device
- * @opp_table: the device-opp being removed
- *
- * Dynamically removes device specific directory from debugfs 'opp' directory.
- */
-void opp_debug_unregister(struct opp_device *opp_dev,
-			  struct opp_table *opp_table)
-{
-	if (opp_dev->dentry == opp_table->dentry) {
-		/* Move the real dentry object under another device */
-		if (!list_is_singular(&opp_table->dev_list)) {
-			opp_migrate_dentry(opp_dev, opp_table);
-			goto out;
-		}
-		opp_table->dentry = NULL;
-	}
-
-	debugfs_remove_recursive(opp_dev->dentry);
-
-out:
-	opp_dev->dentry = NULL;
-}
-
-static int __init opp_debug_init(void)
-{
-	/* Create /sys/kernel/debug/opp directory */
-	rootdir = debugfs_create_dir("opp", NULL);
-	if (!rootdir) {
-		pr_err("%s: Failed to create root directory\n", __func__);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-core_initcall(opp_debug_init);
diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
deleted file mode 100644
index 0b718886479b..000000000000
--- a/drivers/base/power/opp/of.c
+++ /dev/null
@@ -1,633 +0,0 @@
-/*
- * Generic OPP OF helpers
- *
- * Copyright (C) 2009-2010 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/cpu.h>
-#include <linux/errno.h>
-#include <linux/device.h>
-#include <linux/of.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-
-#include "opp.h"
-
-static struct opp_table *_managed_opp(const struct device_node *np)
-{
-	struct opp_table *opp_table, *managed_table = NULL;
-
-	mutex_lock(&opp_table_lock);
-
-	list_for_each_entry(opp_table, &opp_tables, node) {
-		if (opp_table->np == np) {
-			/*
-			 * Multiple devices can point to the same OPP table and
-			 * so will have same node-pointer, np.
-			 *
-			 * But the OPPs will be considered as shared only if the
-			 * OPP table contains a "opp-shared" property.
-			 */
-			if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) {
-				_get_opp_table_kref(opp_table);
-				managed_table = opp_table;
-			}
-
-			break;
-		}
-	}
-
-	mutex_unlock(&opp_table_lock);
-
-	return managed_table;
-}
-
-void _of_init_opp_table(struct opp_table *opp_table, struct device *dev)
-{
-	struct device_node *np;
-
-	/*
-	 * Only required for backward compatibility with v1 bindings, but isn't
-	 * harmful for other cases. And so we do it unconditionally.
-	 */
-	np = of_node_get(dev->of_node);
-	if (np) {
-		u32 val;
-
-		if (!of_property_read_u32(np, "clock-latency", &val))
-			opp_table->clock_latency_ns_max = val;
-		of_property_read_u32(np, "voltage-tolerance",
-				     &opp_table->voltage_tolerance_v1);
-		of_node_put(np);
-	}
-}
-
-static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
-			      struct device_node *np)
-{
-	unsigned int count = opp_table->supported_hw_count;
-	u32 version;
-	int ret;
-
-	if (!opp_table->supported_hw) {
-		/*
-		 * In the case that no supported_hw has been set by the
-		 * platform but there is an opp-supported-hw value set for
-		 * an OPP then the OPP should not be enabled as there is
-		 * no way to see if the hardware supports it.
-		 */
-		if (of_find_property(np, "opp-supported-hw", NULL))
-			return false;
-		else
-			return true;
-	}
-
-	while (count--) {
-		ret = of_property_read_u32_index(np, "opp-supported-hw", count,
-						 &version);
-		if (ret) {
-			dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n",
-				 __func__, count, ret);
-			return false;
-		}
-
-		/* Both of these are bitwise masks of the versions */
-		if (!(version & opp_table->supported_hw[count]))
-			return false;
-	}
-
-	return true;
-}
-
-static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
-			      struct opp_table *opp_table)
-{
-	u32 *microvolt, *microamp = NULL;
-	int supplies, vcount, icount, ret, i, j;
-	struct property *prop = NULL;
-	char name[NAME_MAX];
-
-	supplies = opp_table->regulator_count ? opp_table->regulator_count : 1;
-
-	/* Search for "opp-microvolt-<name>" */
-	if (opp_table->prop_name) {
-		snprintf(name, sizeof(name), "opp-microvolt-%s",
-			 opp_table->prop_name);
-		prop = of_find_property(opp->np, name, NULL);
-	}
-
-	if (!prop) {
-		/* Search for "opp-microvolt" */
-		sprintf(name, "opp-microvolt");
-		prop = of_find_property(opp->np, name, NULL);
-
-		/* Missing property isn't a problem, but an invalid entry is */
-		if (!prop) {
-			if (!opp_table->regulator_count)
-				return 0;
-
-			dev_err(dev, "%s: opp-microvolt missing although OPP managing regulators\n",
-				__func__);
-			return -EINVAL;
-		}
-	}
-
-	vcount = of_property_count_u32_elems(opp->np, name);
-	if (vcount < 0) {
-		dev_err(dev, "%s: Invalid %s property (%d)\n",
-			__func__, name, vcount);
-		return vcount;
-	}
-
-	/* There can be one or three elements per supply */
-	if (vcount != supplies && vcount != supplies * 3) {
-		dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
-			__func__, name, vcount, supplies);
-		return -EINVAL;
-	}
-
-	microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL);
-	if (!microvolt)
-		return -ENOMEM;
-
-	ret = of_property_read_u32_array(opp->np, name, microvolt, vcount);
-	if (ret) {
-		dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
-		ret = -EINVAL;
-		goto free_microvolt;
-	}
-
-	/* Search for "opp-microamp-<name>" */
-	prop = NULL;
-	if (opp_table->prop_name) {
-		snprintf(name, sizeof(name), "opp-microamp-%s",
-			 opp_table->prop_name);
-		prop = of_find_property(opp->np, name, NULL);
-	}
-
-	if (!prop) {
-		/* Search for "opp-microamp" */
-		sprintf(name, "opp-microamp");
-		prop = of_find_property(opp->np, name, NULL);
-	}
-
-	if (prop) {
-		icount = of_property_count_u32_elems(opp->np, name);
-		if (icount < 0) {
-			dev_err(dev, "%s: Invalid %s property (%d)\n", __func__,
-				name, icount);
-			ret = icount;
-			goto free_microvolt;
-		}
-
-		if (icount != supplies) {
-			dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
-				__func__, name, icount, supplies);
-			ret = -EINVAL;
-			goto free_microvolt;
-		}
-
-		microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL);
-		if (!microamp) {
-			ret = -EINVAL;
-			goto free_microvolt;
-		}
-
-		ret = of_property_read_u32_array(opp->np, name, microamp,
-						 icount);
-		if (ret) {
-			dev_err(dev, "%s: error parsing %s: %d\n", __func__,
-				name, ret);
-			ret = -EINVAL;
-			goto free_microamp;
-		}
-	}
-
-	for (i = 0, j = 0; i < supplies; i++) {
-		opp->supplies[i].u_volt = microvolt[j++];
-
-		if (vcount == supplies) {
-			opp->supplies[i].u_volt_min = opp->supplies[i].u_volt;
-			opp->supplies[i].u_volt_max = opp->supplies[i].u_volt;
-		} else {
-			opp->supplies[i].u_volt_min = microvolt[j++];
-			opp->supplies[i].u_volt_max = microvolt[j++];
-		}
-
-		if (microamp)
-			opp->supplies[i].u_amp = microamp[i];
-	}
-
-free_microamp:
-	kfree(microamp);
-free_microvolt:
-	kfree(microvolt);
-
-	return ret;
-}
-
-/**
- * dev_pm_opp_of_remove_table() - Free OPP table entries created from static DT
- *				  entries
- * @dev:	device pointer used to lookup OPP table.
- *
- * Free OPPs created using static entries present in DT.
- */
-void dev_pm_opp_of_remove_table(struct device *dev)
-{
-	_dev_pm_opp_find_and_remove_table(dev, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
-
-/* Returns opp descriptor node for a device node, caller must
- * do of_node_put() */
-static struct device_node *_opp_of_get_opp_desc_node(struct device_node *np)
-{
-	/*
-	 * There should be only ONE phandle present in "operating-points-v2"
-	 * property.
-	 */
-
-	return of_parse_phandle(np, "operating-points-v2", 0);
-}
-
-/* Returns opp descriptor node for a device, caller must do of_node_put() */
-struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev)
-{
-	return _opp_of_get_opp_desc_node(dev->of_node);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node);
-
-/**
- * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
- * @opp_table:	OPP table
- * @dev:	device for which we do this operation
- * @np:		device node
- *
- * This function adds an opp definition to the opp table and returns status. The
- * opp can be controlled using dev_pm_opp_enable/disable functions and may be
- * removed by dev_pm_opp_remove.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- * -EINVAL	Failed parsing the OPP node
- */
-static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev,
-			      struct device_node *np)
-{
-	struct dev_pm_opp *new_opp;
-	u64 rate;
-	u32 val;
-	int ret;
-
-	new_opp = _opp_allocate(opp_table);
-	if (!new_opp)
-		return -ENOMEM;
-
-	ret = of_property_read_u64(np, "opp-hz", &rate);
-	if (ret < 0) {
-		dev_err(dev, "%s: opp-hz not found\n", __func__);
-		goto free_opp;
-	}
-
-	/* Check if the OPP supports hardware's hierarchy of versions or not */
-	if (!_opp_is_supported(dev, opp_table, np)) {
-		dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate);
-		goto free_opp;
-	}
-
-	/*
-	 * Rate is defined as an unsigned long in clk API, and so casting
-	 * explicitly to its type. Must be fixed once rate is 64 bit
-	 * guaranteed in clk API.
-	 */
-	new_opp->rate = (unsigned long)rate;
-	new_opp->turbo = of_property_read_bool(np, "turbo-mode");
-
-	new_opp->np = np;
-	new_opp->dynamic = false;
-	new_opp->available = true;
-
-	if (!of_property_read_u32(np, "clock-latency-ns", &val))
-		new_opp->clock_latency_ns = val;
-
-	ret = opp_parse_supplies(new_opp, dev, opp_table);
-	if (ret)
-		goto free_opp;
-
-	ret = _opp_add(dev, new_opp, opp_table);
-	if (ret) {
-		/* Don't return error for duplicate OPPs */
-		if (ret == -EBUSY)
-			ret = 0;
-		goto free_opp;
-	}
-
-	/* OPP to select on device suspend */
-	if (of_property_read_bool(np, "opp-suspend")) {
-		if (opp_table->suspend_opp) {
-			dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n",
-				 __func__, opp_table->suspend_opp->rate,
-				 new_opp->rate);
-		} else {
-			new_opp->suspend = true;
-			opp_table->suspend_opp = new_opp;
-		}
-	}
-
-	if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max)
-		opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
-
-	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
-		 __func__, new_opp->turbo, new_opp->rate,
-		 new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
-		 new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns);
-
-	/*
-	 * Notify the changes in the availability of the operable
-	 * frequency/voltage list.
-	 */
-	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp);
-	return 0;
-
-free_opp:
-	_opp_free(new_opp);
-
-	return ret;
-}
-
-/* Initializes OPP tables based on new bindings */
-static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np)
-{
-	struct device_node *np;
-	struct opp_table *opp_table;
-	int ret = 0, count = 0;
-
-	opp_table = _managed_opp(opp_np);
-	if (opp_table) {
-		/* OPPs are already managed */
-		if (!_add_opp_dev(dev, opp_table))
-			ret = -ENOMEM;
-		goto put_opp_table;
-	}
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return -ENOMEM;
-
-	/* We have opp-table node now, iterate over it and add OPPs */
-	for_each_available_child_of_node(opp_np, np) {
-		count++;
-
-		ret = _opp_add_static_v2(opp_table, dev, np);
-		if (ret) {
-			dev_err(dev, "%s: Failed to add OPP, %d\n", __func__,
-				ret);
-			_dev_pm_opp_remove_table(opp_table, dev, false);
-			goto put_opp_table;
-		}
-	}
-
-	/* There should be one of more OPP defined */
-	if (WARN_ON(!count)) {
-		ret = -ENOENT;
-		goto put_opp_table;
-	}
-
-	opp_table->np = opp_np;
-	if (of_property_read_bool(opp_np, "opp-shared"))
-		opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED;
-	else
-		opp_table->shared_opp = OPP_TABLE_ACCESS_EXCLUSIVE;
-
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-
-/* Initializes OPP tables based on old-deprecated bindings */
-static int _of_add_opp_table_v1(struct device *dev)
-{
-	struct opp_table *opp_table;
-	const struct property *prop;
-	const __be32 *val;
-	int nr, ret = 0;
-
-	prop = of_find_property(dev->of_node, "operating-points", NULL);
-	if (!prop)
-		return -ENODEV;
-	if (!prop->value)
-		return -ENODATA;
-
-	/*
-	 * Each OPP is a set of tuples consisting of frequency and
-	 * voltage like <freq-kHz vol-uV>.
-	 */
-	nr = prop->length / sizeof(u32);
-	if (nr % 2) {
-		dev_err(dev, "%s: Invalid OPP table\n", __func__);
-		return -EINVAL;
-	}
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return -ENOMEM;
-
-	val = prop->value;
-	while (nr) {
-		unsigned long freq = be32_to_cpup(val++) * 1000;
-		unsigned long volt = be32_to_cpup(val++);
-
-		ret = _opp_add_v1(opp_table, dev, freq, volt, false);
-		if (ret) {
-			dev_err(dev, "%s: Failed to add OPP %ld (%d)\n",
-				__func__, freq, ret);
-			_dev_pm_opp_remove_table(opp_table, dev, false);
-			break;
-		}
-		nr -= 2;
-	}
-
-	dev_pm_opp_put_opp_table(opp_table);
-	return ret;
-}
-
-/**
- * dev_pm_opp_of_add_table() - Initialize opp table from device tree
- * @dev:	device pointer used to lookup OPP table.
- *
- * Register the initial OPP table with the OPP library for given device.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- * -ENODEV	when 'operating-points' property is not found or is invalid data
- *		in device node.
- * -ENODATA	when empty 'operating-points' property is found
- * -EINVAL	when invalid entries are found in opp-v2 table
- */
-int dev_pm_opp_of_add_table(struct device *dev)
-{
-	struct device_node *opp_np;
-	int ret;
-
-	/*
-	 * OPPs have two version of bindings now. The older one is deprecated,
-	 * try for the new binding first.
-	 */
-	opp_np = dev_pm_opp_of_get_opp_desc_node(dev);
-	if (!opp_np) {
-		/*
-		 * Try old-deprecated bindings for backward compatibility with
-		 * older dtbs.
-		 */
-		return _of_add_opp_table_v1(dev);
-	}
-
-	ret = _of_add_opp_table_v2(dev, opp_np);
-	of_node_put(opp_np);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
-
-/* CPU device specific helpers */
-
-/**
- * dev_pm_opp_of_cpumask_remove_table() - Removes OPP table for @cpumask
- * @cpumask:	cpumask for which OPP table needs to be removed
- *
- * This removes the OPP tables for CPUs present in the @cpumask.
- * This should be used only to remove static entries created from DT.
- */
-void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask)
-{
-	_dev_pm_opp_cpumask_remove_table(cpumask, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table);
-
-/**
- * dev_pm_opp_of_cpumask_add_table() - Adds OPP table for @cpumask
- * @cpumask:	cpumask for which OPP table needs to be added.
- *
- * This adds the OPP tables for CPUs present in the @cpumask.
- */
-int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask)
-{
-	struct device *cpu_dev;
-	int cpu, ret = 0;
-
-	WARN_ON(cpumask_empty(cpumask));
-
-	for_each_cpu(cpu, cpumask) {
-		cpu_dev = get_cpu_device(cpu);
-		if (!cpu_dev) {
-			pr_err("%s: failed to get cpu%d device\n", __func__,
-			       cpu);
-			continue;
-		}
-
-		ret = dev_pm_opp_of_add_table(cpu_dev);
-		if (ret) {
-			/*
-			 * OPP may get registered dynamically, don't print error
-			 * message here.
-			 */
-			pr_debug("%s: couldn't find opp table for cpu:%d, %d\n",
-				 __func__, cpu, ret);
-
-			/* Free all other OPPs */
-			dev_pm_opp_of_cpumask_remove_table(cpumask);
-			break;
-		}
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table);
-
-/*
- * Works only for OPP v2 bindings.
- *
- * Returns -ENOENT if operating-points-v2 bindings aren't supported.
- */
-/**
- * dev_pm_opp_of_get_sharing_cpus() - Get cpumask of CPUs sharing OPPs with
- *				      @cpu_dev using operating-points-v2
- *				      bindings.
- *
- * @cpu_dev:	CPU device for which we do this operation
- * @cpumask:	cpumask to update with information of sharing CPUs
- *
- * This updates the @cpumask with CPUs that are sharing OPPs with @cpu_dev.
- *
- * Returns -ENOENT if operating-points-v2 isn't present for @cpu_dev.
- */
-int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
-				   struct cpumask *cpumask)
-{
-	struct device_node *np, *tmp_np, *cpu_np;
-	int cpu, ret = 0;
-
-	/* Get OPP descriptor node */
-	np = dev_pm_opp_of_get_opp_desc_node(cpu_dev);
-	if (!np) {
-		dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__);
-		return -ENOENT;
-	}
-
-	cpumask_set_cpu(cpu_dev->id, cpumask);
-
-	/* OPPs are shared ? */
-	if (!of_property_read_bool(np, "opp-shared"))
-		goto put_cpu_node;
-
-	for_each_possible_cpu(cpu) {
-		if (cpu == cpu_dev->id)
-			continue;
-
-		cpu_np = of_get_cpu_node(cpu, NULL);
-		if (!cpu_np) {
-			dev_err(cpu_dev, "%s: failed to get cpu%d node\n",
-				__func__, cpu);
-			ret = -ENOENT;
-			goto put_cpu_node;
-		}
-
-		/* Get OPP descriptor node */
-		tmp_np = _opp_of_get_opp_desc_node(cpu_np);
-		if (!tmp_np) {
-			pr_err("%pOF: Couldn't find opp node\n", cpu_np);
-			ret = -ENOENT;
-			goto put_cpu_node;
-		}
-
-		/* CPUs are sharing opp node */
-		if (np == tmp_np)
-			cpumask_set_cpu(cpu, cpumask);
-
-		of_node_put(tmp_np);
-	}
-
-put_cpu_node:
-	of_node_put(np);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_sharing_cpus);
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
deleted file mode 100644
index 166eef990599..000000000000
--- a/drivers/base/power/opp/opp.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Generic OPP Interface
- *
- * Copyright (C) 2009-2010 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __DRIVER_OPP_H__
-#define __DRIVER_OPP_H__
-
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/kref.h>
-#include <linux/list.h>
-#include <linux/limits.h>
-#include <linux/pm_opp.h>
-#include <linux/notifier.h>
-
-struct clk;
-struct regulator;
-
-/* Lock to allow exclusive modification to the device and opp lists */
-extern struct mutex opp_table_lock;
-
-extern struct list_head opp_tables;
-
-/*
- * Internal data structure organization with the OPP layer library is as
- * follows:
- * opp_tables (root)
- *	|- device 1 (represents voltage domain 1)
- *	|	|- opp 1 (availability, freq, voltage)
- *	|	|- opp 2 ..
- *	...	...
- *	|	`- opp n ..
- *	|- device 2 (represents the next voltage domain)
- *	...
- *	`- device m (represents mth voltage domain)
- * device 1, 2.. are represented by opp_table structure while each opp
- * is represented by the opp structure.
- */
-
-/**
- * struct dev_pm_opp - Generic OPP description structure
- * @node:	opp table node. The nodes are maintained throughout the lifetime
- *		of boot. It is expected only an optimal set of OPPs are
- *		added to the library by the SoC framework.
- *		IMPORTANT: the opp nodes should be maintained in increasing
- *		order.
- * @kref:	for reference count of the OPP.
- * @available:	true/false - marks if this OPP as available or not
- * @dynamic:	not-created from static DT entries.
- * @turbo:	true if turbo (boost) OPP
- * @suspend:	true if suspend OPP
- * @rate:	Frequency in hertz
- * @supplies:	Power supplies voltage/current values
- * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
- *		frequency from any other OPP's frequency.
- * @opp_table:	points back to the opp_table struct this opp belongs to
- * @np:		OPP's device node.
- * @dentry:	debugfs dentry pointer (per opp)
- *
- * This structure stores the OPP information for a given device.
- */
-struct dev_pm_opp {
-	struct list_head node;
-	struct kref kref;
-
-	bool available;
-	bool dynamic;
-	bool turbo;
-	bool suspend;
-	unsigned long rate;
-
-	struct dev_pm_opp_supply *supplies;
-
-	unsigned long clock_latency_ns;
-
-	struct opp_table *opp_table;
-
-	struct device_node *np;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dentry;
-#endif
-};
-
-/**
- * struct opp_device - devices managed by 'struct opp_table'
- * @node:	list node
- * @dev:	device to which the struct object belongs
- * @dentry:	debugfs dentry pointer (per device)
- *
- * This is an internal data structure maintaining the devices that are managed
- * by 'struct opp_table'.
- */
-struct opp_device {
-	struct list_head node;
-	const struct device *dev;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dentry;
-#endif
-};
-
-enum opp_table_access {
-	OPP_TABLE_ACCESS_UNKNOWN = 0,
-	OPP_TABLE_ACCESS_EXCLUSIVE = 1,
-	OPP_TABLE_ACCESS_SHARED = 2,
-};
-
-/**
- * struct opp_table - Device opp structure
- * @node:	table node - contains the devices with OPPs that
- *		have been registered. Nodes once added are not modified in this
- *		table.
- * @head:	notifier head to notify the OPP availability changes.
- * @dev_list:	list of devices that share these OPPs
- * @opp_list:	table of opps
- * @kref:	for reference count of the table.
- * @lock:	mutex protecting the opp_list.
- * @np:		struct device_node pointer for opp's DT node.
- * @clock_latency_ns_max: Max clock latency in nanoseconds.
- * @shared_opp: OPP is shared between multiple devices.
- * @suspend_opp: Pointer to OPP to be used during device suspend.
- * @supported_hw: Array of version number to support.
- * @supported_hw_count: Number of elements in supported_hw array.
- * @prop_name: A name to postfix to many DT properties, while parsing them.
- * @clk: Device's clock handle
- * @regulators: Supply regulators
- * @regulator_count: Number of power supply regulators
- * @set_opp: Platform specific set_opp callback
- * @set_opp_data: Data to be passed to set_opp callback
- * @dentry:	debugfs dentry pointer of the real device directory (not links).
- * @dentry_name: Name of the real dentry.
- *
- * @voltage_tolerance_v1: In percentage, for v1 bindings only.
- *
- * This is an internal data structure maintaining the link to opps attached to
- * a device. This structure is not meant to be shared to users as it is
- * meant for book keeping and private to OPP library.
- */
-struct opp_table {
-	struct list_head node;
-
-	struct blocking_notifier_head head;
-	struct list_head dev_list;
-	struct list_head opp_list;
-	struct kref kref;
-	struct mutex lock;
-
-	struct device_node *np;
-	unsigned long clock_latency_ns_max;
-
-	/* For backward compatibility with v1 bindings */
-	unsigned int voltage_tolerance_v1;
-
-	enum opp_table_access shared_opp;
-	struct dev_pm_opp *suspend_opp;
-
-	unsigned int *supported_hw;
-	unsigned int supported_hw_count;
-	const char *prop_name;
-	struct clk *clk;
-	struct regulator **regulators;
-	unsigned int regulator_count;
-
-	int (*set_opp)(struct dev_pm_set_opp_data *data);
-	struct dev_pm_set_opp_data *set_opp_data;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dentry;
-	char dentry_name[NAME_MAX];
-#endif
-};
-
-/* Routines internal to opp core */
-void _get_opp_table_kref(struct opp_table *opp_table);
-struct opp_table *_find_opp_table(struct device *dev);
-struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
-void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, bool remove_all);
-void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all);
-struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table);
-void _opp_free(struct dev_pm_opp *opp);
-int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
-int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic);
-void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of);
-struct opp_table *_add_opp_table(struct device *dev);
-
-#ifdef CONFIG_OF
-void _of_init_opp_table(struct opp_table *opp_table, struct device *dev);
-#else
-static inline void _of_init_opp_table(struct opp_table *opp_table, struct device *dev) {}
-#endif
-
-#ifdef CONFIG_DEBUG_FS
-void opp_debug_remove_one(struct dev_pm_opp *opp);
-int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
-int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
-void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table);
-#else
-static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
-
-static inline int opp_debug_create_one(struct dev_pm_opp *opp,
-				       struct opp_table *opp_table)
-{ return 0; }
-static inline int opp_debug_register(struct opp_device *opp_dev,
-				     struct opp_table *opp_table)
-{ return 0; }
-
-static inline void opp_debug_unregister(struct opp_device *opp_dev,
-					struct opp_table *opp_table)
-{ }
-#endif		/* DEBUG_FS */
-
-#endif		/* __DRIVER_OPP_H__ */
diff --git a/drivers/opp/Kconfig b/drivers/opp/Kconfig
new file mode 100644
index 000000000000..a7fbb93f302c
--- /dev/null
+++ b/drivers/opp/Kconfig
@@ -0,0 +1,13 @@
+config PM_OPP
+	bool
+	select SRCU
+	---help---
+	  SOCs have a standard set of tuples consisting of frequency and
+	  voltage pairs that the device will support per voltage domain. This
+	  is called Operating Performance Point or OPP. The actual definitions
+	  of OPP varies over silicon within the same family of devices.
+
+	  OPP layer organizes the data internally using device pointers
+	  representing individual voltage domains and provides SOC
+	  implementations a ready to use framework to manage OPPs.
+	  For more information, read <file:Documentation/power/opp.txt>
diff --git a/drivers/opp/Makefile b/drivers/opp/Makefile
new file mode 100644
index 000000000000..e70ceb406fe9
--- /dev/null
+++ b/drivers/opp/Makefile
@@ -0,0 +1,4 @@
+ccflags-$(CONFIG_DEBUG_DRIVER)	:= -DDEBUG
+obj-y				+= core.o cpu.o
+obj-$(CONFIG_OF)		+= of.o
+obj-$(CONFIG_DEBUG_FS)		+= debugfs.o
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
new file mode 100644
index 000000000000..a6de32530693
--- /dev/null
+++ b/drivers/opp/core.c
@@ -0,0 +1,1747 @@
+/*
+ * Generic OPP Interface
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/clk.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/regulator/consumer.h>
+
+#include "opp.h"
+
+/*
+ * The root of the list of all opp-tables. All opp_table structures branch off
+ * from here, with each opp_table containing the list of opps it supports in
+ * various states of availability.
+ */
+LIST_HEAD(opp_tables);
+/* Lock to allow exclusive modification to the device and opp lists */
+DEFINE_MUTEX(opp_table_lock);
+
+static void dev_pm_opp_get(struct dev_pm_opp *opp);
+
+static struct opp_device *_find_opp_dev(const struct device *dev,
+					struct opp_table *opp_table)
+{
+	struct opp_device *opp_dev;
+
+	list_for_each_entry(opp_dev, &opp_table->dev_list, node)
+		if (opp_dev->dev == dev)
+			return opp_dev;
+
+	return NULL;
+}
+
+static struct opp_table *_find_opp_table_unlocked(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	list_for_each_entry(opp_table, &opp_tables, node) {
+		if (_find_opp_dev(dev, opp_table)) {
+			_get_opp_table_kref(opp_table);
+
+			return opp_table;
+		}
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+/**
+ * _find_opp_table() - find opp_table struct using device pointer
+ * @dev:	device pointer used to lookup OPP table
+ *
+ * Search OPP table for one containing matching device.
+ *
+ * Return: pointer to 'struct opp_table' if found, otherwise -ENODEV or
+ * -EINVAL based on type of error.
+ *
+ * The callers must call dev_pm_opp_put_opp_table() after the table is used.
+ */
+struct opp_table *_find_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	if (IS_ERR_OR_NULL(dev)) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return ERR_PTR(-EINVAL);
+	}
+
+	mutex_lock(&opp_table_lock);
+	opp_table = _find_opp_table_unlocked(dev);
+	mutex_unlock(&opp_table_lock);
+
+	return opp_table;
+}
+
+/**
+ * dev_pm_opp_get_voltage() - Gets the voltage corresponding to an opp
+ * @opp:	opp for which voltage has to be returned for
+ *
+ * Return: voltage in micro volt corresponding to the opp, else
+ * return 0
+ *
+ * This is useful only for devices with single power supply.
+ */
+unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
+{
+	if (IS_ERR_OR_NULL(opp)) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return 0;
+	}
+
+	return opp->supplies[0].u_volt;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
+
+/**
+ * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
+ * @opp:	opp for which frequency has to be returned for
+ *
+ * Return: frequency in hertz corresponding to the opp, else
+ * return 0
+ */
+unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
+{
+	if (IS_ERR_OR_NULL(opp) || !opp->available) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return 0;
+	}
+
+	return opp->rate;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
+
+/**
+ * dev_pm_opp_is_turbo() - Returns if opp is turbo OPP or not
+ * @opp: opp for which turbo mode is being verified
+ *
+ * Turbo OPPs are not for normal use, and can be enabled (under certain
+ * conditions) for short duration of times to finish high throughput work
+ * quickly. Running on them for longer times may overheat the chip.
+ *
+ * Return: true if opp is turbo opp, else false.
+ */
+bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
+{
+	if (IS_ERR_OR_NULL(opp) || !opp->available) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return false;
+	}
+
+	return opp->turbo;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo);
+
+/**
+ * dev_pm_opp_get_max_clock_latency() - Get max clock latency in nanoseconds
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the max clock latency in nanoseconds.
+ */
+unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
+{
+	struct opp_table *opp_table;
+	unsigned long clock_latency_ns;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return 0;
+
+	clock_latency_ns = opp_table->clock_latency_ns_max;
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return clock_latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
+
+/**
+ * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max voltage latency in nanoseconds.
+ */
+unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *opp;
+	struct regulator *reg;
+	unsigned long latency_ns = 0;
+	int ret, i, count;
+	struct {
+		unsigned long min;
+		unsigned long max;
+	} *uV;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return 0;
+
+	count = opp_table->regulator_count;
+
+	/* Regulator may not be required for the device */
+	if (!count)
+		goto put_opp_table;
+
+	uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL);
+	if (!uV)
+		goto put_opp_table;
+
+	mutex_lock(&opp_table->lock);
+
+	for (i = 0; i < count; i++) {
+		uV[i].min = ~0;
+		uV[i].max = 0;
+
+		list_for_each_entry(opp, &opp_table->opp_list, node) {
+			if (!opp->available)
+				continue;
+
+			if (opp->supplies[i].u_volt_min < uV[i].min)
+				uV[i].min = opp->supplies[i].u_volt_min;
+			if (opp->supplies[i].u_volt_max > uV[i].max)
+				uV[i].max = opp->supplies[i].u_volt_max;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+
+	/*
+	 * The caller needs to ensure that opp_table (and hence the regulator)
+	 * isn't freed, while we are executing this routine.
+	 */
+	for (i = 0; i < count; i++) {
+		reg = opp_table->regulators[i];
+		ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max);
+		if (ret > 0)
+			latency_ns += ret * 1000;
+	}
+
+	kfree(uV);
+put_opp_table:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency);
+
+/**
+ * dev_pm_opp_get_max_transition_latency() - Get max transition latency in
+ *					     nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max transition latency, in nanoseconds, to
+ * switch from one OPP to other.
+ */
+unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
+{
+	return dev_pm_opp_get_max_volt_latency(dev) +
+		dev_pm_opp_get_max_clock_latency(dev);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency);
+
+/**
+ * dev_pm_opp_get_suspend_opp_freq() - Get frequency of suspend opp in Hz
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the frequency of the OPP marked as suspend_opp
+ * if one is available, else returns 0;
+ */
+unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev)
+{
+	struct opp_table *opp_table;
+	unsigned long freq = 0;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return 0;
+
+	if (opp_table->suspend_opp && opp_table->suspend_opp->available)
+		freq = dev_pm_opp_get_freq(opp_table->suspend_opp);
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return freq;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp_freq);
+
+/**
+ * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the number of available opps if there are any,
+ * else returns 0 if none or the corresponding error value.
+ */
+int dev_pm_opp_get_opp_count(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp;
+	int count = 0;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		count = PTR_ERR(opp_table);
+		dev_err(dev, "%s: OPP table not found (%d)\n",
+			__func__, count);
+		return count;
+	}
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available)
+			count++;
+	}
+
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return count;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
+
+/**
+ * dev_pm_opp_find_freq_exact() - search for an exact frequency
+ * @dev:		device for which we do this operation
+ * @freq:		frequency to search for
+ * @available:		true/false - match for available opp
+ *
+ * Return: Searches for exact match in the opp table and returns pointer to the
+ * matching opp if found, else returns ERR_PTR in case of error and should
+ * be handled using IS_ERR. Error return values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * Note: available is a modifier for the search. if available=true, then the
+ * match is for exact matching frequency and is available in the stored OPP
+ * table. if false, the match is for exact frequency which is not available.
+ *
+ * This provides a mechanism to enable an opp which is not available currently
+ * or the opposite as well.
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
+					      unsigned long freq,
+					      bool available)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		int r = PTR_ERR(opp_table);
+
+		dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r);
+		return ERR_PTR(r);
+	}
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available == available &&
+				temp_opp->rate == freq) {
+			opp = temp_opp;
+
+			/* Increment the reference count of OPP */
+			dev_pm_opp_get(opp);
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
+
+static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table,
+						   unsigned long *freq)
+{
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available && temp_opp->rate >= *freq) {
+			opp = temp_opp;
+			*freq = opp->rate;
+
+			/* Increment the reference count of OPP */
+			dev_pm_opp_get(opp);
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+
+	return opp;
+}
+
+/**
+ * dev_pm_opp_find_freq_ceil() - Search for an rounded ceil freq
+ * @dev:	device for which we do this operation
+ * @freq:	Start frequency
+ *
+ * Search for the matching ceil *available* OPP from a starting freq
+ * for a device.
+ *
+ * Return: matching *opp and refreshes *freq accordingly, else returns
+ * ERR_PTR in case of error and should be handled using IS_ERR. Error return
+ * values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
+					     unsigned long *freq)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *opp;
+
+	if (!dev || !freq) {
+		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
+		return ERR_PTR(-EINVAL);
+	}
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return ERR_CAST(opp_table);
+
+	opp = _find_freq_ceil(opp_table, freq);
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
+
+/**
+ * dev_pm_opp_find_freq_floor() - Search for a rounded floor freq
+ * @dev:	device for which we do this operation
+ * @freq:	Start frequency
+ *
+ * Search for the matching floor *available* OPP from a starting freq
+ * for a device.
+ *
+ * Return: matching *opp and refreshes *freq accordingly, else returns
+ * ERR_PTR in case of error and should be handled using IS_ERR. Error return
+ * values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
+					      unsigned long *freq)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	if (!dev || !freq) {
+		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
+		return ERR_PTR(-EINVAL);
+	}
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return ERR_CAST(opp_table);
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available) {
+			/* go to the next node, before choosing prev */
+			if (temp_opp->rate > *freq)
+				break;
+			else
+				opp = temp_opp;
+		}
+	}
+
+	/* Increment the reference count of OPP */
+	if (!IS_ERR(opp))
+		dev_pm_opp_get(opp);
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+
+	if (!IS_ERR(opp))
+		*freq = opp->rate;
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
+
+static int _set_opp_voltage(struct device *dev, struct regulator *reg,
+			    struct dev_pm_opp_supply *supply)
+{
+	int ret;
+
+	/* Regulator not available for device */
+	if (IS_ERR(reg)) {
+		dev_dbg(dev, "%s: regulator not available: %ld\n", __func__,
+			PTR_ERR(reg));
+		return 0;
+	}
+
+	dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__,
+		supply->u_volt_min, supply->u_volt, supply->u_volt_max);
+
+	ret = regulator_set_voltage_triplet(reg, supply->u_volt_min,
+					    supply->u_volt, supply->u_volt_max);
+	if (ret)
+		dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
+			__func__, supply->u_volt_min, supply->u_volt,
+			supply->u_volt_max, ret);
+
+	return ret;
+}
+
+static inline int
+_generic_set_opp_clk_only(struct device *dev, struct clk *clk,
+			  unsigned long old_freq, unsigned long freq)
+{
+	int ret;
+
+	ret = clk_set_rate(clk, freq);
+	if (ret) {
+		dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+			ret);
+	}
+
+	return ret;
+}
+
+static int _generic_set_opp_regulator(const struct opp_table *opp_table,
+				      struct device *dev,
+				      unsigned long old_freq,
+				      unsigned long freq,
+				      struct dev_pm_opp_supply *old_supply,
+				      struct dev_pm_opp_supply *new_supply)
+{
+	struct regulator *reg = opp_table->regulators[0];
+	int ret;
+
+	/* This function only supports single regulator per device */
+	if (WARN_ON(opp_table->regulator_count > 1)) {
+		dev_err(dev, "multiple regulators are not supported\n");
+		return -EINVAL;
+	}
+
+	/* Scaling up? Scale voltage before frequency */
+	if (freq > old_freq) {
+		ret = _set_opp_voltage(dev, reg, new_supply);
+		if (ret)
+			goto restore_voltage;
+	}
+
+	/* Change frequency */
+	ret = _generic_set_opp_clk_only(dev, opp_table->clk, old_freq, freq);
+	if (ret)
+		goto restore_voltage;
+
+	/* Scaling down? Scale voltage after frequency */
+	if (freq < old_freq) {
+		ret = _set_opp_voltage(dev, reg, new_supply);
+		if (ret)
+			goto restore_freq;
+	}
+
+	return 0;
+
+restore_freq:
+	if (_generic_set_opp_clk_only(dev, opp_table->clk, freq, old_freq))
+		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
+			__func__, old_freq);
+restore_voltage:
+	/* This shouldn't harm even if the voltages weren't updated earlier */
+	if (old_supply)
+		_set_opp_voltage(dev, reg, old_supply);
+
+	return ret;
+}
+
+/**
+ * dev_pm_opp_set_rate() - Configure new OPP based on frequency
+ * @dev:	 device for which we do this operation
+ * @target_freq: frequency to achieve
+ *
+ * This configures the power-supplies and clock source to the levels specified
+ * by the OPP corresponding to the target_freq.
+ */
+int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
+{
+	struct opp_table *opp_table;
+	unsigned long freq, old_freq;
+	struct dev_pm_opp *old_opp, *opp;
+	struct clk *clk;
+	int ret, size;
+
+	if (unlikely(!target_freq)) {
+		dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
+			target_freq);
+		return -EINVAL;
+	}
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+		return PTR_ERR(opp_table);
+	}
+
+	clk = opp_table->clk;
+	if (IS_ERR(clk)) {
+		dev_err(dev, "%s: No clock available for the device\n",
+			__func__);
+		ret = PTR_ERR(clk);
+		goto put_opp_table;
+	}
+
+	freq = clk_round_rate(clk, target_freq);
+	if ((long)freq <= 0)
+		freq = target_freq;
+
+	old_freq = clk_get_rate(clk);
+
+	/* Return early if nothing to do */
+	if (old_freq == freq) {
+		dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n",
+			__func__, freq);
+		ret = 0;
+		goto put_opp_table;
+	}
+
+	old_opp = _find_freq_ceil(opp_table, &old_freq);
+	if (IS_ERR(old_opp)) {
+		dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n",
+			__func__, old_freq, PTR_ERR(old_opp));
+	}
+
+	opp = _find_freq_ceil(opp_table, &freq);
+	if (IS_ERR(opp)) {
+		ret = PTR_ERR(opp);
+		dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n",
+			__func__, freq, ret);
+		goto put_old_opp;
+	}
+
+	dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__,
+		old_freq, freq);
+
+	/* Only frequency scaling */
+	if (!opp_table->regulators) {
+		ret = _generic_set_opp_clk_only(dev, clk, old_freq, freq);
+	} else if (!opp_table->set_opp) {
+		ret = _generic_set_opp_regulator(opp_table, dev, old_freq, freq,
+						 IS_ERR(old_opp) ? NULL : old_opp->supplies,
+						 opp->supplies);
+	} else {
+		struct dev_pm_set_opp_data *data;
+
+		data = opp_table->set_opp_data;
+		data->regulators = opp_table->regulators;
+		data->regulator_count = opp_table->regulator_count;
+		data->clk = clk;
+		data->dev = dev;
+
+		data->old_opp.rate = old_freq;
+		size = sizeof(*opp->supplies) * opp_table->regulator_count;
+		if (IS_ERR(old_opp))
+			memset(data->old_opp.supplies, 0, size);
+		else
+			memcpy(data->old_opp.supplies, old_opp->supplies, size);
+
+		data->new_opp.rate = freq;
+		memcpy(data->new_opp.supplies, opp->supplies, size);
+
+		ret = opp_table->set_opp(data);
+	}
+
+	dev_pm_opp_put(opp);
+put_old_opp:
+	if (!IS_ERR(old_opp))
+		dev_pm_opp_put(old_opp);
+put_opp_table:
+	dev_pm_opp_put_opp_table(opp_table);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
+
+/* OPP-dev Helpers */
+static void _remove_opp_dev(struct opp_device *opp_dev,
+			    struct opp_table *opp_table)
+{
+	opp_debug_unregister(opp_dev, opp_table);
+	list_del(&opp_dev->node);
+	kfree(opp_dev);
+}
+
+struct opp_device *_add_opp_dev(const struct device *dev,
+				struct opp_table *opp_table)
+{
+	struct opp_device *opp_dev;
+	int ret;
+
+	opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL);
+	if (!opp_dev)
+		return NULL;
+
+	/* Initialize opp-dev */
+	opp_dev->dev = dev;
+	list_add(&opp_dev->node, &opp_table->dev_list);
+
+	/* Create debugfs entries for the opp_table */
+	ret = opp_debug_register(opp_dev, opp_table);
+	if (ret)
+		dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
+			__func__, ret);
+
+	return opp_dev;
+}
+
+static struct opp_table *_allocate_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct opp_device *opp_dev;
+	int ret;
+
+	/*
+	 * Allocate a new OPP table. In the infrequent case where a new
+	 * device is needed to be added, we pay this penalty.
+	 */
+	opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL);
+	if (!opp_table)
+		return NULL;
+
+	INIT_LIST_HEAD(&opp_table->dev_list);
+
+	opp_dev = _add_opp_dev(dev, opp_table);
+	if (!opp_dev) {
+		kfree(opp_table);
+		return NULL;
+	}
+
+	_of_init_opp_table(opp_table, dev);
+
+	/* Find clk for the device */
+	opp_table->clk = clk_get(dev, NULL);
+	if (IS_ERR(opp_table->clk)) {
+		ret = PTR_ERR(opp_table->clk);
+		if (ret != -EPROBE_DEFER)
+			dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__,
+				ret);
+	}
+
+	BLOCKING_INIT_NOTIFIER_HEAD(&opp_table->head);
+	INIT_LIST_HEAD(&opp_table->opp_list);
+	mutex_init(&opp_table->lock);
+	kref_init(&opp_table->kref);
+
+	/* Secure the device table modification */
+	list_add(&opp_table->node, &opp_tables);
+	return opp_table;
+}
+
+void _get_opp_table_kref(struct opp_table *opp_table)
+{
+	kref_get(&opp_table->kref);
+}
+
+struct opp_table *dev_pm_opp_get_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _find_opp_table_unlocked(dev);
+	if (!IS_ERR(opp_table))
+		goto unlock;
+
+	opp_table = _allocate_opp_table(dev);
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+
+	return opp_table;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_table);
+
+static void _opp_table_kref_release(struct kref *kref)
+{
+	struct opp_table *opp_table = container_of(kref, struct opp_table, kref);
+	struct opp_device *opp_dev;
+
+	/* Release clk */
+	if (!IS_ERR(opp_table->clk))
+		clk_put(opp_table->clk);
+
+	opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device,
+				   node);
+
+	_remove_opp_dev(opp_dev, opp_table);
+
+	/* dev_list must be empty now */
+	WARN_ON(!list_empty(&opp_table->dev_list));
+
+	mutex_destroy(&opp_table->lock);
+	list_del(&opp_table->node);
+	kfree(opp_table);
+
+	mutex_unlock(&opp_table_lock);
+}
+
+void dev_pm_opp_put_opp_table(struct opp_table *opp_table)
+{
+	kref_put_mutex(&opp_table->kref, _opp_table_kref_release,
+		       &opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_opp_table);
+
+void _opp_free(struct dev_pm_opp *opp)
+{
+	kfree(opp);
+}
+
+static void _opp_kref_release(struct kref *kref)
+{
+	struct dev_pm_opp *opp = container_of(kref, struct dev_pm_opp, kref);
+	struct opp_table *opp_table = opp->opp_table;
+
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_REMOVE, opp);
+	opp_debug_remove_one(opp);
+	list_del(&opp->node);
+	kfree(opp);
+
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+}
+
+static void dev_pm_opp_get(struct dev_pm_opp *opp)
+{
+	kref_get(&opp->kref);
+}
+
+void dev_pm_opp_put(struct dev_pm_opp *opp)
+{
+	kref_put_mutex(&opp->kref, _opp_kref_release, &opp->opp_table->lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put);
+
+/**
+ * dev_pm_opp_remove()  - Remove an OPP from OPP table
+ * @dev:	device for which we do this operation
+ * @freq:	OPP to remove with matching 'freq'
+ *
+ * This function removes an opp from the opp table.
+ */
+void dev_pm_opp_remove(struct device *dev, unsigned long freq)
+{
+	struct dev_pm_opp *opp;
+	struct opp_table *opp_table;
+	bool found = false;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return;
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(opp, &opp_table->opp_list, node) {
+		if (opp->rate == freq) {
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+
+	if (found) {
+		dev_pm_opp_put(opp);
+	} else {
+		dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n",
+			 __func__, freq);
+	}
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
+
+struct dev_pm_opp *_opp_allocate(struct opp_table *table)
+{
+	struct dev_pm_opp *opp;
+	int count, supply_size;
+
+	/* Allocate space for at least one supply */
+	count = table->regulator_count ? table->regulator_count : 1;
+	supply_size = sizeof(*opp->supplies) * count;
+
+	/* allocate new OPP node and supplies structures */
+	opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL);
+	if (!opp)
+		return NULL;
+
+	/* Put the supplies at the end of the OPP structure as an empty array */
+	opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
+	INIT_LIST_HEAD(&opp->node);
+
+	return opp;
+}
+
+static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
+					 struct opp_table *opp_table)
+{
+	struct regulator *reg;
+	int i;
+
+	for (i = 0; i < opp_table->regulator_count; i++) {
+		reg = opp_table->regulators[i];
+
+		if (!regulator_is_supported_voltage(reg,
+					opp->supplies[i].u_volt_min,
+					opp->supplies[i].u_volt_max)) {
+			pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
+				__func__, opp->supplies[i].u_volt_min,
+				opp->supplies[i].u_volt_max);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Returns:
+ * 0: On success. And appropriate error message for duplicate OPPs.
+ * -EBUSY: For OPP with same freq/volt and is available. The callers of
+ *  _opp_add() must return 0 if they receive -EBUSY from it. This is to make
+ *  sure we don't print error messages unnecessarily if different parts of
+ *  kernel try to initialize the OPP table.
+ * -EEXIST: For OPP with same freq but different volt or is unavailable. This
+ *  should be considered an error by the callers of _opp_add().
+ */
+int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
+	     struct opp_table *opp_table)
+{
+	struct dev_pm_opp *opp;
+	struct list_head *head;
+	int ret;
+
+	/*
+	 * Insert new OPP in order of increasing frequency and discard if
+	 * already present.
+	 *
+	 * Need to use &opp_table->opp_list in the condition part of the 'for'
+	 * loop, don't replace it with head otherwise it will become an infinite
+	 * loop.
+	 */
+	mutex_lock(&opp_table->lock);
+	head = &opp_table->opp_list;
+
+	list_for_each_entry(opp, &opp_table->opp_list, node) {
+		if (new_opp->rate > opp->rate) {
+			head = &opp->node;
+			continue;
+		}
+
+		if (new_opp->rate < opp->rate)
+			break;
+
+		/* Duplicate OPPs */
+		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
+			 __func__, opp->rate, opp->supplies[0].u_volt,
+			 opp->available, new_opp->rate,
+			 new_opp->supplies[0].u_volt, new_opp->available);
+
+		/* Should we compare voltages for all regulators here ? */
+		ret = opp->available &&
+		      new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? -EBUSY : -EEXIST;
+
+		mutex_unlock(&opp_table->lock);
+		return ret;
+	}
+
+	list_add(&new_opp->node, head);
+	mutex_unlock(&opp_table->lock);
+
+	new_opp->opp_table = opp_table;
+	kref_init(&new_opp->kref);
+
+	/* Get a reference to the OPP table */
+	_get_opp_table_kref(opp_table);
+
+	ret = opp_debug_create_one(new_opp, opp_table);
+	if (ret)
+		dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
+			__func__, ret);
+
+	if (!_opp_supported_by_regulators(new_opp, opp_table)) {
+		new_opp->available = false;
+		dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
+			 __func__, new_opp->rate);
+	}
+
+	return 0;
+}
+
+/**
+ * _opp_add_v1() - Allocate a OPP based on v1 bindings.
+ * @opp_table:	OPP table
+ * @dev:	device for which we do this operation
+ * @freq:	Frequency in Hz for this OPP
+ * @u_volt:	Voltage in uVolts for this OPP
+ * @dynamic:	Dynamically added OPPs.
+ *
+ * This function adds an opp definition to the opp table and returns status.
+ * The opp is made available by default and it can be controlled using
+ * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
+ *
+ * NOTE: "dynamic" parameter impacts OPPs added by the dev_pm_opp_of_add_table
+ * and freed by dev_pm_opp_of_remove_table.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ */
+int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
+		unsigned long freq, long u_volt, bool dynamic)
+{
+	struct dev_pm_opp *new_opp;
+	unsigned long tol;
+	int ret;
+
+	new_opp = _opp_allocate(opp_table);
+	if (!new_opp)
+		return -ENOMEM;
+
+	/* populate the opp table */
+	new_opp->rate = freq;
+	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
+	new_opp->supplies[0].u_volt = u_volt;
+	new_opp->supplies[0].u_volt_min = u_volt - tol;
+	new_opp->supplies[0].u_volt_max = u_volt + tol;
+	new_opp->available = true;
+	new_opp->dynamic = dynamic;
+
+	ret = _opp_add(dev, new_opp, opp_table);
+	if (ret) {
+		/* Don't return error for duplicate OPPs */
+		if (ret == -EBUSY)
+			ret = 0;
+		goto free_opp;
+	}
+
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp);
+	return 0;
+
+free_opp:
+	_opp_free(new_opp);
+
+	return ret;
+}
+
+/**
+ * dev_pm_opp_set_supported_hw() - Set supported platforms
+ * @dev: Device for which supported-hw has to be set.
+ * @versions: Array of hierarchy of versions to match.
+ * @count: Number of elements in the array.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the hierarchy of versions it supports. OPP layer will then enable
+ * OPPs, which are available for those versions, based on its 'opp-supported-hw'
+ * property.
+ */
+struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
+			const u32 *versions, unsigned int count)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	/* Do we already have a version hierarchy associated with opp_table? */
+	if (opp_table->supported_hw) {
+		dev_err(dev, "%s: Already have supported hardware list\n",
+			__func__);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
+					GFP_KERNEL);
+	if (!opp_table->supported_hw) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	opp_table->supported_hw_count = count;
+
+	return opp_table;
+
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
+
+/**
+ * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
+ * @opp_table: OPP table returned by dev_pm_opp_set_supported_hw().
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
+ * will not be freed.
+ */
+void dev_pm_opp_put_supported_hw(struct opp_table *opp_table)
+{
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	if (!opp_table->supported_hw) {
+		pr_err("%s: Doesn't have supported hardware list\n",
+		       __func__);
+		return;
+	}
+
+	kfree(opp_table->supported_hw);
+	opp_table->supported_hw = NULL;
+	opp_table->supported_hw_count = 0;
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
+
+/**
+ * dev_pm_opp_set_prop_name() - Set prop-extn name
+ * @dev: Device for which the prop-name has to be set.
+ * @name: name to postfix to properties.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the extn to be used for certain property names. The properties to
+ * which the extension will apply are opp-microvolt and opp-microamp. OPP core
+ * should postfix the property name with -<name> while looking for them.
+ */
+struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	/* Do we already have a prop-name associated with opp_table? */
+	if (opp_table->prop_name) {
+		dev_err(dev, "%s: Already have prop-name %s\n", __func__,
+			opp_table->prop_name);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->prop_name = kstrdup(name, GFP_KERNEL);
+	if (!opp_table->prop_name) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	return opp_table;
+
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
+
+/**
+ * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
+ * @opp_table: OPP table returned by dev_pm_opp_set_prop_name().
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
+ * will not be freed.
+ */
+void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
+{
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	if (!opp_table->prop_name) {
+		pr_err("%s: Doesn't have a prop-name\n", __func__);
+		return;
+	}
+
+	kfree(opp_table->prop_name);
+	opp_table->prop_name = NULL;
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
+
+static int _allocate_set_opp_data(struct opp_table *opp_table)
+{
+	struct dev_pm_set_opp_data *data;
+	int len, count = opp_table->regulator_count;
+
+	if (WARN_ON(!count))
+		return -EINVAL;
+
+	/* space for set_opp_data */
+	len = sizeof(*data);
+
+	/* space for old_opp.supplies and new_opp.supplies */
+	len += 2 * sizeof(struct dev_pm_opp_supply) * count;
+
+	data = kzalloc(len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->old_opp.supplies = (void *)(data + 1);
+	data->new_opp.supplies = data->old_opp.supplies + count;
+
+	opp_table->set_opp_data = data;
+
+	return 0;
+}
+
+static void _free_set_opp_data(struct opp_table *opp_table)
+{
+	kfree(opp_table->set_opp_data);
+	opp_table->set_opp_data = NULL;
+}
+
+/**
+ * dev_pm_opp_set_regulators() - Set regulator names for the device
+ * @dev: Device for which regulator name is being set.
+ * @names: Array of pointers to the names of the regulator.
+ * @count: Number of regulators.
+ *
+ * In order to support OPP switching, OPP layer needs to know the name of the
+ * device's regulators, as the core would be required to switch voltages as
+ * well.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ */
+struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
+					    const char * const names[],
+					    unsigned int count)
+{
+	struct opp_table *opp_table;
+	struct regulator *reg;
+	int ret, i;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Already have regulators set */
+	if (opp_table->regulators) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->regulators = kmalloc_array(count,
+					      sizeof(*opp_table->regulators),
+					      GFP_KERNEL);
+	if (!opp_table->regulators) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < count; i++) {
+		reg = regulator_get_optional(dev, names[i]);
+		if (IS_ERR(reg)) {
+			ret = PTR_ERR(reg);
+			if (ret != -EPROBE_DEFER)
+				dev_err(dev, "%s: no regulator (%s) found: %d\n",
+					__func__, names[i], ret);
+			goto free_regulators;
+		}
+
+		opp_table->regulators[i] = reg;
+	}
+
+	opp_table->regulator_count = count;
+
+	/* Allocate block only once to pass to set_opp() routines */
+	ret = _allocate_set_opp_data(opp_table);
+	if (ret)
+		goto free_regulators;
+
+	return opp_table;
+
+free_regulators:
+	while (i != 0)
+		regulator_put(opp_table->regulators[--i]);
+
+	kfree(opp_table->regulators);
+	opp_table->regulators = NULL;
+	opp_table->regulator_count = 0;
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
+
+/**
+ * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
+ * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
+ */
+void dev_pm_opp_put_regulators(struct opp_table *opp_table)
+{
+	int i;
+
+	if (!opp_table->regulators) {
+		pr_err("%s: Doesn't have regulators set\n", __func__);
+		return;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	for (i = opp_table->regulator_count - 1; i >= 0; i--)
+		regulator_put(opp_table->regulators[i]);
+
+	_free_set_opp_data(opp_table);
+
+	kfree(opp_table->regulators);
+	opp_table->regulators = NULL;
+	opp_table->regulator_count = 0;
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
+
+/**
+ * dev_pm_opp_set_clkname() - Set clk name for the device
+ * @dev: Device for which clk name is being set.
+ * @name: Clk name.
+ *
+ * In order to support OPP switching, OPP layer needs to get pointer to the
+ * clock for the device. Simple cases work fine without using this routine (i.e.
+ * by passing connection-id as NULL), but for a device with multiple clocks
+ * available, the OPP core needs to know the exact name of the clk to use.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ */
+struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Already have default clk set, free it */
+	if (!IS_ERR(opp_table->clk))
+		clk_put(opp_table->clk);
+
+	/* Find clk for the device */
+	opp_table->clk = clk_get(dev, name);
+	if (IS_ERR(opp_table->clk)) {
+		ret = PTR_ERR(opp_table->clk);
+		if (ret != -EPROBE_DEFER) {
+			dev_err(dev, "%s: Couldn't find clock: %d\n", __func__,
+				ret);
+		}
+		goto err;
+	}
+
+	return opp_table;
+
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
+
+/**
+ * dev_pm_opp_put_clkname() - Releases resources blocked for clk.
+ * @opp_table: OPP table returned from dev_pm_opp_set_clkname().
+ */
+void dev_pm_opp_put_clkname(struct opp_table *opp_table)
+{
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	clk_put(opp_table->clk);
+	opp_table->clk = ERR_PTR(-EINVAL);
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
+
+/**
+ * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
+ * @dev: Device for which the helper is getting registered.
+ * @set_opp: Custom set OPP helper.
+ *
+ * This is useful to support complex platforms (like platforms with multiple
+ * regulators per device), instead of the generic OPP set rate helper.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ */
+struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
+			int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	if (!set_opp)
+		return ERR_PTR(-EINVAL);
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Already have custom set_opp helper */
+	if (WARN_ON(opp_table->set_opp)) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->set_opp = set_opp;
+
+	return opp_table;
+
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
+
+/**
+ * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for
+ *					   set_opp helper
+ * @opp_table: OPP table returned from dev_pm_opp_register_set_opp_helper().
+ *
+ * Release resources blocked for platform specific set_opp helper.
+ */
+void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table)
+{
+	if (!opp_table->set_opp) {
+		pr_err("%s: Doesn't have custom set_opp helper set\n",
+		       __func__);
+		return;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	opp_table->set_opp = NULL;
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper);
+
+/**
+ * dev_pm_opp_add()  - Add an OPP table from a table definitions
+ * @dev:	device for which we do this operation
+ * @freq:	Frequency in Hz for this OPP
+ * @u_volt:	Voltage in uVolts for this OPP
+ *
+ * This function adds an opp definition to the opp table and returns status.
+ * The opp is made available by default and it can be controlled using
+ * dev_pm_opp_enable/disable functions.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ */
+int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return -ENOMEM;
+
+	ret = _opp_add_v1(opp_table, dev, freq, u_volt, true);
+
+	dev_pm_opp_put_opp_table(opp_table);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_add);
+
+/**
+ * _opp_set_availability() - helper to set the availability of an opp
+ * @dev:		device for which we do this operation
+ * @freq:		OPP frequency to modify availability
+ * @availability_req:	availability status requested for this opp
+ *
+ * Set the availability of an OPP, opp_{enable,disable} share a common logic
+ * which is isolated here.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ */
+static int _opp_set_availability(struct device *dev, unsigned long freq,
+				 bool availability_req)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *tmp_opp, *opp = ERR_PTR(-ENODEV);
+	int r = 0;
+
+	/* Find the opp_table */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		r = PTR_ERR(opp_table);
+		dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
+		return r;
+	}
+
+	mutex_lock(&opp_table->lock);
+
+	/* Do we have the frequency? */
+	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
+		if (tmp_opp->rate == freq) {
+			opp = tmp_opp;
+			break;
+		}
+	}
+
+	if (IS_ERR(opp)) {
+		r = PTR_ERR(opp);
+		goto unlock;
+	}
+
+	/* Is update really needed? */
+	if (opp->available == availability_req)
+		goto unlock;
+
+	opp->available = availability_req;
+
+	dev_pm_opp_get(opp);
+	mutex_unlock(&opp_table->lock);
+
+	/* Notify the change of the OPP availability */
+	if (availability_req)
+		blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE,
+					     opp);
+	else
+		blocking_notifier_call_chain(&opp_table->head,
+					     OPP_EVENT_DISABLE, opp);
+
+	dev_pm_opp_put(opp);
+	goto put_table;
+
+unlock:
+	mutex_unlock(&opp_table->lock);
+put_table:
+	dev_pm_opp_put_opp_table(opp_table);
+	return r;
+}
+
+/**
+ * dev_pm_opp_enable() - Enable a specific OPP
+ * @dev:	device for which we do this operation
+ * @freq:	OPP frequency to enable
+ *
+ * Enables a provided opp. If the operation is valid, this returns 0, else the
+ * corresponding error value. It is meant to be used for users an OPP available
+ * after being temporarily made unavailable with dev_pm_opp_disable.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ */
+int dev_pm_opp_enable(struct device *dev, unsigned long freq)
+{
+	return _opp_set_availability(dev, freq, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
+
+/**
+ * dev_pm_opp_disable() - Disable a specific OPP
+ * @dev:	device for which we do this operation
+ * @freq:	OPP frequency to disable
+ *
+ * Disables a provided opp. If the operation is valid, this returns
+ * 0, else the corresponding error value. It is meant to be a temporary
+ * control by users to make this OPP not available until the circumstances are
+ * right to make it available again (with a call to dev_pm_opp_enable).
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ */
+int dev_pm_opp_disable(struct device *dev, unsigned long freq)
+{
+	return _opp_set_availability(dev, freq, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
+
+/**
+ * dev_pm_opp_register_notifier() - Register OPP notifier for the device
+ * @dev:	Device for which notifier needs to be registered
+ * @nb:		Notifier block to be registered
+ *
+ * Return: 0 on success or a negative error value.
+ */
+int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	ret = blocking_notifier_chain_register(&opp_table->head, nb);
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_pm_opp_register_notifier);
+
+/**
+ * dev_pm_opp_unregister_notifier() - Unregister OPP notifier for the device
+ * @dev:	Device for which notifier needs to be unregistered
+ * @nb:		Notifier block to be unregistered
+ *
+ * Return: 0 on success or a negative error value.
+ */
+int dev_pm_opp_unregister_notifier(struct device *dev,
+				   struct notifier_block *nb)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	ret = blocking_notifier_chain_unregister(&opp_table->head, nb);
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_pm_opp_unregister_notifier);
+
+/*
+ * Free OPPs either created using static entries present in DT or even the
+ * dynamically added entries based on remove_all param.
+ */
+void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev,
+			      bool remove_all)
+{
+	struct dev_pm_opp *opp, *tmp;
+
+	/* Find if opp_table manages a single device */
+	if (list_is_singular(&opp_table->dev_list)) {
+		/* Free static OPPs */
+		list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
+			if (remove_all || !opp->dynamic)
+				dev_pm_opp_put(opp);
+		}
+	} else {
+		_remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table);
+	}
+}
+
+void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all)
+{
+	struct opp_table *opp_table;
+
+	/* Check for existing table for 'dev' */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		int error = PTR_ERR(opp_table);
+
+		if (error != -ENODEV)
+			WARN(1, "%s: opp_table: %d\n",
+			     IS_ERR_OR_NULL(dev) ?
+					"Invalid device" : dev_name(dev),
+			     error);
+		return;
+	}
+
+	_dev_pm_opp_remove_table(opp_table, dev, remove_all);
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+
+/**
+ * dev_pm_opp_remove_table() - Free all OPPs associated with the device
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Free both OPPs created using static entries present in DT and the
+ * dynamically added entries.
+ */
+void dev_pm_opp_remove_table(struct device *dev)
+{
+	_dev_pm_opp_find_and_remove_table(dev, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_remove_table);
diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c
new file mode 100644
index 000000000000..2d87bc1adf38
--- /dev/null
+++ b/drivers/opp/cpu.c
@@ -0,0 +1,236 @@
+/*
+ * Generic OPP helper interface for CPU device
+ *
+ * Copyright (C) 2009-2014 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+
+#include "opp.h"
+
+#ifdef CONFIG_CPU_FREQ
+
+/**
+ * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device
+ * @dev:	device for which we do this operation
+ * @table:	Cpufreq table returned back to caller
+ *
+ * Generate a cpufreq table for a provided device- this assumes that the
+ * opp table is already initialized and ready for usage.
+ *
+ * This function allocates required memory for the cpufreq table. It is
+ * expected that the caller does the required maintenance such as freeing
+ * the table as required.
+ *
+ * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM
+ * if no memory available for the operation (table is not populated), returns 0
+ * if successful and table is populated.
+ *
+ * WARNING: It is  important for the callers to ensure refreshing their copy of
+ * the table if any of the mentioned functions have been invoked in the interim.
+ */
+int dev_pm_opp_init_cpufreq_table(struct device *dev,
+				  struct cpufreq_frequency_table **table)
+{
+	struct dev_pm_opp *opp;
+	struct cpufreq_frequency_table *freq_table = NULL;
+	int i, max_opps, ret = 0;
+	unsigned long rate;
+
+	max_opps = dev_pm_opp_get_opp_count(dev);
+	if (max_opps <= 0)
+		return max_opps ? max_opps : -ENODATA;
+
+	freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
+	if (!freq_table)
+		return -ENOMEM;
+
+	for (i = 0, rate = 0; i < max_opps; i++, rate++) {
+		/* find next rate */
+		opp = dev_pm_opp_find_freq_ceil(dev, &rate);
+		if (IS_ERR(opp)) {
+			ret = PTR_ERR(opp);
+			goto out;
+		}
+		freq_table[i].driver_data = i;
+		freq_table[i].frequency = rate / 1000;
+
+		/* Is Boost/turbo opp ? */
+		if (dev_pm_opp_is_turbo(opp))
+			freq_table[i].flags = CPUFREQ_BOOST_FREQ;
+
+		dev_pm_opp_put(opp);
+	}
+
+	freq_table[i].driver_data = i;
+	freq_table[i].frequency = CPUFREQ_TABLE_END;
+
+	*table = &freq_table[0];
+
+out:
+	if (ret)
+		kfree(freq_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table);
+
+/**
+ * dev_pm_opp_free_cpufreq_table() - free the cpufreq table
+ * @dev:	device for which we do this operation
+ * @table:	table to free
+ *
+ * Free up the table allocated by dev_pm_opp_init_cpufreq_table
+ */
+void dev_pm_opp_free_cpufreq_table(struct device *dev,
+				   struct cpufreq_frequency_table **table)
+{
+	if (!table)
+		return;
+
+	kfree(*table);
+	*table = NULL;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
+#endif	/* CONFIG_CPU_FREQ */
+
+void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of)
+{
+	struct device *cpu_dev;
+	int cpu;
+
+	WARN_ON(cpumask_empty(cpumask));
+
+	for_each_cpu(cpu, cpumask) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			continue;
+		}
+
+		if (of)
+			dev_pm_opp_of_remove_table(cpu_dev);
+		else
+			dev_pm_opp_remove_table(cpu_dev);
+	}
+}
+
+/**
+ * dev_pm_opp_cpumask_remove_table() - Removes OPP table for @cpumask
+ * @cpumask:	cpumask for which OPP table needs to be removed
+ *
+ * This removes the OPP tables for CPUs present in the @cpumask.
+ * This should be used to remove all the OPPs entries associated with
+ * the cpus in @cpumask.
+ */
+void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask)
+{
+	_dev_pm_opp_cpumask_remove_table(cpumask, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_cpumask_remove_table);
+
+/**
+ * dev_pm_opp_set_sharing_cpus() - Mark OPP table as shared by few CPUs
+ * @cpu_dev:	CPU device for which we do this operation
+ * @cpumask:	cpumask of the CPUs which share the OPP table with @cpu_dev
+ *
+ * This marks OPP table of the @cpu_dev as shared by the CPUs present in
+ * @cpumask.
+ *
+ * Returns -ENODEV if OPP table isn't already present.
+ */
+int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev,
+				const struct cpumask *cpumask)
+{
+	struct opp_device *opp_dev;
+	struct opp_table *opp_table;
+	struct device *dev;
+	int cpu, ret = 0;
+
+	opp_table = _find_opp_table(cpu_dev);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	for_each_cpu(cpu, cpumask) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		dev = get_cpu_device(cpu);
+		if (!dev) {
+			dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
+				__func__, cpu);
+			continue;
+		}
+
+		opp_dev = _add_opp_dev(dev, opp_table);
+		if (!opp_dev) {
+			dev_err(dev, "%s: failed to add opp-dev for cpu%d device\n",
+				__func__, cpu);
+			continue;
+		}
+
+		/* Mark opp-table as multiple CPUs are sharing it now */
+		opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED;
+	}
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_sharing_cpus);
+
+/**
+ * dev_pm_opp_get_sharing_cpus() - Get cpumask of CPUs sharing OPPs with @cpu_dev
+ * @cpu_dev:	CPU device for which we do this operation
+ * @cpumask:	cpumask to update with information of sharing CPUs
+ *
+ * This updates the @cpumask with CPUs that are sharing OPPs with @cpu_dev.
+ *
+ * Returns -ENODEV if OPP table isn't already present and -EINVAL if the OPP
+ * table's status is access-unknown.
+ */
+int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
+{
+	struct opp_device *opp_dev;
+	struct opp_table *opp_table;
+	int ret = 0;
+
+	opp_table = _find_opp_table(cpu_dev);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	if (opp_table->shared_opp == OPP_TABLE_ACCESS_UNKNOWN) {
+		ret = -EINVAL;
+		goto put_opp_table;
+	}
+
+	cpumask_clear(cpumask);
+
+	if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) {
+		list_for_each_entry(opp_dev, &opp_table->dev_list, node)
+			cpumask_set_cpu(opp_dev->dev->id, cpumask);
+	} else {
+		cpumask_set_cpu(cpu_dev->id, cpumask);
+	}
+
+put_opp_table:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_sharing_cpus);
diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c
new file mode 100644
index 000000000000..81cf120fcf43
--- /dev/null
+++ b/drivers/opp/debugfs.c
@@ -0,0 +1,249 @@
+/*
+ * Generic OPP debugfs interface
+ *
+ * Copyright (C) 2015-2016 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/limits.h>
+#include <linux/slab.h>
+
+#include "opp.h"
+
+static struct dentry *rootdir;
+
+static void opp_set_dev_name(const struct device *dev, char *name)
+{
+	if (dev->parent)
+		snprintf(name, NAME_MAX, "%s-%s", dev_name(dev->parent),
+			 dev_name(dev));
+	else
+		snprintf(name, NAME_MAX, "%s", dev_name(dev));
+}
+
+void opp_debug_remove_one(struct dev_pm_opp *opp)
+{
+	debugfs_remove_recursive(opp->dentry);
+}
+
+static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
+				      struct opp_table *opp_table,
+				      struct dentry *pdentry)
+{
+	struct dentry *d;
+	int i;
+	char *name;
+
+	for (i = 0; i < opp_table->regulator_count; i++) {
+		name = kasprintf(GFP_KERNEL, "supply-%d", i);
+
+		/* Create per-opp directory */
+		d = debugfs_create_dir(name, pdentry);
+
+		kfree(name);
+
+		if (!d)
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d,
+					  &opp->supplies[i].u_volt))
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d,
+					  &opp->supplies[i].u_volt_min))
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d,
+					  &opp->supplies[i].u_volt_max))
+			return false;
+
+		if (!debugfs_create_ulong("u_amp", S_IRUGO, d,
+					  &opp->supplies[i].u_amp))
+			return false;
+	}
+
+	return true;
+}
+
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
+{
+	struct dentry *pdentry = opp_table->dentry;
+	struct dentry *d;
+	char name[25];	/* 20 chars for 64 bit value + 5 (opp:\0) */
+
+	/* Rate is unique to each OPP, use it to give opp-name */
+	snprintf(name, sizeof(name), "opp:%lu", opp->rate);
+
+	/* Create per-opp directory */
+	d = debugfs_create_dir(name, pdentry);
+	if (!d)
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
+		return -ENOMEM;
+
+	if (!opp_debug_create_supplies(opp, opp_table, d))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
+				  &opp->clock_latency_ns))
+		return -ENOMEM;
+
+	opp->dentry = d;
+	return 0;
+}
+
+static int opp_list_debug_create_dir(struct opp_device *opp_dev,
+				     struct opp_table *opp_table)
+{
+	const struct device *dev = opp_dev->dev;
+	struct dentry *d;
+
+	opp_set_dev_name(dev, opp_table->dentry_name);
+
+	/* Create device specific directory */
+	d = debugfs_create_dir(opp_table->dentry_name, rootdir);
+	if (!d) {
+		dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
+		return -ENOMEM;
+	}
+
+	opp_dev->dentry = d;
+	opp_table->dentry = d;
+
+	return 0;
+}
+
+static int opp_list_debug_create_link(struct opp_device *opp_dev,
+				      struct opp_table *opp_table)
+{
+	const struct device *dev = opp_dev->dev;
+	char name[NAME_MAX];
+	struct dentry *d;
+
+	opp_set_dev_name(opp_dev->dev, name);
+
+	/* Create device specific directory link */
+	d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name);
+	if (!d) {
+		dev_err(dev, "%s: Failed to create link\n", __func__);
+		return -ENOMEM;
+	}
+
+	opp_dev->dentry = d;
+
+	return 0;
+}
+
+/**
+ * opp_debug_register - add a device opp node to the debugfs 'opp' directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the device-opp being added
+ *
+ * Dynamically adds device specific directory in debugfs 'opp' directory. If the
+ * device-opp is shared with other devices, then links will be created for all
+ * devices except the first.
+ *
+ * Return: 0 on success, otherwise negative error.
+ */
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
+{
+	if (!rootdir) {
+		pr_debug("%s: Uninitialized rootdir\n", __func__);
+		return -EINVAL;
+	}
+
+	if (opp_table->dentry)
+		return opp_list_debug_create_link(opp_dev, opp_table);
+
+	return opp_list_debug_create_dir(opp_dev, opp_table);
+}
+
+static void opp_migrate_dentry(struct opp_device *opp_dev,
+			       struct opp_table *opp_table)
+{
+	struct opp_device *new_dev;
+	const struct device *dev;
+	struct dentry *dentry;
+
+	/* Look for next opp-dev */
+	list_for_each_entry(new_dev, &opp_table->dev_list, node)
+		if (new_dev != opp_dev)
+			break;
+
+	/* new_dev is guaranteed to be valid here */
+	dev = new_dev->dev;
+	debugfs_remove_recursive(new_dev->dentry);
+
+	opp_set_dev_name(dev, opp_table->dentry_name);
+
+	dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir,
+				opp_table->dentry_name);
+	if (!dentry) {
+		dev_err(dev, "%s: Failed to rename link from: %s to %s\n",
+			__func__, dev_name(opp_dev->dev), dev_name(dev));
+		return;
+	}
+
+	new_dev->dentry = dentry;
+	opp_table->dentry = dentry;
+}
+
+/**
+ * opp_debug_unregister - remove a device opp node from debugfs opp directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the device-opp being removed
+ *
+ * Dynamically removes device specific directory from debugfs 'opp' directory.
+ */
+void opp_debug_unregister(struct opp_device *opp_dev,
+			  struct opp_table *opp_table)
+{
+	if (opp_dev->dentry == opp_table->dentry) {
+		/* Move the real dentry object under another device */
+		if (!list_is_singular(&opp_table->dev_list)) {
+			opp_migrate_dentry(opp_dev, opp_table);
+			goto out;
+		}
+		opp_table->dentry = NULL;
+	}
+
+	debugfs_remove_recursive(opp_dev->dentry);
+
+out:
+	opp_dev->dentry = NULL;
+}
+
+static int __init opp_debug_init(void)
+{
+	/* Create /sys/kernel/debug/opp directory */
+	rootdir = debugfs_create_dir("opp", NULL);
+	if (!rootdir) {
+		pr_err("%s: Failed to create root directory\n", __func__);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+core_initcall(opp_debug_init);
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
new file mode 100644
index 000000000000..0b718886479b
--- /dev/null
+++ b/drivers/opp/of.c
@@ -0,0 +1,633 @@
+/*
+ * Generic OPP OF helpers
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include "opp.h"
+
+static struct opp_table *_managed_opp(const struct device_node *np)
+{
+	struct opp_table *opp_table, *managed_table = NULL;
+
+	mutex_lock(&opp_table_lock);
+
+	list_for_each_entry(opp_table, &opp_tables, node) {
+		if (opp_table->np == np) {
+			/*
+			 * Multiple devices can point to the same OPP table and
+			 * so will have same node-pointer, np.
+			 *
+			 * But the OPPs will be considered as shared only if the
+			 * OPP table contains a "opp-shared" property.
+			 */
+			if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) {
+				_get_opp_table_kref(opp_table);
+				managed_table = opp_table;
+			}
+
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table_lock);
+
+	return managed_table;
+}
+
+void _of_init_opp_table(struct opp_table *opp_table, struct device *dev)
+{
+	struct device_node *np;
+
+	/*
+	 * Only required for backward compatibility with v1 bindings, but isn't
+	 * harmful for other cases. And so we do it unconditionally.
+	 */
+	np = of_node_get(dev->of_node);
+	if (np) {
+		u32 val;
+
+		if (!of_property_read_u32(np, "clock-latency", &val))
+			opp_table->clock_latency_ns_max = val;
+		of_property_read_u32(np, "voltage-tolerance",
+				     &opp_table->voltage_tolerance_v1);
+		of_node_put(np);
+	}
+}
+
+static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
+			      struct device_node *np)
+{
+	unsigned int count = opp_table->supported_hw_count;
+	u32 version;
+	int ret;
+
+	if (!opp_table->supported_hw) {
+		/*
+		 * In the case that no supported_hw has been set by the
+		 * platform but there is an opp-supported-hw value set for
+		 * an OPP then the OPP should not be enabled as there is
+		 * no way to see if the hardware supports it.
+		 */
+		if (of_find_property(np, "opp-supported-hw", NULL))
+			return false;
+		else
+			return true;
+	}
+
+	while (count--) {
+		ret = of_property_read_u32_index(np, "opp-supported-hw", count,
+						 &version);
+		if (ret) {
+			dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n",
+				 __func__, count, ret);
+			return false;
+		}
+
+		/* Both of these are bitwise masks of the versions */
+		if (!(version & opp_table->supported_hw[count]))
+			return false;
+	}
+
+	return true;
+}
+
+static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
+			      struct opp_table *opp_table)
+{
+	u32 *microvolt, *microamp = NULL;
+	int supplies, vcount, icount, ret, i, j;
+	struct property *prop = NULL;
+	char name[NAME_MAX];
+
+	supplies = opp_table->regulator_count ? opp_table->regulator_count : 1;
+
+	/* Search for "opp-microvolt-<name>" */
+	if (opp_table->prop_name) {
+		snprintf(name, sizeof(name), "opp-microvolt-%s",
+			 opp_table->prop_name);
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (!prop) {
+		/* Search for "opp-microvolt" */
+		sprintf(name, "opp-microvolt");
+		prop = of_find_property(opp->np, name, NULL);
+
+		/* Missing property isn't a problem, but an invalid entry is */
+		if (!prop) {
+			if (!opp_table->regulator_count)
+				return 0;
+
+			dev_err(dev, "%s: opp-microvolt missing although OPP managing regulators\n",
+				__func__);
+			return -EINVAL;
+		}
+	}
+
+	vcount = of_property_count_u32_elems(opp->np, name);
+	if (vcount < 0) {
+		dev_err(dev, "%s: Invalid %s property (%d)\n",
+			__func__, name, vcount);
+		return vcount;
+	}
+
+	/* There can be one or three elements per supply */
+	if (vcount != supplies && vcount != supplies * 3) {
+		dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+			__func__, name, vcount, supplies);
+		return -EINVAL;
+	}
+
+	microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL);
+	if (!microvolt)
+		return -ENOMEM;
+
+	ret = of_property_read_u32_array(opp->np, name, microvolt, vcount);
+	if (ret) {
+		dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
+		ret = -EINVAL;
+		goto free_microvolt;
+	}
+
+	/* Search for "opp-microamp-<name>" */
+	prop = NULL;
+	if (opp_table->prop_name) {
+		snprintf(name, sizeof(name), "opp-microamp-%s",
+			 opp_table->prop_name);
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (!prop) {
+		/* Search for "opp-microamp" */
+		sprintf(name, "opp-microamp");
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (prop) {
+		icount = of_property_count_u32_elems(opp->np, name);
+		if (icount < 0) {
+			dev_err(dev, "%s: Invalid %s property (%d)\n", __func__,
+				name, icount);
+			ret = icount;
+			goto free_microvolt;
+		}
+
+		if (icount != supplies) {
+			dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+				__func__, name, icount, supplies);
+			ret = -EINVAL;
+			goto free_microvolt;
+		}
+
+		microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL);
+		if (!microamp) {
+			ret = -EINVAL;
+			goto free_microvolt;
+		}
+
+		ret = of_property_read_u32_array(opp->np, name, microamp,
+						 icount);
+		if (ret) {
+			dev_err(dev, "%s: error parsing %s: %d\n", __func__,
+				name, ret);
+			ret = -EINVAL;
+			goto free_microamp;
+		}
+	}
+
+	for (i = 0, j = 0; i < supplies; i++) {
+		opp->supplies[i].u_volt = microvolt[j++];
+
+		if (vcount == supplies) {
+			opp->supplies[i].u_volt_min = opp->supplies[i].u_volt;
+			opp->supplies[i].u_volt_max = opp->supplies[i].u_volt;
+		} else {
+			opp->supplies[i].u_volt_min = microvolt[j++];
+			opp->supplies[i].u_volt_max = microvolt[j++];
+		}
+
+		if (microamp)
+			opp->supplies[i].u_amp = microamp[i];
+	}
+
+free_microamp:
+	kfree(microamp);
+free_microvolt:
+	kfree(microvolt);
+
+	return ret;
+}
+
+/**
+ * dev_pm_opp_of_remove_table() - Free OPP table entries created from static DT
+ *				  entries
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Free OPPs created using static entries present in DT.
+ */
+void dev_pm_opp_of_remove_table(struct device *dev)
+{
+	_dev_pm_opp_find_and_remove_table(dev, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
+
+/* Returns opp descriptor node for a device node, caller must
+ * do of_node_put() */
+static struct device_node *_opp_of_get_opp_desc_node(struct device_node *np)
+{
+	/*
+	 * There should be only ONE phandle present in "operating-points-v2"
+	 * property.
+	 */
+
+	return of_parse_phandle(np, "operating-points-v2", 0);
+}
+
+/* Returns opp descriptor node for a device, caller must do of_node_put() */
+struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev)
+{
+	return _opp_of_get_opp_desc_node(dev->of_node);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node);
+
+/**
+ * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
+ * @opp_table:	OPP table
+ * @dev:	device for which we do this operation
+ * @np:		device node
+ *
+ * This function adds an opp definition to the opp table and returns status. The
+ * opp can be controlled using dev_pm_opp_enable/disable functions and may be
+ * removed by dev_pm_opp_remove.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -EINVAL	Failed parsing the OPP node
+ */
+static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev,
+			      struct device_node *np)
+{
+	struct dev_pm_opp *new_opp;
+	u64 rate;
+	u32 val;
+	int ret;
+
+	new_opp = _opp_allocate(opp_table);
+	if (!new_opp)
+		return -ENOMEM;
+
+	ret = of_property_read_u64(np, "opp-hz", &rate);
+	if (ret < 0) {
+		dev_err(dev, "%s: opp-hz not found\n", __func__);
+		goto free_opp;
+	}
+
+	/* Check if the OPP supports hardware's hierarchy of versions or not */
+	if (!_opp_is_supported(dev, opp_table, np)) {
+		dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate);
+		goto free_opp;
+	}
+
+	/*
+	 * Rate is defined as an unsigned long in clk API, and so casting
+	 * explicitly to its type. Must be fixed once rate is 64 bit
+	 * guaranteed in clk API.
+	 */
+	new_opp->rate = (unsigned long)rate;
+	new_opp->turbo = of_property_read_bool(np, "turbo-mode");
+
+	new_opp->np = np;
+	new_opp->dynamic = false;
+	new_opp->available = true;
+
+	if (!of_property_read_u32(np, "clock-latency-ns", &val))
+		new_opp->clock_latency_ns = val;
+
+	ret = opp_parse_supplies(new_opp, dev, opp_table);
+	if (ret)
+		goto free_opp;
+
+	ret = _opp_add(dev, new_opp, opp_table);
+	if (ret) {
+		/* Don't return error for duplicate OPPs */
+		if (ret == -EBUSY)
+			ret = 0;
+		goto free_opp;
+	}
+
+	/* OPP to select on device suspend */
+	if (of_property_read_bool(np, "opp-suspend")) {
+		if (opp_table->suspend_opp) {
+			dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n",
+				 __func__, opp_table->suspend_opp->rate,
+				 new_opp->rate);
+		} else {
+			new_opp->suspend = true;
+			opp_table->suspend_opp = new_opp;
+		}
+	}
+
+	if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max)
+		opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
+
+	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
+		 __func__, new_opp->turbo, new_opp->rate,
+		 new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
+		 new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns);
+
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp);
+	return 0;
+
+free_opp:
+	_opp_free(new_opp);
+
+	return ret;
+}
+
+/* Initializes OPP tables based on new bindings */
+static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np)
+{
+	struct device_node *np;
+	struct opp_table *opp_table;
+	int ret = 0, count = 0;
+
+	opp_table = _managed_opp(opp_np);
+	if (opp_table) {
+		/* OPPs are already managed */
+		if (!_add_opp_dev(dev, opp_table))
+			ret = -ENOMEM;
+		goto put_opp_table;
+	}
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return -ENOMEM;
+
+	/* We have opp-table node now, iterate over it and add OPPs */
+	for_each_available_child_of_node(opp_np, np) {
+		count++;
+
+		ret = _opp_add_static_v2(opp_table, dev, np);
+		if (ret) {
+			dev_err(dev, "%s: Failed to add OPP, %d\n", __func__,
+				ret);
+			_dev_pm_opp_remove_table(opp_table, dev, false);
+			goto put_opp_table;
+		}
+	}
+
+	/* There should be one of more OPP defined */
+	if (WARN_ON(!count)) {
+		ret = -ENOENT;
+		goto put_opp_table;
+	}
+
+	opp_table->np = opp_np;
+	if (of_property_read_bool(opp_np, "opp-shared"))
+		opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED;
+	else
+		opp_table->shared_opp = OPP_TABLE_ACCESS_EXCLUSIVE;
+
+put_opp_table:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+
+/* Initializes OPP tables based on old-deprecated bindings */
+static int _of_add_opp_table_v1(struct device *dev)
+{
+	struct opp_table *opp_table;
+	const struct property *prop;
+	const __be32 *val;
+	int nr, ret = 0;
+
+	prop = of_find_property(dev->of_node, "operating-points", NULL);
+	if (!prop)
+		return -ENODEV;
+	if (!prop->value)
+		return -ENODATA;
+
+	/*
+	 * Each OPP is a set of tuples consisting of frequency and
+	 * voltage like <freq-kHz vol-uV>.
+	 */
+	nr = prop->length / sizeof(u32);
+	if (nr % 2) {
+		dev_err(dev, "%s: Invalid OPP table\n", __func__);
+		return -EINVAL;
+	}
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return -ENOMEM;
+
+	val = prop->value;
+	while (nr) {
+		unsigned long freq = be32_to_cpup(val++) * 1000;
+		unsigned long volt = be32_to_cpup(val++);
+
+		ret = _opp_add_v1(opp_table, dev, freq, volt, false);
+		if (ret) {
+			dev_err(dev, "%s: Failed to add OPP %ld (%d)\n",
+				__func__, freq, ret);
+			_dev_pm_opp_remove_table(opp_table, dev, false);
+			break;
+		}
+		nr -= 2;
+	}
+
+	dev_pm_opp_put_opp_table(opp_table);
+	return ret;
+}
+
+/**
+ * dev_pm_opp_of_add_table() - Initialize opp table from device tree
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Register the initial OPP table with the OPP library for given device.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -ENODEV	when 'operating-points' property is not found or is invalid data
+ *		in device node.
+ * -ENODATA	when empty 'operating-points' property is found
+ * -EINVAL	when invalid entries are found in opp-v2 table
+ */
+int dev_pm_opp_of_add_table(struct device *dev)
+{
+	struct device_node *opp_np;
+	int ret;
+
+	/*
+	 * OPPs have two version of bindings now. The older one is deprecated,
+	 * try for the new binding first.
+	 */
+	opp_np = dev_pm_opp_of_get_opp_desc_node(dev);
+	if (!opp_np) {
+		/*
+		 * Try old-deprecated bindings for backward compatibility with
+		 * older dtbs.
+		 */
+		return _of_add_opp_table_v1(dev);
+	}
+
+	ret = _of_add_opp_table_v2(dev, opp_np);
+	of_node_put(opp_np);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
+
+/* CPU device specific helpers */
+
+/**
+ * dev_pm_opp_of_cpumask_remove_table() - Removes OPP table for @cpumask
+ * @cpumask:	cpumask for which OPP table needs to be removed
+ *
+ * This removes the OPP tables for CPUs present in the @cpumask.
+ * This should be used only to remove static entries created from DT.
+ */
+void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask)
+{
+	_dev_pm_opp_cpumask_remove_table(cpumask, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table);
+
+/**
+ * dev_pm_opp_of_cpumask_add_table() - Adds OPP table for @cpumask
+ * @cpumask:	cpumask for which OPP table needs to be added.
+ *
+ * This adds the OPP tables for CPUs present in the @cpumask.
+ */
+int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask)
+{
+	struct device *cpu_dev;
+	int cpu, ret = 0;
+
+	WARN_ON(cpumask_empty(cpumask));
+
+	for_each_cpu(cpu, cpumask) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			continue;
+		}
+
+		ret = dev_pm_opp_of_add_table(cpu_dev);
+		if (ret) {
+			/*
+			 * OPP may get registered dynamically, don't print error
+			 * message here.
+			 */
+			pr_debug("%s: couldn't find opp table for cpu:%d, %d\n",
+				 __func__, cpu, ret);
+
+			/* Free all other OPPs */
+			dev_pm_opp_of_cpumask_remove_table(cpumask);
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table);
+
+/*
+ * Works only for OPP v2 bindings.
+ *
+ * Returns -ENOENT if operating-points-v2 bindings aren't supported.
+ */
+/**
+ * dev_pm_opp_of_get_sharing_cpus() - Get cpumask of CPUs sharing OPPs with
+ *				      @cpu_dev using operating-points-v2
+ *				      bindings.
+ *
+ * @cpu_dev:	CPU device for which we do this operation
+ * @cpumask:	cpumask to update with information of sharing CPUs
+ *
+ * This updates the @cpumask with CPUs that are sharing OPPs with @cpu_dev.
+ *
+ * Returns -ENOENT if operating-points-v2 isn't present for @cpu_dev.
+ */
+int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
+				   struct cpumask *cpumask)
+{
+	struct device_node *np, *tmp_np, *cpu_np;
+	int cpu, ret = 0;
+
+	/* Get OPP descriptor node */
+	np = dev_pm_opp_of_get_opp_desc_node(cpu_dev);
+	if (!np) {
+		dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__);
+		return -ENOENT;
+	}
+
+	cpumask_set_cpu(cpu_dev->id, cpumask);
+
+	/* OPPs are shared ? */
+	if (!of_property_read_bool(np, "opp-shared"))
+		goto put_cpu_node;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		cpu_np = of_get_cpu_node(cpu, NULL);
+		if (!cpu_np) {
+			dev_err(cpu_dev, "%s: failed to get cpu%d node\n",
+				__func__, cpu);
+			ret = -ENOENT;
+			goto put_cpu_node;
+		}
+
+		/* Get OPP descriptor node */
+		tmp_np = _opp_of_get_opp_desc_node(cpu_np);
+		if (!tmp_np) {
+			pr_err("%pOF: Couldn't find opp node\n", cpu_np);
+			ret = -ENOENT;
+			goto put_cpu_node;
+		}
+
+		/* CPUs are sharing opp node */
+		if (np == tmp_np)
+			cpumask_set_cpu(cpu, cpumask);
+
+		of_node_put(tmp_np);
+	}
+
+put_cpu_node:
+	of_node_put(np);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_sharing_cpus);
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
new file mode 100644
index 000000000000..166eef990599
--- /dev/null
+++ b/drivers/opp/opp.h
@@ -0,0 +1,222 @@
+/*
+ * Generic OPP Interface
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __DRIVER_OPP_H__
+#define __DRIVER_OPP_H__
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/limits.h>
+#include <linux/pm_opp.h>
+#include <linux/notifier.h>
+
+struct clk;
+struct regulator;
+
+/* Lock to allow exclusive modification to the device and opp lists */
+extern struct mutex opp_table_lock;
+
+extern struct list_head opp_tables;
+
+/*
+ * Internal data structure organization with the OPP layer library is as
+ * follows:
+ * opp_tables (root)
+ *	|- device 1 (represents voltage domain 1)
+ *	|	|- opp 1 (availability, freq, voltage)
+ *	|	|- opp 2 ..
+ *	...	...
+ *	|	`- opp n ..
+ *	|- device 2 (represents the next voltage domain)
+ *	...
+ *	`- device m (represents mth voltage domain)
+ * device 1, 2.. are represented by opp_table structure while each opp
+ * is represented by the opp structure.
+ */
+
+/**
+ * struct dev_pm_opp - Generic OPP description structure
+ * @node:	opp table node. The nodes are maintained throughout the lifetime
+ *		of boot. It is expected only an optimal set of OPPs are
+ *		added to the library by the SoC framework.
+ *		IMPORTANT: the opp nodes should be maintained in increasing
+ *		order.
+ * @kref:	for reference count of the OPP.
+ * @available:	true/false - marks if this OPP as available or not
+ * @dynamic:	not-created from static DT entries.
+ * @turbo:	true if turbo (boost) OPP
+ * @suspend:	true if suspend OPP
+ * @rate:	Frequency in hertz
+ * @supplies:	Power supplies voltage/current values
+ * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
+ *		frequency from any other OPP's frequency.
+ * @opp_table:	points back to the opp_table struct this opp belongs to
+ * @np:		OPP's device node.
+ * @dentry:	debugfs dentry pointer (per opp)
+ *
+ * This structure stores the OPP information for a given device.
+ */
+struct dev_pm_opp {
+	struct list_head node;
+	struct kref kref;
+
+	bool available;
+	bool dynamic;
+	bool turbo;
+	bool suspend;
+	unsigned long rate;
+
+	struct dev_pm_opp_supply *supplies;
+
+	unsigned long clock_latency_ns;
+
+	struct opp_table *opp_table;
+
+	struct device_node *np;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+#endif
+};
+
+/**
+ * struct opp_device - devices managed by 'struct opp_table'
+ * @node:	list node
+ * @dev:	device to which the struct object belongs
+ * @dentry:	debugfs dentry pointer (per device)
+ *
+ * This is an internal data structure maintaining the devices that are managed
+ * by 'struct opp_table'.
+ */
+struct opp_device {
+	struct list_head node;
+	const struct device *dev;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+#endif
+};
+
+enum opp_table_access {
+	OPP_TABLE_ACCESS_UNKNOWN = 0,
+	OPP_TABLE_ACCESS_EXCLUSIVE = 1,
+	OPP_TABLE_ACCESS_SHARED = 2,
+};
+
+/**
+ * struct opp_table - Device opp structure
+ * @node:	table node - contains the devices with OPPs that
+ *		have been registered. Nodes once added are not modified in this
+ *		table.
+ * @head:	notifier head to notify the OPP availability changes.
+ * @dev_list:	list of devices that share these OPPs
+ * @opp_list:	table of opps
+ * @kref:	for reference count of the table.
+ * @lock:	mutex protecting the opp_list.
+ * @np:		struct device_node pointer for opp's DT node.
+ * @clock_latency_ns_max: Max clock latency in nanoseconds.
+ * @shared_opp: OPP is shared between multiple devices.
+ * @suspend_opp: Pointer to OPP to be used during device suspend.
+ * @supported_hw: Array of version number to support.
+ * @supported_hw_count: Number of elements in supported_hw array.
+ * @prop_name: A name to postfix to many DT properties, while parsing them.
+ * @clk: Device's clock handle
+ * @regulators: Supply regulators
+ * @regulator_count: Number of power supply regulators
+ * @set_opp: Platform specific set_opp callback
+ * @set_opp_data: Data to be passed to set_opp callback
+ * @dentry:	debugfs dentry pointer of the real device directory (not links).
+ * @dentry_name: Name of the real dentry.
+ *
+ * @voltage_tolerance_v1: In percentage, for v1 bindings only.
+ *
+ * This is an internal data structure maintaining the link to opps attached to
+ * a device. This structure is not meant to be shared to users as it is
+ * meant for book keeping and private to OPP library.
+ */
+struct opp_table {
+	struct list_head node;
+
+	struct blocking_notifier_head head;
+	struct list_head dev_list;
+	struct list_head opp_list;
+	struct kref kref;
+	struct mutex lock;
+
+	struct device_node *np;
+	unsigned long clock_latency_ns_max;
+
+	/* For backward compatibility with v1 bindings */
+	unsigned int voltage_tolerance_v1;
+
+	enum opp_table_access shared_opp;
+	struct dev_pm_opp *suspend_opp;
+
+	unsigned int *supported_hw;
+	unsigned int supported_hw_count;
+	const char *prop_name;
+	struct clk *clk;
+	struct regulator **regulators;
+	unsigned int regulator_count;
+
+	int (*set_opp)(struct dev_pm_set_opp_data *data);
+	struct dev_pm_set_opp_data *set_opp_data;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+	char dentry_name[NAME_MAX];
+#endif
+};
+
+/* Routines internal to opp core */
+void _get_opp_table_kref(struct opp_table *opp_table);
+struct opp_table *_find_opp_table(struct device *dev);
+struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
+void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, bool remove_all);
+void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all);
+struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table);
+void _opp_free(struct dev_pm_opp *opp);
+int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
+int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic);
+void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of);
+struct opp_table *_add_opp_table(struct device *dev);
+
+#ifdef CONFIG_OF
+void _of_init_opp_table(struct opp_table *opp_table, struct device *dev);
+#else
+static inline void _of_init_opp_table(struct opp_table *opp_table, struct device *dev) {}
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+void opp_debug_remove_one(struct dev_pm_opp *opp);
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
+void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table);
+#else
+static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
+
+static inline int opp_debug_create_one(struct dev_pm_opp *opp,
+				       struct opp_table *opp_table)
+{ return 0; }
+static inline int opp_debug_register(struct opp_device *opp_dev,
+				     struct opp_table *opp_table)
+{ return 0; }
+
+static inline void opp_debug_unregister(struct opp_device *opp_dev,
+					struct opp_table *opp_table)
+{ }
+#endif		/* DEBUG_FS */
+
+#endif		/* __DRIVER_OPP_H__ */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e8517b63eb37..e880ca22c5a5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -259,20 +259,6 @@ config APM_EMULATION
 	  anything, try disabling/enabling this option (or disabling/enabling
 	  APM in your BIOS).
 
-config PM_OPP
-	bool
-	select SRCU
-	---help---
-	  SOCs have a standard set of tuples consisting of frequency and
-	  voltage pairs that the device will support per voltage domain. This
-	  is called Operating Performance Point or OPP. The actual definitions
-	  of OPP varies over silicon within the same family of devices.
-
-	  OPP layer organizes the data internally using device pointers
-	  representing individual voltage domains and provides SOC
-	  implementations a ready to use framework to manage OPPs.
-	  For more information, read <file:Documentation/power/opp.txt>
-
 config PM_CLK
 	def_bool y
 	depends on PM && HAVE_CLK
-- 
cgit v1.2.3


From 64ec72a1ece37d9bc7ba8b11d6091ce7cb1d8eec Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 27 Sep 2017 22:01:34 -0700
Subject: PM: Use a more common logging style

Convert printks to pr_<level>.

Miscellanea:

o Use pr_fmt with "PM:" and remove "PM: " from format strings
o Coalesce format strings and realign format arguments
o Convert an embedded incorrect function name to "%s: ", __func__
o Convert a couple multi-line formats to multiple pr_<level> calls

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/qos.c      |   4 +-
 kernel/power/snapshot.c |  35 ++++++-------
 kernel/power/swap.c     | 128 +++++++++++++++++++++---------------------------
 3 files changed, 77 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..9d7503910ce2 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -701,8 +701,8 @@ static int __init pm_qos_power_init(void)
 	for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
 		ret = register_pm_qos_misc(pm_qos_array[i], d);
 		if (ret < 0) {
-			printk(KERN_ERR "pm_qos_param: %s setup failed\n",
-			       pm_qos_array[i]->name);
+			pr_err("%s: %s setup failed\n",
+			       __func__, pm_qos_array[i]->name);
 			return ret;
 		}
 	}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0972a8e09d08..a917a301e201 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,8 @@
  *
  */
 
+#define pr_fmt(fmt) "PM: " fmt
+
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -967,7 +969,7 @@ void __init __register_nosave_region(unsigned long start_pfn,
 	region->end_pfn = end_pfn;
 	list_add_tail(&region->list, &nosave_regions);
  Report:
-	printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+	pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n",
 		(unsigned long long) start_pfn << PAGE_SHIFT,
 		((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
 }
@@ -1039,7 +1041,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 	list_for_each_entry(region, &nosave_regions, list) {
 		unsigned long pfn;
 
-		pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+		pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n",
 			 (unsigned long long) region->start_pfn << PAGE_SHIFT,
 			 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
 				- 1);
@@ -1095,7 +1097,7 @@ int create_basic_memory_bitmaps(void)
 	free_pages_map = bm2;
 	mark_nosave_pages(forbidden_pages_map);
 
-	pr_debug("PM: Basic memory bitmaps created\n");
+	pr_debug("Basic memory bitmaps created\n");
 
 	return 0;
 
@@ -1131,7 +1133,7 @@ void free_basic_memory_bitmaps(void)
 	memory_bm_free(bm2, PG_UNSAFE_CLEAR);
 	kfree(bm2);
 
-	pr_debug("PM: Basic memory bitmaps freed\n");
+	pr_debug("Basic memory bitmaps freed\n");
 }
 
 void clear_free_pages(void)
@@ -1152,7 +1154,7 @@ void clear_free_pages(void)
 		pfn = memory_bm_next_pfn(bm);
 	}
 	memory_bm_position_reset(bm);
-	pr_info("PM: free pages cleared after restore\n");
+	pr_info("free pages cleared after restore\n");
 #endif /* PAGE_POISONING_ZERO */
 }
 
@@ -1690,7 +1692,7 @@ int hibernate_preallocate_memory(void)
 	ktime_t start, stop;
 	int error;
 
-	printk(KERN_INFO "PM: Preallocating image memory... ");
+	pr_info("Preallocating image memory... ");
 	start = ktime_get();
 
 	error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
@@ -1821,13 +1823,13 @@ int hibernate_preallocate_memory(void)
 
  out:
 	stop = ktime_get();
-	printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+	pr_cont("done (allocated %lu pages)\n", pages);
 	swsusp_show_speed(start, stop, pages, "Allocated");
 
 	return 0;
 
  err_out:
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 	swsusp_free();
 	return -ENOMEM;
 }
@@ -1867,8 +1869,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 			free += zone_page_state(zone, NR_FREE_PAGES);
 
 	nr_pages += count_pages_for_highmem(nr_highmem);
-	pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
-		nr_pages, PAGES_FOR_IO, free);
+	pr_debug("Normal pages needed: %u + %u, available pages: %u\n",
+		 nr_pages, PAGES_FOR_IO, free);
 
 	return free > nr_pages + PAGES_FOR_IO;
 }
@@ -1961,20 +1963,20 @@ asmlinkage __visible int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
 
-	printk(KERN_INFO "PM: Creating hibernation image:\n");
+	pr_info("Creating hibernation image:\n");
 
 	drain_local_pages(NULL);
 	nr_pages = count_data_pages();
 	nr_highmem = count_highmem_pages();
-	printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+	pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);
 
 	if (!enough_free_mem(nr_pages, nr_highmem)) {
-		printk(KERN_ERR "PM: Not enough free memory\n");
+		pr_err("Not enough free memory\n");
 		return -ENOMEM;
 	}
 
 	if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
-		printk(KERN_ERR "PM: Memory allocation failed\n");
+		pr_err("Memory allocation failed\n");
 		return -ENOMEM;
 	}
 
@@ -1995,8 +1997,7 @@ asmlinkage __visible int swsusp_save(void)
 	nr_copy_pages = nr_pages;
 	nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
 
-	printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
-		nr_pages);
+	pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
 
 	return 0;
 }
@@ -2170,7 +2171,7 @@ static int check_header(struct swsusp_info *info)
 	if (!reason && info->num_physpages != get_num_physpages())
 		reason = "memory size";
 	if (reason) {
-		printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+		pr_err("Image mismatch: %s\n", reason);
 		return -EPERM;
 	}
 	return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7cdc426ee38..293ead59eccc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,6 +12,8 @@
  *
  */
 
+#define pr_fmt(fmt) "PM: " fmt
+
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/delay.h>
@@ -241,9 +243,9 @@ static void hib_end_io(struct bio *bio)
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
 	if (bio->bi_status) {
-		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
-				MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
-				(unsigned long long)bio->bi_iter.bi_sector);
+		pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
+			 MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+			 (unsigned long long)bio->bi_iter.bi_sector);
 	}
 
 	if (bio_data_dir(bio) == WRITE)
@@ -273,8 +275,8 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 	bio_set_op_attrs(bio, op, op_flags);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
-			(unsigned long long)bio->bi_iter.bi_sector);
+		pr_err("Adding page to bio failed at %llu\n",
+		       (unsigned long long)bio->bi_iter.bi_sector);
 		bio_put(bio);
 		return -EFAULT;
 	}
@@ -319,7 +321,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
 				      swsusp_resume_block, swsusp_header, NULL);
 	} else {
-		printk(KERN_ERR "PM: Swap header not found!\n");
+		pr_err("Swap header not found!\n");
 		error = -ENODEV;
 	}
 	return error;
@@ -413,8 +415,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
 	ret = swsusp_swap_check();
 	if (ret) {
 		if (ret != -ENOSPC)
-			printk(KERN_ERR "PM: Cannot find swap device, try "
-					"swapon -a.\n");
+			pr_err("Cannot find swap device, try swapon -a\n");
 		return ret;
 	}
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -491,9 +492,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
 {
 	if (!error) {
 		flush_swap_writer(handle);
-		printk(KERN_INFO "PM: S");
+		pr_info("S");
 		error = mark_swapfiles(handle, flags);
-		printk("|\n");
+		pr_cont("|\n");
 	}
 
 	if (error)
@@ -542,7 +543,7 @@ static int save_image(struct swap_map_handle *handle,
 
 	hib_init_batch(&hb);
 
-	printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
+	pr_info("Saving image data pages (%u pages)...\n",
 		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
@@ -557,8 +558,8 @@ static int save_image(struct swap_map_handle *handle,
 		if (ret)
 			break;
 		if (!(nr_pages % m))
-			printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
-			       nr_pages / m * 10);
+			pr_info("Image saving progress: %3d%%\n",
+				nr_pages / m * 10);
 		nr_pages++;
 	}
 	err2 = hib_wait_io(&hb);
@@ -566,7 +567,7 @@ static int save_image(struct swap_map_handle *handle,
 	if (!ret)
 		ret = err2;
 	if (!ret)
-		printk(KERN_INFO "PM: Image saving done.\n");
+		pr_info("Image saving done\n");
 	swsusp_show_speed(start, stop, nr_to_write, "Wrote");
 	return ret;
 }
@@ -692,14 +693,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
 
 	page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
 	if (!page) {
-		printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+		pr_err("Failed to allocate LZO page\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
 
 	data = vmalloc(sizeof(*data) * nr_threads);
 	if (!data) {
-		printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+		pr_err("Failed to allocate LZO data\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -708,7 +709,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 
 	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
 	if (!crc) {
-		printk(KERN_ERR "PM: Failed to allocate crc\n");
+		pr_err("Failed to allocate crc\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -726,8 +727,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 		                            "image_compress/%u", thr);
 		if (IS_ERR(data[thr].thr)) {
 			data[thr].thr = NULL;
-			printk(KERN_ERR
-			       "PM: Cannot start compression threads\n");
+			pr_err("Cannot start compression threads\n");
 			ret = -ENOMEM;
 			goto out_clean;
 		}
@@ -749,7 +749,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
 	if (IS_ERR(crc->thr)) {
 		crc->thr = NULL;
-		printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+		pr_err("Cannot start CRC32 thread\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -760,10 +760,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	 */
 	handle->reqd_free_pages = reqd_free_pages();
 
-	printk(KERN_INFO
-		"PM: Using %u thread(s) for compression.\n"
-		"PM: Compressing and saving image data (%u pages)...\n",
-		nr_threads, nr_to_write);
+	pr_info("Using %u thread(s) for compression\n", nr_threads);
+	pr_info("Compressing and saving image data (%u pages)...\n",
+		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
 		m = 1;
@@ -783,10 +782,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
 				       data_of(*snapshot), PAGE_SIZE);
 
 				if (!(nr_pages % m))
-					printk(KERN_INFO
-					       "PM: Image saving progress: "
-					       "%3d%%\n",
-				               nr_pages / m * 10);
+					pr_info("Image saving progress: %3d%%\n",
+						nr_pages / m * 10);
 				nr_pages++;
 			}
 			if (!off)
@@ -813,15 +810,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
 			ret = data[thr].ret;
 
 			if (ret < 0) {
-				printk(KERN_ERR "PM: LZO compression failed\n");
+				pr_err("LZO compression failed\n");
 				goto out_finish;
 			}
 
 			if (unlikely(!data[thr].cmp_len ||
 			             data[thr].cmp_len >
 			             lzo1x_worst_compress(data[thr].unc_len))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO compressed length\n");
+				pr_err("Invalid LZO compressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -857,7 +853,7 @@ out_finish:
 	if (!ret)
 		ret = err2;
 	if (!ret)
-		printk(KERN_INFO "PM: Image saving done.\n");
+		pr_info("Image saving done\n");
 	swsusp_show_speed(start, stop, nr_to_write, "Wrote");
 out_clean:
 	if (crc) {
@@ -888,7 +884,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
 	unsigned int free_swap = count_swap_pages(root_swap, 1);
 	unsigned int required;
 
-	pr_debug("PM: Free swap pages: %u\n", free_swap);
+	pr_debug("Free swap pages: %u\n", free_swap);
 
 	required = PAGES_FOR_IO + nr_pages;
 	return free_swap > required;
@@ -915,12 +911,12 @@ int swsusp_write(unsigned int flags)
 	pages = snapshot_get_image_size();
 	error = get_swap_writer(&handle);
 	if (error) {
-		printk(KERN_ERR "PM: Cannot get swap writer\n");
+		pr_err("Cannot get swap writer\n");
 		return error;
 	}
 	if (flags & SF_NOCOMPRESS_MODE) {
 		if (!enough_swap(pages, flags)) {
-			printk(KERN_ERR "PM: Not enough free swap\n");
+			pr_err("Not enough free swap\n");
 			error = -ENOSPC;
 			goto out_finish;
 		}
@@ -1068,8 +1064,7 @@ static int load_image(struct swap_map_handle *handle,
 	hib_init_batch(&hb);
 
 	clean_pages_on_read = true;
-	printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
-		nr_to_read);
+	pr_info("Loading image data pages (%u pages)...\n", nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
@@ -1087,8 +1082,8 @@ static int load_image(struct swap_map_handle *handle,
 		if (ret)
 			break;
 		if (!(nr_pages % m))
-			printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
-			       nr_pages / m * 10);
+			pr_info("Image loading progress: %3d%%\n",
+				nr_pages / m * 10);
 		nr_pages++;
 	}
 	err2 = hib_wait_io(&hb);
@@ -1096,7 +1091,7 @@ static int load_image(struct swap_map_handle *handle,
 	if (!ret)
 		ret = err2;
 	if (!ret) {
-		printk(KERN_INFO "PM: Image loading done.\n");
+		pr_info("Image loading done\n");
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			ret = -ENODATA;
@@ -1190,14 +1185,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
 	if (!page) {
-		printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+		pr_err("Failed to allocate LZO page\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
 
 	data = vmalloc(sizeof(*data) * nr_threads);
 	if (!data) {
-		printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+		pr_err("Failed to allocate LZO data\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1206,7 +1201,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
 	if (!crc) {
-		printk(KERN_ERR "PM: Failed to allocate crc\n");
+		pr_err("Failed to allocate crc\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1226,8 +1221,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		                            "image_decompress/%u", thr);
 		if (IS_ERR(data[thr].thr)) {
 			data[thr].thr = NULL;
-			printk(KERN_ERR
-			       "PM: Cannot start decompression threads\n");
+			pr_err("Cannot start decompression threads\n");
 			ret = -ENOMEM;
 			goto out_clean;
 		}
@@ -1249,7 +1243,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
 	if (IS_ERR(crc->thr)) {
 		crc->thr = NULL;
-		printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+		pr_err("Cannot start CRC32 thread\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1274,8 +1268,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		if (!page[i]) {
 			if (i < LZO_CMP_PAGES) {
 				ring_size = i;
-				printk(KERN_ERR
-				       "PM: Failed to allocate LZO pages\n");
+				pr_err("Failed to allocate LZO pages\n");
 				ret = -ENOMEM;
 				goto out_clean;
 			} else {
@@ -1285,10 +1278,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	}
 	want = ring_size = i;
 
-	printk(KERN_INFO
-		"PM: Using %u thread(s) for decompression.\n"
-		"PM: Loading and decompressing image data (%u pages)...\n",
-		nr_threads, nr_to_read);
+	pr_info("Using %u thread(s) for decompression\n", nr_threads);
+	pr_info("Loading and decompressing image data (%u pages)...\n",
+		nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
@@ -1348,8 +1340,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			if (unlikely(!data[thr].cmp_len ||
 			             data[thr].cmp_len >
 			             lzo1x_worst_compress(LZO_UNC_SIZE))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO compressed length\n");
+				pr_err("Invalid LZO compressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -1400,16 +1391,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			ret = data[thr].ret;
 
 			if (ret < 0) {
-				printk(KERN_ERR
-				       "PM: LZO decompression failed\n");
+				pr_err("LZO decompression failed\n");
 				goto out_finish;
 			}
 
 			if (unlikely(!data[thr].unc_len ||
 			             data[thr].unc_len > LZO_UNC_SIZE ||
 			             data[thr].unc_len & (PAGE_SIZE - 1))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO uncompressed length\n");
+				pr_err("Invalid LZO uncompressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -1420,10 +1409,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 				       data[thr].unc + off, PAGE_SIZE);
 
 				if (!(nr_pages % m))
-					printk(KERN_INFO
-					       "PM: Image loading progress: "
-					       "%3d%%\n",
-					       nr_pages / m * 10);
+					pr_info("Image loading progress: %3d%%\n",
+						nr_pages / m * 10);
 				nr_pages++;
 
 				ret = snapshot_write_next(snapshot);
@@ -1448,15 +1435,14 @@ out_finish:
 	}
 	stop = ktime_get();
 	if (!ret) {
-		printk(KERN_INFO "PM: Image loading done.\n");
+		pr_info("Image loading done\n");
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			ret = -ENODATA;
 		if (!ret) {
 			if (swsusp_header->flags & SF_CRC32_MODE) {
 				if(handle->crc32 != swsusp_header->crc32) {
-					printk(KERN_ERR
-					       "PM: Invalid image CRC32!\n");
+					pr_err("Invalid image CRC32!\n");
 					ret = -ENODATA;
 				}
 			}
@@ -1513,9 +1499,9 @@ int swsusp_read(unsigned int *flags_p)
 	swap_reader_finish(&handle);
 end:
 	if (!error)
-		pr_debug("PM: Image successfully loaded\n");
+		pr_debug("Image successfully loaded\n");
 	else
-		pr_debug("PM: Error %d resuming\n", error);
+		pr_debug("Error %d resuming\n", error);
 	return error;
 }
 
@@ -1552,13 +1538,13 @@ put:
 		if (error)
 			blkdev_put(hib_resume_bdev, FMODE_READ);
 		else
-			pr_debug("PM: Image signature found, resuming\n");
+			pr_debug("Image signature found, resuming\n");
 	} else {
 		error = PTR_ERR(hib_resume_bdev);
 	}
 
 	if (error)
-		pr_debug("PM: Image not found (code %d)\n", error);
+		pr_debug("Image not found (code %d)\n", error);
 
 	return error;
 }
@@ -1570,7 +1556,7 @@ put:
 void swsusp_close(fmode_t mode)
 {
 	if (IS_ERR(hib_resume_bdev)) {
-		pr_debug("PM: Image device not initialised\n");
+		pr_debug("Image device not initialised\n");
 		return;
 	}
 
@@ -1594,7 +1580,7 @@ int swsusp_unmark(void)
 					swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
-		printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+		pr_err("Cannot find swsusp signature!\n");
 		error = -ENODEV;
 	}
 
-- 
cgit v1.2.3


From d8c4deee6dc6876ae3b7d09a5b58138a57ee45f6 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Fri, 8 Sep 2017 23:55:17 -0700
Subject: tracing: Remove obsolete sched_switch tracer selftest

Since commit 87d80de2800d087ea833cb79bc13f85ff34ed49f ("tracing: Remove
obsolete sched_switch tracer"), the sched_switch tracer selftest is no longer
used.  This patch removes the same.

Link: http://lkml.kernel.org/r/20170909065517.22262-1-joelaf@google.com

Cc: Ingo Molnar <mingo@redhat.com>
Cc: kernel-team@android.com
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.h          |  2 --
 kernel/trace/trace_selftest.c | 32 --------------------------------
 2 files changed, 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 652c682707cd..3c078e2ad777 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -738,8 +738,6 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace,
 					 struct trace_array *tr);
 extern int trace_selftest_startup_nop(struct tracer *trace,
 					 struct trace_array *tr);
-extern int trace_selftest_startup_sched_switch(struct tracer *trace,
-					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
 /*
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b17ec642793b..364f78abdf47 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1150,38 +1150,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_SCHED_TRACER */
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-int
-trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
-{
-	unsigned long count;
-	int ret;
-
-	/* start the tracing */
-	ret = tracer_init(trace, tr);
-	if (ret) {
-		warn_failed_init_tracer(trace, ret);
-		return ret;
-	}
-
-	/* Sleep for a 1/10 of a second */
-	msleep(100);
-	/* stop the tracing. */
-	tracing_stop();
-	/* check the trace buffer */
-	ret = trace_test_buffer(&tr->trace_buffer, &count);
-	trace->reset(tr);
-	tracing_start();
-
-	if (!ret && !count) {
-		printk(KERN_CONT ".. no entries found ..");
-		ret = -1;
-	}
-
-	return ret;
-}
-#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-
 #ifdef CONFIG_BRANCH_TRACER
 int
 trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
-- 
cgit v1.2.3


From 6e7a2398114a2597a0995f96f44d908741ae5035 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 23 Aug 2017 12:23:09 +0100
Subject: tracing: Remove redundant unread variable ret

Integer ret is being assigned but never used and hence it is
redundant. Remove it, fixes clang warning:

trace_events_hist.c:1077:3: warning: Value stored to 'ret' is never read

Link: http://lkml.kernel.org/r/20170823112309.19383-1-colin.king@canonical.com

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1c21d0e2a145..f123b5d0c226 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1062,7 +1062,7 @@ static void hist_trigger_show(struct seq_file *m,
 			      struct event_trigger_data *data, int n)
 {
 	struct hist_trigger_data *hist_data;
-	int n_entries, ret = 0;
+	int n_entries;
 
 	if (n > 0)
 		seq_puts(m, "\n\n");
@@ -1073,10 +1073,8 @@ static void hist_trigger_show(struct seq_file *m,
 
 	hist_data = data->private_data;
 	n_entries = print_entries(m, hist_data);
-	if (n_entries < 0) {
-		ret = n_entries;
+	if (n_entries < 0)
 		n_entries = 0;
-	}
 
 	seq_printf(m, "\nTotals:\n    Hits: %llu\n    Entries: %u\n    Dropped: %llu\n",
 		   (u64)atomic64_read(&hist_data->map->hits),
-- 
cgit v1.2.3


From 12ecef0cb12102d8c034770173d2d1363cb97d52 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 21 Sep 2017 16:22:49 -0400
Subject: tracing: Reverse the order of trace_types_lock and event_mutex

In order to make future changes where we need to call
tracing_set_clock() from within an event command, the order of
trace_types_lock and event_mutex must be reversed, as the event command
will hold event_mutex and the trace_types_lock is taken from within
tracing_set_clock().

Link: http://lkml.kernel.org/r/20170921162249.0dde3dca@gandalf.local.home

Requested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        |  5 +++++
 kernel/trace/trace_events.c | 31 +++++++++++++++----------------
 2 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 752e5daf0896..5f1ac7d3402c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7687,6 +7687,7 @@ static int instance_mkdir(const char *name)
 	struct trace_array *tr;
 	int ret;
 
+	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);
 
 	ret = -EEXIST;
@@ -7742,6 +7743,7 @@ static int instance_mkdir(const char *name)
 	list_add(&tr->list, &ftrace_trace_arrays);
 
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return 0;
 
@@ -7753,6 +7755,7 @@ static int instance_mkdir(const char *name)
 
  out_unlock:
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 
@@ -7765,6 +7768,7 @@ static int instance_rmdir(const char *name)
 	int ret;
 	int i;
 
+	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);
 
 	ret = -ENODEV;
@@ -7810,6 +7814,7 @@ static int instance_rmdir(const char *name)
 
  out_unlock:
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87468398b9ed..ec0f9aa4e151 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 		return -ENODEV;
 
 	/* Make sure the system still exists */
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		list_for_each_entry(dir, &tr->systems, list) {
 			if (dir == inode->i_private) {
@@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 		}
 	}
  exit_loop:
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	if (!system)
 		return -ENODEV;
@@ -2294,15 +2294,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
 int trace_add_event_call(struct trace_event_call *call)
 {
 	int ret;
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 
 	ret = __register_event(call, NULL);
 	if (ret >= 0)
 		__add_event_to_tracers(call);
 
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 	return ret;
 }
 
@@ -2356,13 +2356,13 @@ int trace_remove_event_call(struct trace_event_call *call)
 {
 	int ret;
 
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	down_write(&trace_event_sem);
 	ret = probe_remove_event_call(call);
 	up_write(&trace_event_sem);
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 }
@@ -2424,8 +2424,8 @@ static int trace_module_notify(struct notifier_block *self,
 {
 	struct module *mod = data;
 
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	switch (val) {
 	case MODULE_STATE_COMING:
 		trace_module_add_events(mod);
@@ -2434,8 +2434,8 @@ static int trace_module_notify(struct notifier_block *self,
 		trace_module_remove_events(mod);
 		break;
 	}
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return 0;
 }
@@ -2950,24 +2950,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
  * creates the event hierachry in the @parent/events directory.
  *
  * Returns 0 on success.
+ *
+ * Must be called with event_mutex held.
  */
 int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 {
 	int ret;
 
-	mutex_lock(&event_mutex);
+	lockdep_assert_held(&event_mutex);
 
 	ret = create_event_toplevel_files(parent, tr);
 	if (ret)
-		goto out_unlock;
+		goto out;
 
 	down_write(&trace_event_sem);
 	__trace_add_event_dirs(tr);
 	up_write(&trace_event_sem);
 
- out_unlock:
-	mutex_unlock(&event_mutex);
-
+ out:
 	return ret;
 }
 
@@ -2996,9 +2996,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
 	return ret;
 }
 
+/* Must be called with event_mutex held */
 int event_trace_del_tracer(struct trace_array *tr)
 {
-	mutex_lock(&event_mutex);
+	lockdep_assert_held(&event_mutex);
 
 	/* Disable any event triggers and associated soft-disabled events */
 	clear_event_triggers(tr);
@@ -3019,8 +3020,6 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	tr->event_dir = NULL;
 
-	mutex_unlock(&event_mutex);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 1a149d7d3f45d311da1f63473736c05f30ae8a75 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 22 Sep 2017 16:59:02 -0400
Subject: ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler

The current method to prevent the ring buffer from entering into a recursize
loop is to use a bitmask and set the bit that maps to the current context
(normal, softirq, irq or NMI), and if that bit was already set, it is
considered a recursive loop.

New code is being added that may require the ring buffer to be entered a
second time in the current context. The recursive locking prevents that from
happening. Instead of mapping a bitmask to the current context, just allow 4
levels of nesting in the ring buffer. This matches the 4 context levels that
it can already nest. It is highly unlikely to have more than two levels,
thus it should be fine when we add the second entry into the ring buffer. If
that proves to be a problem, we can always up the number to 8.

An added benefit is that reading preempt_count() to get the current level
adds a very slight but noticeable overhead. This removes that need.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 64 ++++++++++++----------------------------------
 1 file changed, 17 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 81279c6602ff..f6ee9b1ef62a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2538,61 +2538,29 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * The lock and unlock are done within a preempt disable section.
  * The current_context per_cpu variable can only be modified
  * by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. To pass this
- * information from the lock to the unlock without having to
- * access the 'in_interrupt()' functions again (which do show
- * a bit of overhead in something as critical as function tracing,
- * we use a bitmask trick.
+ * be modified more than once via an interrupt. There are four
+ * different contexts that we need to consider.
  *
- *  bit 0 =  NMI context
- *  bit 1 =  IRQ context
- *  bit 2 =  SoftIRQ context
- *  bit 3 =  normal context.
+ *  Normal context.
+ *  SoftIRQ context
+ *  IRQ context
+ *  NMI context
  *
- * This works because this is the order of contexts that can
- * preempt other contexts. A SoftIRQ never preempts an IRQ
- * context.
- *
- * When the context is determined, the corresponding bit is
- * checked and set (if it was set, then a recursion of that context
- * happened).
- *
- * On unlock, we need to clear this bit. To do so, just subtract
- * 1 from the current_context and AND it to itself.
- *
- * (binary)
- *  101 - 1 = 100
- *  101 & 100 = 100 (clearing bit zero)
- *
- *  1010 - 1 = 1001
- *  1010 & 1001 = 1000 (clearing bit 1)
- *
- * The least significant bit can be cleared this way, and it
- * just so happens that it is the same bit corresponding to
- * the current context.
+ * If for some reason the ring buffer starts to recurse, we
+ * only allow that to happen at most 4 times (one for each
+ * context). If it happens 5 times, then we consider this a
+ * recusive loop and do not let it go further.
  */
 
 static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	unsigned int val = cpu_buffer->current_context;
-	int bit;
-
-	if (in_interrupt()) {
-		if (in_nmi())
-			bit = RB_CTX_NMI;
-		else if (in_irq())
-			bit = RB_CTX_IRQ;
-		else
-			bit = RB_CTX_SOFTIRQ;
-	} else
-		bit = RB_CTX_NORMAL;
-
-	if (unlikely(val & (1 << bit)))
+	if (cpu_buffer->current_context >= 4)
 		return 1;
 
-	val |= (1 << bit);
-	cpu_buffer->current_context = val;
+	cpu_buffer->current_context++;
+	/* Interrupts must see this update */
+	barrier();
 
 	return 0;
 }
@@ -2600,7 +2568,9 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 static __always_inline void
 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+	/* Don't let the dec leak out */
+	barrier();
+	cpu_buffer->current_context--;
 }
 
 /**
-- 
cgit v1.2.3


From a15f7fc20389a8827d5859907568b201234d4b79 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:17 -0500
Subject: tracing: Exclude 'generic fields' from histograms

There are a small number of 'generic fields' (comm/COMM/cpu/CPU) that
are found by trace_find_event_field() but are only meant for
filtering.  Specifically, they unlike normal fields, they have a size
of 0 and thus wreak havoc when used as a histogram key.

Exclude these (return -EINVAL) when used as histogram keys.

Link: http://lkml.kernel.org/r/956154cbc3e8a4f0633d619b886c97f0f0edf7b4.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f123b5d0c226..121d56850f24 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -450,7 +450,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
 	}
 
 	field = trace_find_event_field(file->event_call, field_name);
-	if (!field) {
+	if (!field || !field->size) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -548,7 +548,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 		}
 
 		field = trace_find_event_field(file->event_call, field_name);
-		if (!field) {
+		if (!field || !field->size) {
 			ret = -EINVAL;
 			goto out;
 		}
-- 
cgit v1.2.3


From 83c07ecc4203728e85fc4a2ce6fdf25d16ea118e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:18 -0500
Subject: tracing: Remove lookups from tracing_map hitcount

Lookups inflate the hitcount, making it essentially useless.  Only
inserts and updates should really affect the hitcount anyway, so
explicitly filter lookups out.

Link: http://lkml.kernel.org/r/c8d9dc39d269a8abf88bf4102d0dfc65deb0fc7f.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 305039b122fa..07e75344725b 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -428,7 +428,8 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
 
 		if (test_key && test_key == key_hash && entry->val &&
 		    keys_match(key, entry->val->key, map->key_size)) {
-			atomic64_inc(&map->hits);
+			if (!lookup_only)
+				atomic64_inc(&map->hits);
 			return entry->val;
 		}
 
-- 
cgit v1.2.3


From 4f36c2d85cedea60ad424d44534121ab0458069e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:19 -0500
Subject: tracing: Increase tracing map KEYS_MAX size

The current default for the number of subkeys in a compound key is 2,
which is too restrictive.  Increase it to a more realistic value of 3.

Link: http://lkml.kernel.org/r/b6952cca06d1f912eba33804a6fd6069b3847d44.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 618838f5f30a..f0975110b967 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -5,7 +5,7 @@
 #define TRACING_MAP_BITS_MAX		17
 #define TRACING_MAP_BITS_MIN		7
 
-#define TRACING_MAP_KEYS_MAX		2
+#define TRACING_MAP_KEYS_MAX		3
 #define TRACING_MAP_VALS_MAX		3
 #define TRACING_MAP_FIELDS_MAX		(TRACING_MAP_KEYS_MAX + \
 					 TRACING_MAP_VALS_MAX)
-- 
cgit v1.2.3


From 7e465baa80293ed5f87fdf6405391d6f02110d4e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:20 -0500
Subject: tracing: Make traceprobe parsing code reusable

traceprobe_probes_write() and traceprobe_command() actually contain
nothing that ties them to kprobes - the code is generically useful for
similar types of parsing elsewhere, so separate it out and move it to
trace.c/trace.h.

Other than moving it, the only change is in naming:
traceprobe_probes_write() becomes trace_parse_run_command() and
traceprobe_command() becomes trace_run_command().

Link: http://lkml.kernel.org/r/ae5c26ea40c196a8986854d921eb6e713ede7e3f.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h        |  7 ++++
 kernel/trace/trace_kprobe.c | 18 +++++-----
 kernel/trace/trace_probe.c  | 86 ---------------------------------------------
 kernel/trace/trace_probe.h  |  7 ----
 kernel/trace/trace_uprobe.c |  2 +-
 6 files changed, 103 insertions(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5f1ac7d3402c..73e67b68c53b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8281,6 +8281,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 }
 EXPORT_SYMBOL_GPL(ftrace_dump);
 
+int trace_run_command(const char *buf, int (*createfn)(int, char **))
+{
+	char **argv;
+	int argc, ret;
+
+	argc = 0;
+	ret = 0;
+	argv = argv_split(GFP_KERNEL, buf, &argc);
+	if (!argv)
+		return -ENOMEM;
+
+	if (argc)
+		ret = createfn(argc, argv);
+
+	argv_free(argv);
+
+	return ret;
+}
+
+#define WRITE_BUFSIZE  4096
+
+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
+				size_t count, loff_t *ppos,
+				int (*createfn)(int, char **))
+{
+	char *kbuf, *buf, *tmp;
+	int ret = 0;
+	size_t done = 0;
+	size_t size;
+
+	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	while (done < count) {
+		size = count - done;
+
+		if (size >= WRITE_BUFSIZE)
+			size = WRITE_BUFSIZE - 1;
+
+		if (copy_from_user(kbuf, buffer + done, size)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		kbuf[size] = '\0';
+		buf = kbuf;
+		do {
+			tmp = strchr(buf, '\n');
+			if (tmp) {
+				*tmp = '\0';
+				size = tmp - buf + 1;
+			} else {
+				size = strlen(buf);
+				if (done + size < count) {
+					if (buf != kbuf)
+						break;
+					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+					pr_warn("Line length is too long: Should be less than %d\n",
+						WRITE_BUFSIZE - 2);
+					ret = -EINVAL;
+					goto out;
+				}
+			}
+			done += size;
+
+			/* Remove comments */
+			tmp = strchr(buf, '#');
+
+			if (tmp)
+				*tmp = '\0';
+
+			ret = trace_run_command(buf, createfn);
+			if (ret)
+				goto out;
+			buf += size;
+
+		} while (done < count);
+	}
+	ret = done;
+
+out:
+	kfree(kbuf);
+
+	return ret;
+}
+
 __init static int tracer_alloc_buffers(void)
 {
 	int ring_buf_size;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3c078e2ad777..f8343eb3c1f9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1752,6 +1752,13 @@ void trace_printk_start_comm(void);
 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 
+#define MAX_EVENT_NAME_LEN	64
+
+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
+extern ssize_t trace_parse_run_command(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos,
+		int (*createfn)(int, char**));
+
 /*
  * Normal trace_printk() and friends allocates special buffers
  * to do the manipulation, as well as saves the print formats
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e12b6b9..af6134f2e597 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -907,8 +907,8 @@ static int probes_open(struct inode *inode, struct file *file)
 static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
 {
-	return traceprobe_probes_write(file, buffer, count, ppos,
-			create_trace_kprobe);
+	return trace_parse_run_command(file, buffer, count, ppos,
+				       create_trace_kprobe);
 }
 
 static const struct file_operations kprobe_events_ops = {
@@ -1433,9 +1433,9 @@ static __init int kprobe_trace_self_tests_init(void)
 
 	pr_info("Testing kprobe tracing: ");
 
-	ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
-				  "$stack $stack0 +0($stack)",
-				  create_trace_kprobe);
+	ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
+				"$stack $stack0 +0($stack)",
+				create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function entry.\n");
 		warn++;
@@ -1455,8 +1455,8 @@ static __init int kprobe_trace_self_tests_init(void)
 		}
 	}
 
-	ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
-				  "$retval", create_trace_kprobe);
+	ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
+				"$retval", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function return.\n");
 		warn++;
@@ -1526,13 +1526,13 @@ static __init int kprobe_trace_self_tests_init(void)
 			disable_trace_kprobe(tk, file);
 	}
 
-	ret = traceprobe_command("-:testprobe", create_trace_kprobe);
+	ret = trace_run_command("-:testprobe", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
 	}
 
-	ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
+	ret = trace_run_command("-:testprobe2", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 52478f033f88..d59357308677 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -623,92 +623,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
 	kfree(arg->comm);
 }
 
-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
-{
-	char **argv;
-	int argc, ret;
-
-	argc = 0;
-	ret = 0;
-	argv = argv_split(GFP_KERNEL, buf, &argc);
-	if (!argv)
-		return -ENOMEM;
-
-	if (argc)
-		ret = createfn(argc, argv);
-
-	argv_free(argv);
-
-	return ret;
-}
-
-#define WRITE_BUFSIZE  4096
-
-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
-				size_t count, loff_t *ppos,
-				int (*createfn)(int, char **))
-{
-	char *kbuf, *buf, *tmp;
-	int ret = 0;
-	size_t done = 0;
-	size_t size;
-
-	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
-	if (!kbuf)
-		return -ENOMEM;
-
-	while (done < count) {
-		size = count - done;
-
-		if (size >= WRITE_BUFSIZE)
-			size = WRITE_BUFSIZE - 1;
-
-		if (copy_from_user(kbuf, buffer + done, size)) {
-			ret = -EFAULT;
-			goto out;
-		}
-		kbuf[size] = '\0';
-		buf = kbuf;
-		do {
-			tmp = strchr(buf, '\n');
-			if (tmp) {
-				*tmp = '\0';
-				size = tmp - buf + 1;
-			} else {
-				size = strlen(buf);
-				if (done + size < count) {
-					if (buf != kbuf)
-						break;
-					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
-					pr_warn("Line length is too long: Should be less than %d\n",
-						WRITE_BUFSIZE - 2);
-					ret = -EINVAL;
-					goto out;
-				}
-			}
-			done += size;
-
-			/* Remove comments */
-			tmp = strchr(buf, '#');
-
-			if (tmp)
-				*tmp = '\0';
-
-			ret = traceprobe_command(buf, createfn);
-			if (ret)
-				goto out;
-			buf += size;
-
-		} while (done < count);
-	}
-	ret = done;
-
-out:
-	kfree(kbuf);
-
-	return ret;
-}
-
 static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
 			   bool is_return)
 {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..fb66e3eaa192 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -42,7 +42,6 @@
 
 #define MAX_TRACE_ARGS		128
 #define MAX_ARGSTR_LEN		63
-#define MAX_EVENT_NAME_LEN	64
 #define MAX_STRING_SIZE		PATH_MAX
 
 /* Reserved field names */
@@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
 
 extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
 
-extern ssize_t traceprobe_probes_write(struct file *file,
-		const char __user *buffer, size_t count, loff_t *ppos,
-		int (*createfn)(int, char**));
-
-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
-
 /* Sum up total data length for dynamic arraies (strings) */
 static nokprobe_inline int
 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4525e0271a53..b34965e62fb9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -651,7 +651,7 @@ static int probes_open(struct inode *inode, struct file *file)
 static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
 {
-	return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+	return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
 }
 
 static const struct file_operations uprobe_events_ops = {
-- 
cgit v1.2.3


From 0d7a8325bf3326c92da2d21b4496a9ddde896d4f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:21 -0500
Subject: tracing: Clean up hist_field_flags enum

As we add more flags, specifying explicit integers for the flag values
becomes more unwieldy and error-prone - switch them over to left-shift
values.

Link: http://lkml.kernel.org/r/e644e4fb7665aec015f4a2d84a2f990d3dd5b8a1.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 121d56850f24..0c7ec3048624 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -110,16 +110,16 @@ DEFINE_HIST_FIELD_FN(u8);
 #define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
 
 enum hist_field_flags {
-	HIST_FIELD_FL_HITCOUNT		= 1,
-	HIST_FIELD_FL_KEY		= 2,
-	HIST_FIELD_FL_STRING		= 4,
-	HIST_FIELD_FL_HEX		= 8,
-	HIST_FIELD_FL_SYM		= 16,
-	HIST_FIELD_FL_SYM_OFFSET	= 32,
-	HIST_FIELD_FL_EXECNAME		= 64,
-	HIST_FIELD_FL_SYSCALL		= 128,
-	HIST_FIELD_FL_STACKTRACE	= 256,
-	HIST_FIELD_FL_LOG2		= 512,
+	HIST_FIELD_FL_HITCOUNT		= 1 << 0,
+	HIST_FIELD_FL_KEY		= 1 << 1,
+	HIST_FIELD_FL_STRING		= 1 << 2,
+	HIST_FIELD_FL_HEX		= 1 << 3,
+	HIST_FIELD_FL_SYM		= 1 << 4,
+	HIST_FIELD_FL_SYM_OFFSET	= 1 << 5,
+	HIST_FIELD_FL_EXECNAME		= 1 << 6,
+	HIST_FIELD_FL_SYSCALL		= 1 << 7,
+	HIST_FIELD_FL_STACKTRACE	= 1 << 8,
+	HIST_FIELD_FL_LOG2		= 1 << 9,
 };
 
 struct hist_trigger_attrs {
-- 
cgit v1.2.3


From 85013256cf01629f72a327674c5d007b4a4b40da Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:22 -0500
Subject: tracing: Add hist_field_name() accessor

In preparation for hist_fields that won't be strictly based on
trace_event_fields, add a new hist_field_name() accessor to allow that
flexibility and update associated users.

Link: http://lkml.kernel.org/r/5b5a2d36dde067cbbe2434b10f06daac27b7dbd5.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 67 +++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0c7ec3048624..34edf5fd85fd 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -146,6 +146,23 @@ struct hist_trigger_data {
 	struct tracing_map		*map;
 };
 
+static const char *hist_field_name(struct hist_field *field,
+				   unsigned int level)
+{
+	const char *field_name = "";
+
+	if (level > 1)
+		return field_name;
+
+	if (field->field)
+		field_name = field->field->name;
+
+	if (field_name == NULL)
+		field_name = "";
+
+	return field_name;
+}
+
 static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
 {
 	hist_field_fn_t fn = NULL;
@@ -653,7 +670,6 @@ static int is_descending(const char *str)
 static int create_sort_keys(struct hist_trigger_data *hist_data)
 {
 	char *fields_str = hist_data->attrs->sort_key_str;
-	struct ftrace_event_field *field = NULL;
 	struct tracing_map_sort_key *sort_key;
 	int descending, ret = 0;
 	unsigned int i, j;
@@ -670,7 +686,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 	}
 
 	for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+		struct hist_field *hist_field;
 		char *field_str, *field_name;
+		const char *test_name;
 
 		sort_key = &hist_data->sort_keys[i];
 
@@ -703,8 +721,10 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 		}
 
 		for (j = 1; j < hist_data->n_fields; j++) {
-			field = hist_data->fields[j]->field;
-			if (field && (strcmp(field_name, field->name) == 0)) {
+			hist_field = hist_data->fields[j];
+			test_name = hist_field_name(hist_field, 0);
+
+			if (strcmp(field_name, test_name) == 0) {
 				sort_key->field_idx = j;
 				descending = is_descending(field_str);
 				if (descending < 0) {
@@ -952,6 +972,7 @@ hist_trigger_entry_print(struct seq_file *m,
 	struct hist_field *key_field;
 	char str[KSYM_SYMBOL_LEN];
 	bool multiline = false;
+	const char *field_name;
 	unsigned int i;
 	u64 uval;
 
@@ -963,26 +984,27 @@ hist_trigger_entry_print(struct seq_file *m,
 		if (i > hist_data->n_vals)
 			seq_puts(m, ", ");
 
+		field_name = hist_field_name(key_field, 0);
+
 		if (key_field->flags & HIST_FIELD_FL_HEX) {
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %llx",
-				   key_field->field->name, uval);
+			seq_printf(m, "%s: %llx", field_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_SYM) {
 			uval = *(u64 *)(key + key_field->offset);
 			sprint_symbol_no_offset(str, uval);
-			seq_printf(m, "%s: [%llx] %-45s",
-				   key_field->field->name, uval, str);
+			seq_printf(m, "%s: [%llx] %-45s", field_name,
+				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
 			uval = *(u64 *)(key + key_field->offset);
 			sprint_symbol(str, uval);
-			seq_printf(m, "%s: [%llx] %-55s",
-				   key_field->field->name, uval, str);
+			seq_printf(m, "%s: [%llx] %-55s", field_name,
+				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
 			char *comm = elt->private_data;
 
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %-16s[%10llu]",
-				   key_field->field->name, comm, uval);
+			seq_printf(m, "%s: %-16s[%10llu]", field_name,
+				   comm, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
 			const char *syscall_name;
 
@@ -991,8 +1013,8 @@ hist_trigger_entry_print(struct seq_file *m,
 			if (!syscall_name)
 				syscall_name = "unknown_syscall";
 
-			seq_printf(m, "%s: %-30s[%3llu]",
-				   key_field->field->name, syscall_name, uval);
+			seq_printf(m, "%s: %-30s[%3llu]", field_name,
+				   syscall_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
 			seq_puts(m, "stacktrace:\n");
 			hist_trigger_stacktrace_print(m,
@@ -1000,15 +1022,14 @@ hist_trigger_entry_print(struct seq_file *m,
 						      HIST_STACKTRACE_DEPTH);
 			multiline = true;
 		} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
-			seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+			seq_printf(m, "%s: ~ 2^%-2llu", field_name,
 				   *(u64 *)(key + key_field->offset));
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
-			seq_printf(m, "%s: %-50s", key_field->field->name,
+			seq_printf(m, "%s: %-50s", field_name,
 				   (char *)(key + key_field->offset));
 		} else {
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %10llu", key_field->field->name,
-				   uval);
+			seq_printf(m, "%s: %10llu", field_name, uval);
 		}
 	}
 
@@ -1021,13 +1042,13 @@ hist_trigger_entry_print(struct seq_file *m,
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
 
 	for (i = 1; i < hist_data->n_vals; i++) {
+		field_name = hist_field_name(hist_data->fields[i], 0);
+
 		if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
-			seq_printf(m, "  %s: %10llx",
-				   hist_data->fields[i]->field->name,
+			seq_printf(m, "  %s: %10llx", field_name,
 				   tracing_map_read_sum(elt, i));
 		} else {
-			seq_printf(m, "  %s: %10llu",
-				   hist_data->fields[i]->field->name,
+			seq_printf(m, "  %s: %10llu", field_name,
 				   tracing_map_read_sum(elt, i));
 		}
 	}
@@ -1140,7 +1161,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 
 static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 {
-	seq_printf(m, "%s", hist_field->field->name);
+	const char *field_name = hist_field_name(hist_field, 0);
+
+	seq_printf(m, "%s", field_name);
 	if (hist_field->flags) {
 		const char *flags_str = get_hist_field_flags(hist_field);
 
-- 
cgit v1.2.3


From 5819eaddf35b24d628ddfa4fbb5f8d4026e44b96 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:23 -0500
Subject: tracing: Reimplement log2

log2 as currently implemented applies only to u64 trace_event_field
derived fields, and assumes that anything it's applied to is a u64
field.

To prepare for synthetic fields like latencies, log2 should be
applicable to those as well, so take the opportunity now to fix the
current problems as well as expand to more general uses.

log2 should be thought of as a chaining function rather than a field
type.  To enable this as well as possible future function
implementations, add a hist_field operand array into the hist_field
definition for this purpose, and make use of it to implement the log2
'function'.

Link: http://lkml.kernel.org/r/b47f93fc0b87b36eccf716b0c018f3a71e1f1111.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 34edf5fd85fd..1e1558c99d56 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -28,12 +28,16 @@ struct hist_field;
 
 typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
 
+#define HIST_FIELD_OPERANDS_MAX	2
+
 struct hist_field {
 	struct ftrace_event_field	*field;
 	unsigned long			flags;
 	hist_field_fn_t			fn;
 	unsigned int			size;
 	unsigned int			offset;
+	unsigned int                    is_signed;
+	struct hist_field		*operands[HIST_FIELD_OPERANDS_MAX];
 };
 
 static u64 hist_field_none(struct hist_field *field, void *event)
@@ -71,7 +75,9 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
 
 static u64 hist_field_log2(struct hist_field *hist_field, void *event)
 {
-	u64 val = *(u64 *)(event + hist_field->field->offset);
+	struct hist_field *operand = hist_field->operands[0];
+
+	u64 val = operand->fn(operand, event);
 
 	return (u64) ilog2(roundup_pow_of_two(val));
 }
@@ -156,6 +162,8 @@ static const char *hist_field_name(struct hist_field *field,
 
 	if (field->field)
 		field_name = field->field->name;
+	else if (field->flags & HIST_FIELD_FL_LOG2)
+		field_name = hist_field_name(field->operands[0], ++level);
 
 	if (field_name == NULL)
 		field_name = "";
@@ -357,8 +365,20 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
 	.elt_init	= hist_trigger_elt_comm_init,
 };
 
-static void destroy_hist_field(struct hist_field *hist_field)
+static void destroy_hist_field(struct hist_field *hist_field,
+			       unsigned int level)
 {
+	unsigned int i;
+
+	if (level > 2)
+		return;
+
+	if (!hist_field)
+		return;
+
+	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
+		destroy_hist_field(hist_field->operands[i], level + 1);
+
 	kfree(hist_field);
 }
 
@@ -385,7 +405,10 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 	}
 
 	if (flags & HIST_FIELD_FL_LOG2) {
+		unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
 		hist_field->fn = hist_field_log2;
+		hist_field->operands[0] = create_hist_field(field, fl);
+		hist_field->size = hist_field->operands[0]->size;
 		goto out;
 	}
 
@@ -405,7 +428,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		hist_field->fn = select_value_fn(field->size,
 						 field->is_signed);
 		if (!hist_field->fn) {
-			destroy_hist_field(hist_field);
+			destroy_hist_field(hist_field, 0);
 			return NULL;
 		}
 	}
@@ -422,7 +445,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
 
 	for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
 		if (hist_data->fields[i]) {
-			destroy_hist_field(hist_data->fields[i]);
+			destroy_hist_field(hist_data->fields[i], 0);
 			hist_data->fields[i] = NULL;
 		}
 	}
-- 
cgit v1.2.3


From b35bd0d9f8a8ea17aae40893e18274d191a2d2c5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 30 Sep 2017 23:39:05 -0600
Subject: sysctl: remove /proc/sys/vm/nr_pdflush_threads

This tunable has been obsolete since 2.6.32, and writes to the
file have been failing and complaining in dmesg since then:

nr_pdflush_threads exported in /proc is scheduled for removal

That was 8 years ago. Remove the file ABI obsolete notice, and
the sysfs file.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads | 5 -----
 kernel/sysctl.c                                           | 5 -----
 2 files changed, 10 deletions(-)
 delete mode 100644 Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads

(limited to 'kernel')

diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
deleted file mode 100644
index b0b0eeb20fe3..000000000000
--- a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
+++ /dev/null
@@ -1,5 +0,0 @@
-What:		/proc/sys/vm/nr_pdflush_threads
-Date:		June 2012
-Contact:	Wanpeng Li <liwp@linux.vnet.ibm.com>
-Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush
-             exported in /proc/sys/vm/ should be removed.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..a5dd8d82c253 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1344,11 +1344,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= dirtytime_interval_handler,
 		.extra1		= &zero,
 	},
-	{
-		.procname       = "nr_pdflush_threads",
-		.mode           = 0444 /* read-only */,
-		.proc_handler   = pdflush_proc_obsolete,
-	},
 	{
 		.procname	= "swappiness",
 		.data		= &vm_swappiness,
-- 
cgit v1.2.3


From 6cafbe159416822f6d3dfd711bf4c39050c650ba Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 20 Jun 2017 10:44:58 -0400
Subject: ftrace: Add a ftrace_free_mem() function for modules to use

In order to be able to trace module init functions, the module code needs to
tell ftrace what is being freed when the init sections are freed. Use the
code that the main init calls to tell ftrace to free the main init sections.
This requires passing in a start and end address to free.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  2 ++
 kernel/trace/ftrace.c  | 14 +++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 2e028854bac7..47fc404ad233 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -151,8 +151,10 @@ struct ftrace_ops_hash {
 };
 
 void ftrace_free_init_mem(void);
+void ftrace_free_mem(void *start, void *end);
 #else
 static inline void ftrace_free_init_mem(void) { }
+static inline void ftrace_free_mem(void *start, void *end) { }
 #endif
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6abfafd7f173..84cb5928665a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5868,10 +5868,10 @@ void ftrace_module_init(struct module *mod)
 }
 #endif /* CONFIG_MODULES */
 
-void __init ftrace_free_init_mem(void)
+void ftrace_free_mem(void *start_ptr, void *end_ptr)
 {
-	unsigned long start = (unsigned long)(&__init_begin);
-	unsigned long end = (unsigned long)(&__init_end);
+	unsigned long start = (unsigned long)(start_ptr);
+	unsigned long end = (unsigned long)(end_ptr);
 	struct ftrace_page **last_pg = &ftrace_pages_start;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
@@ -5913,6 +5913,14 @@ void __init ftrace_free_init_mem(void)
 	mutex_unlock(&ftrace_lock);
 }
 
+void __init ftrace_free_init_mem(void)
+{
+	void *start = (void *)(&__init_begin);
+	void *end = (void *)(&__init_end);
+
+	ftrace_free_mem(start, end);
+}
+
 void __init ftrace_init(void)
 {
 	extern unsigned long __start_mcount_loc[];
-- 
cgit v1.2.3


From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:21 -0700
Subject: bpf: multi program support for cgroup+bpf

introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple
bpf programs to a cgroup.

The difference between three possible flags for BPF_PROG_ATTACH command:
- NONE(default): No further bpf programs allowed in the subtree.
- BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
  the program in this cgroup yields to sub-cgroup program.
- BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
  that cgroup program gets run in addition to the program in this cgroup.

NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't
change their behavior. It only clarifies the semantics in relation
to new flag.

Only one program is allowed to be attached to a cgroup with
NONE or BPF_F_ALLOW_OVERRIDE flag.
Multiple programs are allowed to be attached to a cgroup with
BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
(those that were attached first, run first)
The programs of sub-cgroup are executed first, then programs of
this cgroup and then programs of parent cgroup.
All eligible programs are executed regardless of return code from
earlier programs.

To allow efficient execution of multiple programs attached to a cgroup
and to avoid penalizing cgroups without any programs attached
introduce 'struct bpf_prog_array' which is RCU protected array
of pointers to bpf programs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h |  46 +++--
 include/linux/bpf.h        |  32 ++++
 include/linux/filter.h     |   2 +-
 include/uapi/linux/bpf.h   |  42 +++-
 kernel/bpf/cgroup.c        | 467 ++++++++++++++++++++++++++++++++-------------
 kernel/bpf/core.c          |  31 +++
 kernel/bpf/syscall.c       |  37 ++--
 kernel/cgroup/cgroup.c     |  28 ++-
 8 files changed, 516 insertions(+), 169 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d41d40ac3efd..102e56fbb6de 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -14,27 +14,42 @@ struct bpf_sock_ops_kern;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+struct bpf_prog_list {
+	struct list_head node;
+	struct bpf_prog *prog;
+};
+
+struct bpf_prog_array;
+
 struct cgroup_bpf {
-	/*
-	 * Store two sets of bpf_prog pointers, one for programs that are
-	 * pinned directly to this cgroup, and one for those that are effective
-	 * when this cgroup is accessed.
+	/* array of effective progs in this cgroup */
+	struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
+
+	/* attached progs to this cgroup and attach flags
+	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
+	 * have either zero or one element
+	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
 	 */
-	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
-	struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
-	bool disallow_override[MAX_BPF_ATTACH_TYPE];
+	struct list_head progs[MAX_BPF_ATTACH_TYPE];
+	u32 flags[MAX_BPF_ATTACH_TYPE];
+
+	/* temp storage for effective prog array used by prog_attach/detach */
+	struct bpf_prog_array __rcu *inactive;
 };
 
 void cgroup_bpf_put(struct cgroup *cgrp);
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+int cgroup_bpf_inherit(struct cgroup *cgrp);
 
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool overridable);
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
 
-/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable);
+/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
@@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 
 struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
-static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
-				      struct cgroup *parent) {}
+static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 252f4bc9eb25..a6964b75f070 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -241,6 +241,38 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr);
 
+/* an array of programs to be executed under rcu_lock.
+ *
+ * Typical usage:
+ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
+ *
+ * the structure returned by bpf_prog_array_alloc() should be populated
+ * with program pointers and the last pointer must be NULL.
+ * The user has to keep refcnt on the program and make sure the program
+ * is removed from the array before bpf_prog_put().
+ * The 'struct bpf_prog_array *' should only be replaced with xchg()
+ * since other cpus are walking the array of pointers in parallel.
+ */
+struct bpf_prog_array {
+	struct rcu_head rcu;
+	struct bpf_prog *progs[0];
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog **_prog;		\
+		u32 _ret = 1;				\
+		rcu_read_lock();			\
+		_prog = rcu_dereference(array)->progs;	\
+		for (; *_prog; _prog++)			\
+			_ret &= func(*_prog, ctx);	\
+		rcu_read_unlock();			\
+		_ret;					\
+	 })
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 911d454af107..2d2db394b0ca 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -481,7 +481,7 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)
+#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6d2137b4cf38..762f74bc6c47 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,11 +143,47 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
-/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
- * to the given target_fd cgroup the descendent cgroup will be able to
- * override effective bpf program that was inherited from this cgroup
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags has to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When children program makes decision (like picking TCP CA or sock bind)
+ * parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will perform strict alignment checking as if the kernel
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..6b7500bbdb53 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 {
 	unsigned int type;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
-		struct bpf_prog *prog = cgrp->bpf.prog[type];
-
-		if (prog) {
-			bpf_prog_put(prog);
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+		struct list_head *progs = &cgrp->bpf.progs[type];
+		struct bpf_prog_list *pl, *tmp;
+
+		list_for_each_entry_safe(pl, tmp, progs, node) {
+			list_del(&pl->node);
+			bpf_prog_put(pl->prog);
+			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
+		bpf_prog_array_free(cgrp->bpf.effective[type]);
+	}
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+	struct bpf_prog_list *pl;
+	u32 cnt = 0;
+
+	list_for_each_entry(pl, head, node) {
+		if (!pl->prog)
+			continue;
+		cnt++;
 	}
+	return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+				    enum bpf_attach_type type,
+				    u32 new_flags)
+{
+	struct cgroup *p;
+
+	p = cgroup_parent(cgrp);
+	if (!p)
+		return true;
+	do {
+		u32 flags = p->bpf.flags[type];
+		u32 cnt;
+
+		if (flags & BPF_F_ALLOW_MULTI)
+			return true;
+		cnt = prog_list_length(&p->bpf.progs[type]);
+		WARN_ON_ONCE(cnt > 1);
+		if (cnt == 1)
+			return !!(flags & BPF_F_ALLOW_OVERRIDE);
+		p = cgroup_parent(p);
+	} while (p);
+	return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+				   enum bpf_attach_type type,
+				   struct bpf_prog_array __rcu **array)
+{
+	struct bpf_prog_array __rcu *progs;
+	struct bpf_prog_list *pl;
+	struct cgroup *p = cgrp;
+	int cnt = 0;
+
+	/* count number of effective programs by walking parents */
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			cnt += prog_list_length(&p->bpf.progs[type]);
+		p = cgroup_parent(p);
+	} while (p);
+
+	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+	if (!progs)
+		return -ENOMEM;
+
+	/* populate the array with effective progs */
+	cnt = 0;
+	p = cgrp;
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			list_for_each_entry(pl,
+					    &p->bpf.progs[type], node) {
+				if (!pl->prog)
+					continue;
+				rcu_dereference_protected(progs, 1)->
+					progs[cnt++] = pl->prog;
+			}
+		p = cgroup_parent(p);
+	} while (p);
+
+	*array = progs;
+	return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+				     enum bpf_attach_type type,
+				     struct bpf_prog_array __rcu *array)
+{
+	struct bpf_prog_array __rcu *old_array;
+
+	old_array = xchg(&cgrp->bpf.effective[type], array);
+	/* free prog array after grace period, since __cgroup_bpf_run_*()
+	 * might be still walking the array
+	 */
+	bpf_prog_array_free(old_array);
 }
 
 /**
  * cgroup_bpf_inherit() - inherit effective programs from parent
  * @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
  */
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
 {
-	unsigned int type;
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define	NR ARRAY_SIZE(cgrp->bpf.effective)
+	struct bpf_prog_array __rcu *arrays[NR] = {};
+	int i;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
-		struct bpf_prog *e;
+	for (i = 0; i < NR; i++)
+		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
 
-		e = rcu_dereference_protected(parent->bpf.effective[type],
-					      lockdep_is_held(&cgroup_mutex));
-		rcu_assign_pointer(cgrp->bpf.effective[type], e);
-		cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
-	}
+	for (i = 0; i < NR; i++)
+		if (compute_effective_progs(cgrp, i, &arrays[i]))
+			goto cleanup;
+
+	for (i = 0; i < NR; i++)
+		activate_effective_progs(cgrp, i, arrays[i]);
+
+	return 0;
+cleanup:
+	for (i = 0; i < NR; i++)
+		bpf_prog_array_free(arrays[i]);
+	return -ENOMEM;
 }
 
+#define BPF_CGROUP_MAX_PROGS 64
+
 /**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
  *                         propagate the change to descendants
  * @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
  *
  * Must be called with cgroup_mutex held.
  */
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags)
 {
-	struct bpf_prog *old_prog, *effective = NULL;
-	struct cgroup_subsys_state *pos;
-	bool overridable = true;
-
-	if (parent) {
-		overridable = !parent->bpf.disallow_override[type];
-		effective = rcu_dereference_protected(parent->bpf.effective[type],
-						      lockdep_is_held(&cgroup_mutex));
-	}
-
-	if (prog && effective && !overridable)
-		/* if parent has non-overridable prog attached, disallow
-		 * attaching new programs to descendent cgroup
-		 */
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	bool pl_was_allocated;
+	u32 old_flags;
+	int err;
+
+	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+		/* invalid combination */
+		return -EINVAL;
+
+	if (!hierarchy_allows_attach(cgrp, type, flags))
 		return -EPERM;
 
-	if (prog && effective && overridable != new_overridable)
-		/* if parent has overridable prog attached, only
-		 * allow overridable programs in descendent cgroup
+	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+		/* Disallow attaching non-overridable on top
+		 * of existing overridable in this cgroup.
+		 * Disallow attaching multi-prog if overridable or none
 		 */
 		return -EPERM;
 
-	old_prog = cgrp->bpf.prog[type];
-
-	if (prog) {
-		overridable = new_overridable;
-		effective = prog;
-		if (old_prog &&
-		    cgrp->bpf.disallow_override[type] == new_overridable)
-			/* disallow attaching non-overridable on top
-			 * of existing overridable in this cgroup
-			 * and vice versa
-			 */
-			return -EPERM;
+	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+		return -E2BIG;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		list_for_each_entry(pl, progs, node)
+			if (pl->prog == prog)
+				/* disallow attaching the same prog twice */
+				return -EINVAL;
+
+		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+		if (!pl)
+			return -ENOMEM;
+		pl_was_allocated = true;
+		pl->prog = prog;
+		list_add_tail(&pl->node, progs);
+	} else {
+		if (list_empty(progs)) {
+			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+			if (!pl)
+				return -ENOMEM;
+			pl_was_allocated = true;
+			list_add_tail(&pl->node, progs);
+		} else {
+			pl = list_first_entry(progs, typeof(*pl), node);
+			old_prog = pl->prog;
+			pl_was_allocated = false;
+		}
+		pl->prog = prog;
 	}
 
-	if (!prog && !old_prog)
-		/* report error when trying to detach and nothing is attached */
-		return -ENOENT;
+	old_flags = cgrp->bpf.flags[type];
+	cgrp->bpf.flags[type] = flags;
 
-	cgrp->bpf.prog[type] = prog;
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
-	css_for_each_descendant_pre(pos, &cgrp->self) {
-		struct cgroup *desc = container_of(pos, struct cgroup, self);
-
-		/* skip the subtree if the descendant has its own program */
-		if (desc->bpf.prog[type] && desc != cgrp) {
-			pos = css_rightmost_descendant(pos);
-		} else {
-			rcu_assign_pointer(desc->bpf.effective[type],
-					   effective);
-			desc->bpf.disallow_override[type] = !overridable;
-		}
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
 	}
 
-	if (prog)
-		static_branch_inc(&cgroup_bpf_enabled_key);
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	static_branch_inc(&cgroup_bpf_enabled_key);
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
 	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and cleanup the prog list */
+	pl->prog = old_prog;
+	if (pl_was_allocated) {
+		list_del(&pl->node);
+		kfree(pl);
+	}
+	return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 unused_flags)
+{
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	int err;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		if (!prog)
+			/* to detach MULTI prog the user has to specify valid FD
+			 * of the program to be detached
+			 */
+			return -EINVAL;
+	} else {
+		if (list_empty(progs))
+			/* report error when trying to detach and nothing is attached */
+			return -ENOENT;
+	}
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		/* find the prog and detach it */
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog != prog)
+				continue;
+			old_prog = prog;
+			/* mark it deleted, so it's ignored while
+			 * recomputing effective
+			 */
+			pl->prog = NULL;
+			break;
+		}
+		if (!old_prog)
+			return -ENOENT;
+	} else {
+		/* to maintain backward compatibility NONE and OVERRIDE cgroups
+		 * allow detaching with invalid FD (prog==NULL)
+		 */
+		pl = list_first_entry(progs, typeof(*pl), node);
+		old_prog = pl->prog;
+		pl->prog = NULL;
+	}
+
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
+	}
+
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* now can actually delete it from this cgroup list */
+	list_del(&pl->node);
+	kfree(pl);
+	if (list_empty(progs))
+		/* last program was detached, reset flags to zero */
+		cgrp->bpf.flags[type] = 0;
+
+	bpf_prog_put(old_prog);
+	static_branch_dec(&cgroup_bpf_enabled_key);
+	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and restore back old_prog */
+	pl->prog = old_prog;
+	return err;
 }
 
 /**
@@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type)
 {
-	struct bpf_prog *prog;
+	unsigned int offset = skb->data - skb_network_header(skb);
+	struct sock *save_sk;
 	struct cgroup *cgrp;
-	int ret = 0;
+	int ret;
 
 	if (!sk || !sk_fullsock(sk))
 		return 0;
 
-	if (sk->sk_family != AF_INET &&
-	    sk->sk_family != AF_INET6)
+	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog) {
-		unsigned int offset = skb->data - skb_network_header(skb);
-		struct sock *save_sk = skb->sk;
-
-		skb->sk = sk;
-		__skb_push(skb, offset);
-		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
-		__skb_pull(skb, offset);
-		skb->sk = save_sk;
-	}
-
-	rcu_read_unlock();
-
-	return ret;
+	save_sk = skb->sk;
+	skb->sk = sk;
+	__skb_push(skb, offset);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+				 bpf_prog_run_save_cb);
+	__skb_pull(skb, offset);
+	skb->sk = save_sk;
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 
@@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
-
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+	int ret;
 
-	rcu_read_unlock();
-
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
-
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
-
-	rcu_read_unlock();
+	int ret;
 
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+				 BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..6b49e1991ae7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+	struct bpf_prog_array hdr;
+	struct bpf_prog *null_prog;
+} empty_prog_array = {
+	.null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+	if (prog_cnt)
+		return kzalloc(sizeof(struct bpf_prog_array) +
+			       sizeof(struct bpf_prog *) * (prog_cnt + 1),
+			       flags);
+
+	return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+	if (!progs ||
+	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+		return;
+	kfree_rcu(progs, rcu);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b927da66f653..51bee695d32c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
 	return 0;
 }
 
+#define BPF_F_ATTACH_MASK \
+	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
@@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
 		return -EINVAL;
 
-	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
 		return -EINVAL;
 
 	switch (attr->attach_type) {
@@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return PTR_ERR(cgrp);
 	}
 
-	ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
-				attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
 	cgroup_put(cgrp);
@@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
+	enum bpf_prog_type ptype;
+	struct bpf_prog *prog;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
+		break;
 	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	case BPF_CGROUP_SOCK_OPS:
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp))
-			return PTR_ERR(cgrp);
-
-		ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		ret = sockmap_get_from_fd(attr, false);
-		break;
+		return sockmap_get_from_fd(attr, false);
 	default:
 		return -EINVAL;
 	}
 
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+	cgroup_put(cgrp);
 	return ret;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..57eb866ae78d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 	if (ret)
 		goto destroy_root;
 
+	ret = cgroup_bpf_inherit(root_cgrp);
+	WARN_ON_ONCE(ret);
+
 	trace_cgroup_setup_root(root);
 
 	/*
@@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->self.parent = &parent->self;
 	cgrp->root = root;
 	cgrp->level = level;
+	ret = cgroup_bpf_inherit(cgrp);
+	if (ret)
+		goto out_idr_free;
 
 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
-	if (parent)
-		cgroup_bpf_inherit(cgrp, parent);
-
 	cgroup_propagate_control(cgrp);
 
 	return cgrp;
 
+out_idr_free:
+	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
 
 #ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
 {
-	struct cgroup *parent = cgroup_parent(cgrp);
 	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+	ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
-- 
cgit v1.2.3


From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:22 -0700
Subject: bpf: introduce BPF_PROG_QUERY command

introduce BPF_PROG_QUERY command to retrieve a set of either
attached programs to given cgroup or a set of effective programs
that will execute for events within a cgroup

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h |  4 ++++
 include/linux/bpf.h        |  3 +++
 include/uapi/linux/bpf.h   | 13 +++++++++++++
 kernel/bpf/cgroup.c        | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c          | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       | 34 ++++++++++++++++++++++++++++++++++
 kernel/cgroup/cgroup.c     | 10 ++++++++++
 7 files changed, 148 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 102e56fbb6de..359b6f5d3d90 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
 
 /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
 int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a6964b75f070..a67daea731ab 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -260,6 +260,9 @@ struct bpf_prog_array {
 
 struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt);
 
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
 	({						\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 762f74bc6c47..cb2b9f95160a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_cmd {
 	BPF_PROG_GET_FD_BY_ID,
 	BPF_MAP_GET_FD_BY_ID,
 	BPF_OBJ_GET_INFO_BY_FD,
+	BPF_PROG_QUERY,
 };
 
 enum bpf_map_type {
@@ -211,6 +212,9 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
 #define BPF_OBJ_NAME_LEN 16U
 
 union bpf_attr {
@@ -289,6 +293,15 @@ union bpf_attr {
 		__u32		info_len;
 		__aligned_u64	info;
 	} info;
+
+	struct { /* anonymous struct used by BPF_PROG_QUERY command */
+		__u32		target_fd;	/* container object to query */
+		__u32		attach_type;
+		__u32		query_flags;
+		__u32		attach_flags;
+		__aligned_u64	prog_ids;
+		__u32		prog_cnt;
+	} query;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6b7500bbdb53..e88abc0865d5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -384,6 +384,52 @@ cleanup:
 	return err;
 }
 
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	enum bpf_attach_type type = attr->query.attach_type;
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	int cnt, ret = 0, i;
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+	else
+		cnt = prog_list_length(progs);
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+		return -EFAULT;
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+		return -EFAULT;
+	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+		/* return early if user requested only program count + flags */
+		return 0;
+	if (attr->query.prog_cnt < cnt) {
+		cnt = attr->query.prog_cnt;
+		ret = -ENOSPC;
+	}
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+						   prog_ids, cnt);
+	} else {
+		struct bpf_prog_list *pl;
+		u32 id;
+
+		i = 0;
+		list_for_each_entry(pl, progs, node) {
+			id = pl->prog->aux->id;
+			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+				return -EFAULT;
+			if (++i == cnt)
+				break;
+		}
+	}
+	return ret;
+}
+
 /**
  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socket sending or receiving traffic
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6b49e1991ae7..eba966c09053 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
 	kfree_rcu(progs, rcu);
 }
 
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+	struct bpf_prog **prog;
+	u32 cnt = 0;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++)
+		cnt++;
+	rcu_read_unlock();
+	return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt)
+{
+	struct bpf_prog **prog;
+	u32 i = 0, id;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++) {
+		id = (*prog)->aux->id;
+		if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+			rcu_read_unlock();
+			return -EFAULT;
+		}
+		if (++i == cnt) {
+			prog++;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (*prog)
+		return -ENOSPC;
+	return 0;
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 51bee695d32c..0048cb24ba7b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (CHECK_ATTR(BPF_PROG_QUERY))
+		return -EINVAL;
+	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+		return -EINVAL;
+
+	switch (attr->query.attach_type) {
+	case BPF_CGROUP_INET_INGRESS:
+	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_SOCK_OPS:
+		break;
+	default:
+		return -EINVAL;
+	}
+	cgrp = cgroup_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+	ret = cgroup_bpf_query(cgrp, attr, uattr);
+	cgroup_put(cgrp);
+	return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1568,6 +1599,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_DETACH:
 		err = bpf_prog_detach(&attr);
 		break;
+	case BPF_PROG_QUERY:
+		err = bpf_prog_query(&attr, uattr);
+		break;
 #endif
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 57eb866ae78d..269512b94a94 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_query(cgrp, attr, uattr);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
-- 
cgit v1.2.3


From 390ee7e29fc8e6e90d3065b00afb047c4ee552f9 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:23 -0700
Subject: bpf: enforce return code for cgroup-bpf programs

with addition of tnum logic the verifier got smart enough and
we can enforce return codes at program load time.
For now do so for cgroup-bpf program types.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c                       | 40 ++++++++++++++++
 tools/testing/selftests/bpf/test_verifier.c | 72 +++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4cf9b72c59a0..52b022310f6a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3073,6 +3073,43 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static int check_return_code(struct bpf_verifier_env *env)
+{
+	struct bpf_reg_state *reg;
+	struct tnum range = tnum_range(0, 1);
+
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_CGROUP_SKB:
+	case BPF_PROG_TYPE_CGROUP_SOCK:
+	case BPF_PROG_TYPE_SOCK_OPS:
+		break;
+	default:
+		return 0;
+	}
+
+	reg = &env->cur_state.regs[BPF_REG_0];
+	if (reg->type != SCALAR_VALUE) {
+		verbose("At program exit the register R0 is not a known value (%s)\n",
+			reg_type_str[reg->type]);
+		return -EINVAL;
+	}
+
+	if (!tnum_in(range, reg->var_off)) {
+		verbose("At program exit the register R0 ");
+		if (!tnum_is_unknown(reg->var_off)) {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose("has value %s", tn_buf);
+		} else {
+			verbose("has unknown scalar value");
+		}
+		verbose(" should have been 0 or 1\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /* non-recursive DFS pseudo code
  * 1  procedure DFS-iterative(G,v):
  * 2      label v as discovered
@@ -3863,6 +3900,9 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EACCES;
 				}
 
+				err = check_return_code(env);
+				if (err)
+					return err;
 process_bpf_exit:
 				insn_idx = pop_stack(env, &prev_insn_idx);
 				if (insn_idx < 0) {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 290d5056c165..cc91d0159f43 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -6892,6 +6892,78 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"bpf_exit with invalid return code. test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0xffffffff)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0x3)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test5",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x2; 0x0)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test6",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 is not a known value (ctx)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test7",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
+			BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has unknown scalar value",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit v1.2.3


From 3e234289f86b12985ef8909cd34525fcb66c4efb Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 3 Mar 2017 18:00:22 -0500
Subject: ftrace: Allow module init functions to be traced

Allow for module init sections to be traced as well as core kernel init
sections. Now that filtering modules functions can be stored, for when they
are loaded, it makes sense to be able to trace them.

Cc: Jessica Yu <jeyu@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/init.h  | 4 +---
 kernel/module.c       | 2 ++
 kernel/trace/ftrace.c | 6 ++++--
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init.h b/include/linux/init.h
index 94769d687cf0..a779c1816437 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -39,7 +39,7 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(.init.text) __cold __inittrace __latent_entropy
+#define __init		__section(.init.text) __cold  __latent_entropy
 #define __initdata	__section(.init.data)
 #define __initconst	__section(.init.rodata)
 #define __exitdata	__section(.exit.data)
@@ -68,10 +68,8 @@
 
 #ifdef MODULE
 #define __exitused
-#define __inittrace notrace
 #else
 #define __exitused  __used
-#define __inittrace
 #endif
 
 #define __exit          __section(.exit.text) __exitused __cold notrace
diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..58bca427ac3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3473,6 +3473,8 @@ static noinline int do_init_module(struct module *mod)
 	if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
 		async_synchronize_full();
 
+	ftrace_free_mem(mod->init_layout.base, mod->init_layout.base +
+			mod->init_layout.size);
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 84cb5928665a..d7297e866e4a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5752,7 +5752,8 @@ void ftrace_release_mod(struct module *mod)
 	last_pg = &ftrace_pages_start;
 	for (pg = ftrace_pages_start; pg; pg = *last_pg) {
 		rec = &pg->records[0];
-		if (within_module_core(rec->ip, mod)) {
+		if (within_module_core(rec->ip, mod) ||
+		    within_module_init(rec->ip, mod)) {
 			/*
 			 * As core pages are first, the first
 			 * page should never be a module page.
@@ -5821,7 +5822,8 @@ void ftrace_module_enable(struct module *mod)
 		 * not part of this module, then skip this pg,
 		 * which the "break" will do.
 		 */
-		if (!within_module_core(rec->ip, mod))
+		if (!within_module_core(rec->ip, mod) &&
+		    !within_module_init(rec->ip, mod))
 			break;
 
 		cnt = 0;
-- 
cgit v1.2.3


From aba4b5c22cbac296f4081a0476d0c55828f135b4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 1 Sep 2017 08:35:38 -0400
Subject: ftrace: Save module init functions kallsyms symbols for tracing

If function tracing is active when the module init functions are freed, then
store them to be referenced by kallsyms. As module init functions can now be
traced on module load, they were useless:

 ># echo ':mod:snd_seq' > set_ftrace_filter
 ># echo function > current_tracer
 ># modprobe snd_seq
 ># cat trace
 # tracer: function
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
         modprobe-2786  [000] ....  3189.037874: 0xffffffffa0860000 <-do_one_initcall
         modprobe-2786  [000] ....  3189.037876: 0xffffffffa086004d <-0xffffffffa086000f
         modprobe-2786  [000] ....  3189.037876: 0xffffffffa086010d <-0xffffffffa0860018
         modprobe-2786  [000] ....  3189.037877: 0xffffffffa086011a <-0xffffffffa0860021
         modprobe-2786  [000] ....  3189.037877: 0xffffffffa0860080 <-0xffffffffa086002a
         modprobe-2786  [000] ....  3189.039523: 0xffffffffa0860400 <-0xffffffffa0860033
         modprobe-2786  [000] ....  3189.039523: 0xffffffffa086038a <-0xffffffffa086041c
         modprobe-2786  [000] ....  3189.039591: 0xffffffffa086038a <-0xffffffffa0860436
         modprobe-2786  [000] ....  3189.039657: 0xffffffffa086038a <-0xffffffffa0860450
         modprobe-2786  [000] ....  3189.039719: 0xffffffffa0860127 <-0xffffffffa086003c
         modprobe-2786  [000] ....  3189.039742: snd_seq_create_kernel_client <-0xffffffffa08601f6

When the output is shown, the kallsyms for the module init functions have
already been freed, and the output of the trace can not convert them to
their function names.

Now this looks like this:

 # tracer: function
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
         modprobe-2463  [002] ....   174.243237: alsa_seq_init <-do_one_initcall
         modprobe-2463  [002] ....   174.243239: client_init_data <-alsa_seq_init
         modprobe-2463  [002] ....   174.243240: snd_sequencer_memory_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.243240: snd_seq_queues_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.243240: snd_sequencer_device_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.244860: snd_seq_info_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.244861: create_info_entry <-snd_seq_info_init
         modprobe-2463  [002] ....   174.244936: create_info_entry <-snd_seq_info_init
         modprobe-2463  [002] ....   174.245003: create_info_entry <-snd_seq_info_init
         modprobe-2463  [002] ....   174.245072: snd_seq_system_client_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.245094: snd_seq_create_kernel_client <-snd_seq_system_client_init

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  20 ++++++-
 kernel/kallsyms.c      |   5 ++
 kernel/module.c        |   2 +-
 kernel/trace/ftrace.c  | 146 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 168 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 47fc404ad233..202b40784c4e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -51,6 +51,21 @@ static inline void early_trace_init(void) { }
 struct module;
 struct ftrace_hash;
 
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \
+	defined(CONFIG_DYNAMIC_FTRACE)
+const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym);
+#else
+static inline const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	return NULL;
+}
+#endif
+
+
 #ifdef CONFIG_FUNCTION_TRACER
 
 extern int ftrace_enabled;
@@ -151,10 +166,10 @@ struct ftrace_ops_hash {
 };
 
 void ftrace_free_init_mem(void);
-void ftrace_free_mem(void *start, void *end);
+void ftrace_free_mem(struct module *mod, void *start, void *end);
 #else
 static inline void ftrace_free_init_mem(void) { }
-static inline void ftrace_free_mem(void *start, void *end) { }
+static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { }
 #endif
 
 /*
@@ -272,6 +287,7 @@ static inline int ftrace_nr_registered_ops(void)
 static inline void clear_ftrace_function(void) { }
 static inline void ftrace_kill(void) { }
 static inline void ftrace_free_init_mem(void) { }
+static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_STACK_TRACER
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 127e7cfafa55..976ecb9275d9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/filter.h>
+#include <linux/ftrace.h>
 #include <linux/compiler.h>
 
 #include <asm/sections.h>
@@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr,
 	if (!ret)
 		ret = bpf_address_lookup(addr, symbolsize,
 					 offset, modname, namebuf);
+
+	if (!ret)
+		ret = ftrace_mod_address_lookup(addr, symbolsize,
+						offset, modname, namebuf);
 	return ret;
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index 58bca427ac3f..279a469dc375 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3473,7 +3473,7 @@ static noinline int do_init_module(struct module *mod)
 	if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
 		async_synchronize_full();
 
-	ftrace_free_mem(mod->init_layout.base, mod->init_layout.base +
+	ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
 			mod->init_layout.size);
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d7297e866e4a..86dbbfb353db 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5675,6 +5675,21 @@ static int ftrace_process_locs(struct module *mod,
 	return ret;
 }
 
+struct ftrace_mod_func {
+	struct list_head	list;
+	char			*name;
+	unsigned long		ip;
+	unsigned int		size;
+};
+
+struct ftrace_mod_map {
+	struct list_head	list;
+	struct module		*mod;
+	unsigned long		start_addr;
+	unsigned long		end_addr;
+	struct list_head	funcs;
+};
+
 #ifdef CONFIG_MODULES
 
 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -5868,9 +5883,123 @@ void ftrace_module_init(struct module *mod)
 	ftrace_process_locs(mod, mod->ftrace_callsites,
 			    mod->ftrace_callsites + mod->num_ftrace_callsites);
 }
+
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+				struct dyn_ftrace *rec)
+{
+	struct ftrace_mod_func *mod_func;
+	unsigned long symsize;
+	unsigned long offset;
+	char str[KSYM_SYMBOL_LEN];
+	char *modname;
+	const char *ret;
+
+	ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str);
+	if (!ret)
+		return;
+
+	mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL);
+	if (!mod_func)
+		return;
+
+	mod_func->name = kstrdup(str, GFP_KERNEL);
+	if (!mod_func->name) {
+		kfree(mod_func);
+		return;
+	}
+
+	mod_func->ip = rec->ip - offset;
+	mod_func->size = symsize;
+
+	list_add_rcu(&mod_func->list, &mod_map->funcs);
+}
+
+static LIST_HEAD(ftrace_mod_maps);
+
+static struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+			unsigned long start, unsigned long end)
+{
+	struct ftrace_mod_map *mod_map;
+
+	mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL);
+	if (!mod_map)
+		return NULL;
+
+	mod_map->mod = mod;
+	mod_map->start_addr = start;
+	mod_map->end_addr = end;
+
+	INIT_LIST_HEAD_RCU(&mod_map->funcs);
+
+	list_add_rcu(&mod_map->list, &ftrace_mod_maps);
+
+	return mod_map;
+}
+
+static const char *
+ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
+			   unsigned long addr, unsigned long *size,
+			   unsigned long *off, char *sym)
+{
+	struct ftrace_mod_func *found_func =  NULL;
+	struct ftrace_mod_func *mod_func;
+
+	list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+		if (addr >= mod_func->ip &&
+		    addr < mod_func->ip + mod_func->size) {
+			found_func = mod_func;
+			break;
+		}
+	}
+
+	if (found_func) {
+		if (size)
+			*size = found_func->size;
+		if (off)
+			*off = addr - found_func->ip;
+		if (sym)
+			strlcpy(sym, found_func->name, KSYM_NAME_LEN);
+
+		return found_func->name;
+	}
+
+	return NULL;
+}
+
+const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	struct ftrace_mod_map *mod_map;
+	const char *ret = NULL;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+		ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
+		if (ret) {
+			if (modname)
+				*modname = mod_map->mod->name;
+			break;
+		}
+	}
+	preempt_enable();
+
+	return ret;
+}
+
+#else
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+				struct dyn_ftrace *rec) { }
+static inline struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+			unsigned long start, unsigned long end)
+{
+	return NULL;
+}
 #endif /* CONFIG_MODULES */
 
-void ftrace_free_mem(void *start_ptr, void *end_ptr)
+void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 {
 	unsigned long start = (unsigned long)(start_ptr);
 	unsigned long end = (unsigned long)(end_ptr);
@@ -5878,6 +6007,7 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr)
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	struct dyn_ftrace key;
+	struct ftrace_mod_map *mod_map = NULL;
 	int order;
 
 	key.ip = start;
@@ -5885,6 +6015,14 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr)
 
 	mutex_lock(&ftrace_lock);
 
+	/*
+	 * If we are freeing module init memory, then check if
+	 * any tracer is active. If so, we need to save a mapping of
+	 * the module functions being freed with the address.
+	 */
+	if (mod && ftrace_ops_list != &ftrace_list_end)
+		mod_map = allocate_ftrace_mod_map(mod, start, end);
+
 	for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
 		if (end < pg->records[0].ip ||
 		    start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
@@ -5895,6 +6033,10 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr)
 			      ftrace_cmp_recs);
 		if (!rec)
 			continue;
+
+		if (mod_map)
+			save_ftrace_mod_rec(mod_map, rec);
+
 		pg->index--;
 		ftrace_update_tot_cnt--;
 		if (!pg->index) {
@@ -5920,7 +6062,7 @@ void __init ftrace_free_init_mem(void)
 	void *start = (void *)(&__init_begin);
 	void *end = (void *)(&__init_end);
 
-	ftrace_free_mem(start, end);
+	ftrace_free_mem(NULL, start, end);
 }
 
 void __init ftrace_init(void)
-- 
cgit v1.2.3


From 6aa69784b43eb5f69120339938c50a97a433049f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 5 Sep 2017 19:20:16 -0400
Subject: ftrace: Add freeing algorithm to free ftrace_mod_maps

The ftrace_mod_map is a descriptor to save module init function names in
case they were traced, and the trace output needs to reference the function
name from the function address. But after the function is unloaded, it
the maps should be freed, as the rest of the function names are as well.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 86dbbfb353db..a5824408bed9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5683,6 +5683,7 @@ struct ftrace_mod_func {
 };
 
 struct ftrace_mod_map {
+	struct rcu_head		rcu;
 	struct list_head	list;
 	struct module		*mod;
 	unsigned long		start_addr;
@@ -5694,6 +5695,8 @@ struct ftrace_mod_map {
 
 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
 
+static LIST_HEAD(ftrace_mod_maps);
+
 static int referenced_filters(struct dyn_ftrace *rec)
 {
 	struct ftrace_ops *ops;
@@ -5747,8 +5750,26 @@ static void clear_mod_from_hashes(struct ftrace_page *pg)
 	mutex_unlock(&trace_types_lock);
 }
 
+static void ftrace_free_mod_map(struct rcu_head *rcu)
+{
+	struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu);
+	struct ftrace_mod_func *mod_func;
+	struct ftrace_mod_func *n;
+
+	/* All the contents of mod_map are now not visible to readers */
+	list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) {
+		kfree(mod_func->name);
+		list_del(&mod_func->list);
+		kfree(mod_func);
+	}
+
+	kfree(mod_map);
+}
+
 void ftrace_release_mod(struct module *mod)
 {
+	struct ftrace_mod_map *mod_map;
+	struct ftrace_mod_map *n;
 	struct dyn_ftrace *rec;
 	struct ftrace_page **last_pg;
 	struct ftrace_page *tmp_page = NULL;
@@ -5760,6 +5781,14 @@ void ftrace_release_mod(struct module *mod)
 	if (ftrace_disabled)
 		goto out_unlock;
 
+	list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
+		if (mod_map->mod == mod) {
+			list_del_rcu(&mod_map->list);
+			call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map);
+			break;
+		}
+	}
+
 	/*
 	 * Each module has its own ftrace_pages, remove
 	 * them from the list.
@@ -5914,8 +5943,6 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
 	list_add_rcu(&mod_func->list, &mod_map->funcs);
 }
 
-static LIST_HEAD(ftrace_mod_maps);
-
 static struct ftrace_mod_map *
 allocate_ftrace_mod_map(struct module *mod,
 			unsigned long start, unsigned long end)
@@ -5974,6 +6001,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 	struct ftrace_mod_map *mod_map;
 	const char *ret = NULL;
 
+	/* mod_map is freed via call_rcu_sched() */
 	preempt_disable();
 	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
 		ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
-- 
cgit v1.2.3


From 6171a0310a06a7a0cb83713fa7068bdd4192de19 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 6 Sep 2017 08:40:41 -0400
Subject: ftrace/kallsyms: Have /proc/kallsyms show saved mod init functions

If a module is loaded while tracing is enabled, then there's a possibility
that the module init functions were traced. These functions have their name
and address stored by ftrace such that it can translate the function address
that is written into the buffer into a human readable function name.

As userspace tools may be doing the same, they need a way to map function
names to their address as well. This is done through reading /proc/kallsyms.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  9 +++++++++
 kernel/kallsyms.c      | 38 ++++++++++++++++++++++++++++++++------
 kernel/trace/ftrace.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 202b40784c4e..346f8294e40a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -56,6 +56,9 @@ struct ftrace_hash;
 const char *
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 		   unsigned long *off, char **modname, char *sym);
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name,
+			   char *module_name, int *exported);
 #else
 static inline const char *
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
@@ -63,6 +66,12 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 {
 	return NULL;
 }
+static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+					 char *type, char *name,
+					 char *module_name, int *exported)
+{
+	return -1;
+}
 #endif
 
 
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 976ecb9275d9..1966fe1c2b57 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -479,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
 struct kallsym_iter {
 	loff_t pos;
 	loff_t pos_mod_end;
+	loff_t pos_ftrace_mod_end;
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols. */
 	char type;
@@ -501,11 +502,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	return 1;
 }
 
+static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
+{
+	int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+					 &iter->value, &iter->type,
+					 iter->name, iter->module_name,
+					 &iter->exported);
+	if (ret < 0) {
+		iter->pos_ftrace_mod_end = iter->pos;
+		return 0;
+	}
+
+	return 1;
+}
+
 static int get_ksymbol_bpf(struct kallsym_iter *iter)
 {
 	iter->module_name[0] = '\0';
 	iter->exported = 0;
-	return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+	return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
 			       &iter->value, &iter->type,
 			       iter->name) < 0 ? 0 : 1;
 }
@@ -530,20 +545,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
 	iter->name[0] = '\0';
 	iter->nameoff = get_symbol_offset(new_pos);
 	iter->pos = new_pos;
-	if (new_pos == 0)
+	if (new_pos == 0) {
 		iter->pos_mod_end = 0;
+		iter->pos_ftrace_mod_end = 0;
+	}
 }
 
 static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
 {
 	iter->pos = pos;
 
-	if (iter->pos_mod_end > 0 &&
-	    iter->pos_mod_end < iter->pos)
+	if (iter->pos_ftrace_mod_end > 0 &&
+	    iter->pos_ftrace_mod_end < iter->pos)
 		return get_ksymbol_bpf(iter);
 
-	if (!get_ksymbol_mod(iter))
-		return get_ksymbol_bpf(iter);
+	if (iter->pos_mod_end > 0 &&
+	    iter->pos_mod_end < iter->pos) {
+		if (!get_ksymbol_ftrace_mod(iter))
+			return get_ksymbol_bpf(iter);
+		return 1;
+	}
+
+	if (!get_ksymbol_mod(iter)) {
+		if (!get_ksymbol_ftrace_mod(iter))
+			return get_ksymbol_bpf(iter);
+	}
 
 	return 1;
 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a5824408bed9..9e99bd55732e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5689,6 +5689,7 @@ struct ftrace_mod_map {
 	unsigned long		start_addr;
 	unsigned long		end_addr;
 	struct list_head	funcs;
+	unsigned int		num_funcs;
 };
 
 #ifdef CONFIG_MODULES
@@ -5940,6 +5941,8 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
 	mod_func->ip = rec->ip - offset;
 	mod_func->size = symsize;
 
+	mod_map->num_funcs++;
+
 	list_add_rcu(&mod_func->list, &mod_map->funcs);
 }
 
@@ -5956,6 +5959,7 @@ allocate_ftrace_mod_map(struct module *mod,
 	mod_map->mod = mod;
 	mod_map->start_addr = start;
 	mod_map->end_addr = end;
+	mod_map->num_funcs = 0;
 
 	INIT_LIST_HEAD_RCU(&mod_map->funcs);
 
@@ -6016,6 +6020,42 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 	return ret;
 }
 
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name,
+			   char *module_name, int *exported)
+{
+	struct ftrace_mod_map *mod_map;
+	struct ftrace_mod_func *mod_func;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+
+		if (symnum >= mod_map->num_funcs) {
+			symnum -= mod_map->num_funcs;
+			continue;
+		}
+
+		list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+			if (symnum > 1) {
+				symnum--;
+				continue;
+			}
+
+			*value = mod_func->ip;
+			*type = 'T';
+			strlcpy(name, mod_func->name, KSYM_NAME_LEN);
+			strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
+			*exported = 1;
+			preempt_enable();
+			return 0;
+		}
+		WARN_ON(1);
+		break;
+	}
+	preempt_enable();
+	return -ERANGE;
+}
+
 #else
 static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
 				struct dyn_ftrace *rec) { }
-- 
cgit v1.2.3


From aaecaa0b5f31794f1711247da4b5883a6ff98163 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 5 Oct 2017 17:54:31 -0700
Subject: tracing: Prepare to add preempt and irq trace events

In preparation of adding irqsoff and preemptsoff enable and disable trace
events, move required functions and code to make it easier to add these events
in a later patch. This patch is just code movement and no functional change.
Link: http://lkml.kernel.org/r/20171006005432.14244-2-joelaf@google.com

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kernel-team@android.com
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_irqsoff.c | 100 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 74 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7758bc0617cb..0e3033c00474 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,7 @@
 
 #include "trace.h"
 
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
 static struct trace_array		*irqsoff_trace __read_mostly;
 static int				tracer_enabled __read_mostly;
 
@@ -462,64 +463,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
 
 #else /* !CONFIG_PROVE_LOCKING */
 
-/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
 /*
  * We are only interested in hardirq on/off events:
  */
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
-EXPORT_SYMBOL(trace_hardirqs_on);
 
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
-EXPORT_SYMBOL(trace_hardirqs_off);
 
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, caller_addr);
 }
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, caller_addr);
 }
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
 #endif /* CONFIG_PROVE_LOCKING */
 #endif /*  CONFIG_IRQSOFF_TRACER */
 
 #ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
 {
 	if (preempt_trace() && !irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
 {
 	if (preempt_trace() && !irq_trace())
 		start_critical_timing(a0, a1);
@@ -781,3 +762,70 @@ __init static int init_irqsoff_tracer(void)
 	return 0;
 }
 core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+void trace_hardirqs_on(void)
+{
+	tracer_hardirqs_on();
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+	tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	tracer_hardirqs_on_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	tracer_preempt_off(a0, a1);
+}
+#endif
-- 
cgit v1.2.3


From 1bd845bcb41d5b7f83745e0cb99273eb376f2ec5 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Fri, 8 Sep 2017 20:57:09 +0200
Subject: padata: set cpu_index of unused CPUs to -1

The parallel queue per-cpu data structure gets initialized only for CPUs
in the 'pcpu' CPU mask set. This is not sufficient as the reorder timer
may run on a different CPU and might wrongly decide it's the target CPU
for the next reorder item as per-cpu memory gets memset(0) and we might
be waiting for the first CPU in cpumask.pcpu, i.e. cpu_index 0.

Make the '__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index'
compare in padata_get_next() fail in this case by initializing the
cpu_index member of all per-cpu parallel queues. Use -1 for unused ones.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 868f947166d7..1b9b4bac4a9b 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -384,8 +384,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
 	struct padata_parallel_queue *pqueue;
 
 	cpu_index = 0;
-	for_each_cpu(cpu, pd->cpumask.pcpu) {
+	for_each_possible_cpu(cpu) {
 		pqueue = per_cpu_ptr(pd->pqueue, cpu);
+
+		if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+			pqueue->cpu_index = -1;
+			continue;
+		}
+
 		pqueue->pd = pd;
 		pqueue->cpu_index = cpu_index;
 		cpu_index++;
-- 
cgit v1.2.3


From cf5868c8a22dc2854b96e9569064bb92365549ca Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Fri, 8 Sep 2017 20:57:10 +0200
Subject: padata: ensure the reorder timer callback runs on the correct CPU

The reorder timer function runs on the CPU where the timer interrupt was
handled which is not necessarily one of the CPUs of the 'pcpu' CPU mask
set.

Ensure the padata_reorder() callback runs on the correct CPU, which is
one in the 'pcpu' CPU mask set and, preferrably, the next expected one.
Do so by comparing the current CPU with the expected target CPU. If they
match, call padata_reorder() right away. If they differ, schedule a work
item on the target CPU that does the padata_reorder() call for us.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |  2 ++
 kernel/padata.c        | 43 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 2f9c1f93b1ce..5c0175bbc179 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -85,6 +85,7 @@ struct padata_serial_queue {
  * @swork: work struct for serialization.
  * @pd: Backpointer to the internal control structure.
  * @work: work struct for parallelization.
+ * @reorder_work: work struct for reordering.
  * @num_obj: Number of objects that are processed by this cpu.
  * @cpu_index: Index of the cpu.
  */
@@ -93,6 +94,7 @@ struct padata_parallel_queue {
        struct padata_list    reorder;
        struct parallel_data *pd;
        struct work_struct    work;
+       struct work_struct    reorder_work;
        atomic_t              num_obj;
        int                   cpu_index;
 };
diff --git a/kernel/padata.c b/kernel/padata.c
index 1b9b4bac4a9b..b4066147bce4 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -275,11 +275,51 @@ static void padata_reorder(struct parallel_data *pd)
 	return;
 }
 
+static void invoke_padata_reorder(struct work_struct *work)
+{
+	struct padata_parallel_queue *pqueue;
+	struct parallel_data *pd;
+
+	local_bh_disable();
+	pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
+	pd = pqueue->pd;
+	padata_reorder(pd);
+	local_bh_enable();
+}
+
 static void padata_reorder_timer(unsigned long arg)
 {
 	struct parallel_data *pd = (struct parallel_data *)arg;
+	unsigned int weight;
+	int target_cpu, cpu;
 
-	padata_reorder(pd);
+	cpu = get_cpu();
+
+	/* We don't lock pd here to not interfere with parallel processing
+	 * padata_reorder() calls on other CPUs. We just need any CPU out of
+	 * the cpumask.pcpu set. It would be nice if it's the right one but
+	 * it doesn't matter if we're off to the next one by using an outdated
+	 * pd->processed value.
+	 */
+	weight = cpumask_weight(pd->cpumask.pcpu);
+	target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
+
+	/* ensure to call the reorder callback on the correct CPU */
+	if (cpu != target_cpu) {
+		struct padata_parallel_queue *pqueue;
+		struct padata_instance *pinst;
+
+		/* The timer function is serialized wrt itself -- no locking
+		 * needed.
+		 */
+		pinst = pd->pinst;
+		pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
+		queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
+	} else {
+		padata_reorder(pd);
+	}
+
+	put_cpu();
 }
 
 static void padata_serial_worker(struct work_struct *serial_work)
@@ -399,6 +439,7 @@ static void padata_init_pqueues(struct parallel_data *pd)
 		__padata_list_init(&pqueue->reorder);
 		__padata_list_init(&pqueue->parallel);
 		INIT_WORK(&pqueue->work, padata_parallel_worker);
+		INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
 		atomic_set(&pqueue->num_obj, 0);
 	}
 }
-- 
cgit v1.2.3


From 350ef88e7e922354f82a931897ad4a4ce6c686ff Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Fri, 8 Sep 2017 20:57:11 +0200
Subject: padata: ensure padata_do_serial() runs on the correct CPU

If the algorithm we're parallelizing is asynchronous we might change
CPUs between padata_do_parallel() and padata_do_serial(). However, we
don't expect this to happen as we need to enqueue the padata object into
the per-cpu reorder queue we took it from, i.e. the same-cpu's parallel
queue.

Ensure we're not switching CPUs for a given padata object by tracking
the CPU within the padata object. If the serial callback gets called on
the wrong CPU, defer invoking padata_reorder() via a kernel worker on
the CPU we're expected to run on.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |  2 ++
 kernel/padata.c        | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 5c0175bbc179..5d13d25da2c8 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -37,6 +37,7 @@
  * @list: List entry, to attach to the padata lists.
  * @pd: Pointer to the internal control structure.
  * @cb_cpu: Callback cpu for serializatioon.
+ * @cpu: Cpu for parallelization.
  * @seq_nr: Sequence number of the parallelized data object.
  * @info: Used to pass information from the parallel to the serial function.
  * @parallel: Parallel execution function.
@@ -46,6 +47,7 @@ struct padata_priv {
 	struct list_head	list;
 	struct parallel_data	*pd;
 	int			cb_cpu;
+	int			cpu;
 	int			info;
 	void                    (*parallel)(struct padata_priv *padata);
 	void                    (*serial)(struct padata_priv *padata);
diff --git a/kernel/padata.c b/kernel/padata.c
index b4066147bce4..f262c9a4e70a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst,
 	padata->cb_cpu = cb_cpu;
 
 	target_cpu = padata_cpu_hash(pd);
+	padata->cpu = target_cpu;
 	queue = per_cpu_ptr(pd->pqueue, target_cpu);
 
 	spin_lock(&queue->parallel.lock);
@@ -363,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata)
 	int cpu;
 	struct padata_parallel_queue *pqueue;
 	struct parallel_data *pd;
+	int reorder_via_wq = 0;
 
 	pd = padata->pd;
 
 	cpu = get_cpu();
+
+	/* We need to run on the same CPU padata_do_parallel(.., padata, ..)
+	 * was called on -- or, at least, enqueue the padata object into the
+	 * correct per-cpu queue.
+	 */
+	if (cpu != padata->cpu) {
+		reorder_via_wq = 1;
+		cpu = padata->cpu;
+	}
+
 	pqueue = per_cpu_ptr(pd->pqueue, cpu);
 
 	spin_lock(&pqueue->reorder.lock);
@@ -376,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata)
 
 	put_cpu();
 
-	padata_reorder(pd);
+	/* If we're running on the wrong CPU, call padata_reorder() via a
+	 * kernel worker.
+	 */
+	if (reorder_via_wq)
+		queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
+	else
+		padata_reorder(pd);
 }
 EXPORT_SYMBOL(padata_do_serial);
 
-- 
cgit v1.2.3


From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:19 -0700
Subject: bpf: perf event change needed for subsequent bpf helpers

This patch does not impact existing functionalities.
It contains the changes in perf event area needed for
subsequent bpf_perf_event_read_value and
bpf_perf_prog_read_value helpers.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h |  7 +++++--
 kernel/bpf/arraymap.c      |  2 +-
 kernel/events/core.c       | 15 +++++++++++++--
 kernel/trace/bpf_trace.c   |  2 +-
 4 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e22f24ded6a..79b18a20cf5d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -806,6 +806,7 @@ struct perf_output_handle {
 struct bpf_perf_event_data_kern {
 	struct pt_regs *regs;
 	struct perf_sample_data *data;
+	struct perf_event *event;
 };
 
 #ifdef CONFIG_CGROUP_PERF
@@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
 				int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value);
+int perf_event_read_local(struct perf_event *event, u64 *value,
+			  u64 *enabled, u64 *running);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
@@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+static inline int perf_event_read_local(struct perf_event *event, u64 *value,
+					u64 *enabled, u64 *running)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..68d866628be0 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 
 	ee = ERR_PTR(-EOPNOTSUPP);
 	event = perf_file->private_data;
-	if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
 		goto err_out;
 
 	ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..902149f05381 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event)
  *     will not be local and we cannot read them atomically
  *   - must not have a pmu::count method
  */
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+			  u64 *enabled, u64 *running)
 {
 	unsigned long flags;
 	int ret = 0;
+	u64 now;
 
 	/*
 	 * Disabling interrupts avoids all counter scheduling (context
@@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
 		goto out;
 	}
 
+	now = event->shadow_ctx_time + perf_clock();
+	if (enabled)
+		*enabled = now - event->tstamp_enabled;
 	/*
 	 * If the event is currently on this CPU, its either a per-task event,
 	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
 	 * oncpu == -1).
 	 */
-	if (event->oncpu == smp_processor_id())
+	if (event->oncpu == smp_processor_id()) {
 		event->pmu->read(event);
+		if (running)
+			*running = now - event->tstamp_running;
+	} else if (running) {
+		*running = event->total_time_running;
+	}
 
 	*value = local64_read(&event->count);
 out:
@@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event,
 	struct bpf_perf_event_data_kern ctx = {
 		.data = data,
 		.regs = regs,
+		.event = event,
 	};
 	int ret = 0;
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..95888ae6c263 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;
 
-	err = perf_event_read_local(ee->event, &value);
+	err = perf_event_read_local(ee->event, &value, NULL, NULL);
 	/*
 	 * this api is ugly since we miss [-22..-2] range of valid
 	 * counter values, but that's uapi
-- 
cgit v1.2.3


From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:20 -0700
Subject: bpf: add helper bpf_perf_event_read_value for perf event array map

Hardware pmu counters are limited resources. When there are more
pmu based perf events opened than available counters, kernel will
multiplex these events so each event gets certain percentage
(but not 100%) of the pmu time. In case that multiplexing happens,
the number of samples or counter value will not reflect the
case compared to no multiplexing. This makes comparison between
different runs difficult.

Typically, the number of samples or counter value should be
normalized before comparing to other experiments. The typical
normalization is done like:
  normalized_num_samples = num_samples * time_enabled / time_running
  normalized_counter_value = counter_value * time_enabled / time_running
where time_enabled is the time enabled for event and time_running is
the time running for event since last normalization.

This patch adds helper bpf_perf_event_read_value for kprobed based perf
event array map, to read perf counter and enabled/running time.
The enabled/running time is accumulated since the perf event open.
To achieve scaling factor between two bpf invocations, users
can can use cpu_id as the key (which is typical for perf array usage model)
to remember the previous value and do the calculation inside the
bpf program.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 21 +++++++++++++++++++--
 kernel/bpf/verifier.c    |  4 +++-
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 63 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6082faf5fd2a..7b57a212c7d7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -641,6 +641,14 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: An positive/negative integer to be added to xdp_md.data_meta
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_perf_event_read_value(map, flags, buf, buf_size)
+ *     read perf event counter value and perf event enabled/running time
+ *     @map: pointer to perf_event_array map
+ *     @flags: index of event in the map or bitmask flags
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -697,7 +705,8 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
-	FN(xdp_adjust_meta),
+	FN(xdp_adjust_meta),		\
+	FN(perf_event_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -741,7 +750,9 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
-/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
@@ -934,4 +945,10 @@ enum {
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
 #define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
 
+struct bpf_perf_event_value {
+	__u64 counter;
+	__u64 enabled;
+	__u64 running;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 52b022310f6a..590125e29161 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1552,7 +1552,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 		if (func_id != BPF_FUNC_perf_event_read &&
-		    func_id != BPF_FUNC_perf_event_output)
+		    func_id != BPF_FUNC_perf_event_output &&
+		    func_id != BPF_FUNC_perf_event_read_value)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -1595,6 +1596,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
+	case BPF_FUNC_perf_event_read_value:
 		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			goto error;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 95888ae6c263..0be86cc0130e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+		     u64 *value, u64 *enabled, u64 *running)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
-	u64 value = 0;
-	int err;
 
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
@@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;
 
-	err = perf_event_read_local(ee->event, &value, NULL, NULL);
+	return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+	u64 value = 0;
+	int err;
+
+	err = get_map_perf_counter(map, flags, &value, NULL, NULL);
 	/*
 	 * this api is ugly since we miss [-22..-2] range of valid
 	 * counter values, but that's uapi
@@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+				   &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+	.func		= bpf_perf_event_read_value,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
 static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
 
 static __always_inline u64
@@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_perf_event_read_value:
+		return &bpf_perf_event_read_value_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:22 -0700
Subject: bpf: add helper bpf_perf_prog_read_value

This patch adds helper bpf_perf_prog_read_cvalue for perf event based bpf
programs, to read event counter and enabled/running time.
The enabled/running time is accumulated since the perf event open.

The typical use case for perf event based bpf program is to attach itself
to a single event. In such cases, if it is desirable to get scaling factor
between two bpf invocations, users can can save the time values in a map,
and use the value from the map and the current value to calculate
the scaling factor.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7b57a212c7d7..5bbbec17aa5a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -649,6 +649,13 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_perf_prog_read_value(ctx, buf, buf_size)
+ *     read perf prog attached perf event counter and enabled/running time
+ *     @ctx: pointer to ctx
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return : 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -706,7 +713,8 @@ union bpf_attr {
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
 	FN(xdp_adjust_meta),		\
-	FN(perf_event_read_value),
+	FN(perf_event_read_value),	\
+	FN(perf_prog_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0be86cc0130e..04ea5314f2bc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -613,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+				    &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+         .func           = bpf_perf_prog_read_value_tp,
+         .gpl_only       = true,
+         .ret_type       = RET_INTEGER,
+         .arg1_type      = ARG_PTR_TO_CTX,
+         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+         .arg3_type      = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -620,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_perf_prog_read_value:
+		return &bpf_perf_prog_read_value_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 473d97343f94ff20f5196078314e4dd83156d3a2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 5 Oct 2017 21:52:11 -0700
Subject: bpf: Change bpf_obj_name_cpy() to better ensure map's name is init by
 0

During get_info_by_fd, the prog/map name is memcpy-ed.  It depends
on the prog->aux->name and map->name to be zero initialized.

bpf_prog_aux is easy to guarantee that aux->name is zero init.

The name in bpf_map may be harder to be guaranteed in the future when
new map type is added.

Hence, this patch makes bpf_obj_name_cpy() to always zero init
the prog/map name.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0048cb24ba7b..d124e702e040 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -322,6 +322,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 {
 	const char *end = src + BPF_OBJ_NAME_LEN;
 
+	memset(dst, 0, BPF_OBJ_NAME_LEN);
+
 	/* Copy all isalnum() and '_' char */
 	while (src < end && *src) {
 		if (!isalnum(*src) && *src != '_')
@@ -333,9 +335,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	if (src == end)
 		return -EINVAL;
 
-	/* '\0' terminates dst */
-	*dst = 0;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 368211fb920a0b789c238942c6af0414539f79d6 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 5 Oct 2017 21:52:13 -0700
Subject: bpf: Append prog->aux->name in bpf_get_prog_name()

This patch makes the bpf_prog's name available
in kallsyms.

The new format is bpf_prog_tag[_name].

Sample kallsyms from running selftests/bpf/test_progs:
[root@arch-fb-vm1 ~]# egrep ' bpf_prog_[0-9a-fA-F]{16}' /proc/kallsyms
ffffffffa0048000 t bpf_prog_dabf0207d1992486_test_obj_id
ffffffffa0038000 t bpf_prog_a04f5eef06a7f555__123456789ABCDE
ffffffffa0050000 t bpf_prog_a04f5eef06a7f555

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c6be15ae83ee..248961af2421 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
 
 static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
+	const char *end = sym + KSYM_NAME_LEN;
+
 	BUILD_BUG_ON(sizeof("bpf_prog_") +
-		     sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+		     sizeof(prog->tag) * 2 +
+		     /* name has been null terminated.
+		      * We should need +1 for the '_' preceding
+		      * the name.  However, the null character
+		      * is double counted between the name and the
+		      * sizeof("bpf_prog_") above, so we omit
+		      * the +1 here.
+		      */
+		     sizeof(prog->aux->name) > KSYM_NAME_LEN);
 
 	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
-	*sym = 0;
+	if (prog->aux->name[0])
+		snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+	else
+		*sym = 0;
 }
 
 static __always_inline unsigned long
-- 
cgit v1.2.3


From de8cd83e91bc3ee212b3e6ec6e4283af9e4ab269 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Mon, 2 Oct 2017 20:21:39 -0400
Subject: audit: Record fanotify access control decisions

The fanotify interface allows user space daemons to make access
control decisions. Under common criteria requirements, we need to
optionally record decisions based on policy. This patch adds a bit mask,
FAN_AUDIT, that a user space daemon can 'or' into the response decision
which will tell the kernel that it made a decision and record it.

It would be used something like this in user space code:

  response.response = FAN_DENY | FAN_AUDIT;
  write(fd, &response, sizeof(struct fanotify_response));

When the syscall ends, the audit system will record the decision as a
AUDIT_FANOTIFY auxiliary record to denote that the reason this event
occurred is the result of an access control decision from fanotify
rather than DAC or MAC policy.

A sample event looks like this:

type=PATH msg=audit(1504310584.332:290): item=0 name="./evil-ls"
inode=1319561 dev=fc:03 mode=0100755 ouid=1000 ogid=1000 rdev=00:00
obj=unconfined_u:object_r:user_home_t:s0 nametype=NORMAL
type=CWD msg=audit(1504310584.332:290): cwd="/home/sgrubb"
type=SYSCALL msg=audit(1504310584.332:290): arch=c000003e syscall=2
success=no exit=-1 a0=32cb3fca90 a1=0 a2=43 a3=8 items=1 ppid=901
pid=959 auid=1000 uid=1000 gid=1000 euid=1000 suid=1000
fsuid=1000 egid=1000 sgid=1000 fsgid=1000 tty=pts1 ses=3 comm="bash"
exe="/usr/bin/bash" subj=unconfined_u:unconfined_r:unconfined_t:
s0-s0:c0.c1023 key=(null)
type=FANOTIFY msg=audit(1504310584.332:290): resp=2

Prior to using the audit flag, the developer needs to call
fanotify_init or'ing in FAN_ENABLE_AUDIT to ensure that the kernel
supports auditing. The calling process must also have the CAP_AUDIT_WRITE
capability.

Signed-off-by: sgrubb <sgrubb@redhat.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c      |  8 +++++++-
 fs/notify/fanotify/fanotify_user.c | 16 +++++++++++++++-
 fs/notify/fdinfo.c                 |  3 +++
 include/linux/audit.h              | 10 ++++++++++
 include/linux/fsnotify_backend.h   |  1 +
 include/uapi/linux/audit.h         |  1 +
 include/uapi/linux/fanotify.h      |  3 +++
 kernel/auditsc.c                   |  6 ++++++
 8 files changed, 46 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 2fa99aeaa095..1968d21a3f37 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,6 +9,7 @@
 #include <linux/sched/user.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/audit.h>
 
 #include "fanotify.h"
 
@@ -78,7 +79,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
 	fsnotify_finish_user_wait(iter_info);
 out:
 	/* userspace responded, convert to something usable */
-	switch (event->response) {
+	switch (event->response & ~FAN_AUDIT) {
 	case FAN_ALLOW:
 		ret = 0;
 		break;
@@ -86,6 +87,11 @@ out:
 	default:
 		ret = -EPERM;
 	}
+
+	/* Check if the response should be audited */
+	if (event->response & FAN_AUDIT)
+		audit_fanotify(event->response & ~FAN_AUDIT);
+
 	event->response = 0;
 
 	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 907a481ac781..0455ea729384 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -179,7 +179,7 @@ static int process_access_response(struct fsnotify_group *group,
 	 * userspace can send a valid response or we will clean it up after the
 	 * timeout
 	 */
-	switch (response) {
+	switch (response & ~FAN_AUDIT) {
 	case FAN_ALLOW:
 	case FAN_DENY:
 		break;
@@ -190,6 +190,9 @@ static int process_access_response(struct fsnotify_group *group,
 	if (fd < 0)
 		return -EINVAL;
 
+	if ((response & FAN_AUDIT) && !group->fanotify_data.audit)
+		return -EINVAL;
+
 	event = dequeue_event(group, fd);
 	if (!event)
 		return -ENOENT;
@@ -721,7 +724,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+#ifdef CONFIG_AUDITSYSCALL
+	if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT))
+#else
 	if (flags & ~FAN_ALL_INIT_FLAGS)
+#endif
 		return -EINVAL;
 
 	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
@@ -805,6 +812,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
 	}
 
+	if (flags & FAN_ENABLE_AUDIT) {
+		fd = -EPERM;
+		if (!capable(CAP_AUDIT_WRITE))
+			goto out_destroy_group;
+		group->fanotify_data.audit = true;
+	}
+
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
 		goto out_destroy_group;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index dd63aa9a6f9a..645ab561e790 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -156,6 +156,9 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
 	if (group->fanotify_data.max_marks == UINT_MAX)
 		flags |= FAN_UNLIMITED_MARKS;
 
+	if (group->fanotify_data.audit)
+		flags |= FAN_ENABLE_AUDIT;
+
 	seq_printf(m, "fanotify flags:%x event-flags:%x\n",
 		   flags, group->fanotify_data.f_flags);
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index cb708eb8accc..d66220dac364 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -356,6 +356,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 extern void __audit_log_capset(const struct cred *new, const struct cred *old);
 extern void __audit_mmap_fd(int fd, int flags);
 extern void __audit_log_kern_module(char *name);
+extern void __audit_fanotify(unsigned int response);
 
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -452,6 +453,12 @@ static inline void audit_log_kern_module(char *name)
 		__audit_log_kern_module(name);
 }
 
+static inline void audit_fanotify(unsigned int response)
+{
+	if (!audit_dummy_context())
+		__audit_fanotify(response);
+}
+
 extern int audit_n_rules;
 extern int audit_signals;
 #else /* CONFIG_AUDITSYSCALL */
@@ -568,6 +575,9 @@ static inline void audit_log_kern_module(char *name)
 {
 }
 
+static inline void audit_fanotify(unsigned int response)
+{ }
+
 static inline void audit_ptrace(struct task_struct *t)
 { }
 #define audit_n_rules 0
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c6c69318752b..4a474f972910 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -190,6 +190,7 @@ struct fsnotify_group {
 			int f_flags;
 			unsigned int max_marks;
 			struct user_struct *user;
+			bool audit;
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 0714a66f0e0c..221f8b7f01b2 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -112,6 +112,7 @@
 #define AUDIT_FEATURE_CHANGE	1328	/* audit log listing feature changes */
 #define AUDIT_REPLACE		1329	/* Replace auditd if this packet unanswerd */
 #define AUDIT_KERN_MODULE	1330	/* Kernel Module events */
+#define AUDIT_FANOTIFY		1331	/* Fanotify access decision */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 030508d195d3..5dda19a9a947 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -35,6 +35,7 @@
 
 #define FAN_UNLIMITED_QUEUE	0x00000010
 #define FAN_UNLIMITED_MARKS	0x00000020
+#define FAN_ENABLE_AUDIT	0x00000040
 
 #define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK | \
 				 FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE |\
@@ -99,6 +100,8 @@ struct fanotify_response {
 /* Legit userspace responses to a _PERM event */
 #define FAN_ALLOW	0x01
 #define FAN_DENY	0x02
+#define FAN_AUDIT	0x10	/* Bit mask to create audit record for result */
+
 /* No fd set in event */
 #define FAN_NOFD	-1
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ecc23e25c9eb..9c723e978245 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2390,6 +2390,12 @@ void __audit_log_kern_module(char *name)
 	context->type = AUDIT_KERN_MODULE;
 }
 
+void __audit_fanotify(unsigned int response)
+{
+	audit_log(current->audit_context, GFP_KERNEL,
+		AUDIT_FANOTIFY,	"resp=%u", response);
+}
+
 static void audit_log_task(struct audit_buffer *ab)
 {
 	kuid_t auid, uid;
-- 
cgit v1.2.3


From e7bf8249e8f1bac64885eeccb55bcf6111901a81 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:10 -0700
Subject: bpf: encapsulate verifier log state into a structure

Put the loose log_* variables into a structure.  This will make
it simpler to remove the global verifier state in following patches.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 13 ++++++++++
 kernel/bpf/verifier.c        | 57 +++++++++++++++++++++++---------------------
 2 files changed, 43 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b8d200f60a40..163541ba70d9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -115,6 +115,19 @@ struct bpf_insn_aux_data {
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
+struct bpf_verifer_log {
+	u32 level;
+	char *kbuf;
+	char __user *ubuf;
+	u32 len_used;
+	u32 len_total;
+};
+
+static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
+{
+	return log->len_used >= log->len_total - 1;
+}
+
 struct bpf_verifier_env;
 struct bpf_ext_analyzer_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6352a88ca6d1..e53458b02249 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -156,8 +156,7 @@ struct bpf_call_arg_meta {
 /* verbose verifier prints what it's seeing
  * bpf_check() is called under lock, so no race to access these global vars
  */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
+static struct bpf_verifer_log verifier_log;
 
 static DEFINE_MUTEX(bpf_verifier_lock);
 
@@ -167,13 +166,15 @@ static DEFINE_MUTEX(bpf_verifier_lock);
  */
 static __printf(1, 2) void verbose(const char *fmt, ...)
 {
+	struct bpf_verifer_log *log = &verifier_log;
 	va_list args;
 
-	if (log_level == 0 || log_len >= log_size - 1)
+	if (!log->level || bpf_verifier_log_full(log))
 		return;
 
 	va_start(args, fmt);
-	log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+	log->len_used += vscnprintf(log->kbuf + log->len_used,
+				    log->len_total - log->len_used, fmt, args);
 	va_end(args);
 }
 
@@ -886,7 +887,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * need to try adding each of min_value and max_value to off
 	 * to make sure our theoretical access will be safe.
 	 */
-	if (log_level)
+	if (verifier_log.level)
 		print_verifier_state(state);
 	/* The minimum value is only important with signed
 	 * comparisons where we can't assume the floor of a
@@ -2956,7 +2957,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
 	}
-	if (log_level)
+	if (verifier_log.level)
 		print_verifier_state(this_branch);
 	return 0;
 }
@@ -3712,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env)
 			return err;
 		if (err == 1) {
 			/* found equivalent state, can prune the search */
-			if (log_level) {
+			if (verifier_log.level) {
 				if (do_print_state)
 					verbose("\nfrom %d to %d: safe\n",
 						prev_insn_idx, insn_idx);
@@ -3725,8 +3726,9 @@ static int do_check(struct bpf_verifier_env *env)
 		if (need_resched())
 			cond_resched();
 
-		if (log_level > 1 || (log_level && do_print_state)) {
-			if (log_level > 1)
+		if (verifier_log.level > 1 ||
+		    (verifier_log.level && do_print_state)) {
+			if (verifier_log.level > 1)
 				verbose("%d:", insn_idx);
 			else
 				verbose("\nfrom %d to %d:",
@@ -3735,7 +3737,7 @@ static int do_check(struct bpf_verifier_env *env)
 			do_print_state = false;
 		}
 
-		if (log_level) {
+		if (verifier_log.level) {
 			verbose("%d: ", insn_idx);
 			print_bpf_insn(env, insn);
 		}
@@ -4389,7 +4391,7 @@ static void free_states(struct bpf_verifier_env *env)
 
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 {
-	char __user *log_ubuf = NULL;
+	struct bpf_verifer_log *log = &verifier_log;
 	struct bpf_verifier_env *env;
 	int ret = -EINVAL;
 
@@ -4414,23 +4416,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		/* user requested verbose verifier output
 		 * and supplied buffer to store the verification trace
 		 */
-		log_level = attr->log_level;
-		log_ubuf = (char __user *) (unsigned long) attr->log_buf;
-		log_size = attr->log_size;
-		log_len = 0;
+		log->level = attr->log_level;
+		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+		log->len_total = attr->log_size;
+		log->len_used = 0;
 
 		ret = -EINVAL;
-		/* log_* values have to be sane */
-		if (log_size < 128 || log_size > UINT_MAX >> 8 ||
-		    log_level == 0 || log_ubuf == NULL)
+		/* log attributes have to be sane */
+		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+		    !log->level || !log->ubuf)
 			goto err_unlock;
 
 		ret = -ENOMEM;
-		log_buf = vmalloc(log_size);
-		if (!log_buf)
+		log->kbuf = vmalloc(log->len_total);
+		if (!log->kbuf)
 			goto err_unlock;
 	} else {
-		log_level = 0;
+		log->level = 0;
 	}
 
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4467,15 +4469,16 @@ skip_full_check:
 	if (ret == 0)
 		ret = fixup_bpf_calls(env);
 
-	if (log_level && log_len >= log_size - 1) {
-		BUG_ON(log_len >= log_size);
+	if (log->level && bpf_verifier_log_full(log)) {
+		BUG_ON(log->len_used >= log->len_total);
 		/* verifier log exceeded user supplied buffer */
 		ret = -ENOSPC;
 		/* fall through to return what was recorded */
 	}
 
 	/* copy verifier log back to user space including trailing zero */
-	if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+	if (log->level && copy_to_user(log->ubuf, log->kbuf,
+				       log->len_used + 1) != 0) {
 		ret = -EFAULT;
 		goto free_log_buf;
 	}
@@ -4502,8 +4505,8 @@ skip_full_check:
 	}
 
 free_log_buf:
-	if (log_level)
-		vfree(log_buf);
+	if (log->level)
+		vfree(log->kbuf);
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
 		 * them now. Otherwise free_bpf_prog_info() will release them.
@@ -4540,7 +4543,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
 
-	log_level = 0;
+	verifier_log.level = 0;
 
 	env->strict_alignment = false;
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-- 
cgit v1.2.3


From 61bd5218eef349fcacc4976a251bc83a4748b4af Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:11 -0700
Subject: bpf: move global verifier log into verifier environment

The biggest piece of global state protected by the verifier lock
is the verifier_log.  Move that log to struct bpf_verifier_env.
struct bpf_verifier_env has to be passed now to all invocations
of verbose().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |   2 +
 kernel/bpf/verifier.c        | 491 +++++++++++++++++++++++--------------------
 2 files changed, 261 insertions(+), 232 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 163541ba70d9..5ddb9a626a51 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -152,6 +152,8 @@ struct bpf_verifier_env {
 	bool allow_ptr_leaks;
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+
+	struct bpf_verifer_log log;
 };
 
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e53458b02249..a352f93cd4b2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,20 +153,16 @@ struct bpf_call_arg_meta {
 	int access_size;
 };
 
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static struct bpf_verifer_log verifier_log;
-
 static DEFINE_MUTEX(bpf_verifier_lock);
 
 /* log_level controls verbosity level of eBPF verifier.
  * verbose() is used to dump the verification trace to the log, so the user
  * can figure out what's wrong with the program
  */
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+				   const char *fmt, ...)
 {
-	struct bpf_verifer_log *log = &verifier_log;
+	struct bpf_verifer_log *log = &env->log;
 	va_list args;
 
 	if (!log->level || bpf_verifier_log_full(log))
@@ -214,7 +210,8 @@ static const char *func_id_name(int id)
 		return "unknown";
 }
 
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+				 struct bpf_verifier_state *state)
 {
 	struct bpf_reg_state *reg;
 	enum bpf_reg_type t;
@@ -225,21 +222,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 		t = reg->type;
 		if (t == NOT_INIT)
 			continue;
-		verbose(" R%d=%s", i, reg_type_str[t]);
+		verbose(env, " R%d=%s", i, reg_type_str[t]);
 		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
 		    tnum_is_const(reg->var_off)) {
 			/* reg->off should be 0 for SCALAR_VALUE */
-			verbose("%lld", reg->var_off.value + reg->off);
+			verbose(env, "%lld", reg->var_off.value + reg->off);
 		} else {
-			verbose("(id=%d", reg->id);
+			verbose(env, "(id=%d", reg->id);
 			if (t != SCALAR_VALUE)
-				verbose(",off=%d", reg->off);
+				verbose(env, ",off=%d", reg->off);
 			if (type_is_pkt_pointer(t))
-				verbose(",r=%d", reg->range);
+				verbose(env, ",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
 				 t == PTR_TO_MAP_VALUE ||
 				 t == PTR_TO_MAP_VALUE_OR_NULL)
-				verbose(",ks=%d,vs=%d",
+				verbose(env, ",ks=%d,vs=%d",
 					reg->map_ptr->key_size,
 					reg->map_ptr->value_size);
 			if (tnum_is_const(reg->var_off)) {
@@ -247,38 +244,38 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 				 * could be a pointer whose offset is too big
 				 * for reg->off
 				 */
-				verbose(",imm=%llx", reg->var_off.value);
+				verbose(env, ",imm=%llx", reg->var_off.value);
 			} else {
 				if (reg->smin_value != reg->umin_value &&
 				    reg->smin_value != S64_MIN)
-					verbose(",smin_value=%lld",
+					verbose(env, ",smin_value=%lld",
 						(long long)reg->smin_value);
 				if (reg->smax_value != reg->umax_value &&
 				    reg->smax_value != S64_MAX)
-					verbose(",smax_value=%lld",
+					verbose(env, ",smax_value=%lld",
 						(long long)reg->smax_value);
 				if (reg->umin_value != 0)
-					verbose(",umin_value=%llu",
+					verbose(env, ",umin_value=%llu",
 						(unsigned long long)reg->umin_value);
 				if (reg->umax_value != U64_MAX)
-					verbose(",umax_value=%llu",
+					verbose(env, ",umax_value=%llu",
 						(unsigned long long)reg->umax_value);
 				if (!tnum_is_unknown(reg->var_off)) {
 					char tn_buf[48];
 
 					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-					verbose(",var_off=%s", tn_buf);
+					verbose(env, ",var_off=%s", tn_buf);
 				}
 			}
-			verbose(")");
+			verbose(env, ")");
 		}
 	}
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] == STACK_SPILL)
-			verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+			verbose(env, " fp%d=%s", -MAX_BPF_STACK + i,
 				reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
 	}
-	verbose("\n");
+	verbose(env, "\n");
 }
 
 static const char *const bpf_class_string[] = {
@@ -333,15 +330,15 @@ static const char *const bpf_jmp_string[16] = {
 	[BPF_EXIT >> 4] = "exit",
 };
 
-static void print_bpf_end_insn(const struct bpf_verifier_env *env,
+static void print_bpf_end_insn(struct bpf_verifier_env *env,
 			       const struct bpf_insn *insn)
 {
-	verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
 		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
 		insn->imm, insn->dst_reg);
 }
 
-static void print_bpf_insn(const struct bpf_verifier_env *env,
+static void print_bpf_insn(struct bpf_verifier_env *env,
 			   const struct bpf_insn *insn)
 {
 	u8 class = BPF_CLASS(insn->code);
@@ -349,23 +346,23 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
 	if (class == BPF_ALU || class == BPF_ALU64) {
 		if (BPF_OP(insn->code) == BPF_END) {
 			if (class == BPF_ALU64)
-				verbose("BUG_alu64_%02x\n", insn->code);
+				verbose(env, "BUG_alu64_%02x\n", insn->code);
 			else
 				print_bpf_end_insn(env, insn);
 		} else if (BPF_OP(insn->code) == BPF_NEG) {
-			verbose("(%02x) r%d = %s-r%d\n",
+			verbose(env, "(%02x) r%d = %s-r%d\n",
 				insn->code, insn->dst_reg,
 				class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose("(%02x) %sr%d %s %sr%d\n",
+			verbose(env, "(%02x) %sr%d %s %sr%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->src_reg);
 		} else {
-			verbose("(%02x) %sr%d %s %s%d\n",
+			verbose(env, "(%02x) %sr%d %s %s%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
@@ -374,46 +371,46 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
 		}
 	} else if (class == BPF_STX) {
 		if (BPF_MODE(insn->code) == BPF_MEM)
-			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg,
 				insn->off, insn->src_reg);
 		else if (BPF_MODE(insn->code) == BPF_XADD)
-			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off,
 				insn->src_reg);
 		else
-			verbose("BUG_%02x\n", insn->code);
+			verbose(env, "BUG_%02x\n", insn->code);
 	} else if (class == BPF_ST) {
 		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose("BUG_st_%02x\n", insn->code);
+			verbose(env, "BUG_st_%02x\n", insn->code);
 			return;
 		}
-		verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
 			insn->code,
 			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 			insn->dst_reg,
 			insn->off, insn->imm);
 	} else if (class == BPF_LDX) {
 		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose("BUG_ldx_%02x\n", insn->code);
+			verbose(env, "BUG_ldx_%02x\n", insn->code);
 			return;
 		}
-		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
 			insn->code, insn->dst_reg,
 			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 			insn->src_reg, insn->off);
 	} else if (class == BPF_LD) {
 		if (BPF_MODE(insn->code) == BPF_ABS) {
-			verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->imm);
 		} else if (BPF_MODE(insn->code) == BPF_IND) {
-			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->src_reg, insn->imm);
@@ -428,36 +425,37 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
 			if (map_ptr && !env->allow_ptr_leaks)
 				imm = 0;
 
-			verbose("(%02x) r%d = 0x%llx\n", insn->code,
+			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
 				insn->dst_reg, (unsigned long long)imm);
 		} else {
-			verbose("BUG_ld_%02x\n", insn->code);
+			verbose(env, "BUG_ld_%02x\n", insn->code);
 			return;
 		}
 	} else if (class == BPF_JMP) {
 		u8 opcode = BPF_OP(insn->code);
 
 		if (opcode == BPF_CALL) {
-			verbose("(%02x) call %s#%d\n", insn->code,
+			verbose(env, "(%02x) call %s#%d\n", insn->code,
 				func_id_name(insn->imm), insn->imm);
 		} else if (insn->code == (BPF_JMP | BPF_JA)) {
-			verbose("(%02x) goto pc%+d\n",
+			verbose(env, "(%02x) goto pc%+d\n",
 				insn->code, insn->off);
 		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-			verbose("(%02x) exit\n", insn->code);
+			verbose(env, "(%02x) exit\n", insn->code);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
 				insn->code, insn->dst_reg,
 				bpf_jmp_string[BPF_OP(insn->code) >> 4],
 				insn->src_reg, insn->off);
 		} else {
-			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
 				insn->code, insn->dst_reg,
 				bpf_jmp_string[BPF_OP(insn->code) >> 4],
 				insn->imm, insn->off);
 		}
 	} else {
-		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+		verbose(env, "(%02x) %s\n",
+			insn->code, bpf_class_string[class]);
 	}
 }
 
@@ -496,7 +494,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	env->head = elem;
 	env->stack_size++;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
-		verbose("BPF program is too complex\n");
+		verbose(env, "BPF program is too complex\n");
 		goto err;
 	}
 	return &elem->st;
@@ -534,10 +532,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
 	__mark_reg_known(reg, 0);
 }
 
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+				struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_known_zero(regs, %u)\n", regno);
+		verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -647,10 +646,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
 	__mark_reg_unbounded(reg);
 }
 
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+			     struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_unknown(regs, %u)\n", regno);
+		verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -665,10 +665,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
 	reg->type = NOT_INIT;
 }
 
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+			      struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_not_init(regs, %u)\n", regno);
+		verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -677,22 +678,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_not_init(regs + regno);
 }
 
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+			   struct bpf_reg_state *regs)
 {
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++) {
-		mark_reg_not_init(regs, i);
+		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
 	}
 
 	/* frame pointer */
 	regs[BPF_REG_FP].type = PTR_TO_STACK;
-	mark_reg_known_zero(regs, BPF_REG_FP);
+	mark_reg_known_zero(env, regs, BPF_REG_FP);
 
 	/* 1st arg to a function */
 	regs[BPF_REG_1].type = PTR_TO_CTX;
-	mark_reg_known_zero(regs, BPF_REG_1);
+	mark_reg_known_zero(env, regs, BPF_REG_1);
 }
 
 enum reg_arg_type {
@@ -726,26 +728,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	struct bpf_reg_state *regs = env->cur_state.regs;
 
 	if (regno >= MAX_BPF_REG) {
-		verbose("R%d is invalid\n", regno);
+		verbose(env, "R%d is invalid\n", regno);
 		return -EINVAL;
 	}
 
 	if (t == SRC_OP) {
 		/* check whether register used as source operand can be read */
 		if (regs[regno].type == NOT_INIT) {
-			verbose("R%d !read_ok\n", regno);
+			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
 		mark_reg_read(&env->cur_state, regno);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
-			verbose("frame pointer is read only\n");
+			verbose(env, "frame pointer is read only\n");
 			return -EACCES;
 		}
 		regs[regno].live |= REG_LIVE_WRITTEN;
 		if (t == DST_OP)
-			mark_reg_unknown(regs, regno);
+			mark_reg_unknown(env, regs, regno);
 	}
 	return 0;
 }
@@ -770,7 +772,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+			     struct bpf_verifier_state *state, int off,
 			     int size, int value_regno)
 {
 	int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
@@ -783,7 +786,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
 
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
-			verbose("invalid size of register spill\n");
+			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
 
@@ -818,7 +821,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
 	}
 }
 
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+			    struct bpf_verifier_state *state, int off, int size,
 			    int value_regno)
 {
 	u8 *slot_type;
@@ -828,12 +832,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
 
 	if (slot_type[0] == STACK_SPILL) {
 		if (size != BPF_REG_SIZE) {
-			verbose("invalid size of register spill\n");
+			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
 		for (i = 1; i < BPF_REG_SIZE; i++) {
 			if (slot_type[i] != STACK_SPILL) {
-				verbose("corrupted spill memory\n");
+				verbose(env, "corrupted spill memory\n");
 				return -EACCES;
 			}
 		}
@@ -849,14 +853,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
 	} else {
 		for (i = 0; i < size; i++) {
 			if (slot_type[i] != STACK_MISC) {
-				verbose("invalid read from stack off %d+%d size %d\n",
+				verbose(env, "invalid read from stack off %d+%d size %d\n",
 					off, i, size);
 				return -EACCES;
 			}
 		}
 		if (value_regno >= 0)
 			/* have read misc data from the stack */
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, state->regs, value_regno);
 		return 0;
 	}
 }
@@ -868,7 +872,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 	struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
 
 	if (off < 0 || size <= 0 || off + size > map->value_size) {
-		verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
 			map->value_size, off, size);
 		return -EACCES;
 	}
@@ -887,8 +891,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * need to try adding each of min_value and max_value to off
 	 * to make sure our theoretical access will be safe.
 	 */
-	if (verifier_log.level)
-		print_verifier_state(state);
+	if (env->log.level)
+		print_verifier_state(env, state);
 	/* The minimum value is only important with signed
 	 * comparisons where we can't assume the floor of a
 	 * value is 0.  If we are using signed variables for our
@@ -896,13 +900,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * will have a set floor within our range.
 	 */
 	if (reg->smin_value < 0) {
-		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 			regno);
 		return -EACCES;
 	}
 	err = __check_map_access(env, regno, reg->smin_value + off, size);
 	if (err) {
-		verbose("R%d min value is outside of the array range\n", regno);
+		verbose(env, "R%d min value is outside of the array range\n",
+			regno);
 		return err;
 	}
 
@@ -911,13 +916,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * If reg->umax_value + off could overflow, treat that as unbounded too.
 	 */
 	if (reg->umax_value >= BPF_MAX_VAR_OFF) {
-		verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+		verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
 			regno);
 		return -EACCES;
 	}
 	err = __check_map_access(env, regno, reg->umax_value + off, size);
 	if (err)
-		verbose("R%d max value is outside of the array range\n", regno);
+		verbose(env, "R%d max value is outside of the array range\n",
+			regno);
 	return err;
 }
 
@@ -956,7 +962,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 	struct bpf_reg_state *reg = &regs[regno];
 
 	if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
-		verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
 			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
 	}
@@ -979,13 +985,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 	 * detail to prove they're safe.
 	 */
 	if (reg->smin_value < 0) {
-		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 			regno);
 		return -EACCES;
 	}
 	err = __check_packet_access(env, regno, off, size);
 	if (err) {
-		verbose("R%d offset is outside of the packet\n", regno);
+		verbose(env, "R%d offset is outside of the packet\n", regno);
 		return err;
 	}
 	return err;
@@ -1021,7 +1027,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		return 0;
 	}
 
-	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+	verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
 }
 
@@ -1039,7 +1045,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
 }
 
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+				   const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
 {
 	struct tnum reg_off;
@@ -1064,7 +1071,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+		verbose(env,
+			"misaligned packet access off %d+%s+%d+%d size %d\n",
 			ip_align, tn_buf, reg->off, off, size);
 		return -EACCES;
 	}
@@ -1072,7 +1080,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 	return 0;
 }
 
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+				       const struct bpf_reg_state *reg,
 				       const char *pointer_desc,
 				       int off, int size, bool strict)
 {
@@ -1087,7 +1096,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose("misaligned %saccess off %s+%d+%d size %d\n",
+		verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
 			pointer_desc, tn_buf, reg->off, off, size);
 		return -EACCES;
 	}
@@ -1108,7 +1117,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		/* Special case, because of NET_IP_ALIGN. Given metadata sits
 		 * right in front, treat it the very same way.
 		 */
-		return check_pkt_ptr_alignment(reg, off, size, strict);
+		return check_pkt_ptr_alignment(env, reg, off, size, strict);
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
 		break;
@@ -1121,7 +1130,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	default:
 		break;
 	}
-	return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+	return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+					   strict);
 }
 
 /* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1153,20 +1163,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into map\n", value_regno);
+			verbose(env, "R%d leaks addr into map\n", value_regno);
 			return -EACCES;
 		}
 
 		err = check_map_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, state->regs, value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = SCALAR_VALUE;
 
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into ctx\n", value_regno);
+			verbose(env, "R%d leaks addr into ctx\n", value_regno);
 			return -EACCES;
 		}
 		/* ctx accesses must be at a fixed offset, so that we can
@@ -1176,7 +1186,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose("variable ctx access var_off=%s off=%d size=%d",
+			verbose(env,
+				"variable ctx access var_off=%s off=%d size=%d",
 				tn_buf, off, size);
 			return -EACCES;
 		}
@@ -1188,9 +1199,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
-				mark_reg_unknown(state->regs, value_regno);
+				mark_reg_unknown(env, state->regs, value_regno);
 			else
-				mark_reg_known_zero(state->regs, value_regno);
+				mark_reg_known_zero(env, state->regs,
+						    value_regno);
 			state->regs[value_regno].id = 0;
 			state->regs[value_regno].off = 0;
 			state->regs[value_regno].range = 0;
@@ -1206,13 +1218,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose("variable stack access var_off=%s off=%d size=%d",
+			verbose(env, "variable stack access var_off=%s off=%d size=%d",
 				tn_buf, off, size);
 			return -EACCES;
 		}
 		off += reg->var_off.value;
 		if (off >= 0 || off < -MAX_BPF_STACK) {
-			verbose("invalid stack off=%d size=%d\n", off, size);
+			verbose(env, "invalid stack off=%d size=%d\n", off,
+				size);
 			return -EACCES;
 		}
 
@@ -1223,29 +1236,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			if (!env->allow_ptr_leaks &&
 			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
 			    size != BPF_REG_SIZE) {
-				verbose("attempt to corrupt spilled pointer on stack\n");
+				verbose(env, "attempt to corrupt spilled pointer on stack\n");
 				return -EACCES;
 			}
-			err = check_stack_write(state, off, size, value_regno);
+			err = check_stack_write(env, state, off, size,
+						value_regno);
 		} else {
-			err = check_stack_read(state, off, size, value_regno);
+			err = check_stack_read(env, state, off, size,
+					       value_regno);
 		}
 	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
-			verbose("cannot write into packet\n");
+			verbose(env, "cannot write into packet\n");
 			return -EACCES;
 		}
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into packet\n", value_regno);
+			verbose(env, "R%d leaks addr into packet\n",
+				value_regno);
 			return -EACCES;
 		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, state->regs, value_regno);
 	} else {
-		verbose("R%d invalid mem access '%s'\n",
-			regno, reg_type_str[reg->type]);
+		verbose(env, "R%d invalid mem access '%s'\n", regno,
+			reg_type_str[reg->type]);
 		return -EACCES;
 	}
 
@@ -1265,7 +1281,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 
 	if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
 	    insn->imm != 0) {
-		verbose("BPF_XADD uses reserved fields\n");
+		verbose(env, "BPF_XADD uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -1280,7 +1296,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 		return err;
 
 	if (is_pointer_value(env, insn->src_reg)) {
-		verbose("R%d leaks addr into mem\n", insn->src_reg);
+		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
 		return -EACCES;
 	}
 
@@ -1321,7 +1337,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		    register_is_null(regs[regno]))
 			return 0;
 
-		verbose("R%d type=%s expected=%s\n", regno,
+		verbose(env, "R%d type=%s expected=%s\n", regno,
 			reg_type_str[regs[regno].type],
 			reg_type_str[PTR_TO_STACK]);
 		return -EACCES;
@@ -1332,13 +1348,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
-		verbose("invalid variable stack read R%d var_off=%s\n",
+		verbose(env, "invalid variable stack read R%d var_off=%s\n",
 			regno, tn_buf);
 	}
 	off = regs[regno].off + regs[regno].var_off.value;
 	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
 	    access_size <= 0) {
-		verbose("invalid stack type R%d off=%d access_size=%d\n",
+		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
 			regno, off, access_size);
 		return -EACCES;
 	}
@@ -1354,7 +1370,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 
 	for (i = 0; i < access_size; i++) {
 		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
-			verbose("invalid indirect read from stack off %d+%d size %d\n",
+			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
 				off, i, access_size);
 			return -EACCES;
 		}
@@ -1397,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
 	if (arg_type == ARG_ANYTHING) {
 		if (is_pointer_value(env, regno)) {
-			verbose("R%d leaks addr into helper function\n", regno);
+			verbose(env, "R%d leaks addr into helper function\n",
+				regno);
 			return -EACCES;
 		}
 		return 0;
@@ -1405,7 +1422,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
 	if (type_is_pkt_pointer(type) &&
 	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
-		verbose("helper access to the packet is not allowed\n");
+		verbose(env, "helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
 
@@ -1443,7 +1460,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
 	} else {
-		verbose("unsupported arg_type %d\n", arg_type);
+		verbose(env, "unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
 	}
 
@@ -1461,7 +1478,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			 * we have to check map_key here. Otherwise it means
 			 * that kernel subsystem misconfigured verifier
 			 */
-			verbose("invalid map_ptr to access map->key\n");
+			verbose(env, "invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
 		if (type_is_pkt_pointer(type))
@@ -1477,7 +1494,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (!meta->map_ptr) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("invalid map_ptr to access map->value\n");
+			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
 		if (type_is_pkt_pointer(type))
@@ -1497,7 +1514,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (regno == 0) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("ARG_CONST_SIZE cannot be first argument\n");
+			verbose(env,
+				"ARG_CONST_SIZE cannot be first argument\n");
 			return -EACCES;
 		}
 
@@ -1514,7 +1532,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			meta = NULL;
 
 		if (reg->smin_value < 0) {
-			verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+			verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
 				regno);
 			return -EACCES;
 		}
@@ -1528,7 +1546,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		}
 
 		if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-			verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+			verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
 				regno);
 			return -EACCES;
 		}
@@ -1539,12 +1557,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
 	return err;
 err_type:
-	verbose("R%d type=%s expected=%s\n", regno,
+	verbose(env, "R%d type=%s expected=%s\n", regno,
 		reg_type_str[type], reg_type_str[expected_type]);
 	return -EACCES;
 }
 
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+					struct bpf_map *map, int func_id)
 {
 	if (!map)
 		return 0;
@@ -1632,7 +1651,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 
 	return 0;
 error:
-	verbose("cannot pass map_type %d into func %s#%d\n",
+	verbose(env, "cannot pass map_type %d into func %s#%d\n",
 		map->map_type, func_id_name(func_id), func_id);
 	return -EINVAL;
 }
@@ -1666,7 +1685,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (reg_is_pkt_pointer_any(&regs[i]))
-			mark_reg_unknown(regs, i);
+			mark_reg_unknown(env, regs, i);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1688,7 +1707,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
 	/* find function prototype */
 	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
-		verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+		verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+			func_id);
 		return -EINVAL;
 	}
 
@@ -1696,13 +1716,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		fn = env->prog->aux->ops->get_func_proto(func_id);
 
 	if (!fn) {
-		verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+			func_id);
 		return -EINVAL;
 	}
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
 	if (!env->prog->gpl_compatible && fn->gpl_only) {
-		verbose("cannot call GPL only function from proprietary program\n");
+		verbose(env, "cannot call GPL only function from proprietary program\n");
 		return -EINVAL;
 	}
 
@@ -1716,7 +1737,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	 */
 	err = check_raw_mode(fn);
 	if (err) {
-		verbose("kernel subsystem misconfigured func %s#%d\n",
+		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
 			func_id_name(func_id), func_id);
 		return err;
 	}
@@ -1749,14 +1770,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
-		mark_reg_not_init(regs, caller_saved[i]);
+		mark_reg_not_init(env, regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
 	/* update return register (already marked as written above) */
 	if (fn->ret_type == RET_INTEGER) {
 		/* sets type to SCALAR_VALUE */
-		mark_reg_unknown(regs, BPF_REG_0);
+		mark_reg_unknown(env, regs, BPF_REG_0);
 	} else if (fn->ret_type == RET_VOID) {
 		regs[BPF_REG_0].type = NOT_INIT;
 	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1764,14 +1785,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
 		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
 		/* There is no offset yet applied, variable or fixed */
-		mark_reg_known_zero(regs, BPF_REG_0);
+		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].off = 0;
 		/* remember map_ptr, so that check_map_access()
 		 * can check 'value_size' boundary of memory access
 		 * to map element returned from bpf_map_lookup_elem()
 		 */
 		if (meta.map_ptr == NULL) {
-			verbose("kernel subsystem misconfigured verifier\n");
+			verbose(env,
+				"kernel subsystem misconfigured verifier\n");
 			return -EINVAL;
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1782,12 +1804,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		else if (insn_aux->map_ptr != meta.map_ptr)
 			insn_aux->map_ptr = BPF_MAP_PTR_POISON;
 	} else {
-		verbose("unknown return type %d of func %s#%d\n",
+		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
 		return -EINVAL;
 	}
 
-	err = check_map_func_compatibility(meta.map_ptr, func_id);
+	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
 
@@ -1846,39 +1868,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	dst_reg = &regs[dst];
 
 	if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: known but bad sbounds\n");
+		print_verifier_state(env, &env->cur_state);
+		verbose(env,
+			"verifier internal error: known but bad sbounds\n");
 		return -EINVAL;
 	}
 	if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: known but bad ubounds\n");
+		print_verifier_state(env, &env->cur_state);
+		verbose(env,
+			"verifier internal error: known but bad ubounds\n");
 		return -EINVAL;
 	}
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
 		/* 32-bit ALU ops on pointers produce (meaningless) scalars */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d 32-bit pointer arithmetic prohibited\n",
+			verbose(env,
+				"R%d 32-bit pointer arithmetic prohibited\n",
 				dst);
 		return -EACCES;
 	}
 
 	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+			verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
 				dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == CONST_PTR_TO_MAP) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+			verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
 				dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == PTR_TO_PACKET_END) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+			verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
 				dst);
 		return -EACCES;
 	}
@@ -1943,7 +1968,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		if (dst_reg == off_reg) {
 			/* scalar -= pointer.  Creates an unknown scalar */
 			if (!env->allow_ptr_leaks)
-				verbose("R%d tried to subtract pointer from scalar\n",
+				verbose(env, "R%d tried to subtract pointer from scalar\n",
 					dst);
 			return -EACCES;
 		}
@@ -1953,7 +1978,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		if (ptr_reg->type == PTR_TO_STACK) {
 			if (!env->allow_ptr_leaks)
-				verbose("R%d subtraction from stack pointer prohibited\n",
+				verbose(env, "R%d subtraction from stack pointer prohibited\n",
 					dst);
 			return -EACCES;
 		}
@@ -2008,13 +2033,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 * ptr &= ~3 which would reduce min_value by 3.)
 		 */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d bitwise operator %s on pointer prohibited\n",
+			verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
 				dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	default:
 		/* other operators (e.g. MUL,LSH) produce non-pointer results */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic with %s operator prohibited\n",
+			verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
 				dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	}
@@ -2180,7 +2205,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			/* Shifts greater than 63 are undefined.  This includes
 			 * shifts by a negative number.
 			 */
-			mark_reg_unknown(regs, insn->dst_reg);
+			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
 		}
 		/* We lose all sign bit information (except what we can pick
@@ -2208,7 +2233,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			/* Shifts greater than 63 are undefined.  This includes
 			 * shifts by a negative number.
 			 */
-			mark_reg_unknown(regs, insn->dst_reg);
+			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
 		}
 		/* BPF_RSH is an unsigned shift, so make the appropriate casts */
@@ -2236,7 +2261,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		__update_reg_bounds(dst_reg);
 		break;
 	default:
-		mark_reg_unknown(regs, insn->dst_reg);
+		mark_reg_unknown(env, regs, insn->dst_reg);
 		break;
 	}
 
@@ -2268,12 +2293,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				 * an arbitrary scalar.
 				 */
 				if (!env->allow_ptr_leaks) {
-					verbose("R%d pointer %s pointer prohibited\n",
+					verbose(env, "R%d pointer %s pointer prohibited\n",
 						insn->dst_reg,
 						bpf_alu_string[opcode >> 4]);
 					return -EACCES;
 				}
-				mark_reg_unknown(regs, insn->dst_reg);
+				mark_reg_unknown(env, regs, insn->dst_reg);
 				return 0;
 			} else {
 				/* scalar += pointer
@@ -2325,13 +2350,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
 	/* Got here implies adding two SCALAR_VALUEs */
 	if (WARN_ON_ONCE(ptr_reg)) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: unexpected ptr_reg\n");
+		print_verifier_state(env, &env->cur_state);
+		verbose(env, "verifier internal error: unexpected ptr_reg\n");
 		return -EINVAL;
 	}
 	if (WARN_ON(!src_reg)) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: no src_reg\n");
+		print_verifier_state(env, &env->cur_state);
+		verbose(env, "verifier internal error: no src_reg\n");
 		return -EINVAL;
 	}
 	return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2349,14 +2374,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			if (BPF_SRC(insn->code) != 0 ||
 			    insn->src_reg != BPF_REG_0 ||
 			    insn->off != 0 || insn->imm != 0) {
-				verbose("BPF_NEG uses reserved fields\n");
+				verbose(env, "BPF_NEG uses reserved fields\n");
 				return -EINVAL;
 			}
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
 			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
 			    BPF_CLASS(insn->code) == BPF_ALU64) {
-				verbose("BPF_END uses reserved fields\n");
+				verbose(env, "BPF_END uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2367,7 +2392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			return err;
 
 		if (is_pointer_value(env, insn->dst_reg)) {
-			verbose("R%d pointer arithmetic prohibited\n",
+			verbose(env, "R%d pointer arithmetic prohibited\n",
 				insn->dst_reg);
 			return -EACCES;
 		}
@@ -2381,7 +2406,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		if (BPF_SRC(insn->code) == BPF_X) {
 			if (insn->imm != 0 || insn->off != 0) {
-				verbose("BPF_MOV uses reserved fields\n");
+				verbose(env, "BPF_MOV uses reserved fields\n");
 				return -EINVAL;
 			}
 
@@ -2391,7 +2416,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				return err;
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-				verbose("BPF_MOV uses reserved fields\n");
+				verbose(env, "BPF_MOV uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2411,11 +2436,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
-					verbose("R%d partial copy of pointer\n",
+					verbose(env,
+						"R%d partial copy of pointer\n",
 						insn->src_reg);
 					return -EACCES;
 				}
-				mark_reg_unknown(regs, insn->dst_reg);
+				mark_reg_unknown(env, regs, insn->dst_reg);
 				/* high 32 bits are known zero. */
 				regs[insn->dst_reg].var_off = tnum_cast(
 						regs[insn->dst_reg].var_off, 4);
@@ -2430,14 +2456,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		}
 
 	} else if (opcode > BPF_END) {
-		verbose("invalid BPF_ALU opcode %x\n", opcode);
+		verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
 		return -EINVAL;
 
 	} else {	/* all other ALU ops: and, sub, xor, add, ... */
 
 		if (BPF_SRC(insn->code) == BPF_X) {
 			if (insn->imm != 0 || insn->off != 0) {
-				verbose("BPF_ALU uses reserved fields\n");
+				verbose(env, "BPF_ALU uses reserved fields\n");
 				return -EINVAL;
 			}
 			/* check src1 operand */
@@ -2446,7 +2472,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				return err;
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-				verbose("BPF_ALU uses reserved fields\n");
+				verbose(env, "BPF_ALU uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2458,7 +2484,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
 		    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
-			verbose("div by zero\n");
+			verbose(env, "div by zero\n");
 			return -EINVAL;
 		}
 
@@ -2467,7 +2493,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
 
 			if (insn->imm < 0 || insn->imm >= size) {
-				verbose("invalid shift %d\n", insn->imm);
+				verbose(env, "invalid shift %d\n", insn->imm);
 				return -EINVAL;
 			}
 		}
@@ -2820,13 +2846,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	int err;
 
 	if (opcode > BPF_JSLE) {
-		verbose("invalid BPF_JMP opcode %x\n", opcode);
+		verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
 		return -EINVAL;
 	}
 
 	if (BPF_SRC(insn->code) == BPF_X) {
 		if (insn->imm != 0) {
-			verbose("BPF_JMP uses reserved fields\n");
+			verbose(env, "BPF_JMP uses reserved fields\n");
 			return -EINVAL;
 		}
 
@@ -2836,13 +2862,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			return err;
 
 		if (is_pointer_value(env, insn->src_reg)) {
-			verbose("R%d pointer comparison prohibited\n",
+			verbose(env, "R%d pointer comparison prohibited\n",
 				insn->src_reg);
 			return -EACCES;
 		}
 	} else {
 		if (insn->src_reg != BPF_REG_0) {
-			verbose("BPF_JMP uses reserved fields\n");
+			verbose(env, "BPF_JMP uses reserved fields\n");
 			return -EINVAL;
 		}
 	}
@@ -2954,11 +2980,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
 				       PTR_TO_PACKET_META);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
-		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+		verbose(env, "R%d pointer comparison prohibited\n",
+			insn->dst_reg);
 		return -EACCES;
 	}
-	if (verifier_log.level)
-		print_verifier_state(this_branch);
+	if (env->log.level)
+		print_verifier_state(env, this_branch);
 	return 0;
 }
 
@@ -2977,11 +3004,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	int err;
 
 	if (BPF_SIZE(insn->code) != BPF_DW) {
-		verbose("invalid BPF_LD_IMM insn\n");
+		verbose(env, "invalid BPF_LD_IMM insn\n");
 		return -EINVAL;
 	}
 	if (insn->off != 0) {
-		verbose("BPF_LD_IMM64 uses reserved fields\n");
+		verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -3039,14 +3066,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	int i, err;
 
 	if (!may_access_skb(env->prog->type)) {
-		verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+		verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
 		return -EINVAL;
 	}
 
 	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
 	    BPF_SIZE(insn->code) == BPF_DW ||
 	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
-		verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+		verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -3056,7 +3083,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return err;
 
 	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
-		verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+		verbose(env,
+			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
 		return -EINVAL;
 	}
 
@@ -3069,7 +3097,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 	/* reset caller saved regs to unreadable */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
-		mark_reg_not_init(regs, caller_saved[i]);
+		mark_reg_not_init(env, regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
@@ -3077,7 +3105,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * the value fetched from the packet.
 	 * Already marked as written above.
 	 */
-	mark_reg_unknown(regs, BPF_REG_0);
+	mark_reg_unknown(env, regs, BPF_REG_0);
 	return 0;
 }
 
@@ -3097,22 +3125,22 @@ static int check_return_code(struct bpf_verifier_env *env)
 
 	reg = &env->cur_state.regs[BPF_REG_0];
 	if (reg->type != SCALAR_VALUE) {
-		verbose("At program exit the register R0 is not a known value (%s)\n",
+		verbose(env, "At program exit the register R0 is not a known value (%s)\n",
 			reg_type_str[reg->type]);
 		return -EINVAL;
 	}
 
 	if (!tnum_in(range, reg->var_off)) {
-		verbose("At program exit the register R0 ");
+		verbose(env, "At program exit the register R0 ");
 		if (!tnum_is_unknown(reg->var_off)) {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose("has value %s", tn_buf);
+			verbose(env, "has value %s", tn_buf);
 		} else {
-			verbose("has unknown scalar value");
+			verbose(env, "has unknown scalar value");
 		}
-		verbose(" should have been 0 or 1\n");
+		verbose(env, " should have been 0 or 1\n");
 		return -EINVAL;
 	}
 	return 0;
@@ -3178,7 +3206,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		return 0;
 
 	if (w < 0 || w >= env->prog->len) {
-		verbose("jump out of range from insn %d to %d\n", t, w);
+		verbose(env, "jump out of range from insn %d to %d\n", t, w);
 		return -EINVAL;
 	}
 
@@ -3195,13 +3223,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		insn_stack[cur_stack++] = w;
 		return 1;
 	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
-		verbose("back-edge from insn %d to %d\n", t, w);
+		verbose(env, "back-edge from insn %d to %d\n", t, w);
 		return -EINVAL;
 	} else if (insn_state[w] == EXPLORED) {
 		/* forward- or cross-edge */
 		insn_state[t] = DISCOVERED | e;
 	} else {
-		verbose("insn state internal bug\n");
+		verbose(env, "insn state internal bug\n");
 		return -EFAULT;
 	}
 	return 0;
@@ -3295,7 +3323,7 @@ peek_stack:
 mark_explored:
 	insn_state[t] = EXPLORED;
 	if (cur_stack-- <= 0) {
-		verbose("pop stack internal bug\n");
+		verbose(env, "pop stack internal bug\n");
 		ret = -EFAULT;
 		goto err_free;
 	}
@@ -3304,7 +3332,7 @@ mark_explored:
 check_state:
 	for (i = 0; i < insn_cnt; i++) {
 		if (insn_state[i] != EXPLORED) {
-			verbose("unreachable insn %d\n", i);
+			verbose(env, "unreachable insn %d\n", i);
 			ret = -EINVAL;
 			goto err_free;
 		}
@@ -3685,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env)
 	int insn_processed = 0;
 	bool do_print_state = false;
 
-	init_reg_state(regs);
+	init_reg_state(env, regs);
 	state->parent = NULL;
 	insn_idx = 0;
 	for (;;) {
@@ -3694,7 +3722,7 @@ static int do_check(struct bpf_verifier_env *env)
 		int err;
 
 		if (insn_idx >= insn_cnt) {
-			verbose("invalid insn idx %d insn_cnt %d\n",
+			verbose(env, "invalid insn idx %d insn_cnt %d\n",
 				insn_idx, insn_cnt);
 			return -EFAULT;
 		}
@@ -3703,7 +3731,8 @@ static int do_check(struct bpf_verifier_env *env)
 		class = BPF_CLASS(insn->code);
 
 		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
-			verbose("BPF program is too large. Processed %d insn\n",
+			verbose(env,
+				"BPF program is too large. Processed %d insn\n",
 				insn_processed);
 			return -E2BIG;
 		}
@@ -3713,12 +3742,12 @@ static int do_check(struct bpf_verifier_env *env)
 			return err;
 		if (err == 1) {
 			/* found equivalent state, can prune the search */
-			if (verifier_log.level) {
+			if (env->log.level) {
 				if (do_print_state)
-					verbose("\nfrom %d to %d: safe\n",
+					verbose(env, "\nfrom %d to %d: safe\n",
 						prev_insn_idx, insn_idx);
 				else
-					verbose("%d: safe\n", insn_idx);
+					verbose(env, "%d: safe\n", insn_idx);
 			}
 			goto process_bpf_exit;
 		}
@@ -3726,19 +3755,18 @@ static int do_check(struct bpf_verifier_env *env)
 		if (need_resched())
 			cond_resched();
 
-		if (verifier_log.level > 1 ||
-		    (verifier_log.level && do_print_state)) {
-			if (verifier_log.level > 1)
-				verbose("%d:", insn_idx);
+		if (env->log.level > 1 || (env->log.level && do_print_state)) {
+			if (env->log.level > 1)
+				verbose(env, "%d:", insn_idx);
 			else
-				verbose("\nfrom %d to %d:",
+				verbose(env, "\nfrom %d to %d:",
 					prev_insn_idx, insn_idx);
-			print_verifier_state(&env->cur_state);
+			print_verifier_state(env, &env->cur_state);
 			do_print_state = false;
 		}
 
-		if (verifier_log.level) {
-			verbose("%d: ", insn_idx);
+		if (env->log.level) {
+			verbose(env, "%d: ", insn_idx);
 			print_bpf_insn(env, insn);
 		}
 
@@ -3795,7 +3823,7 @@ static int do_check(struct bpf_verifier_env *env)
 				 * src_reg == stack|map in some other branch.
 				 * Reject it.
 				 */
-				verbose("same insn cannot be used with different pointers\n");
+				verbose(env, "same insn cannot be used with different pointers\n");
 				return -EINVAL;
 			}
 
@@ -3835,14 +3863,14 @@ static int do_check(struct bpf_verifier_env *env)
 			} else if (dst_reg_type != *prev_dst_type &&
 				   (dst_reg_type == PTR_TO_CTX ||
 				    *prev_dst_type == PTR_TO_CTX)) {
-				verbose("same insn cannot be used with different pointers\n");
+				verbose(env, "same insn cannot be used with different pointers\n");
 				return -EINVAL;
 			}
 
 		} else if (class == BPF_ST) {
 			if (BPF_MODE(insn->code) != BPF_MEM ||
 			    insn->src_reg != BPF_REG_0) {
-				verbose("BPF_ST uses reserved fields\n");
+				verbose(env, "BPF_ST uses reserved fields\n");
 				return -EINVAL;
 			}
 			/* check src operand */
@@ -3865,7 +3893,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->off != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_CALL uses reserved fields\n");
+					verbose(env, "BPF_CALL uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3878,7 +3906,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->imm != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_JA uses reserved fields\n");
+					verbose(env, "BPF_JA uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3890,7 +3918,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->imm != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_EXIT uses reserved fields\n");
+					verbose(env, "BPF_EXIT uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3905,7 +3933,7 @@ static int do_check(struct bpf_verifier_env *env)
 					return err;
 
 				if (is_pointer_value(env, BPF_REG_0)) {
-					verbose("R0 leaks addr as return value\n");
+					verbose(env, "R0 leaks addr as return value\n");
 					return -EACCES;
 				}
 
@@ -3940,19 +3968,19 @@ process_bpf_exit:
 
 				insn_idx++;
 			} else {
-				verbose("invalid BPF_LD mode\n");
+				verbose(env, "invalid BPF_LD mode\n");
 				return -EINVAL;
 			}
 		} else {
-			verbose("unknown insn class %d\n", class);
+			verbose(env, "unknown insn class %d\n", class);
 			return -EINVAL;
 		}
 
 		insn_idx++;
 	}
 
-	verbose("processed %d insns, stack depth %d\n",
-		insn_processed, env->prog->aux->stack_depth);
+	verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+		env->prog->aux->stack_depth);
 	return 0;
 }
 
@@ -3964,7 +3992,8 @@ static int check_map_prealloc(struct bpf_map *map)
 		!(map->map_flags & BPF_F_NO_PREALLOC);
 }
 
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+					struct bpf_map *map,
 					struct bpf_prog *prog)
 
 {
@@ -3975,12 +4004,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
 	 */
 	if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
 		if (!check_map_prealloc(map)) {
-			verbose("perf_event programs can only use preallocated hash map\n");
+			verbose(env, "perf_event programs can only use preallocated hash map\n");
 			return -EINVAL;
 		}
 		if (map->inner_map_meta &&
 		    !check_map_prealloc(map->inner_map_meta)) {
-			verbose("perf_event programs can only use preallocated inner hash map\n");
+			verbose(env, "perf_event programs can only use preallocated inner hash map\n");
 			return -EINVAL;
 		}
 	}
@@ -4003,14 +4032,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (BPF_CLASS(insn->code) == BPF_LDX &&
 		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
-			verbose("BPF_LDX uses reserved fields\n");
+			verbose(env, "BPF_LDX uses reserved fields\n");
 			return -EINVAL;
 		}
 
 		if (BPF_CLASS(insn->code) == BPF_STX &&
 		    ((BPF_MODE(insn->code) != BPF_MEM &&
 		      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
-			verbose("BPF_STX uses reserved fields\n");
+			verbose(env, "BPF_STX uses reserved fields\n");
 			return -EINVAL;
 		}
 
@@ -4021,7 +4050,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			if (i == insn_cnt - 1 || insn[1].code != 0 ||
 			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
 			    insn[1].off != 0) {
-				verbose("invalid bpf_ld_imm64 insn\n");
+				verbose(env, "invalid bpf_ld_imm64 insn\n");
 				return -EINVAL;
 			}
 
@@ -4030,19 +4059,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 				goto next_insn;
 
 			if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
-				verbose("unrecognized bpf_ld_imm64 insn\n");
+				verbose(env,
+					"unrecognized bpf_ld_imm64 insn\n");
 				return -EINVAL;
 			}
 
 			f = fdget(insn->imm);
 			map = __bpf_map_get(f);
 			if (IS_ERR(map)) {
-				verbose("fd %d is not pointing to valid bpf_map\n",
+				verbose(env, "fd %d is not pointing to valid bpf_map\n",
 					insn->imm);
 				return PTR_ERR(map);
 			}
 
-			err = check_map_prog_compatibility(map, env->prog);
+			err = check_map_prog_compatibility(env, map, env->prog);
 			if (err) {
 				fdput(f);
 				return err;
@@ -4164,7 +4194,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
 					env->prog);
 		if (cnt >= ARRAY_SIZE(insn_buf)) {
-			verbose("bpf verifier is misconfigured\n");
+			verbose(env, "bpf verifier is misconfigured\n");
 			return -EINVAL;
 		} else if (cnt) {
 			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4212,7 +4242,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			u8 size_code;
 
 			if (type == BPF_WRITE) {
-				verbose("bpf verifier narrow ctx access misconfigured\n");
+				verbose(env, "bpf verifier narrow ctx access misconfigured\n");
 				return -EINVAL;
 			}
 
@@ -4231,7 +4261,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 					      &target_size);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
 		    (ctx_field_size && !target_size)) {
-			verbose("bpf verifier is misconfigured\n");
+			verbose(env, "bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
 
@@ -4313,7 +4343,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 
 			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
 			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
-				verbose("bpf verifier is misconfigured\n");
+				verbose(env, "bpf verifier is misconfigured\n");
 				return -EINVAL;
 			}
 
@@ -4357,7 +4387,8 @@ patch_call_imm:
 		 * programs to call them, must be real in-kernel functions
 		 */
 		if (!fn->func) {
-			verbose("kernel subsystem misconfigured func %s#%d\n",
+			verbose(env,
+				"kernel subsystem misconfigured func %s#%d\n",
 				func_id_name(insn->imm), insn->imm);
 			return -EFAULT;
 		}
@@ -4391,8 +4422,8 @@ static void free_states(struct bpf_verifier_env *env)
 
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 {
-	struct bpf_verifer_log *log = &verifier_log;
 	struct bpf_verifier_env *env;
+	struct bpf_verifer_log *log;
 	int ret = -EINVAL;
 
 	/* 'struct bpf_verifier_env' can be global, but since it's not small,
@@ -4401,6 +4432,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
 	if (!env)
 		return -ENOMEM;
+	log = &env->log;
 
 	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
 				     (*prog)->len);
@@ -4419,7 +4451,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		log->level = attr->log_level;
 		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
 		log->len_total = attr->log_size;
-		log->len_used = 0;
 
 		ret = -EINVAL;
 		/* log attributes have to be sane */
@@ -4431,8 +4462,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		log->kbuf = vmalloc(log->len_total);
 		if (!log->kbuf)
 			goto err_unlock;
-	} else {
-		log->level = 0;
 	}
 
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4543,8 +4572,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
 
-	verifier_log.level = 0;
-
 	env->strict_alignment = false;
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
-- 
cgit v1.2.3


From f4ac7e0b5cc8d16004ac57ff679266d573f30f77 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:12 -0700
Subject: bpf: move instruction printing into a separate file

Separate the instruction printing into a standalone source file.
This way sneaky code from tools/ can compile it in directly.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/Makefile   |   1 +
 kernel/bpf/disasm.c   | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/disasm.h   |  32 ++++++++
 kernel/bpf/verifier.c | 202 +----------------------------------------------
 4 files changed, 251 insertions(+), 198 deletions(-)
 create mode 100644 kernel/bpf/disasm.c
 create mode 100644 kernel/bpf/disasm.h

(limited to 'kernel')

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 897daa005b23..53fb09f92e3f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,6 +2,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..e682850c9715
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+		return func_id_str[id];
+	else
+		return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+	[BPF_LD]    = "ld",
+	[BPF_LDX]   = "ldx",
+	[BPF_ST]    = "st",
+	[BPF_STX]   = "stx",
+	[BPF_ALU]   = "alu",
+	[BPF_JMP]   = "jmp",
+	[BPF_RET]   = "BUG",
+	[BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+	[BPF_ADD >> 4]  = "+=",
+	[BPF_SUB >> 4]  = "-=",
+	[BPF_MUL >> 4]  = "*=",
+	[BPF_DIV >> 4]  = "/=",
+	[BPF_OR  >> 4]  = "|=",
+	[BPF_AND >> 4]  = "&=",
+	[BPF_LSH >> 4]  = "<<=",
+	[BPF_RSH >> 4]  = ">>=",
+	[BPF_NEG >> 4]  = "neg",
+	[BPF_MOD >> 4]  = "%=",
+	[BPF_XOR >> 4]  = "^=",
+	[BPF_MOV >> 4]  = "=",
+	[BPF_ARSH >> 4] = "s>>=",
+	[BPF_END >> 4]  = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+	[BPF_W >> 3]  = "u32",
+	[BPF_H >> 3]  = "u16",
+	[BPF_B >> 3]  = "u8",
+	[BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+	[BPF_JA >> 4]   = "jmp",
+	[BPF_JEQ >> 4]  = "==",
+	[BPF_JGT >> 4]  = ">",
+	[BPF_JLT >> 4]  = "<",
+	[BPF_JGE >> 4]  = ">=",
+	[BPF_JLE >> 4]  = "<=",
+	[BPF_JSET >> 4] = "&",
+	[BPF_JNE >> 4]  = "!=",
+	[BPF_JSGT >> 4] = "s>",
+	[BPF_JSLT >> 4] = "s<",
+	[BPF_JSGE >> 4] = "s>=",
+	[BPF_JSLE >> 4] = "s<=",
+	[BPF_CALL >> 4] = "call",
+	[BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+			       struct bpf_verifier_env *env,
+			       const struct bpf_insn *insn)
+{
+	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+		insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+	u8 class = BPF_CLASS(insn->code);
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (BPF_OP(insn->code) == BPF_END) {
+			if (class == BPF_ALU64)
+				verbose(env, "BUG_alu64_%02x\n", insn->code);
+			else
+				print_bpf_end_insn(verbose, env, insn);
+		} else if (BPF_OP(insn->code) == BPF_NEG) {
+			verbose(env, "(%02x) r%d = %s-r%d\n",
+				insn->code, insn->dst_reg,
+				class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose(env, "(%02x) %sr%d %s %sr%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->src_reg);
+		} else {
+			verbose(env, "(%02x) %sr%d %s %s%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->imm);
+		}
+	} else if (class == BPF_STX) {
+		if (BPF_MODE(insn->code) == BPF_MEM)
+			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg,
+				insn->off, insn->src_reg);
+		else if (BPF_MODE(insn->code) == BPF_XADD)
+			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off,
+				insn->src_reg);
+		else
+			verbose(env, "BUG_%02x\n", insn->code);
+	} else if (class == BPF_ST) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose(env, "BUG_st_%02x\n", insn->code);
+			return;
+		}
+		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+			insn->code,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->dst_reg,
+			insn->off, insn->imm);
+	} else if (class == BPF_LDX) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose(env, "BUG_ldx_%02x\n", insn->code);
+			return;
+		}
+		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+			insn->code, insn->dst_reg,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->src_reg, insn->off);
+	} else if (class == BPF_LD) {
+		if (BPF_MODE(insn->code) == BPF_ABS) {
+			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IND) {
+			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->src_reg, insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IMM &&
+			   BPF_SIZE(insn->code) == BPF_DW) {
+			/* At this point, we already made sure that the second
+			 * part of the ldimm64 insn is accessible.
+			 */
+			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+			if (map_ptr && !allow_ptr_leaks)
+				imm = 0;
+
+			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+				insn->dst_reg, (unsigned long long)imm);
+		} else {
+			verbose(env, "BUG_ld_%02x\n", insn->code);
+			return;
+		}
+	} else if (class == BPF_JMP) {
+		u8 opcode = BPF_OP(insn->code);
+
+		if (opcode == BPF_CALL) {
+			verbose(env, "(%02x) call %s#%d\n", insn->code,
+				func_id_name(insn->imm), insn->imm);
+		} else if (insn->code == (BPF_JMP | BPF_JA)) {
+			verbose(env, "(%02x) goto pc%+d\n",
+				insn->code, insn->off);
+		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+			verbose(env, "(%02x) exit\n", insn->code);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->src_reg, insn->off);
+		} else {
+			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->imm, insn->off);
+		}
+	} else {
+		verbose(env, "(%02x) %s\n",
+			insn->code, bpf_class_string[class]);
+	}
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..8de977e420b6
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+				  const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a352f93cd4b2..274c6582ec39 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,8 @@
 #include <linux/vmalloc.h>
 #include <linux/stringify.h>
 
+#include "disasm.h"
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -194,22 +196,6 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
-	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
-	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
-		return func_id_str[id];
-	else
-		return "unknown";
-}
-
 static void print_verifier_state(struct bpf_verifier_env *env,
 				 struct bpf_verifier_state *state)
 {
@@ -278,187 +264,6 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 	verbose(env, "\n");
 }
 
-static const char *const bpf_class_string[] = {
-	[BPF_LD]    = "ld",
-	[BPF_LDX]   = "ldx",
-	[BPF_ST]    = "st",
-	[BPF_STX]   = "stx",
-	[BPF_ALU]   = "alu",
-	[BPF_JMP]   = "jmp",
-	[BPF_RET]   = "BUG",
-	[BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
-	[BPF_ADD >> 4]  = "+=",
-	[BPF_SUB >> 4]  = "-=",
-	[BPF_MUL >> 4]  = "*=",
-	[BPF_DIV >> 4]  = "/=",
-	[BPF_OR  >> 4]  = "|=",
-	[BPF_AND >> 4]  = "&=",
-	[BPF_LSH >> 4]  = "<<=",
-	[BPF_RSH >> 4]  = ">>=",
-	[BPF_NEG >> 4]  = "neg",
-	[BPF_MOD >> 4]  = "%=",
-	[BPF_XOR >> 4]  = "^=",
-	[BPF_MOV >> 4]  = "=",
-	[BPF_ARSH >> 4] = "s>>=",
-	[BPF_END >> 4]  = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
-	[BPF_W >> 3]  = "u32",
-	[BPF_H >> 3]  = "u16",
-	[BPF_B >> 3]  = "u8",
-	[BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
-	[BPF_JA >> 4]   = "jmp",
-	[BPF_JEQ >> 4]  = "==",
-	[BPF_JGT >> 4]  = ">",
-	[BPF_JLT >> 4]  = "<",
-	[BPF_JGE >> 4]  = ">=",
-	[BPF_JLE >> 4]  = "<=",
-	[BPF_JSET >> 4] = "&",
-	[BPF_JNE >> 4]  = "!=",
-	[BPF_JSGT >> 4] = "s>",
-	[BPF_JSLT >> 4] = "s<",
-	[BPF_JSGE >> 4] = "s>=",
-	[BPF_JSLE >> 4] = "s<=",
-	[BPF_CALL >> 4] = "call",
-	[BPF_EXIT >> 4] = "exit",
-};
-
-static void print_bpf_end_insn(struct bpf_verifier_env *env,
-			       const struct bpf_insn *insn)
-{
-	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
-		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
-		insn->imm, insn->dst_reg);
-}
-
-static void print_bpf_insn(struct bpf_verifier_env *env,
-			   const struct bpf_insn *insn)
-{
-	u8 class = BPF_CLASS(insn->code);
-
-	if (class == BPF_ALU || class == BPF_ALU64) {
-		if (BPF_OP(insn->code) == BPF_END) {
-			if (class == BPF_ALU64)
-				verbose(env, "BUG_alu64_%02x\n", insn->code);
-			else
-				print_bpf_end_insn(env, insn);
-		} else if (BPF_OP(insn->code) == BPF_NEG) {
-			verbose(env, "(%02x) r%d = %s-r%d\n",
-				insn->code, insn->dst_reg,
-				class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg);
-		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose(env, "(%02x) %sr%d %s %sr%d\n",
-				insn->code, class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg,
-				bpf_alu_string[BPF_OP(insn->code) >> 4],
-				class == BPF_ALU ? "(u32) " : "",
-				insn->src_reg);
-		} else {
-			verbose(env, "(%02x) %sr%d %s %s%d\n",
-				insn->code, class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg,
-				bpf_alu_string[BPF_OP(insn->code) >> 4],
-				class == BPF_ALU ? "(u32) " : "",
-				insn->imm);
-		}
-	} else if (class == BPF_STX) {
-		if (BPF_MODE(insn->code) == BPF_MEM)
-			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->dst_reg,
-				insn->off, insn->src_reg);
-		else if (BPF_MODE(insn->code) == BPF_XADD)
-			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->dst_reg, insn->off,
-				insn->src_reg);
-		else
-			verbose(env, "BUG_%02x\n", insn->code);
-	} else if (class == BPF_ST) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose(env, "BUG_st_%02x\n", insn->code);
-			return;
-		}
-		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
-			insn->code,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->dst_reg,
-			insn->off, insn->imm);
-	} else if (class == BPF_LDX) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose(env, "BUG_ldx_%02x\n", insn->code);
-			return;
-		}
-		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
-			insn->code, insn->dst_reg,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->src_reg, insn->off);
-	} else if (class == BPF_LD) {
-		if (BPF_MODE(insn->code) == BPF_ABS) {
-			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->imm);
-		} else if (BPF_MODE(insn->code) == BPF_IND) {
-			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->src_reg, insn->imm);
-		} else if (BPF_MODE(insn->code) == BPF_IMM &&
-			   BPF_SIZE(insn->code) == BPF_DW) {
-			/* At this point, we already made sure that the second
-			 * part of the ldimm64 insn is accessible.
-			 */
-			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
-			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
-
-			if (map_ptr && !env->allow_ptr_leaks)
-				imm = 0;
-
-			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
-				insn->dst_reg, (unsigned long long)imm);
-		} else {
-			verbose(env, "BUG_ld_%02x\n", insn->code);
-			return;
-		}
-	} else if (class == BPF_JMP) {
-		u8 opcode = BPF_OP(insn->code);
-
-		if (opcode == BPF_CALL) {
-			verbose(env, "(%02x) call %s#%d\n", insn->code,
-				func_id_name(insn->imm), insn->imm);
-		} else if (insn->code == (BPF_JMP | BPF_JA)) {
-			verbose(env, "(%02x) goto pc%+d\n",
-				insn->code, insn->off);
-		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-			verbose(env, "(%02x) exit\n", insn->code);
-		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
-				insn->code, insn->dst_reg,
-				bpf_jmp_string[BPF_OP(insn->code) >> 4],
-				insn->src_reg, insn->off);
-		} else {
-			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
-				insn->code, insn->dst_reg,
-				bpf_jmp_string[BPF_OP(insn->code) >> 4],
-				insn->imm, insn->off);
-		}
-	} else {
-		verbose(env, "(%02x) %s\n",
-			insn->code, bpf_class_string[class]);
-	}
-}
-
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
 {
 	struct bpf_verifier_stack_elem *elem;
@@ -3767,7 +3572,8 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (env->log.level) {
 			verbose(env, "%d: ", insn_idx);
-			print_bpf_insn(env, insn);
+			print_bpf_insn(verbose, env, insn,
+				       env->allow_ptr_leaks);
 		}
 
 		err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
-- 
cgit v1.2.3


From a2a7d5701052542cd2260e7659b12443e0a74733 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:15 -0700
Subject: bpf: write back the verifier log buffer as it gets filled

Verifier log buffer can be quite large (up to 16MB currently).
As Eric Dumazet points out if we allow multiple verification
requests to proceed simultaneously, malicious user may use the
verifier as a way of allocating large amounts of unswappable
memory to OOM the host.

Switch to a strategy of allocating a smaller buffer (1024B)
and writing it out into the user buffer after every print.

While at it remove the old BUG_ON().

This is in preparation of the global verifier lock removal.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |  4 +++-
 kernel/bpf/verifier.c        | 41 +++++++++++++++++++----------------------
 2 files changed, 22 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5ddb9a626a51..f00ef751c1c5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -115,9 +115,11 @@ struct bpf_insn_aux_data {
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
+#define BPF_VERIFIER_TMP_LOG_SIZE	1024
+
 struct bpf_verifer_log {
 	u32 level;
-	char *kbuf;
+	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 	char __user *ubuf;
 	u32 len_used;
 	u32 len_total;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 274c6582ec39..2cdbcc4f8f6b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -165,15 +165,26 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
 				   const char *fmt, ...)
 {
 	struct bpf_verifer_log *log = &env->log;
+	unsigned int n;
 	va_list args;
 
-	if (!log->level || bpf_verifier_log_full(log))
+	if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
 		return;
 
 	va_start(args, fmt);
-	log->len_used += vscnprintf(log->kbuf + log->len_used,
-				    log->len_total - log->len_used, fmt, args);
+	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
 	va_end(args);
+
+	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+		  "verifier log line truncated - local buffer too short\n");
+
+	n = min(log->len_total - log->len_used - 1, n);
+	log->kbuf[n] = '\0';
+
+	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+		log->len_used += n;
+	else
+		log->ubuf = NULL;
 }
 
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
@@ -4263,11 +4274,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
 		    !log->level || !log->ubuf)
 			goto err_unlock;
-
-		ret = -ENOMEM;
-		log->kbuf = vmalloc(log->len_total);
-		if (!log->kbuf)
-			goto err_unlock;
 	}
 
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4304,18 +4310,11 @@ skip_full_check:
 	if (ret == 0)
 		ret = fixup_bpf_calls(env);
 
-	if (log->level && bpf_verifier_log_full(log)) {
-		BUG_ON(log->len_used >= log->len_total);
-		/* verifier log exceeded user supplied buffer */
+	if (log->level && bpf_verifier_log_full(log))
 		ret = -ENOSPC;
-		/* fall through to return what was recorded */
-	}
-
-	/* copy verifier log back to user space including trailing zero */
-	if (log->level && copy_to_user(log->ubuf, log->kbuf,
-				       log->len_used + 1) != 0) {
+	if (log->level && !log->ubuf) {
 		ret = -EFAULT;
-		goto free_log_buf;
+		goto err_release_maps;
 	}
 
 	if (ret == 0 && env->used_map_cnt) {
@@ -4326,7 +4325,7 @@ skip_full_check:
 
 		if (!env->prog->aux->used_maps) {
 			ret = -ENOMEM;
-			goto free_log_buf;
+			goto err_release_maps;
 		}
 
 		memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4339,9 +4338,7 @@ skip_full_check:
 		convert_pseudo_ld_imm64(env);
 	}
 
-free_log_buf:
-	if (log->level)
-		vfree(log->kbuf);
+err_release_maps:
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
 		 * them now. Otherwise free_bpf_prog_info() will release them.
-- 
cgit v1.2.3


From d59158162e032917a428704160a2063a02405ec6 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Tue, 10 Oct 2017 15:51:37 -0700
Subject: tracing: Add support for preempt and irq enable/disable events

Preempt and irq trace events can be used for tracing the start and
end of an atomic section which can be used by a trace viewer like
systrace to graphically view the start and end of an atomic section and
correlate them with latencies and scheduling issues.

This also serves as a prelude to using synthetic events or probes to
rewrite the preempt and irqsoff tracers, along with numerous benefits of
using trace events features for these events.
Link: http://lkml.kernel.org/r/20171006005432.14244-3-joelaf@google.com
Link: http://lkml.kernel.org/r/20171010225137.17370-1-joelaf@google.com

Cc: Peter Zilstra <peterz@infradead.org>
Cc: kernel-team@android.com
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h            |  3 +-
 include/trace/events/preemptirq.h | 70 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/Kconfig              | 11 ++++++
 kernel/trace/Makefile             |  1 +
 kernel/trace/trace_irqsoff.c      | 35 +++++++++++++++++++-
 5 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/preemptirq.h

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 346f8294e40a..1f8545caa691 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -769,7 +769,8 @@ static inline unsigned long get_lock_parent_ip(void)
   static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
 #endif
 
-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+	(defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
   extern void trace_preempt_on(unsigned long a0, unsigned long a1);
   extern void trace_preempt_off(unsigned long a0, unsigned long a1);
 #else
diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
new file mode 100644
index 000000000000..f5024c560d8f
--- /dev/null
+++ b/include/trace/events/preemptirq.h
@@ -0,0 +1,70 @@
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM preemptirq
+
+#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PREEMPTIRQ_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+#include <linux/string.h>
+#include <asm/sections.h>
+
+DECLARE_EVENT_CLASS(preemptirq_template,
+
+	TP_PROTO(unsigned long ip, unsigned long parent_ip),
+
+	TP_ARGS(ip, parent_ip),
+
+	TP_STRUCT__entry(
+		__field(u32, caller_offs)
+		__field(u32, parent_offs)
+	),
+
+	TP_fast_assign(
+		__entry->caller_offs = (u32)(ip - (unsigned long)_stext);
+		__entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
+	),
+
+	TP_printk("caller=%pF parent=%pF",
+		  (void *)((unsigned long)(_stext) + __entry->caller_offs),
+		  (void *)((unsigned long)(_stext) + __entry->parent_offs))
+);
+
+#ifndef CONFIG_PROVE_LOCKING
+DEFINE_EVENT(preemptirq_template, irq_disable,
+	     TP_PROTO(unsigned long ip, unsigned long parent_ip),
+	     TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, irq_enable,
+	     TP_PROTO(unsigned long ip, unsigned long parent_ip),
+	     TP_ARGS(ip, parent_ip));
+#endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+DEFINE_EVENT(preemptirq_template, preempt_disable,
+	     TP_PROTO(unsigned long ip, unsigned long parent_ip),
+	     TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, preempt_enable,
+	     TP_PROTO(unsigned long ip, unsigned long parent_ip),
+	     TP_ARGS(ip, parent_ip));
+#endif
+
+#endif /* _TRACE_PREEMPTIRQ_H */
+
+#include <trace/define_trace.h>
+
+#else /* !CONFIG_PREEMPTIRQ_EVENTS */
+
+#define trace_irq_enable(...)
+#define trace_irq_disable(...)
+#define trace_preempt_enable(...)
+#define trace_preempt_disable(...)
+#define trace_irq_enable_rcuidle(...)
+#define trace_irq_disable_rcuidle(...)
+#define trace_preempt_enable_rcuidle(...)
+#define trace_preempt_disable_rcuidle(...)
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..b8395a020821 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER
 	  address on the current task structure into a stack of calls.
 
 
+config PREEMPTIRQ_EVENTS
+	bool "Enable trace events for preempt and irq disable/enable"
+	select TRACE_IRQFLAGS
+	depends on DEBUG_PREEMPT || !PROVE_LOCKING
+	default n
+	help
+	  Enable tracing of disable and enable events for preemption and irqs.
+	  For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+	  enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+	  be disabled.
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 90f2701d92a7..9f62eee61f14 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
 obj-$(CONFIG_TRACING_MAP) += tracing_map.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 0e3033c00474..03ecb4465ee4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,9 @@
 
 #include "trace.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
 #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
 static struct trace_array		*irqsoff_trace __read_mostly;
 static int				tracer_enabled __read_mostly;
@@ -777,26 +780,53 @@ static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
 #endif
 
 #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
 void trace_hardirqs_on(void)
 {
+	if (!this_cpu_read(tracing_irq_cpu))
+		return;
+
+	trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
 	tracer_hardirqs_on();
+
+	this_cpu_write(tracing_irq_cpu, 0);
 }
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 void trace_hardirqs_off(void)
 {
+	if (this_cpu_read(tracing_irq_cpu))
+		return;
+
+	this_cpu_write(tracing_irq_cpu, 1);
+
+	trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
 	tracer_hardirqs_off();
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
 {
+	if (!this_cpu_read(tracing_irq_cpu))
+		return;
+
+	trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
 	tracer_hardirqs_on_caller(caller_addr);
+
+	this_cpu_write(tracing_irq_cpu, 0);
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
 __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
 {
+	if (this_cpu_read(tracing_irq_cpu))
+		return;
+
+	this_cpu_write(tracing_irq_cpu, 1);
+
+	trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
 	tracer_hardirqs_off_caller(caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_off_caller);
@@ -818,14 +848,17 @@ inline void print_irqtrace_events(struct task_struct *curr)
 }
 #endif
 
-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+	(defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
+	trace_preempt_enable_rcuidle(a0, a1);
 	tracer_preempt_on(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
+	trace_preempt_disable_rcuidle(a0, a1);
 	tracer_preempt_off(a0, a1);
 }
 #endif
-- 
cgit v1.2.3


From 8715b108cd75523c9b2e833cdcd7aeb363767f95 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Mon, 9 Oct 2017 12:29:31 -0700
Subject: ftrace: Clear hashes of stale ips of init memory

Filters should be cleared of init functions during freeing of init
memory when the ftrace dyn records are released. However in current
code, the filters are left as is. This patch clears the hashes of the
saved init functions when the init memory is freed. This fixes the
following issue reproducible with the following sequence of commands for
a test module:
================================================

void bar(void)
{
    printk(KERN_INFO "bar!\n");
}

void foo(void)
{
    printk(KERN_INFO "foo!\n");
    bar();
}

static int __init hello_init(void)
{
    printk(KERN_INFO "Hello world!\n");
    foo();
    return 0;
}

static void __exit hello_cleanup(void)
{
    printk(KERN_INFO "Cleaning up module.\n");
}

module_init(hello_init);
module_exit(hello_cleanup);
================================================

Commands:
echo '*:mod:test' > /d/tracing/set_ftrace_filter
echo function > /d/tracing/current_tracer
modprobe test
rmmod test
sleep 1
modprobe test
cat /d/tracing/set_ftrace_filter

Behavior without patch: Init function is still in the filter
Expected behavior: Shouldn't have any of the filters set

Link: http://lkml.kernel.org/r/20171009192931.56401-1-joelaf@google.com

Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e99bd55732e..e0a98225666b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6067,6 +6067,63 @@ allocate_ftrace_mod_map(struct module *mod,
 }
 #endif /* CONFIG_MODULES */
 
+struct ftrace_init_func {
+	struct list_head list;
+	unsigned long ip;
+};
+
+/* Clear any init ips from hashes */
+static void
+clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)
+{
+	struct ftrace_func_entry *entry;
+
+	if (ftrace_hash_empty(hash))
+		return;
+
+	entry = __ftrace_lookup_ip(hash, func->ip);
+
+	/*
+	 * Do not allow this rec to match again.
+	 * Yeah, it may waste some memory, but will be removed
+	 * if/when the hash is modified again.
+	 */
+	if (entry)
+		entry->ip = 0;
+}
+
+static void
+clear_func_from_hashes(struct ftrace_init_func *func)
+{
+	struct trace_array *tr;
+
+	mutex_lock(&trace_types_lock);
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		if (!tr->ops || !tr->ops->func_hash)
+			continue;
+		mutex_lock(&tr->ops->func_hash->regex_lock);
+		clear_func_from_hash(func, tr->ops->func_hash->filter_hash);
+		clear_func_from_hash(func, tr->ops->func_hash->notrace_hash);
+		mutex_unlock(&tr->ops->func_hash->regex_lock);
+	}
+	mutex_unlock(&trace_types_lock);
+}
+
+static void add_to_clear_hash_list(struct list_head *clear_list,
+				   struct dyn_ftrace *rec)
+{
+	struct ftrace_init_func *func;
+
+	func = kmalloc(sizeof(*func), GFP_KERNEL);
+	if (!func) {
+		WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n");
+		return;
+	}
+
+	func->ip = rec->ip;
+	list_add(&func->list, clear_list);
+}
+
 void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 {
 	unsigned long start = (unsigned long)(start_ptr);
@@ -6076,8 +6133,12 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 	struct dyn_ftrace *rec;
 	struct dyn_ftrace key;
 	struct ftrace_mod_map *mod_map = NULL;
+	struct ftrace_init_func *func, *func_next;
+	struct list_head clear_hash;
 	int order;
 
+	INIT_LIST_HEAD(&clear_hash);
+
 	key.ip = start;
 	key.flags = end;	/* overload flags, as it is unsigned long */
 
@@ -6102,6 +6163,9 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 		if (!rec)
 			continue;
 
+		/* rec will be cleared from hashes after ftrace_lock unlock */
+		add_to_clear_hash_list(&clear_hash, rec);
+
 		if (mod_map)
 			save_ftrace_mod_rec(mod_map, rec);
 
@@ -6123,6 +6187,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 		goto again;
 	}
 	mutex_unlock(&ftrace_lock);
+
+	list_for_each_entry_safe(func, func_next, &clear_hash, list) {
+		clear_func_from_hashes(func);
+		kfree(func);
+	}
 }
 
 void __init ftrace_free_init_mem(void)
-- 
cgit v1.2.3


From 952925dec0f276b407b2abce4aee82cba7c700c3 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 11 Oct 2017 11:56:23 +0100
Subject: bpf: remove redundant variable old_flags

Variable old_flags is being assigned but is never read; it is redundant
and can be removed.

Cleans up clang warning: Value stored to 'old_flags' is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/cgroup.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index e88abc0865d5..3db5a17fcfe8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -192,7 +192,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	struct cgroup_subsys_state *css;
 	struct bpf_prog_list *pl;
 	bool pl_was_allocated;
-	u32 old_flags;
 	int err;
 
 	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
@@ -239,7 +238,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		pl->prog = prog;
 	}
 
-	old_flags = cgrp->bpf.flags[type];
 	cgrp->bpf.flags[type] = flags;
 
 	/* allocate and recompute effective prog arrays */
-- 
cgit v1.2.3


From c5c1ea75a352c3864c4891f36155287a2ed73928 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 13 Jun 2017 13:06:59 +0200
Subject: tracing: Kconfig text fixes for CONFIG_HWLAT_TRACER

Trivial spelling fixes for Kconfig help text of config HWLAT_TRACER.

Fixes: e7c15cd8a113 ("tracing: Added hardware latency tracer")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/trace/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..f54b7b6b4a4b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -224,7 +224,7 @@ config HWLAT_TRACER
 	select GENERIC_TRACER
 	help
 	 This tracer, when enabled will create one or more kernel threads,
-	 depening on what the cpumask file is set to, which each thread
+	 depending on what the cpumask file is set to, which each thread
 	 spinning in a loop looking for interruptions caused by
 	 something other than the kernel. For example, if a
 	 System Management Interrupt (SMI) takes a noticeable amount of
@@ -239,7 +239,7 @@ config HWLAT_TRACER
 				     iteration
 
 	 A kernel thread is created that will spin with interrupts disabled
-	 for "width" microseconds in every "widow" cycle. It will not spin
+	 for "width" microseconds in every "window" cycle. It will not spin
 	 for "window - width" microseconds, where the system can
 	 continue to operate.
 
-- 
cgit v1.2.3


From af41acf8347dd6d11a2a29a11e2866ca4892d600 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 11 Oct 2017 12:46:47 -0400
Subject: printk: Remove superfluous memory barriers from printk_safe

The variable printk_safe_irq_ready is set and never cleared at system
boot up, when there's only one CPU active. It is set before other
CPUs come on line. Also, it is extremely unlikely that an NMI would
trigger this early in boot up (which I wonder why we even have this
variable at all).

Also mark the printk_safe_irq_ready as read mostly, as it is set at
system boot up, and never touched again.

Link: http://lkml.kernel.org/r/20171011124647.7781f98f@gandalf.local.home

Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/printk/printk_safe.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..724d9292d4b9 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -39,7 +39,7 @@
  * There are situations when we want to make sure that all buffers
  * were handled or when IRQs are blocked.
  */
-static int printk_safe_irq_ready;
+static int printk_safe_irq_ready __read_mostly;
 
 #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) -	\
 				sizeof(atomic_t) -			\
@@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
 /* Get flushed in a more safe context. */
 static void queue_flush_work(struct printk_safe_seq_buf *s)
 {
-	if (printk_safe_irq_ready) {
-		/* Make sure that IRQ work is really initialized. */
-		smp_rmb();
+	if (printk_safe_irq_ready)
 		irq_work_queue(&s->work);
-	}
 }
 
 /*
@@ -398,8 +395,12 @@ void __init printk_safe_init(void)
 #endif
 	}
 
-	/* Make sure that IRQ works are initialized before enabling. */
-	smp_wmb();
+	/*
+	 * In the highly unlikely event that a NMI were to trigger at
+	 * this moment. Make sure IRQ work is set up before this
+	 * variable is set.
+	 */
+	barrier();
 	printk_safe_irq_ready = 1;
 
 	/* Flush pending messages that did not have scheduled IRQ works. */
-- 
cgit v1.2.3


From c3b5b6ed1eb4f429addfd9e8e8eb39d1a38c85d0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 13 Oct 2017 16:22:20 +0200
Subject: tracing: mark trace_test_buffer as __maybe_unused

After trace_selftest_startup_sched_switch is removed, trace_test_buffer()
is only used sometimes, leading to this warning:

kernel/trace/trace_selftest.c:62:12: error: 'trace_test_buffer' defined but not used [-Werror=unused-function]

There is no simple #ifdef condition that captures well whether the
function is in fact used or not, so marking it as __maybe_unused is
probably the best way to shut up the warning. The function will then
be silently dropped when there is no user.

Link: http://lkml.kernel.org/r/20171013142227.1273469-1-arnd@arndb.de

Fixes: d8c4deee6dc6 ("tracing: Remove obsolete sched_switch tracer selftest")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 364f78abdf47..eb9ba5c1ba40 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -59,7 +59,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
  * Test the trace buffer to see if all the elements
  * are still sane.
  */
-static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
+static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
 {
 	unsigned long flags, cnt = 0;
 	int cpu, ret = 0;
-- 
cgit v1.2.3


From 1bdec44955edc22fb840f5965987d2972307dcc9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 12 Oct 2017 10:34:07 -0700
Subject: bpf: verifier: set reg_type on context accesses in second pass

Use a simplified is_valid_access() callback when verifier
is used for program analysis by non-host JITs.  This allows
us to teach the verifier about packet start and packet end
offsets for direct packet access.

We can extend the callback as needed but for most packet
processing needs there isn't much more the offloads may
require.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cdbcc4f8f6b..9755279d94cb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -813,6 +813,36 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 	return err;
 }
 
+static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off,
+				     struct bpf_insn_access_aux *info)
+{
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_XDP:
+		switch (off) {
+		case offsetof(struct xdp_buff, data):
+			info->reg_type = PTR_TO_PACKET;
+			return true;
+		case offsetof(struct xdp_buff, data_end):
+			info->reg_type = PTR_TO_PACKET_END;
+			return true;
+		}
+		return false;
+	case BPF_PROG_TYPE_SCHED_CLS:
+		switch (off) {
+		case offsetof(struct sk_buff, data):
+			info->reg_type = PTR_TO_PACKET;
+			return true;
+		case offsetof(struct sk_buff, cb) +
+		     offsetof(struct bpf_skb_data_end, data_end):
+			info->reg_type = PTR_TO_PACKET_END;
+			return true;
+		}
+		return false;
+	default:
+		return false;
+	}
+}
+
 /* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
@@ -821,12 +851,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		.reg_type = *reg_type,
 	};
 
-	/* for analyzer ctx accesses are already validated and converted */
-	if (env->analyzer_ops)
-		return 0;
-
-	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+	if (env->analyzer_ops) {
+		if (analyzer_is_valid_access(env, off, &info)) {
+			*reg_type = info.reg_type;
+			return 0;
+		}
+	} else if (env->prog->aux->ops->is_valid_access &&
+		   env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
-- 
cgit v1.2.3


From 9185a610f8f7f1b4e4d28c9de27d1969cf58e0f1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 12 Oct 2017 18:40:02 -0400
Subject: tracing: bpf: Hide bpf trace events when they are not used

All the trace events defined in include/trace/events/bpf.h are only
used when CONFIG_BPF_SYSCALL is defined. But this file gets included by
include/linux/bpf_trace.h which is included by the networking code with
CREATE_TRACE_POINTS defined.

If a trace event is created but not used it still has data structures
and functions created for its use, even though nothing is using them.
To not waste space, do not define the BPF trace events in bpf.h unless
CONFIG_BPF_SYSCALL is defined.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/bpf.h | 5 ++++-
 kernel/bpf/core.c          | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h
index 52c8425d144b..1fb58faa4a44 100644
--- a/include/trace/events/bpf.h
+++ b/include/trace/events/bpf.h
@@ -4,6 +4,9 @@
 #if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_BPF_H
 
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
+
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/fs.h>
@@ -345,7 +348,7 @@ TRACE_EVENT(bpf_map_next_key,
 		  __print_hex(__get_dynamic_array(nxt), __entry->key_len),
 		  __entry->key_trunc ? " ..." : "")
 );
-
+#endif /* CONFIG_BPF_SYSCALL */
 #endif /* _TRACE_BPF_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 248961af2421..8e7c8bf2b687 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1580,5 +1580,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
 
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
 EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
+#endif
-- 
cgit v1.2.3


From 8fd0fbbe8888f295eb34172a7e47bf7d3a0a4687 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Oct 2017 09:45:29 +0200
Subject: perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on
 ftrace:function")

Revert commit:

  75e8387685f6 ("perf/ftrace: Fix double traces of perf on ftrace:function")

The reason I instantly stumbled on that patch is that it only addresses the
ftrace situation and doesn't mention the other _5_ places that use this
interface. It doesn't explain why those don't have the problem and if not, why
their solution doesn't work for ftrace.

It doesn't, but this is just putting more duct tape on.

Link: http://lkml.kernel.org/r/20171011080224.200565770@infradead.org

Cc: Zhou Chengming <zhouchengming1@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/perf_event.h      |  2 +-
 include/linux/trace_events.h    |  4 ++--
 kernel/events/core.c            | 13 ++++---------
 kernel/trace/trace_event_perf.c |  4 +---
 kernel/trace/trace_kprobe.c     |  4 ++--
 kernel/trace/trace_syscalls.c   |  4 ++--
 kernel/trace/trace_uprobe.c     |  2 +-
 7 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e22f24ded6a..569d1b54e201 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1184,7 +1184,7 @@ extern void perf_event_init(void);
 extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
 			  struct hlist_head *head, int rctx,
-			  struct task_struct *task, struct perf_event *event);
+			  struct task_struct *task);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2e0f22298fe9..a6349b76fd39 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -507,9 +507,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
-		       struct task_struct *task, struct perf_event *event)
+		       struct task_struct *task)
 {
-	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event);
+	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..b8db80c5513b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7954,15 +7954,16 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 		}
 	}
 	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
-		      rctx, task, NULL);
+		      rctx, task);
 }
 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
 
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
-		   struct task_struct *task, struct perf_event *event)
+		   struct task_struct *task)
 {
 	struct perf_sample_data data;
+	struct perf_event *event;
 
 	struct perf_raw_record raw = {
 		.frag = {
@@ -7976,15 +7977,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 
 	perf_trace_buf_update(record, event_type);
 
-	/* Use the given event instead of the hlist */
-	if (event) {
+	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
-	} else {
-		hlist_for_each_entry_rcu(event, head, hlist_entry) {
-			if (perf_tp_event_match(event, &data, regs))
-				perf_swevent_event(event, count, &data, regs);
-		}
 	}
 
 	/*
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 13ba2d3f6a91..562fa69df5d3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -306,7 +306,6 @@ static void
 perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
-	struct perf_event *event;
 	struct ftrace_entry *entry;
 	struct hlist_head *head;
 	struct pt_regs regs;
@@ -330,9 +329,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	event = container_of(ops, struct perf_event, ftrace_ops);
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL, event);
+			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index af6134f2e597..996902a526d4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 696afe72d3b1..934b0da72679 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -622,7 +622,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 
 	perf_trace_buf_submit(rec, size, rctx,
 			      sys_data->enter_event->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -716,7 +716,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	}
 
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
-			      1, regs, head, NULL, NULL);
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b34965e62fb9..402120ba4594 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	}
 
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
  out:
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 466c81c45b650deca213fda3d0ec4761667379a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 10 Oct 2017 17:15:47 +0200
Subject: perf/ftrace: Fix function trace events

The function-trace <-> perf interface is a tad messed up. Where all
the other trace <-> perf interfaces use a single trace hook
registration and use per-cpu RCU based hlist to iterate the events,
function-trace actually needs multiple hook registrations in order to
minimize function entry patching when filters are present.

The end result is that we iterate events both on the trace hook and on
the hlist, which results in reporting events multiple times.

Since function-trace cannot use the regular scheme, fix it the other
way around, use singleton hlists.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h    |  5 +++
 kernel/trace/trace_event_perf.c | 80 +++++++++++++++++++++++++----------------
 2 files changed, 54 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index a6349b76fd39..ca4e67e466a7 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -173,6 +173,11 @@ enum trace_reg {
 	TRACE_REG_PERF_UNREGISTER,
 	TRACE_REG_PERF_OPEN,
 	TRACE_REG_PERF_CLOSE,
+	/*
+	 * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
+	 * custom action was taken and the default action is not to be
+	 * performed.
+	 */
 	TRACE_REG_PERF_ADD,
 	TRACE_REG_PERF_DEL,
 #endif
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 562fa69df5d3..e73f9ab15939 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event)
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
-
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
 
 	if (!(flags & PERF_EF_START))
 		p_event->hw.state = PERF_HES_STOPPED;
 
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
+	/*
+	 * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+	 * and we need to take the default action of enqueueing our event on
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+		struct hlist_head __percpu *pcpu_list;
+		struct hlist_head *list;
+
+		pcpu_list = tp_event->perf_events;
+		if (WARN_ON_ONCE(!pcpu_list))
+			return -EINVAL;
+
+		list = this_cpu_ptr(pcpu_list);
+		hlist_add_head_rcu(&p_event->hlist_entry, list);
+	}
 
-	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+	return 0;
 }
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	hlist_del_rcu(&p_event->hlist_entry);
-	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+	/*
+	 * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+	 * and we need to take the default action of dequeueing our event from
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+		hlist_del_rcu(&p_event->hlist_entry);
 }
 
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
 	struct ftrace_entry *entry;
-	struct hlist_head *head;
+	struct perf_event *event;
+	struct hlist_head head;
 	struct pt_regs regs;
 	int rctx;
 
-	head = this_cpu_ptr(event_function.perf_events);
-	if (hlist_empty(head))
+	if ((unsigned long)ops->private != smp_processor_id())
 		return;
 
+	event = container_of(ops, struct perf_event, ftrace_ops);
+
+	/*
+	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+	 * the perf code does is hlist_for_each_entry_rcu(), so we can
+	 * get away with simply setting the @head.first pointer in order
+	 * to create a singular list.
+	 */
+	head.first = &event->hlist_entry;
+
 #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
 		    sizeof(u64)) - sizeof(u32))
 
@@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL);
+			      1, &regs, &head, NULL);
 
 #undef ENTRY_SIZE
 }
@@ -339,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
-	ops->func = perf_ftrace_function_call;
+	ops->flags   |= FTRACE_OPS_FL_RCU;
+	ops->func    = perf_ftrace_function_call;
+	ops->private = (void *)(unsigned long)nr_cpu_ids;
+
 	return register_ftrace_function(ops);
 }
 
@@ -352,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event)
 	return ret;
 }
 
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
-	ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
-	ftrace_function_local_disable(&event->ftrace_ops);
-}
-
 int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data)
 {
+	struct perf_event *event = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
 	case TRACE_REG_UNREGISTER:
@@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call,
 	case TRACE_REG_PERF_CLOSE:
 		return perf_ftrace_function_unregister(data);
 	case TRACE_REG_PERF_ADD:
-		perf_ftrace_function_enable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+		return 1;
 	case TRACE_REG_PERF_DEL:
-		perf_ftrace_function_disable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+		return 1;
 	}
 
 	return -EINVAL;
-- 
cgit v1.2.3


From 1dd311e6dcda4020c603bcf9f390a577d439d509 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Oct 2017 09:45:31 +0200
Subject: perf/ftrace: Small cleanup

ops->flags _should_ be 0 at this point, so setting the flag using
bitwise or is a bit daft.

Link: http://lkml.kernel.org/r/20171011080224.315585202@infradead.org

Requested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_event_perf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e73f9ab15939..55d6dff37daf 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -363,7 +363,7 @@ static int perf_ftrace_function_register(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags   |= FTRACE_OPS_FL_RCU;
+	ops->flags   = FTRACE_OPS_FL_RCU;
 	ops->func    = perf_ftrace_function_call;
 	ops->private = (void *)(unsigned long)nr_cpu_ids;
 
-- 
cgit v1.2.3


From b3a88803ac5b4bda26017b485c8722a8487fefb7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Oct 2017 09:45:32 +0200
Subject: ftrace: Kill FTRACE_OPS_FL_PER_CPU

The one and only user of FTRACE_OPS_FL_PER_CPU is gone, remove the
lot.

Link: http://lkml.kernel.org/r/20171011080224.372422809@infradead.org

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 83 +++++++++-----------------------------------------
 kernel/trace/ftrace.c  | 55 ++++-----------------------------
 2 files changed, 20 insertions(+), 118 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1f8545caa691..252e334e7b5f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -102,10 +102,6 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
  * ENABLED - set/unset when ftrace_ops is registered/unregistered
  * DYNAMIC - set when ftrace_ops is registered to denote dynamically
  *           allocated ftrace_ops which need special care
- * PER_CPU - set manualy by ftrace_ops user to denote the ftrace_ops
- *           could be controlled by following calls:
- *             ftrace_function_local_enable
- *             ftrace_function_local_disable
  * SAVE_REGS - The ftrace_ops wants regs saved at each function called
  *            and passed to the callback. If this flag is set, but the
  *            architecture does not support passing regs
@@ -149,21 +145,20 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
 	FTRACE_OPS_FL_DYNAMIC			= 1 << 1,
-	FTRACE_OPS_FL_PER_CPU			= 1 << 2,
-	FTRACE_OPS_FL_SAVE_REGS			= 1 << 3,
-	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 4,
-	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 5,
-	FTRACE_OPS_FL_STUB			= 1 << 6,
-	FTRACE_OPS_FL_INITIALIZED		= 1 << 7,
-	FTRACE_OPS_FL_DELETED			= 1 << 8,
-	FTRACE_OPS_FL_ADDING			= 1 << 9,
-	FTRACE_OPS_FL_REMOVING			= 1 << 10,
-	FTRACE_OPS_FL_MODIFYING			= 1 << 11,
-	FTRACE_OPS_FL_ALLOC_TRAMP		= 1 << 12,
-	FTRACE_OPS_FL_IPMODIFY			= 1 << 13,
-	FTRACE_OPS_FL_PID			= 1 << 14,
-	FTRACE_OPS_FL_RCU			= 1 << 15,
-	FTRACE_OPS_FL_TRACE_ARRAY		= 1 << 16,
+	FTRACE_OPS_FL_SAVE_REGS			= 1 << 2,
+	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 3,
+	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 4,
+	FTRACE_OPS_FL_STUB			= 1 << 5,
+	FTRACE_OPS_FL_INITIALIZED		= 1 << 6,
+	FTRACE_OPS_FL_DELETED			= 1 << 7,
+	FTRACE_OPS_FL_ADDING			= 1 << 8,
+	FTRACE_OPS_FL_REMOVING			= 1 << 9,
+	FTRACE_OPS_FL_MODIFYING			= 1 << 10,
+	FTRACE_OPS_FL_ALLOC_TRAMP		= 1 << 11,
+	FTRACE_OPS_FL_IPMODIFY			= 1 << 12,
+	FTRACE_OPS_FL_PID			= 1 << 13,
+	FTRACE_OPS_FL_RCU			= 1 << 14,
+	FTRACE_OPS_FL_TRACE_ARRAY		= 1 << 15,
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -198,7 +193,6 @@ struct ftrace_ops {
 	unsigned long			flags;
 	void				*private;
 	ftrace_func_t			saved_func;
-	int __percpu			*disabled;
 #ifdef CONFIG_DYNAMIC_FTRACE
 	struct ftrace_ops_hash		local_hash;
 	struct ftrace_ops_hash		*func_hash;
@@ -230,55 +224,6 @@ int register_ftrace_function(struct ftrace_ops *ops);
 int unregister_ftrace_function(struct ftrace_ops *ops);
 void clear_ftrace_function(void);
 
-/**
- * ftrace_function_local_enable - enable ftrace_ops on current cpu
- *
- * This function enables tracing on current cpu by decreasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_enable(struct ftrace_ops *ops)
-{
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return;
-
-	(*this_cpu_ptr(ops->disabled))--;
-}
-
-/**
- * ftrace_function_local_disable - disable ftrace_ops on current cpu
- *
- * This function disables tracing on current cpu by increasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_disable(struct ftrace_ops *ops)
-{
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return;
-
-	(*this_cpu_ptr(ops->disabled))++;
-}
-
-/**
- * ftrace_function_local_disabled - returns ftrace_ops disabled value
- *                                  on current cpu
- *
- * This function returns value of ftrace_ops::disabled on current cpu.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
-{
-	WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU));
-	return *this_cpu_ptr(ops->disabled);
-}
-
 extern void ftrace_stub(unsigned long a0, unsigned long a1,
 			struct ftrace_ops *op, struct pt_regs *regs);
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e0a98225666b..2fd3edaec6de 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
 	ftrace_trace_function = ftrace_stub;
 }
 
-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
-	int __percpu *disabled;
-
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return -EINVAL;
-
-	disabled = alloc_percpu(int);
-	if (!disabled)
-		return -ENOMEM;
-
-	ops->disabled = disabled;
-	per_cpu_ops_disable_all(ops);
-	return 0;
-}
-
 static void ftrace_sync(struct work_struct *work)
 {
 	/*
@@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
 	 * If this is a dynamic, RCU, or per CPU ops, or we force list func,
 	 * then it needs to call the list anyway.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
-			  FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
+	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
+	    FTRACE_FORCE_LIST_FUNC)
 		return ftrace_ops_list_func;
 
 	return ftrace_ops_get_func(ops);
@@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	if (!core_kernel_data((unsigned long)ops))
 		ops->flags |= FTRACE_OPS_FL_DYNAMIC;
 
-	if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
-		if (per_cpu_ops_alloc(ops))
-			return -ENOMEM;
-	}
-
 	add_ftrace_ops(&ftrace_ops_list, ops);
 
 	/* Always save the function, and reset at unregistering */
@@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 }
 
-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
-	free_percpu(ops->disabled);
-}
-
 static void ftrace_startup_enable(int command)
 {
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 		 * not currently active, we can just free them
 		 * without synchronizing all CPUs.
 		 */
-		if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+		if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
 			goto free_ops;
 
 		return 0;
@@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 	 * The same goes for freeing the per_cpu data of the per_cpu
 	 * ops.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+	if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
 		/*
 		 * We need to do a hard force of sched synchronization.
 		 * This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 
  free_ops:
 		arch_ftrace_trampoline_free(ops);
-
-		if (ops->flags & FTRACE_OPS_FL_PER_CPU)
-			per_cpu_ops_free(ops);
 	}
 
 	return 0;
@@ -6355,10 +6318,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
 		 * If any of the above fails then the op->func() is not executed.
 		 */
 		if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
-		    (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-		     !ftrace_function_local_disabled(op)) &&
 		    ftrace_ops_test(op, ip, regs)) {
-		    
 			if (FTRACE_WARN_ON(!op->func)) {
 				pr_warn("op=%p %pS\n", op, op);
 				goto out;
@@ -6416,10 +6376,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
 
 	preempt_disable_notrace();
 
-	if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-	    !ftrace_function_local_disabled(op)) {
-		op->func(ip, parent_ip, op, regs);
-	}
+	op->func(ip, parent_ip, op, regs);
 
 	preempt_enable_notrace();
 	trace_clear_recursion(bit);
@@ -6443,7 +6400,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 	 * or does per cpu logic, then we need to call the assist handler.
 	 */
 	if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
-	    ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+	    ops->flags & FTRACE_OPS_FL_RCU)
 		return ftrace_ops_assist_func;
 
 	return ops->func;
-- 
cgit v1.2.3


From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:28 +0200
Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP

The 'cpumap' is primarily used as a backend map for XDP BPF helper
call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.

This patch implement the main part of the map.  It is not connected to
the XDP redirect system yet, and no SKB allocation are done yet.

The main concern in this patch is to ensure the datapath can run
without any locking.  This adds complexity to the setup and tear-down
procedure, which assumptions are extra carefully documented in the
code comments.

V2:
 - make sure array isn't larger than NR_CPUS
 - make sure CPUs added is a valid possible CPU

V3: fix nitpicks from Jakub Kicinski <kubakici@wp.pl>

V5:
 - Restrict map allocation to root / CAP_SYS_ADMIN
 - WARN_ON_ONCE if queue is not empty on tear-down
 - Return -EPERM on memlock limit instead of -ENOMEM
 - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup()
 - Moved cpu_map_enqueue() to next patch

V6: all notice by Daniel Borkmann
 - Fix err return code in cpu_map_alloc() introduced in V5
 - Move cpu_possible() check after max_entries boundary check
 - Forbid usage initially in check_map_func_compatibility()

V7:
 - Fix alloc error path spotted by Daniel Borkmann
 - Did stress test adding+removing CPUs from the map concurrently
 - Fixed refcnt issue on cpu_map_entry, kthread started too soon
 - Make sure packets are flushed during tear-down, involved use of
   rcu_barrier() and kthread_run only exit after queue is empty
 - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring

V8:
 - Nitpicking comments and gramma by Edward Cree
 - Fix missing semi-colon introduced in V7 due to rebasing
 - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_types.h      |   1 +
 include/uapi/linux/bpf.h       |   1 +
 kernel/bpf/Makefile            |   1 +
 kernel/bpf/cpumap.c            | 560 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |   8 +-
 kernel/bpf/verifier.c          |   5 +
 tools/include/uapi/linux/bpf.h |   1 +
 7 files changed, 576 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/cpumap.c

(limited to 'kernel')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 6f1a567667b8..814c1081a4a9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #ifdef CONFIG_STREAM_PARSER
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6db9e1d679cd..4303fb6c3817 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 53fb09f92e3f..e597daae6120 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..e1e25ddba038
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,560 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU.  The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism, that allow
+ * separating the early driver network XDP layer, from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage.  This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+
+/* General idea: XDP packets getting XDP redirected to another CPU,
+ * will maximum be stored/queued for one driver ->poll() call.  It is
+ * guaranteed that setting flush bit and flush operation happen on
+ * same CPU.  Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+	void *q[CPU_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+	u32 qsize;  /* Queue size placeholder for map lookup */
+
+	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+	struct xdp_bulk_queue __percpu *bulkq;
+
+	/* Queue with potential multi-producers, and single-consumer kthread */
+	struct ptr_ring *queue;
+	struct task_struct *kthread;
+	struct work_struct kthread_stop_wq;
+
+	atomic_t refcnt; /* Control when this struct can be free'ed */
+	struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+	struct bpf_map map;
+	/* Below members specific for map type */
+	struct bpf_cpu_map_entry **cpu_map;
+	unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_cpu_map *cmap;
+	int err = -ENOMEM;
+	u64 cost;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+		return ERR_PTR(-EINVAL);
+
+	cmap = kzalloc(sizeof(*cmap), GFP_USER);
+	if (!cmap)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	cmap->map.map_type = attr->map_type;
+	cmap->map.key_size = attr->key_size;
+	cmap->map.value_size = attr->value_size;
+	cmap->map.max_entries = attr->max_entries;
+	cmap->map.map_flags = attr->map_flags;
+	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+	/* Pre-limit array size based on NR_CPUS, not final CPU check */
+	if (cmap->map.max_entries > NR_CPUS) {
+		err = -E2BIG;
+		goto free_cmap;
+	}
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_cmap;
+	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice returns -EPERM on if map size is larger than memlock limit */
+	ret = bpf_map_precharge_memlock(cmap->map.pages);
+	if (ret) {
+		err = ret;
+		goto free_cmap;
+	}
+
+	/* A per cpu bitfield with a bit per possible CPU in map  */
+	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+					    __alignof__(unsigned long));
+	if (!cmap->flush_needed)
+		goto free_cmap;
+
+	/* Alloc array for possible remote "destination" CPUs */
+	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+					   sizeof(struct bpf_cpu_map_entry *),
+					   cmap->map.numa_node);
+	if (!cmap->cpu_map)
+		goto free_percpu;
+
+	return &cmap->map;
+free_percpu:
+	free_percpu(cmap->flush_needed);
+free_cmap:
+	kfree(cmap);
+	return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	if (WARN_ON_ONCE(ptr))
+		page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+	struct bpf_cpu_map_entry *rcpu;
+
+	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+	 * as it waits until all in-flight call_rcu() callbacks complete.
+	 */
+	rcu_barrier();
+
+	/* kthread_stop will wake_up_process and wait for it to complete */
+	kthread_stop(rcpu->kthread);
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+	struct bpf_cpu_map_entry *rcpu = data;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/* When kthread gives stop order, then rcpu have been disconnected
+	 * from map, thus no new packets can enter. Remaining in-flight
+	 * per CPU stored packets are flushed to this queue.  Wait honoring
+	 * kthread_stop signal until queue is empty.
+	 */
+	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		struct xdp_pkt *xdp_pkt;
+
+		schedule();
+		/* Do work */
+		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
+			/* For now just "refcnt-free" */
+			page_frag_free(xdp_pkt);
+		}
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	put_cpu_map_entry(rcpu);
+	return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+	struct bpf_cpu_map_entry *rcpu;
+	int numa, err;
+
+	/* Have map->numa_node, but choose node of redirect target CPU */
+	numa = cpu_to_node(cpu);
+
+	rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+	if (!rcpu)
+		return NULL;
+
+	/* Alloc percpu bulkq */
+	rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+					 sizeof(void *), gfp);
+	if (!rcpu->bulkq)
+		goto free_rcu;
+
+	/* Alloc queue */
+	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+	if (!rcpu->queue)
+		goto free_bulkq;
+
+	err = ptr_ring_init(rcpu->queue, qsize, gfp);
+	if (err)
+		goto free_queue;
+
+	rcpu->qsize = qsize;
+
+	/* Setup kthread */
+	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+					       "cpumap/%d/map:%d", cpu, map_id);
+	if (IS_ERR(rcpu->kthread))
+		goto free_ptr_ring;
+
+	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+	/* Make sure kthread runs on a single CPU */
+	kthread_bind(rcpu->kthread, cpu);
+	wake_up_process(rcpu->kthread);
+
+	return rcpu;
+
+free_ptr_ring:
+	ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+	kfree(rcpu->queue);
+free_bulkq:
+	free_percpu(rcpu->bulkq);
+free_rcu:
+	kfree(rcpu);
+	return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+	struct bpf_cpu_map_entry *rcpu;
+	int cpu;
+
+	/* This cpu_map_entry have been disconnected from map and one
+	 * RCU graze-period have elapsed.  Thus, XDP cannot queue any
+	 * new packets and cannot change/set flush_needed that can
+	 * find this entry.
+	 */
+	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+	/* Flush remaining packets in percpu bulkq */
+	for_each_online_cpu(cpu) {
+		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+		/* No concurrent bq_enqueue can run at this point */
+		bq_flush_to_queue(rcpu, bq);
+	}
+	free_percpu(rcpu->bulkq);
+	/* Cannot kthread_stop() here, last put free rcpu resources */
+	put_cpu_map_entry(rcpu);
+}
+
+/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet. Because driver side
+ * rcu_read_lock/unlock only protects the running XDP program.  The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq).  A refcnt
+ * makes sure to last user (kthread_stop vs. call_rcu) free memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flush remaining packets in
+ * percpu bulkq to queue.  Due to caller map_delete_elem() disable
+ * preemption, cannot call kthread_stop() to make sure queue is empty.
+ * Instead a work_queue is started for stopping kthread,
+ * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * stopping kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+	struct bpf_cpu_map_entry *old_rcpu;
+
+	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+	if (old_rcpu) {
+		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+		schedule_work(&old_rcpu->kthread_stop_wq);
+	}
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 key_cpu = *(u32 *)key;
+
+	if (key_cpu >= map->max_entries)
+		return -EINVAL;
+
+	/* notice caller map_delete_elem() use preempt_disable() */
+	__cpu_map_entry_replace(cmap, key_cpu, NULL);
+	return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	/* Array index key correspond to CPU number */
+	u32 key_cpu = *(u32 *)key;
+	/* Value is the queue size */
+	u32 qsize = *(u32 *)value;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(key_cpu >= cmap->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+	if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+		return -EOVERFLOW;
+
+	/* Make sure CPU is a valid possible cpu */
+	if (!cpu_possible(key_cpu))
+		return -ENODEV;
+
+	if (qsize == 0) {
+		rcpu = NULL; /* Same as deleting */
+	} else {
+		/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+		if (!rcpu)
+			return -ENOMEM;
+	}
+	rcu_read_lock();
+	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
+	rcu_read_unlock();
+	return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	int cpu;
+	u32 i;
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the bpf programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+	 * It does __not__ ensure pending flush operations (if any) are
+	 * complete.
+	 */
+	synchronize_rcu();
+
+	/* To ensure all pending flush operations have completed wait for flush
+	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+	 * Because the above synchronize_rcu() ensures the map is disconnected
+	 * from the program we can assume no new bits will be set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, cmap->map.max_entries))
+			cond_resched();
+	}
+
+	/* For cpu_map the remote CPUs can still be using the entries
+	 * (struct bpf_cpu_map_entry).
+	 */
+	for (i = 0; i < cmap->map.max_entries; i++) {
+		struct bpf_cpu_map_entry *rcpu;
+
+		rcpu = READ_ONCE(cmap->cpu_map[i]);
+		if (!rcpu)
+			continue;
+
+		/* bq flush and cleanup happens after RCU graze-period */
+		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+	}
+	free_percpu(cmap->flush_needed);
+	bpf_map_area_free(cmap->cpu_map);
+	kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	rcpu = READ_ONCE(cmap->cpu_map[key]);
+	return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map_entry *rcpu =
+		__cpu_map_lookup_elem(map, *(u32 *)key);
+
+	return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= cmap->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == cmap->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+	.map_alloc		= cpu_map_alloc,
+	.map_free		= cpu_map_free,
+	.map_delete_elem	= cpu_map_delete_elem,
+	.map_update_elem	= cpu_map_update_elem,
+	.map_lookup_elem	= cpu_map_lookup_elem,
+	.map_get_next_key	= cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq)
+{
+	struct ptr_ring *q;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	q = rcpu->queue;
+	spin_lock(&q->producer_lock);
+
+	for (i = 0; i < bq->count; i++) {
+		void *xdp_pkt = bq->q[i];
+		int err;
+
+		err = __ptr_ring_produce(q, xdp_pkt);
+		if (err) {
+			/* Free xdp_pkt */
+			page_frag_free(xdp_pkt);
+		}
+	}
+	bq->count = 0;
+	spin_unlock(&q->producer_lock);
+
+	return 0;
+}
+
+/* Notice: Will change in later patch */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+};
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+		bq_flush_to_queue(rcpu, bq);
+
+	/* Notice, xdp_buff/page MUST be queued here, long enough for
+	 * driver to code invoking us to finished, due to driver
+	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
+	 *
+	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * with another CPU on page-refcnt and remaining driver code).
+	 * Queue time is very short, as driver will invoke flush
+	 * operation, when completing napi->poll call.
+	 */
+	bq->q[bq->count++] = xdp_pkt;
+	return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+	__set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+	u32 bit;
+
+	/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+	 * and __cpu_map_flush() happen on same CPU. Thus, the percpu
+	 * bitmap indicate which percpu bulkq have packets.
+	 */
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+		struct xdp_bulk_queue *bq;
+
+		/* This is possible if entry is removed by user space
+		 * between xdp redirect and flush op.
+		 */
+		if (unlikely(!rcpu))
+			continue;
+
+		__clear_bit(bit, bitmap);
+
+		/* Flush all frames in bulkq to real queue */
+		bq = this_cpu_ptr(rcpu->bulkq);
+		bq_flush_to_queue(rcpu, bq);
+
+		/* If already running, costs spin_lock_irqsave + smb_mb */
+		wake_up_process(rcpu->kthread);
+	}
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d124e702e040..54fba06942f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
+	/* Need to create a kthread, thus must support schedule */
+	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		err = map->ops->map_update_elem(map, key, value, attr->flags);
+		goto out;
+	}
+
 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
 	 * inside bpf map update or delete otherwise deadlocks are possible
 	 */
@@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr)
 	}
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9755279d94cb..cefa64be9a2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	/* Restrict bpf side of cpumap, open when use-cases appear */
+	case BPF_MAP_TYPE_CPUMAP:
+		if (func_id != BPF_FUNC_redirect_map)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fb4fb81ce5b0..fa93033dc521 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };
 
 enum bpf_prog_type {
-- 
cgit v1.2.3


From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:34 +0200
Subject: bpf: XDP_REDIRECT enable use of cpumap

This patch connects cpumap to the xdp_do_redirect_map infrastructure.

Still no SKB allocation are done yet.  The XDP frames are transferred
to the other CPU, but they are simply refcnt decremented on the remote
CPU.  This served as a good benchmark for measuring the overhead of
remote refcnt decrement.  If driver page recycle cache is not
efficient then this, exposes a bottleneck in the page allocator.

A shout-out to MST's ptr_ring, which is the secret behind is being so
efficient to transfer memory pointers between CPUs, without constantly
bouncing cache-lines between CPUs.

V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot.

V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet,
 as implementation require a separate upstream discussion.

V5:
 - Fix a maybe-uninitialized pointed out by kbuild test robot.
 - Restrict bpf-prog side access to cpumap, open when use-cases appear
 - Implement cpu_map_enqueue() as a more simple void pointer enqueue

V6:
 - Allow cpumap type for usage in helper bpf_redirect_map,
   general bpf-prog side restriction moved to earlier patch.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h        |  31 +++++++++-
 include/trace/events/xdp.h |  10 +++-
 kernel/bpf/cpumap.c        |  22 ++++++-
 kernel/bpf/verifier.c      |   3 +-
 net/core/filter.c          | 140 +++++++++++++++++++++++++++++++++++----------
 5 files changed, 172 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4373125de1f3..6d4dd844828a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -355,6 +355,13 @@ struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
+void __cpu_map_flush(struct bpf_map *map);
+struct xdp_buff;
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
+
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 {
@@ -362,7 +369,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 		attr->numa_node : NUMA_NO_NODE;
 }
 
-#else
+#else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -425,6 +432,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
+
+static inline
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	return NULL;
+}
+
+static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index)
+{
+}
+
+static inline void __cpu_map_flush(struct bpf_map *map)
+{
+}
+
+struct xdp_buff;
+static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_buff *xdp,
+				  struct net_device *dev_rx)
+{
+	return 0;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4e16c43fba10..eb2ece96c1a2 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -136,12 +136,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 		  __entry->map_id, __entry->map_index)
 );
 
+#define devmap_ifindex(fwd, map)				\
+	(!fwd ? 0 :						\
+	 (!map ? 0 :						\
+	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	   ((struct net_device *)fwd)->ifindex : 0)))
+
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
-	 trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0,	\
+	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
 				0, map, idx)
 
 #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err)	\
-	 trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0,	\
+	 trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map),	\
 				    err, map, idx)
 
 #endif /* _TRACE_XDP_H */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e1e25ddba038..768da6a2c265 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -500,7 +500,7 @@ struct xdp_pkt {
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
@@ -520,6 +520,26 @@ int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	return 0;
 }
 
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
+{
+	struct xdp_pkt *xdp_pkt;
+	int headroom;
+
+	/* For now this is just used as a void pointer to data_hard_start.
+	 * Followup patch will generalize this.
+	 */
+	xdp_pkt = xdp->data_hard_start;
+
+	/* Fake writing into xdp_pkt->data to measure overhead */
+	headroom = xdp->data - xdp->data_hard_start;
+	if (headroom < sizeof(*xdp_pkt))
+		xdp_pkt->data = xdp->data;
+
+	bq_enqueue(rcpu, xdp_pkt);
+	return 0;
+}
+
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cefa64be9a2f..e4d5136725a2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1486,7 +1486,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_redirect_map:
-		if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+		    map->map_type != BPF_MAP_TYPE_CPUMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
diff --git a/net/core/filter.c b/net/core/filter.c
index 140fa9f9c0f4..4d88e0665c41 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2526,10 +2526,36 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
 	if (err)
 		return err;
-	if (map)
+	dev->netdev_ops->ndo_xdp_flush(dev);
+	return 0;
+}
+
+static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
+			    struct bpf_map *map,
+			    struct xdp_buff *xdp,
+			    u32 index)
+{
+	int err;
+
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		struct net_device *dev = fwd;
+
+		if (!dev->netdev_ops->ndo_xdp_xmit)
+			return -EOPNOTSUPP;
+
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+		if (err)
+			return err;
 		__dev_map_insert_ctx(map, index);
-	else
-		dev->netdev_ops->ndo_xdp_flush(dev);
+
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		struct bpf_cpu_map_entry *rcpu = fwd;
+
+		err = cpu_map_enqueue(rcpu, xdp, dev_rx);
+		if (err)
+			return err;
+		__cpu_map_insert_ctx(map, index);
+	}
 	return 0;
 }
 
@@ -2539,11 +2565,33 @@ void xdp_do_flush_map(void)
 	struct bpf_map *map = ri->map_to_flush;
 
 	ri->map_to_flush = NULL;
-	if (map)
-		__dev_map_flush(map);
+	if (map) {
+		switch (map->map_type) {
+		case BPF_MAP_TYPE_DEVMAP:
+			__dev_map_flush(map);
+			break;
+		case BPF_MAP_TYPE_CPUMAP:
+			__cpu_map_flush(map);
+			break;
+		default:
+			break;
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush_map);
 
+static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
+{
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_DEVMAP:
+		return __dev_map_lookup_elem(map, index);
+	case BPF_MAP_TYPE_CPUMAP:
+		return __cpu_map_lookup_elem(map, index);
+	default:
+		return NULL;
+	}
+}
+
 static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
 				   unsigned long aux)
 {
@@ -2556,8 +2604,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
-	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
+	void *fwd = NULL;
 	int err;
 
 	ri->ifindex = 0;
@@ -2570,7 +2618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 		goto err;
 	}
 
-	fwd = __dev_map_lookup_elem(map, index);
+	fwd = __xdp_map_lookup_elem(map, index);
 	if (!fwd) {
 		err = -EINVAL;
 		goto err;
@@ -2578,7 +2626,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 	if (ri->map_to_flush && ri->map_to_flush != map)
 		xdp_do_flush_map();
 
-	err = __bpf_tx_xdp(fwd, map, xdp, index);
+	err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
 	if (unlikely(err))
 		goto err;
 
@@ -2620,54 +2668,88 @@ err:
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
 
-int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
-			    struct bpf_prog *xdp_prog)
+static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
+{
+	unsigned int len;
+
+	if (unlikely(!(fwd->flags & IFF_UP)))
+		return -ENETDOWN;
+
+	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
+	if (skb->len > len)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb,
+				struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
 	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
-	unsigned int len;
 	int err = 0;
 
 	ri->ifindex = 0;
 	ri->map = NULL;
 	ri->map_owner = 0;
 
-	if (map) {
-		if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
-			err = -EFAULT;
-			map = NULL;
-			goto err;
-		}
-		fwd = __dev_map_lookup_elem(map, index);
-	} else {
-		fwd = dev_get_by_index_rcu(dev_net(dev), index);
+	if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
+		err = -EFAULT;
+		map = NULL;
+		goto err;
 	}
+	fwd = __xdp_map_lookup_elem(map, index);
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
 		goto err;
 	}
 
-	if (unlikely(!(fwd->flags & IFF_UP))) {
-		err = -ENETDOWN;
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
+			goto err;
+		skb->dev = fwd;
+	} else {
+		/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
+		err = -EBADRQC;
 		goto err;
 	}
 
-	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
-	if (skb->len > len) {
-		err = -EMSGSIZE;
+	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+	return 0;
+err:
+	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+	return err;
+}
+
+int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
+			    struct bpf_prog *xdp_prog)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	u32 index = ri->ifindex;
+	struct net_device *fwd;
+	int err = 0;
+
+	if (ri->map)
+		return xdp_do_generic_redirect_map(dev, skb, xdp_prog);
+
+	ri->ifindex = 0;
+	fwd = dev_get_by_index_rcu(dev_net(dev), index);
+	if (unlikely(!fwd)) {
+		err = -EINVAL;
 		goto err;
 	}
 
+	if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
+		goto err;
+
 	skb->dev = fwd;
-	map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index)
-		: _trace_xdp_redirect(dev, xdp_prog, index);
+	_trace_xdp_redirect(dev, xdp_prog, index);
 	return 0;
 err:
-	map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err)
-		: _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
 	return err;
 }
 EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
-- 
cgit v1.2.3


From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:39 +0200
Subject: bpf: cpumap xdp_buff to skb conversion and allocation

This patch makes cpumap functional, by adding SKB allocation and
invoking the network stack on the dequeuing CPU.

For constructing the SKB on the remote CPU, the xdp_buff in converted
into a struct xdp_pkt, and it mapped into the top headroom of the
packet, to avoid allocating separate mem.  For now, struct xdp_pkt is
just a cpumap internal data structure, with info carried between
enqueue to dequeue.

If a driver doesn't have enough headroom it is simply dropped, with
return code -EOVERFLOW.  This will be picked up the xdp tracepoint
infrastructure, to allow users to catch this.

V2: take into account xdp->data_meta

V4:
 - Drop busypoll tricks, keeping it more simple.
 - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei

V5: correct RCU read protection around __netif_receive_skb_core.

V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   1 +
 kernel/bpf/cpumap.c       | 152 +++++++++++++++++++++++++++++++++++++++-------
 net/core/dev.c            |  27 ++++++++
 3 files changed, 158 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31bb3010c69b..bf014afcb914 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
+int netif_receive_skb_core(struct sk_buff *skb);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 768da6a2c265..ee7adf4352dd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -25,6 +25,9 @@
 #include <linux/kthread.h>
 #include <linux/capability.h>
 
+#include <linux/netdevice.h>   /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
 /* General idea: XDP packets getting XDP redirected to another CPU,
  * will maximum be stored/queued for one driver ->poll() call.  It is
  * guaranteed that setting flush bit and flush operation happen on
@@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }
 
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+	struct xdp_pkt *xdp_pkt;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if ((headroom - metasize) < sizeof(*xdp_pkt))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_pkt = xdp->data_hard_start;
+
+	xdp_pkt->data = xdp->data;
+	xdp_pkt->len  = xdp->data_end - xdp->data;
+	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+	xdp_pkt->metasize = metasize;
+
+	return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_pkt *xdp_pkt)
+{
+	unsigned int frame_size;
+	void *pkt_data_start;
+	struct sk_buff *skb;
+
+	/* build_skb need to place skb_shared_info after SKB end, and
+	 * also want to know the memory "truesize".  Thus, need to
+	 * know the memory frame size backing xdp_buff.
+	 *
+	 * XDP was designed to have PAGE_SIZE frames, but this
+	 * assumption is not longer true with ixgbe and i40e.  It
+	 * would be preferred to set frame_size to 2048 or 4096
+	 * depending on the driver.
+	 *   frame_size = 2048;
+	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *
+	 * Instead, with info avail, skb_shared_info in placed after
+	 * packet len.  This, unfortunately fakes the truesize.
+	 * Another disadvantage of this approach, the skb_shared_info
+	 * is not at a fixed memory location, with mixed length
+	 * packets, which is bad for cache-line hotness.
+	 */
+	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	skb = build_skb(pkt_data_start, frame_size);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, xdp_pkt->headroom);
+	__skb_put(skb, xdp_pkt->len);
+	if (xdp_pkt->metasize)
+		skb_metadata_set(skb, xdp_pkt->metasize);
+
+	/* Essential SKB info: protocol and skb->dev */
+	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+	/* Optional SKB info, currently missing:
+	 * - HW checksum info		(skb->ip_summed)
+	 * - HW RX hash			(skb_set_hash)
+	 * - RX ring dev queue index	(skb_record_rx_queue)
+	 */
+
+	return skb;
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		unsigned int processed = 0, drops = 0;
 		struct xdp_pkt *xdp_pkt;
 
-		schedule();
-		/* Do work */
-		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
-			/* For now just "refcnt-free" */
-			page_frag_free(xdp_pkt);
+		/* Release CPU reschedule checks */
+		if (__ptr_ring_empty(rcpu->queue)) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+		} else {
+			cond_resched();
+		}
+		__set_current_state(TASK_RUNNING);
+
+		/* Process packets in rcpu->queue */
+		local_bh_disable();
+		/*
+		 * The bpf_cpu_map_entry is single consumer, with this
+		 * kthread CPU pinned. Lockless access to ptr_ring
+		 * consume side valid as no-resize allowed of queue.
+		 */
+		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+			struct sk_buff *skb;
+			int ret;
+
+			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			if (!skb) {
+				page_frag_free(xdp_pkt);
+				continue;
+			}
+
+			/* Inject into network stack */
+			ret = netif_receive_skb_core(skb);
+			if (ret == NET_RX_DROP)
+				drops++;
+
+			/* Limit BH-disable period */
+			if (++processed == 8)
+				break;
 		}
-		__set_current_state(TASK_INTERRUPTIBLE);
+		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	return 0;
 }
 
-/* Notice: Will change in later patch */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-};
-
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
@@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
 	struct xdp_pkt *xdp_pkt;
-	int headroom;
 
-	/* For now this is just used as a void pointer to data_hard_start.
-	 * Followup patch will generalize this.
-	 */
-	xdp_pkt = xdp->data_hard_start;
+	xdp_pkt = convert_to_xdp_pkt(xdp);
+	if (!xdp_pkt)
+		return -EOVERFLOW;
 
-	/* Fake writing into xdp_pkt->data to measure overhead */
-	headroom = xdp->data - xdp->data_hard_start;
-	if (headroom < sizeof(*xdp_pkt))
-		xdp_pkt->data = xdp->data;
+	/* Info needed when constructing SKB on remote CPU */
+	xdp_pkt->dev_rx = dev_rx;
 
 	bq_enqueue(rcpu, xdp_pkt);
 	return 0;
diff --git a/net/core/dev.c b/net/core/dev.c
index d2b20e73080e..cf5894f0e6eb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4492,6 +4492,33 @@ out:
 	return ret;
 }
 
+/**
+ *	netif_receive_skb_core - special purpose version of netif_receive_skb
+ *	@skb: buffer to process
+ *
+ *	More direct receive version of netif_receive_skb().  It should
+ *	only be used by callers that have a need to skip RPS and Generic XDP.
+ *	Caller must also take care of handling if (page_is_)pfmemalloc.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ *
+ *	Return values (usually ignored):
+ *	NET_RX_SUCCESS: no congestion
+ *	NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb_core(struct sk_buff *skb)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = __netif_receive_skb_core(skb, false);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(netif_receive_skb_core);
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	int ret;
-- 
cgit v1.2.3


From f9419f7bd7a5318b636a941a0214c5cdfa6f6530 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:44 +0200
Subject: bpf: cpumap add tracepoints

This adds two tracepoint to the cpumap.  One for the enqueue side
trace_xdp_cpumap_enqueue() and one for the kthread dequeue side
trace_xdp_cpumap_kthread().

To mitigate the tracepoint overhead, these are invoked during the
enqueue/dequeue bulking phases, thus amortizing the cost.

The obvious use-cases are for debugging and monitoring.  The
non-intuitive use-case is using these as a feedback loop to know the
system load.  One can imagine auto-scaling by reducing, adding or
activating more worker CPUs on demand.

V4: tracepoint remove time_limit info, instead add sched info

V8: intro struct bpf_cpu_map_entry members cpu+map_id in this patch

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/xdp.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/cpumap.c        | 24 ++++++++++++----
 2 files changed, 89 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index eb2ece96c1a2..0c8dec61987e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -150,6 +150,76 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 	 trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map),	\
 				    err, map, idx)
 
+TRACE_EVENT(xdp_cpumap_kthread,
+
+	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
+		 int sched),
+
+	TP_ARGS(map_id, processed, drops, sched),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(int, cpu)
+		__field(unsigned int, drops)
+		__field(unsigned int, processed)
+		__field(int, sched)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map_id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->cpu		= smp_processor_id();
+		__entry->drops		= drops;
+		__entry->processed	= processed;
+		__entry->sched	= sched;
+	),
+
+	TP_printk("kthread"
+		  " cpu=%d map_id=%d action=%s"
+		  " processed=%u drops=%u"
+		  " sched=%d",
+		  __entry->cpu, __entry->map_id,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->processed, __entry->drops,
+		  __entry->sched)
+);
+
+TRACE_EVENT(xdp_cpumap_enqueue,
+
+	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
+		 int to_cpu),
+
+	TP_ARGS(map_id, processed, drops, to_cpu),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(int, cpu)
+		__field(unsigned int, drops)
+		__field(unsigned int, processed)
+		__field(int, to_cpu)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map_id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->cpu		= smp_processor_id();
+		__entry->drops		= drops;
+		__entry->processed	= processed;
+		__entry->to_cpu		= to_cpu;
+	),
+
+	TP_printk("enqueue"
+		  " cpu=%d map_id=%d action=%s"
+		  " processed=%u drops=%u"
+		  " to_cpu=%d",
+		  __entry->cpu, __entry->map_id,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->processed, __entry->drops,
+		  __entry->to_cpu)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ee7adf4352dd..b4358d84ddf1 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -24,6 +24,7 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/capability.h>
+#include <trace/events/xdp.h>
 
 #include <linux/netdevice.h>   /* netif_receive_skb_core */
 #include <linux/etherdevice.h> /* eth_type_trans */
@@ -43,6 +44,8 @@ struct xdp_bulk_queue {
 
 /* Struct for every remote "destination" CPU in map */
 struct bpf_cpu_map_entry {
+	u32 cpu;    /* kthread CPU and map index */
+	int map_id; /* Back reference to map */
 	u32 qsize;  /* Queue size placeholder for map lookup */
 
 	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
@@ -280,15 +283,16 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
-		unsigned int processed = 0, drops = 0;
+		unsigned int processed = 0, drops = 0, sched = 0;
 		struct xdp_pkt *xdp_pkt;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
 			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
+			sched = 1;
 		} else {
-			cond_resched();
+			sched = cond_resched();
 		}
 		__set_current_state(TASK_RUNNING);
 
@@ -318,6 +322,9 @@ static int cpu_map_kthread_run(void *data)
 			if (++processed == 8)
 				break;
 		}
+		/* Feedback loop via tracepoint */
+		trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
 		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
 	__set_current_state(TASK_RUNNING);
@@ -354,7 +361,9 @@ struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
 	if (err)
 		goto free_queue;
 
-	rcpu->qsize = qsize;
+	rcpu->cpu    = cpu;
+	rcpu->map_id = map_id;
+	rcpu->qsize  = qsize;
 
 	/* Setup kthread */
 	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
@@ -584,6 +593,8 @@ const struct bpf_map_ops cpu_map_ops = {
 static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 			     struct xdp_bulk_queue *bq)
 {
+	unsigned int processed = 0, drops = 0;
+	const int to_cpu = rcpu->cpu;
 	struct ptr_ring *q;
 	int i;
 
@@ -599,13 +610,16 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 
 		err = __ptr_ring_produce(q, xdp_pkt);
 		if (err) {
-			/* Free xdp_pkt */
-			page_frag_free(xdp_pkt);
+			drops++;
+			page_frag_free(xdp_pkt); /* Free xdp_pkt */
 		}
+		processed++;
 	}
 	bq->count = 0;
 	spin_unlock(&q->producer_lock);
 
+	/* Feedback loop via tracepoints */
+	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5ffeb0501c6b36d080de78372fdb70b404b91e9d Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 25 Jul 2016 16:07:10 +0100
Subject: genirq: export irq_get_percpu_devid_partition to modules

Any modular driver using cluster-affine PPIs needs to be able to call
irq_get_percpu_devid_partition so that it can enable the IRQ on the
correct subset of CPUs.

This patch exports the symbol so that it can be called from within a
module.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 kernel/irq/irqdesc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 82afb7ed369f..694c1a9d6485 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -863,6 +863,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
 
 void kstat_incr_irq_this_cpu(unsigned int irq)
 {
-- 
cgit v1.2.3


From bc1d202023eb66f088f736ba423bee1cf135c720 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 16 Aug 2016 16:53:15 +0100
Subject: perf/core: Export AUX buffer helpers to modules

Perf PMU drivers using AUX buffers cannot be built as modules unless
the AUX helpers are exported.

This patch exports perf_aux_output_{begin,end,skip} and perf_get_aux to
modules.

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 kernel/events/ring_buffer.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index f684d8e5fa2b..6d9bffe4d6cc 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -411,6 +411,7 @@ err:
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_begin);
 
 static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
 {
@@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 	rb_free_aux(rb);
 	ring_buffer_put(rb);
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_end);
 
 /*
  * Skip over a given number of bytes in the AUX buffer, due to, for example,
@@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_skip);
 
 void *perf_get_aux(struct perf_output_handle *handle)
 {
@@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle)
 
 	return handle->rb->aux_priv;
 }
+EXPORT_SYMBOL_GPL(perf_get_aux);
 
 #define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
-- 
cgit v1.2.3


From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:53 -0700
Subject: bpf: split verifier and program ops

struct bpf_verifier_ops contains both verifier ops and operations
used later during program's lifetime (test_run).  Split the runtime
ops into a different structure.

BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops
to the names.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h       | 15 ++++++++++-----
 include/linux/bpf_types.h | 28 ++++++++++++++--------------
 kernel/bpf/syscall.c      | 16 +++++++++++++---
 kernel/bpf/verifier.c     | 12 ++++++------
 kernel/trace/bpf_trace.c  | 15 ++++++++++++---
 net/core/filter.c         | 45 ++++++++++++++++++++++++++++++++++++---------
 6 files changed, 91 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6d4dd844828a..e1fba5504ca5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -157,6 +157,11 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
 	aux->ctx_field_size = size;
 }
 
+struct bpf_prog_ops {
+	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
+			union bpf_attr __user *uattr);
+};
+
 struct bpf_verifier_ops {
 	/* return eBPF function prototype for verification */
 	const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
@@ -172,8 +177,6 @@ struct bpf_verifier_ops {
 				  const struct bpf_insn *src,
 				  struct bpf_insn *dst,
 				  struct bpf_prog *prog, u32 *target_size);
-	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
-			union bpf_attr __user *uattr);
 };
 
 struct bpf_prog_aux {
@@ -184,7 +187,8 @@ struct bpf_prog_aux {
 	u32 id;
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
-	const struct bpf_verifier_ops *ops;
+	const struct bpf_prog_ops *ops;
+	const struct bpf_verifier_ops *vops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
@@ -279,8 +283,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
-#define BPF_PROG_TYPE(_id, _ops) \
-	extern const struct bpf_verifier_ops _ops;
+#define BPF_PROG_TYPE(_id, _name) \
+	extern const struct bpf_prog_ops _name ## _prog_ops; \
+	extern const struct bpf_verifier_ops _name ## _verifier_ops;
 #define BPF_MAP_TYPE(_id, _ops) \
 	extern const struct bpf_map_ops _ops;
 #include <linux/bpf_types.h>
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 814c1081a4a9..36418ad43245 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -1,22 +1,22 @@
 /* internal file - do not include directly */
 
 #ifdef CONFIG_NET
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
+BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 #endif
 #ifdef CONFIG_BPF_EVENTS
-BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
+BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
+BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 54fba06942f5..444902b5a30d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -739,9 +739,18 @@ err_put:
 	return err;
 }
 
-static const struct bpf_verifier_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _ops) \
-	[_id] = &_ops,
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+	[_id] = & _name ## _prog_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+	[_id] = & _name ## _verifier_ops,
 #define BPF_MAP_TYPE(_id, _ops)
 #include <linux/bpf_types.h>
 #undef BPF_PROG_TYPE
@@ -754,6 +763,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 		return -EINVAL;
 
 	prog->aux->ops = bpf_prog_types[type];
+	prog->aux->vops = bpf_verifier_ops[type];
 	prog->type = type;
 	return 0;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e4d5136725a2..38e24d69fc95 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -856,8 +856,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 			*reg_type = info.reg_type;
 			return 0;
 		}
-	} else if (env->prog->aux->ops->is_valid_access &&
-		   env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+	} else if (env->prog->aux->vops->is_valid_access &&
+		   env->prog->aux->vops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
@@ -1565,8 +1565,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		return -EINVAL;
 	}
 
-	if (env->prog->aux->ops->get_func_proto)
-		fn = env->prog->aux->ops->get_func_proto(func_id);
+	if (env->prog->aux->vops->get_func_proto)
+		fn = env->prog->aux->vops->get_func_proto(func_id);
 
 	if (!fn) {
 		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
@@ -4035,7 +4035,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
  */
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
-	const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+	const struct bpf_verifier_ops *ops = env->prog->aux->vops;
 	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16], *insn;
@@ -4236,7 +4236,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn      = new_prog->insnsi + i + delta;
 		}
 patch_call_imm:
-		fn = prog->aux->ops->get_func_proto(insn->imm);
+		fn = prog->aux->vops->get_func_proto(insn->imm);
 		/* all functions that have prototype and verifier allowed
 		 * programs to call them, must be real in-kernel functions
 		 */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 04ea5314f2bc..3126da2f468a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -561,11 +561,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
 	return true;
 }
 
-const struct bpf_verifier_ops kprobe_prog_ops = {
+const struct bpf_verifier_ops kprobe_verifier_ops = {
 	.get_func_proto  = kprobe_prog_func_proto,
 	.is_valid_access = kprobe_prog_is_valid_access,
 };
 
+const struct bpf_prog_ops kprobe_prog_ops = {
+};
+
 BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
@@ -667,11 +670,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
 	return true;
 }
 
-const struct bpf_verifier_ops tracepoint_prog_ops = {
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
 	.get_func_proto  = tp_prog_func_proto,
 	.is_valid_access = tp_prog_is_valid_access,
 };
 
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 				    struct bpf_insn_access_aux *info)
 {
@@ -727,8 +733,11 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
-const struct bpf_verifier_ops perf_event_prog_ops = {
+const struct bpf_verifier_ops perf_event_verifier_ops = {
 	.get_func_proto		= tp_prog_func_proto,
 	.is_valid_access	= pe_prog_is_valid_access,
 	.convert_ctx_access	= pe_prog_convert_ctx_access,
 };
+
+const struct bpf_prog_ops perf_event_prog_ops = {
+};
diff --git a/net/core/filter.c b/net/core/filter.c
index 4d88e0665c41..1dd3034f846f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4395,68 +4395,95 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
-const struct bpf_verifier_ops sk_filter_prog_ops = {
+const struct bpf_verifier_ops sk_filter_verifier_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
-const struct bpf_verifier_ops tc_cls_act_prog_ops = {
+const struct bpf_prog_ops sk_filter_prog_ops = {
+};
+
+const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.get_func_proto		= tc_cls_act_func_proto,
 	.is_valid_access	= tc_cls_act_is_valid_access,
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
+};
+
+const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops xdp_prog_ops = {
+const struct bpf_verifier_ops xdp_verifier_ops = {
 	.get_func_proto		= xdp_func_proto,
 	.is_valid_access	= xdp_is_valid_access,
 	.convert_ctx_access	= xdp_convert_ctx_access,
+};
+
+const struct bpf_prog_ops xdp_prog_ops = {
 	.test_run		= bpf_prog_test_run_xdp,
 };
 
-const struct bpf_verifier_ops cg_skb_prog_ops = {
+const struct bpf_verifier_ops cg_skb_verifier_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_skb_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops lwt_inout_prog_ops = {
+const struct bpf_verifier_ops lwt_inout_verifier_ops = {
 	.get_func_proto		= lwt_inout_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_inout_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops lwt_xmit_prog_ops = {
+const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
 	.get_func_proto		= lwt_xmit_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
+};
+
+const struct bpf_prog_ops lwt_xmit_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops cg_sock_prog_ops = {
+const struct bpf_verifier_ops cg_sock_verifier_ops = {
 	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
 	.convert_ctx_access	= sock_filter_convert_ctx_access,
 };
 
-const struct bpf_verifier_ops sock_ops_prog_ops = {
+const struct bpf_prog_ops cg_sock_prog_ops = {
+};
+
+const struct bpf_verifier_ops sock_ops_verifier_ops = {
 	.get_func_proto		= sock_ops_func_proto,
 	.is_valid_access	= sock_ops_is_valid_access,
 	.convert_ctx_access	= sock_ops_convert_ctx_access,
 };
 
-const struct bpf_verifier_ops sk_skb_prog_ops = {
+const struct bpf_prog_ops sock_ops_prog_ops = {
+};
+
+const struct bpf_verifier_ops sk_skb_verifier_ops = {
 	.get_func_proto		= sk_skb_func_proto,
 	.is_valid_access	= sk_skb_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 	.gen_prologue		= sk_skb_prologue,
 };
 
+const struct bpf_prog_ops sk_skb_prog_ops = {
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
-- 
cgit v1.2.3


From 00176a34d9e27ab1e77db75fe13abc005cffe0ca Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:54 -0700
Subject: bpf: remove the verifier ops from program structure

Since the verifier ops don't have to be associated with
the program for its entire lifetime we can move it to
verifier's struct bpf_verifier_env.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  1 -
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/syscall.c         | 10 ----------
 kernel/bpf/verifier.c        | 23 +++++++++++++++++------
 4 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1fba5504ca5..cf91977e8719 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -188,7 +188,6 @@ struct bpf_prog_aux {
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
 	const struct bpf_prog_ops *ops;
-	const struct bpf_verifier_ops *vops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f00ef751c1c5..feeaea93d959 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -141,6 +141,7 @@ struct bpf_ext_analyzer_ops {
  */
 struct bpf_verifier_env {
 	struct bpf_prog *prog;		/* eBPF program being verified */
+	const struct bpf_verifier_ops *ops;
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 444902b5a30d..0e893cac6795 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -748,22 +748,12 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #undef BPF_MAP_TYPE
 };
 
-static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
-#define BPF_PROG_TYPE(_id, _name) \
-	[_id] = & _name ## _verifier_ops,
-#define BPF_MAP_TYPE(_id, _ops)
-#include <linux/bpf_types.h>
-#undef BPF_PROG_TYPE
-#undef BPF_MAP_TYPE
-};
-
 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 {
 	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
 		return -EINVAL;
 
 	prog->aux->ops = bpf_prog_types[type];
-	prog->aux->vops = bpf_verifier_ops[type];
 	prog->type = type;
 	return 0;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 38e24d69fc95..3b6e2c550e96 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,15 @@
 
 #include "disasm.h"
 
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+	[_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -856,8 +865,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 			*reg_type = info.reg_type;
 			return 0;
 		}
-	} else if (env->prog->aux->vops->is_valid_access &&
-		   env->prog->aux->vops->is_valid_access(off, size, t, &info)) {
+	} else if (env->ops->is_valid_access &&
+		   env->ops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
@@ -1565,8 +1574,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		return -EINVAL;
 	}
 
-	if (env->prog->aux->vops->get_func_proto)
-		fn = env->prog->aux->vops->get_func_proto(func_id);
+	if (env->ops->get_func_proto)
+		fn = env->ops->get_func_proto(func_id);
 
 	if (!fn) {
 		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
@@ -4035,7 +4044,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
  */
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
-	const struct bpf_verifier_ops *ops = env->prog->aux->vops;
+	const struct bpf_verifier_ops *ops = env->ops;
 	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16], *insn;
@@ -4236,7 +4245,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn      = new_prog->insnsi + i + delta;
 		}
 patch_call_imm:
-		fn = prog->aux->vops->get_func_proto(insn->imm);
+		fn = env->ops->get_func_proto(insn->imm);
 		/* all functions that have prototype and verifier allowed
 		 * programs to call them, must be real in-kernel functions
 		 */
@@ -4294,6 +4303,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!env->insn_aux_data)
 		goto err_free_env;
 	env->prog = *prog;
+	env->ops = bpf_verifier_ops[env->prog->type];
 
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
@@ -4406,6 +4416,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	if (!env->insn_aux_data)
 		goto err_free_env;
 	env->prog = prog;
+	env->ops = bpf_verifier_ops[env->prog->type];
 	env->analyzer_ops = ops;
 	env->analyzer_priv = priv;
 
-- 
cgit v1.2.3


From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:55 -0700
Subject: bpf: move knowledge about post-translation offsets out of verifier

Use the fact that verifier ops are now separate from program
ops to define a separate set of callbacks for verification of
already translated programs.

Since we expect the analyzer ops to be defined only for
a small subset of all program types initialize their array
by hand (don't use linux/bpf_types.h).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  3 +++
 kernel/bpf/verifier.c | 55 +++++++++++++++------------------------------------
 net/core/filter.c     | 40 +++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cf91977e8719..d67ccdc0099f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -291,6 +291,9 @@ DECLARE_PER_CPU(int, bpf_prog_active);
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
 
+extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
+extern const struct bpf_verifier_ops xdp_analyzer_ops;
+
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3b6e2c550e96..545b8c45a578 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -822,36 +822,6 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 	return err;
 }
 
-static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off,
-				     struct bpf_insn_access_aux *info)
-{
-	switch (env->prog->type) {
-	case BPF_PROG_TYPE_XDP:
-		switch (off) {
-		case offsetof(struct xdp_buff, data):
-			info->reg_type = PTR_TO_PACKET;
-			return true;
-		case offsetof(struct xdp_buff, data_end):
-			info->reg_type = PTR_TO_PACKET_END;
-			return true;
-		}
-		return false;
-	case BPF_PROG_TYPE_SCHED_CLS:
-		switch (off) {
-		case offsetof(struct sk_buff, data):
-			info->reg_type = PTR_TO_PACKET;
-			return true;
-		case offsetof(struct sk_buff, cb) +
-		     offsetof(struct bpf_skb_data_end, data_end):
-			info->reg_type = PTR_TO_PACKET_END;
-			return true;
-		}
-		return false;
-	default:
-		return false;
-	}
-}
-
 /* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
@@ -860,13 +830,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		.reg_type = *reg_type,
 	};
 
-	if (env->analyzer_ops) {
-		if (analyzer_is_valid_access(env, off, &info)) {
-			*reg_type = info.reg_type;
-			return 0;
-		}
-	} else if (env->ops->is_valid_access &&
-		   env->ops->is_valid_access(off, size, t, &info)) {
+	if (env->ops->is_valid_access &&
+	    env->ops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
@@ -874,9 +839,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		 * will only allow for whole field access and rejects any other
 		 * type of narrower access.
 		 */
-		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		*reg_type = info.reg_type;
 
+		if (env->analyzer_ops)
+			return 0;
+
+		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
 			env->prog->aux->max_ctx_offset = off + size;
@@ -4400,12 +4368,21 @@ err_free_env:
 	return ret;
 }
 
+static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = {
+	[BPF_PROG_TYPE_XDP]		= &xdp_analyzer_ops,
+	[BPF_PROG_TYPE_SCHED_CLS]	= &tc_cls_act_analyzer_ops,
+};
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv)
 {
 	struct bpf_verifier_env *env;
 	int ret;
 
+	if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) ||
+	    !bpf_analyzer_ops[prog->type])
+		return -EOPNOTSUPP;
+
 	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
 	if (!env)
 		return -ENOMEM;
@@ -4416,7 +4393,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	if (!env->insn_aux_data)
 		goto err_free_env;
 	env->prog = prog;
-	env->ops = bpf_verifier_ops[env->prog->type];
+	env->ops = bpf_analyzer_ops[env->prog->type];
 	env->analyzer_ops = ops;
 	env->analyzer_priv = priv;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 1dd3034f846f..7373a08fbef7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3732,6 +3732,23 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
+static bool
+tc_cls_act_is_valid_access_analyzer(int off, int size,
+				    enum bpf_access_type type,
+				    struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case offsetof(struct sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		return true;
+	case offsetof(struct sk_buff, cb) +
+	     offsetof(struct bpf_skb_data_end, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		return true;
+	}
+	return false;
+}
+
 static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
@@ -3766,6 +3783,21 @@ static bool xdp_is_valid_access(int off, int size,
 	return __is_valid_xdp_access(off, size);
 }
 
+static bool xdp_is_valid_access_analyzer(int off, int size,
+					 enum bpf_access_type type,
+					 struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case offsetof(struct xdp_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		return true;
+	case offsetof(struct xdp_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		return true;
+	}
+	return false;
+}
+
 void bpf_warn_invalid_xdp_action(u32 act)
 {
 	const u32 act_max = XDP_REDIRECT;
@@ -4411,6 +4443,10 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.gen_prologue		= tc_cls_act_prologue,
 };
 
+const struct bpf_verifier_ops tc_cls_act_analyzer_ops = {
+	.is_valid_access	= tc_cls_act_is_valid_access_analyzer,
+};
+
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
@@ -4421,6 +4457,10 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
 	.convert_ctx_access	= xdp_convert_ctx_access,
 };
 
+const struct bpf_verifier_ops xdp_analyzer_ops = {
+	.is_valid_access	= xdp_is_valid_access_analyzer,
+};
+
 const struct bpf_prog_ops xdp_prog_ops = {
 	.test_run		= bpf_prog_test_run_xdp,
 };
-- 
cgit v1.2.3


From 93862e385ded7c60351e09fcd2a541d273650905 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 13 Oct 2017 15:08:41 -0400
Subject: livepatch: add (un)patch callbacks

Provide livepatch modules a klp_object (un)patching notification
mechanism.  Pre and post-(un)patch callbacks allow livepatch modules to
setup or synchronize changes that would be difficult to support in only
patched-or-unpatched code contexts.

Callbacks can be registered for target module or vmlinux klp_objects,
but each implementation is klp_object specific.

  - Pre-(un)patch callbacks run before any (un)patching transition
    starts.

  - Post-(un)patch callbacks run once an object has been (un)patched and
    the klp_patch fully transitioned to its target state.

Example use cases include modification of global data and registration
of newly available services/handlers.

See Documentation/livepatch/callbacks.txt for details and
samples/livepatch/ for examples.

Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/livepatch/callbacks.txt           | 605 ++++++++++++++++++++++++
 include/linux/livepatch.h                       |  26 +
 kernel/livepatch/core.c                         |  51 +-
 kernel/livepatch/core.h                         |  38 ++
 kernel/livepatch/patch.c                        |   1 +
 kernel/livepatch/transition.c                   |  21 +-
 samples/livepatch/Makefile                      |   3 +
 samples/livepatch/livepatch-callbacks-busymod.c |  72 +++
 samples/livepatch/livepatch-callbacks-demo.c    | 234 +++++++++
 samples/livepatch/livepatch-callbacks-mod.c     |  53 +++
 10 files changed, 1091 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/livepatch/callbacks.txt
 create mode 100644 samples/livepatch/livepatch-callbacks-busymod.c
 create mode 100644 samples/livepatch/livepatch-callbacks-demo.c
 create mode 100644 samples/livepatch/livepatch-callbacks-mod.c

(limited to 'kernel')

diff --git a/Documentation/livepatch/callbacks.txt b/Documentation/livepatch/callbacks.txt
new file mode 100644
index 000000000000..c9776f48e458
--- /dev/null
+++ b/Documentation/livepatch/callbacks.txt
@@ -0,0 +1,605 @@
+======================
+(Un)patching Callbacks
+======================
+
+Livepatch (un)patch-callbacks provide a mechanism for livepatch modules
+to execute callback functions when a kernel object is (un)patched.  They
+can be considered a "power feature" that extends livepatching abilities
+to include:
+
+  - Safe updates to global data
+
+  - "Patches" to init and probe functions
+
+  - Patching otherwise unpatchable code (i.e. assembly)
+
+In most cases, (un)patch callbacks will need to be used in conjunction
+with memory barriers and kernel synchronization primitives, like
+mutexes/spinlocks, or even stop_machine(), to avoid concurrency issues.
+
+Callbacks differ from existing kernel facilities:
+
+  - Module init/exit code doesn't run when disabling and re-enabling a
+    patch.
+
+  - A module notifier can't stop a to-be-patched module from loading.
+
+Callbacks are part of the klp_object structure and their implementation
+is specific to that klp_object.  Other livepatch objects may or may not
+be patched, irrespective of the target klp_object's current state.
+
+Callbacks can be registered for the following livepatch actions:
+
+  * Pre-patch    - before a klp_object is patched
+
+  * Post-patch   - after a klp_object has been patched and is active
+                   across all tasks
+
+  * Pre-unpatch  - before a klp_object is unpatched (ie, patched code is
+                   active), used to clean up post-patch callback
+                   resources
+
+  * Post-unpatch - after a klp_object has been patched, all code has
+                   been restored and no tasks are running patched code,
+                   used to cleanup pre-patch callback resources
+
+Each callback is optional, omitting one does not preclude specifying any
+other.  However, the livepatching core executes the handlers in
+symmetry: pre-patch callbacks have a post-unpatch counterpart and
+post-patch callbacks have a pre-unpatch counterpart.  An unpatch
+callback will only be executed if its corresponding patch callback was
+executed.  Typical use cases pair a patch handler that acquires and
+configures resources with an unpatch handler tears down and releases
+those same resources.
+
+A callback is only executed if its host klp_object is loaded.  For
+in-kernel vmlinux targets, this means that callbacks will always execute
+when a livepatch is enabled/disabled.  For patch target kernel modules,
+callbacks will only execute if the target module is loaded.  When a
+module target is (un)loaded, its callbacks will execute only if the
+livepatch module is enabled.
+
+The pre-patch callback, if specified, is expected to return a status
+code (0 for success, -ERRNO on error).  An error status code indicates
+to the livepatching core that patching of the current klp_object is not
+safe and to stop the current patching request.  (When no pre-patch
+callback is provided, the transition is assumed to be safe.)  If a
+pre-patch callback returns failure, the kernel's module loader will:
+
+  - Refuse to load a livepatch, if the livepatch is loaded after
+    targeted code.
+
+    or:
+
+  - Refuse to load a module, if the livepatch was already successfully
+    loaded.
+
+No post-patch, pre-unpatch, or post-unpatch callbacks will be executed
+for a given klp_object if the object failed to patch, due to a failed
+pre_patch callback or for any other reason.
+
+If a patch transition is reversed, no pre-unpatch handlers will be run
+(this follows the previously mentioned symmetry -- pre-unpatch callbacks
+will only occur if their corresponding post-patch callback executed).
+
+If the object did successfully patch, but the patch transition never
+started for some reason (e.g., if another object failed to patch),
+only the post-unpatch callback will be called.
+
+
+Example Use-cases
+=================
+
+Update global data
+------------------
+
+A pre-patch callback can be useful to update a global variable.  For
+example, 75ff39ccc1bd ("tcp: make challenge acks less predictable")
+changes a global sysctl, as well as patches the tcp_send_challenge_ack()
+function.
+
+In this case, if we're being super paranoid, it might make sense to
+patch the data *after* patching is complete with a post-patch callback,
+so that tcp_send_challenge_ack() could first be changed to read
+sysctl_tcp_challenge_ack_limit with READ_ONCE.
+
+
+Support __init and probe function patches
+-----------------------------------------
+
+Although __init and probe functions are not directly livepatch-able, it
+may be possible to implement similar updates via pre/post-patch
+callbacks.
+
+48900cb6af42 ("virtio-net: drop NETIF_F_FRAGLIST") change the way that
+virtnet_probe() initialized its driver's net_device features.  A
+pre/post-patch callback could iterate over all such devices, making a
+similar change to their hw_features value.  (Client functions of the
+value may need to be updated accordingly.)
+
+
+Test cases
+==========
+
+What follows is not an exhaustive test suite of every possible livepatch
+pre/post-(un)patch combination, but a selection that demonstrates a few
+important concepts.  Each test case uses the kernel modules located in
+the samples/livepatch/ and assumes that no livepatches are loaded at the
+beginning of the test.
+
+
+Test 1
+------
+
+Test a combination of loading a kernel module and a livepatch that
+patches a function in the first module.  (Un)load the target module
+before the livepatch module:
+
+- load target module
+- load livepatch
+- disable livepatch
+- unload target module
+- unload livepatch
+
+First load a target module:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   34.475708] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+On livepatch enable, before the livepatch transition starts, pre-patch
+callbacks are executed for vmlinux and livepatch_callbacks_mod (those
+klp_objects currently loaded).  After klp_objects are patched according
+to the klp_patch, their post-patch callbacks run and the transition
+completes:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   36.503719] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   36.504213] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   36.504238] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   36.504721] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   36.505849] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   37.727133] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   37.727232] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   37.727860] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   37.728792] livepatch: 'livepatch_callbacks_demo': patching complete
+
+Similarly, on livepatch disable, pre-patch callbacks run before the
+unpatching transition starts.  klp_objects are reverted, post-patch
+callbacks execute and the transition completes:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   38.510209] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   38.510234] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   38.510982] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   38.512209] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   39.711132] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   39.711210] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   39.711779] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   39.712735] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   42.534183] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+
+
+Test 2
+------
+
+This test is similar to the previous test, but (un)load the livepatch
+module before the target kernel module.  This tests the livepatch core's
+module_coming handler:
+
+- load livepatch
+- load target module
+- disable livepatch
+- unload livepatch
+- unload target module
+
+
+On livepatch enable, only pre/post-patch callbacks are executed for
+currently loaded klp_objects, in this case, vmlinux:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   44.553328] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   44.553997] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   44.554049] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   44.554845] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   45.727128] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   45.727212] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   45.727961] livepatch: 'livepatch_callbacks_demo': patching complete
+
+When a targeted module is subsequently loaded, only its pre/post-patch
+callbacks are executed:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   46.560845] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [   46.561988] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   46.563452] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   46.565495] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+On livepatch disable, all currently loaded klp_objects' (vmlinux and
+livepatch_callbacks_mod) pre/post-unpatch callbacks are executed:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   48.568885] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   48.568910] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   48.569441] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   48.570502] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   49.759091] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   49.759171] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   49.759742] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   49.760690] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   52.592283] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+
+
+Test 3
+------
+
+Test loading the livepatch after a targeted kernel module, then unload
+the kernel module before disabling the livepatch.  This tests the
+livepatch core's module_going handler:
+
+- load target module
+- load livepatch
+- unload target module
+- disable livepatch
+- unload livepatch
+
+First load a target module, then the livepatch:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   54.607948] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   56.613919] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   56.614411] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   56.614436] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   56.614818] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   56.615656] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   57.759070] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   57.759147] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   57.759621] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   57.760307] livepatch: 'livepatch_callbacks_demo': patching complete
+
+When a target module is unloaded, the livepatch is only reverted from
+that klp_object (livepatch_callbacks_mod).  As such, only its pre and
+post-unpatch callbacks are executed when this occurs:
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   58.623409] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+  [   58.623903] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+  [   58.624658] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod'
+  [   58.625305] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+
+When the livepatch is disabled, pre and post-unpatch callbacks are run
+for the remaining klp_object, vmlinux:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   60.638420] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   60.638444] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   60.638996] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   61.727088] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   61.727165] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   61.727985] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+
+
+Test 4
+------
+
+This test is similar to the previous test, however the livepatch is
+loaded first.  This tests the livepatch core's module_coming and
+module_going handlers:
+
+- load livepatch
+- load target module
+- unload target module
+- disable livepatch
+- unload livepatch
+
+First load the livepatch:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   64.661552] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   64.662147] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   64.662175] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   64.662850] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   65.695056] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   65.695147] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   65.695561] livepatch: 'livepatch_callbacks_demo': patching complete
+
+When a targeted kernel module is subsequently loaded, only its
+pre/post-patch callbacks are executed:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   66.669196] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [   66.669882] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   66.670744] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   66.672873] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+When the target module is unloaded, the livepatch is only reverted from
+the livepatch_callbacks_mod klp_object.  As such, only pre and
+post-unpatch callbacks are executed when this occurs:
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   68.680065] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+  [   68.680688] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+  [   68.681452] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod'
+  [   68.682094] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   70.689225] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   70.689256] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   70.689882] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   71.711080] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   71.711481] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   71.711988] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+
+
+Test 5
+------
+
+A simple test of loading a livepatch without one of its patch target
+klp_objects ever loaded (livepatch_callbacks_mod):
+
+- load livepatch
+- disable livepatch
+- unload livepatch
+
+Load the livepatch:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   74.711081] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   74.711595] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   74.711639] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   74.712272] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   75.743137] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   75.743219] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   75.743867] livepatch: 'livepatch_callbacks_demo': patching complete
+
+As expected, only pre/post-(un)patch handlers are executed for vmlinux:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   76.716254] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   76.716278] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   76.716666] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   77.727089] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   77.727194] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   77.727907] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+
+
+Test 6
+------
+
+Test a scenario where a vmlinux pre-patch callback returns a non-zero
+status (ie, failure):
+
+- load target module
+- load livepatch -ENODEV
+- unload target module
+
+First load a target module:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   80.740520] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+Load the livepatch module, setting its 'pre_patch_ret' value to -19
+(-ENODEV).  When its vmlinux pre-patch callback executed, this status
+code will propagate back to the module-loading subsystem.  The result is
+that the insmod command refuses to load the livepatch module:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko pre_patch_ret=-19
+  [   82.747326] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   82.747743] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   82.747767] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   82.748237] livepatch: pre-patch callback failed for object 'vmlinux'
+  [   82.748637] livepatch: failed to enable patch 'livepatch_callbacks_demo'
+  [   82.749059] livepatch: 'livepatch_callbacks_demo': canceling transition, going to unpatch
+  [   82.749060] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   82.749868] livepatch: 'livepatch_callbacks_demo': unpatching complete
+  [   82.765809] insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-demo.ko: No such device
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   84.774238] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+
+
+Test 7
+------
+
+Similar to the previous test, setup a livepatch such that its vmlinux
+pre-patch callback returns success.  However, when a targeted kernel
+module is later loaded, have the livepatch return a failing status code:
+
+- load livepatch
+- setup -ENODEV
+- load target module
+- disable livepatch
+- unload livepatch
+
+Load the livepatch, notice vmlinux pre-patch callback succeeds:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   86.787845] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   86.788325] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   86.788427] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   86.788821] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   87.711069] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   87.711143] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   87.711886] livepatch: 'livepatch_callbacks_demo': patching complete
+
+Set a trap so subsequent pre-patch callbacks to this livepatch will
+return -ENODEV:
+
+  % echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
+
+The livepatch pre-patch callback for subsequently loaded target modules
+will return failure, so the module loader refuses to load the kernel
+module.  Notice that no post-patch or pre/post-unpatch callbacks are
+executed for this klp_object:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   90.796976] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [   90.797834] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   90.798900] livepatch: pre-patch callback failed for object 'livepatch_callbacks_mod'
+  [   90.799652] livepatch: patch 'livepatch_callbacks_demo' failed for module 'livepatch_callbacks_mod', refusing to load module 'livepatch_callbacks_mod'
+  [   90.819737] insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
+
+However, pre/post-unpatch callbacks run for the vmlinux klp_object:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   92.823547] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   92.823573] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   92.824331] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   93.727128] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   93.727327] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   93.727861] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+
+
+Test 8
+------
+
+Test loading multiple targeted kernel modules.  This test-case is
+mainly for comparing with the next test-case.
+
+- load busy target module (0s sleep),
+- load livepatch
+- load target module
+- unload target module
+- disable livepatch
+- unload livepatch
+- unload busy target module
+
+
+Load a target "busy" kernel module which kicks off a worker function
+that immediately exits:
+
+  % insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=0
+  [   96.910107] livepatch_callbacks_busymod: livepatch_callbacks_mod_init
+  [   96.910600] livepatch_callbacks_busymod: busymod_work_func, sleeping 0 seconds ...
+  [   96.913024] livepatch_callbacks_busymod: busymod_work_func exit
+
+Proceed with loading the livepatch and another ordinary target module,
+notice that the post-patch callbacks are executed and the transition
+completes quickly:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   98.917892] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   98.918426] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   98.918453] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   98.918955] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [   98.923835] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   99.743104] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   99.743156] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   99.743679] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [   99.744616] livepatch: 'livepatch_callbacks_demo': patching complete
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [  100.930955] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [  100.931668] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [  100.932645] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [  100.934125] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [  102.942805] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+  [  102.943640] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+  [  102.944585] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod'
+  [  102.945455] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [  104.953815] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [  104.953838] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [  104.954431] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [  104.955426] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [  106.719073] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [  106.722633] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [  106.723282] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [  106.724279] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+  % rmmod samples/livepatch/livepatch-callbacks-busymod.ko
+  [  108.975660] livepatch_callbacks_busymod: livepatch_callbacks_mod_exit
+
+
+Test 9
+------
+
+A similar test as the previous one, but force the "busy" kernel module
+to do longer work.
+
+The livepatching core will refuse to patch a task that is currently
+executing a to-be-patched function -- the consistency model stalls the
+current patch transition until this safety-check is met.  Test a
+scenario where one of a livepatch's target klp_objects sits on such a
+function for a long time.  Meanwhile, load and unload other target
+kernel modules while the livepatch transition is in progress.
+
+- load busy target module (30s sleep)
+- load livepatch
+- load target module
+- unload target module
+- disable livepatch
+- unload livepatch
+- unload busy target module
+
+
+Load the "busy" kernel module, this time make it do 30 seconds worth of
+work:
+
+  % insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
+  [  110.993362] livepatch_callbacks_busymod: livepatch_callbacks_mod_init
+  [  110.994059] livepatch_callbacks_busymod: busymod_work_func, sleeping 30 seconds ...
+
+Meanwhile, the livepatch is loaded.  Notice that the patch transition
+does not complete as the targeted "busy" module is sitting on a
+to-be-patched function:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [  113.000309] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [  113.000764] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [  113.000791] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [  113.001289] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [  113.005208] livepatch: 'livepatch_callbacks_demo': starting patching transition
+
+Load a second target module (this one is an ordinary idle kernel
+module).  Note that *no* post-patch callbacks will be executed while the
+livepatch is still in transition:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [  115.012740] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [  115.013406] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [  115.015315] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+Request an unload of the simple kernel module.  The patch is still
+transitioning, so its pre-unpatch callbacks are skipped:
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [  117.022626] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+  [  117.023376] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod'
+  [  117.024533] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+
+Finally the livepatch is disabled.  Since none of the patch's
+klp_object's post-patch callbacks executed, the remaining klp_object's
+pre-unpatch callbacks are skipped:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [  119.035408] livepatch: 'livepatch_callbacks_demo': reversing transition from patching to unpatching
+  [  119.035485] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [  119.711166] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [  119.714179] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [  119.714653] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [  119.715437] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+  % rmmod samples/livepatch/livepatch-callbacks-busymod.ko
+  [  141.279111] livepatch_callbacks_busymod: busymod_work_func exit
+  [  141.279760] livepatch_callbacks_busymod: livepatch_callbacks_mod_exit
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index d08eddc00497..fc5c1be3f6f4 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -87,10 +87,35 @@ struct klp_func {
 	bool transition;
 };
 
+struct klp_object;
+
+/**
+ * struct klp_callbacks - pre/post live-(un)patch callback structure
+ * @pre_patch:		executed before code patching
+ * @post_patch:		executed after code patching
+ * @pre_unpatch:	executed before code unpatching
+ * @post_unpatch:	executed after code unpatching
+ * @post_unpatch_enabled:	flag indicating if post-unpatch callback
+ * 				should run
+ *
+ * All callbacks are optional.  Only the pre-patch callback, if provided,
+ * will be unconditionally executed.  If the parent klp_object fails to
+ * patch for any reason, including a non-zero error status returned from
+ * the pre-patch callback, no further callbacks will be executed.
+ */
+struct klp_callbacks {
+	int (*pre_patch)(struct klp_object *obj);
+	void (*post_patch)(struct klp_object *obj);
+	void (*pre_unpatch)(struct klp_object *obj);
+	void (*post_unpatch)(struct klp_object *obj);
+	bool post_unpatch_enabled;
+};
+
 /**
  * struct klp_object - kernel object structure for live patching
  * @name:	module name (or NULL for vmlinux)
  * @funcs:	function entries for functions to be patched in the object
+ * @callbacks:	functions to be executed pre/post (un)patching
  * @kobj:	kobject for sysfs resources
  * @mod:	kernel module associated with the patched object
  *		(NULL for vmlinux)
@@ -100,6 +125,7 @@ struct klp_object {
 	/* external */
 	const char *name;
 	struct klp_func *funcs;
+	struct klp_callbacks callbacks;
 
 	/* internal */
 	struct kobject kobj;
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..cafb5a84417d 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj)
 	return obj->name;
 }
 
-static bool klp_is_object_loaded(struct klp_object *obj)
-{
-	return !obj->name || obj->mod;
-}
-
 /* sets obj->mod if object is not vmlinux and module is found */
 static void klp_find_object_module(struct klp_object *obj)
 {
@@ -285,6 +280,8 @@ static int klp_write_object_relocations(struct module *pmod,
 
 static int __klp_disable_patch(struct klp_patch *patch)
 {
+	struct klp_object *obj;
+
 	if (klp_transition_patch)
 		return -EBUSY;
 
@@ -295,6 +292,10 @@ static int __klp_disable_patch(struct klp_patch *patch)
 
 	klp_init_transition(patch, KLP_UNPATCHED);
 
+	klp_for_each_object(patch, obj)
+		if (patch->enabled && obj->patched)
+			klp_pre_unpatch_callback(obj);
+
 	/*
 	 * Enforce the order of the func->transition writes in
 	 * klp_init_transition() and the TIF_PATCH_PENDING writes in
@@ -388,13 +389,18 @@ static int __klp_enable_patch(struct klp_patch *patch)
 		if (!klp_is_object_loaded(obj))
 			continue;
 
-		ret = klp_patch_object(obj);
+		ret = klp_pre_patch_callback(obj);
 		if (ret) {
-			pr_warn("failed to enable patch '%s'\n",
-				patch->mod->name);
+			pr_warn("pre-patch callback failed for object '%s'\n",
+				klp_is_module(obj) ? obj->name : "vmlinux");
+			goto err;
+		}
 
-			klp_cancel_transition();
-			return ret;
+		ret = klp_patch_object(obj);
+		if (ret) {
+			pr_warn("failed to patch object '%s'\n",
+				klp_is_module(obj) ? obj->name : "vmlinux");
+			goto err;
 		}
 	}
 
@@ -403,6 +409,11 @@ static int __klp_enable_patch(struct klp_patch *patch)
 	patch->enabled = true;
 
 	return 0;
+err:
+	pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+	klp_cancel_transition();
+	return ret;
 }
 
 /**
@@ -871,13 +882,27 @@ int klp_module_coming(struct module *mod)
 			pr_notice("applying patch '%s' to loading module '%s'\n",
 				  patch->mod->name, obj->mod->name);
 
+			ret = klp_pre_patch_callback(obj);
+			if (ret) {
+				pr_warn("pre-patch callback failed for object '%s'\n",
+					obj->name);
+				goto err;
+			}
+
 			ret = klp_patch_object(obj);
 			if (ret) {
 				pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
 					patch->mod->name, obj->mod->name, ret);
+
+				if (patch != klp_transition_patch)
+					klp_post_unpatch_callback(obj);
+
 				goto err;
 			}
 
+			if (patch != klp_transition_patch)
+				klp_post_patch_callback(obj);
+
 			break;
 		}
 	}
@@ -927,9 +952,15 @@ void klp_module_going(struct module *mod)
 			 * is in transition.
 			 */
 			if (patch->enabled || patch == klp_transition_patch) {
+
+				if (patch != klp_transition_patch)
+					klp_pre_unpatch_callback(obj);
+
 				pr_notice("reverting patch '%s' on unloading module '%s'\n",
 					  patch->mod->name, obj->mod->name);
 				klp_unpatch_object(obj);
+
+				klp_post_unpatch_callback(obj);
 			}
 
 			klp_free_object_loaded(obj);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index c74f24c47837..6fc907b54e71 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -1,6 +1,44 @@
 #ifndef _LIVEPATCH_CORE_H
 #define _LIVEPATCH_CORE_H
 
+#include <linux/livepatch.h>
+
 extern struct mutex klp_mutex;
 
+static inline bool klp_is_object_loaded(struct klp_object *obj)
+{
+	return !obj->name || obj->mod;
+}
+
+static inline int klp_pre_patch_callback(struct klp_object *obj)
+{
+	int ret;
+
+	ret = (obj->callbacks.pre_patch) ?
+		(*obj->callbacks.pre_patch)(obj) : 0;
+
+	obj->callbacks.post_unpatch_enabled = !ret;
+
+	return ret;
+}
+
+static inline void klp_post_patch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.post_patch)
+		(*obj->callbacks.post_patch)(obj);
+}
+
+static inline void klp_pre_unpatch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.pre_unpatch)
+		(*obj->callbacks.pre_unpatch)(obj);
+}
+
+static inline void klp_post_unpatch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.post_unpatch_enabled &&
+	    obj->callbacks.post_unpatch)
+		(*obj->callbacks.post_unpatch)(obj);
+}
+
 #endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 52c4e907c14b..82d584225dc6 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/bug.h>
 #include <linux/printk.h>
+#include "core.h"
 #include "patch.h"
 #include "transition.h"
 
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index b004a1fb6032..7bf55b7f3687 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -109,9 +109,6 @@ static void klp_complete_transition(void)
 		}
 	}
 
-	if (klp_target_state == KLP_UNPATCHED && !immediate_func)
-		module_put(klp_transition_patch->mod);
-
 	/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
 	if (klp_target_state == KLP_PATCHED)
 		klp_synchronize_transition();
@@ -130,6 +127,24 @@ static void klp_complete_transition(void)
 	}
 
 done:
+	klp_for_each_object(klp_transition_patch, obj) {
+		if (!klp_is_object_loaded(obj))
+			continue;
+		if (klp_target_state == KLP_PATCHED)
+			klp_post_patch_callback(obj);
+		else if (klp_target_state == KLP_UNPATCHED)
+			klp_post_unpatch_callback(obj);
+	}
+
+	/*
+	 * See complementary comment in __klp_enable_patch() for why we
+	 * keep the module reference for immediate patches.
+	 */
+	if (!klp_transition_patch->immediate && !immediate_func &&
+	    klp_target_state == KLP_UNPATCHED) {
+		module_put(klp_transition_patch->mod);
+	}
+
 	klp_target_state = KLP_UNDEFINED;
 	klp_transition_patch = NULL;
 }
diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile
index 539e81d433cd..2472ce39a18d 100644
--- a/samples/livepatch/Makefile
+++ b/samples/livepatch/Makefile
@@ -2,3 +2,6 @@ obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o
diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c
new file mode 100644
index 000000000000..80d06e103f1b
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-busymod.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone.  See the "Usage"
+ * section of livepatch-callbacks-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+
+static int sleep_secs;
+module_param(sleep_secs, int, 0644);
+MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)");
+
+static void busymod_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(work, busymod_work_func);
+
+static void busymod_work_func(struct work_struct *work)
+{
+	pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs);
+	msleep(sleep_secs * 1000);
+	pr_info("%s exit\n", __func__);
+}
+
+static int livepatch_callbacks_mod_init(void)
+{
+	pr_info("%s\n", __func__);
+	schedule_delayed_work(&work,
+		msecs_to_jiffies(1000 * 0));
+	return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+	cancel_delayed_work_sync(&work);
+	pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c
new file mode 100644
index 000000000000..3d115bd68442
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-demo.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Demonstration of registering livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - load the simple module
+ *
+ *   insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ *
+ * Step 2 - load the demonstration livepatch (with callbacks)
+ *
+ *   insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *
+ * Step 3 - cleanup
+ *
+ *   echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ *   rmmod livepatch_callbacks_demo
+ *   rmmod livepatch_callbacks_mod
+ *
+ * Watch dmesg output to see livepatch enablement, callback execution
+ * and patching operations for both vmlinux and module targets.
+ *
+ * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and
+ *       livepatch-callbacks-demo.ko to observe what happens when a
+ *       target module is loaded after a livepatch with callbacks.
+ *
+ * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch
+ *       callback return status.  Try setting up a non-zero status
+ *       such as -19 (-ENODEV):
+ *
+ *       # Load demo livepatch, vmlinux is patched
+ *       insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *       # Setup next pre-patch callback to return -ENODEV
+ *       echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
+ *
+ *       # Module loader refuses to load the target module
+ *       insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *       insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
+ *
+ * NOTE: There is a second target module,
+ *       livepatch-callbacks-busymod.ko, available for experimenting
+ *       with livepatch (un)patch callbacks.  This module contains
+ *       a 'sleep_secs' parameter that parks the module on one of the
+ *       functions that the livepatch demo module wants to patch.
+ *       Modifying this value and tweaking the order of module loads can
+ *       effectively demonstrate stalled patch transitions:
+ *
+ *       # Load a target module, let it park on 'busymod_work_func' for
+ *       # thirty seconds
+ *       insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
+ *
+ *       # Meanwhile load the livepatch
+ *       insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *       # ... then load and unload another target module while the
+ *       # transition is in progress
+ *       insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *       rmmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ *       # Finally cleanup
+ *       echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ *       rmmod samples/livepatch/livepatch-callbacks-demo.ko
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+
+static int pre_patch_ret;
+module_param(pre_patch_ret, int, 0644);
+MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)");
+
+static const char *const module_state[] = {
+	[MODULE_STATE_LIVE]	= "[MODULE_STATE_LIVE] Normal state",
+	[MODULE_STATE_COMING]	= "[MODULE_STATE_COMING] Full formed, running module_init",
+	[MODULE_STATE_GOING]	= "[MODULE_STATE_GOING] Going away",
+	[MODULE_STATE_UNFORMED]	= "[MODULE_STATE_UNFORMED] Still setting it up",
+};
+
+static void callback_info(const char *callback, struct klp_object *obj)
+{
+	if (obj->mod)
+		pr_info("%s: %s -> %s\n", callback, obj->mod->name,
+			module_state[obj->mod->state]);
+	else
+		pr_info("%s: vmlinux\n", callback);
+}
+
+/* Executed on object patching (ie, patch enablement) */
+static int pre_patch_callback(struct klp_object *obj)
+{
+	callback_info(__func__, obj);
+	return pre_patch_ret;
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void post_patch_callback(struct klp_object *obj)
+{
+	callback_info(__func__, obj);
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void pre_unpatch_callback(struct klp_object *obj)
+{
+	callback_info(__func__, obj);
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void post_unpatch_callback(struct klp_object *obj)
+{
+	callback_info(__func__, obj);
+}
+
+static void patched_work_func(struct work_struct *work)
+{
+	pr_info("%s\n", __func__);
+}
+
+static struct klp_func no_funcs[] = {
+	{ }
+};
+
+static struct klp_func busymod_funcs[] = {
+	{
+		.old_name = "busymod_work_func",
+		.new_func = patched_work_func,
+	}, { }
+};
+
+static struct klp_object objs[] = {
+	{
+		.name = NULL,	/* vmlinux */
+		.funcs = no_funcs,
+		.callbacks = {
+			.pre_patch = pre_patch_callback,
+			.post_patch = post_patch_callback,
+			.pre_unpatch = pre_unpatch_callback,
+			.post_unpatch = post_unpatch_callback,
+		},
+	},	{
+		.name = "livepatch_callbacks_mod",
+		.funcs = no_funcs,
+		.callbacks = {
+			.pre_patch = pre_patch_callback,
+			.post_patch = post_patch_callback,
+			.pre_unpatch = pre_unpatch_callback,
+			.post_unpatch = post_unpatch_callback,
+		},
+	},	{
+		.name = "livepatch_callbacks_busymod",
+		.funcs = busymod_funcs,
+		.callbacks = {
+			.pre_patch = pre_patch_callback,
+			.post_patch = post_patch_callback,
+			.pre_unpatch = pre_unpatch_callback,
+			.post_unpatch = post_unpatch_callback,
+		},
+	}, { }
+};
+
+static struct klp_patch patch = {
+	.mod = THIS_MODULE,
+	.objs = objs,
+};
+
+static int livepatch_callbacks_demo_init(void)
+{
+	int ret;
+
+	if (!klp_have_reliable_stack() && !patch.immediate) {
+		/*
+		 * WARNING: Be very careful when using 'patch.immediate' in
+		 * your patches.  It's ok to use it for simple patches like
+		 * this, but for more complex patches which change function
+		 * semantics, locking semantics, or data structures, it may not
+		 * be safe.  Use of this option will also prevent removal of
+		 * the patch.
+		 *
+		 * See Documentation/livepatch/livepatch.txt for more details.
+		 */
+		patch.immediate = true;
+		pr_notice("The consistency model isn't supported for your architecture.  Bypassing safety mechanisms and applying the patch immediately.\n");
+	}
+
+	ret = klp_register_patch(&patch);
+	if (ret)
+		return ret;
+	ret = klp_enable_patch(&patch);
+	if (ret) {
+		WARN_ON(klp_unregister_patch(&patch));
+		return ret;
+	}
+	return 0;
+}
+
+static void livepatch_callbacks_demo_exit(void)
+{
+	WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_callbacks_demo_init);
+module_exit(livepatch_callbacks_demo_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c
new file mode 100644
index 000000000000..e610ce29ba44
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-mod.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-mod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone.  See the "Usage"
+ * section of livepatch-callbacks-demo.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+static int livepatch_callbacks_mod_init(void)
+{
+	pr_info("%s\n", __func__);
+	return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+	pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 6116c3033a761611b1da980ea664c6ddff3eaed6 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 13 Oct 2017 15:08:42 -0400
Subject: livepatch: move transition "complete" notice into
 klp_complete_transition()

klp_complete_transition() performs a bit of housework before a
transition to KLP_PATCHED or KLP_UNPATCHED is actually completed
(including post-(un)patch callbacks).  To be consistent, move the
transition "complete" kernel log notice out of
klp_try_complete_transition() and into klp_complete_transition().

Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/transition.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 7bf55b7f3687..53887f0bca10 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -136,6 +136,9 @@ done:
 			klp_post_unpatch_callback(obj);
 	}
 
+	pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
+		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
 	/*
 	 * See complementary comment in __klp_enable_patch() for why we
 	 * keep the module reference for immediate patches.
@@ -423,9 +426,6 @@ void klp_try_complete_transition(void)
 	}
 
 success:
-	pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
-		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
-
 	/* we're done, now cleanup the data structures */
 	klp_complete_transition();
 }
-- 
cgit v1.2.3


From af026796054fb70439e919a925615e61b500ef6b Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 13 Oct 2017 15:08:43 -0400
Subject: livepatch: add transition notices

Log a few kernel debug messages at the beginning of the following livepatch
transition functions:

  klp_complete_transition()
  klp_cancel_transition()
  klp_init_transition()
  klp_reverse_transition()

Also update the log notice message in klp_start_transition() for similar
verbiage as the above messages.

Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/transition.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 53887f0bca10..56add6327736 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -82,6 +82,10 @@ static void klp_complete_transition(void)
 	unsigned int cpu;
 	bool immediate_func = false;
 
+	pr_debug("'%s': completing %s transition\n",
+		 klp_transition_patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
 	if (klp_target_state == KLP_UNPATCHED) {
 		/*
 		 * All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -163,6 +167,9 @@ void klp_cancel_transition(void)
 	if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
 		return;
 
+	pr_debug("'%s': canceling patching transition, going to unpatch\n",
+		 klp_transition_patch->mod->name);
+
 	klp_target_state = KLP_UNPATCHED;
 	klp_complete_transition();
 }
@@ -441,7 +448,8 @@ void klp_start_transition(void)
 
 	WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
 
-	pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
+	pr_notice("'%s': starting %s transition\n",
+		  klp_transition_patch->mod->name,
 		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
 
 	/*
@@ -497,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	 */
 	klp_target_state = state;
 
+	pr_debug("'%s': initializing %s transition\n", patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
 	/*
 	 * If the patch can be applied or reverted immediately, skip the
 	 * per-task transitions.
@@ -562,6 +573,11 @@ void klp_reverse_transition(void)
 	unsigned int cpu;
 	struct task_struct *g, *task;
 
+	pr_debug("'%s': reversing transition from %s\n",
+		 klp_transition_patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching to unpatching" :
+						   "unpatching to patching");
+
 	klp_transition_patch->enabled = !klp_transition_patch->enabled;
 
 	klp_target_state = !klp_target_state;
-- 
cgit v1.2.3


From 9ad0457423af877ad1b76c105a57130da028ccad Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Fri, 6 Oct 2017 16:27:26 +0200
Subject: kernel/module: Delete an error message for a failed memory allocation
 in add_module_usage()

Omit an extra message for a memory allocation failure in this function.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..07ef44767245 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -837,10 +837,8 @@ static int add_module_usage(struct module *a, struct module *b)
 
 	pr_debug("Allocating new usage for %s.\n", a->name);
 	use = kmalloc(sizeof(*use), GFP_ATOMIC);
-	if (!use) {
-		pr_warn("%s: out of memory loading\n", a->name);
+	if (!use)
 		return -ENOMEM;
-	}
 
 	use->source = a;
 	use->target = b;
-- 
cgit v1.2.3


From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:22 -0700
Subject: bpf: Add file mode configuration into bpf maps

Introduce the map read/write flags to the eBPF syscalls that returns the
map fd. The flags is used to set up the file mode when construct a new
file descriptor for bpf maps. To not break the backward capability, the
f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise
it should be O_RDONLY or O_WRONLY. When the userspace want to modify or
read the map content, it will check the file mode to see if it is
allowed to make the change.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  8 +++--
 include/uapi/linux/bpf.h |  6 ++++
 kernel/bpf/arraymap.c    |  6 +++-
 kernel/bpf/devmap.c      |  5 ++-
 kernel/bpf/hashtab.c     |  5 +--
 kernel/bpf/inode.c       | 15 ++++++---
 kernel/bpf/lpm_trie.c    |  3 +-
 kernel/bpf/sockmap.c     |  5 ++-
 kernel/bpf/stackmap.c    |  5 ++-
 kernel/bpf/syscall.c     | 88 ++++++++++++++++++++++++++++++++++++++++++------
 net/netfilter/xt_bpf.c   |  2 +-
 11 files changed, 122 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d67ccdc0099f..3e5508f2fa87 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -315,11 +315,11 @@ void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-int bpf_map_new_fd(struct bpf_map *map);
+int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
 
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
-int bpf_obj_get_user(const char __user *pathname);
+int bpf_obj_get_user(const char __user *pathname, int flags);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
@@ -338,6 +338,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags);
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
+int bpf_get_file_flag(int flags);
+
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
  * Best-effort only.  No barriers here, since it _will_ race with concurrent
@@ -421,7 +423,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
-static inline int bpf_obj_get_user(const char __user *pathname)
+static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4303fb6c3817..d83f95ea6a1b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -218,6 +218,10 @@ enum bpf_attach_type {
 
 #define BPF_OBJ_NAME_LEN 16U
 
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY		(1U << 3)
+#define BPF_F_WRONLY		(1U << 4)
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -260,6 +264,7 @@ union bpf_attr {
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
 		__aligned_u64	pathname;
 		__u32		bpf_fd;
+		__u32		file_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
@@ -287,6 +292,7 @@ union bpf_attr {
 			__u32		map_id;
 		};
 		__u32		next_id;
+		__u32		open_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 68d866628be0..988c04c91e10 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
 
 #include "map_in_map.h"
 
+#define ARRAY_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
 	int i;
@@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+	    attr->value_size == 0 ||
+	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 	    (percpu && numa_node != NUMA_NO_NODE))
 		return ERR_PTR(-EINVAL);
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e093d9a2c4dd..e5d3de7cff2e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 
+#define DEV_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_dtab_netdev {
 	struct net_device *dev;
 	struct bpf_dtab *dtab;
@@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..919955236e63 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
 
-#define HTAB_CREATE_FLAG_MASK \
-	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK						\
+	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
+	 BPF_F_RDONLY | BPF_F_WRONLY)
 
 struct bucket {
 	struct hlist_nulls_head head;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index be1dde967208..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
 }
 
 static void *bpf_obj_do_get(const struct filename *pathname,
-			    enum bpf_type *type)
+			    enum bpf_type *type, int flags)
 {
 	struct inode *inode;
 	struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
 		return ERR_PTR(ret);
 
 	inode = d_backing_inode(path.dentry);
-	ret = inode_permission(inode, MAY_WRITE);
+	ret = inode_permission(inode, ACC_MODE(flags));
 	if (ret)
 		goto out;
 
@@ -326,18 +326,23 @@ out:
 	return ERR_PTR(ret);
 }
 
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	enum bpf_type type = BPF_TYPE_UNSPEC;
 	struct filename *pname;
 	int ret = -ENOENT;
+	int f_flags;
 	void *raw;
 
+	f_flags = bpf_get_file_flag(flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	pname = getname(pathname);
 	if (IS_ERR(pname))
 		return PTR_ERR(pname);
 
-	raw = bpf_obj_do_get(pname, &type);
+	raw = bpf_obj_do_get(pname, &type, f_flags);
 	if (IS_ERR(raw)) {
 		ret = PTR_ERR(raw);
 		goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
 	if (type == BPF_TYPE_PROG)
 		ret = bpf_prog_new_fd(raw);
 	else if (type == BPF_TYPE_MAP)
-		ret = bpf_map_new_fd(raw);
+		ret = bpf_map_new_fd(raw, f_flags);
 	else
 		goto out;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 34d8a690ea05..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -495,7 +495,8 @@ out:
 #define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
 
-#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\
+				 BPF_F_RDONLY | BPF_F_WRONLY)
 
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a298d6666698..86ec846f2d5e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -40,6 +40,9 @@
 #include <linux/list.h>
 #include <net/strparser.h>
 
+#define SOCK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_stab {
 	struct bpf_map map;
 	struct sock **sock_map;
@@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
 #include <linux/perf_event.h>
 #include "percpu_freelist.h"
 
+#define STACK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags & ~BPF_F_NUMA_NODE)
+	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0e893cac6795..676a06e6b322 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -34,6 +34,8 @@
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
 
+#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -294,17 +296,48 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+			      loff_t *ppos)
+{
+	/* We need this handler such that alloc_file() enables
+	 * f_mode with FMODE_CAN_READ.
+	 */
+	return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+			       size_t siz, loff_t *ppos)
+{
+	/* We need this handler such that alloc_file() enables
+	 * f_mode with FMODE_CAN_WRITE.
+	 */
+	return -EINVAL;
+}
+
 static const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_map_show_fdinfo,
 #endif
 	.release	= bpf_map_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
 };
 
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
 {
 	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
-				O_RDWR | O_CLOEXEC);
+				flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+		return -EINVAL;
+	if (flags & BPF_F_RDONLY)
+		return O_RDONLY;
+	if (flags & BPF_F_WRONLY)
+		return O_WRONLY;
+	return O_RDWR;
 }
 
 /* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -344,12 +377,17 @@ static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_map *map;
+	int f_flags;
 	int err;
 
 	err = CHECK_ATTR(BPF_MAP_CREATE);
 	if (err)
 		return -EINVAL;
 
+	f_flags = bpf_get_file_flag(attr->map_flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	if (numa_node != NUMA_NO_NODE &&
 	    ((unsigned int)numa_node >= nr_node_ids ||
 	     !node_online(numa_node)))
@@ -375,7 +413,7 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map;
 
-	err = bpf_map_new_fd(map);
+	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.
 		 * bpf_map_put() is needed because the above
@@ -490,6 +528,11 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -570,6 +613,11 @@ static int map_update_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -659,6 +707,11 @@ static int map_delete_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -702,6 +755,11 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	if (ukey) {
 		key = memdup_user(ukey, map->key_size);
 		if (IS_ERR(key)) {
@@ -908,6 +966,8 @@ static const struct file_operations bpf_prog_fops = {
 	.show_fdinfo	= bpf_prog_show_fdinfo,
 #endif
 	.release	= bpf_prog_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
 };
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
@@ -1117,11 +1177,11 @@ free_prog_nouncharge:
 	return err;
 }
 
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
 
 static int bpf_obj_pin(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ))
+	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
 		return -EINVAL;
 
 	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1129,10 +1189,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
 
 static int bpf_obj_get(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+	    attr->file_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
-	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+				attr->file_flags);
 }
 
 #ifdef CONFIG_CGROUP_BPF
@@ -1392,20 +1454,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
 
 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 {
 	struct bpf_map *map;
 	u32 id = attr->map_id;
+	int f_flags;
 	int fd;
 
-	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	f_flags = bpf_get_file_flag(attr->open_flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	spin_lock_bh(&map_idr_lock);
 	map = idr_find(&map_idr, id);
 	if (map)
@@ -1417,7 +1485,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	fd = bpf_map_new_fd(map);
+	fd = bpf_map_new_fd(map, f_flags);
 	if (fd < 0)
 		bpf_map_put(map);
 
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 29123934887b..041da0d9c06f 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -56,7 +56,7 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
 	int retval, fd;
 
 	set_fs(KERNEL_DS);
-	fd = bpf_obj_get_user(path);
+	fd = bpf_obj_get_user(path, 0);
 	set_fs(oldfs);
 	if (fd < 0)
 		return fd;
-- 
cgit v1.2.3


From afdb09c720b62b8090584c11151d856df330e57d Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:24 -0700
Subject: security: bpf: Add LSM hooks for bpf object related syscall

Introduce several LSM hooks for the syscalls that will allow the
userspace to access to eBPF object such as eBPF programs and eBPF maps.
The security check is aimed to enforce a per object security protection
for eBPF object so only processes with the right priviliges can
read/write to a specific map or use a specific eBPF program. Besides
that, a general security hook is added before the multiplexer of bpf
syscall to check the cmd and the attribute used for the command. The
actual security module can decide which command need to be checked and
how the cmd should be checked.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h       |  6 ++++++
 include/linux/lsm_hooks.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/security.h  | 45 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c      | 34 +++++++++++++++++++++++++++--
 security/security.c       | 32 ++++++++++++++++++++++++++++
 5 files changed, 169 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3e5508f2fa87..84c192da3e0b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -57,6 +57,9 @@ struct bpf_map {
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
 	char name[BPF_OBJ_NAME_LEN];
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 };
 
 /* function argument constraints */
@@ -193,6 +196,9 @@ struct bpf_prog_aux {
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
 	char name[BPF_OBJ_NAME_LEN];
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index c9258124e417..7161d8e7ee79 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1351,6 +1351,40 @@
  *	@inode we wish to get the security context of.
  *	@ctx is a pointer in which to place the allocated security context.
  *	@ctxlen points to the place to put the length of @ctx.
+ *
+ * Security hooks for using the eBPF maps and programs functionalities through
+ * eBPF syscalls.
+ *
+ * @bpf:
+ *	Do a initial check for all bpf syscalls after the attribute is copied
+ *	into the kernel. The actual security module can implement their own
+ *	rules to check the specific cmd they need.
+ *
+ * @bpf_map:
+ *	Do a check when the kernel generate and return a file descriptor for
+ *	eBPF maps.
+ *
+ *	@map: bpf map that we want to access
+ *	@mask: the access flags
+ *
+ * @bpf_prog:
+ *	Do a check when the kernel generate and return a file descriptor for
+ *	eBPF programs.
+ *
+ *	@prog: bpf prog that userspace want to use.
+ *
+ * @bpf_map_alloc_security:
+ *	Initialize the security field inside bpf map.
+ *
+ * @bpf_map_free_security:
+ *	Clean up the security information stored inside bpf map.
+ *
+ * @bpf_prog_alloc_security:
+ *	Initialize the security field inside bpf program.
+ *
+ * @bpf_prog_free_security:
+ *	Clean up the security information stored inside bpf prog.
+ *
  */
 union security_list_options {
 	int (*binder_set_context_mgr)(struct task_struct *mgr);
@@ -1682,6 +1716,17 @@ union security_list_options {
 				struct audit_context *actx);
 	void (*audit_rule_free)(void *lsmrule);
 #endif /* CONFIG_AUDIT */
+
+#ifdef CONFIG_BPF_SYSCALL
+	int (*bpf)(int cmd, union bpf_attr *attr,
+				 unsigned int size);
+	int (*bpf_map)(struct bpf_map *map, fmode_t fmode);
+	int (*bpf_prog)(struct bpf_prog *prog);
+	int (*bpf_map_alloc_security)(struct bpf_map *map);
+	void (*bpf_map_free_security)(struct bpf_map *map);
+	int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux);
+	void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
+#endif /* CONFIG_BPF_SYSCALL */
 };
 
 struct security_hook_heads {
@@ -1901,6 +1946,15 @@ struct security_hook_heads {
 	struct list_head audit_rule_match;
 	struct list_head audit_rule_free;
 #endif /* CONFIG_AUDIT */
+#ifdef CONFIG_BPF_SYSCALL
+	struct list_head bpf;
+	struct list_head bpf_map;
+	struct list_head bpf_prog;
+	struct list_head bpf_map_alloc_security;
+	struct list_head bpf_map_free_security;
+	struct list_head bpf_prog_alloc_security;
+	struct list_head bpf_prog_free_security;
+#endif /* CONFIG_BPF_SYSCALL */
 } __randomize_layout;
 
 /*
diff --git a/include/linux/security.h b/include/linux/security.h
index ce6265960d6c..18800b0911e5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -31,6 +31,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -1730,6 +1731,50 @@ static inline void securityfs_remove(struct dentry *dentry)
 
 #endif
 
+#ifdef CONFIG_BPF_SYSCALL
+#ifdef CONFIG_SECURITY
+extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
+extern int security_bpf_prog(struct bpf_prog *prog);
+extern int security_bpf_map_alloc(struct bpf_map *map);
+extern void security_bpf_map_free(struct bpf_map *map);
+extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
+extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+#else
+static inline int security_bpf(int cmd, union bpf_attr *attr,
+					     unsigned int size)
+{
+	return 0;
+}
+
+static inline int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+	return 0;
+}
+
+static inline int security_bpf_prog(struct bpf_prog *prog)
+{
+	return 0;
+}
+
+static inline int security_bpf_map_alloc(struct bpf_map *map)
+{
+	return 0;
+}
+
+static inline void security_bpf_map_free(struct bpf_map *map)
+{ }
+
+static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+	return 0;
+}
+
+static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{ }
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_BPF_SYSCALL */
+
 #ifdef CONFIG_SECURITY
 
 static inline char *alloc_secdata(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 676a06e6b322..5cb56d06b48d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -212,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
 	bpf_map_uncharge_memlock(map);
+	security_bpf_map_free(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -325,6 +326,12 @@ static const struct file_operations bpf_map_fops = {
 
 int bpf_map_new_fd(struct bpf_map *map, int flags)
 {
+	int ret;
+
+	ret = security_bpf_map(map, OPEN_FMODE(flags));
+	if (ret < 0)
+		return ret;
+
 	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
 				flags | O_CLOEXEC);
 }
@@ -405,10 +412,14 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
-	err = bpf_map_charge_memlock(map);
+	err = security_bpf_map_alloc(map);
 	if (err)
 		goto free_map_nouncharge;
 
+	err = bpf_map_charge_memlock(map);
+	if (err)
+		goto free_map_sec;
+
 	err = bpf_map_alloc_id(map);
 	if (err)
 		goto free_map;
@@ -430,6 +441,8 @@ static int map_create(union bpf_attr *attr)
 
 free_map:
 	bpf_map_uncharge_memlock(map);
+free_map_sec:
+	security_bpf_map_free(map);
 free_map_nouncharge:
 	map->ops->map_free(map);
 	return err;
@@ -914,6 +927,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 	free_used_maps(aux);
 	bpf_prog_uncharge_memlock(aux->prog);
+	security_bpf_prog_free(aux);
 	bpf_prog_free(aux->prog);
 }
 
@@ -972,6 +986,12 @@ static const struct file_operations bpf_prog_fops = {
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
 {
+	int ret;
+
+	ret = security_bpf_prog(prog);
+	if (ret < 0)
+		return ret;
+
 	return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
 				O_RDWR | O_CLOEXEC);
 }
@@ -1111,10 +1131,14 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (!prog)
 		return -ENOMEM;
 
-	err = bpf_prog_charge_memlock(prog);
+	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
 		goto free_prog_nouncharge;
 
+	err = bpf_prog_charge_memlock(prog);
+	if (err)
+		goto free_prog_sec;
+
 	prog->len = attr->insn_cnt;
 
 	err = -EFAULT;
@@ -1172,6 +1196,8 @@ free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
 	bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+	security_bpf_prog_free(prog->aux);
 free_prog_nouncharge:
 	bpf_prog_free(prog);
 	return err;
@@ -1640,6 +1666,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	if (copy_from_user(&attr, uattr, size) != 0)
 		return -EFAULT;
 
+	err = security_bpf(cmd, &attr, size);
+	if (err < 0)
+		return err;
+
 	switch (cmd) {
 	case BPF_MAP_CREATE:
 		err = map_create(&attr);
diff --git a/security/security.c b/security/security.c
index 4bf0f571b4ef..1cd8526cb0b7 100644
--- a/security/security.c
+++ b/security/security.c
@@ -12,6 +12,7 @@
  *	(at your option) any later version.
  */
 
+#include <linux/bpf.h>
 #include <linux/capability.h>
 #include <linux/dcache.h>
 #include <linux/module.h>
@@ -1703,3 +1704,34 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule,
 				actx);
 }
 #endif /* CONFIG_AUDIT */
+
+#ifdef CONFIG_BPF_SYSCALL
+int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+	return call_int_hook(bpf, 0, cmd, attr, size);
+}
+int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+	return call_int_hook(bpf_map, 0, map, fmode);
+}
+int security_bpf_prog(struct bpf_prog *prog)
+{
+	return call_int_hook(bpf_prog, 0, prog);
+}
+int security_bpf_map_alloc(struct bpf_map *map)
+{
+	return call_int_hook(bpf_map_alloc_security, 0, map);
+}
+int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+	return call_int_hook(bpf_prog_alloc_security, 0, aux);
+}
+void security_bpf_map_free(struct bpf_map *map)
+{
+	call_void_hook(bpf_map_free_security, map);
+}
+void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{
+	call_void_hook(bpf_prog_free_security, aux);
+}
+#endif /* CONFIG_BPF_SYSCALL */
-- 
cgit v1.2.3


From f66e448cfda021b0bcd884f26709796fe19c7cc1 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:26 -0700
Subject: selinux: bpf: Add addtional check for bpf object file receive

Introduce a bpf object related check when sending and receiving files
through unix domain socket as well as binder. It checks if the receiving
process have privilege to read/write the bpf map or use the bpf program.
This check is necessary because the bpf maps and programs are using a
anonymous inode as their shared inode so the normal way of checking the
files and sockets when passing between processes cannot work properly on
eBPF object. This check only works when the BPF_SYSCALL is configured.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Reviewed-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  3 +++
 kernel/bpf/syscall.c     |  4 ++--
 security/selinux/hooks.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 84c192da3e0b..1e334b248ff6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -288,6 +288,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
+extern const struct file_operations bpf_map_fops;
+extern const struct file_operations bpf_prog_fops;
+
 #define BPF_PROG_TYPE(_id, _name) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
 	extern const struct bpf_verifier_ops _name ## _verifier_ops;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5cb56d06b48d..323be2473c4b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -315,7 +315,7 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
 	return -EINVAL;
 }
 
-static const struct file_operations bpf_map_fops = {
+const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_map_show_fdinfo,
 #endif
@@ -975,7 +975,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_prog_show_fdinfo,
 #endif
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 12cf7de8cbed..2e3a627fc0b1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1815,6 +1815,10 @@ static inline int file_path_has_perm(const struct cred *cred,
 	return inode_has_perm(cred, file_inode(file), av, &ad);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_fd_pass(struct file *file, u32 sid);
+#endif
+
 /* Check whether a task can use an open file descriptor to
    access an inode in a given way.  Check access to the
    descriptor itself, and then use dentry_has_perm to
@@ -1845,6 +1849,12 @@ static int file_has_perm(const struct cred *cred,
 			goto out;
 	}
 
+#ifdef CONFIG_BPF_SYSCALL
+	rc = bpf_fd_pass(file, cred_sid(cred));
+	if (rc)
+		return rc;
+#endif
+
 	/* av is zero if only checking access to the descriptor. */
 	rc = 0;
 	if (av)
@@ -2165,6 +2175,12 @@ static int selinux_binder_transfer_file(struct task_struct *from,
 			return rc;
 	}
 
+#ifdef CONFIG_BPF_SYSCALL
+	rc = bpf_fd_pass(file, sid);
+	if (rc)
+		return rc;
+#endif
+
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
 		return 0;
 
@@ -6288,6 +6304,39 @@ static u32 bpf_map_fmode_to_av(fmode_t fmode)
 	return av;
 }
 
+/* This function will check the file pass through unix socket or binder to see
+ * if it is a bpf related object. And apply correspinding checks on the bpf
+ * object based on the type. The bpf maps and programs, not like other files and
+ * socket, are using a shared anonymous inode inside the kernel as their inode.
+ * So checking that inode cannot identify if the process have privilege to
+ * access the bpf object and that's why we have to add this additional check in
+ * selinux_file_receive and selinux_binder_transfer_files.
+ */
+static int bpf_fd_pass(struct file *file, u32 sid)
+{
+	struct bpf_security_struct *bpfsec;
+	struct bpf_prog *prog;
+	struct bpf_map *map;
+	int ret;
+
+	if (file->f_op == &bpf_map_fops) {
+		map = file->private_data;
+		bpfsec = map->security;
+		ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+				   bpf_map_fmode_to_av(file->f_mode), NULL);
+		if (ret)
+			return ret;
+	} else if (file->f_op == &bpf_prog_fops) {
+		prog = file->private_data;
+		bpfsec = prog->aux->security;
+		ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+				   BPF__PROG_RUN, NULL);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
 {
 	u32 sid = current_sid();
-- 
cgit v1.2.3


From b5149873a0c299195b5346fe4dc2c5b04ae2f995 Mon Sep 17 00:00:00 2001
From: Tal Shorer <tal.shorer@gmail.com>
Date: Sat, 21 Oct 2017 19:29:24 +0300
Subject: workqueue: respect isolated cpus when queueing an unbound work

Initialize wq_unbound_cpumask to exclude cpus that were isolated by
the cmdline's isolcpus parameter.

Signed-off-by: Tal Shorer <tal.shorer@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..bfa433b38a61 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4980,6 +4980,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 	if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
 		return -ENOMEM;
 
+	/*
+	 * Not excluding isolated cpus on purpose.
+	 * If the user wishes to include them, we allow that.
+	 */
 	cpumask_and(cpumask, cpumask, cpu_possible_mask);
 	if (!cpumask_empty(cpumask)) {
 		apply_wqattrs_lock();
@@ -5579,7 +5583,7 @@ int __init workqueue_init_early(void)
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+	cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.2.3


From 31749468c3f9d77927ed3144259dc208e6625ede Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 23 Oct 2017 19:39:28 +0200
Subject: bpf: cpumap fix potential lost wake-up problem

As pointed out by Michael, commit 1c601d829ab0 ("bpf: cpumap xdp_buff
to skb conversion and allocation") contains a classical example of the
potential lost wake-up problem.

We need to recheck the condition __ptr_ring_empty() after changing
current->state to TASK_INTERRUPTIBLE, this avoids a race between
wake_up_process() and schedule(). After this, a race with
wake_up_process() will simply change the state to TASK_RUNNING, and
the schedule() call not really put us to sleep.

Fixes: 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation")
Reported-by: "Michael S. Tsirkin" <mst@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/cpumap.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b4358d84ddf1..86e29cbf7827 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -288,13 +288,17 @@ static int cpu_map_kthread_run(void *data)
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
-			sched = 1;
+			set_current_state(TASK_INTERRUPTIBLE);
+			/* Recheck to avoid lost wake-up */
+			if (__ptr_ring_empty(rcpu->queue)) {
+				schedule();
+				sched = 1;
+			} else {
+				__set_current_state(TASK_RUNNING);
+			}
 		} else {
 			sched = cond_resched();
 		}
-		__set_current_state(TASK_RUNNING);
 
 		/* Process packets in rcpu->queue */
 		local_bh_disable();
-- 
cgit v1.2.3


From 0b4c6841fee03e096b735074a0c4aab3a8e92986 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 23 Oct 2017 23:53:07 -0700
Subject: bpf: use the same condition in perf event set/free bpf handler

This is a cleanup such that doing the same check in
perf_event_free_bpf_prog as we already do in
perf_event_set_bpf_prog step.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/events/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 31ee304a5844..9f78a6825bbe 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8191,10 +8191,10 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 {
 	struct bpf_prog *prog;
 
-	perf_event_free_bpf_handler(event);
-
-	if (!event->tp_event)
+	if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+		perf_event_free_bpf_handler(event);
 		return;
+	}
 
 	prog = event->tp_event->prog;
 	if (prog && event->tp_event->bpf_prog_owner == event) {
-- 
cgit v1.2.3


From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 23 Oct 2017 23:53:08 -0700
Subject: bpf: permit multiple bpf attachments for a single perf event

This patch enables multiple bpf attachments for a
kprobe/uprobe/tracepoint single trace event.
Each trace_event keeps a list of attached perf events.
When an event happens, all attached bpf programs will
be executed based on the order of attachment.

A global bpf_event_mutex lock is introduced to protect
prog_array attaching and detaching. An alternative will
be introduce a mutex lock in every trace_event_call
structure, but it takes a lot of extra memory.
So a global bpf_event_mutex lock is a good compromise.

The bpf prog detachment involves allocation of memory.
If the allocation fails, a dummy do-nothing program
will replace to-be-detached program in-place.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h           | 30 +++++++++++++---
 include/linux/trace_events.h  | 43 ++++++++++++++++++++---
 include/trace/perf.h          |  6 ++--
 kernel/bpf/core.c             | 81 ++++++++++++++++++++++++++++++++++++++++++
 kernel/events/core.c          | 26 +++++---------
 kernel/trace/bpf_trace.c      | 82 ++++++++++++++++++++++++++++++++++++++++---
 kernel/trace/trace_kprobe.c   |  6 ++--
 kernel/trace/trace_syscalls.c | 34 ++++++++++--------
 kernel/trace/trace_uprobe.c   |  3 +-
 9 files changed, 255 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1e334b248ff6..172be7faf7ba 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -273,18 +273,38 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 				__u32 __user *prog_ids, u32 cnt);
 
-#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+				struct bpf_prog *old_prog);
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+			struct bpf_prog *exclude_prog,
+			struct bpf_prog *include_prog,
+			struct bpf_prog_array **new_array);
+
+#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)	\
 	({						\
-		struct bpf_prog **_prog;		\
+		struct bpf_prog **_prog, *__prog;	\
+		struct bpf_prog_array *_array;		\
 		u32 _ret = 1;				\
 		rcu_read_lock();			\
-		_prog = rcu_dereference(array)->progs;	\
-		for (; *_prog; _prog++)			\
-			_ret &= func(*_prog, ctx);	\
+		_array = rcu_dereference(array);	\
+		if (unlikely(check_non_null && !_array))\
+			goto _out;			\
+		_prog = _array->progs;			\
+		while ((__prog = READ_ONCE(*_prog))) {	\
+			_ret &= func(__prog, ctx);	\
+			_prog++;			\
+		}					\
+_out:							\
 		rcu_read_unlock();			\
 		_ret;					\
 	 })
 
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+
+#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)	\
+	__BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2e0f22298fe9..fc6aeca945db 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -271,14 +271,37 @@ struct trace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
-	struct bpf_prog			*prog;
-	struct perf_event		*bpf_prog_owner;
+	struct bpf_prog_array __rcu	*prog_array;
 
 	int	(*perf_perm)(struct trace_event_call *,
 			     struct perf_event *);
 #endif
 };
 
+#ifdef CONFIG_PERF_EVENTS
+static inline bool bpf_prog_array_valid(struct trace_event_call *call)
+{
+	/*
+	 * This inline function checks whether call->prog_array
+	 * is valid or not. The function is called in various places,
+	 * outside rcu_read_lock/unlock, as a heuristic to speed up execution.
+	 *
+	 * If this function returns true, and later call->prog_array
+	 * becomes false inside rcu_read_lock/unlock region,
+	 * we bail out then. If this function return false,
+	 * there is a risk that we might miss a few events if the checking
+	 * were delayed until inside rcu_read_lock/unlock region and
+	 * call->prog_array happened to become non-NULL then.
+	 *
+	 * Here, READ_ONCE() is used instead of rcu_access_pointer().
+	 * rcu_access_pointer() requires the actual definition of
+	 * "struct bpf_prog_array" while READ_ONCE() only needs
+	 * a declaration of the same type.
+	 */
+	return !!READ_ONCE(call->prog_array);
+}
+#endif
+
 static inline const char *
 trace_event_name(struct trace_event_call *call)
 {
@@ -435,12 +458,23 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 }
 
 #ifdef CONFIG_BPF_EVENTS
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+void perf_event_detach_bpf_prog(struct perf_event *event);
 #else
-static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
 	return 1;
 }
+
+static inline int
+perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void perf_event_detach_bpf_prog(struct perf_event *event) { }
+
 #endif
 
 enum {
@@ -511,6 +545,7 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 {
 	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event);
 }
+
 #endif
 
 #endif /* _LINUX_TRACE_EVENT_H */
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 04fe68bbe767..14f127b6acf5 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -34,7 +34,6 @@ perf_trace_##call(void *__data, proto)					\
 	struct trace_event_call *event_call = __data;			\
 	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
 	struct trace_event_raw_##call *entry;				\
-	struct bpf_prog *prog = event_call->prog;			\
 	struct pt_regs *__regs;						\
 	u64 __count = 1;						\
 	struct task_struct *__task = NULL;				\
@@ -46,8 +45,9 @@ perf_trace_##call(void *__data, proto)					\
 	__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
-	if (!prog && __builtin_constant_p(!__task) && !__task &&	\
-				hlist_empty(head))			\
+	if (!bpf_prog_array_valid(event_call) &&			\
+	    __builtin_constant_p(!__task) && !__task &&			\
+	    hlist_empty(head))						\
 		return;							\
 									\
 	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8e7c8bf2b687..7fe448799d76 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1394,6 +1394,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+static unsigned int __bpf_prog_ret1(const void *ctx,
+				    const struct bpf_insn *insn)
+{
+	return 1;
+}
+
+static struct bpf_prog_dummy {
+	struct bpf_prog prog;
+} dummy_bpf_prog = {
+	.prog = {
+		.bpf_func = __bpf_prog_ret1,
+	},
+};
+
 /* to avoid allocating empty bpf_prog_array for cgroups that
  * don't have bpf program attached use one global 'empty_prog_array'
  * It will not be modified the caller of bpf_prog_array_alloc()
@@ -1463,6 +1477,73 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 	return 0;
 }
 
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+				struct bpf_prog *old_prog)
+{
+	struct bpf_prog **prog = progs->progs;
+
+	for (; *prog; prog++)
+		if (*prog == old_prog) {
+			WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+			break;
+		}
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+			struct bpf_prog *exclude_prog,
+			struct bpf_prog *include_prog,
+			struct bpf_prog_array **new_array)
+{
+	int new_prog_cnt, carry_prog_cnt = 0;
+	struct bpf_prog **existing_prog;
+	struct bpf_prog_array *array;
+	int new_prog_idx = 0;
+
+	/* Figure out how many existing progs we need to carry over to
+	 * the new array.
+	 */
+	if (old_array) {
+		existing_prog = old_array->progs;
+		for (; *existing_prog; existing_prog++) {
+			if (*existing_prog != exclude_prog &&
+			    *existing_prog != &dummy_bpf_prog.prog)
+				carry_prog_cnt++;
+			if (*existing_prog == include_prog)
+				return -EEXIST;
+		}
+	}
+
+	/* How many progs (not NULL) will be in the new array? */
+	new_prog_cnt = carry_prog_cnt;
+	if (include_prog)
+		new_prog_cnt += 1;
+
+	/* Do we have any prog (not NULL) in the new array? */
+	if (!new_prog_cnt) {
+		*new_array = NULL;
+		return 0;
+	}
+
+	/* +1 as the end of prog_array is marked with NULL */
+	array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+	if (!array)
+		return -ENOMEM;
+
+	/* Fill in the new prog array */
+	if (carry_prog_cnt) {
+		existing_prog = old_array->progs;
+		for (; *existing_prog; existing_prog++)
+			if (*existing_prog != exclude_prog &&
+			    *existing_prog != &dummy_bpf_prog.prog)
+				array->progs[new_prog_idx++] = *existing_prog;
+	}
+	if (include_prog)
+		array->progs[new_prog_idx++] = include_prog;
+	array->progs[new_prog_idx] = NULL;
+	*new_array = array;
+	return 0;
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9f78a6825bbe..9660ee65fbef 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7954,11 +7954,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 			       struct pt_regs *regs, struct hlist_head *head,
 			       struct task_struct *task)
 {
-	struct bpf_prog *prog = call->prog;
-
-	if (prog) {
+	if (bpf_prog_array_valid(call)) {
 		*(struct pt_regs **)raw_data = regs;
-		if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+		if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
 			perf_swevent_put_recursion_context(rctx);
 			return;
 		}
@@ -8147,13 +8145,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
 	bool is_kprobe, is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
+	int ret;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return perf_event_set_bpf_handler(event, prog_fd);
 
-	if (event->tp_event->prog)
-		return -EEXIST;
-
 	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 	is_syscall_tp = is_syscall_trace_event(event->tp_event);
@@ -8181,26 +8177,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 			return -EACCES;
 		}
 	}
-	event->tp_event->prog = prog;
-	event->tp_event->bpf_prog_owner = event;
 
-	return 0;
+	ret = perf_event_attach_bpf_prog(event, prog);
+	if (ret)
+		bpf_prog_put(prog);
+	return ret;
 }
 
 static void perf_event_free_bpf_prog(struct perf_event *event)
 {
-	struct bpf_prog *prog;
-
 	if (event->attr.type != PERF_TYPE_TRACEPOINT) {
 		perf_event_free_bpf_handler(event);
 		return;
 	}
-
-	prog = event->tp_event->prog;
-	if (prog && event->tp_event->bpf_prog_owner == event) {
-		event->tp_event->prog = NULL;
-		bpf_prog_put(prog);
-	}
+	perf_event_detach_bpf_prog(event);
 }
 
 #else
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3126da2f468a..b65011d320e3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,7 +17,7 @@
 
 /**
  * trace_call_bpf - invoke BPF program
- * @prog: BPF program
+ * @call: tracepoint event
  * @ctx: opaque context pointer
  *
  * kprobe handlers execute BPF programs via this helper.
@@ -29,7 +29,7 @@
  * 1 - store kprobe event into ring buffer
  * Other values are reserved and currently alias to 1
  */
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
 	unsigned int ret;
 
@@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
 		goto out;
 	}
 
-	rcu_read_lock();
-	ret = BPF_PROG_RUN(prog, ctx);
-	rcu_read_unlock();
+	/*
+	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+	 * to all call sites, we did a bpf_prog_array_valid() there to check
+	 * whether call->prog_array is empty or not, which is
+	 * a heurisitc to speed up execution.
+	 *
+	 * If bpf_prog_array_valid() fetched prog_array was
+	 * non-NULL, we go into trace_call_bpf() and do the actual
+	 * proper rcu_dereference() under RCU lock.
+	 * If it turns out that prog_array is NULL then, we bail out.
+	 * For the opposite, if the bpf_prog_array_valid() fetched pointer
+	 * was NULL, you'll skip the prog_array with the risk of missing
+	 * out of events when it was updated in between this and the
+	 * rcu_dereference() which is accepted risk.
+	 */
+	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
 
  out:
 	__this_cpu_dec(bpf_prog_active);
@@ -741,3 +754,62 @@ const struct bpf_verifier_ops perf_event_verifier_ops = {
 
 const struct bpf_prog_ops perf_event_prog_ops = {
 };
+
+static DEFINE_MUTEX(bpf_event_mutex);
+
+int perf_event_attach_bpf_prog(struct perf_event *event,
+			       struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	int ret = -EEXIST;
+
+	mutex_lock(&bpf_event_mutex);
+
+	if (event->prog)
+		goto out;
+
+	old_array = rcu_dereference_protected(event->tp_event->prog_array,
+					      lockdep_is_held(&bpf_event_mutex));
+	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+	if (ret < 0)
+		goto out;
+
+	/* set the new array to event->tp_event and set event->prog */
+	event->prog = prog;
+	rcu_assign_pointer(event->tp_event->prog_array, new_array);
+	bpf_prog_array_free(old_array);
+
+out:
+	mutex_unlock(&bpf_event_mutex);
+	return ret;
+}
+
+void perf_event_detach_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	int ret;
+
+	mutex_lock(&bpf_event_mutex);
+
+	if (!event->prog)
+		goto out;
+
+	old_array = rcu_dereference_protected(event->tp_event->prog_array,
+					      lockdep_is_held(&bpf_event_mutex));
+
+	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+	if (ret < 0) {
+		bpf_prog_array_delete_safe(old_array, event->prog);
+	} else {
+		rcu_assign_pointer(event->tp_event->prog_array, new_array);
+		bpf_prog_array_free(old_array);
+	}
+
+	bpf_prog_put(event->prog);
+	event->prog = NULL;
+
+out:
+	mutex_unlock(&bpf_event_mutex);
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e12b6b9..abf92e478cfb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1174,13 +1174,12 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
-	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	head = this_cpu_ptr(call->perf_events);
@@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
-	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	head = this_cpu_ptr(call->perf_events);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 696afe72d3b1..71a6af34d7a9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -559,9 +559,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
-static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
-			      struct syscall_metadata *sys_data,
-			      struct syscall_trace_enter *rec) {
+static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+			       struct syscall_metadata *sys_data,
+			       struct syscall_trace_enter *rec)
+{
 	struct syscall_tp_t {
 		unsigned long long regs;
 		unsigned long syscall_nr;
@@ -573,7 +574,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
 	param.syscall_nr = rec->nr;
 	for (i = 0; i < sys_data->nb_args; i++)
 		param.args[i] = rec->args[i];
-	return trace_call_bpf(prog, &param);
+	return trace_call_bpf(call, &param);
 }
 
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
@@ -581,7 +582,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
 	struct hlist_head *head;
-	struct bpf_prog *prog;
+	bool valid_prog_array;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -596,9 +597,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
-	prog = READ_ONCE(sys_data->enter_event->prog);
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	if (!prog && hlist_empty(head))
+	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+	if (!valid_prog_array && hlist_empty(head))
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
@@ -614,7 +615,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
 
-	if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+	if ((valid_prog_array &&
+	     !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
 		return;
@@ -659,8 +661,9 @@ static void perf_sysenter_disable(struct trace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
-			      struct syscall_trace_exit *rec) {
+static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
+			      struct syscall_trace_exit *rec)
+{
 	struct syscall_tp_t {
 		unsigned long long regs;
 		unsigned long syscall_nr;
@@ -670,7 +673,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
 	*(struct pt_regs **)&param = regs;
 	param.syscall_nr = rec->nr;
 	param.ret = rec->ret;
-	return trace_call_bpf(prog, &param);
+	return trace_call_bpf(call, &param);
 }
 
 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
@@ -678,7 +681,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
 	struct hlist_head *head;
-	struct bpf_prog *prog;
+	bool valid_prog_array;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -693,9 +696,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	prog = READ_ONCE(sys_data->exit_event->prog);
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	if (!prog && hlist_empty(head))
+	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+	if (!valid_prog_array && hlist_empty(head))
 		return;
 
 	/* We can probably do that at build time */
@@ -709,7 +712,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+	if ((valid_prog_array &&
+	     !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
 		return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4525e0271a53..153c0e411461 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 {
 	struct trace_event_call *call = &tu->tp.call;
 	struct uprobe_trace_entry_head *entry;
-	struct bpf_prog *prog = call->prog;
 	struct hlist_head *head;
 	void *data;
 	int size, esize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-- 
cgit v1.2.3


From 5aaf1ab55389aeb6ce5527580a1a4d4dbc0f41ff Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 Oct 2017 16:56:50 +0200
Subject: livepatch: Correctly call klp_post_unpatch_callback() in error paths

The post_unpatch_enabled flag in struct klp_callbacks is set when a
pre-patch callback successfully executes, indicating that we need to
call a corresponding post-unpatch callback when the patch is reverted.
This is true for ordinary patch disable as well as the error paths of
klp_patch_object() callers.

As currently coded, we inadvertently execute the post-patch callback
twice in klp_module_coming() when klp_patch_object() fails:

  - We explicitly call klp_post_unpatch_callback() for the failed object
  - We call it again for the same object (and all the others) via
    klp_cleanup_module_patches_limited()

We should clear the flag in klp_post_unpatch_callback() to make
sure that the callback is not called twice. It makes the API
more safe.

(We could have removed the callback from the former error path as it
would be covered by the latter call, but I think that is is cleaner to
clear the post_unpatch_enabled after its invoked. For example, someone
might later decide to call the callback only when obj->patched flag is
set.)

There is another mistake in the error path of klp_coming_module() in
which it skips the post-unpatch callback for the klp_transition_patch.
However, the pre-patch callback was called even for this patch, so be
sure to make the corresponding callbacks for all patches.

Finally, I used this opportunity to make klp_pre_patch_callback() more
readable.

[jkosina@suse.cz: incorporate changelog wording changes proposed by Joe Lawrence]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/core.c | 4 +---
 kernel/livepatch/core.h | 8 +++++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index cafb5a84417d..eb134479c394 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -894,9 +894,7 @@ int klp_module_coming(struct module *mod)
 				pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
 					patch->mod->name, obj->mod->name, ret);
 
-				if (patch != klp_transition_patch)
-					klp_post_unpatch_callback(obj);
-
+				klp_post_unpatch_callback(obj);
 				goto err;
 			}
 
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index 6fc907b54e71..cc3aa708e0b4 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -12,10 +12,10 @@ static inline bool klp_is_object_loaded(struct klp_object *obj)
 
 static inline int klp_pre_patch_callback(struct klp_object *obj)
 {
-	int ret;
+	int ret = 0;
 
-	ret = (obj->callbacks.pre_patch) ?
-		(*obj->callbacks.pre_patch)(obj) : 0;
+	if (obj->callbacks.pre_patch)
+		ret = (*obj->callbacks.pre_patch)(obj);
 
 	obj->callbacks.post_unpatch_enabled = !ret;
 
@@ -39,6 +39,8 @@ static inline void klp_post_unpatch_callback(struct klp_object *obj)
 	if (obj->callbacks.post_unpatch_enabled &&
 	    obj->callbacks.post_unpatch)
 		(*obj->callbacks.post_unpatch)(obj);
+
+	obj->callbacks.post_unpatch_enabled = false;
 }
 
 #endif /* _LIVEPATCH_CORE_H */
-- 
cgit v1.2.3


From 89a9a1c1c89cea5f70975c338c011b9f7024dee5 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 Oct 2017 16:56:51 +0200
Subject: livepatch: __klp_disable_patch() should never be called for disabled
 patches

__klp_disable_patch() should never be called when the patch is not
enabled. Let's add the same warning that we have in __klp_enable_patch().

This allows to remove the check when calling klp_pre_unpatch_callback().
It was strange anyway because it repeatedly checked per-patch flag
for each patched object.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index eb134479c394..287f71e9dbfe 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -282,6 +282,9 @@ static int __klp_disable_patch(struct klp_patch *patch)
 {
 	struct klp_object *obj;
 
+	if (WARN_ON(!patch->enabled))
+		return -EINVAL;
+
 	if (klp_transition_patch)
 		return -EBUSY;
 
@@ -293,7 +296,7 @@ static int __klp_disable_patch(struct klp_patch *patch)
 	klp_init_transition(patch, KLP_UNPATCHED);
 
 	klp_for_each_object(patch, obj)
-		if (patch->enabled && obj->patched)
+		if (obj->patched)
 			klp_pre_unpatch_callback(obj);
 
 	/*
-- 
cgit v1.2.3


From d41bf8c9deaed1a90b18d3ffc5639d4c19f0259a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 23 Oct 2017 16:18:27 -0700
Subject: cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat

The basic cpu stat is currently shown with "cpu." prefix in
cgroup.stat, and the same information is duplicated in cpu.stat when
cpu controller is enabled.  This is ugly and not very scalable as we
want to expand the coverage of stat information which is always
available.

This patch makes cgroup core always create "cpu.stat" file and show
the basic cpu stat there and calls the cpu controller to show the
extra stats when enabled.  This ensures that the same information
isn't presented in multiple places and makes future expansion of basic
stats easier.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 Documentation/cgroup-v2.txt     | 15 ++++-------
 include/linux/cgroup-defs.h     |  2 ++
 include/linux/cgroup.h          |  2 --
 kernel/cgroup/cgroup-internal.h |  1 +
 kernel/cgroup/cgroup.c          | 60 +++++++++++++++++++++++++++++++++++++++--
 kernel/cgroup/stat.c            | 10 +++----
 kernel/sched/core.c             | 13 +++------
 7 files changed, 75 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 0bbdc720dd7c..779211fbb69f 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -886,15 +886,6 @@ All cgroup core files are prefixed with "cgroup."
 		A dying cgroup can consume system resources not exceeding
 		limits, which were active at the moment of cgroup deletion.
 
-	  cpu.usage_usec
-		CPU time consumed in the subtree.
-
-	  cpu.user_usec
-		User CPU time consumed in the subtree.
-
-	  cpu.system_usec
-		System CPU time consumed in the subtree.
-
 
 Controllers
 ===========
@@ -915,12 +906,16 @@ All time durations are in microseconds.
 
   cpu.stat
 	A read-only flat-keyed file which exists on non-root cgroups.
+	This file exists whether the controller is enabled or not.
 
-	It reports the following six stats:
+	It always reports the following three stats:
 
 	- usage_usec
 	- user_usec
 	- system_usec
+
+	and the following three when the controller is enabled:
+
 	- nr_periods
 	- nr_throttled
 	- throttled_usec
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 3e55bbd31ad1..ada6df7b1f55 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -569,6 +569,8 @@ struct cgroup_subsys {
 	void (*css_released)(struct cgroup_subsys_state *css);
 	void (*css_free)(struct cgroup_subsys_state *css);
 	void (*css_reset)(struct cgroup_subsys_state *css);
+	int (*css_extra_stat_show)(struct seq_file *seq,
+				   struct cgroup_subsys_state *css);
 
 	int (*can_attach)(struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup_taskset *tset);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 328a70ce0e23..03cad08b09d1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -703,8 +703,6 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index,
 					 u64 val) {}
 #endif
 
-void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix);
-
 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
 void __cgroup_account_cputime_field(struct cgroup *cgrp,
 				    enum cpu_usage_stat index, u64 delta_exec);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index fa642c99586a..4dc317090920 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -205,6 +205,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
 void cgroup_stat_flush(struct cgroup *cgrp);
 int cgroup_stat_init(struct cgroup *cgrp);
 void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
 void cgroup_stat_boot(void);
 
 /*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7975b20f1fd1..d9773e49a1b4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -463,6 +463,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 		return &cgrp->self;
 }
 
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css assocaited with @ss.  If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+						     struct cgroup_subsys *ss)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	css = cgroup_css(cgrp, ss);
+	if (!css || !css_tryget_online(css))
+		css = NULL;
+	rcu_read_unlock();
+
+	return css;
+}
+
 /**
  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -3311,11 +3333,40 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
 
-	cgroup_stat_show_cputime(seq, "cpu.");
-
 	return 0;
 }
 
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+						 struct cgroup *cgrp, int ssid)
+{
+	struct cgroup_subsys *ss = cgroup_subsys[ssid];
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	if (!ss->css_extra_stat_show)
+		return 0;
+
+	css = cgroup_tryget_css(cgrp, ss);
+	if (!css)
+		return 0;
+
+	ret = ss->css_extra_stat_show(seq, css);
+	css_put(css);
+	return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	int ret = 0;
+
+	cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+	return ret;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of->kn->priv;
@@ -4423,6 +4474,11 @@ static struct cftype cgroup_base_files[] = {
 		.name = "cgroup.stat",
 		.seq_show = cgroup_stat_show,
 	},
+	{
+		.name = "cpu.stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stat_show,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
index 9cce79e89320..133b465691d6 100644
--- a/kernel/cgroup/stat.c
+++ b/kernel/cgroup/stat.c
@@ -256,7 +256,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 	cgroup_cpu_stat_account_end(cgrp, cstat);
 }
 
-void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
+void cgroup_stat_show_cputime(struct seq_file *seq)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
 	u64 usage, utime, stime;
@@ -278,10 +278,10 @@ void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
 	do_div(utime, NSEC_PER_USEC);
 	do_div(stime, NSEC_PER_USEC);
 
-	seq_printf(seq, "%susage_usec %llu\n"
-		   "%suser_usec %llu\n"
-		   "%ssystem_usec %llu\n",
-		   prefix, usage, prefix, utime, prefix, stime);
+	seq_printf(seq, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n",
+		   usage, utime, stime);
 }
 
 int cgroup_stat_init(struct cgroup *cgrp)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad255162a830..0b3eec389552 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6678,13 +6678,12 @@ static struct cftype cpu_legacy_files[] = {
 	{ }	/* Terminate */
 };
 
-static int cpu_stat_show(struct seq_file *sf, void *v)
+static int cpu_extra_stat_show(struct seq_file *sf,
+			       struct cgroup_subsys_state *css)
 {
-	cgroup_stat_show_cputime(sf, "");
-
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
-		struct task_group *tg = css_tg(seq_css(sf));
+		struct task_group *tg = css_tg(css);
 		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 		u64 throttled_usec;
 
@@ -6817,11 +6816,6 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
 #endif
 
 static struct cftype cpu_files[] = {
-	{
-		.name = "stat",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cpu_stat_show,
-	},
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "weight",
@@ -6852,6 +6846,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_online	= cpu_cgroup_css_online,
 	.css_released	= cpu_cgroup_css_released,
 	.css_free	= cpu_cgroup_css_free,
+	.css_extra_stat_show = cpu_extra_stat_show,
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-- 
cgit v1.2.3


From 035226b964c820f65e201cdf123705a8f1d7c670 Mon Sep 17 00:00:00 2001
From: Gianluca Borello <g.borello@gmail.com>
Date: Thu, 26 Oct 2017 01:47:42 +0000
Subject: bpf: remove tail_call and get_stackid helper declarations from bpf.h

commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related
syscall") included linux/bpf.h in linux/security.h. As a result, bpf
programs including bpf_helpers.h and some other header that ends up
pulling in also security.h, such as several examples under samples/bpf,
fail to compile because bpf_tail_call and bpf_get_stackid are now
"redefined as different kind of symbol".

>From bpf.h:

u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

Whereas in bpf_helpers.h they are:

static void (*bpf_tail_call)(void *ctx, void *map, int index);
static int (*bpf_get_stackid)(void *ctx, void *map, int flags);

Fix this by removing the unused declaration of bpf_tail_call and moving
the declaration of bpf_get_stackid in bpf_trace.c, which is the only
place where it's needed.

Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 3 ---
 kernel/trace/bpf_trace.c | 2 ++
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 172be7faf7ba..520aeebe0d93 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -231,9 +231,6 @@ struct bpf_event_entry {
 	struct rcu_head rcu;
 };
 
-u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
-
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 int bpf_prog_calc_tag(struct bpf_prog *fp);
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b65011d320e3..136aa6bb0422 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -15,6 +15,8 @@
 #include <linux/ctype.h>
 #include "trace.h"
 
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+
 /**
  * trace_call_bpf - invoke BPF program
  * @call: tracepoint event
-- 
cgit v1.2.3


From bc8293663b953c23ff7b73eb15f82393425e5e47 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Sun, 22 Oct 2017 22:30:55 +0800
Subject: printk: fix typo in printk_safe.c

Link: http://lkml.kernel.org/r/1508682655-27293-1-git-send-email-bhe@redhat.com
Cc: rostedt@goodmis.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/printk/printk_safe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..89558b85f45c 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -75,7 +75,7 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
  * have dedicated buffers, because otherwise printk-safe preempted by
  * NMI-printk would have overwritten the NMI messages.
  *
- * The messages are fushed from irq work (or from panic()), possibly,
+ * The messages are flushed from irq work (or from panic()), possibly,
  * from other CPU, concurrently with printk_safe_log_store(). Should this
  * happen, printk_safe_log_store() will notice the buffer->len mismatch
  * and repeat the write.
-- 
cgit v1.2.3


From c3ba13298709f46e72b22d087d0aa02bd012e4b0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 30 Oct 2017 08:13:14 -0700
Subject: cgroup: mark @cgrp __maybe_unused in cpu_stat_show()

The local variable @cgrp isn't used if !CONFIG_CGROUP_SCHED.  Mark the
variable with __maybe_unused to avoid a compile warning.

Reported-by: "kbuild-all@01.org" <kbuild-all@01.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d9773e49a1b4..d6ed725f36d9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3357,7 +3357,7 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
 
 static int cpu_stat_show(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
 	int ret = 0;
 
 	cgroup_stat_show_cputime(seq);
-- 
cgit v1.2.3


From 9afe77ed849de6af8532b4c1b9310102eed9edf7 Mon Sep 17 00:00:00 2001
From: Maxim Akristiniy <maksim.akristiniy@yotadevices.com>
Date: Mon, 23 Oct 2017 19:51:48 +0300
Subject: added new line symbol after warning about dropped messages

so this message will not mess with the next one

Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Maxim Akristiniy <maksim.akristiniy@yotadevices.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 512f7c2baedd..5d81206a572d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2190,7 +2190,7 @@ again:
 		}
 
 		if (console_seq < log_first_seq) {
-			len = sprintf(text, "** %u printk messages dropped ** ",
+			len = sprintf(text, "** %u printk messages dropped **\n",
 				      (unsigned)(log_first_seq - console_seq));
 
 			/* messages are gone, move to first one */
-- 
cgit v1.2.3


From ab97f87325e28b7ef7717e6cb62e8da14a7176e1 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Fri, 20 Oct 2017 13:26:02 +0300
Subject: fsnotify: convert fsnotify_mark.refcnt from atomic_t to refcount_t

atomic_t variables are currently used to implement reference
counters with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows
and underflows. This is important since overflows and underflows
can lead to use-after-free situation and be exploitable.

The variable fsnotify_mark.refcnt is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/inotify/inotify_user.c |  4 ++--
 fs/notify/mark.c                 | 14 +++++++-------
 include/linux/fsnotify_backend.h |  2 +-
 kernel/audit_tree.c              |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 7cc7d3fb1862..d3c20e0bb046 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -376,7 +376,7 @@ static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group
 
 		fsnotify_get_mark(fsn_mark);
 		/* One ref for being in the idr, one ref we just took */
-		BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
+		BUG_ON(refcount_read(&fsn_mark->refcnt) < 2);
 	}
 
 	return i_mark;
@@ -446,7 +446,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
 	 * One ref for being in the idr
 	 * one ref grabbed by inotify_idr_find
 	 */
-	if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) {
+	if (unlikely(refcount_read(&i_mark->fsn_mark.refcnt) < 2)) {
 		printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
 			 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
 		/* we can't really recover with bad ref cnting.. */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index f3a32ea15b49..e9191b416434 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -105,8 +105,8 @@ static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);
 
 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
-	WARN_ON_ONCE(!atomic_read(&mark->refcnt));
-	atomic_inc(&mark->refcnt);
+	WARN_ON_ONCE(!refcount_read(&mark->refcnt));
+	refcount_inc(&mark->refcnt);
 }
 
 static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
@@ -201,7 +201,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 
 	/* Catch marks that were actually never attached to object */
 	if (!mark->connector) {
-		if (atomic_dec_and_test(&mark->refcnt))
+		if (refcount_dec_and_test(&mark->refcnt))
 			fsnotify_final_mark_destroy(mark);
 		return;
 	}
@@ -210,7 +210,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 	 * We have to be careful so that traversals of obj_list under lock can
 	 * safely grab mark reference.
 	 */
-	if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock))
+	if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock))
 		return;
 
 	conn = mark->connector;
@@ -258,7 +258,7 @@ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
 	if (!mark)
 		return true;
 
-	if (atomic_inc_not_zero(&mark->refcnt)) {
+	if (refcount_inc_not_zero(&mark->refcnt)) {
 		spin_lock(&mark->lock);
 		if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) {
 			/* mark is attached, group is still alive then */
@@ -335,7 +335,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
 
 	WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
 	WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
-		     atomic_read(&mark->refcnt) < 1 +
+		     refcount_read(&mark->refcnt) < 1 +
 			!!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
 
 	spin_lock(&mark->lock);
@@ -737,7 +737,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 {
 	memset(mark, 0, sizeof(*mark));
 	spin_lock_init(&mark->lock);
-	atomic_set(&mark->refcnt, 1);
+	refcount_set(&mark->refcnt, 1);
 	fsnotify_get_group(group);
 	mark->group = group;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 744e2b9969fc..9bcb43953f4e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -242,7 +242,7 @@ struct fsnotify_mark {
 	__u32 mask;
 	/* We hold one for presence in g_list. Also one ref for each 'thing'
 	 * in kernel that found and may be using this mark. */
-	atomic_t refcnt;
+	refcount_t refcnt;
 	/* Group this mark is for. Set on mark creation, stable until last ref
 	 * is dropped */
 	struct fsnotify_group *group;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 011d46e5f73f..45ec960ad536 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1007,7 +1007,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 	 * We are guaranteed to have at least one reference to the mark from
 	 * either the inode or the caller of fsnotify_destroy_mark().
 	 */
-	BUG_ON(atomic_read(&entry->refcnt) < 1);
+	BUG_ON(refcount_read(&entry->refcnt) < 1);
 }
 
 static const struct fsnotify_ops audit_tree_ops = {
-- 
cgit v1.2.3


From aa4bf44dc851c6bdd4f7b61b5f2c56c84dfe2ff0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Wed, 25 Oct 2017 00:04:40 +0200
Subject: userns: use union in {g,u}idmap struct

- Add a struct containing two pointer to extents and wrap both the static extent
  array and the struct into a union. This is done in preparation for bumping the
  {g,u}idmap limits for user namespaces.
- Add brackets around anonymous union when using designated initializers to
  initialize members in order to please gcc <= 4.4.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h | 18 +++++++++++++-----
 kernel/user.c                  | 30 ++++++++++++++++++------------
 2 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c18e01252346..7c83d7f6289b 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -12,13 +12,21 @@
 
 #define UID_GID_MAP_MAX_EXTENTS 5
 
+struct uid_gid_extent {
+	u32 first;
+	u32 lower_first;
+	u32 count;
+};
+
 struct uid_gid_map {	/* 64 bytes -- 1 cache line */
 	u32 nr_extents;
-	struct uid_gid_extent {
-		u32 first;
-		u32 lower_first;
-		u32 count;
-	} extent[UID_GID_MAP_MAX_EXTENTS];
+	union {
+		struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS];
+		struct {
+			struct uid_gid_extent *forward;
+			struct uid_gid_extent *reverse;
+		};
+	};
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
diff --git a/kernel/user.c b/kernel/user.c
index 00281add65b2..9a20acce460d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,26 +26,32 @@
 struct user_namespace init_user_ns = {
 	.uid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.gid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.projid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.count = ATOMIC_INIT(3),
-- 
cgit v1.2.3


From 6397fac4915ab3002dc15aae751455da1a852f25 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Wed, 25 Oct 2017 00:04:41 +0200
Subject: userns: bump idmap limits to 340

There are quite some use cases where users run into the current limit for
{g,u}id mappings. Consider a user requesting us to map everything but 999, and
1001 for a given range of 1000000000 with a sub{g,u}id layout of:

some-user:100000:1000000000
some-user:999:1
some-user:1000:1
some-user:1001:1
some-user:1002:1

This translates to:

MAPPING-TYPE | CONTAINER |    HOST |     RANGE |
-------------|-----------|---------|-----------|
         uid |       999 |     999 |         1 |
         uid |      1001 |    1001 |         1 |
         uid |         0 | 1000000 |       999 |
         uid |      1000 | 1001000 |         1 |
         uid |      1002 | 1001002 | 999998998 |
------------------------------------------------
         gid |       999 |     999 |         1 |
         gid |      1001 |    1001 |         1 |
         gid |         0 | 1000000 |       999 |
         gid |      1000 | 1001000 |         1 |
         gid |      1002 | 1001002 | 999998998 |

which is already the current limit.

As discussed at LPC simply bumping the number of limits is not going to work
since this would mean that struct uid_gid_map won't fit into a single cache-line
anymore thereby regressing performance for the base-cases. The same problem
seems to arise when using a single pointer. So the idea is to use

struct uid_gid_extent {
	u32 first;
	u32 lower_first;
	u32 count;
};

struct uid_gid_map { /* 64 bytes -- 1 cache line */
	u32 nr_extents;
	union {
		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
		struct {
			struct uid_gid_extent *forward;
			struct uid_gid_extent *reverse;
		};
	};
};

For the base cases we will only use the struct uid_gid_extent extent member. If
we go over UID_GID_MAP_MAX_BASE_EXTENTS mappings we perform a single 4k
kmalloc() which means we can have a maximum of 340 mappings
(340 * size(struct uid_gid_extent) = 4080). For the latter case we use two
pointers "forward" and "reverse". The forward pointer points to an array sorted
by "first" and the reverse pointer points to an array sorted by "lower_first".
We can then perform binary search on those arrays.

Performance Testing:
When Eric introduced the extent-based struct uid_gid_map approach he measured
the performanc impact of his idmap changes:

> My benchmark consisted of going to single user mode where nothing else was
> running. On an ext4 filesystem opening 1,000,000 files and looping through all
> of the files 1000 times and calling fstat on the individuals files. This was
> to ensure I was benchmarking stat times where the inodes were in the kernels
> cache, but the inode values were not in the processors cache. My results:

> v3.4-rc1:         ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled)
> v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled)
> v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled)

I used an identical approach on my laptop. Here's a thorough description of what
I did. I built a 4.14.0-rc4 mainline kernel with my new idmap patches applied. I
booted into single user mode and used an ext4 filesystem to open/create
1,000,000 files. Then I looped through all of the files calling fstat() on each
of them 1000 times and calculated the mean fstat() time for a single file. (The
test program can be found below.)

Here are the results. For fun, I compared the first version of my patch which
scaled linearly with the new version of the patch:

|   # MAPPINGS |   PATCH-V1 | PATCH-NEW |
|--------------|------------|-----------|
|   0 mappings |     158 ns |   158 ns  |
|   1 mappings |     164 ns |   157 ns  |
|   2 mappings |     170 ns |   158 ns  |
|   3 mappings |     175 ns |   161 ns  |
|   5 mappings |     187 ns |   165 ns  |
|  10 mappings |     218 ns |   199 ns  |
|  50 mappings |     528 ns |   218 ns  |
| 100 mappings |     980 ns |   229 ns  |
| 200 mappings |    1880 ns |   239 ns  |
| 300 mappings |    2760 ns |   240 ns  |
| 340 mappings | not tested |   248 ns  |

Here's the test program I used. I asked Eric what he did and this is a more
"advanced" implementation of the idea. It's pretty straight-forward:

 #define __GNU_SOURCE
 #define __STDC_FORMAT_MACROS
 #include <errno.h>
 #include <dirent.h>
 #include <fcntl.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>

 int main(int argc, char *argv[])
 {
 	int ret;
 	size_t i, k;
 	int fd[1000000];
 	int times[1000];
 	char pathname[4096];
 	struct stat st;
 	struct timeval t1, t2;
 	uint64_t time_in_mcs;
 	uint64_t sum = 0;

 	if (argc != 2) {
 		fprintf(stderr, "Please specify a directory where to create "
 				"the test files\n");
 		exit(EXIT_FAILURE);
 	}

 	for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) {
 		sprintf(pathname, "%s/idmap_test_%zu", argv[1], i);
 		fd[i]= open(pathname, O_RDWR | O_CREAT, S_IXUSR | S_IXGRP | S_IXOTH);
 		if (fd[i] < 0) {
 			ssize_t j;
 			for (j = i; j >= 0; j--)
 				close(fd[j]);
 			exit(EXIT_FAILURE);
 		}
 	}

 	for (k = 0; k < 1000; k++) {
 		ret = gettimeofday(&t1, NULL);
 		if (ret < 0)
 			goto close_all;

 		for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) {
 			ret = fstat(fd[i], &st);
 			if (ret < 0)
 				goto close_all;
 		}

 		ret = gettimeofday(&t2, NULL);
 		if (ret < 0)
 			goto close_all;

 		time_in_mcs = (1000000 * t2.tv_sec + t2.tv_usec) -
 			      (1000000 * t1.tv_sec + t1.tv_usec);
 		printf("Total time in micro seconds:       %" PRIu64 "\n",
 		       time_in_mcs);
 		printf("Total time in nanoseconds:         %" PRIu64 "\n",
 		       time_in_mcs * 1000);
 		printf("Time per file in nanoseconds:      %" PRIu64 "\n",
 		       (time_in_mcs * 1000) / 1000000);
 		times[k] = (time_in_mcs * 1000) / 1000000;
 	}

 close_all:
 	for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++)
 		close(fd[i]);

 	if (ret < 0)
 		exit(EXIT_FAILURE);

 	for (k = 0; k < 1000; k++) {
 		sum += times[k];
 	}

 	printf("Mean time per file in nanoseconds: %" PRIu64 "\n", sum / 1000);

 	exit(EXIT_SUCCESS);;
 }

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
CC: Serge Hallyn <serge@hallyn.com>
CC: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h |   7 +-
 kernel/user_namespace.c        | 350 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 324 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 7c83d7f6289b..1c1046a60fb4 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -10,7 +10,8 @@
 #include <linux/sysctl.h>
 #include <linux/err.h>
 
-#define UID_GID_MAP_MAX_EXTENTS 5
+#define UID_GID_MAP_MAX_BASE_EXTENTS 5
+#define UID_GID_MAP_MAX_EXTENTS 340
 
 struct uid_gid_extent {
 	u32 first;
@@ -18,10 +19,10 @@ struct uid_gid_extent {
 	u32 count;
 };
 
-struct uid_gid_map {	/* 64 bytes -- 1 cache line */
+struct uid_gid_map { /* 64 bytes -- 1 cache line */
 	u32 nr_extents;
 	union {
-		struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS];
+		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
 		struct {
 			struct uid_gid_extent *forward;
 			struct uid_gid_extent *reverse;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..5fd2d53dbc75 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -23,6 +23,8 @@
 #include <linux/ctype.h>
 #include <linux/projid.h>
 #include <linux/fs_struct.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
 static DEFINE_MUTEX(userns_state_mutex);
@@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work)
 	do {
 		struct ucounts *ucounts = ns->ucounts;
 		parent = ns->parent;
+		if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->gid_map.forward);
+			kfree(ns->gid_map.reverse);
+		}
+		if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->uid_map.forward);
+			kfree(ns->uid_map.reverse);
+		}
+		if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->projid_map.forward);
+			kfree(ns->projid_map.reverse);
+		}
 		retire_userns_sysctls(ns);
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 		key_put(ns->persistent_keyring_register);
@@ -198,7 +212,84 @@ void __put_user_ns(struct user_namespace *ns)
 }
 EXPORT_SYMBOL(__put_user_ns);
 
-static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+/**
+ * idmap_key struct holds the information necessary to find an idmapping in a
+ * sorted idmap array. It is passed to cmp_map_id() as first argument.
+ */
+struct idmap_key {
+	bool map_up; /* true  -> id from kid; false -> kid from id */
+	u32 id; /* id to find */
+	u32 count; /* == 0 unless used with map_id_range_down() */
+};
+
+/**
+ * cmp_map_id - Function to be passed to bsearch() to find the requested
+ * idmapping. Expects struct idmap_key to be passed via @k.
+ */
+static int cmp_map_id(const void *k, const void *e)
+{
+	u32 first, last, id2;
+	const struct idmap_key *key = k;
+	const struct uid_gid_extent *el = e;
+
+	/* handle map_id_range_down() */
+	if (key->count)
+		id2 = key->id + key->count - 1;
+	else
+		id2 = key->id;
+
+	/* handle map_id_{down,up}() */
+	if (key->map_up)
+		first = el->lower_first;
+	else
+		first = el->first;
+
+	last = first + el->count - 1;
+
+	if (key->id >= first && key->id <= last &&
+	    (id2 >= first && id2 <= last))
+		return 0;
+
+	if (key->id < first || id2 < first)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count)
+{
+	u32 extents;
+	struct uid_gid_extent *extent;
+	struct idmap_key key;
+
+	key.map_up = false;
+	key.count = count;
+	key.id = id;
+
+	extents = map->nr_extents;
+	smp_rmb();
+
+	extent = bsearch(&key, map->forward, extents,
+			 sizeof(struct uid_gid_extent), cmp_map_id);
+	/* Map the id or note failure */
+	if (extent)
+		id = (id - extent->first) + extent->lower_first;
+	else
+		id = (u32) -1;
+
+	return id;
+}
+
+/**
+ * map_id_range_down_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count)
 {
 	unsigned idx, extents;
 	u32 first, last, id2;
@@ -224,7 +315,23 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 	return id;
 }
 
-static u32 map_id_down(struct uid_gid_map *map, u32 id)
+static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+{
+	u32 extents = map->nr_extents;
+	smp_rmb();
+
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return map_id_range_down_base(map, id, count);
+
+	return map_id_range_down_max(map, id, count);
+}
+
+/**
+ * map_id_down_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_down_base(struct uid_gid_map *map, u32 id)
 {
 	unsigned idx, extents;
 	u32 first, last;
@@ -247,7 +354,23 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
 	return id;
 }
 
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
+static u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+	u32 extents = map->nr_extents;
+	smp_rmb();
+
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return map_id_down_base(map, id);
+
+	return map_id_range_down_max(map, id, 0);
+}
+
+/**
+ * map_id_up_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_up_base(struct uid_gid_map *map, u32 id)
 {
 	unsigned idx, extents;
 	u32 first, last;
@@ -270,6 +393,45 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
 	return id;
 }
 
+/**
+ * map_id_up_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_up_max(struct uid_gid_map *map, u32 id)
+{
+	u32 extents;
+	struct uid_gid_extent *extent;
+	struct idmap_key key;
+
+	key.map_up = true;
+	key.count = 0;
+	key.id = id;
+
+	extents = map->nr_extents;
+	smp_rmb();
+
+	extent = bsearch(&key, map->reverse, extents,
+			 sizeof(struct uid_gid_extent), cmp_map_id);
+	/* Map the id or note failure */
+	if (extent)
+		id = (id - extent->lower_first) + extent->first;
+	else
+		id = (u32) -1;
+
+	return id;
+}
+
+static u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+	u32 extents = map->nr_extents;
+	smp_rmb();
+
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return map_id_up_base(map, id);
+
+	return map_id_up_max(map, id);
+}
+
 /**
  *	make_kuid - Map a user-namespace uid pair into a kuid.
  *	@ns:  User namespace that the uid is in
@@ -540,13 +702,15 @@ static int projid_m_show(struct seq_file *seq, void *v)
 static void *m_start(struct seq_file *seq, loff_t *ppos,
 		     struct uid_gid_map *map)
 {
-	struct uid_gid_extent *extent = NULL;
 	loff_t pos = *ppos;
 
-	if (pos < map->nr_extents)
-		extent = &map->extent[pos];
+	if (pos >= map->nr_extents)
+		return NULL;
+
+	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return &map->extent[pos];
 
-	return extent;
+	return &map->forward[pos];
 }
 
 static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
@@ -618,7 +782,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
 		u32 prev_upper_last, prev_lower_last;
 		struct uid_gid_extent *prev;
 
-		prev = &new_map->extent[idx];
+		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+			prev = &new_map->extent[idx];
+		else
+			prev = &new_map->forward[idx];
 
 		prev_upper_first = prev->first;
 		prev_lower_first = prev->lower_first;
@@ -638,6 +805,102 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
 	return false;
 }
 
+/**
+ * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
+ * Takes care to allocate a 4K block of memory if the number of mappings exceeds
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
+{
+	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) {
+		map->extent[map->nr_extents].first = extent->first;
+		map->extent[map->nr_extents].lower_first = extent->lower_first;
+		map->extent[map->nr_extents].count = extent->count;
+		return 0;
+	}
+
+	if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
+		struct uid_gid_extent *forward;
+
+		/* Allocate memory for 340 mappings. */
+		forward = kmalloc(sizeof(struct uid_gid_extent) *
+				 UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL);
+		if (!forward)
+			return -ENOMEM;
+
+		/* Copy over memory. Only set up memory for the forward pointer.
+		 * Defer the memory setup for the reverse pointer.
+		 */
+		memcpy(forward, map->extent,
+		       map->nr_extents * sizeof(map->extent[0]));
+
+		map->forward = forward;
+		map->reverse = NULL;
+	}
+
+	map->forward[map->nr_extents].first = extent->first;
+	map->forward[map->nr_extents].lower_first = extent->lower_first;
+	map->forward[map->nr_extents].count = extent->count;
+	return 0;
+}
+
+/* cmp function to sort() forward mappings */
+static int cmp_extents_forward(const void *a, const void *b)
+{
+	const struct uid_gid_extent *e1 = a;
+	const struct uid_gid_extent *e2 = b;
+
+	if (e1->first < e2->first)
+		return -1;
+
+	if (e1->first > e2->first)
+		return 1;
+
+	return 0;
+}
+
+/* cmp function to sort() reverse mappings */
+static int cmp_extents_reverse(const void *a, const void *b)
+{
+	const struct uid_gid_extent *e1 = a;
+	const struct uid_gid_extent *e2 = b;
+
+	if (e1->lower_first < e2->lower_first)
+		return -1;
+
+	if (e1->lower_first > e2->lower_first)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * sort_idmaps - Sorts an array of idmap entries.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int sort_idmaps(struct uid_gid_map *map)
+{
+	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return 0;
+
+	/* Sort forward array. */
+	sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
+	     cmp_extents_forward, NULL);
+
+	/* Only copy the memory from forward we actually need. */
+	map->reverse = kmemdup(map->forward,
+			       map->nr_extents * sizeof(struct uid_gid_extent),
+			       GFP_KERNEL);
+	if (!map->reverse)
+		return -ENOMEM;
+
+	/* Sort reverse array. */
+	sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
+	     cmp_extents_reverse, NULL);
+
+	return 0;
+}
+
 static ssize_t map_write(struct file *file, const char __user *buf,
 			 size_t count, loff_t *ppos,
 			 int cap_setid,
@@ -648,7 +911,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	struct user_namespace *ns = seq->private;
 	struct uid_gid_map new_map;
 	unsigned idx;
-	struct uid_gid_extent *extent = NULL;
+	struct uid_gid_extent extent;
 	char *kbuf = NULL, *pos, *next_line;
 	ssize_t ret = -EINVAL;
 
@@ -673,6 +936,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	 */
 	mutex_lock(&userns_state_mutex);
 
+	memset(&new_map, 0, sizeof(struct uid_gid_map));
+
 	ret = -EPERM;
 	/* Only allow one successful write to the map */
 	if (map->nr_extents != 0)
@@ -700,9 +965,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	/* Parse the user data */
 	ret = -EINVAL;
 	pos = kbuf;
-	new_map.nr_extents = 0;
 	for (; pos; pos = next_line) {
-		extent = &new_map.extent[new_map.nr_extents];
 
 		/* Find the end of line and ensure I don't look past it */
 		next_line = strchr(pos, '\n');
@@ -714,17 +977,17 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		}
 
 		pos = skip_spaces(pos);
-		extent->first = simple_strtoul(pos, &pos, 10);
+		extent.first = simple_strtoul(pos, &pos, 10);
 		if (!isspace(*pos))
 			goto out;
 
 		pos = skip_spaces(pos);
-		extent->lower_first = simple_strtoul(pos, &pos, 10);
+		extent.lower_first = simple_strtoul(pos, &pos, 10);
 		if (!isspace(*pos))
 			goto out;
 
 		pos = skip_spaces(pos);
-		extent->count = simple_strtoul(pos, &pos, 10);
+		extent.count = simple_strtoul(pos, &pos, 10);
 		if (*pos && !isspace(*pos))
 			goto out;
 
@@ -734,29 +997,33 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 			goto out;
 
 		/* Verify we have been given valid starting values */
-		if ((extent->first == (u32) -1) ||
-		    (extent->lower_first == (u32) -1))
+		if ((extent.first == (u32) -1) ||
+		    (extent.lower_first == (u32) -1))
 			goto out;
 
 		/* Verify count is not zero and does not cause the
 		 * extent to wrap
 		 */
-		if ((extent->first + extent->count) <= extent->first)
+		if ((extent.first + extent.count) <= extent.first)
 			goto out;
-		if ((extent->lower_first + extent->count) <=
-		     extent->lower_first)
+		if ((extent.lower_first + extent.count) <=
+		     extent.lower_first)
 			goto out;
 
 		/* Do the ranges in extent overlap any previous extents? */
-		if (mappings_overlap(&new_map, extent))
+		if (mappings_overlap(&new_map, &extent))
 			goto out;
 
-		new_map.nr_extents++;
-
-		/* Fail if the file contains too many extents */
-		if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
+		if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
 		    (next_line != NULL))
 			goto out;
+
+		ret = insert_extent(&new_map, &extent);
+		if (ret < 0)
+			goto out;
+		ret = -EINVAL;
+
+		new_map.nr_extents++;
 	}
 	/* Be very certaint the new map actually exists */
 	if (new_map.nr_extents == 0)
@@ -767,16 +1034,26 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
 		goto out;
 
+	ret = sort_idmaps(&new_map);
+	if (ret < 0)
+		goto out;
+
+	ret = -EPERM;
 	/* Map the lower ids from the parent user namespace to the
 	 * kernel global id space.
 	 */
 	for (idx = 0; idx < new_map.nr_extents; idx++) {
+		struct uid_gid_extent *e;
 		u32 lower_first;
-		extent = &new_map.extent[idx];
+
+		if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+			e = &new_map.extent[idx];
+		else
+			e = &new_map.forward[idx];
 
 		lower_first = map_id_range_down(parent_map,
-						extent->lower_first,
-						extent->count);
+						e->lower_first,
+						e->count);
 
 		/* Fail if we can not map the specified extent to
 		 * the kernel global id space.
@@ -784,18 +1061,31 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		if (lower_first == (u32) -1)
 			goto out;
 
-		extent->lower_first = lower_first;
+		e->lower_first = lower_first;
 	}
 
 	/* Install the map */
-	memcpy(map->extent, new_map.extent,
-		new_map.nr_extents*sizeof(new_map.extent[0]));
+	if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+		memcpy(map->extent, new_map.extent,
+		       new_map.nr_extents * sizeof(new_map.extent[0]));
+	} else {
+		map->forward = new_map.forward;
+		map->reverse = new_map.reverse;
+	}
 	smp_wmb();
 	map->nr_extents = new_map.nr_extents;
 
 	*ppos = count;
 	ret = count;
 out:
+	if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(new_map.forward);
+		kfree(new_map.reverse);
+		map->forward = NULL;
+		map->reverse = NULL;
+		map->nr_extents = 0;
+	}
+
 	mutex_unlock(&userns_state_mutex);
 	kfree(kbuf);
 	return ret;
-- 
cgit v1.2.3


From 11a8b9270e16e36d5fb607ba4b60db2958b7c625 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 15:54:32 -0500
Subject: userns: Don't special case a count of 0

We can always use a count of 1 so there is no reason to have
a special case of a count of 0.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 5fd2d53dbc75..c9904ee084c4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -232,11 +232,7 @@ static int cmp_map_id(const void *k, const void *e)
 	const struct idmap_key *key = k;
 	const struct uid_gid_extent *el = e;
 
-	/* handle map_id_range_down() */
-	if (key->count)
-		id2 = key->id + key->count - 1;
-	else
-		id2 = key->id;
+	id2 = key->id + key->count - 1;
 
 	/* handle map_id_{down,up}() */
 	if (key->map_up)
@@ -362,7 +358,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
 	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
 		return map_id_down_base(map, id);
 
-	return map_id_range_down_max(map, id, 0);
+	return map_id_range_down_max(map, id, 1);
 }
 
 /**
@@ -404,7 +400,7 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id)
 	struct idmap_key key;
 
 	key.map_up = true;
-	key.count = 0;
+	key.count = 1;
 	key.id = id;
 
 	extents = map->nr_extents;
-- 
cgit v1.2.3


From 3edf652fa16562fb57a5a4b996ba72e2d7cdc38b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 16:27:29 -0500
Subject: userns: Simplify the user and group mapping functions

Consolidate reading the number of extents and computing the return
value in the map_id_down, map_id_range_down and map_id_range.

This removal of one read of extents makes one smp_rmb unnecessary
and makes the code safe it is executed during the map write.  Reading
the number of extents twice and depending on the result being the same
is not safe, as it could be 0 the first time and > 5 the second time,
which would lead to misinterpreting the union fields.

The consolidation of the return value just removes a duplicate
caluculation which should make it easier to understand and maintain
the code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 132 +++++++++++++++++++++---------------------------
 1 file changed, 58 insertions(+), 74 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c9904ee084c4..563a2981d7c7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -256,28 +256,17 @@ static int cmp_map_id(const void *k, const void *e)
  * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
  * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count)
+static struct uid_gid_extent *
+map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
 {
-	u32 extents;
-	struct uid_gid_extent *extent;
 	struct idmap_key key;
 
 	key.map_up = false;
 	key.count = count;
 	key.id = id;
 
-	extents = map->nr_extents;
-	smp_rmb();
-
-	extent = bsearch(&key, map->forward, extents,
-			 sizeof(struct uid_gid_extent), cmp_map_id);
-	/* Map the id or note failure */
-	if (extent)
-		id = (id - extent->first) + extent->lower_first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return bsearch(&key, map->forward, extents,
+		       sizeof(struct uid_gid_extent), cmp_map_id);
 }
 
 /**
@@ -285,41 +274,43 @@ static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count)
  * Can only be called if number of mappings is equal or less than
  * UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count)
+static struct uid_gid_extent *
+map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
 {
-	unsigned idx, extents;
+	unsigned idx;
 	u32 first, last, id2;
 
 	id2 = id + count - 1;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
 		if (id >= first && id <= last &&
 		    (id2 >= first && id2 <= last))
-			break;
+			return &map->extent[idx];
 	}
-	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].lower_first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return NULL;
 }
 
 static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 {
-	u32 extents = map->nr_extents;
+	struct uid_gid_extent *extent;
+	unsigned extents = map->nr_extents;
 	smp_rmb();
 
 	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
-		return map_id_range_down_base(map, id, count);
+		extent = map_id_range_down_base(extents, map, id, count);
+	else
+		extent = map_id_range_down_max(extents, map, id, count);
 
-	return map_id_range_down_max(map, id, count);
+	/* Map the id or note failure */
+	if (extent)
+		id = (id - extent->first) + extent->lower_first;
+	else
+		id = (u32) -1;
+
+	return id;
 }
 
 /**
@@ -327,38 +318,40 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
  * Can only be called if number of mappings is equal or less than
  * UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_down_base(struct uid_gid_map *map, u32 id)
+static struct uid_gid_extent *
+map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id)
 {
-	unsigned idx, extents;
+	unsigned idx;
 	u32 first, last;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
 		if (id >= first && id <= last)
-			break;
+			return &map->extent[idx];
 	}
-	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].lower_first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return NULL;
 }
 
 static u32 map_id_down(struct uid_gid_map *map, u32 id)
 {
-	u32 extents = map->nr_extents;
+	struct uid_gid_extent *extent;
+	unsigned extents = map->nr_extents;
 	smp_rmb();
 
 	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
-		return map_id_down_base(map, id);
+		extent = map_id_down_base(extents, map, id);
+	else
+		extent = map_id_range_down_max(extents, map, id, 1);
 
-	return map_id_range_down_max(map, id, 1);
+	/* Map the id or note failure */
+	if (extent)
+		id = (id - extent->first) + extent->lower_first;
+	else
+		id = (u32) -1;
+
+	return id;
 }
 
 /**
@@ -366,48 +359,50 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
  * Can only be called if number of mappings is equal or less than
  * UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_up_base(struct uid_gid_map *map, u32 id)
+static struct uid_gid_extent *
+map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
 {
-	unsigned idx, extents;
+	unsigned idx;
 	u32 first, last;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].lower_first;
 		last = first + map->extent[idx].count - 1;
 		if (id >= first && id <= last)
-			break;
+			return &map->extent[idx];
 	}
-	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return NULL;
 }
 
 /**
  * map_id_up_max - Find idmap via binary search in ordered idmap array.
  * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_up_max(struct uid_gid_map *map, u32 id)
+static struct uid_gid_extent *
+map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
 {
-	u32 extents;
-	struct uid_gid_extent *extent;
 	struct idmap_key key;
 
 	key.map_up = true;
 	key.count = 1;
 	key.id = id;
 
-	extents = map->nr_extents;
+	return bsearch(&key, map->reverse, extents,
+		       sizeof(struct uid_gid_extent), cmp_map_id);
+}
+
+static u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+	struct uid_gid_extent *extent;
+	unsigned extents = map->nr_extents;
 	smp_rmb();
 
-	extent = bsearch(&key, map->reverse, extents,
-			 sizeof(struct uid_gid_extent), cmp_map_id);
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		extent = map_id_up_base(extents, map, id);
+	else
+		extent = map_id_up_max(extents, map, id);
+
 	/* Map the id or note failure */
 	if (extent)
 		id = (id - extent->lower_first) + extent->first;
@@ -417,17 +412,6 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id)
 	return id;
 }
 
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
-{
-	u32 extents = map->nr_extents;
-	smp_rmb();
-
-	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
-		return map_id_up_base(map, id);
-
-	return map_id_up_max(map, id);
-}
-
 /**
  *	make_kuid - Map a user-namespace uid pair into a kuid.
  *	@ns:  User namespace that the uid is in
-- 
cgit v1.2.3


From d5e7b3c5f51fc6d34e12b6d87bfd30ab277c4625 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 17:09:34 -0500
Subject: userns: Don't read extents twice in m_start

This is important so reading /proc/<pid>/{uid_map,gid_map,projid_map} while
the map is being written does not do strange things.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 563a2981d7c7..4f7e357ac1e2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -683,11 +683,13 @@ static void *m_start(struct seq_file *seq, loff_t *ppos,
 		     struct uid_gid_map *map)
 {
 	loff_t pos = *ppos;
+	unsigned extents = map->nr_extents;
+	smp_rmb();
 
-	if (pos >= map->nr_extents)
+	if (pos >= extents)
 		return NULL;
 
-	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
 		return &map->extent[pos];
 
 	return &map->forward[pos];
-- 
cgit v1.2.3


From ece66133979b211324cc6aff9285889b425243d2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 16:53:09 -0500
Subject: userns: Make map_id_down a wrapper for map_id_range_down

There is no good reason for this code duplication, the number of cache
line accesses not the number of instructions are the bottleneck in
this code.

Therefore simplify maintenance by removing unnecessary code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 38 +-------------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4f7e357ac1e2..1d0298870ee3 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -313,45 +313,9 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 	return id;
 }
 
-/**
- * map_id_down_base - Find idmap via binary search in static extent array.
- * Can only be called if number of mappings is equal or less than
- * UID_GID_MAP_MAX_BASE_EXTENTS.
- */
-static struct uid_gid_extent *
-map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id)
-{
-	unsigned idx;
-	u32 first, last;
-
-	/* Find the matching extent */
-	for (idx = 0; idx < extents; idx++) {
-		first = map->extent[idx].first;
-		last = first + map->extent[idx].count - 1;
-		if (id >= first && id <= last)
-			return &map->extent[idx];
-	}
-	return NULL;
-}
-
 static u32 map_id_down(struct uid_gid_map *map, u32 id)
 {
-	struct uid_gid_extent *extent;
-	unsigned extents = map->nr_extents;
-	smp_rmb();
-
-	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
-		extent = map_id_down_base(extents, map, id);
-	else
-		extent = map_id_range_down_max(extents, map, id, 1);
-
-	/* Map the id or note failure */
-	if (extent)
-		id = (id - extent->first) + extent->lower_first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return map_id_range_down(map, id, 1);
 }
 
 /**
-- 
cgit v1.2.3


From 3fda0e737e906ce73220b20c27e7f792d0aac6a8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 17:15:30 -0500
Subject: userns: Simplify insert_extent

Consolidate the code to write to the new mapping at the end of the
function to remove the duplication.  Move the increase in the number
of mappings into insert_extent, keeping the logic together.

Just a small increase in readability and maintainability.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d0298870ee3..899c31060ff3 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -758,12 +758,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
  */
 static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
 {
-	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) {
-		map->extent[map->nr_extents].first = extent->first;
-		map->extent[map->nr_extents].lower_first = extent->lower_first;
-		map->extent[map->nr_extents].count = extent->count;
-		return 0;
-	}
+	struct uid_gid_extent *dest;
 
 	if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
 		struct uid_gid_extent *forward;
@@ -784,9 +779,13 @@ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
 		map->reverse = NULL;
 	}
 
-	map->forward[map->nr_extents].first = extent->first;
-	map->forward[map->nr_extents].lower_first = extent->lower_first;
-	map->forward[map->nr_extents].count = extent->count;
+	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
+		dest = &map->extent[map->nr_extents];
+	else
+		dest = &map->forward[map->nr_extents];
+
+	*dest = *extent;
+	map->nr_extents++;
 	return 0;
 }
 
@@ -968,8 +967,6 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		if (ret < 0)
 			goto out;
 		ret = -EINVAL;
-
-		new_map.nr_extents++;
 	}
 	/* Be very certaint the new map actually exists */
 	if (new_map.nr_extents == 0)
-- 
cgit v1.2.3


From 638f5b90d46016372a8e3e0a434f199cc5e12b8c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 31 Oct 2017 18:16:05 -0700
Subject: bpf: reduce verifier memory consumption

the verifier got progressively smarter over time and size of its internal
state grew as well. Time to reduce the memory consumption.

Before:
sizeof(struct bpf_verifier_state) = 6520
After:
sizeof(struct bpf_verifier_state) = 896

It's done by observing that majority of BPF programs use little to
no stack whereas verifier kept all of 512 stack slots ready always.
Instead dynamically reallocate struct verifier state when stack
access is detected.
Runtime difference before vs after is within a noise.
The number of processed instructions stays the same.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |   8 +-
 include/linux/bpf_verifier.h                      |  16 +-
 kernel/bpf/verifier.c                             | 437 ++++++++++++++--------
 3 files changed, 305 insertions(+), 156 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 3d3dcac1c942..a8c7615546a9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -76,9 +76,9 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 
 static int
 nfp_bpf_check_exit(struct nfp_prog *nfp_prog,
-		   const struct bpf_verifier_env *env)
+		   struct bpf_verifier_env *env)
 {
-	const struct bpf_reg_state *reg0 = &env->cur_state.regs[0];
+	const struct bpf_reg_state *reg0 = cur_regs(env) + BPF_REG_0;
 	u64 imm;
 
 	if (nfp_prog->act == NN_ACT_XDP)
@@ -144,9 +144,9 @@ nfp_bpf_check_stack_access(struct nfp_prog *nfp_prog,
 
 static int
 nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
-		  const struct bpf_verifier_env *env, u8 reg_no)
+		  struct bpf_verifier_env *env, u8 reg_no)
 {
-	const struct bpf_reg_state *reg = &env->cur_state.regs[reg_no];
+	const struct bpf_reg_state *reg = cur_regs(env) + reg_no;
 	int err;
 
 	if (reg->type != PTR_TO_CTX &&
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index feeaea93d959..3b0976aaac75 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -88,14 +88,19 @@ enum bpf_stack_slot_type {
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
 
+struct bpf_stack_state {
+	struct bpf_reg_state spilled_ptr;
+	u8 slot_type[BPF_REG_SIZE];
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
 struct bpf_verifier_state {
 	struct bpf_reg_state regs[MAX_BPF_REG];
-	u8 stack_slot_type[MAX_BPF_STACK];
-	struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
 	struct bpf_verifier_state *parent;
+	int allocated_stack;
+	struct bpf_stack_state *stack;
 };
 
 /* linked list of verifier states used to prune search */
@@ -145,7 +150,7 @@ struct bpf_verifier_env {
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
-	struct bpf_verifier_state cur_state; /* current verifier state */
+	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
 	void *analyzer_priv; /* pointer to external analyzer's private data */
@@ -159,6 +164,11 @@ struct bpf_verifier_env {
 	struct bpf_verifer_log log;
 };
 
+static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+{
+	return env->cur_state->regs;
+}
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d906775e12c1..5f26f7ad124f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -276,43 +276,132 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			verbose(env, ")");
 		}
 	}
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] == STACK_SPILL)
-			verbose(env, " fp%d=%s", -MAX_BPF_STACK + i,
-				reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] == STACK_SPILL)
+			verbose(env, " fp%d=%s",
+				-MAX_BPF_STACK + i * BPF_REG_SIZE,
+				reg_type_str[state->stack[i].spilled_ptr.type]);
 	}
 	verbose(env, "\n");
 }
 
-static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
+static int copy_stack_state(struct bpf_verifier_state *dst,
+			    const struct bpf_verifier_state *src)
 {
-	struct bpf_verifier_stack_elem *elem;
-	int insn_idx;
+	if (!src->stack)
+		return 0;
+	if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
+		/* internal bug, make state invalid to reject the program */
+		memset(dst, 0, sizeof(*dst));
+		return -EFAULT;
+	}
+	memcpy(dst->stack, src->stack,
+	       sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
+	return 0;
+}
+
+/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
+ * make it consume minimal amount of memory. check_stack_write() access from
+ * the program calls into realloc_verifier_state() to grow the stack size.
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+ * which this function copies over. It points to previous bpf_verifier_state
+ * which is never reallocated
+ */
+static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
+				  bool copy_old)
+{
+	u32 old_size = state->allocated_stack;
+	struct bpf_stack_state *new_stack;
+	int slot = size / BPF_REG_SIZE;
+
+	if (size <= old_size || !size) {
+		if (copy_old)
+			return 0;
+		state->allocated_stack = slot * BPF_REG_SIZE;
+		if (!size && old_size) {
+			kfree(state->stack);
+			state->stack = NULL;
+		}
+		return 0;
+	}
+	new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
+				  GFP_KERNEL);
+	if (!new_stack)
+		return -ENOMEM;
+	if (copy_old) {
+		if (state->stack)
+			memcpy(new_stack, state->stack,
+			       sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
+		memset(new_stack + old_size / BPF_REG_SIZE, 0,
+		       sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
+	}
+	state->allocated_stack = slot * BPF_REG_SIZE;
+	kfree(state->stack);
+	state->stack = new_stack;
+	return 0;
+}
+
+static void free_verifier_state(struct bpf_verifier_state *state)
+{
+	kfree(state->stack);
+	kfree(state);
+}
+
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_verifier_state(struct bpf_verifier_state *dst,
+			       const struct bpf_verifier_state *src)
+{
+	int err;
+
+	err = realloc_verifier_state(dst, src->allocated_stack, false);
+	if (err)
+		return err;
+	memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+	return copy_stack_state(dst, src);
+}
+
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+		     int *insn_idx)
+{
+	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_verifier_stack_elem *elem, *head = env->head;
+	int err;
 
 	if (env->head == NULL)
-		return -1;
+		return -ENOENT;
 
-	memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
-	insn_idx = env->head->insn_idx;
+	if (cur) {
+		err = copy_verifier_state(cur, &head->st);
+		if (err)
+			return err;
+	}
+	if (insn_idx)
+		*insn_idx = head->insn_idx;
 	if (prev_insn_idx)
-		*prev_insn_idx = env->head->prev_insn_idx;
-	elem = env->head->next;
-	kfree(env->head);
+		*prev_insn_idx = head->prev_insn_idx;
+	elem = head->next;
+	kfree(head);
 	env->head = elem;
 	env->stack_size--;
-	return insn_idx;
+	return 0;
 }
 
 static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 					     int insn_idx, int prev_insn_idx)
 {
+	struct bpf_verifier_state *cur = env->cur_state;
 	struct bpf_verifier_stack_elem *elem;
+	int err;
 
-	elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
 	if (!elem)
 		goto err;
 
-	memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+	err = copy_verifier_state(&elem->st, cur);
+	if (err)
+		return NULL;
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
@@ -325,7 +414,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	return &elem->st;
 err:
 	/* pop all elements and return */
-	while (pop_stack(env, NULL) >= 0);
+	while (!pop_stack(env, NULL, NULL));
 	return NULL;
 }
 
@@ -550,7 +639,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			 enum reg_arg_type t)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = env->cur_state->regs;
 
 	if (regno >= MAX_BPF_REG) {
 		verbose(env, "R%d is invalid\n", regno);
@@ -563,7 +652,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
-		mark_reg_read(&env->cur_state, regno);
+		mark_reg_read(env->cur_state, regno);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -601,10 +690,21 @@ static int check_stack_write(struct bpf_verifier_env *env,
 			     struct bpf_verifier_state *state, int off,
 			     int size, int value_regno)
 {
-	int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+
+	err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
+				     true);
+	if (err)
+		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
 	 * so it's aligned access and [off, off + size) are within stack limits
 	 */
+	if (!env->allow_ptr_leaks &&
+	    state->stack[spi].slot_type[0] == STACK_SPILL &&
+	    size != BPF_REG_SIZE) {
+		verbose(env, "attempt to corrupt spilled pointer on stack\n");
+		return -EACCES;
+	}
 
 	if (value_regno >= 0 &&
 	    is_spillable_regtype(state->regs[value_regno].type)) {
@@ -616,17 +716,18 @@ static int check_stack_write(struct bpf_verifier_env *env,
 		}
 
 		/* save register state */
-		state->spilled_regs[spi] = state->regs[value_regno];
-		state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
+		state->stack[spi].spilled_ptr = state->regs[value_regno];
+		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
 		for (i = 0; i < BPF_REG_SIZE; i++)
-			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+			state->stack[spi].slot_type[i] = STACK_SPILL;
 	} else {
 		/* regular write of data into stack */
-		state->spilled_regs[spi] = (struct bpf_reg_state) {};
+		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
 
 		for (i = 0; i < size; i++)
-			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+				STACK_MISC;
 	}
 	return 0;
 }
@@ -637,10 +738,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
 
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
-		if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+		if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
 			break;
 		/* ... then we depend on parent's value */
-		parent->spilled_regs[slot].live |= REG_LIVE_READ;
+		parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
 		state = parent;
 		parent = state->parent;
 	}
@@ -650,34 +751,37 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			    struct bpf_verifier_state *state, int off, int size,
 			    int value_regno)
 {
-	u8 *slot_type;
-	int i, spi;
+	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+	u8 *stype;
 
-	slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+	if (state->allocated_stack <= slot) {
+		verbose(env, "invalid read from stack off %d+0 size %d\n",
+			off, size);
+		return -EACCES;
+	}
+	stype = state->stack[spi].slot_type;
 
-	if (slot_type[0] == STACK_SPILL) {
+	if (stype[0] == STACK_SPILL) {
 		if (size != BPF_REG_SIZE) {
 			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
 		for (i = 1; i < BPF_REG_SIZE; i++) {
-			if (slot_type[i] != STACK_SPILL) {
+			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
 				verbose(env, "corrupted spill memory\n");
 				return -EACCES;
 			}
 		}
 
-		spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
-
 		if (value_regno >= 0) {
 			/* restore register state from stack */
-			state->regs[value_regno] = state->spilled_regs[spi];
+			state->regs[value_regno] = state->stack[spi].spilled_ptr;
 			mark_stack_slot_read(state, spi);
 		}
 		return 0;
 	} else {
 		for (i = 0; i < size; i++) {
-			if (slot_type[i] != STACK_MISC) {
+			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
 				verbose(env, "invalid read from stack off %d+%d size %d\n",
 					off, i, size);
 				return -EACCES;
@@ -694,7 +798,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 			    int size)
 {
-	struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_map *map = regs[regno].map_ptr;
 
 	if (off < 0 || size <= 0 || off + size > map->value_size) {
 		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
@@ -706,9 +811,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 
 /* check read/write into a map element with possible variable offset */
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
-				int off, int size)
+			    int off, int size)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *reg = &state->regs[regno];
 	int err;
 
@@ -783,7 +888,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 				 int off, int size)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 
 	if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
@@ -797,7 +902,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 			       int size)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 	int err;
 
@@ -866,7 +971,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
 
 static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 {
-	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
 }
 
 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
@@ -968,8 +1073,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			    int bpf_size, enum bpf_access_type t,
 			    int value_regno)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
-	struct bpf_reg_state *reg = &state->regs[regno];
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = regs + regno;
 	int size, err = 0;
 
 	size = bpf_size_to_bytes(bpf_size);
@@ -993,7 +1099,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 		err = check_map_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(env, state->regs, value_regno);
+			mark_reg_unknown(env, regs, value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = SCALAR_VALUE;
@@ -1028,14 +1134,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
-				mark_reg_unknown(env, state->regs, value_regno);
+				mark_reg_unknown(env, regs, value_regno);
 			else
-				mark_reg_known_zero(env, state->regs,
+				mark_reg_known_zero(env, regs,
 						    value_regno);
-			state->regs[value_regno].id = 0;
-			state->regs[value_regno].off = 0;
-			state->regs[value_regno].range = 0;
-			state->regs[value_regno].type = reg_type;
+			regs[value_regno].id = 0;
+			regs[value_regno].off = 0;
+			regs[value_regno].range = 0;
+			regs[value_regno].type = reg_type;
 		}
 
 	} else if (reg->type == PTR_TO_STACK) {
@@ -1061,19 +1167,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (env->prog->aux->stack_depth < -off)
 			env->prog->aux->stack_depth = -off;
 
-		if (t == BPF_WRITE) {
-			if (!env->allow_ptr_leaks &&
-			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
-			    size != BPF_REG_SIZE) {
-				verbose(env, "attempt to corrupt spilled pointer on stack\n");
-				return -EACCES;
-			}
+		if (t == BPF_WRITE)
 			err = check_stack_write(env, state, off, size,
 						value_regno);
-		} else {
+		else
 			err = check_stack_read(env, state, off, size,
 					       value_regno);
-		}
 	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose(env, "cannot write into packet\n");
@@ -1087,7 +1186,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(env, state->regs, value_regno);
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -1095,11 +1194,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	}
 
 	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
-	    state->regs[value_regno].type == SCALAR_VALUE) {
+	    regs[value_regno].type == SCALAR_VALUE) {
 		/* b/h/w load zero-extends, mark upper bits as known 0 */
-		state->regs[value_regno].var_off = tnum_cast(
-					state->regs[value_regno].var_off, size);
-		__update_reg_bounds(&state->regs[value_regno]);
+		regs[value_regno].var_off =
+			tnum_cast(regs[value_regno].var_off, size);
+		__update_reg_bounds(&regs[value_regno]);
 	}
 	return err;
 }
@@ -1156,9 +1255,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 				int access_size, bool zero_size_allowed,
 				struct bpf_call_arg_meta *meta)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *regs = state->regs;
-	int off, i;
+	int off, i, slot, spi;
 
 	if (regs[regno].type != PTR_TO_STACK) {
 		/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -1198,7 +1297,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 	}
 
 	for (i = 0; i < access_size; i++) {
-		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
+		slot = -(off + i) - 1;
+		spi = slot / BPF_REG_SIZE;
+		if (state->allocated_stack <= slot ||
+		    state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
+			STACK_MISC) {
 			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
 				off, i, access_size);
 			return -EACCES;
@@ -1211,7 +1314,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 				   int access_size, bool zero_size_allowed,
 				   struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
@@ -1229,7 +1332,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			  enum bpf_arg_type arg_type,
 			  struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 	enum bpf_reg_type expected_type, type = reg->type;
 	int err = 0;
 
@@ -1514,7 +1617,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
 
@@ -1522,10 +1625,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(env, regs, i);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		reg = &state->spilled_regs[i / BPF_REG_SIZE];
+		reg = &state->stack[i].spilled_ptr;
 		if (reg_is_pkt_pointer_any(reg))
 			__mark_reg_unknown(reg);
 	}
@@ -1533,9 +1636,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 
 static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
 	const struct bpf_func_proto *fn = NULL;
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *regs;
 	struct bpf_call_arg_meta meta;
 	bool changes_data;
 	int i, err;
@@ -1603,6 +1705,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 			return err;
 	}
 
+	regs = cur_regs(env);
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
@@ -1691,7 +1794,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 				   const struct bpf_reg_state *ptr_reg,
 				   const struct bpf_reg_state *off_reg)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+	struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
 	bool known = tnum_is_const(off_reg->var_off);
 	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
 	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -1703,13 +1806,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	dst_reg = &regs[dst];
 
 	if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-		print_verifier_state(env, &env->cur_state);
+		print_verifier_state(env, env->cur_state);
 		verbose(env,
 			"verifier internal error: known but bad sbounds\n");
 		return -EINVAL;
 	}
 	if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-		print_verifier_state(env, &env->cur_state);
+		print_verifier_state(env, env->cur_state);
 		verbose(env,
 			"verifier internal error: known but bad ubounds\n");
 		return -EINVAL;
@@ -1890,7 +1993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 				      struct bpf_reg_state *dst_reg,
 				      struct bpf_reg_state src_reg)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 opcode = BPF_OP(insn->code);
 	bool src_known, dst_known;
 	s64 smin_val, smax_val;
@@ -2111,7 +2214,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				   struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
 	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
 	u8 opcode = BPF_OP(insn->code);
 	int rc;
@@ -2185,12 +2288,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
 	/* Got here implies adding two SCALAR_VALUEs */
 	if (WARN_ON_ONCE(ptr_reg)) {
-		print_verifier_state(env, &env->cur_state);
+		print_verifier_state(env, env->cur_state);
 		verbose(env, "verifier internal error: unexpected ptr_reg\n");
 		return -EINVAL;
 	}
 	if (WARN_ON(!src_reg)) {
-		print_verifier_state(env, &env->cur_state);
+		print_verifier_state(env, env->cur_state);
 		verbose(env, "verifier internal error: no src_reg\n");
 		return -EINVAL;
 	}
@@ -2200,7 +2303,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 /* check validity of 32-bit and 64-bit arithmetic operations */
 static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
@@ -2421,10 +2524,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 			/* keep the maximum range already checked */
 			regs[i].range = max(regs[i].range, new_range);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		reg = &state->spilled_regs[i / BPF_REG_SIZE];
+		reg = &state->stack[i].spilled_ptr;
 		if (reg->type == type && reg->id == dst_reg->id)
 			reg->range = max_t(u16, reg->range, new_range);
 	}
@@ -2674,17 +2777,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
 	for (i = 0; i < MAX_BPF_REG; i++)
 		mark_map_reg(regs, i, id, is_null);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
+		mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
 	}
 }
 
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
-	struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+	struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
 	struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
@@ -2876,7 +2979,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
 /* verify BPF_LD_IMM64 instruction */
 static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	int err;
 
 	if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -2937,7 +3040,7 @@ static bool may_access_skb(enum bpf_prog_type type)
  */
 static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 mode = BPF_MODE(insn->code);
 	int i, err;
 
@@ -2999,7 +3102,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 		return 0;
 	}
 
-	reg = &env->cur_state.regs[BPF_REG_0];
+	reg = cur_regs(env) + BPF_REG_0;
 	if (reg->type != SCALAR_VALUE) {
 		verbose(env, "At program exit the register R0 is not a known value (%s)\n",
 			reg_type_str[reg->type]);
@@ -3363,6 +3466,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	return false;
 }
 
+static bool stacksafe(struct bpf_verifier_state *old,
+		      struct bpf_verifier_state *cur,
+		      struct idpair *idmap)
+{
+	int i, spi;
+
+	/* if explored stack has more populated slots than current stack
+	 * such stacks are not equivalent
+	 */
+	if (old->allocated_stack > cur->allocated_stack)
+		return false;
+
+	/* walk slots of the explored stack and ignore any additional
+	 * slots in the current stack, since explored(safe) state
+	 * didn't use them
+	 */
+	for (i = 0; i < old->allocated_stack; i++) {
+		spi = i / BPF_REG_SIZE;
+
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+			continue;
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+			/* Ex: old explored (safe) state has STACK_SPILL in
+			 * this stack slot, but current has has STACK_MISC ->
+			 * this verifier states are not equivalent,
+			 * return false to continue verification of this path
+			 */
+			return false;
+		if (i % BPF_REG_SIZE)
+			continue;
+		if (old->stack[spi].slot_type[0] != STACK_SPILL)
+			continue;
+		if (!regsafe(&old->stack[spi].spilled_ptr,
+			     &cur->stack[spi].spilled_ptr,
+			     idmap))
+			/* when explored and current stack slot are both storing
+			 * spilled registers, check that stored pointers types
+			 * are the same as well.
+			 * Ex: explored safe path could have stored
+			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+			 * but current path has stored:
+			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+			 * such verifier states are not equivalent.
+			 * return false to continue verification of this path
+			 */
+			return false;
+	}
+	return true;
+}
+
 /* compare two verifier states
  *
  * all states stored in state_list are known to be valid, since
@@ -3407,37 +3561,8 @@ static bool states_equal(struct bpf_verifier_env *env,
 			goto out_free;
 	}
 
-	for (i = 0; i < MAX_BPF_STACK; i++) {
-		if (old->stack_slot_type[i] == STACK_INVALID)
-			continue;
-		if (old->stack_slot_type[i] != cur->stack_slot_type[i])
-			/* Ex: old explored (safe) state has STACK_SPILL in
-			 * this stack slot, but current has has STACK_MISC ->
-			 * this verifier states are not equivalent,
-			 * return false to continue verification of this path
-			 */
-			goto out_free;
-		if (i % BPF_REG_SIZE)
-			continue;
-		if (old->stack_slot_type[i] != STACK_SPILL)
-			continue;
-		if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
-			     &cur->spilled_regs[i / BPF_REG_SIZE],
-			     idmap))
-			/* when explored and current stack slot are both storing
-			 * spilled registers, check that stored pointers types
-			 * are the same as well.
-			 * Ex: explored safe path could have stored
-			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
-			 * but current path has stored:
-			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
-			 * such verifier states are not equivalent.
-			 * return false to continue verification of this path
-			 */
-			goto out_free;
-		else
-			continue;
-	}
+	if (!stacksafe(old, cur, idmap))
+		goto out_free;
 	ret = true;
 out_free:
 	kfree(idmap);
@@ -3473,17 +3598,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state,
 		}
 	}
 	/* ... and stack slots */
-	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
-		if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+		    i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+		if (parent->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		if (parent->spilled_regs[i].live & REG_LIVE_READ)
+		if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
 			continue;
-		if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+		if (writes &&
+		    (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
 			continue;
-		if (state->spilled_regs[i].live & REG_LIVE_READ) {
-			parent->spilled_regs[i].live |= REG_LIVE_READ;
+		if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
+			parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
 			touched = true;
 		}
 	}
@@ -3513,6 +3640,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
+	struct bpf_verifier_state *cur = env->cur_state;
 	int i;
 
 	sl = env->explored_states[insn_idx];
@@ -3523,7 +3651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		return 0;
 
 	while (sl != STATE_LIST_MARK) {
-		if (states_equal(env, &sl->state, &env->cur_state)) {
+		if (states_equal(env, &sl->state, cur)) {
 			/* reached equivalent register/stack state,
 			 * prune the search.
 			 * Registers read by the continuation are read by us.
@@ -3534,7 +3662,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * they'll be immediately forgotten as we're pruning
 			 * this state and will pop a new one.
 			 */
-			propagate_liveness(&sl->state, &env->cur_state);
+			propagate_liveness(&sl->state, cur);
 			return 1;
 		}
 		sl = sl->next;
@@ -3546,16 +3674,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	 * it will be rejected. Since there are no loops, we won't be
 	 * seeing this 'insn_idx' instruction again on the way to bpf_exit
 	 */
-	new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
+	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
 	if (!new_sl)
 		return -ENOMEM;
 
 	/* add new state to the head of linked list */
-	memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+	copy_verifier_state(&new_sl->state, cur);
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
 	/* connect new state to parentage chain */
-	env->cur_state.parent = &new_sl->state;
+	cur->parent = &new_sl->state;
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
@@ -3563,10 +3691,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	 * explored_states can get read marks.)
 	 */
 	for (i = 0; i < BPF_REG_FP; i++)
-		env->cur_state.regs[i].live = REG_LIVE_NONE;
-	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
-		if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
-			env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
+		cur->regs[i].live = REG_LIVE_NONE;
+	for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
+		if (cur->stack[i].slot_type[0] == STACK_SPILL)
+			cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
 	return 0;
 }
 
@@ -3581,15 +3709,19 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
 
 static int do_check(struct bpf_verifier_env *env)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state;
 	struct bpf_insn *insns = env->prog->insnsi;
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *regs;
 	int insn_cnt = env->prog->len;
 	int insn_idx, prev_insn_idx = 0;
 	int insn_processed = 0;
 	bool do_print_state = false;
 
-	init_reg_state(env, regs);
+	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+	env->cur_state = state;
+	init_reg_state(env, state->regs);
 	state->parent = NULL;
 	insn_idx = 0;
 	for (;;) {
@@ -3637,7 +3769,7 @@ static int do_check(struct bpf_verifier_env *env)
 			else
 				verbose(env, "\nfrom %d to %d:",
 					prev_insn_idx, insn_idx);
-			print_verifier_state(env, &env->cur_state);
+			print_verifier_state(env, state);
 			do_print_state = false;
 		}
 
@@ -3651,6 +3783,7 @@ static int do_check(struct bpf_verifier_env *env)
 		if (err)
 			return err;
 
+		regs = cur_regs(env);
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
 			if (err)
@@ -3818,8 +3951,10 @@ static int do_check(struct bpf_verifier_env *env)
 				if (err)
 					return err;
 process_bpf_exit:
-				insn_idx = pop_stack(env, &prev_insn_idx);
-				if (insn_idx < 0) {
+				err = pop_stack(env, &prev_insn_idx, &insn_idx);
+				if (err < 0) {
+					if (err != -ENOENT)
+						return err;
 					break;
 				} else {
 					do_print_state = true;
@@ -4359,9 +4494,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
+	free_verifier_state(env->cur_state);
+	env->cur_state = NULL;
 
 skip_full_check:
-	while (pop_stack(env, NULL) >= 0);
+	while (!pop_stack(env, NULL, NULL));
 	free_states(env);
 
 	if (ret == 0)
@@ -4464,9 +4601,11 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
+	free_verifier_state(env->cur_state);
+	env->cur_state = NULL;
 
 skip_full_check:
-	while (pop_stack(env, NULL) >= 0);
+	while (!pop_stack(env, NULL, NULL));
 	free_states(env);
 
 	mutex_unlock(&bpf_verifier_lock);
-- 
cgit v1.2.3


From 07c41a295c5f25928a7cb689fdec816bd0089fe8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 30 Oct 2017 13:50:22 -0700
Subject: bpf: avoid rcu_dereference inside bpf_event_mutex lock region

During perf event attaching/detaching bpf programs,
the tp_event->prog_array change is protected by the
bpf_event_mutex lock in both attaching and deteching
functions. Although tp_event->prog_array is a rcu
pointer, rcu_derefrence is not needed to access it
since mutex lock will guarantee ordering.

Verified through "make C=2" that sparse
locking check still happy with the new change.

Also change the label name in perf_event_{attach,detach}_bpf_prog
from "out" to "unlock" to reflect the code action after the label.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 136aa6bb0422..506efe6e8ed9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -769,20 +769,19 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
-		goto out;
+		goto unlock;
 
-	old_array = rcu_dereference_protected(event->tp_event->prog_array,
-					      lockdep_is_held(&bpf_event_mutex));
+	old_array = event->tp_event->prog_array;
 	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
 	if (ret < 0)
-		goto out;
+		goto unlock;
 
 	/* set the new array to event->tp_event and set event->prog */
 	event->prog = prog;
 	rcu_assign_pointer(event->tp_event->prog_array, new_array);
 	bpf_prog_array_free(old_array);
 
-out:
+unlock:
 	mutex_unlock(&bpf_event_mutex);
 	return ret;
 }
@@ -796,11 +795,9 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 	mutex_lock(&bpf_event_mutex);
 
 	if (!event->prog)
-		goto out;
-
-	old_array = rcu_dereference_protected(event->tp_event->prog_array,
-					      lockdep_is_held(&bpf_event_mutex));
+		goto unlock;
 
+	old_array = event->tp_event->prog_array;
 	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
 	if (ret < 0) {
 		bpf_prog_array_delete_safe(old_array, event->prog);
@@ -812,6 +809,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 	bpf_prog_put(event->prog);
 	event->prog = NULL;
 
-out:
+unlock:
 	mutex_unlock(&bpf_event_mutex);
 }
-- 
cgit v1.2.3


From 1969db47f8d0e800397abd4ee4e8d27d2b578587 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 1 Nov 2017 00:08:04 -0700
Subject: bpf: fix verifier memory leaks

fix verifier memory leaks

Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption")
Signed-off-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5f26f7ad124f..2bb6d6aa7085 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -341,10 +341,12 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
 	return 0;
 }
 
-static void free_verifier_state(struct bpf_verifier_state *state)
+static void free_verifier_state(struct bpf_verifier_state *state,
+				bool free_self)
 {
 	kfree(state->stack);
-	kfree(state);
+	if (free_self)
+		kfree(state);
 }
 
 /* copy verifier state from src to dst growing dst stack space
@@ -382,6 +384,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 	if (prev_insn_idx)
 		*prev_insn_idx = head->prev_insn_idx;
 	elem = head->next;
+	free_verifier_state(&head->st, false);
 	kfree(head);
 	env->head = elem;
 	env->stack_size--;
@@ -399,14 +402,14 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	if (!elem)
 		goto err;
 
-	err = copy_verifier_state(&elem->st, cur);
-	if (err)
-		return NULL;
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
 	env->head = elem;
 	env->stack_size++;
+	err = copy_verifier_state(&elem->st, cur);
+	if (err)
+		goto err;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
 		verbose(env, "BPF program is too complex\n");
 		goto err;
@@ -3641,7 +3644,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
 	struct bpf_verifier_state *cur = env->cur_state;
-	int i;
+	int i, err;
 
 	sl = env->explored_states[insn_idx];
 	if (!sl)
@@ -3679,7 +3682,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		return -ENOMEM;
 
 	/* add new state to the head of linked list */
-	copy_verifier_state(&new_sl->state, cur);
+	err = copy_verifier_state(&new_sl->state, cur);
+	if (err) {
+		free_verifier_state(&new_sl->state, false);
+		kfree(new_sl);
+		return err;
+	}
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
 	/* connect new state to parentage chain */
@@ -4424,6 +4432,7 @@ static void free_states(struct bpf_verifier_env *env)
 		if (sl)
 			while (sl != STATE_LIST_MARK) {
 				sln = sl->next;
+				free_verifier_state(&sl->state, false);
 				kfree(sl);
 				sl = sln;
 			}
@@ -4494,7 +4503,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
-	free_verifier_state(env->cur_state);
+	free_verifier_state(env->cur_state, true);
 	env->cur_state = NULL;
 
 skip_full_check:
@@ -4601,7 +4610,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
-	free_verifier_state(env->cur_state);
+	free_verifier_state(env->cur_state, true);
 	env->cur_state = NULL;
 
 skip_full_check:
-- 
cgit v1.2.3


From 03c4cc385faaa46e5877f499c6b997fef792f8d3 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 1 Nov 2017 12:44:45 +0100
Subject: bpf: cpumap micro-optimization in cpu_map_enqueue

Discovered that the compiler laid-out asm code in suboptimal way
when studying perf report during benchmarking of cpumap. Help
the compiler by the marking unlikely code paths.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/cpumap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 86e29cbf7827..ce5b669003b2 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -208,7 +208,7 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
 	headroom = xdp->data - xdp->data_hard_start;
 	metasize = xdp->data - xdp->data_meta;
 	metasize = metasize > 0 ? metasize : 0;
-	if ((headroom - metasize) < sizeof(*xdp_pkt))
+	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
 		return NULL;
 
 	/* Store info in top of packet */
@@ -656,7 +656,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 	struct xdp_pkt *xdp_pkt;
 
 	xdp_pkt = convert_to_xdp_pkt(xdp);
-	if (!xdp_pkt)
+	if (unlikely(!xdp_pkt))
 		return -EOVERFLOW;
 
 	/* Info needed when constructing SKB on remote CPU */
-- 
cgit v1.2.3


From b06723da824af1e979eb1699623881b5f45a783c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 1 Nov 2017 23:58:09 +0100
Subject: bpf: minor cleanups after merge

Two minor cleanups after Dave's recent merge in f8ddadc4db6c
("Merge git://git.kernel.org...") of net into net-next in
order to get the code in line with what was done originally
in the net tree: i) use max() instead of max_t() since both
ranges are u16, ii) don't split the direct access test cases
in the middle with bpf_exit test cases from 390ee7e29fc
("bpf: enforce return code for cgroup-bpf programs").

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c                       |   2 +-
 tools/testing/selftests/bpf/test_verifier.c | 144 ++++++++++++++--------------
 2 files changed, 73 insertions(+), 73 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2bb6d6aa7085..2cc3e9486a1f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2532,7 +2532,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 			continue;
 		reg = &state->stack[i].spilled_ptr;
 		if (reg->type == type && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, new_range);
+			reg->range = max(reg->range, new_range);
 	}
 }
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 1b93941bdfea..3b38a3d2eebd 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -7249,78 +7249,6 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
-	{
-		"bpf_exit with invalid return code. test1",
-		.insns = {
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 has value (0x0; 0xffffffff)",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test2",
-		.insns = {
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.result = ACCEPT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test3",
-		.insns = {
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 has value (0x0; 0x3)",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test4",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.result = ACCEPT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test5",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 2),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 has value (0x2; 0x0)",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test6",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 is not a known value (ctx)",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test7",
-		.insns = {
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
-			BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 has unknown scalar value",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
 	{
 		"XDP pkt read, pkt_end >= pkt_data', bad access 1",
 		.insns = {
@@ -7470,6 +7398,78 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_XDP,
 		.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 	},
+	{
+		"bpf_exit with invalid return code. test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0xffffffff)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0x3)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test5",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x2; 0x0)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test6",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 is not a known value (ctx)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test7",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
+			BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has unknown scalar value",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit v1.2.3


From 5beca081be9195b4316ac5f32921df0234ee8e0e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 1 Nov 2017 23:58:10 +0100
Subject: bpf: also improve pattern matches for meta access

Follow-up to 0fd4759c5515 ("bpf: fix pattern matches for direct
packet access") to cover also the remaining data_meta/data matches
in the verifier. The matches are also refactored a bit to simplify
handling of all the cases.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 165 +++++++++++++++++++++++++++++---------------------
 1 file changed, 96 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cc3e9486a1f..530b68550fd2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2787,6 +2787,99 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
 	}
 }
 
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+				   struct bpf_reg_state *dst_reg,
+				   struct bpf_reg_state *src_reg,
+				   struct bpf_verifier_state *this_branch,
+				   struct bpf_verifier_state *other_branch)
+{
+	if (BPF_SRC(insn->code) != BPF_X)
+		return false;
+
+	switch (BPF_OP(insn->code)) {
+	case BPF_JGT:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+			find_good_pkt_pointers(this_branch, dst_reg,
+					       dst_reg->type, false);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end > pkt_data', pkt_data > pkt_meta' */
+			find_good_pkt_pointers(other_branch, src_reg,
+					       src_reg->type, true);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JLT:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+			find_good_pkt_pointers(other_branch, dst_reg,
+					       dst_reg->type, true);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end < pkt_data', pkt_data > pkt_meta' */
+			find_good_pkt_pointers(this_branch, src_reg,
+					       src_reg->type, false);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JGE:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+			find_good_pkt_pointers(this_branch, dst_reg,
+					       dst_reg->type, true);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+			find_good_pkt_pointers(other_branch, src_reg,
+					       src_reg->type, false);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JLE:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+			find_good_pkt_pointers(other_branch, dst_reg,
+					       dst_reg->type, false);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
+			find_good_pkt_pointers(this_branch, src_reg,
+					       src_reg->type, true);
+		} else {
+			return false;
+		}
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
@@ -2893,75 +2986,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		 */
 		mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
 		mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' > pkt_end */
-		find_good_pkt_pointers(this_branch, dst_reg,
-				       PTR_TO_PACKET, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end > pkt_data' */
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET, true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' < pkt_end */
-		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET,
-				       true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end < pkt_data' */
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' >= pkt_end */
-		find_good_pkt_pointers(this_branch, dst_reg,
-				       PTR_TO_PACKET, true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end >= pkt_data' */
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' <= pkt_end */
-		find_good_pkt_pointers(other_branch, dst_reg,
-				       PTR_TO_PACKET, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end <= pkt_data' */
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET, true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
-		   dst_reg->type == PTR_TO_PACKET_META &&
-		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
-		find_good_pkt_pointers(this_branch, dst_reg,
-				       PTR_TO_PACKET_META, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
-		   dst_reg->type == PTR_TO_PACKET_META &&
-		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
-		find_good_pkt_pointers(other_branch, dst_reg,
-				       PTR_TO_PACKET_META, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
-		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET_META, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
-		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET_META, false);
-	} else if (is_pointer_value(env, insn->dst_reg)) {
+	} else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
+					   this_branch, other_branch) &&
+		   is_pointer_value(env, insn->dst_reg)) {
 		verbose(env, "R%d pointer comparison prohibited\n",
 			insn->dst_reg);
 		return -EACCES;
-- 
cgit v1.2.3


From 7cce782ef32f5003c18ab8f9f586f2ed5ce2c33e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 2 Nov 2017 12:05:51 +0100
Subject: bpf: fix link error without CONFIG_NET

I ran into this link error with the latest net-next plus linux-next
trees when networking is disabled:

kernel/bpf/verifier.o:(.rodata+0x2958): undefined reference to `tc_cls_act_analyzer_ops'
kernel/bpf/verifier.o:(.rodata+0x2970): undefined reference to `xdp_analyzer_ops'

It seems that the code was written to deal with varying contents of
the arrray, but the actual #ifdef was missing. Both tc_cls_act_analyzer_ops
and xdp_analyzer_ops are defined in the core networking code, so adding
a check for CONFIG_NET seems appropriate here, and I've verified this with
many randconfig builds

Fixes: 4f9218aaf8a4 ("bpf: move knowledge about post-translation offsets out of verifier")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 530b68550fd2..5f3799dcba01 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4588,8 +4588,10 @@ err_free_env:
 }
 
 static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = {
+#ifdef CONFIG_NET
 	[BPF_PROG_TYPE_XDP]		= &xdp_analyzer_ops,
 	[BPF_PROG_TYPE_SCHED_CLS]	= &tc_cls_act_analyzer_ops,
+#endif
 };
 
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-- 
cgit v1.2.3


From eba0c929d1d0f16c4b03628b7bf8ce363b9e5c9a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 2 Nov 2017 12:05:52 +0100
Subject: bpf: fix out-of-bounds access warning in bpf_check

The bpf_verifer_ops array is generated dynamically and may be
empty depending on configuration, which then causes an out
of bounds access:

kernel/bpf/verifier.c: In function 'bpf_check':
kernel/bpf/verifier.c:4320:29: error: array subscript is above array bounds [-Werror=array-bounds]

This adds a check to the start of the function as a workaround.
I would assume that the function is never called in that configuration,
so the warning is probably harmless.

Fixes: 00176a34d9e2 ("bpf: remove the verifier ops from program structure")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5f3799dcba01..ab5aa5497666 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4474,6 +4474,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	struct bpf_verifer_log *log;
 	int ret = -EINVAL;
 
+	/* no program is valid */
+	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+		return -EINVAL;
+
 	/* 'struct bpf_verifier_env' can be global, but since it's not small,
 	 * allocate/free it every time bpf_check() is called
 	 */
-- 
cgit v1.2.3


From 8c01c4f896aa3404af948880dcb29a2d51c833dc Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Thu, 2 Nov 2017 11:18:01 -0400
Subject: bpf: fix verifier NULL pointer dereference

do_check() can fail early without allocating env->cur_state under
memory pressure.  Syzkaller found the stack below on the linux-next
tree because of this.

  kasan: CONFIG_KASAN_INLINE enabled
  kasan: GPF could be caused by NULL-ptr deref or user memory access
  general protection fault: 0000 [#1] SMP KASAN
  Dumping ftrace buffer:
     (ftrace buffer empty)
  Modules linked in:
  CPU: 1 PID: 27062 Comm: syz-executor5 Not tainted 4.14.0-rc7+ #106
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  task: ffff8801c2c74700 task.stack: ffff8801c3e28000
  RIP: 0010:free_verifier_state kernel/bpf/verifier.c:347 [inline]
  RIP: 0010:bpf_check+0xcf4/0x19c0 kernel/bpf/verifier.c:4533
  RSP: 0018:ffff8801c3e2f5c8 EFLAGS: 00010202
  RAX: dffffc0000000000 RBX: 00000000fffffff4 RCX: 0000000000000000
  RDX: 0000000000000070 RSI: ffffffff817d5aa9 RDI: 0000000000000380
  RBP: ffff8801c3e2f668 R08: 0000000000000000 R09: 1ffff100387c5d9f
  R10: 00000000218c4e80 R11: ffffffff85b34380 R12: ffff8801c4dc6a28
  R13: 0000000000000000 R14: ffff8801c4dc6a00 R15: ffff8801c4dc6a20
  FS:  00007f311079b700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00000000004d4a24 CR3: 00000001cbcd0000 CR4: 00000000001406e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   bpf_prog_load+0xcbb/0x18e0 kernel/bpf/syscall.c:1166
   SYSC_bpf kernel/bpf/syscall.c:1690 [inline]
   SyS_bpf+0xae9/0x4620 kernel/bpf/syscall.c:1652
   entry_SYSCALL_64_fastpath+0x1f/0xbe
  RIP: 0033:0x452869
  RSP: 002b:00007f311079abe8 EFLAGS: 00000212 ORIG_RAX: 0000000000000141
  RAX: ffffffffffffffda RBX: 0000000000758020 RCX: 0000000000452869
  RDX: 0000000000000030 RSI: 0000000020168000 RDI: 0000000000000005
  RBP: 00007f311079aa20 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000212 R12: 00000000004b7550
  R13: 00007f311079ab58 R14: 00000000004b7560 R15: 0000000000000000
  Code: df 48 c1 ea 03 80 3c 02 00 0f 85 e6 0b 00 00 4d 8b 6e 20 48 b8 00 00 00 00 00 fc ff df 49 8d bd 80 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 b6 0b 00 00 49 8b bd 80 03 00 00 e8 d6 0c 26
  RIP: free_verifier_state kernel/bpf/verifier.c:347 [inline] RSP: ffff8801c3e2f5c8
  RIP: bpf_check+0xcf4/0x19c0 kernel/bpf/verifier.c:4533 RSP: ffff8801c3e2f5c8
  ---[ end trace c8d37f339dc64004 ]---

Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption")
Fixes: 1969db47f8d0 ("bpf: fix verifier memory leaks")
Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ab5aa5497666..04357ad5a812 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4534,8 +4534,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
-	free_verifier_state(env->cur_state, true);
-	env->cur_state = NULL;
+	if (env->cur_state) {
+		free_verifier_state(env->cur_state, true);
+		env->cur_state = NULL;
+	}
 
 skip_full_check:
 	while (!pop_stack(env, NULL, NULL));
@@ -4643,8 +4645,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
-	free_verifier_state(env->cur_state, true);
-	env->cur_state = NULL;
+	if (env->cur_state) {
+		free_verifier_state(env->cur_state, true);
+		env->cur_state = NULL;
+	}
 
 skip_full_check:
 	while (!pop_stack(env, NULL, NULL));
-- 
cgit v1.2.3


From edbfd9112f70c34b2965580a67dad5fb306fb6c9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Nov 2017 07:02:15 -0700
Subject: Revert "workqueue: respect isolated cpus when queueing an unbound
 work"

This reverts commit b5149873a0c299195b5346fe4dc2c5b04ae2f995.

It conflicts with the following isolcpus change from the sched branch.

 edb9382175c3 ("sched/isolation: Move isolcpus= handling to the housekeeping code")

Let's revert for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bfa433b38a61..64d0edf428f8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4980,10 +4980,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 	if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
 		return -ENOMEM;
 
-	/*
-	 * Not excluding isolated cpus on purpose.
-	 * If the user wishes to include them, we allow that.
-	 */
 	cpumask_and(cpumask, cpumask, cpu_possible_mask);
 	if (!cpumask_empty(cpumask)) {
 		apply_wqattrs_lock();
@@ -5583,7 +5579,7 @@ int __init workqueue_init_early(void)
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map);
+	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.2.3


From 2d2123bc7c7f843aa9db87720de159a049839862 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Tue, 31 Oct 2017 15:51:14 +0000
Subject: arm64/sve: Add prctl controls for userspace vector length management
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds two arm64-specific prctls, to permit userspace to
control its vector length:

 * PR_SVE_SET_VL: set the thread's SVE vector length and vector
   length inheritance mode.

 * PR_SVE_GET_VL: get the same information.

Although these prctls resemble instruction set features in the SVE
architecture, they provide additional control: the vector length
inheritance mode is Linux-specific and nothing to do with the
architecture, and the architecture does not permit EL0 to set its
own vector length directly.  Both can be used in portable tools
without requiring the use of SVE instructions.

Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: Fixed up prctl constants to avoid clash with PDEATHSIG]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/fpsimd.h    | 14 +++++++++++
 arch/arm64/include/asm/processor.h |  4 +++
 arch/arm64/kernel/fpsimd.c         | 50 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h         |  4 +++
 kernel/sys.c                       | 12 +++++++++
 5 files changed, 84 insertions(+)

(limited to 'kernel')

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index d754e5a6949c..b868412c815c 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -17,6 +17,7 @@
 #define __ASM_FP_H
 
 #include <asm/ptrace.h>
+#include <asm/errno.h>
 
 #ifndef __ASSEMBLY__
 
@@ -98,6 +99,9 @@ extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);
 extern int sve_set_vector_length(struct task_struct *task,
 				 unsigned long vl, unsigned long flags);
 
+extern int sve_set_current_vl(unsigned long arg);
+extern int sve_get_current_vl(void);
+
 /*
  * Probing and setup functions.
  * Calls to these functions must be serialised with one another.
@@ -114,6 +118,16 @@ static inline void fpsimd_release_task(struct task_struct *task) { }
 static inline void sve_sync_to_fpsimd(struct task_struct *task) { }
 static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { }
 
+static inline int sve_set_current_vl(unsigned long arg)
+{
+	return -EINVAL;
+}
+
+static inline int sve_get_current_vl(void)
+{
+	return -EINVAL;
+}
+
 static inline void sve_init_vq_map(void) { }
 static inline void sve_update_vq_map(void) { }
 static inline int sve_verify_vq_map(void) { return 0; }
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index c6fddb005dc2..023cacb946c3 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -217,5 +217,9 @@ static inline void spin_lock_prefetch(const void *ptr)
 int cpu_enable_pan(void *__unused);
 int cpu_enable_cache_maint_trap(void *__unused);
 
+/* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */
+#define SVE_SET_VL(arg)	sve_set_current_vl(arg)
+#define SVE_GET_VL()	sve_get_current_vl()
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index b82d44693b9d..fd3cfdd7f9be 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -29,6 +29,7 @@
 #include <linux/irqflags.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
+#include <linux/prctl.h>
 #include <linux/preempt.h>
 #include <linux/prctl.h>
 #include <linux/ptrace.h>
@@ -558,6 +559,55 @@ out:
 	return 0;
 }
 
+/*
+ * Encode the current vector length and flags for return.
+ * This is only required for prctl(): ptrace has separate fields
+ *
+ * flags are as for sve_set_vector_length().
+ */
+static int sve_prctl_status(unsigned long flags)
+{
+	int ret;
+
+	if (flags & PR_SVE_SET_VL_ONEXEC)
+		ret = current->thread.sve_vl_onexec;
+	else
+		ret = current->thread.sve_vl;
+
+	if (test_thread_flag(TIF_SVE_VL_INHERIT))
+		ret |= PR_SVE_VL_INHERIT;
+
+	return ret;
+}
+
+/* PR_SVE_SET_VL */
+int sve_set_current_vl(unsigned long arg)
+{
+	unsigned long vl, flags;
+	int ret;
+
+	vl = arg & PR_SVE_VL_LEN_MASK;
+	flags = arg & ~vl;
+
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	ret = sve_set_vector_length(current, vl, flags);
+	if (ret)
+		return ret;
+
+	return sve_prctl_status(flags);
+}
+
+/* PR_SVE_GET_VL */
+int sve_get_current_vl(void)
+{
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	return sve_prctl_status(0);
+}
+
 /*
  * Bitmap for temporary storage of the per-CPU set of supported vector lengths
  * during secondary boot.
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 1b64901ca6b3..f60db5db6e8e 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -198,7 +198,11 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
 /* arm64 Scalable Vector Extension controls */
+/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */
+#define PR_SVE_SET_VL			50	/* set task vector length */
 # define PR_SVE_SET_VL_ONEXEC		(1 << 18) /* defer effect until exec */
+#define PR_SVE_GET_VL			51	/* get task vector length */
+/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */
 # define PR_SVE_VL_LEN_MASK		0xffff
 # define PR_SVE_VL_INHERIT		(1 << 17) /* inherit across exec */
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 9aebc2935013..c541916b38c6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -110,6 +110,12 @@
 #ifndef SET_FP_MODE
 # define SET_FP_MODE(a,b)	(-EINVAL)
 #endif
+#ifndef SVE_SET_VL
+# define SVE_SET_VL(a)		(-EINVAL)
+#endif
+#ifndef SVE_GET_VL
+# define SVE_GET_VL()		(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2385,6 +2391,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_SVE_SET_VL:
+		error = SVE_SET_VL(arg2);
+		break;
+	case PR_SVE_GET_VL:
+		error = SVE_GET_VL();
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
cgit v1.2.3


From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:17 -0700
Subject: bpf: offload: add infrastructure for loading programs for a specific
 netdev

The fact that we don't know which device the program is going
to be used on is quite limiting in current eBPF infrastructure.
We have to reverse or limit the changes which kernel makes to
the loaded bytecode if we want it to be offloaded to a networking
device.  We also have to invent new APIs for debugging and
troubleshooting support.

Make it possible to load programs for a specific netdev.  This
helps us to bring the debug information closer to the core
eBPF infrastructure (e.g. we will be able to reuse the verifer
log in device JIT).  It allows device JITs to perform translation
on the original bytecode.

__bpf_prog_get() when called to get a reference for an attachment
point will now refuse to give it if program has a device assigned.
Following patches will add a version of that function which passes
the expected netdev in. @type argument in __bpf_prog_get() is
renamed to attach_type to make it clearer that it's only set on
attachment.

All calls to ndo_bpf are protected by rtnl, only verifier callbacks
are not.  We need a wait queue to make sure netdev doesn't get
destroyed while verifier is still running and calling its driver.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  36 +++++++++
 include/linux/bpf_verifier.h |  10 +++
 include/linux/netdevice.h    |  14 ++++
 include/uapi/linux/bpf.h     |   1 +
 kernel/bpf/Makefile          |   1 +
 kernel/bpf/core.c            |  10 ++-
 kernel/bpf/offload.c         | 182 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c         |  17 +++-
 kernel/bpf/verifier.c        |  15 +++-
 9 files changed, 278 insertions(+), 8 deletions(-)
 create mode 100644 kernel/bpf/offload.c

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520aeebe0d93..e45d43f9ec92 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/rbtree_latch.h>
 #include <linux/numa.h>
+#include <linux/wait.h>
 
 struct perf_event;
 struct bpf_prog;
@@ -182,6 +183,16 @@ struct bpf_verifier_ops {
 				  struct bpf_prog *prog, u32 *target_size);
 };
 
+struct bpf_dev_offload {
+	struct bpf_prog		*prog;
+	struct net_device	*netdev;
+	void			*dev_priv;
+	struct list_head	offloads;
+	bool			dev_state;
+	bool			verifier_running;
+	wait_queue_head_t	verifier_done;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -199,6 +210,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
+	struct bpf_dev_offload *offload;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -317,6 +329,7 @@ extern const struct file_operations bpf_prog_fops;
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
 
+extern const struct bpf_prog_ops bpf_offload_prog_ops;
 extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
 extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
@@ -491,6 +504,29 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+int bpf_prog_offload_compile(struct bpf_prog *prog);
+void bpf_prog_offload_destroy(struct bpf_prog *prog);
+
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+	return aux->offload;
+}
+#else
+static inline int bpf_prog_offload_init(struct bpf_prog *prog,
+					union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+	return false;
+}
+#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
+
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3b0976aaac75..e45011dbc02d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -153,6 +153,7 @@ struct bpf_verifier_env {
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
+	const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
 	void *analyzer_priv; /* pointer to external analyzer's private data */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
@@ -169,6 +170,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 	return env->cur_state->regs;
 }
 
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
+#else
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9af9feaaeb64..fda527ccb263 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -797,8 +797,13 @@ enum bpf_netdev_command {
 	 * is equivalent to XDP_ATTACHED_DRV.
 	 */
 	XDP_QUERY_PROG,
+	/* BPF program for offload callbacks, invoked at program load time. */
+	BPF_OFFLOAD_VERIFIER_PREP,
+	BPF_OFFLOAD_TRANSLATE,
+	BPF_OFFLOAD_DESTROY,
 };
 
+struct bpf_ext_analyzer_ops;
 struct netlink_ext_ack;
 
 struct netdev_bpf {
@@ -815,6 +820,15 @@ struct netdev_bpf {
 			u8 prog_attached;
 			u32 prog_id;
 		};
+		/* BPF_OFFLOAD_VERIFIER_PREP */
+		struct {
+			struct bpf_prog *prog;
+			const struct bpf_ext_analyzer_ops *ops; /* callee set */
+		} verifier;
+		/* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */
+		struct {
+			struct bpf_prog *prog;
+		} offload;
 	};
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9820677c2ff..80d191a93fb0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -260,6 +260,7 @@ union bpf_attr {
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
+		__u32		prog_target_ifindex;	/* ifindex of netdev to prep for */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 16e95c8e749e..e691da0b3bab 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7fe448799d76..8a6c37762330 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1380,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	 * valid program, which in this case would simply not
 	 * be JITed, but falls back to the interpreter.
 	 */
-	fp = bpf_int_jit_compile(fp);
+	if (!bpf_prog_is_dev_bound(fp->aux)) {
+		fp = bpf_int_jit_compile(fp);
+	} else {
+		*err = bpf_prog_offload_compile(fp);
+		if (*err)
+			return fp;
+	}
 	bpf_prog_lock_ro(fp);
 
 	/* The tail call compatibility check can only be done at
@@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	struct bpf_prog_aux *aux;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
+	if (bpf_prog_is_dev_bound(aux))
+		bpf_prog_offload_destroy(aux->prog);
 	bpf_jit_free(aux->prog);
 }
 
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..5553e0e2f8b1
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,182 @@
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_dev_offload *offload;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->prog_flags)
+		return -EINVAL;
+
+	offload = kzalloc(sizeof(*offload), GFP_USER);
+	if (!offload)
+		return -ENOMEM;
+
+	offload->prog = prog;
+	init_waitqueue_head(&offload->verifier_done);
+
+	rtnl_lock();
+	offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex);
+	if (!offload->netdev) {
+		rtnl_unlock();
+		kfree(offload);
+		return -EINVAL;
+	}
+
+	prog->aux->offload = offload;
+	list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+	rtnl_unlock();
+
+	return 0;
+}
+
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+			     struct netdev_bpf *data)
+{
+	struct net_device *netdev = prog->aux->offload->netdev;
+
+	ASSERT_RTNL();
+
+	if (!netdev)
+		return -ENODEV;
+	if (!netdev->netdev_ops->ndo_bpf)
+		return -EOPNOTSUPP;
+
+	data->command = cmd;
+
+	return netdev->netdev_ops->ndo_bpf(netdev, data);
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+	struct netdev_bpf data = {};
+	int err;
+
+	data.verifier.prog = env->prog;
+
+	rtnl_lock();
+	err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
+	if (err)
+		goto exit_unlock;
+
+	env->dev_ops = data.verifier.ops;
+
+	env->prog->aux->offload->dev_state = true;
+	env->prog->aux->offload->verifier_running = true;
+exit_unlock:
+	rtnl_unlock();
+	return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct netdev_bpf data = {};
+
+	data.offload.prog = prog;
+
+	if (offload->verifier_running)
+		wait_event(offload->verifier_done, !offload->verifier_running);
+
+	if (offload->dev_state)
+		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+
+	offload->dev_state = false;
+	list_del_init(&offload->offloads);
+	offload->netdev = NULL;
+}
+
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+
+	offload->verifier_running = false;
+	wake_up(&offload->verifier_done);
+
+	rtnl_lock();
+	__bpf_prog_offload_destroy(prog);
+	rtnl_unlock();
+
+	kfree(offload);
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct netdev_bpf data = {};
+	int ret;
+
+	data.offload.prog = prog;
+
+	offload->verifier_running = false;
+	wake_up(&offload->verifier_done);
+
+	rtnl_lock();
+	ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+	rtnl_unlock();
+
+	return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+					  const struct bpf_insn *insn)
+{
+	WARN(1, "attempt to execute device eBPF program on the host!");
+	return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+	prog->bpf_func = bpf_prog_warn_on_exec;
+
+	return bpf_prog_offload_translate(prog);
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+static int bpf_offload_notification(struct notifier_block *notifier,
+				    ulong event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct bpf_dev_offload *offload, *tmp;
+
+	ASSERT_RTNL();
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
+					 offloads) {
+			if (offload->netdev == netdev)
+				__bpf_prog_offload_destroy(offload->prog);
+		}
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block bpf_offload_notifier = {
+	.notifier_call = bpf_offload_notification,
+};
+
+static int __init bpf_offload_init(void)
+{
+	register_netdevice_notifier(&bpf_offload_notifier);
+	return 0;
+}
+
+subsys_initcall(bpf_offload_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 323be2473c4b..1574b9f0f24e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
 		return -EINVAL;
 
-	prog->aux->ops = bpf_prog_types[type];
+	if (!bpf_prog_is_dev_bound(prog->aux))
+		prog->aux->ops = bpf_prog_types[type];
+	else
+		prog->aux->ops = &bpf_offload_prog_ops;
 	prog->type = type;
 	return 0;
 }
@@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
@@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
-	if (type && prog->type != *type) {
+	if (attach_type && (prog->type != *attach_type || prog->aux->offload)) {
 		prog = ERR_PTR(-EINVAL);
 		goto out;
 	}
@@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_name
+#define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1152,6 +1155,12 @@ static int bpf_prog_load(union bpf_attr *attr)
 	atomic_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
+	if (attr->prog_target_ifindex) {
+		err = bpf_prog_offload_init(prog, attr);
+		if (err)
+			goto free_prog;
+	}
+
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 04357ad5a812..51aabb32ad67 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3736,10 +3736,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
 				  int insn_idx, int prev_insn_idx)
 {
-	if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
-		return 0;
+	if (env->analyzer_ops && env->analyzer_ops->insn_hook)
+		return env->analyzer_ops->insn_hook(env, insn_idx,
+						    prev_insn_idx);
+	if (env->dev_ops && env->dev_ops->insn_hook)
+		return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
 
-	return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+	return 0;
 }
 
 static int do_check(struct bpf_verifier_env *env)
@@ -4516,6 +4519,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
 
+	if (env->prog->aux->offload) {
+		ret = bpf_prog_offload_verifier_prep(env);
+		if (ret)
+			goto err_unlock;
+	}
+
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;
-- 
cgit v1.2.3


From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:18 -0700
Subject: bpf: report offload info to user space

Extend struct bpf_prog_info to contain information about program
being bound to a device.  Since the netdev may get destroyed while
program still exists we need a flag to indicate the program is
loaded for a device, even if the device is gone.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  6 ++++++
 kernel/bpf/offload.c     | 12 ++++++++++++
 kernel/bpf/syscall.c     |  5 +++++
 4 files changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e45d43f9ec92..98bacd0fa5cc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -506,6 +506,7 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
+u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 80d191a93fb0..4455dd195201 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -895,6 +895,10 @@ enum sk_action {
 
 #define BPF_TAG_SIZE	8
 
+enum bpf_prog_status {
+	BPF_PROG_STATUS_DEV_BOUND	= (1 << 0),
+};
+
 struct bpf_prog_info {
 	__u32 type;
 	__u32 id;
@@ -908,6 +912,8 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u32 status;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 5553e0e2f8b1..2816feb38be1 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -144,6 +144,18 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
+u32 bpf_prog_offload_ifindex(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	u32 ifindex;
+
+	rtnl_lock();
+	ifindex = offload->netdev ? offload->netdev->ifindex : 0;
+	rtnl_unlock();
+
+	return ifindex;
+}
+
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1574b9f0f24e..3217c20ea91b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1592,6 +1592,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
+	if (bpf_prog_is_dev_bound(prog->aux)) {
+		info.status |= BPF_PROG_STATUS_DEV_BOUND;
+		info.ifindex = bpf_prog_offload_ifindex(prog);
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.2.3


From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:20 -0700
Subject: xdp: allow attaching programs loaded for specific device

Pass the netdev pointer to bpf_prog_get_type().  This way
BPF code can decide whether the device matches what the
code was loaded/translated for.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h  | 10 ++++++++++
 kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++----
 net/core/dev.c       |  6 +++++-
 3 files changed, 44 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 98bacd0fa5cc..c397934f91dd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -335,6 +335,8 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+				       struct net_device *netdev);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
 struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
@@ -428,6 +430,14 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
+
+static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
+						     enum bpf_prog_type type,
+						     struct net_device *netdev)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
 							  int i)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3217c20ea91b..68f9123acd39 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,22 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type)
+static bool bpf_prog_can_attach(struct bpf_prog *prog,
+				enum bpf_prog_type *attach_type,
+				struct net_device *netdev)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+
+	if (prog->type != *attach_type)
+		return false;
+	if (offload && offload->netdev != netdev)
+		return false;
+
+	return true;
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
+				       struct net_device *netdev)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
@@ -1065,7 +1080,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type)
 	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
-	if (attach_type && (prog->type != *attach_type || prog->aux->offload)) {
+	if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) {
 		prog = ERR_PTR(-EINVAL);
 		goto out;
 	}
@@ -1078,12 +1093,12 @@ out:
 
 struct bpf_prog *bpf_prog_get(u32 ufd)
 {
-	return __bpf_prog_get(ufd, NULL);
+	return __bpf_prog_get(ufd, NULL, NULL);
 }
 
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 {
-	struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL);
 
 	if (!IS_ERR(prog))
 		trace_bpf_prog_get_type(prog);
@@ -1091,6 +1106,16 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+				       struct net_device *netdev)
+{
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev);
+
+	if (!IS_ERR(prog))
+		trace_bpf_prog_get_type(prog);
+	return prog;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 10cde58d3275..30b5fe32c525 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7157,7 +7157,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		    __dev_xdp_attached(dev, bpf_op, NULL))
 			return -EBUSY;
 
-		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+		if (bpf_op == ops->ndo_bpf)
+			prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+						     dev);
+		else
+			prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
-- 
cgit v1.2.3


From 6c8dfe21c435cf2953e3cee43e12180cbc4f0820 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:21 -0700
Subject: cls_bpf: allow attaching programs loaded for specific device

If TC program is loaded with skip_sw flag, we should allow
the device-specific programs to be accepted.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c |  1 +
 net/sched/cls_bpf.c  | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 68f9123acd39..416d70cdfc76 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1115,6 +1115,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 		trace_bpf_prog_get_type(prog);
 	return prog;
 }
+EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
 
 /* last field in 'union bpf_attr' used by this command */
 #define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index bc3edde1b9d7..dc9bd9a0070b 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -374,7 +374,7 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
 }
 
 static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
-				 const struct tcf_proto *tp)
+				 u32 gen_flags, const struct tcf_proto *tp)
 {
 	struct bpf_prog *fp;
 	char *name = NULL;
@@ -382,7 +382,11 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 
 	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
 
-	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
+	if (gen_flags & TCA_CLS_FLAGS_SKIP_SW)
+		fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS,
+					   qdisc_dev(tp->q));
+	else
+		fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
@@ -440,7 +444,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 	prog->gen_flags = gen_flags;
 
 	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
-		       cls_bpf_prog_from_efd(tb, prog, tp);
+		       cls_bpf_prog_from_efd(tb, prog, gen_flags, tp);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.2.3


From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:30 -0700
Subject: bpf: remove old offload/analyzer

Thanks to the ability to load a program for a specific device,
running verifier twice is no longer needed.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |  5 ---
 kernel/bpf/verifier.c        | 75 --------------------------------------------
 net/core/filter.c            | 42 -------------------------
 3 files changed, 122 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e45011dbc02d..07b96aaca256 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -152,9 +152,7 @@ struct bpf_verifier_env {
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
-	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
 	const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
-	void *analyzer_priv; /* pointer to external analyzer's private data */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
 	u32 id_gen;			/* used to generate unique reg IDs */
@@ -179,7 +177,4 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
 }
 #endif
 
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-		 void *priv);
-
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 51aabb32ad67..add845fe788a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -949,9 +949,6 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		 */
 		*reg_type = info.reg_type;
 
-		if (env->analyzer_ops)
-			return 0;
-
 		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
@@ -3736,9 +3733,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
 				  int insn_idx, int prev_insn_idx)
 {
-	if (env->analyzer_ops && env->analyzer_ops->insn_hook)
-		return env->analyzer_ops->insn_hook(env, insn_idx,
-						    prev_insn_idx);
 	if (env->dev_ops && env->dev_ops->insn_hook)
 		return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
 
@@ -4601,72 +4595,3 @@ err_free_env:
 	kfree(env);
 	return ret;
 }
-
-static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = {
-#ifdef CONFIG_NET
-	[BPF_PROG_TYPE_XDP]		= &xdp_analyzer_ops,
-	[BPF_PROG_TYPE_SCHED_CLS]	= &tc_cls_act_analyzer_ops,
-#endif
-};
-
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-		 void *priv)
-{
-	struct bpf_verifier_env *env;
-	int ret;
-
-	if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) ||
-	    !bpf_analyzer_ops[prog->type])
-		return -EOPNOTSUPP;
-
-	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
-	if (!env)
-		return -ENOMEM;
-
-	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
-				     prog->len);
-	ret = -ENOMEM;
-	if (!env->insn_aux_data)
-		goto err_free_env;
-	env->prog = prog;
-	env->ops = bpf_analyzer_ops[env->prog->type];
-	env->analyzer_ops = ops;
-	env->analyzer_priv = priv;
-
-	/* grab the mutex to protect few globals used by verifier */
-	mutex_lock(&bpf_verifier_lock);
-
-	env->strict_alignment = false;
-	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-		env->strict_alignment = true;
-
-	env->explored_states = kcalloc(env->prog->len,
-				       sizeof(struct bpf_verifier_state_list *),
-				       GFP_KERNEL);
-	ret = -ENOMEM;
-	if (!env->explored_states)
-		goto skip_full_check;
-
-	ret = check_cfg(env);
-	if (ret < 0)
-		goto skip_full_check;
-
-	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
-	ret = do_check(env);
-	if (env->cur_state) {
-		free_verifier_state(env->cur_state, true);
-		env->cur_state = NULL;
-	}
-
-skip_full_check:
-	while (!pop_stack(env, NULL, NULL));
-	free_states(env);
-
-	mutex_unlock(&bpf_verifier_lock);
-	vfree(env->insn_aux_data);
-err_free_env:
-	kfree(env);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(bpf_analyzer);
diff --git a/net/core/filter.c b/net/core/filter.c
index a0112168d6f9..1afa17935954 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3777,25 +3777,6 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
-static bool
-tc_cls_act_is_valid_access_analyzer(int off, int size,
-				    enum bpf_access_type type,
-				    struct bpf_insn_access_aux *info)
-{
-	switch (off) {
-	case offsetof(struct sk_buff, len):
-		return true;
-	case offsetof(struct sk_buff, data):
-		info->reg_type = PTR_TO_PACKET;
-		return true;
-	case offsetof(struct sk_buff, cb) +
-	     offsetof(struct bpf_skb_data_end, data_end):
-		info->reg_type = PTR_TO_PACKET_END;
-		return true;
-	}
-	return false;
-}
-
 static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
@@ -3830,21 +3811,6 @@ static bool xdp_is_valid_access(int off, int size,
 	return __is_valid_xdp_access(off, size);
 }
 
-static bool xdp_is_valid_access_analyzer(int off, int size,
-					 enum bpf_access_type type,
-					 struct bpf_insn_access_aux *info)
-{
-	switch (off) {
-	case offsetof(struct xdp_buff, data):
-		info->reg_type = PTR_TO_PACKET;
-		return true;
-	case offsetof(struct xdp_buff, data_end):
-		info->reg_type = PTR_TO_PACKET_END;
-		return true;
-	}
-	return false;
-}
-
 void bpf_warn_invalid_xdp_action(u32 act)
 {
 	const u32 act_max = XDP_REDIRECT;
@@ -4516,10 +4482,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.gen_prologue		= tc_cls_act_prologue,
 };
 
-const struct bpf_verifier_ops tc_cls_act_analyzer_ops = {
-	.is_valid_access	= tc_cls_act_is_valid_access_analyzer,
-};
-
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
@@ -4530,10 +4492,6 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
 	.convert_ctx_access	= xdp_convert_ctx_access,
 };
 
-const struct bpf_verifier_ops xdp_analyzer_ops = {
-	.is_valid_access	= xdp_is_valid_access_analyzer,
-};
-
 const struct bpf_prog_ops xdp_prog_ops = {
 	.test_run		= bpf_prog_test_run_xdp,
 };
-- 
cgit v1.2.3


From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Sun, 5 Nov 2017 08:15:32 -0500
Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2

Cgroup v2 lacks the device controller, provided by cgroup v1.
This patch adds a new eBPF program type, which in combination
of previously added ability to attach multiple eBPF programs
to a cgroup, will provide a similar functionality, but with some
additional flexibility.

This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type.
A program takes major and minor device numbers, device type
(block/character) and access type (mknod/read/write) as parameters
and returns an integer which defines if the operation should be
allowed or terminated with -EPERM.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h     | 15 ++++++++++
 include/linux/bpf_types.h      |  3 ++
 include/linux/device_cgroup.h  |  8 ++++-
 include/uapi/linux/bpf.h       | 15 ++++++++++
 kernel/bpf/cgroup.c            | 67 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  7 +++++
 kernel/bpf/verifier.c          |  1 +
 tools/include/uapi/linux/bpf.h | 15 ++++++++++
 8 files changed, 130 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 87a7db9feb38..a7f16e0f8d68 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
 				     enum bpf_attach_type type);
 
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+				      short access, enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	}								       \
 	__ret;								       \
 })
+
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access)	      \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_check_dev_permission(type, major, minor, \
+							  access,	      \
+							  BPF_CGROUP_DEVICE); \
+									      \
+	__ret;								      \
+})
 #else
 
 struct cgroup_bpf {};
@@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 53c5b9ad7220..978c1d9c9383 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #endif
+#ifdef CONFIG_CGROUP_BPF
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index 2d93d7ecd479..8557efe096dc 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/fs.h>
+#include <linux/bpf-cgroup.h>
 
 #define DEVCG_ACC_MKNOD 1
 #define DEVCG_ACC_READ  2
@@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor,
 { return 0; }
 #endif
 
-#ifdef CONFIG_CGROUP_DEVICE
+#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)
 static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
 					     short access)
 {
+	int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access);
+
+	if (rc)
+		return -EPERM;
+
 	return __devcgroup_check_permission(type, major, minor, access);
 }
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4455dd195201..e880ae6434ee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -132,6 +132,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_XMIT,
 	BPF_PROG_TYPE_SOCK_OPS,
 	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
 };
 
 enum bpf_attach_type {
@@ -141,6 +142,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_SOCK_OPS,
 	BPF_SK_SKB_STREAM_PARSER,
 	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -991,4 +993,17 @@ struct bpf_perf_event_value {
 	__u64 running;
 };
 
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	__u32 access_type; /* (access << 16) | type */
+	__u32 major;
+	__u32 minor;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 3db5a17fcfe8..b789ab78d28f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -522,3 +522,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+				      short access, enum bpf_attach_type type)
+{
+	struct cgroup *cgrp;
+	struct bpf_cgroup_dev_ctx ctx = {
+		.access_type = (access << 16) | dev_type,
+		.major = major,
+		.minor = minor,
+	};
+	int allow = 1;
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+				   BPF_PROG_RUN);
+	rcu_read_unlock();
+
+	return !allow;
+}
+EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
+
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+	default:
+		return NULL;
+	}
+}
+
+static bool cgroup_dev_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+		return false;
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+	.get_func_proto		= cgroup_dev_func_proto,
+	.is_valid_access	= cgroup_dev_is_valid_access,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 416d70cdfc76..09badc37e864 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1326,6 +1326,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_CGROUP_DEVICE:
+		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, true);
@@ -1378,6 +1381,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_CGROUP_DEVICE:
+		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, false);
@@ -1420,6 +1426,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET_EGRESS:
 	case BPF_CGROUP_INET_SOCK_CREATE:
 	case BPF_CGROUP_SOCK_OPS:
+	case BPF_CGROUP_DEVICE:
 		break;
 	default:
 		return -EINVAL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index add845fe788a..4a942e2e753d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3124,6 +3124,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_CGROUP_SKB:
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_CGROUP_DEVICE:
 		break;
 	default:
 		return 0;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e92f62cf933a..b280f37cd057 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -131,6 +131,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_XMIT,
 	BPF_PROG_TYPE_SOCK_OPS,
 	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
 };
 
 enum bpf_attach_type {
@@ -140,6 +141,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_SOCK_OPS,
 	BPF_SK_SKB_STREAM_PARSER,
 	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -990,4 +992,17 @@ struct bpf_perf_event_value {
 	__u64 running;
 };
 
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	__u32 access_type; /* (access << 16) | type */
+	__u32 major;
+	__u32 minor;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 9a19b463863e757e649c37af245b6af101410c1e Mon Sep 17 00:00:00 2001
From: Wang Long <wanglong19@meituan.com>
Date: Thu, 2 Nov 2017 23:05:12 -0400
Subject: workqueue: Fix comment for unbound workqueue's attrbutes

Signed-off-by: Wang Long <wanglong19@meituan.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..5f99851bff09 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5013,9 +5013,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
  *
  * Unbound workqueues have the following extra attributes.
  *
- *  id		RO int	: the associated pool ID
+ *  pool_ids	RO int	: the associated pool IDs for each node
  *  nice	RW int	: nice value of the workers
  *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
+ *  numa	RW bool	: whether enable NUMA affinity
  */
 struct wq_device {
 	struct workqueue_struct		*wq;
-- 
cgit v1.2.3


From 01ee6cfb1483fe57c9cbd8e73817dfbf9bacffd3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 6 Nov 2017 13:30:28 -0500
Subject: cgroup: export list of delegatable control files using sysfs

Delegatable cgroup v2 control files may require special handling
(e.g. chowning), and the exact list of such files varies between
kernel versions (and likely to be extended in the future).

To guarantee correctness of this list and simplify the life
of userspace (systemd, first of all), let's export the list
via /sys/kernel/cgroup/delegate pseudo-file.

Format is siple: each control file name is printed on a new line.
Example:
  $ cat /sys/kernel/cgroup/delegate
  cgroup.procs
  cgroup.subtree_control

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: kernel-team@fb.com
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6ed725f36d9..eed92ed624e5 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5832,3 +5832,64 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
 	return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+				      ssize_t size, const char *prefix)
+{
+	struct cftype *cft;
+	ssize_t ret = 0;
+
+	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+			continue;
+
+		if (prefix)
+			ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+		if (unlikely(ret >= size)) {
+			WARN_ON(1);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+			      char *buf)
+{
+	struct cgroup_subsys *ss;
+	int ssid;
+	ssize_t ret = 0;
+
+	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+				     NULL);
+
+	for_each_subsys(ss, ssid)
+		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+					      PAGE_SIZE - ret,
+					      cgroup_subsys_name[ssid]);
+
+	return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+	&cgroup_delegate_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+	.attrs = cgroup_sysfs_attrs,
+	.name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
-- 
cgit v1.2.3


From 5f2e673405b742be64e7c3604ed4ed3ac14f35ce Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 6 Nov 2017 13:30:29 -0500
Subject: cgroup: export list of cgroups v2 features using sysfs

The active development of cgroups v2 sometimes leads to a creation
of interfaces, which are not turned on by default (to provide
backward compatibility). It's handy to know from userspace, which
cgroup v2 features are supported without calculating it based
on the kernel version. So, let's export the list of such features
using /sys/kernel/cgroup/features pseudo-file.

The list is hardcoded and has to be extended when new functionality
is added. Each feature is printed on a new line.

Example:
  $ cat /sys/kernel/cgroup/features
  nsdelegate

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: kernel-team@fb.com
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index eed92ed624e5..69e65d28fe98 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5877,8 +5877,16 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
 }
 static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
 
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
 static struct attribute *cgroup_sysfs_attrs[] = {
 	&cgroup_delegate_attr.attr,
+	&cgroup_features_attr.attr,
 	NULL,
 };
 
-- 
cgit v1.2.3


From 95b982b45122c57da2ee0b46cce70775e1d987af Mon Sep 17 00:00:00 2001
From: Rajat Jain <rajatja@google.com>
Date: Tue, 31 Oct 2017 14:44:24 -0700
Subject: PM / s2idle: Clear the events_check_enabled flag

Problem: This flag does not get cleared currently in the suspend or
resume path in the following cases:

 * In case some driver's suspend routine returns an error.
 * Successful s2idle case
 * etc?

Why is this a problem: What happens is that the next suspend attempt
could fail even though the user did not enable the flag by writing to
/sys/power/wakeup_count. This is 1 use case how the issue can be seen
(but similar use case with driver suspend failure can be thought of):

 1. Read /sys/power/wakeup_count
 2. echo count > /sys/power/wakeup_count
 3. echo freeze > /sys/power/wakeup_count
 4. Let the system suspend, and wakeup the system using some wake source
    that calls pm_wakeup_event() e.g. power button or something.
 5. Note that the combined wakeup count would be incremented due
    to the pm_wakeup_event() in the resume path.
 6. After resuming the events_check_enabled flag is still set.

At this point if the user attempts to freeze again (without writing to
/sys/power/wakeup_count), the suspend would fail even though there has
been no wake event since the past resume.

Address that by clearing the flag just before a resume is completed,
so that it is always cleared for the corner cases mentioned above.

Signed-off-by: Rajat Jain <rajatja@google.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/suspend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ccd2d20e6b06..0685c4499431 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -437,7 +437,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 			error = suspend_ops->enter(state);
 			trace_suspend_resume(TPS("machine_suspend"),
 				state, false);
-			events_check_enabled = false;
 		} else if (*wakeup) {
 			error = -EBUSY;
 		}
@@ -582,6 +581,7 @@ static int enter_state(suspend_state_t state)
 	pm_restore_gfp_mask();
 
  Finish:
+	events_check_enabled = false;
 	pm_pr_dbg("Finishing wakeup.\n");
 	suspend_finish();
  Unlock:
-- 
cgit v1.2.3


From 07458f6a5171d97511dfbdf6ce549ed2ca0280c7 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 8 Nov 2017 20:23:55 +0530
Subject: cpufreq: schedutil: Reset cached_raw_freq when not in sync with
 next_freq

'cached_raw_freq' is used to get the next frequency quickly but should
always be in sync with sg_policy->next_freq. There is a case where it is
not and in such cases it should be reset to avoid switching to incorrect
frequencies.

Consider this case for example:

 - policy->cur is 1.2 GHz (Max)
 - New request comes for 780 MHz and we store that in cached_raw_freq.
 - Based on 780 MHz, we calculate the effective frequency as 800 MHz.
 - We then see the CPU wasn't idle recently and choose to keep the next
   freq as 1.2 GHz.
 - Now we have cached_raw_freq is 780 MHz and sg_policy->next_freq is
   1.2 GHz.
 - Now if the utilization doesn't change in then next request, then the
   next target frequency will still be 780 MHz and it will match with
   cached_raw_freq. But we will choose 1.2 GHz instead of 800 MHz here.

Fixes: b7eaf1aab9f8 (cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely)
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: 4.12+ <stable@vger.kernel.org> # 4.12+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index ba0da243fdd8..2f52ec0f1539 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -282,8 +282,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 		 * Do not reduce the frequency if the CPU has not been idle
 		 * recently, as the reduction is likely to be premature then.
 		 */
-		if (busy && next_f < sg_policy->next_freq)
+		if (busy && next_f < sg_policy->next_freq) {
 			next_f = sg_policy->next_freq;
+
+			/* Reset cached freq as next_freq has changed */
+			sg_policy->cached_raw_freq = 0;
+		}
 	}
 	sugov_update_commit(sg_policy, time, next_f);
 }
-- 
cgit v1.2.3


From 173743dd99a49c956b124a74c8aacb0384739a4c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:44:34 -0400
Subject: audit: ensure that 'audit=1' actually enables audit for PID 1

Prior to this patch we enabled audit in audit_init(), which is too
late for PID 1 as the standard initcalls are run after the PID 1 task
is forked.  This means that we never allocate an audit_context (see
audit_alloc()) for PID 1 and therefore miss a lot of audit events
generated by PID 1.

This patch enables audit as early as possible to help ensure that when
PID 1 is forked it can allocate an audit_context if required.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index be1c28fd4d57..ec3d0802734d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -85,13 +85,13 @@ static int	audit_initialized;
 #define AUDIT_OFF	0
 #define AUDIT_ON	1
 #define AUDIT_LOCKED	2
-u32		audit_enabled;
-u32		audit_ever_enabled;
+u32		audit_enabled = AUDIT_OFF;
+u32		audit_ever_enabled = !!AUDIT_OFF;
 
 EXPORT_SYMBOL_GPL(audit_enabled);
 
 /* Default state when kernel boots without any parameters. */
-static u32	audit_default;
+static u32	audit_default = AUDIT_OFF;
 
 /* If auditing cannot proceed, audit_failure selects what happens. */
 static u32	audit_failure = AUDIT_FAIL_PRINTK;
@@ -1549,8 +1549,6 @@ static int __init audit_init(void)
 	register_pernet_subsys(&audit_net_ops);
 
 	audit_initialized = AUDIT_INITIALIZED;
-	audit_enabled = audit_default;
-	audit_ever_enabled |= !!audit_default;
 
 	kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 	if (IS_ERR(kauditd_task)) {
@@ -1572,6 +1570,8 @@ static int __init audit_enable(char *str)
 	audit_default = !!simple_strtol(str, NULL, 0);
 	if (!audit_default)
 		audit_initialized = AUDIT_DISABLED;
+	audit_enabled = audit_default;
+	audit_ever_enabled = !!audit_enabled;
 
 	pr_info("%s\n", audit_default ?
 		"enabled (after initialization)" : "disabled (until reboot)");
-- 
cgit v1.2.3


From be4104abf25c63ccef36e1e06262c73c0df9fd60 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:44:44 -0400
Subject: audit: initialize the audit subsystem as early as possible

We can't initialize the audit subsystem until after the network layer
is initialized (core_initcall), but do it soon after.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index ec3d0802734d..db71c45ea6f8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1562,7 +1562,7 @@ static int __init audit_init(void)
 
 	return 0;
 }
-__initcall(audit_init);
+postcore_initcall(audit_init);
 
 /* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
 static int __init audit_enable(char *str)
-- 
cgit v1.2.3


From 80ab4df62706b882922c3bb0b05ce2c8ab10828a Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:44:51 -0400
Subject: audit: don't use simple_strtol() anymore

The simple_strtol() function is deprecated, use kstrtol() instead.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index db71c45ea6f8..e75918f79a83 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1567,8 +1567,13 @@ postcore_initcall(audit_init);
 /* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
 static int __init audit_enable(char *str)
 {
-	audit_default = !!simple_strtol(str, NULL, 0);
-	if (!audit_default)
+	long val;
+
+	if (kstrtol(str, 0, &val))
+		panic("audit: invalid 'audit' parameter value (%s)\n", str);
+	audit_default = (val ? AUDIT_ON : AUDIT_OFF);
+
+	if (audit_default == AUDIT_OFF)
 		audit_initialized = AUDIT_DISABLED;
 	audit_enabled = audit_default;
 	audit_ever_enabled = !!audit_enabled;
-- 
cgit v1.2.3


From b3b4fdf6a8ae9ea51ea274e3f2f8a6a58b98cc0b Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:44:57 -0400
Subject: audit: convert audit_ever_enabled to a boolean

We were treating it as a boolean, let's make it a boolean to help
avoid future mistakes.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 2 +-
 kernel/audit.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index e75918f79a83..f0cf9bfc806c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -86,7 +86,7 @@ static int	audit_initialized;
 #define AUDIT_ON	1
 #define AUDIT_LOCKED	2
 u32		audit_enabled = AUDIT_OFF;
-u32		audit_ever_enabled = !!AUDIT_OFF;
+bool		audit_ever_enabled = !!AUDIT_OFF;
 
 EXPORT_SYMBOL_GPL(audit_enabled);
 
diff --git a/kernel/audit.h b/kernel/audit.h
index b331d9b83f63..6bdaf6bd377e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -208,7 +208,7 @@ struct audit_context {
 	struct audit_proctitle proctitle;
 };
 
-extern u32 audit_ever_enabled;
+extern bool audit_ever_enabled;
 
 extern void audit_copy_inode(struct audit_names *name,
 			     const struct dentry *dentry,
-- 
cgit v1.2.3


From 5d842a5b77a58160625548fa6be2dc159179ffdb Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:45:05 -0400
Subject: audit: use audit_set_enabled() in audit_enable()

Use audit_set_enabled() to enable auditing during early boot.  This
obviously won't emit an audit change record, but it will work anyway
and should help prevent in future problems by consolidating the
enable/disable code in one function.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index f0cf9bfc806c..67b3863261d4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1575,8 +1575,8 @@ static int __init audit_enable(char *str)
 
 	if (audit_default == AUDIT_OFF)
 		audit_initialized = AUDIT_DISABLED;
-	audit_enabled = audit_default;
-	audit_ever_enabled = !!audit_enabled;
+	if (audit_set_enabled(audit_default))
+		panic("audit: error setting audit state (%d)\n", audit_default);
 
 	pr_info("%s\n", audit_default ?
 		"enabled (after initialization)" : "disabled (until reboot)");
-- 
cgit v1.2.3


From 33e8a907804428109ce1d12301c3365d619cc4df Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Tue, 17 Oct 2017 18:29:22 -0400
Subject: audit: Allow auditd to set pid to 0 to end auditing

The API to end auditing has historically been for auditd to set the
pid to 0. This patch restores that functionality.

See: https://github.com/linux-audit/audit-kernel/issues/69

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 67b3863261d4..64e1d0ec19de 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			pid_t auditd_pid;
 			struct pid *req_pid = task_tgid(current);
 
-			/* sanity check - PID values must match */
-			if (new_pid != pid_vnr(req_pid))
+			/* Sanity check - PID values must match. Setting
+			 * pid to 0 is how auditd ends auditing. */
+			if (new_pid && (new_pid != pid_vnr(req_pid)))
 				return -EINVAL;
 
 			/* test the auditd connection */
 			audit_replace(req_pid);
 
 			auditd_pid = auditd_pid_vnr();
-			/* only the current auditd can unregister itself */
-			if ((!new_pid) && (new_pid != auditd_pid)) {
-				audit_log_config_change("audit_pid", new_pid,
-							auditd_pid, 0);
-				return -EACCES;
-			}
-			/* replacing a healthy auditd is not allowed */
-			if (auditd_pid && new_pid) {
-				audit_log_config_change("audit_pid", new_pid,
-							auditd_pid, 0);
-				return -EEXIST;
+			if (auditd_pid) {
+				/* replacing a healthy auditd is not allowed */
+				if (new_pid) {
+					audit_log_config_change("audit_pid",
+							new_pid, auditd_pid, 0);
+					return -EEXIST;
+				}
+				/* only current auditd can unregister itself */
+				if (pid_vnr(req_pid) != auditd_pid) {
+					audit_log_config_change("audit_pid",
+							new_pid, auditd_pid, 0);
+					return -EACCES;
+				}
 			}
 
 			if (new_pid) {
-- 
cgit v1.2.3


From f7b53637c090bd8ce2dc74ad0f3aa1898aff2524 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 24 Oct 2017 18:52:31 -0700
Subject: Audit: remove unused audit_log_secctx function

The function audit_log_secctx() is unused in the upstream kernel.
All it does is wrap another function that doesn't need wrapping.
It claims to give you the SELinux context, but that is not true if
you are using a different security module.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h |  8 --------
 kernel/audit.c        | 26 --------------------------
 2 files changed, 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 2150bdccfbab..fa1b068d911d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -149,12 +149,6 @@ extern void		    audit_log_key(struct audit_buffer *ab,
 extern void		    audit_log_link_denied(const char *operation,
 						  const struct path *link);
 extern void		    audit_log_lost(const char *message);
-#ifdef CONFIG_SECURITY
-extern void 		    audit_log_secctx(struct audit_buffer *ab, u32 secid);
-#else
-static inline void	    audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{ }
-#endif
 
 extern int audit_log_task_context(struct audit_buffer *ab);
 extern void audit_log_task_info(struct audit_buffer *ab,
@@ -203,8 +197,6 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key)
 static inline void audit_log_link_denied(const char *string,
 					 const struct path *link)
 { }
-static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{ }
 static inline int audit_log_task_context(struct audit_buffer *ab)
 {
 	return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 64e1d0ec19de..227db99b0f19 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2345,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 	}
 }
 
-#ifdef CONFIG_SECURITY
-/**
- * audit_log_secctx - Converts and logs SELinux context
- * @ab: audit_buffer
- * @secid: security number
- *
- * This is a helper function that calls security_secid_to_secctx to convert
- * secid to secctx and then adds the (converted) SELinux context to the audit
- * log by calling audit_log_format, thus also preventing leak of internal secid
- * to userspace. If secid cannot be converted audit_panic is called.
- */
-void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{
-	u32 len;
-	char *secctx;
-
-	if (security_secid_to_secctx(secid, &secctx, &len)) {
-		audit_panic("Cannot convert secid to context");
-	} else {
-		audit_log_format(ab, " obj=%s", secctx);
-		security_release_secctx(secctx, len);
-	}
-}
-EXPORT_SYMBOL(audit_log_secctx);
-#endif
-
 EXPORT_SYMBOL(audit_log_start);
 EXPORT_SYMBOL(audit_log_end);
 EXPORT_SYMBOL(audit_log_format);
-- 
cgit v1.2.3


From 42d5e37654e4cdb9fb2e2f3ab30045fee35c42d8 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 23 Aug 2017 07:03:39 -0400
Subject: audit: filter PATH records keyed on filesystem magic

Tracefs or debugfs were causing hundreds to thousands of PATH records to
be associated with the init_module and finit_module SYSCALL records on a
few modules when the following rule was in place for startup:
	-a always,exit -F arch=x86_64 -S init_module -F key=mod-load

Provide a method to ignore these large number of PATH records from
overwhelming the logs if they are not of interest.  Introduce a new
filter list "AUDIT_FILTER_FS", with a new field type AUDIT_FSTYPE,
which keys off the filesystem 4-octet hexadecimal magic identifier to
filter specific filesystem PATH records.

An example rule would look like:
	-a never,filesystem -F fstype=0x74726163 -F key=ignore_tracefs
	-a never,filesystem -F fstype=0x64626720 -F key=ignore_debugfs

Arguably the better way to address this issue is to disable tracefs and
debugfs on boot from production systems.

See: https://github.com/linux-audit/audit-kernel/issues/16
See: https://github.com/linux-audit/audit-userspace/issues/8
Test case: https://github.com/linux-audit/audit-testsuite/issues/42

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: fixed the whitespace damage in kernel/auditsc.c]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/uapi/linux/audit.h |  8 ++++++--
 kernel/auditfilter.c       | 39 ++++++++++++++++++++++++++++++++-------
 kernel/auditsc.c           | 23 +++++++++++++++++++++++
 3 files changed, 61 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 0714a66f0e0c..be711341938e 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -155,8 +155,9 @@
 #define AUDIT_FILTER_WATCH	0x03	/* Apply rule to file system watches */
 #define AUDIT_FILTER_EXIT	0x04	/* Apply rule at syscall exit */
 #define AUDIT_FILTER_TYPE	0x05	/* Apply rule at audit_log_start */
+#define AUDIT_FILTER_FS		0x06	/* Apply rule at __audit_inode_child */
 
-#define AUDIT_NR_FILTERS	6
+#define AUDIT_NR_FILTERS	7
 
 #define AUDIT_FILTER_PREPEND	0x10	/* Prepend to front of list */
 
@@ -256,6 +257,7 @@
 #define AUDIT_OBJ_LEV_HIGH	23
 #define AUDIT_LOGINUID_SET	24
 #define AUDIT_SESSIONID	25	/* Session ID */
+#define AUDIT_FSTYPE	26	/* FileSystem Type */
 
 				/* These are ONLY useful when checking
 				 * at syscall exit time (AUDIT_AT_EXIT). */
@@ -335,13 +337,15 @@ enum {
 #define AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND	0x00000008
 #define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER	0x00000010
 #define AUDIT_FEATURE_BITMAP_LOST_RESET		0x00000020
+#define AUDIT_FEATURE_BITMAP_FILTER_FS		0x00000040
 
 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \
 				  AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \
 				  AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH | \
 				  AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND | \
 				  AUDIT_FEATURE_BITMAP_SESSIONID_FILTER | \
-				  AUDIT_FEATURE_BITMAP_LOST_RESET)
+				  AUDIT_FEATURE_BITMAP_LOST_RESET | \
+				  AUDIT_FEATURE_BITMAP_FILTER_FS)
 
 /* deprecated: AUDIT_VERSION_* */
 #define AUDIT_VERSION_LATEST 		AUDIT_FEATURE_BITMAP_ALL
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0b0aa5854dac..4a1758adb222 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[3]),
 	LIST_HEAD_INIT(audit_filter_list[4]),
 	LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
+	LIST_HEAD_INIT(audit_filter_list[6]),
+#if AUDIT_NR_FILTERS != 7
 #error Fix audit_filter_list initialiser
 #endif
 };
@@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_rules_list[3]),
 	LIST_HEAD_INIT(audit_rules_list[4]),
 	LIST_HEAD_INIT(audit_rules_list[5]),
+	LIST_HEAD_INIT(audit_rules_list[6]),
 };
 
 DEFINE_MUTEX(audit_filter_mutex);
@@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
 #endif
 	case AUDIT_FILTER_USER:
 	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		;
 	}
 	if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 		    entry->rule.listnr != AUDIT_FILTER_USER)
 			return -EINVAL;
 		break;
+	case AUDIT_FSTYPE:
+		if (entry->rule.listnr != AUDIT_FILTER_FS)
+			return -EINVAL;
+		break;
+	}
+
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_FS:
+		switch(f->type) {
+		case AUDIT_FSTYPE:
+		case AUDIT_FILTERKEY:
+			break;
+		default:
+			return -EINVAL;
+		}
 	}
 
 	switch(f->type) {
@@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 			return -EINVAL;
 	/* FALL THROUGH */
 	case AUDIT_ARCH:
+	case AUDIT_FSTYPE:
 		if (f->op != Audit_not_equal && f->op != Audit_equal)
 			return -EINVAL;
 		break;
@@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
-	/* If either of these, don't count towards total */
-	if (entry->rule.listnr == AUDIT_FILTER_USER ||
-		entry->rule.listnr == AUDIT_FILTER_TYPE)
+	/* If any of these, don't count towards total */
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		dont_count = 1;
+	}
 #endif
 
 	mutex_lock(&audit_filter_mutex);
@@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry)
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
-	/* If either of these, don't count towards total */
-	if (entry->rule.listnr == AUDIT_FILTER_USER ||
-		entry->rule.listnr == AUDIT_FILTER_TYPE)
+	/* If any of these, don't count towards total */
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		dont_count = 1;
+	}
 #endif
 
 	mutex_lock(&audit_filter_mutex);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index aac1a41f82bd..c9bb29e17335 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent,
 	struct inode *inode = d_backing_inode(dentry);
 	const char *dname = dentry->d_name.name;
 	struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+	struct audit_entry *e;
+	struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+	int i;
 
 	if (!context->in_syscall)
 		return;
 
+	rcu_read_lock();
+	if (!list_empty(list)) {
+		list_for_each_entry_rcu(e, list, list) {
+			for (i = 0; i < e->rule.field_count; i++) {
+				struct audit_field *f = &e->rule.fields[i];
+
+				if (f->type == AUDIT_FSTYPE) {
+					if (audit_comparator(parent->i_sb->s_magic,
+					    f->op, f->val)) {
+						if (e->rule.action == AUDIT_NEVER) {
+							rcu_read_unlock();
+							return;
+						}
+					}
+				}
+			}
+		}
+	}
+	rcu_read_unlock();
+
 	if (inode)
 		handle_one(inode);
 
-- 
cgit v1.2.3


From 1f2cac107c591c24b60b115d6050adc213d10fc0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 5 Nov 2017 09:13:48 -0700
Subject: blktrace: fix unlocked access to init/start-stop/teardown

sg.c calls into the blktrace functions without holding the proper queue
mutex for doing setup, start/stop, or teardown.

Add internal unlocked variants, and export the ones that do the proper
locking.

Fixes: 6da127ad0918 ("blktrace: Add blktrace ioctls to SCSI generic devices")
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 58 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 45a3928544ce..ea57dd94b2b2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -336,7 +336,7 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 		blk_unregister_tracepoints();
 }
 
-int blk_trace_remove(struct request_queue *q)
+static int __blk_trace_remove(struct request_queue *q)
 {
 	struct blk_trace *bt;
 
@@ -349,6 +349,17 @@ int blk_trace_remove(struct request_queue *q)
 
 	return 0;
 }
+
+int blk_trace_remove(struct request_queue *q)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_remove(q);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_remove);
 
 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -550,9 +561,8 @@ err:
 	return ret;
 }
 
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-		    struct block_device *bdev,
-		    char __user *arg)
+static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			     struct block_device *bdev, char __user *arg)
 {
 	struct blk_user_trace_setup buts;
 	int ret;
@@ -571,6 +581,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 	return 0;
 }
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    struct block_device *bdev,
+		    char __user *arg)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_setup(q, name, dev, bdev, arg);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_setup);
 
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -607,7 +630,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 }
 #endif
 
-int blk_trace_startstop(struct request_queue *q, int start)
+static int __blk_trace_startstop(struct request_queue *q, int start)
 {
 	int ret;
 	struct blk_trace *bt = q->blk_trace;
@@ -646,6 +669,17 @@ int blk_trace_startstop(struct request_queue *q, int start)
 
 	return ret;
 }
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_startstop(q, start);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_startstop);
 
 /*
@@ -676,7 +710,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	switch (cmd) {
 	case BLKTRACESETUP:
 		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+		ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
 	case BLKTRACESETUP32:
@@ -687,10 +721,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	case BLKTRACESTART:
 		start = 1;
 	case BLKTRACESTOP:
-		ret = blk_trace_startstop(q, start);
+		ret = __blk_trace_startstop(q, start);
 		break;
 	case BLKTRACETEARDOWN:
-		ret = blk_trace_remove(q);
+		ret = __blk_trace_remove(q);
 		break;
 	default:
 		ret = -ENOTTY;
@@ -708,10 +742,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(struct request_queue *q)
 {
+	mutex_lock(&q->blk_trace_mutex);
+
 	if (q->blk_trace) {
-		blk_trace_startstop(q, 0);
-		blk_trace_remove(q);
+		__blk_trace_startstop(q, 0);
+		__blk_trace_remove(q);
 	}
+
+	mutex_unlock(&q->blk_trace_mutex);
 }
 
 #ifdef CONFIG_BLK_CGROUP
-- 
cgit v1.2.3


From a6da0024ffc19e0d47712bb5ca4fd083f76b07df Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 5 Nov 2017 09:16:09 -0700
Subject: blktrace: fix unlocked registration of tracepoints

We need to ensure that tracepoints are registered and unregistered
with the users of them. The existing atomic count isn't enough for
that. Add a lock around the tracepoints, so we serialize access
to them.

This fixes cases where we have multiple users setting up and
tearing down tracepoints, like this:

CPU: 0 PID: 2995 Comm: syzkaller857118 Not tainted
4.14.0-rc5-next-20171018+ #36
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:16 [inline]
  dump_stack+0x194/0x257 lib/dump_stack.c:52
  panic+0x1e4/0x41c kernel/panic.c:183
  __warn+0x1c4/0x1e0 kernel/panic.c:546
  report_bug+0x211/0x2d0 lib/bug.c:183
  fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177
  do_trap_no_signal arch/x86/kernel/traps.c:211 [inline]
  do_trap+0x260/0x390 arch/x86/kernel/traps.c:260
  do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297
  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310
  invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
RIP: 0010:tracepoint_add_func kernel/tracepoint.c:210 [inline]
RIP: 0010:tracepoint_probe_register_prio+0x397/0x9a0 kernel/tracepoint.c:283
RSP: 0018:ffff8801d1d1f6c0 EFLAGS: 00010293
RAX: ffff8801d22e8540 RBX: 00000000ffffffef RCX: ffffffff81710f07
RDX: 0000000000000000 RSI: ffffffff85b679c0 RDI: ffff8801d5f19818
RBP: ffff8801d1d1f7c8 R08: ffffffff81710c10 R09: 0000000000000004
R10: ffff8801d1d1f6b0 R11: 0000000000000003 R12: ffffffff817597f0
R13: 0000000000000000 R14: 00000000ffffffff R15: ffff8801d1d1f7a0
  tracepoint_probe_register+0x2a/0x40 kernel/tracepoint.c:304
  register_trace_block_rq_insert include/trace/events/block.h:191 [inline]
  blk_register_tracepoints+0x1e/0x2f0 kernel/trace/blktrace.c:1043
  do_blk_trace_setup+0xa10/0xcf0 kernel/trace/blktrace.c:542
  blk_trace_setup+0xbd/0x180 kernel/trace/blktrace.c:564
  sg_ioctl+0xc71/0x2d90 drivers/scsi/sg.c:1089
  vfs_ioctl fs/ioctl.c:45 [inline]
  do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
  SYSC_ioctl fs/ioctl.c:700 [inline]
  SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
  entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x444339
RSP: 002b:00007ffe05bb5b18 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000006d66c0 RCX: 0000000000444339
RDX: 000000002084cf90 RSI: 00000000c0481273 RDI: 0000000000000009
RBP: 0000000000000082 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000206 R12: ffffffffffffffff
R13: 00000000c0481273 R14: 0000000000000000 R15: 0000000000000000

since we can now run these in parallel. Ensure that the exported helpers
for doing this are grabbing the queue trace mutex.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ea57dd94b2b2..206e0e2ace53 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = {
 };
 
 /* Global reference count of probes */
-static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
 
 static void blk_register_tracepoints(void);
 static void blk_unregister_tracepoints(void);
@@ -329,11 +330,26 @@ static void blk_trace_free(struct blk_trace *bt)
 	kfree(bt);
 }
 
+static void get_probe_ref(void)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (++blk_probes_ref == 1)
+		blk_register_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
+static void put_probe_ref(void)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (!--blk_probes_ref)
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
 static void blk_trace_cleanup(struct blk_trace *bt)
 {
 	blk_trace_free(bt);
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
+	put_probe_ref();
 }
 
 static int __blk_trace_remove(struct request_queue *q)
@@ -549,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto err;
 
-	if (atomic_inc_return(&blk_probes_ref) == 1)
-		blk_register_tracepoints();
+	get_probe_ref();
 
 	ret = 0;
 err:
@@ -1596,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (bt == NULL)
 		return -EINVAL;
 
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
-
+	put_probe_ref();
 	blk_trace_free(bt);
 	return 0;
 }
@@ -1629,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
 	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto free_bt;
 
-	if (atomic_inc_return(&blk_probes_ref) == 1)
-		blk_register_tracepoints();
+	get_probe_ref();
 	return 0;
 
 free_bt:
-- 
cgit v1.2.3


From e10237cc76ef9a4066a84aa2cc710bfd708cc341 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 7 Nov 2017 11:09:50 -0800
Subject: kthread: zero the kthread data structure

kthread() could bail out early before we initialize blkcg_css (if the
kthread is killed very early. Please see xchg() statement in kthread()),
which confuses free_kthread_struct. Instead of moving the blkcg_css
initialization early, we simply zero the whole 'self' data structure,
which doesn't sound much overhead.

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 05e3db95ebfc ("kthread: add a mechanism to store cgroup info")
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/kthread.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index f87cd8b4eb2a..8dbe2454cb1d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -204,7 +204,7 @@ static int kthread(void *_create)
 	struct kthread *self;
 	int ret;
 
-	self = kmalloc(sizeof(*self), GFP_KERNEL);
+	self = kzalloc(sizeof(*self), GFP_KERNEL);
 	set_kthread_struct(self);
 
 	/* If user was SIGKILLed, I release the structure. */
@@ -220,13 +220,9 @@ static int kthread(void *_create)
 		do_exit(-ENOMEM);
 	}
 
-	self->flags = 0;
 	self->data = data;
 	init_completion(&self->exited);
 	init_completion(&self->parked);
-#ifdef CONFIG_BLK_CGROUP
-	self->blkcg_css = NULL;
-#endif
 	current->vfork_done = &self->exited;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
-- 
cgit v1.2.3


From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 7 Nov 2017 15:28:42 -0500
Subject: bpf: add a bpf_override_function helper

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/Kconfig                     |  3 +++
 arch/x86/Kconfig                 |  1 +
 arch/x86/include/asm/kprobes.h   |  4 ++++
 arch/x86/include/asm/ptrace.h    |  5 +++++
 arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++++++++
 include/linux/filter.h           |  3 ++-
 include/linux/trace_events.h     |  1 +
 include/uapi/linux/bpf.h         |  7 ++++++-
 kernel/bpf/core.c                |  3 +++
 kernel/bpf/verifier.c            |  2 ++
 kernel/events/core.c             |  7 +++++++
 kernel/trace/Kconfig             | 11 +++++++++++
 kernel/trace/bpf_trace.c         | 35 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c      | 40 +++++++++++++++++++++++++++++++++-------
 kernel/trace/trace_probe.h       |  6 ++++++
 15 files changed, 133 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 057370a0ac4e..6e8520f09bc1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,6 +196,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_KPROBE_OVERRIDE
+	bool
+
 config HAVE_NMI
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2fdb23313dd5..51458c1a0b4a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
+	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index c0e3c45cf6ab..2370bb0149cc 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+	regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
 	p->ainsn.boostable = false;
 	return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+	".type override_func, @function\n"
+	"override_func:\n"
+	"	ret\n"
+	".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+	regs->ip = (unsigned long)&override_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0cd02ff4ae30..eaec066f99e8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -459,7 +459,8 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				kprobe_override:1; /* Do we override a kprobe? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 84014ecfa67f..17e5e820a84c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -523,6 +523,7 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
+DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e880ae6434ee..adb66f78b674 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ *	@pt_regs: pointer to struct pt_regs
+ *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -736,7 +740,8 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),
+	FN(getsockopt),			\
+	FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8a6c37762330..271daad31f37 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1326,6 +1326,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
+	if (fp->kprobe_override)
+		return false;
+
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4a942e2e753d..bc464b8ec91e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4357,6 +4357,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
+		if (insn->imm == BPF_FUNC_override_return)
+			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
 			/* If we tail call into other programs, we
 			 * cannot make any assumptions since they can
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 42d24bd64ea4..ac240d31b5bf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,6 +8171,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	/* Kprobe override only works for kprobes, not uprobes. */
+	if (prog->kprobe_override &&
+	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
 	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..9dc0deeaad2b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -518,6 +518,17 @@ config FUNCTION_PROFILER
 
 	  If in doubt, say N.
 
+config BPF_KPROBE_OVERRIDE
+	bool "Enable BPF programs to override a kprobed function"
+	depends on BPF_EVENTS
+	depends on KPROBES_ON_FTRACE
+	depends on HAVE_KPROBE_OVERRIDE
+	depends on DYNAMIC_FTRACE_WITH_REGS
+	default n
+	help
+	 Allows BPF to override the execution of a probed function and
+	 set a different return value.  This is used for error injection.
+
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 506efe6e8ed9..1865b0d4cdeb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,10 @@
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
+#include <linux/kprobes.h>
+#include <asm/kprobes.h>
+
+#include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	__this_cpu_write(bpf_kprobe_override, 1);
+	regs_set_return_value(regs, rc);
+	arch_ftrace_kprobe_override_function(regs);
+	return 0;
+}
+#else
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	return -EINVAL;
+}
+#endif
+
+static const struct bpf_func_proto bpf_override_return_proto = {
+	.func		= bpf_override_return,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
 	int ret;
@@ -551,6 +578,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
+	case BPF_FUNC_override_return:
+		pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!",
+				    current->comm, task_pid_nr(current));
+		return &bpf_override_return_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -766,6 +797,10 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	struct bpf_prog_array *new_array;
 	int ret = -EEXIST;
 
+	/* Kprobe override only works for ftrace based kprobes. */
+	if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event))
+		return -EINVAL;
+
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index abf92e478cfb..8e3c9ec1faf7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,6 +42,7 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
+DEFINE_PER_CPU(int, bpf_kprobe_override);
 
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
@@ -87,6 +88,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
+int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+	return kprobe_ftrace(&tk->rp.kp);
+}
+
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
@@ -1170,7 +1177,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static void
+static int
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
@@ -1179,12 +1186,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int size, __size, dsize;
 	int rctx;
 
-	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
-		return;
+	if (bpf_prog_array_valid(call)) {
+		int ret;
+
+		ret = trace_call_bpf(call, regs);
+
+		/*
+		 * We need to check and see if we modified the pc of the
+		 * pt_regs, and if so clear the kprobe and return 1 so that we
+		 * don't do the instruction skipping.  Also reset our state so
+		 * we are clean the next pass through.
+		 */
+		if (__this_cpu_read(bpf_kprobe_override)) {
+			__this_cpu_write(bpf_kprobe_override, 0);
+			reset_current_kprobe();
+			return 1;
+		}
+		if (!ret)
+			return 0;
+	}
 
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
-		return;
+		return 0;
 
 	dsize = __get_data_size(&tk->tp, regs);
 	__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1193,13 +1217,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 
 	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
-		return;
+		return 0;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL, NULL);
+	return 0;
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1275,6 +1300,7 @@ static int kprobe_register(struct trace_event_call *event,
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
+	int ret = 0;
 
 	raw_cpu_inc(*tk->nhit);
 
@@ -1282,9 +1308,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tk->tp.flags & TP_FLAG_PROFILE)
-		kprobe_perf_func(tk, regs);
+		ret = kprobe_perf_func(tk, regs);
 #endif
-	return 0;	/* We don't tweek kernel, so just return 0 */
+	return ret;
 }
 NOKPROBE_SYMBOL(kprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..adbb3f7d1fb5 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -253,6 +253,7 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
+int trace_kprobe_ftrace(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -278,6 +279,11 @@ alloc_symbol_cache(const char *sym, long offset)
 {
 	return NULL;
 }
+
+static inline int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	return 0;
+}
 #endif /* CONFIG_KPROBE_EVENTS */
 
 struct probe_arg {
-- 
cgit v1.2.3


From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 11 Nov 2017 18:24:55 +0900
Subject: bpf: Revert bpf_overrid_function() helper changes.

NACK'd by x86 maintainer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/Kconfig                              |  3 ---
 arch/x86/Kconfig                          |  1 -
 arch/x86/include/asm/kprobes.h            |  4 ----
 arch/x86/include/asm/ptrace.h             |  5 ----
 arch/x86/kernel/kprobes/ftrace.c          | 14 -----------
 include/linux/filter.h                    |  3 +--
 include/linux/trace_events.h              |  1 -
 include/uapi/linux/bpf.h                  |  7 +-----
 kernel/bpf/core.c                         |  3 ---
 kernel/bpf/verifier.c                     |  2 --
 kernel/events/core.c                      |  7 ------
 kernel/trace/Kconfig                      | 11 ---------
 kernel/trace/bpf_trace.c                  | 35 ---------------------------
 kernel/trace/trace_kprobe.c               | 40 ++++++-------------------------
 kernel/trace/trace_probe.h                |  6 -----
 samples/bpf/Makefile                      |  4 ----
 samples/bpf/test_override_return.sh       | 15 ------------
 samples/bpf/tracex7_kern.c                | 16 -------------
 samples/bpf/tracex7_user.c                | 28 ----------------------
 tools/include/uapi/linux/bpf.h            |  7 +-----
 tools/testing/selftests/bpf/bpf_helpers.h |  3 +--
 21 files changed, 11 insertions(+), 204 deletions(-)
 delete mode 100755 samples/bpf/test_override_return.sh
 delete mode 100644 samples/bpf/tracex7_kern.c
 delete mode 100644 samples/bpf/tracex7_user.c

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 6e8520f09bc1..057370a0ac4e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,9 +196,6 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
-config HAVE_KPROBE_OVERRIDE
-	bool
-
 config HAVE_NMI
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 51458c1a0b4a..2fdb23313dd5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,7 +153,6 @@ config X86
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
-	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index c6c3b1f4306a..6cf65437b5e5 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,10 +67,6 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
-#ifdef CONFIG_KPROBES_ON_FTRACE
-extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
-#endif
-
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2370bb0149cc..c0e3c45cf6ab 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,11 +109,6 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->ax;
 }
 
-static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
-{
-	regs->ax = rc;
-}
-
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 3c455bf490cb..041f7b6dfa0f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
 	p->ainsn.boostable = false;
 	return 0;
 }
-
-asmlinkage void override_func(void);
-asm(
-	".type override_func, @function\n"
-	"override_func:\n"
-	"	ret\n"
-	".size override_func, .-override_func\n"
-);
-
-void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
-{
-	regs->ip = (unsigned long)&override_func;
-}
-NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index eaec066f99e8..0cd02ff4ae30 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -459,8 +459,7 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1,	/* Do we need dst entry? */
-				kprobe_override:1; /* Do we override a kprobe? */
+				dst_needed:1;	/* Do we need dst entry? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 17e5e820a84c..84014ecfa67f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -523,7 +523,6 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
-DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index adb66f78b674..e880ae6434ee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,10 +677,6 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -740,8 +736,7 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),			\
-	FN(override_return),
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 271daad31f37..8a6c37762330 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1326,9 +1326,6 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
-	if (fp->kprobe_override)
-		return false;
-
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bc464b8ec91e..4a942e2e753d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4357,8 +4357,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
-		if (insn->imm == BPF_FUNC_override_return)
-			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
 			/* If we tail call into other programs, we
 			 * cannot make any assumptions since they can
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ac240d31b5bf..42d24bd64ea4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,13 +8171,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
-	/* Kprobe override only works for kprobes, not uprobes. */
-	if (prog->kprobe_override &&
-	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
-		bpf_prog_put(prog);
-		return -EINVAL;
-	}
-
 	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9dc0deeaad2b..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -518,17 +518,6 @@ config FUNCTION_PROFILER
 
 	  If in doubt, say N.
 
-config BPF_KPROBE_OVERRIDE
-	bool "Enable BPF programs to override a kprobed function"
-	depends on BPF_EVENTS
-	depends on KPROBES_ON_FTRACE
-	depends on HAVE_KPROBE_OVERRIDE
-	depends on DYNAMIC_FTRACE_WITH_REGS
-	default n
-	help
-	 Allows BPF to override the execution of a probed function and
-	 set a different return value.  This is used for error injection.
-
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1865b0d4cdeb..506efe6e8ed9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,10 +13,6 @@
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
-#include <linux/kprobes.h>
-#include <asm/kprobes.h>
-
-#include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -80,29 +76,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
-#ifdef CONFIG_BPF_KPROBE_OVERRIDE
-BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
-{
-	__this_cpu_write(bpf_kprobe_override, 1);
-	regs_set_return_value(regs, rc);
-	arch_ftrace_kprobe_override_function(regs);
-	return 0;
-}
-#else
-BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
-{
-	return -EINVAL;
-}
-#endif
-
-static const struct bpf_func_proto bpf_override_return_proto = {
-	.func		= bpf_override_return,
-	.gpl_only	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_ANYTHING,
-};
-
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
 	int ret;
@@ -578,10 +551,6 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
-	case BPF_FUNC_override_return:
-		pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!",
-				    current->comm, task_pid_nr(current));
-		return &bpf_override_return_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -797,10 +766,6 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	struct bpf_prog_array *new_array;
 	int ret = -EEXIST;
 
-	/* Kprobe override only works for ftrace based kprobes. */
-	if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event))
-		return -EINVAL;
-
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8e3c9ec1faf7..abf92e478cfb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,7 +42,6 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
-DEFINE_PER_CPU(int, bpf_kprobe_override);
 
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
@@ -88,12 +87,6 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
-int trace_kprobe_ftrace(struct trace_event_call *call)
-{
-	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
-	return kprobe_ftrace(&tk->rp.kp);
-}
-
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
@@ -1177,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static int
+static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
@@ -1186,29 +1179,12 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int size, __size, dsize;
 	int rctx;
 
-	if (bpf_prog_array_valid(call)) {
-		int ret;
-
-		ret = trace_call_bpf(call, regs);
-
-		/*
-		 * We need to check and see if we modified the pc of the
-		 * pt_regs, and if so clear the kprobe and return 1 so that we
-		 * don't do the instruction skipping.  Also reset our state so
-		 * we are clean the next pass through.
-		 */
-		if (__this_cpu_read(bpf_kprobe_override)) {
-			__this_cpu_write(bpf_kprobe_override, 0);
-			reset_current_kprobe();
-			return 1;
-		}
-		if (!ret)
-			return 0;
-	}
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
+		return;
 
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
-		return 0;
+		return;
 
 	dsize = __get_data_size(&tk->tp, regs);
 	__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1217,14 +1193,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 
 	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
-		return 0;
+		return;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL, NULL);
-	return 0;
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1300,7 +1275,6 @@ static int kprobe_register(struct trace_event_call *event,
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
-	int ret = 0;
 
 	raw_cpu_inc(*tk->nhit);
 
@@ -1308,9 +1282,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tk->tp.flags & TP_FLAG_PROFILE)
-		ret = kprobe_perf_func(tk, regs);
+		kprobe_perf_func(tk, regs);
 #endif
-	return ret;
+	return 0;	/* We don't tweek kernel, so just return 0 */
 }
 NOKPROBE_SYMBOL(kprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index adbb3f7d1fb5..903273c93e61 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -253,7 +253,6 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
-int trace_kprobe_ftrace(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -279,11 +278,6 @@ alloc_symbol_cache(const char *sym, long offset)
 {
 	return NULL;
 }
-
-static inline int trace_kprobe_ftrace(struct trace_event_call *call)
-{
-	return 0;
-}
 #endif /* CONFIG_KPROBE_EVENTS */
 
 struct probe_arg {
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 87db0f9a4c15..3b4945c1eab0 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -15,7 +15,6 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
-hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -62,7 +61,6 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
-tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -106,7 +104,6 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
-always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -161,7 +158,6 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
-HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
deleted file mode 100755
index e68b9ee6814b..000000000000
--- a/samples/bpf/test_override_return.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-rm -f testfile.img
-dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
-DEVICE=$(losetup --show -f testfile.img)
-mkfs.btrfs -f $DEVICE
-mkdir tmpmnt
-./tracex7 $DEVICE
-if [ $? -eq 0 ]
-then
-	echo "SUCCESS!"
-else
-	echo "FAILED!"
-fi
-losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
deleted file mode 100644
index 1ab308a43e0f..000000000000
--- a/samples/bpf/tracex7_kern.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <uapi/linux/ptrace.h>
-#include <uapi/linux/bpf.h>
-#include <linux/version.h>
-#include "bpf_helpers.h"
-
-SEC("kprobe/open_ctree")
-int bpf_prog1(struct pt_regs *ctx)
-{
-	unsigned long rc = -12;
-
-	bpf_override_return(ctx, rc);
-	return 0;
-}
-
-char _license[] SEC("license") = "GPL";
-u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
deleted file mode 100644
index 8a52ac492e8b..000000000000
--- a/samples/bpf/tracex7_user.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#define _GNU_SOURCE
-
-#include <stdio.h>
-#include <linux/bpf.h>
-#include <unistd.h>
-#include "libbpf.h"
-#include "bpf_load.h"
-
-int main(int argc, char **argv)
-{
-	FILE *f;
-	char filename[256];
-	char command[256];
-	int ret;
-
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
-	if (load_bpf_file(filename)) {
-		printf("%s", bpf_log_buf);
-		return 1;
-	}
-
-	snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
-	f = popen(command, "r");
-	ret = pclose(f);
-
-	return ret ? 0 : 1;
-}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index adb66f78b674..e880ae6434ee 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -677,10 +677,6 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -740,8 +736,7 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),			\
-	FN(override_return),
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 33cb00e46c49..fd9a17fa8a8b 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -82,8 +82,7 @@ static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
 static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
 				       unsigned int buf_size) =
 	(void *) BPF_FUNC_perf_prog_read_value;
-static int (*bpf_override_return)(void *ctx, unsigned long rc) =
-	(void *) BPF_FUNC_override_return;
+
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3


From 5e4def20381678ba3ce0a4e117f97e378ecd81bc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Nov 2017 15:27:44 +0000
Subject: Pass mode to wait_on_atomic_t() action funcs and provide default
 actions

Make wait_on_atomic_t() pass the TASK_* mode onto its action function as an
extra argument and make it 'unsigned int throughout.

Also, consolidate a bunch of identical action functions into a default
function that can do the appropriate thing for the mode.

Also, change the argument name in the bit_wait*() function declarations to
reflect the fact that it's the mode and not the bit number.

[Peter Z gives this a grudging ACK, but thinks that the whole atomic_t wait
should be done differently, though he's not immediately sure as to how]

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
cc: Ingo Molnar <mingo@kernel.org>
---
 arch/mips/kernel/traps.c                           | 14 +----------
 drivers/gpu/drm/drm_dp_aux_dev.c                   |  8 +------
 drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c | 10 ++------
 drivers/media/platform/qcom/venus/hfi.c            |  8 +------
 fs/afs/rxrpc.c                                     |  8 +------
 fs/btrfs/extent-tree.c                             | 27 +++-------------------
 fs/fscache/cookie.c                                |  2 +-
 fs/fscache/internal.h                              |  2 --
 fs/fscache/main.c                                  |  9 --------
 fs/nfs/inode.c                                     |  4 ++--
 fs/nfs/internal.h                                  |  2 +-
 fs/ocfs2/filecheck.c                               |  8 +------
 include/linux/wait_bit.h                           | 15 +++++++-----
 kernel/sched/wait_bit.c                            | 18 +++++++++++----
 14 files changed, 37 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 5669d3b8bd38..5d19ed07e99d 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1233,18 +1233,6 @@ static int default_cu2_call(struct notifier_block *nfb, unsigned long action,
 	return NOTIFY_OK;
 }
 
-static int wait_on_fp_mode_switch(atomic_t *p)
-{
-	/*
-	 * The FP mode for this task is currently being switched. That may
-	 * involve modifications to the format of this tasks FP context which
-	 * make it unsafe to proceed with execution for the moment. Instead,
-	 * schedule some other task.
-	 */
-	schedule();
-	return 0;
-}
-
 static int enable_restore_fp_context(int msa)
 {
 	int err, was_fpu_owner, prior_msa;
@@ -1254,7 +1242,7 @@ static int enable_restore_fp_context(int msa)
 	 * complete before proceeding.
 	 */
 	wait_on_atomic_t(&current->mm->context.fp_mode_switching,
-			 wait_on_fp_mode_switch, TASK_KILLABLE);
+			 atomic_t_wait, TASK_KILLABLE);
 
 	if (!used_math()) {
 		/* First time FP context user. */
diff --git a/drivers/gpu/drm/drm_dp_aux_dev.c b/drivers/gpu/drm/drm_dp_aux_dev.c
index d34e5096887a..053044201e31 100644
--- a/drivers/gpu/drm/drm_dp_aux_dev.c
+++ b/drivers/gpu/drm/drm_dp_aux_dev.c
@@ -263,12 +263,6 @@ static struct drm_dp_aux_dev *drm_dp_aux_dev_get_by_aux(struct drm_dp_aux *aux)
 	return aux_dev;
 }
 
-static int auxdev_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 void drm_dp_aux_unregister_devnode(struct drm_dp_aux *aux)
 {
 	struct drm_dp_aux_dev *aux_dev;
@@ -283,7 +277,7 @@ void drm_dp_aux_unregister_devnode(struct drm_dp_aux *aux)
 	mutex_unlock(&aux_idr_mutex);
 
 	atomic_dec(&aux_dev->usecount);
-	wait_on_atomic_t(&aux_dev->usecount, auxdev_wait_atomic_t,
+	wait_on_atomic_t(&aux_dev->usecount, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 
 	minor = aux_dev->index;
diff --git a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
index 828904b7d468..54fc571b1102 100644
--- a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
@@ -271,13 +271,7 @@ struct igt_wakeup {
 	u32 seqno;
 };
 
-static int wait_atomic(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
-static int wait_atomic_timeout(atomic_t *p)
+static int wait_atomic_timeout(atomic_t *p, unsigned int mode)
 {
 	return schedule_timeout(10 * HZ) ? 0 : -ETIMEDOUT;
 }
@@ -348,7 +342,7 @@ static void igt_wake_all_sync(atomic_t *ready,
 	atomic_set(ready, 0);
 	wake_up_all(wq);
 
-	wait_on_atomic_t(set, wait_atomic, TASK_UNINTERRUPTIBLE);
+	wait_on_atomic_t(set, atomic_t_wait, TASK_UNINTERRUPTIBLE);
 	atomic_set(ready, count);
 	atomic_set(done, count);
 }
diff --git a/drivers/media/platform/qcom/venus/hfi.c b/drivers/media/platform/qcom/venus/hfi.c
index c09490876516..e374c7d1a618 100644
--- a/drivers/media/platform/qcom/venus/hfi.c
+++ b/drivers/media/platform/qcom/venus/hfi.c
@@ -88,12 +88,6 @@ unlock:
 	return ret;
 }
 
-static int core_deinit_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 int hfi_core_deinit(struct venus_core *core, bool blocking)
 {
 	int ret = 0, empty;
@@ -112,7 +106,7 @@ int hfi_core_deinit(struct venus_core *core, bool blocking)
 
 	if (!empty) {
 		mutex_unlock(&core->lock);
-		wait_on_atomic_t(&core->insts_count, core_deinit_wait_atomic_t,
+		wait_on_atomic_t(&core->insts_count, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 		mutex_lock(&core->lock);
 	}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bb1e2caa1720..77f5420a1a24 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -41,12 +41,6 @@ static void afs_charge_preallocation(struct work_struct *);
 
 static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation);
 
-static int afs_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 /*
  * open an RxRPC socket and bind it to be a server for callback notifications
  * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -121,7 +115,7 @@ void afs_close_socket(void)
 	}
 
 	_debug("outstanding %u", atomic_read(&afs_outstanding_calls));
-	wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t,
+	wait_on_atomic_t(&afs_outstanding_calls, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 	_debug("no outstanding calls");
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2d7e86b51d1..24cefde30e30 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4016,16 +4016,9 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 	btrfs_put_block_group(bg);
 }
 
-static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
 {
-	wait_on_atomic_t(&bg->nocow_writers,
-			 btrfs_wait_nocow_writers_atomic_t,
+	wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 }
 
@@ -6595,12 +6588,6 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 	btrfs_put_block_group(bg);
 }
 
-static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 {
 	struct btrfs_space_info *space_info = bg->space_info;
@@ -6623,8 +6610,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 	down_write(&space_info->groups_sem);
 	up_write(&space_info->groups_sem);
 
-	wait_on_atomic_t(&bg->reservations,
-			 btrfs_wait_bg_reservations_atomic_t,
+	wait_on_atomic_t(&bg->reservations, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 }
 
@@ -11106,12 +11092,6 @@ int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
 	return 1;
 }
 
-static int wait_snapshotting_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
 {
 	while (true) {
@@ -11120,8 +11100,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
 		ret = btrfs_start_write_no_snapshotting(root);
 		if (ret)
 			break;
-		wait_on_atomic_t(&root->will_be_snapshotted,
-				 wait_snapshotting_atomic_t,
+		wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 	}
 }
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 40d61077bead..ff84258132bb 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -558,7 +558,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
 	 * have completed.
 	 */
 	if (!atomic_dec_and_test(&cookie->n_active))
-		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+		wait_on_atomic_t(&cookie->n_active, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 
 	/* Make sure any pending writes are cancelled. */
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 97ec45110957..0ff4b49a0037 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void)
 	return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
 }
 
-extern int fscache_wait_atomic_t(atomic_t *);
-
 /*
  * object.c
  */
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index b39d487ccfb0..249968dcbf5c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -195,12 +195,3 @@ static void __exit fscache_exit(void)
 }
 
 module_exit(fscache_exit);
-
-/*
- * wait_on_atomic_t() sleep function for uninterruptible waiting
- */
-int fscache_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 134d9f560240..1629056aa2c9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -85,9 +85,9 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
 }
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
 
-int nfs_wait_atomic_killable(atomic_t *p)
+int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode)
 {
-	return nfs_wait_killable(TASK_KILLABLE);
+	return nfs_wait_killable(mode);
 }
 
 /**
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f9a4a5524bd5..5ab17fd4700a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -388,7 +388,7 @@ extern void nfs_evict_inode(struct inode *);
 void nfs_zap_acl_cache(struct inode *inode);
 extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
 extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
-extern int nfs_wait_atomic_killable(atomic_t *p);
+extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode);
 
 /* super.c */
 extern const struct super_operations nfs_sops;
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 2cabbcf2f28e..e87279e49ba3 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -129,19 +129,13 @@ static struct kobj_attribute ocfs2_attr_filecheck_set =
 					ocfs2_filecheck_show,
 					ocfs2_filecheck_store);
 
-static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 static void
 ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
 {
 	struct ocfs2_filecheck_entry *p;
 
 	if (!atomic_dec_and_test(&entry->fs_count))
-		wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+		wait_on_atomic_t(&entry->fs_count, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 
 	spin_lock(&entry->fs_fcheck->fc_lock);
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index af0d495430d7..61b39eaf7cad 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -26,6 +26,8 @@ struct wait_bit_queue_entry {
 	{ .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
 
 typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);
+typedef int wait_atomic_t_action_f(atomic_t *counter, unsigned int mode);
+
 void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
 int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
 int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
@@ -34,7 +36,7 @@ void wake_up_atomic_t(atomic_t *p);
 int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
 int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
 int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
-int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode);
+int out_of_line_wait_on_atomic_t(atomic_t *p, wait_atomic_t_action_f action, unsigned int mode);
 struct wait_queue_head *bit_waitqueue(void *word, int bit);
 extern void __init wait_bit_init(void);
 
@@ -51,10 +53,11 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 		},								\
 	}
 
-extern int bit_wait(struct wait_bit_key *key, int bit);
-extern int bit_wait_io(struct wait_bit_key *key, int bit);
-extern int bit_wait_timeout(struct wait_bit_key *key, int bit);
-extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit);
+extern int bit_wait(struct wait_bit_key *key, int mode);
+extern int bit_wait_io(struct wait_bit_key *key, int mode);
+extern int bit_wait_timeout(struct wait_bit_key *key, int mode);
+extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);
+extern int atomic_t_wait(atomic_t *counter, unsigned int mode);
 
 /**
  * wait_on_bit - wait for a bit to be cleared
@@ -251,7 +254,7 @@ wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
  * outside of the target 'word'.
  */
 static inline
-int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
+int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode)
 {
 	might_sleep();
 	if (atomic_read(val) == 0)
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index f8159698aa4d..84cb3acd9260 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
  */
 static __sched
 int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
-		       int (*action)(atomic_t *), unsigned mode)
+		       wait_atomic_t_action_f action, unsigned int mode)
 {
 	atomic_t *val;
 	int ret = 0;
@@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
 		val = wbq_entry->key.flags;
 		if (atomic_read(val) == 0)
 			break;
-		ret = (*action)(val);
+		ret = (*action)(val, mode);
 	} while (!ret && atomic_read(val) != 0);
 	finish_wait(wq_head, &wbq_entry->wq_entry);
 	return ret;
@@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
 		},							\
 	}
 
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
-					 unsigned mode)
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
+					 wait_atomic_t_action_f action,
+					 unsigned int mode)
 {
 	struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
 	DEFINE_WAIT_ATOMIC_T(wq_entry, p);
@@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
 }
 EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
 
+__sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
+{
+	schedule();
+	if (signal_pending_state(mode, current))
+		return -EINTR;
+	return 0;
+}
+EXPORT_SYMBOL(atomic_t_wait);
+
 /**
  * wake_up_atomic_t - Wake up a waiter on a atomic_t
  * @p: The atomic_t being waited on, a kernel virtual address
-- 
cgit v1.2.3


From 9fd29c08e52023252f0480ab8f6906a1ecc9a8d5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 12 Nov 2017 14:49:09 -0800
Subject: bpf: improve verifier ARG_CONST_SIZE_OR_ZERO semantics

For helpers, the argument type ARG_CONST_SIZE_OR_ZERO permits the
access size to be 0 when accessing the previous argument (arg).
Right now, it requires the arg needs to be NULL when size passed
is 0 or could be 0. It also requires a non-NULL arg when the size
is proved to be non-0.

This patch changes verifier ARG_CONST_SIZE_OR_ZERO behavior
such that for size-0 or possible size-0, it is not required
the arg equal to NULL.

There are a couple of reasons for this semantics change, and
all of them intends to simplify user bpf programs which
may improve user experience and/or increase chances of
verifier acceptance. Together with the next patch which
changes bpf_probe_read arg2 type from ARG_CONST_SIZE to
ARG_CONST_SIZE_OR_ZERO, the following two examples, which
fail the verifier currently, are able to get verifier acceptance.

Example 1:
   unsigned long len = pend - pstart;
   len = len > MAX_PAYLOAD_LEN ? MAX_PAYLOAD_LEN : len;
   len &= MAX_PAYLOAD_LEN;
   bpf_probe_read(data->payload, len, pstart);

It does not have test for "len > 0" and it failed the verifier.
Users may not be aware that they have to add this test.
Converting the bpf_probe_read helper to have
ARG_CONST_SIZE_OR_ZERO helps the above code get
verifier acceptance.

Example 2:
  Here is one example where llvm "messed up" the code and
  the verifier fails.

......
   unsigned long len = pend - pstart;
   if (len > 0 && len <= MAX_PAYLOAD_LEN)
     bpf_probe_read(data->payload, len, pstart);
......

The compiler generates the following code and verifier fails:
......
39: (79) r2 = *(u64 *)(r10 -16)
40: (1f) r2 -= r8
41: (bf) r1 = r2
42: (07) r1 += -1
43: (25) if r1 > 0xffe goto pc+3
  R0=inv(id=0) R1=inv(id=0,umax_value=4094,var_off=(0x0; 0xfff))
  R2=inv(id=0) R6=map_value(id=0,off=0,ks=4,vs=4095,imm=0) R7=inv(id=0)
  R8=inv(id=0) R9=inv0 R10=fp0
44: (bf) r1 = r6
45: (bf) r3 = r8
46: (85) call bpf_probe_read#45
R2 min value is negative, either use unsigned or 'var &= const'
......

The compiler optimization is correct. If r1 = 0,
r1 - 1 = 0xffffffffffffffff > 0xffe.  If r1 != 0, r1 - 1 will not wrap.
r1 > 0xffe at insn #43 can actually capture
both "r1 > 0" and "len <= MAX_PAYLOAD_LEN".
This however causes an issue in verifier as the value range of arg2
"r2" does not properly get refined and lead to verification failure.

Relaxing bpf_prog_read arg2 from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO
allows the following simplied code:
   unsigned long len = pend - pstart;
   if (len <= MAX_PAYLOAD_LEN)
     bpf_probe_read(data->payload, len, pstart);

The llvm compiler will generate less complex code and the
verifier is able to verify that the program is okay.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4a942e2e753d..dd54d20ace2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -799,12 +799,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
 
 /* check read/write into map element returned by bpf_map_lookup_elem() */
 static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
-			    int size)
+			      int size, bool zero_size_allowed)
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_map *map = regs[regno].map_ptr;
 
-	if (off < 0 || size <= 0 || off + size > map->value_size) {
+	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+	    off + size > map->value_size) {
 		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
 			map->value_size, off, size);
 		return -EACCES;
@@ -814,7 +815,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 
 /* check read/write into a map element with possible variable offset */
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
-			    int off, int size)
+			    int off, int size, bool zero_size_allowed)
 {
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *reg = &state->regs[regno];
@@ -837,7 +838,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			regno);
 		return -EACCES;
 	}
-	err = __check_map_access(env, regno, reg->smin_value + off, size);
+	err = __check_map_access(env, regno, reg->smin_value + off, size,
+				 zero_size_allowed);
 	if (err) {
 		verbose(env, "R%d min value is outside of the array range\n",
 			regno);
@@ -853,7 +855,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			regno);
 		return -EACCES;
 	}
-	err = __check_map_access(env, regno, reg->umax_value + off, size);
+	err = __check_map_access(env, regno, reg->umax_value + off, size,
+				 zero_size_allowed);
 	if (err)
 		verbose(env, "R%d max value is outside of the array range\n",
 			regno);
@@ -889,12 +892,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 }
 
 static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
-				 int off, int size)
+				 int off, int size, bool zero_size_allowed)
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 
-	if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
+	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+	    (u64)off + size > reg->range) {
 		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
 			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
@@ -903,7 +907,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 }
 
 static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
-			       int size)
+			       int size, bool zero_size_allowed)
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
@@ -922,7 +926,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 			regno);
 		return -EACCES;
 	}
-	err = __check_packet_access(env, regno, off, size);
+	err = __check_packet_access(env, regno, off, size, zero_size_allowed);
 	if (err) {
 		verbose(env, "R%d offset is outside of the packet\n", regno);
 		return err;
@@ -1097,7 +1101,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			return -EACCES;
 		}
 
-		err = check_map_access(env, regno, off, size);
+		err = check_map_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 
@@ -1184,7 +1188,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				value_regno);
 			return -EACCES;
 		}
-		err = check_packet_access(env, regno, off, size);
+		err = check_packet_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else {
@@ -1281,7 +1285,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 	}
 	off = regs[regno].off + regs[regno].var_off.value;
 	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
-	    access_size <= 0) {
+	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
 		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
 			regno, off, access_size);
 		return -EACCES;
@@ -1319,9 +1323,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	switch (reg->type) {
 	case PTR_TO_PACKET:
 	case PTR_TO_PACKET_META:
-		return check_packet_access(env, regno, reg->off, access_size);
+		return check_packet_access(env, regno, reg->off, access_size,
+					   zero_size_allowed);
 	case PTR_TO_MAP_VALUE:
-		return check_map_access(env, regno, reg->off, access_size);
+		return check_map_access(env, regno, reg->off, access_size,
+					zero_size_allowed);
 	default: /* scalar_value|ptr_to_stack or invalid ptr */
 		return check_stack_boundary(env, regno, access_size,
 					    zero_size_allowed, meta);
@@ -1415,7 +1421,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		}
 		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->key_size);
+						  meta->map_ptr->key_size,
+						  false);
 		else
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->key_size,
@@ -1431,7 +1438,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		}
 		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->value_size);
+						  meta->map_ptr->value_size,
+						  false);
 		else
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->value_size,
-- 
cgit v1.2.3


From 9c019e2bc4b2bd8223c8c0d4b6962478b479834d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 12 Nov 2017 14:49:10 -0800
Subject: bpf: change helper bpf_probe_read arg2 type to ARG_CONST_SIZE_OR_ZERO

The helper bpf_probe_read arg2 type is changed
from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO to permit
size-0 buffer. Together with newer ARG_CONST_SIZE_OR_ZERO
semantics which allows non-NULL buffer with size 0,
this allows simpler bpf programs with verifier acceptance.
The previous commit which changes ARG_CONST_SIZE_OR_ZERO semantics
has details on examples.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 506efe6e8ed9..a5580c670866 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -78,12 +78,16 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
 
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
-	int ret;
+	int ret = 0;
+
+	if (unlikely(size == 0))
+		goto out;
 
 	ret = probe_kernel_read(dst, unsafe_ptr, size);
 	if (unlikely(ret < 0))
 		memset(dst, 0, size);
 
+ out:
 	return ret;
 }
 
@@ -92,7 +96,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
-	.arg2_type	= ARG_CONST_SIZE,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg3_type	= ARG_ANYTHING,
 };
 
-- 
cgit v1.2.3


From 89ad2fa3f043a1e8daae193bcb5fe34d5f8caf28 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Nov 2017 17:15:50 -0800
Subject: bpf: fix lockdep splat

pcpu_freelist_pop() needs the same lockdep awareness than
pcpu_freelist_populate() to avoid a false positive.

 [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ]

 switchto-defaul/12508 [HC0[0]:SC0[6]:HE0:SE0] is trying to acquire:
  (&htab->buckets[i].lock){......}, at: [<ffffffff9dc099cb>] __htab_percpu_map_update_elem+0x1cb/0x300

 and this task is already holding:
  (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...}, at: [<ffffffff9e135848>] __dev_queue_xmit+0
x868/0x1240
 which would create a new lock dependency:
  (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...} -> (&htab->buckets[i].lock){......}

 but this new dependency connects a SOFTIRQ-irq-safe lock:
  (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...}
 ... which became SOFTIRQ-irq-safe at:
   [<ffffffff9db5931b>] __lock_acquire+0x42b/0x1f10
   [<ffffffff9db5b32c>] lock_acquire+0xbc/0x1b0
   [<ffffffff9da05e38>] _raw_spin_lock+0x38/0x50
   [<ffffffff9e135848>] __dev_queue_xmit+0x868/0x1240
   [<ffffffff9e136240>] dev_queue_xmit+0x10/0x20
   [<ffffffff9e1965d9>] ip_finish_output2+0x439/0x590
   [<ffffffff9e197410>] ip_finish_output+0x150/0x2f0
   [<ffffffff9e19886d>] ip_output+0x7d/0x260
   [<ffffffff9e19789e>] ip_local_out+0x5e/0xe0
   [<ffffffff9e197b25>] ip_queue_xmit+0x205/0x620
   [<ffffffff9e1b8398>] tcp_transmit_skb+0x5a8/0xcb0
   [<ffffffff9e1ba152>] tcp_write_xmit+0x242/0x1070
   [<ffffffff9e1baffc>] __tcp_push_pending_frames+0x3c/0xf0
   [<ffffffff9e1b3472>] tcp_rcv_established+0x312/0x700
   [<ffffffff9e1c1acc>] tcp_v4_do_rcv+0x11c/0x200
   [<ffffffff9e1c3dc2>] tcp_v4_rcv+0xaa2/0xc30
   [<ffffffff9e191107>] ip_local_deliver_finish+0xa7/0x240
   [<ffffffff9e191a36>] ip_local_deliver+0x66/0x200
   [<ffffffff9e19137d>] ip_rcv_finish+0xdd/0x560
   [<ffffffff9e191e65>] ip_rcv+0x295/0x510
   [<ffffffff9e12ff88>] __netif_receive_skb_core+0x988/0x1020
   [<ffffffff9e130641>] __netif_receive_skb+0x21/0x70
   [<ffffffff9e1306ff>] process_backlog+0x6f/0x230
   [<ffffffff9e132129>] net_rx_action+0x229/0x420
   [<ffffffff9da07ee8>] __do_softirq+0xd8/0x43d
   [<ffffffff9e282bcc>] do_softirq_own_stack+0x1c/0x30
   [<ffffffff9dafc2f5>] do_softirq+0x55/0x60
   [<ffffffff9dafc3a8>] __local_bh_enable_ip+0xa8/0xb0
   [<ffffffff9db4c727>] cpu_startup_entry+0x1c7/0x500
   [<ffffffff9daab333>] start_secondary+0x113/0x140

 to a SOFTIRQ-irq-unsafe lock:
  (&head->lock){+.+...}
 ... which became SOFTIRQ-irq-unsafe at:
 ...  [<ffffffff9db5971f>] __lock_acquire+0x82f/0x1f10
   [<ffffffff9db5b32c>] lock_acquire+0xbc/0x1b0
   [<ffffffff9da05e38>] _raw_spin_lock+0x38/0x50
   [<ffffffff9dc0b7fa>] pcpu_freelist_pop+0x7a/0xb0
   [<ffffffff9dc08b2c>] htab_map_alloc+0x50c/0x5f0
   [<ffffffff9dc00dc5>] SyS_bpf+0x265/0x1200
   [<ffffffff9e28195f>] entry_SYSCALL_64_fastpath+0x12/0x17

 other info that might help us debug this:

 Chain exists of:
   dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2 --> &htab->buckets[i].lock --> &head->lock

  Possible interrupt unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&head->lock);
                                local_irq_disable();
                                lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2);
                                lock(&htab->buckets[i].lock);
   <Interrupt>
     lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2);

  *** DEADLOCK ***

Fixes: e19494edab82 ("bpf: introduce percpu_freelist")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/percpu_freelist.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 5c51d1985b51..673fa6fe2d73 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
+	unsigned long flags;
 	int orig_cpu, cpu;
 
+	local_irq_save(flags);
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
@@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 		node = head->first;
 		if (node) {
 			head->first = node->next;
-			raw_spin_unlock(&head->lock);
+			raw_spin_unlock_irqrestore(&head->lock, flags);
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
-		if (cpu == orig_cpu)
+		if (cpu == orig_cpu) {
+			local_irq_restore(flags);
 			return NULL;
+		}
 	}
 }
-- 
cgit v1.2.3


From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:33 -0800
Subject: mm: account pud page tables

On a machine with 5-level paging support a process can allocate
significant amount of memory and stay unnoticed by oom-killer and memory
cgroup.  The trick is to allocate a lot of PUD page tables.  We don't
account PUD page tables, only PMD and PTE.

We already addressed the same issue for PMD page tables, see commit
dc6c9a35b66b ("mm: account pmd page tables to the process").
Introduction of 5-level paging brings the same issue for PUD page
tables.

The patch expands accounting to PUD level.

[kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/]
  Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com
[heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting]
  Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt         |  8 ++++----
 arch/powerpc/mm/hugetlbpage.c       |  1 +
 arch/s390/include/asm/mmu_context.h |  4 +++-
 arch/sparc/mm/hugetlbpage.c         |  1 +
 fs/proc/task_mmu.c                  |  5 ++++-
 include/linux/mm.h                  | 36 +++++++++++++++++++++++++++++++++---
 include/linux/mm_types.h            |  3 +++
 kernel/fork.c                       |  4 ++++
 mm/debug.c                          |  6 ++++--
 mm/memory.c                         | 15 +++++++++------
 mm/oom_kill.c                       |  8 +++++---
 11 files changed, 71 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 30fd16b14196..2729e8db9492 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -629,10 +629,10 @@ oom_dump_tasks
 
 Enables a system-wide task dump (excluding kernel threads) to be produced
 when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
-score, and name.  This is helpful to determine why the OOM killer was
-invoked, to identify the rogue task that caused it, and to determine why
-the OOM killer chose the task it did to kill.
+pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents,
+oom_score_adj score, and name.  This is helpful to determine why the OOM
+killer was invoked, to identify the rogue task that caused it, and to
+determine why the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed.  On very
 large systems with thousands of tasks it may not be feasible to dump
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1571a498a33f..a9b9083c5e49 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 /*
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 43607bb12cc2..cf4c1cb17dcd 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk,
 		mm->context.asce_limit = STACK_TOP_MAX;
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+		/* pgd_alloc() did not account this pud */
+		mm_inc_nr_puds(mm);
 		break;
 	case -PAGE_SIZE:
 		/* forked 5-level task, set new asce with new_mm->pgd */
@@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk,
 		/* forked 2-level compat task, set new asce with new mm->pgd */
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
-		/* pgd_alloc() did not increase mm->nr_pmds */
+		/* pgd_alloc() did not account this pmd */
 		mm_inc_nr_pmds(mm);
 	}
 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 5078b7f68890..01f63b4ee2b4 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6744bd706ecf..a05ce6186f99 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -26,7 +26,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
+	unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -52,6 +52,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	swap = get_mm_counter(mm, MM_SWAPENTS);
 	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
 	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
+	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -68,6 +69,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
 		"VmPMD:\t%8lu kB\n"
+		"VmPUD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		total_vm << (PAGE_SHIFT-10),
@@ -82,6 +84,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		ptes >> 10,
 		pmds >> 10,
+		puds >> 10,
 		swap << (PAGE_SHIFT-10));
 	hugetlb_report_usage(m, mm);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 91b46f99b4d2..9af86f39d928 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
 #endif
 
-#ifdef __PAGETABLE_PUD_FOLDED
+#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
 static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 						unsigned long address)
 {
 	return 0;
 }
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_nr_puds_init(struct mm_struct *mm) {}
+static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
+
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
+
+static inline void mm_nr_puds_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_puds, 0);
+}
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_puds);
+}
+
+static inline void mm_inc_nr_puds(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_puds);
+}
+
+static inline void mm_dec_nr_puds(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_puds);
+}
 #endif
 
 #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
@@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 
 static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
 	return 0;
 }
@@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm)
 	atomic_long_set(&mm->nr_pmds, 0);
 }
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
 	return atomic_long_read(&mm->nr_pmds);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d1b8e8f97fc2..e9e561e02d22 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -404,6 +404,9 @@ struct mm_struct {
 	atomic_long_t nr_ptes;			/* PTE page table pages */
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
+#endif
+#if CONFIG_PGTABLE_LEVELS > 3
+	atomic_long_t nr_puds;			/* PUD page table pages */
 #endif
 	int map_count;				/* number of VMAs */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3..a4eb6f289365 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->core_state = NULL;
 	atomic_long_set(&mm->nr_ptes, 0);
 	mm_nr_pmds_init(mm);
+	mm_nr_puds_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
@@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm)
 	if (mm_nr_pmds(mm))
 		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
 				mm_nr_pmds(mm));
+	if (mm_nr_puds(mm))
+		pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n",
+				mm_nr_puds(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
diff --git a/mm/debug.c b/mm/debug.c
index 6726bec731c9..a12d826bb774 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm)
 		"get_unmapped_area %p\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
+		"pgd %p mm_users %d mm_count %d\n"
+		"nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm)
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
 		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
-		mm_nr_pmds((struct mm_struct *)mm),
+		mm_nr_pmds(mm),
+		mm_nr_puds(mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 		mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/memory.c b/mm/memory.c
index 42fb30300bb5..6bbd4078ec98 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 	pud = pud_offset(p4d, start);
 	p4d_clear(p4d);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -4149,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_5LEVEL_HACK
-	if (p4d_present(*p4d))		/* Another has populated it */
-		pud_free(mm, new);
-	else
+	if (!p4d_present(*p4d)) {
+		mm_inc_nr_puds(mm);
 		p4d_populate(mm, p4d, new);
-#else
-	if (pgd_present(*p4d))		/* Another has populated it */
+	} else	/* Another has populated it */
 		pud_free(mm, new);
-	else
+#else
+	if (!pgd_present(*p4d)) {
+		mm_inc_nr_puds(mm);
 		pgd_populate(mm, p4d, new);
+	} else	/* Another has populated it */
+		pud_free(mm, new);
 #endif /* __ARCH_HAS_5LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3023919970f7..f642a45b7f14 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,7 +221,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
+		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) +
+		mm_nr_puds(p->mm);
 	task_unlock(p);
 
 	/*
@@ -397,7 +398,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -413,11 +414,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			atomic_long_read(&task->mm->nr_ptes),
 			mm_nr_pmds(task->mm),
+			mm_nr_puds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- 
cgit v1.2.3


From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:37 -0800
Subject: mm: introduce wrappers to access mm->nr_ptes

Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd
and nr_pud.

The patch also makes nr_ptes accounting dependent onto CONFIG_MMU.  Page
table accounting doesn't make sense if you don't have page tables.

It's preparation for consolidation of page-table counters in mm_struct.

Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/pgd.c           |  2 +-
 arch/sparc/mm/hugetlbpage.c |  2 +-
 arch/unicore32/mm/pgd.c     |  2 +-
 fs/proc/task_mmu.c          |  2 +-
 include/linux/mm.h          | 32 ++++++++++++++++++++++++++++++++
 include/linux/mm_types.h    |  2 ++
 kernel/fork.c               |  6 +++---
 mm/debug.c                  |  2 +-
 mm/huge_memory.c            | 10 +++++-----
 mm/khugepaged.c             |  2 +-
 mm/memory.c                 |  8 ++++----
 mm/oom_kill.c               |  5 ++---
 12 files changed, 54 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index c1c1a5c67da1..61e281cb29fb 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base)
 	pte = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free(mm, pte);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 no_pmd:
 	pud_clear(pud);
 	pmd_free(mm, pmd);
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 01f63b4ee2b4..0112d6942288 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
-	atomic_long_dec(&tlb->mm->nr_ptes);
+	mm_dec_nr_ptes(tlb->mm);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c
index c572a28c76c9..a830a300aaa1 100644
--- a/arch/unicore32/mm/pgd.c
+++ b/arch/unicore32/mm/pgd.c
@@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd)
 	pte = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free(mm, pte);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 	pmd_free(mm, pmd);
 	mm_dec_nr_pmds(mm);
 free:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a05ce6186f99..9bd2a0294ac1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -50,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
-	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
+	ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm);
 	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
 	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9af86f39d928..2ca799f0d762 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm)
 }
 #endif
 
+#ifdef CONFIG_MMU
+static inline void mm_nr_ptes_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_ptes, 0);
+}
+
+static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_ptes);
+}
+
+static inline void mm_inc_nr_ptes(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_ptes);
+}
+
+static inline void mm_dec_nr_ptes(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_ptes);
+}
+#else
+static inline void mm_nr_ptes_init(struct mm_struct *mm) {}
+
+static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
+static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
+#endif
+
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e9e561e02d22..e42048020664 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -401,7 +401,9 @@ struct mm_struct {
 	 */
 	atomic_t mm_count;
 
+#ifdef CONFIG_MMU
 	atomic_long_t nr_ptes;			/* PTE page table pages */
+#endif
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a4eb6f289365..946922a30ede 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,7 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
-	atomic_long_set(&mm->nr_ptes, 0);
+	mm_nr_ptes_init(mm);
 	mm_nr_pmds_init(mm);
 	mm_nr_puds_init(mm);
 	mm->map_count = 0;
@@ -873,9 +873,9 @@ static void check_mm(struct mm_struct *mm)
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
 
-	if (atomic_long_read(&mm->nr_ptes))
+	if (mm_nr_ptes(mm))
 		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
-				atomic_long_read(&mm->nr_ptes));
+				mm_nr_ptes(mm));
 	if (mm_nr_pmds(mm))
 		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
 				mm_nr_pmds(mm));
diff --git a/mm/debug.c b/mm/debug.c
index a12d826bb774..c9888a6d7875 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
-		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+		mm_nr_ptes(mm),
 		mm_nr_pmds(mm),
 		mm_nr_puds(mm),
 		mm->map_count,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cc65fb87c9db..3610d81c062a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
 	}
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	if (pgtable)
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	atomic_long_inc(&mm->nr_ptes);
+	mm_inc_nr_ptes(mm);
 	return true;
 }
 
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 	if (pgtable) {
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
-		atomic_long_inc(&mm->nr_ptes);
+		mm_inc_nr_ptes(mm);
 	}
 
 	set_pmd_at(mm, addr, pmd, entry);
@@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	get_page(src_page);
 	page_dup_rmap(src_page, true);
 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-	atomic_long_inc(&dst_mm->nr_ptes);
+	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1695,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pte_free(mm, pgtable);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 }
 
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43cb3043311b..ea4ff259b671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			spin_unlock(ptl);
 			up_write(&vma->vm_mm->mmap_sem);
-			atomic_long_dec(&vma->vm_mm->nr_ptes);
+			mm_dec_nr_ptes(vma->vm_mm);
 			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
 		}
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 6bbd4078ec98..6dec21b182b0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 	pgtable_t token = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
-	atomic_long_dec(&tlb->mm->nr_ptes);
+	mm_dec_nr_ptes(tlb->mm);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -666,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 
 	ptl = pmd_lock(mm, pmd);
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
-		atomic_long_inc(&mm->nr_ptes);
+		mm_inc_nr_ptes(mm);
 		pmd_populate(mm, pmd, new);
 		new = NULL;
 	}
@@ -3238,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
 			goto map_pte;
 		}
 
-		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		mm_inc_nr_ptes(vma->vm_mm);
 		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
 		spin_unlock(vmf->ptl);
 		vmf->prealloc_pte = NULL;
@@ -3297,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
 	 * We are going to consume the prealloc table,
 	 * count that as nr_ptes.
 	 */
-	atomic_long_inc(&vma->vm_mm->nr_ptes);
+	mm_inc_nr_ptes(vma->vm_mm);
 	vmf->prealloc_pte = NULL;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f642a45b7f14..f9300141480e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,8 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) +
-		mm_nr_puds(p->mm);
+		mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm);
 	task_unlock(p);
 
 	/*
@@ -417,7 +416,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			atomic_long_read(&task->mm->nr_ptes),
+			mm_nr_ptes(task->mm),
 			mm_nr_pmds(task->mm),
 			mm_nr_puds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
-- 
cgit v1.2.3


From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:40 -0800
Subject: mm: consolidate page table accounting

Currently, we account page tables separately for each page table level,
but that's redundant -- we only make use of total memory allocated to
page tables for oom_badness calculation.  We also provide the
information to userspace, but it has dubious value there too.

This patch switches page table accounting to single counter.

mm->pgtables_bytes is now used to account all page table levels.  We use
bytes, because page table size for different levels of page table tree
may be different.

The change has user-visible effect: we don't have VmPMD and VmPUD
reported in /proc/[pid]/status.  Not sure if anybody uses them.  (As
alternative, we can always report 0 kB for them.)

OOM-killer report is also slightly changed: we now report pgtables_bytes
instead of nr_ptes, nr_pmd, nr_puds.

Apart from reducing number of counters per-mm, the benefit is that we
now calculate oom_badness() more correctly for machines which have
different size of page tables depending on level or where page tables
are less than a page in size.

The only downside can be debuggability because we do not know which page
table level could leak.  But I do not remember many bugs that would be
caught by separate counters so I wouldn't lose sleep over this.

[akpm@linux-foundation.org: fix mm/huge_memory.c]
Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
[kirill.shutemov@linux.intel.com: fix build]
  Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  1 -
 Documentation/sysctl/vm.txt        |  8 +++---
 fs/proc/task_mmu.c                 | 11 ++------
 include/linux/mm.h                 | 58 ++++++++------------------------------
 include/linux/mm_types.h           |  8 +-----
 kernel/fork.c                      | 16 +++--------
 mm/debug.c                         |  7 ++---
 mm/huge_memory.c                   |  2 +-
 mm/oom_kill.c                      | 14 ++++-----
 9 files changed, 32 insertions(+), 93 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index adba21b5ada7..ec571b9bb18a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8)
  VmExe                       size of text segment
  VmLib                       size of shared library code
  VmPTE                       size of page table entries
- VmPMD                       size of second level page tables
  VmSwap                      amount of swap used by anonymous private data
                              (shmem swap usage is not included)
  HugetlbPages                size of hugetlb memory portions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 2729e8db9492..3e579740b49f 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -629,10 +629,10 @@ oom_dump_tasks
 
 Enables a system-wide task dump (excluding kernel threads) to be produced
 when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents,
-oom_score_adj score, and name.  This is helpful to determine why the OOM
-killer was invoked, to identify the rogue task that caused it, and to
-determine why the OOM killer chose the task it did to kill.
+pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj
+score, and name.  This is helpful to determine why the OOM killer was
+invoked, to identify the rogue task that caused it, and to determine why
+the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed.  On very
 large systems with thousands of tasks it may not be feasible to dump
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd2a0294ac1..875231c36cb3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -26,7 +26,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem;
+	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -50,9 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
-	ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm);
-	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
-	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -68,8 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
-		"VmPMD:\t%8lu kB\n"
-		"VmPUD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		total_vm << (PAGE_SHIFT-10),
@@ -82,9 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		shmem << (PAGE_SHIFT-10),
 		mm->data_vm << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		ptes >> 10,
-		pmds >> 10,
-		puds >> 10,
+		mm_pgtables_bytes(mm) >> 10,
 		swap << (PAGE_SHIFT-10));
 	hugetlb_report_usage(m, mm);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2ca799f0d762..7c1e82a1aa77 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 {
 	return 0;
 }
-
-static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void mm_nr_puds_init(struct mm_struct *mm) {}
 static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
 static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
 
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
 
-static inline void mm_nr_puds_init(struct mm_struct *mm)
-{
-	atomic_long_set(&mm->nr_puds, 0);
-}
-
-static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
-{
-	return atomic_long_read(&mm->nr_puds);
-}
-
 static inline void mm_inc_nr_puds(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_puds);
+	atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_puds(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_puds);
+	atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
 }
 #endif
 
@@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 	return 0;
 }
 
-static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
-
-static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
-{
-	return 0;
-}
-
 static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
 static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
 
 #else
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
 
-static inline void mm_nr_pmds_init(struct mm_struct *mm)
-{
-	atomic_long_set(&mm->nr_pmds, 0);
-}
-
-static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
-{
-	return atomic_long_read(&mm->nr_pmds);
-}
-
 static inline void mm_inc_nr_pmds(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_pmds);
+	atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_pmds(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_pmds);
+	atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
 }
 #endif
 
 #ifdef CONFIG_MMU
-static inline void mm_nr_ptes_init(struct mm_struct *mm)
+static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
 {
-	atomic_long_set(&mm->nr_ptes, 0);
+	atomic_long_set(&mm->pgtables_bytes, 0);
 }
 
-static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
 {
-	return atomic_long_read(&mm->nr_ptes);
+	return atomic_long_read(&mm->pgtables_bytes);
 }
 
 static inline void mm_inc_nr_ptes(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_ptes);
+	atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_ptes(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_ptes);
+	atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
 }
 #else
-static inline void mm_nr_ptes_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
+static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
 {
 	return 0;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e42048020664..09643e0472fc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -402,13 +402,7 @@ struct mm_struct {
 	atomic_t mm_count;
 
 #ifdef CONFIG_MMU
-	atomic_long_t nr_ptes;			/* PTE page table pages */
-#endif
-#if CONFIG_PGTABLE_LEVELS > 2
-	atomic_long_t nr_pmds;			/* PMD page table pages */
-#endif
-#if CONFIG_PGTABLE_LEVELS > 3
-	atomic_long_t nr_puds;			/* PUD page table pages */
+	atomic_long_t pgtables_bytes;		/* PTE page table pages */
 #endif
 	int map_count;				/* number of VMAs */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 946922a30ede..006dc5899a1a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,9 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
-	mm_nr_ptes_init(mm);
-	mm_nr_pmds_init(mm);
-	mm_nr_puds_init(mm);
+	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
@@ -873,15 +871,9 @@ static void check_mm(struct mm_struct *mm)
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
 
-	if (mm_nr_ptes(mm))
-		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
-				mm_nr_ptes(mm));
-	if (mm_nr_pmds(mm))
-		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
-				mm_nr_pmds(mm));
-	if (mm_nr_puds(mm))
-		pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n",
-				mm_nr_puds(mm));
+	if (mm_pgtables_bytes(mm))
+		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+				mm_pgtables_bytes(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
diff --git a/mm/debug.c b/mm/debug.c
index c9888a6d7875..d947f3e03b0d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,8 +105,7 @@ void dump_mm(const struct mm_struct *mm)
 		"get_unmapped_area %p\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d\n"
-		"nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n"
+		"pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -136,9 +135,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
-		mm_nr_ptes(mm),
-		mm_nr_pmds(mm),
-		mm_nr_puds(mm),
+		mm_pgtables_bytes(mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 		mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3610d81c062a..86fe697e8bfb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		atomic_long_inc(&dst_mm->nr_ptes);
+		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 		ret = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f9300141480e..26add8a0d1f7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm);
+		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
 	task_unlock(p);
 
 	/*
@@ -389,15 +389,15 @@ static void select_bad_process(struct oom_control *oc)
  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
  * are not shown.
- * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
- * swapents, oom_score_adj value, and name.
+ * State information includes task's pid, uid, tgid, vm size, rss,
+ * pgtables_bytes, swapents, oom_score_adj value, and name.
  */
 static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 {
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -413,12 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			mm_nr_ptes(task->mm),
-			mm_nr_pmds(task->mm),
-			mm_nr_puds(task->mm),
+			mm_pgtables_bytes(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- 
cgit v1.2.3


From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:35:51 -0800
Subject: kmemcheck: remove annotations

Patch series "kmemcheck: kill kmemcheck", v2.

As discussed at LSF/MM, kill kmemcheck.

KASan is a replacement that is able to work without the limitation of
kmemcheck (single CPU, slow).  KASan is already upstream.

We are also not aware of any users of kmemcheck (or users who don't
consider KASan as a suitable replacement).

The only objection was that since KASAN wasn't supported by all GCC
versions provided by distros at that time we should hold off for 2
years, and try again.

Now that 2 years have passed, and all distros provide gcc that supports
KASAN, kill kmemcheck again for the very same reasons.

This patch (of 4):

Remove kmemcheck annotations, and calls to kmemcheck from the kernel.

[alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs]
  Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com
Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Hansen <devtimhansen@gmail.com>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/dma-iommu.h        |  1 -
 arch/openrisc/include/asm/dma-mapping.h |  1 -
 arch/x86/Makefile                       |  5 -----
 arch/x86/include/asm/dma-mapping.h      |  1 -
 arch/x86/include/asm/xor.h              |  5 +----
 arch/x86/kernel/traps.c                 |  5 -----
 arch/x86/mm/fault.c                     |  6 ------
 drivers/char/random.c                   |  1 -
 drivers/misc/c2port/core.c              |  2 --
 fs/dcache.c                             |  2 --
 include/linux/c2port.h                  |  4 ----
 include/linux/dma-mapping.h             |  8 +-------
 include/linux/filter.h                  |  2 --
 include/linux/mm_types.h                |  8 --------
 include/linux/net.h                     |  3 ---
 include/linux/ring_buffer.h             |  3 ---
 include/linux/skbuff.h                  |  3 ---
 include/net/inet_sock.h                 |  3 ---
 include/net/inet_timewait_sock.h        |  4 ----
 include/net/sock.h                      |  3 ---
 init/main.c                             |  1 -
 kernel/bpf/core.c                       |  6 ------
 kernel/locking/lockdep.c                |  3 ---
 kernel/trace/ring_buffer.c              |  3 ---
 mm/kmemleak.c                           |  9 ---------
 mm/page_alloc.c                         | 14 --------------
 mm/slab.c                               | 14 --------------
 mm/slab.h                               |  2 --
 mm/slub.c                               | 20 --------------------
 net/core/skbuff.c                       |  5 -----
 net/core/sock.c                         |  2 --
 net/ipv4/inet_timewait_sock.c           |  3 ---
 net/ipv4/tcp_input.c                    |  1 -
 net/socket.c                            |  1 -
 34 files changed, 2 insertions(+), 152 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h
index 0722ec6be692..6821f1249300 100644
--- a/arch/arm/include/asm/dma-iommu.h
+++ b/arch/arm/include/asm/dma-iommu.h
@@ -7,7 +7,6 @@
 #include <linux/mm_types.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
-#include <linux/kmemcheck.h>
 #include <linux/kref.h>
 
 #define ARM_MAPPING_ERROR		(~(dma_addr_t)0x0)
diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h
index f41bd3cb76d9..e212a1f0b6d2 100644
--- a/arch/openrisc/include/asm/dma-mapping.h
+++ b/arch/openrisc/include/asm/dma-mapping.h
@@ -23,7 +23,6 @@
  */
 
 #include <linux/dma-debug.h>
-#include <linux/kmemcheck.h>
 #include <linux/dma-mapping.h>
 
 extern const struct dma_map_ops or1k_dma_map_ops;
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a20eacd9c7e9..3e73bc255e4e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32
 endif
 export CONFIG_X86_X32_ABI
 
-# Don't unroll struct assignments with kmemcheck enabled
-ifeq ($(CONFIG_KMEMCHECK),y)
-	KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
-endif
-
 #
 # If the function graph tracer is used with mcount instead of fentry,
 # '-maccumulate-outgoing-args' is needed to prevent a GCC bug
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 43cbe843de8d..0350d99bb8fd 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -7,7 +7,6 @@
  * Documentation/DMA-API.txt for documentation.
  */
 
-#include <linux/kmemcheck.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <asm/io.h>
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 1f5c5161ead6..45c8605467f1 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,7 +1,4 @@
-#ifdef CONFIG_KMEMCHECK
-/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
-# include <asm-generic/xor.h>
-#elif !defined(_ASM_X86_XOR_H)
+#ifndef _ASM_X86_XOR_H
 #define _ASM_X86_XOR_H
 
 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b7b0f74a2150..989514c94a55 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -42,7 +42,6 @@
 #include <linux/edac.h>
 #endif
 
-#include <asm/kmemcheck.h>
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
@@ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	if (!dr6 && user_mode(regs))
 		user_icebp = 1;
 
-	/* Catch kmemcheck conditions! */
-	if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
-		goto exit;
-
 	/* Store the virtualized DR6 value */
 	tsk->thread.debugreg6 = dr6;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3109ba6c6ede..78ca9a8ee454 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,7 +20,6 @@
 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
-#include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
 #include <asm/vm86.h>			/* struct vm86			*/
@@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * Detect and handle instructions that would cause a page fault for
 	 * both a tracked kernel page and a userspace page.
 	 */
-	if (kmemcheck_active(regs))
-		kmemcheck_hide(regs);
 	prefetchw(&mm->mmap_sem);
 
 	if (unlikely(kmmio_fault(regs, address)))
@@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 			if (vmalloc_fault(address) >= 0)
 				return;
-
-			if (kmemcheck_fault(regs, address, error_code))
-				return;
 		}
 
 		/* Can handle a stale RO->RW TLB: */
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 6c7ccac2679e..ec42c8bb9b0d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -259,7 +259,6 @@
 #include <linux/cryptohash.h>
 #include <linux/fips.h>
 #include <linux/ptrace.h>
-#include <linux/kmemcheck.h>
 #include <linux/workqueue.h>
 #include <linux/irq.h>
 #include <linux/syscalls.h>
diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c
index 1922cb8f6b88..1c5b7aec13d4 100644
--- a/drivers/misc/c2port/core.c
+++ b/drivers/misc/c2port/core.c
@@ -15,7 +15,6 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/idr.h>
@@ -904,7 +903,6 @@ struct c2port_device *c2port_device_register(char *name,
 		return ERR_PTR(-EINVAL);
 
 	c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL);
-	kmemcheck_annotate_bitfield(c2dev, flags);
 	if (unlikely(!c2dev))
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/dcache.c b/fs/dcache.c
index bcc9f6981569..5c7df1df81ff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
 			 */
 			unsigned int i;
 			BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
-			kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN);
-			kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN);
 			for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
 				swap(((long *) &dentry->d_iname)[i],
 				     ((long *) &target->d_iname)[i]);
diff --git a/include/linux/c2port.h b/include/linux/c2port.h
index 4efabcb51347..f2736348ca26 100644
--- a/include/linux/c2port.h
+++ b/include/linux/c2port.h
@@ -9,8 +9,6 @@
  * the Free Software Foundation
  */
 
-#include <linux/kmemcheck.h>
-
 #define C2PORT_NAME_LEN			32
 
 struct device;
@@ -22,10 +20,8 @@ struct device;
 /* Main struct */
 struct c2port_ops;
 struct c2port_device {
-	kmemcheck_bitfield_begin(flags);
 	unsigned int access:1;
 	unsigned int flash_access:1;
-	kmemcheck_bitfield_end(flags);
 
 	int id;
 	char name[C2PORT_NAME_LEN];
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index eee1499db396..e8f8e8fb244d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -9,7 +9,6 @@
 #include <linux/dma-debug.h>
 #include <linux/dma-direction.h>
 #include <linux/scatterlist.h>
-#include <linux/kmemcheck.h>
 #include <linux/bug.h>
 #include <linux/mem_encrypt.h>
 
@@ -232,7 +231,6 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
-	kmemcheck_mark_initialized(ptr, size);
 	BUG_ON(!valid_dma_direction(dir));
 	addr = ops->map_page(dev, virt_to_page(ptr),
 			     offset_in_page(ptr), size,
@@ -265,11 +263,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
 				   unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-	int i, ents;
-	struct scatterlist *s;
+	int ents;
 
-	for_each_sg(sg, s, nents, i)
-		kmemcheck_mark_initialized(sg_virt(s), s->length);
 	BUG_ON(!valid_dma_direction(dir));
 	ents = ops->map_sg(dev, sg, nents, dir, attrs);
 	BUG_ON(ents < 0);
@@ -299,7 +294,6 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
-	kmemcheck_mark_initialized(page_address(page) + offset, size);
 	BUG_ON(!valid_dma_direction(dir));
 	addr = ops->map_page(dev, page, offset, size, dir, attrs);
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 48ec57e70f9f..42197b16dd78 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -454,13 +454,11 @@ struct bpf_binary_header {
 
 struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
-	kmemcheck_bitfield_begin(meta);
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1;	/* Do we need dst entry? */
-	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 09643e0472fc..cfd0ac4e5e0e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -209,14 +209,6 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 
-#ifdef CONFIG_KMEMCHECK
-	/*
-	 * kmemcheck wants to track the status of each byte in a page; this
-	 * is a pointer to such a status block. NULL if not tracked.
-	 */
-	void *shadow;
-#endif
-
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 	int _last_cpupid;
 #endif
diff --git a/include/linux/net.h b/include/linux/net.h
index d97d80d7fdf8..caeb159abda5 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -22,7 +22,6 @@
 #include <linux/random.h>
 #include <linux/wait.h>
 #include <linux/fcntl.h>	/* For O_CLOEXEC and O_NONBLOCK */
-#include <linux/kmemcheck.h>
 #include <linux/rcupdate.h>
 #include <linux/once.h>
 #include <linux/fs.h>
@@ -111,9 +110,7 @@ struct socket_wq {
 struct socket {
 	socket_state		state;
 
-	kmemcheck_bitfield_begin(type);
 	short			type;
-	kmemcheck_bitfield_end(type);
 
 	unsigned long		flags;
 
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index fa6ace66fea5..289e4d54e3e0 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -2,7 +2,6 @@
 #ifndef _LINUX_RING_BUFFER_H
 #define _LINUX_RING_BUFFER_H
 
-#include <linux/kmemcheck.h>
 #include <linux/mm.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
@@ -14,9 +13,7 @@ struct ring_buffer_iter;
  * Don't refer to this struct directly, use functions below.
  */
 struct ring_buffer_event {
-	kmemcheck_bitfield_begin(bitfield);
 	u32		type_len:5, time_delta:27;
-	kmemcheck_bitfield_end(bitfield);
 
 	u32		array[];
 };
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d448a4804aea..aa1341474916 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -15,7 +15,6 @@
 #define _LINUX_SKBUFF_H
 
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/compiler.h>
 #include <linux/time.h>
 #include <linux/bug.h>
@@ -704,7 +703,6 @@ struct sk_buff {
 	/* Following fields are _not_ copied in __copy_skb_header()
 	 * Note that queue_mapping is here mostly to fill a hole.
 	 */
-	kmemcheck_bitfield_begin(flags1);
 	__u16			queue_mapping;
 
 /* if you move cloned around you also must adapt those constants */
@@ -723,7 +721,6 @@ struct sk_buff {
 				head_frag:1,
 				xmit_more:1,
 				__unused:1; /* one bit hole */
-	kmemcheck_bitfield_end(flags1);
 
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index db8162dd8c0b..8e51b4a69088 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -17,7 +17,6 @@
 #define _INET_SOCK_H
 
 #include <linux/bitops.h>
-#include <linux/kmemcheck.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/jhash.h>
@@ -84,7 +83,6 @@ struct inet_request_sock {
 #define ireq_state		req.__req_common.skc_state
 #define ireq_family		req.__req_common.skc_family
 
-	kmemcheck_bitfield_begin(flags);
 	u16			snd_wscale : 4,
 				rcv_wscale : 4,
 				tstamp_ok  : 1,
@@ -93,7 +91,6 @@ struct inet_request_sock {
 				ecn_ok	   : 1,
 				acked	   : 1,
 				no_srccheck: 1;
-	kmemcheck_bitfield_end(flags);
 	u32                     ir_mark;
 	union {
 		struct ip_options_rcu __rcu	*ireq_opt;
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 6a75d67a30fd..1356fa6a7566 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -15,8 +15,6 @@
 #ifndef _INET_TIMEWAIT_SOCK_
 #define _INET_TIMEWAIT_SOCK_
 
-
-#include <linux/kmemcheck.h>
 #include <linux/list.h>
 #include <linux/timer.h>
 #include <linux/types.h>
@@ -69,14 +67,12 @@ struct inet_timewait_sock {
 	/* Socket demultiplex comparisons on incoming packets. */
 	/* these three are in inet_sock */
 	__be16			tw_sport;
-	kmemcheck_bitfield_begin(flags);
 	/* And these are ours. */
 	unsigned int		tw_kill		: 1,
 				tw_transparent  : 1,
 				tw_flowlabel	: 20,
 				tw_pad		: 2,	/* 2 bits hole */
 				tw_tos		: 8;
-	kmemcheck_bitfield_end(flags);
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
 };
diff --git a/include/net/sock.h b/include/net/sock.h
index c577286dbffb..a63e6a8bb7e0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -436,7 +436,6 @@ struct sock {
 #define SK_FL_TYPE_MASK    0xffff0000
 #endif
 
-	kmemcheck_bitfield_begin(flags);
 	unsigned int		sk_padding : 1,
 				sk_kern_sock : 1,
 				sk_no_check_tx : 1,
@@ -445,8 +444,6 @@ struct sock {
 				sk_protocol  : 8,
 				sk_type      : 16;
 #define SK_PROTOCOL_MAX U8_MAX
-	kmemcheck_bitfield_end(flags);
-
 	u16			sk_gso_max_segs;
 	unsigned long	        sk_lingertime;
 	struct proto		*sk_prot_creator;
diff --git a/init/main.c b/init/main.c
index 3bdd8da90f69..859a786f7c0a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -70,7 +70,6 @@
 #include <linux/kgdb.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
-#include <linux/kmemcheck.h>
 #include <linux/sfi.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b62df86be1d..11ad089f2c74 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	if (fp == NULL)
 		return NULL;
 
-	kmemcheck_annotate_bitfield(fp, meta);
-
 	aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
 	if (aux == NULL) {
 		vfree(fp);
@@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 	if (fp == NULL) {
 		__bpf_prog_uncharge(fp_old->aux->user, delta);
 	} else {
-		kmemcheck_annotate_bitfield(fp, meta);
-
 		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
 		fp->pages = pages;
 		fp->aux->prog = fp;
@@ -662,8 +658,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
 
 	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
 	if (fp != NULL) {
-		kmemcheck_annotate_bitfield(fp, meta);
-
 		/* aux->prog still points to the fp_other one, so
 		 * when promoting the clone to the real program,
 		 * this still needs to be adapted.
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index db933d063bfc..9776da8db180 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -47,7 +47,6 @@
 #include <linux/stringify.h>
 #include <linux/bitops.h>
 #include <linux/gfp.h>
-#include <linux/kmemcheck.h>
 #include <linux/random.h>
 #include <linux/jhash.h>
 
@@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
 {
 	int i;
 
-	kmemcheck_mark_initialized(lock, sizeof(*lock));
-
 	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
 		lock->class_cache[i] = NULL;
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 845f3805c73d..d57fede84b38 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -13,7 +13,6 @@
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
 #include <linux/kthread.h>	/* for self test */
-#include <linux/kmemcheck.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 
 	event = __rb_page_index(tail_page, tail);
-	kmemcheck_annotate_bitfield(event, bitfield);
 
 	/* account for padding bytes */
 	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
@@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	/* We reserved something on the buffer */
 
 	event = __rb_page_index(tail_page, tail);
-	kmemcheck_annotate_bitfield(event, bitfield);
 	rb_update_event(cpu_buffer, event, info);
 
 	local_inc(&tail_page->entries);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index fca3452e56c1..e4738d5e9b8c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -110,7 +110,6 @@
 #include <linux/atomic.h>
 
 #include <linux/kasan.h>
-#include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
 #include <linux/memory_hotplug.h>
 
@@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object)
 {
 	u32 old_csum = object->checksum;
 
-	if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
-		return false;
-
 	kasan_disable_current();
 	object->checksum = crc32(0, (void *)object->pointer, object->size);
 	kasan_enable_current();
@@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end,
 		if (scan_should_stop())
 			break;
 
-		/* don't scan uninitialized memory */
-		if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
-						  BYTES_PER_POINTER))
-			continue;
-
 		kasan_disable_current();
 		pointer = *ptr;
 		kasan_enable_current();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e6106d7e9eb0..30a464b47366 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
 #include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/kasan.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -1013,7 +1012,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
 	trace_mm_page_free(page, order);
-	kmemcheck_free_shadow(page, order);
 
 	/*
 	 * Check tail pages before head page information is cleared to
@@ -2669,15 +2667,6 @@ void split_page(struct page *page, unsigned int order)
 	VM_BUG_ON_PAGE(PageCompound(page), page);
 	VM_BUG_ON_PAGE(!page_count(page), page);
 
-#ifdef CONFIG_KMEMCHECK
-	/*
-	 * Split shadow pages too, because free(page[0]) would
-	 * otherwise free the whole shadow.
-	 */
-	if (kmemcheck_page_is_tracked(page))
-		split_page(virt_to_page(page[0].shadow), order);
-#endif
-
 	for (i = 1; i < (1 << order); i++)
 		set_page_refcounted(page + i);
 	split_page_owner(page, order);
@@ -4223,9 +4212,6 @@ out:
 		page = NULL;
 	}
 
-	if (kmemcheck_enabled && page)
-		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
 	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
 
 	return page;
diff --git a/mm/slab.c b/mm/slab.c
index 7a5e0888a401..c84365e9a591 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,7 +114,6 @@
 #include	<linux/rtmutex.h>
 #include	<linux/reciprocal_div.h>
 #include	<linux/debugobjects.h>
-#include	<linux/kmemcheck.h>
 #include	<linux/memory.h>
 #include	<linux/prefetch.h>
 #include	<linux/sched/task_stack.h>
@@ -1433,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 	if (sk_memalloc_socks() && page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
-	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
-		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
-
-		if (cachep->ctor)
-			kmemcheck_mark_uninitialized_pages(page, nr_pages);
-		else
-			kmemcheck_mark_unallocated_pages(page, nr_pages);
-	}
-
 	return page;
 }
 
@@ -1453,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
 	int order = cachep->gfporder;
 	unsigned long nr_freed = (1 << order);
 
-	kmemcheck_free_shadow(page, order);
-
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 		mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
 	else
@@ -3515,8 +3503,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
 	kmemleak_free_recursive(objp, cachep->flags);
 	objp = cache_free_debugcheck(cachep, objp, caller);
 
-	kmemcheck_slab_free(cachep, objp, cachep->object_size);
-
 	/*
 	 * Skip calling cache_free_alien() when the platform is not numa.
 	 * This will avoid cache misses that happen while accessing slabp (which
diff --git a/mm/slab.h b/mm/slab.h
index e19255638cb6..e60a3d1d8f6f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -40,7 +40,6 @@ struct kmem_cache {
 
 #include <linux/memcontrol.h>
 #include <linux/fault-inject.h>
-#include <linux/kmemcheck.h>
 #include <linux/kasan.h>
 #include <linux/kmemleak.h>
 #include <linux/random.h>
@@ -439,7 +438,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
 	for (i = 0; i < size; i++) {
 		void *object = p[i];
 
-		kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 		kmemleak_alloc_recursive(object, s->object_size, 1,
 					 s->flags, flags);
 		kasan_slab_alloc(s, object, flags);
diff --git a/mm/slub.c b/mm/slub.c
index 51484f0fc068..ac3b50b9abec 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -22,7 +22,6 @@
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
 #include <linux/kasan.h>
-#include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/mempolicy.h>
@@ -1377,7 +1376,6 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x)
 		unsigned long flags;
 
 		local_irq_save(flags);
-		kmemcheck_slab_free(s, x, s->object_size);
 		debug_check_no_locks_freed(x, s->object_size);
 		local_irq_restore(flags);
 	}
@@ -1598,22 +1596,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 		stat(s, ORDER_FALLBACK);
 	}
 
-	if (kmemcheck_enabled &&
-	    !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
-		int pages = 1 << oo_order(oo);
-
-		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
-
-		/*
-		 * Objects from caches that have a constructor don't get
-		 * cleared when they're allocated, so we need to do it here.
-		 */
-		if (s->ctor)
-			kmemcheck_mark_uninitialized_pages(page, pages);
-		else
-			kmemcheck_mark_unallocated_pages(page, pages);
-	}
-
 	page->objects = oo_objects(oo);
 
 	order = compound_order(page);
@@ -1689,8 +1671,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 			check_object(s, page, p, SLUB_RED_INACTIVE);
 	}
 
-	kmemcheck_free_shadow(page, compound_order(page));
-
 	mod_lruvec_page_state(page,
 		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e140ba49b30a..6cd057b41f34 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -41,7 +41,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/in.h>
@@ -234,14 +233,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	shinfo = skb_shinfo(skb);
 	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
-	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
 	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff_fclones *fclones;
 
 		fclones = container_of(skb, struct sk_buff_fclones, skb1);
 
-		kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
 		skb->fclone = SKB_FCLONE_ORIG;
 		refcount_set(&fclones->fclone_ref, 1);
 
@@ -301,7 +298,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 	shinfo = skb_shinfo(skb);
 	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
-	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
 	return skb;
 }
@@ -1283,7 +1279,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		if (!n)
 			return NULL;
 
-		kmemcheck_annotate_bitfield(n, flags1);
 		n->fclone = SKB_FCLONE_UNAVAILABLE;
 	}
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 415f441c63b9..78401fa33ce8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1469,8 +1469,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		sk = kmalloc(prot->obj_size, priority);
 
 	if (sk != NULL) {
-		kmemcheck_annotate_bitfield(sk, flags);
-
 		if (security_sk_alloc(sk, family, priority))
 			goto out_free;
 
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 5b039159e67a..d451b9f19b59 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -9,7 +9,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <net/inet_hashtables.h>
@@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 	if (tw) {
 		const struct inet_sock *inet = inet_sk(sk);
 
-		kmemcheck_annotate_bitfield(tw, flags);
-
 		tw->tw_dr	    = dr;
 		/* Give us an identity. */
 		tw->tw_daddr	    = inet->inet_daddr;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 887585045b27..c04d60a677a7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6195,7 +6195,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
 	if (req) {
 		struct inet_request_sock *ireq = inet_rsk(req);
 
-		kmemcheck_annotate_bitfield(ireq, flags);
 		ireq->ireq_opt = NULL;
 #if IS_ENABLED(CONFIG_IPV6)
 		ireq->pktopts = NULL;
diff --git a/net/socket.c b/net/socket.c
index c729625eb5d3..42d8e9c9ccd5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -568,7 +568,6 @@ struct socket *sock_alloc(void)
 
 	sock = SOCKET_I(inode);
 
-	kmemcheck_annotate_bitfield(sock, type);
 	inode->i_ino = get_next_ino();
 	inode->i_mode = S_IFSOCK | S_IRWXUGO;
 	inode->i_uid = current_fsuid();
-- 
cgit v1.2.3


From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:35:54 -0800
Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK

Convert all allocations that used a NOTRACK flag to stop using it.

Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Hansen <devtimhansen@gmail.com>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/pgalloc.h       |  2 +-
 arch/arm64/include/asm/pgalloc.h     |  2 +-
 arch/powerpc/include/asm/pgalloc.h   |  2 +-
 arch/sh/kernel/dwarf.c               |  4 ++--
 arch/sh/kernel/process.c             |  2 +-
 arch/sparc/mm/init_64.c              |  4 ++--
 arch/unicore32/include/asm/pgalloc.h |  2 +-
 arch/x86/kernel/espfix_64.c          |  2 +-
 arch/x86/mm/init.c                   |  3 +--
 arch/x86/mm/init_64.c                |  2 +-
 arch/x86/mm/pageattr.c               | 10 +++++-----
 arch/x86/mm/pgtable.c                |  2 +-
 arch/x86/platform/efi/efi_64.c       |  2 +-
 crypto/xor.c                         |  7 +------
 include/linux/thread_info.h          |  5 ++---
 init/do_mounts.c                     |  3 +--
 kernel/fork.c                        | 12 ++++++------
 kernel/signal.c                      |  3 +--
 mm/kmemcheck.c                       |  2 +-
 mm/slab.c                            |  2 +-
 mm/slab.h                            |  5 ++---
 mm/slab_common.c                     |  2 +-
 mm/slub.c                            |  4 +---
 23 files changed, 36 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index b2902a5cd780..2d7344f0e208 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 
 static inline void clean_pte_table(pte_t *pte)
 {
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index d25f4f137c2a..5ca6a573a701 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -26,7 +26,7 @@
 
 #define check_pgt_cache()		do { } while (0)
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
 
 #if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index a14203c005f1..e11f03007b57 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp)
 }
 #endif /* MODULE */
 
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
 
 #ifdef CONFIG_PPC_BOOK3S
 #include <asm/book3s/pgalloc.h>
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index e1d751ae2498..1a2526676a87 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void)
 
 	dwarf_frame_cachep = kmem_cache_create("dwarf_frames",
 			sizeof(struct dwarf_frame), 0,
-			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
+			SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
 
 	dwarf_reg_cachep = kmem_cache_create("dwarf_regs",
 			sizeof(struct dwarf_reg), 0,
-			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
+			SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
 
 	dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ,
 						    dwarf_frame_cachep);
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index b2d9963d5978..68b1a67533ce 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -59,7 +59,7 @@ void arch_task_cache_init(void)
 
 	task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size,
 					       __alignof__(union thread_xstate),
-					       SLAB_PANIC | SLAB_NOTRACK, NULL);
+					       SLAB_PANIC, NULL);
 }
 
 #ifdef CONFIG_SH_FPU_EMU
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 61bdc1270d19..2de22d703076 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2927,7 +2927,7 @@ void __flush_tlb_all(void)
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 			    unsigned long address)
 {
-	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	pte_t *pte = NULL;
 
 	if (page)
@@ -2939,7 +2939,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 pgtable_t pte_alloc_one(struct mm_struct *mm,
 			unsigned long address)
 {
-	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page)
 		return NULL;
 	if (!pgtable_page_ctor(page)) {
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
index 26775793c204..f0fdb268f8f2 100644
--- a/arch/unicore32/include/asm/pgalloc.h
+++ b/arch/unicore32/include/asm/pgalloc.h
@@ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd);
 #define pgd_alloc(mm)			get_pgd_slow(mm)
 #define pgd_free(mm, pgd)		free_pgd_slow(mm, pgd)
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 
 /*
  * Allocate one PTE table.
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 7d7715dde901..e5ec3cafa72e 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,7 +57,7 @@
 # error "Need more virtual address space for the ESPFIX hack"
 #endif
 
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
 
 /* This contains the *bottom* address of the espfix stack */
 DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..ef94620ceb8a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num)
 		unsigned int order;
 
 		order = get_order((unsigned long)num << PAGE_SHIFT);
-		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
-						__GFP_ZERO, order);
+		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
 	}
 
 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index adcea90a2046..5fa3a58b5d78 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
 	void *ptr;
 
 	if (after_bootmem)
-		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
 	else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3fe68483463c..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 
 	if (!debug_pagealloc_enabled())
 		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	base = alloc_pages(GFP_KERNEL, 0);
 	if (!debug_pagealloc_enabled())
 		spin_lock(&cpa_lock);
 	if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
 
 static int alloc_pte_page(pmd_t *pmd)
 {
-	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
 	if (!pte)
 		return -1;
 
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
 
 static int alloc_pmd_page(pud_t *pud)
 {
-	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!pmd)
 		return -1;
 
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	pgd_entry = cpa->pgd + pgd_index(addr);
 
 	if (pgd_none(*pgd_entry)) {
-		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
 		if (!p4d)
 			return -1;
 
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	 */
 	p4d = p4d_offset(pgd_entry, addr);
 	if (p4d_none(*p4d)) {
-		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
 		if (!pud)
 			return -1;
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..96d456a94b03 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -7,7 +7,7 @@
 #include <asm/fixmap.h>
 #include <asm/mtrr.h>
 
-#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9e4ee5b04b2d..6a151ce70e86 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void)
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return 0;
 
-	gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
+	gfp_mask = GFP_KERNEL | __GFP_ZERO;
 	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
 	if (!efi_pgd)
 		return -ENOMEM;
diff --git a/crypto/xor.c b/crypto/xor.c
index 263af9fb45ea..bce9fe7af40a 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -122,12 +122,7 @@ calibrate_xor_blocks(void)
 		goto out;
 	}
 
-	/*
-	 * Note: Since the memory is not actually used for _anything_ but to
-	 * test the XOR speed, we don't really want kmemcheck to warn about
-	 * reading uninitialized bytes here.
-	 */
-	b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2);
+	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
 	if (!b1) {
 		printk(KERN_WARNING "xor: Yikes!  No memory available.\n");
 		return -ENOMEM;
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 4bcdf00c110f..34f053a150a9 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -44,10 +44,9 @@ enum {
 #endif
 
 #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
-				 __GFP_ZERO)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 #else
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT)
 #endif
 
 /*
diff --git a/init/do_mounts.c b/init/do_mounts.c
index f6d4dd764a52..7cf4f6dafd5f 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -380,8 +380,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data)
 
 void __init mount_block_root(char *name, int flags)
 {
-	struct page *page = alloc_page(GFP_KERNEL |
-					__GFP_NOTRACK_FALSE_POSITIVE);
+	struct page *page = alloc_page(GFP_KERNEL);
 	char *fs_names = page_address(page);
 	char *p;
 #ifdef CONFIG_BLOCK
diff --git a/kernel/fork.c b/kernel/fork.c
index 006dc5899a1a..4e55eedba8d6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -469,7 +469,7 @@ void __init fork_init(void)
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep = kmem_cache_create("task_struct",
 			arch_task_struct_size, align,
-			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
+			SLAB_PANIC|SLAB_ACCOUNT, NULL);
 #endif
 
 	/* do the arch specific task caches init */
@@ -2205,18 +2205,18 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
-			SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
+			SLAB_ACCOUNT, sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	/*
 	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
@@ -2227,7 +2227,7 @@ void __init proc_caches_init(void)
 	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
 	mmap_init();
diff --git a/kernel/signal.c b/kernel/signal.c
index 8dcd8825b2de..aa1fb9f905db 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	else
 		override_rlimit = 0;
 
-	q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
-		override_rlimit);
+	q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
 	if (q) {
 		list_add_tail(&q->list, &pending->list);
 		switch ((unsigned long) info) {
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 800d64b854ea..b3a4d61d341c 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -18,7 +18,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
 	 * With kmemcheck enabled, we need to allocate a memory area for the
 	 * shadow bits as well.
 	 */
-	shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
+	shadow = alloc_pages_node(node, flags, order);
 	if (!shadow) {
 		if (printk_ratelimit())
 			pr_err("kmemcheck: failed to allocate shadow bitmap\n");
diff --git a/mm/slab.c b/mm/slab.c
index c84365e9a591..183e996dde5f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1410,7 +1410,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 
 	flags |= cachep->allocflags;
 
-	page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+	page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
 	if (!page) {
 		slab_out_of_memory(cachep, flags, nodeid);
 		return NULL;
diff --git a/mm/slab.h b/mm/slab.h
index e60a3d1d8f6f..ad657ffa44e5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -141,10 +141,10 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
 #if defined(CONFIG_SLAB)
 #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
 			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
-			  SLAB_NOTRACK | SLAB_ACCOUNT)
+			  SLAB_ACCOUNT)
 #elif defined(CONFIG_SLUB)
 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
-			  SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
+			  SLAB_TEMPORARY | SLAB_ACCOUNT)
 #else
 #define SLAB_CACHE_FLAGS (0)
 #endif
@@ -163,7 +163,6 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
 			      SLAB_NOLEAKTRACE | \
 			      SLAB_RECLAIM_ACCOUNT | \
 			      SLAB_TEMPORARY | \
-			      SLAB_NOTRACK | \
 			      SLAB_ACCOUNT)
 
 int __kmem_cache_shutdown(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 175e86637afd..c8cb36774ba1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
 		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
-			 SLAB_NOTRACK | SLAB_ACCOUNT)
+			 SLAB_ACCOUNT)
 
 /*
  * Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index ac3b50b9abec..91aa99b4b836 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1436,8 +1436,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
 	struct page *page;
 	int order = oo_order(oo);
 
-	flags |= __GFP_NOTRACK;
-
 	if (node == NUMA_NO_NODE)
 		page = alloc_pages(flags, order);
 	else
@@ -3774,7 +3772,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	struct page *page;
 	void *ptr = NULL;
 
-	flags |= __GFP_COMP | __GFP_NOTRACK;
+	flags |= __GFP_COMP;
 	page = alloc_pages_node(node, flags, get_order(size));
 	if (page)
 		ptr = page_address(page);
-- 
cgit v1.2.3


From 4675ff05de2d76d167336b368bd07f3fef6ed5a6 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:36:02 -0800
Subject: kmemcheck: rip it out

Fix up makefiles, remove references, and git rm kmemcheck.

Link: http://lkml.kernel.org/r/20171007030159.22241-4-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Tim Hansen <devtimhansen@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |   7 -
 Documentation/dev-tools/index.rst               |   1 -
 Documentation/dev-tools/kmemcheck.rst           | 733 ------------------------
 MAINTAINERS                                     |  10 -
 arch/x86/Kconfig                                |   3 +-
 arch/x86/include/asm/kmemcheck.h                |  42 --
 arch/x86/include/asm/string_32.h                |   9 -
 arch/x86/include/asm/string_64.h                |   8 -
 arch/x86/kernel/cpu/intel.c                     |  15 -
 arch/x86/mm/Makefile                            |   2 -
 arch/x86/mm/init.c                              |   5 +-
 arch/x86/mm/kmemcheck/Makefile                  |   1 -
 arch/x86/mm/kmemcheck/error.c                   | 227 --------
 arch/x86/mm/kmemcheck/error.h                   |  15 -
 arch/x86/mm/kmemcheck/kmemcheck.c               | 658 ---------------------
 arch/x86/mm/kmemcheck/opcode.c                  | 106 ----
 arch/x86/mm/kmemcheck/opcode.h                  |   9 -
 arch/x86/mm/kmemcheck/pte.c                     |  22 -
 arch/x86/mm/kmemcheck/pte.h                     |  10 -
 arch/x86/mm/kmemcheck/selftest.c                |  70 ---
 arch/x86/mm/kmemcheck/selftest.h                |   6 -
 arch/x86/mm/kmemcheck/shadow.c                  | 173 ------
 arch/x86/mm/kmemcheck/shadow.h                  |  18 -
 include/linux/interrupt.h                       |  15 -
 include/linux/kmemcheck.h                       | 171 ------
 kernel/softirq.c                                |  10 -
 kernel/sysctl.c                                 |  10 -
 lib/Kconfig.debug                               |   6 +-
 lib/Kconfig.kmemcheck                           |  94 ---
 mm/Kconfig.debug                                |   1 -
 mm/Makefile                                     |   2 -
 mm/kmemcheck.c                                  | 125 ----
 mm/slub.c                                       |   5 +-
 scripts/kernel-doc                              |   2 -
 tools/include/linux/kmemcheck.h                 |   8 -
 35 files changed, 7 insertions(+), 2592 deletions(-)
 delete mode 100644 Documentation/dev-tools/kmemcheck.rst
 delete mode 100644 arch/x86/mm/kmemcheck/Makefile
 delete mode 100644 arch/x86/mm/kmemcheck/kmemcheck.c
 delete mode 100644 arch/x86/mm/kmemcheck/shadow.c
 delete mode 100644 lib/Kconfig.kmemcheck

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b74e13312fdc..00bb04972612 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1864,13 +1864,6 @@
 			Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
 			the default is off.
 
-	kmemcheck=	[X86] Boot-time kmemcheck enable/disable/one-shot mode
-			Valid arguments: 0, 1, 2
-			kmemcheck=0 (disabled)
-			kmemcheck=1 (enabled)
-			kmemcheck=2 (one-shot mode)
-			Default: 2 (one-shot mode)
-
 	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
 			Default is 0 (don't ignore, but inject #GP)
 
diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst
index a81787cd47d7..e313925fb0fa 100644
--- a/Documentation/dev-tools/index.rst
+++ b/Documentation/dev-tools/index.rst
@@ -21,7 +21,6 @@ whole; patches welcome!
    kasan
    ubsan
    kmemleak
-   kmemcheck
    gdb-kernel-debugging
    kgdb
    kselftest
diff --git a/Documentation/dev-tools/kmemcheck.rst b/Documentation/dev-tools/kmemcheck.rst
deleted file mode 100644
index 7f3d1985de74..000000000000
--- a/Documentation/dev-tools/kmemcheck.rst
+++ /dev/null
@@ -1,733 +0,0 @@
-Getting started with kmemcheck
-==============================
-
-Vegard Nossum <vegardno@ifi.uio.no>
-
-
-Introduction
-------------
-
-kmemcheck is a debugging feature for the Linux Kernel. More specifically, it
-is a dynamic checker that detects and warns about some uses of uninitialized
-memory.
-
-Userspace programmers might be familiar with Valgrind's memcheck. The main
-difference between memcheck and kmemcheck is that memcheck works for userspace
-programs only, and kmemcheck works for the kernel only. The implementations
-are of course vastly different. Because of this, kmemcheck is not as accurate
-as memcheck, but it turns out to be good enough in practice to discover real
-programmer errors that the compiler is not able to find through static
-analysis.
-
-Enabling kmemcheck on a kernel will probably slow it down to the extent that
-the machine will not be usable for normal workloads such as e.g. an
-interactive desktop. kmemcheck will also cause the kernel to use about twice
-as much memory as normal. For this reason, kmemcheck is strictly a debugging
-feature.
-
-
-Downloading
------------
-
-As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel.
-
-
-Configuring and compiling
--------------------------
-
-kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of
-configuration variables must have specific settings in order for the kmemcheck
-menu to even appear in "menuconfig". These are:
-
-- ``CONFIG_CC_OPTIMIZE_FOR_SIZE=n``
-	This option is located under "General setup" / "Optimize for size".
-
-	Without this, gcc will use certain optimizations that usually lead to
-	false positive warnings from kmemcheck. An example of this is a 16-bit
-	field in a struct, where gcc may load 32 bits, then discard the upper
-	16 bits. kmemcheck sees only the 32-bit load, and may trigger a
-	warning for the upper 16 bits (if they're uninitialized).
-
-- ``CONFIG_SLAB=y`` or ``CONFIG_SLUB=y``
-	This option is located under "General setup" / "Choose SLAB
-	allocator".
-
-- ``CONFIG_FUNCTION_TRACER=n``
-	This option is located under "Kernel hacking" / "Tracers" / "Kernel
-	Function Tracer"
-
-	When function tracing is compiled in, gcc emits a call to another
-	function at the beginning of every function. This means that when the
-	page fault handler is called, the ftrace framework will be called
-	before kmemcheck has had a chance to handle the fault. If ftrace then
-	modifies memory that was tracked by kmemcheck, the result is an
-	endless recursive page fault.
-
-- ``CONFIG_DEBUG_PAGEALLOC=n``
-	This option is located under "Kernel hacking" / "Memory Debugging"
-	/ "Debug page memory allocations".
-
-In addition, I highly recommend turning on ``CONFIG_DEBUG_INFO=y``. This is also
-located under "Kernel hacking". With this, you will be able to get line number
-information from the kmemcheck warnings, which is extremely valuable in
-debugging a problem. This option is not mandatory, however, because it slows
-down the compilation process and produces a much bigger kernel image.
-
-Now the kmemcheck menu should be visible (under "Kernel hacking" / "Memory
-Debugging" / "kmemcheck: trap use of uninitialized memory"). Here follows
-a description of the kmemcheck configuration variables:
-
-- ``CONFIG_KMEMCHECK``
-	This must be enabled in order to use kmemcheck at all...
-
-- ``CONFIG_KMEMCHECK_``[``DISABLED`` | ``ENABLED`` | ``ONESHOT``]``_BY_DEFAULT``
-	This option controls the status of kmemcheck at boot-time. "Enabled"
-	will enable kmemcheck right from the start, "disabled" will boot the
-	kernel as normal (but with the kmemcheck code compiled in, so it can
-	be enabled at run-time after the kernel has booted), and "one-shot" is
-	a special mode which will turn kmemcheck off automatically after
-	detecting the first use of uninitialized memory.
-
-	If you are using kmemcheck to actively debug a problem, then you
-	probably want to choose "enabled" here.
-
-	The one-shot mode is mostly useful in automated test setups because it
-	can prevent floods of warnings and increase the chances of the machine
-	surviving in case something is really wrong. In other cases, the one-
-	shot mode could actually be counter-productive because it would turn
-	itself off at the very first error -- in the case of a false positive
-	too -- and this would come in the way of debugging the specific
-	problem you were interested in.
-
-	If you would like to use your kernel as normal, but with a chance to
-	enable kmemcheck in case of some problem, it might be a good idea to
-	choose "disabled" here. When kmemcheck is disabled, most of the run-
-	time overhead is not incurred, and the kernel will be almost as fast
-	as normal.
-
-- ``CONFIG_KMEMCHECK_QUEUE_SIZE``
-	Select the maximum number of error reports to store in an internal
-	(fixed-size) buffer. Since errors can occur virtually anywhere and in
-	any context, we need a temporary storage area which is guaranteed not
-	to generate any other page faults when accessed. The queue will be
-	emptied as soon as a tasklet may be scheduled. If the queue is full,
-	new error reports will be lost.
-
-	The default value of 64 is probably fine. If some code produces more
-	than 64 errors within an irqs-off section, then the code is likely to
-	produce many, many more, too, and these additional reports seldom give
-	any more information (the first report is usually the most valuable
-	anyway).
-
-	This number might have to be adjusted if you are not using serial
-	console or similar to capture the kernel log. If you are using the
-	"dmesg" command to save the log, then getting a lot of kmemcheck
-	warnings might overflow the kernel log itself, and the earlier reports
-	will get lost in that way instead. Try setting this to 10 or so on
-	such a setup.
-
-- ``CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT``
-	Select the number of shadow bytes to save along with each entry of the
-	error-report queue. These bytes indicate what parts of an allocation
-	are initialized, uninitialized, etc. and will be displayed when an
-	error is detected to help the debugging of a particular problem.
-
-	The number entered here is actually the logarithm of the number of
-	bytes that will be saved. So if you pick for example 5 here, kmemcheck
-	will save 2^5 = 32 bytes.
-
-	The default value should be fine for debugging most problems. It also
-	fits nicely within 80 columns.
-
-- ``CONFIG_KMEMCHECK_PARTIAL_OK``
-	This option (when enabled) works around certain GCC optimizations that
-	produce 32-bit reads from 16-bit variables where the upper 16 bits are
-	thrown away afterwards.
-
-	The default value (enabled) is recommended. This may of course hide
-	some real errors, but disabling it would probably produce a lot of
-	false positives.
-
-- ``CONFIG_KMEMCHECK_BITOPS_OK``
-	This option silences warnings that would be generated for bit-field
-	accesses where not all the bits are initialized at the same time. This
-	may also hide some real bugs.
-
-	This option is probably obsolete, or it should be replaced with
-	the kmemcheck-/bitfield-annotations for the code in question. The
-	default value is therefore fine.
-
-Now compile the kernel as usual.
-
-
-How to use
-----------
-
-Booting
-~~~~~~~
-
-First some information about the command-line options. There is only one
-option specific to kmemcheck, and this is called "kmemcheck". It can be used
-to override the default mode as chosen by the ``CONFIG_KMEMCHECK_*_BY_DEFAULT``
-option. Its possible settings are:
-
-- ``kmemcheck=0`` (disabled)
-- ``kmemcheck=1`` (enabled)
-- ``kmemcheck=2`` (one-shot mode)
-
-If SLUB debugging has been enabled in the kernel, it may take precedence over
-kmemcheck in such a way that the slab caches which are under SLUB debugging
-will not be tracked by kmemcheck. In order to ensure that this doesn't happen
-(even though it shouldn't by default), use SLUB's boot option ``slub_debug``,
-like this: ``slub_debug=-``
-
-In fact, this option may also be used for fine-grained control over SLUB vs.
-kmemcheck. For example, if the command line includes
-``kmemcheck=1 slub_debug=,dentry``, then SLUB debugging will be used only
-for the "dentry" slab cache, and with kmemcheck tracking all the other
-caches. This is advanced usage, however, and is not generally recommended.
-
-
-Run-time enable/disable
-~~~~~~~~~~~~~~~~~~~~~~~
-
-When the kernel has booted, it is possible to enable or disable kmemcheck at
-run-time. WARNING: This feature is still experimental and may cause false
-positive warnings to appear. Therefore, try not to use this. If you find that
-it doesn't work properly (e.g. you see an unreasonable amount of warnings), I
-will be happy to take bug reports.
-
-Use the file ``/proc/sys/kernel/kmemcheck`` for this purpose, e.g.::
-
-	$ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck
-
-The numbers are the same as for the ``kmemcheck=`` command-line option.
-
-
-Debugging
-~~~~~~~~~
-
-A typical report will look something like this::
-
-    WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
-    80000000000000000000000000000000000000000088ffff0000000000000000
-     i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
-             ^
-
-    Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A
-    RIP: 0010:[<ffffffff8104ede8>]  [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
-    RSP: 0018:ffff88003cdf7d98  EFLAGS: 00210002
-    RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
-    RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84
-    RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000
-    R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e
-    R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8
-    FS:  0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000
-    CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
-    CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0
-    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
-    DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
-     [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
-     [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
-     [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
-     [<ffffffff8100c7b5>] int_signal+0x12/0x17
-     [<ffffffffffffffff>] 0xffffffffffffffff
-
-The single most valuable information in this report is the RIP (or EIP on 32-
-bit) value. This will help us pinpoint exactly which instruction that caused
-the warning.
-
-If your kernel was compiled with ``CONFIG_DEBUG_INFO=y``, then all we have to do
-is give this address to the addr2line program, like this::
-
-	$ addr2line -e vmlinux -i ffffffff8104ede8
-	arch/x86/include/asm/string_64.h:12
-	include/asm-generic/siginfo.h:287
-	kernel/signal.c:380
-	kernel/signal.c:410
-
-The "``-e vmlinux``" tells addr2line which file to look in. **IMPORTANT:**
-This must be the vmlinux of the kernel that produced the warning in the
-first place! If not, the line number information will almost certainly be
-wrong.
-
-The "``-i``" tells addr2line to also print the line numbers of inlined
-functions.  In this case, the flag was very important, because otherwise,
-it would only have printed the first line, which is just a call to
-``memcpy()``, which could be called from a thousand places in the kernel, and
-is therefore not very useful.  These inlined functions would not show up in
-the stack trace above, simply because the kernel doesn't load the extra
-debugging information. This technique can of course be used with ordinary
-kernel oopses as well.
-
-In this case, it's the caller of ``memcpy()`` that is interesting, and it can be
-found in ``include/asm-generic/siginfo.h``, line 287::
-
-    281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
-    282 {
-    283         if (from->si_code < 0)
-    284                 memcpy(to, from, sizeof(*to));
-    285         else
-    286                 /* _sigchld is currently the largest know union member */
-    287                 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
-    288 }
-
-Since this was a read (kmemcheck usually warns about reads only, though it can
-warn about writes to unallocated or freed memory as well), it was probably the
-"from" argument which contained some uninitialized bytes. Following the chain
-of calls, we move upwards to see where "from" was allocated or initialized,
-``kernel/signal.c``, line 380::
-
-    359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
-    360 {
-    ...
-    367         list_for_each_entry(q, &list->list, list) {
-    368                 if (q->info.si_signo == sig) {
-    369                         if (first)
-    370                                 goto still_pending;
-    371                         first = q;
-    ...
-    377         if (first) {
-    378 still_pending:
-    379                 list_del_init(&first->list);
-    380                 copy_siginfo(info, &first->info);
-    381                 __sigqueue_free(first);
-    ...
-    392         }
-    393 }
-
-Here, it is ``&first->info`` that is being passed on to ``copy_siginfo()``. The
-variable ``first`` was found on a list -- passed in as the second argument to
-``collect_signal()``. We  continue our journey through the stack, to figure out
-where the item on "list" was allocated or initialized. We move to line 410::
-
-    395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
-    396                         siginfo_t *info)
-    397 {
-    ...
-    410                 collect_signal(sig, pending, info);
-    ...
-    414 }
-
-Now we need to follow the ``pending`` pointer, since that is being passed on to
-``collect_signal()`` as ``list``. At this point, we've run out of lines from the
-"addr2line" output. Not to worry, we just paste the next addresses from the
-kmemcheck stack dump, i.e.::
-
-     [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
-     [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
-     [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
-     [<ffffffff8100c7b5>] int_signal+0x12/0x17
-
-	$ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \
-		ffffffff8100b87d ffffffff8100c7b5
-	kernel/signal.c:446
-	kernel/signal.c:1806
-	arch/x86/kernel/signal.c:805
-	arch/x86/kernel/signal.c:871
-	arch/x86/kernel/entry_64.S:694
-
-Remember that since these addresses were found on the stack and not as the
-RIP value, they actually point to the _next_ instruction (they are return
-addresses). This becomes obvious when we look at the code for line 446::
-
-    422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
-    423 {
-    ...
-    431                 signr = __dequeue_signal(&tsk->signal->shared_pending,
-    432						 mask, info);
-    433			/*
-    434			 * itimer signal ?
-    435			 *
-    436			 * itimers are process shared and we restart periodic
-    437			 * itimers in the signal delivery path to prevent DoS
-    438			 * attacks in the high resolution timer case. This is
-    439			 * compliant with the old way of self restarting
-    440			 * itimers, as the SIGALRM is a legacy signal and only
-    441			 * queued once. Changing the restart behaviour to
-    442			 * restart the timer in the signal dequeue path is
-    443			 * reducing the timer noise on heavy loaded !highres
-    444			 * systems too.
-    445			 */
-    446			if (unlikely(signr == SIGALRM)) {
-    ...
-    489 }
-
-So instead of looking at 446, we should be looking at 431, which is the line
-that executes just before 446. Here we see that what we are looking for is
-``&tsk->signal->shared_pending``.
-
-Our next task is now to figure out which function that puts items on this
-``shared_pending`` list. A crude, but efficient tool, is ``git grep``::
-
-	$ git grep -n 'shared_pending' kernel/
-	...
-	kernel/signal.c:828:	pending = group ? &t->signal->shared_pending : &t->pending;
-	kernel/signal.c:1339:	pending = group ? &t->signal->shared_pending : &t->pending;
-	...
-
-There were more results, but none of them were related to list operations,
-and these were the only assignments. We inspect the line numbers more closely
-and find that this is indeed where items are being added to the list::
-
-    816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
-    817				int group)
-    818 {
-    ...
-    828		pending = group ? &t->signal->shared_pending : &t->pending;
-    ...
-    851		q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
-    852						     (is_si_special(info) ||
-    853						      info->si_code >= 0)));
-    854		if (q) {
-    855			list_add_tail(&q->list, &pending->list);
-    ...
-    890 }
-
-and::
-
-    1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
-    1310 {
-    ....
-    1339	 pending = group ? &t->signal->shared_pending : &t->pending;
-    1340	 list_add_tail(&q->list, &pending->list);
-    ....
-    1347 }
-
-In the first case, the list element we are looking for, ``q``, is being
-returned from the function ``__sigqueue_alloc()``, which looks like an
-allocation function.  Let's take a look at it::
-
-    187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
-    188						 int override_rlimit)
-    189 {
-    190		struct sigqueue *q = NULL;
-    191		struct user_struct *user;
-    192
-    193		/*
-    194		 * We won't get problems with the target's UID changing under us
-    195		 * because changing it requires RCU be used, and if t != current, the
-    196		 * caller must be holding the RCU readlock (by way of a spinlock) and
-    197		 * we use RCU protection here
-    198		 */
-    199		user = get_uid(__task_cred(t)->user);
-    200		atomic_inc(&user->sigpending);
-    201		if (override_rlimit ||
-    202		    atomic_read(&user->sigpending) <=
-    203				t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
-    204			q = kmem_cache_alloc(sigqueue_cachep, flags);
-    205		if (unlikely(q == NULL)) {
-    206			atomic_dec(&user->sigpending);
-    207			free_uid(user);
-    208		} else {
-    209			INIT_LIST_HEAD(&q->list);
-    210			q->flags = 0;
-    211			q->user = user;
-    212		}
-    213
-    214		return q;
-    215 }
-
-We see that this function initializes ``q->list``, ``q->flags``, and
-``q->user``. It seems that now is the time to look at the definition of
-``struct sigqueue``, e.g.::
-
-    14 struct sigqueue {
-    15	       struct list_head list;
-    16	       int flags;
-    17	       siginfo_t info;
-    18	       struct user_struct *user;
-    19 };
-
-And, you might remember, it was a ``memcpy()`` on ``&first->info`` that
-caused the warning, so this makes perfect sense. It also seems reasonable
-to assume that it is the caller of ``__sigqueue_alloc()`` that has the
-responsibility of filling out (initializing) this member.
-
-But just which fields of the struct were uninitialized? Let's look at
-kmemcheck's report again::
-
-    WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
-    80000000000000000000000000000000000000000088ffff0000000000000000
-     i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
-	     ^
-
-These first two lines are the memory dump of the memory object itself, and
-the shadow bytemap, respectively. The memory object itself is in this case
-``&first->info``. Just beware that the start of this dump is NOT the start
-of the object itself! The position of the caret (^) corresponds with the
-address of the read (ffff88003e4a2024).
-
-The shadow bytemap dump legend is as follows:
-
-- i: initialized
-- u: uninitialized
-- a: unallocated (memory has been allocated by the slab layer, but has not
-  yet been handed off to anybody)
-- f: freed (memory has been allocated by the slab layer, but has been freed
-  by the previous owner)
-
-In order to figure out where (relative to the start of the object) the
-uninitialized memory was located, we have to look at the disassembly. For
-that, we'll need the RIP address again::
-
-    RIP: 0010:[<ffffffff8104ede8>]  [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
-
-	$ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8:
-	ffffffff8104edc8:	mov    %r8,0x8(%r8)
-	ffffffff8104edcc:	test   %r10d,%r10d
-	ffffffff8104edcf:	js     ffffffff8104ee88 <__dequeue_signal+0x168>
-	ffffffff8104edd5:	mov    %rax,%rdx
-	ffffffff8104edd8:	mov    $0xc,%ecx
-	ffffffff8104eddd:	mov    %r13,%rdi
-	ffffffff8104ede0:	mov    $0x30,%eax
-	ffffffff8104ede5:	mov    %rdx,%rsi
-	ffffffff8104ede8:	rep movsl %ds:(%rsi),%es:(%rdi)
-	ffffffff8104edea:	test   $0x2,%al
-	ffffffff8104edec:	je     ffffffff8104edf0 <__dequeue_signal+0xd0>
-	ffffffff8104edee:	movsw  %ds:(%rsi),%es:(%rdi)
-	ffffffff8104edf0:	test   $0x1,%al
-	ffffffff8104edf2:	je     ffffffff8104edf5 <__dequeue_signal+0xd5>
-	ffffffff8104edf4:	movsb  %ds:(%rsi),%es:(%rdi)
-	ffffffff8104edf5:	mov    %r8,%rdi
-	ffffffff8104edf8:	callq  ffffffff8104de60 <__sigqueue_free>
-
-As expected, it's the "``rep movsl``" instruction from the ``memcpy()``
-that causes the warning. We know about ``REP MOVSL`` that it uses the register
-``RCX`` to count the number of remaining iterations. By taking a look at the
-register dump again (from the kmemcheck report), we can figure out how many
-bytes were left to copy::
-
-    RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
-
-By looking at the disassembly, we also see that ``%ecx`` is being loaded
-with the value ``$0xc`` just before (ffffffff8104edd8), so we are very
-lucky. Keep in mind that this is the number of iterations, not bytes. And
-since this is a "long" operation, we need to multiply by 4 to get the
-number of bytes. So this means that the uninitialized value was encountered
-at 4 * (0xc - 0x9) = 12 bytes from the start of the object.
-
-We can now try to figure out which field of the "``struct siginfo``" that
-was not initialized. This is the beginning of the struct::
-
-    40 typedef struct siginfo {
-    41	       int si_signo;
-    42	       int si_errno;
-    43	       int si_code;
-    44
-    45	       union {
-    ..
-    92	       } _sifields;
-    93 } siginfo_t;
-
-On 64-bit, the int is 4 bytes long, so it must the union member that has
-not been initialized. We can verify this using gdb::
-
-	$ gdb vmlinux
-	...
-	(gdb) p &((struct siginfo *) 0)->_sifields
-	$1 = (union {...} *) 0x10
-
-Actually, it seems that the union member is located at offset 0x10 -- which
-means that gcc has inserted 4 bytes of padding between the members ``si_code``
-and ``_sifields``. We can now get a fuller picture of the memory dump::
-
-		 _----------------------------=> si_code
-		/	 _--------------------=> (padding)
-	       |	/	 _------------=> _sifields(._kill._pid)
-	       |       |	/	 _----=> _sifields(._kill._uid)
-	       |       |       |	/
-	-------|-------|-------|-------|
-	80000000000000000000000000000000000000000088ffff0000000000000000
-	 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
-
-This allows us to realize another important fact: ``si_code`` contains the
-value 0x80. Remember that x86 is little endian, so the first 4 bytes
-"80000000" are really the number 0x00000080. With a bit of research, we
-find that this is actually the constant ``SI_KERNEL`` defined in
-``include/asm-generic/siginfo.h``::
-
-    144 #define SI_KERNEL	0x80		/* sent by the kernel from somewhere	 */
-
-This macro is used in exactly one place in the x86 kernel: In ``send_signal()``
-in ``kernel/signal.c``::
-
-    816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
-    817				int group)
-    818 {
-    ...
-    828		pending = group ? &t->signal->shared_pending : &t->pending;
-    ...
-    851		q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
-    852						     (is_si_special(info) ||
-    853						      info->si_code >= 0)));
-    854		if (q) {
-    855			list_add_tail(&q->list, &pending->list);
-    856			switch ((unsigned long) info) {
-    ...
-    865			case (unsigned long) SEND_SIG_PRIV:
-    866				q->info.si_signo = sig;
-    867				q->info.si_errno = 0;
-    868				q->info.si_code = SI_KERNEL;
-    869				q->info.si_pid = 0;
-    870				q->info.si_uid = 0;
-    871				break;
-    ...
-    890 }
-
-Not only does this match with the ``.si_code`` member, it also matches the place
-we found earlier when looking for where siginfo_t objects are enqueued on the
-``shared_pending`` list.
-
-So to sum up: It seems that it is the padding introduced by the compiler
-between two struct fields that is uninitialized, and this gets reported when
-we do a ``memcpy()`` on the struct. This means that we have identified a false
-positive warning.
-
-Normally, kmemcheck will not report uninitialized accesses in ``memcpy()`` calls
-when both the source and destination addresses are tracked. (Instead, we copy
-the shadow bytemap as well). In this case, the destination address clearly
-was not tracked. We can dig a little deeper into the stack trace from above::
-
-	arch/x86/kernel/signal.c:805
-	arch/x86/kernel/signal.c:871
-	arch/x86/kernel/entry_64.S:694
-
-And we clearly see that the destination siginfo object is located on the
-stack::
-
-    782 static void do_signal(struct pt_regs *regs)
-    783 {
-    784		struct k_sigaction ka;
-    785		siginfo_t info;
-    ...
-    804		signr = get_signal_to_deliver(&info, &ka, regs, NULL);
-    ...
-    854 }
-
-And this ``&info`` is what eventually gets passed to ``copy_siginfo()`` as the
-destination argument.
-
-Now, even though we didn't find an actual error here, the example is still a
-good one, because it shows how one would go about to find out what the report
-was all about.
-
-
-Annotating false positives
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-There are a few different ways to make annotations in the source code that
-will keep kmemcheck from checking and reporting certain allocations. Here
-they are:
-
-- ``__GFP_NOTRACK_FALSE_POSITIVE``
-	This flag can be passed to ``kmalloc()`` or ``kmem_cache_alloc()``
-	(therefore also to other functions that end up calling one of
-	these) to indicate that the allocation should not be tracked
-	because it would lead to a false positive report. This is a "big
-	hammer" way of silencing kmemcheck; after all, even if the false
-	positive pertains to particular field in a struct, for example, we
-	will now lose the ability to find (real) errors in other parts of
-	the same struct.
-
-	Example::
-
-	    /* No warnings will ever trigger on accessing any part of x */
-	    x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
-
-- ``kmemcheck_bitfield_begin(name)``/``kmemcheck_bitfield_end(name)`` and
-	``kmemcheck_annotate_bitfield(ptr, name)``
-	The first two of these three macros can be used inside struct
-	definitions to signal, respectively, the beginning and end of a
-	bitfield. Additionally, this will assign the bitfield a name, which
-	is given as an argument to the macros.
-
-	Having used these markers, one can later use
-	kmemcheck_annotate_bitfield() at the point of allocation, to indicate
-	which parts of the allocation is part of a bitfield.
-
-	Example::
-
-	    struct foo {
-		int x;
-
-		kmemcheck_bitfield_begin(flags);
-		int flag_a:1;
-		int flag_b:1;
-		kmemcheck_bitfield_end(flags);
-
-		int y;
-	    };
-
-	    struct foo *x = kmalloc(sizeof *x);
-
-	    /* No warnings will trigger on accessing the bitfield of x */
-	    kmemcheck_annotate_bitfield(x, flags);
-
-	Note that ``kmemcheck_annotate_bitfield()`` can be used even before the
-	return value of ``kmalloc()`` is checked -- in other words, passing NULL
-	as the first argument is legal (and will do nothing).
-
-
-Reporting errors
-----------------
-
-As we have seen, kmemcheck will produce false positive reports. Therefore, it
-is not very wise to blindly post kmemcheck warnings to mailing lists and
-maintainers. Instead, I encourage maintainers and developers to find errors
-in their own code. If you get a warning, you can try to work around it, try
-to figure out if it's a real error or not, or simply ignore it. Most
-developers know their own code and will quickly and efficiently determine the
-root cause of a kmemcheck report. This is therefore also the most efficient
-way to work with kmemcheck.
-
-That said, we (the kmemcheck maintainers) will always be on the lookout for
-false positives that we can annotate and silence. So whatever you find,
-please drop us a note privately! Kernel configs and steps to reproduce (if
-available) are of course a great help too.
-
-Happy hacking!
-
-
-Technical description
----------------------
-
-kmemcheck works by marking memory pages non-present. This means that whenever
-somebody attempts to access the page, a page fault is generated. The page
-fault handler notices that the page was in fact only hidden, and so it calls
-on the kmemcheck code to make further investigations.
-
-When the investigations are completed, kmemcheck "shows" the page by marking
-it present (as it would be under normal circumstances). This way, the
-interrupted code can continue as usual.
-
-But after the instruction has been executed, we should hide the page again, so
-that we can catch the next access too! Now kmemcheck makes use of a debugging
-feature of the processor, namely single-stepping. When the processor has
-finished the one instruction that generated the memory access, a debug
-exception is raised. From here, we simply hide the page again and continue
-execution, this time with the single-stepping feature turned off.
-
-kmemcheck requires some assistance from the memory allocator in order to work.
-The memory allocator needs to
-
-  1. Tell kmemcheck about newly allocated pages and pages that are about to
-     be freed. This allows kmemcheck to set up and tear down the shadow memory
-     for the pages in question. The shadow memory stores the status of each
-     byte in the allocation proper, e.g. whether it is initialized or
-     uninitialized.
-
-  2. Tell kmemcheck which parts of memory should be marked uninitialized.
-     There are actually a few more states, such as "not yet allocated" and
-     "recently freed".
-
-If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
-memory that can take page faults because of kmemcheck.
-
-If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
-request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags.
-This does not prevent the page faults from occurring, however, but marks the
-object in question as being initialized so that no warnings will ever be
-produced for this object.
-
-Currently, the SLAB and SLUB allocators are supported by kmemcheck.
diff --git a/MAINTAINERS b/MAINTAINERS
index 7e9c887ad951..ac814d3dd1c1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7688,16 +7688,6 @@ F:	include/linux/kdb.h
 F:	include/linux/kgdb.h
 F:	kernel/debug/
 
-KMEMCHECK
-M:	Vegard Nossum <vegardno@ifi.uio.no>
-M:	Pekka Enberg <penberg@kernel.org>
-S:	Maintained
-F:	Documentation/dev-tools/kmemcheck.rst
-F:	arch/x86/include/asm/kmemcheck.h
-F:	arch/x86/mm/kmemcheck/
-F:	include/linux/kmemcheck.h
-F:	mm/kmemcheck.c
-
 KMEMLEAK
 M:	Catalin Marinas <catalin.marinas@arm.com>
 S:	Maintained
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f08977d82ca0..cb678192da4a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -112,7 +112,6 @@ config X86
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KASAN			if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_ARCH_KGDB
-	select HAVE_ARCH_KMEMCHECK
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
 	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
@@ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config X86_DIRECT_GBPAGES
 	def_bool y
-	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+	depends on X86_64 && !DEBUG_PAGEALLOC
 	---help---
 	  Certain kernel features effectively disable kernel
 	  linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
index 945a0337fbcf..ea32a7d3cf1b 100644
--- a/arch/x86/include/asm/kmemcheck.h
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -1,43 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASM_X86_KMEMCHECK_H
-#define ASM_X86_KMEMCHECK_H
-
-#include <linux/types.h>
-#include <asm/ptrace.h>
-
-#ifdef CONFIG_KMEMCHECK
-bool kmemcheck_active(struct pt_regs *regs);
-
-void kmemcheck_show(struct pt_regs *regs);
-void kmemcheck_hide(struct pt_regs *regs);
-
-bool kmemcheck_fault(struct pt_regs *regs,
-	unsigned long address, unsigned long error_code);
-bool kmemcheck_trap(struct pt_regs *regs);
-#else
-static inline bool kmemcheck_active(struct pt_regs *regs)
-{
-	return false;
-}
-
-static inline void kmemcheck_show(struct pt_regs *regs)
-{
-}
-
-static inline void kmemcheck_hide(struct pt_regs *regs)
-{
-}
-
-static inline bool kmemcheck_fault(struct pt_regs *regs,
-	unsigned long address, unsigned long error_code)
-{
-	return false;
-}
-
-static inline bool kmemcheck_trap(struct pt_regs *regs)
-{
-	return false;
-}
-#endif /* CONFIG_KMEMCHECK */
-
-#endif
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 076502241eae..55d392c6bd29 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
  *	No 3D Now!
  */
 
-#ifndef CONFIG_KMEMCHECK
-
 #if (__GNUC__ >= 4)
 #define memcpy(t, f, n) __builtin_memcpy(t, f, n)
 #else
@@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
 	 ? __constant_memcpy((t), (f), (n))	\
 	 : __memcpy((t), (f), (n)))
 #endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(t, f, n) __memcpy((t), (f), (n))
-#endif
 
 #endif
 #endif /* !CONFIG_FORTIFY_SOURCE */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 0b1b4445f4c5..533f74c300c2 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len);
 extern void *__memcpy(void *to, const void *from, size_t len);
 
 #ifndef CONFIG_FORTIFY_SOURCE
-#ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
 #define memcpy(dst, src, len)					\
 ({								\
@@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 	__ret;							\
 })
 #endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
-#endif
 #endif /* !CONFIG_FORTIFY_SOURCE */
 
 #define __HAVE_ARCH_MEMSET
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b720dacac051..b1af22073e28 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	if (c->x86 == 6 && c->x86_model < 15)
 		clear_cpu_cap(c, X86_FEATURE_PAT);
 
-#ifdef CONFIG_KMEMCHECK
-	/*
-	 * P4s have a "fast strings" feature which causes single-
-	 * stepping REP instructions to only generate a #DB on
-	 * cache-line boundaries.
-	 *
-	 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
-	 * (model 2) with the same problem.
-	 */
-	if (c->x86 == 15)
-		if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
-				  MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
-			pr_info("kmemcheck: Disabling fast string operations\n");
-#endif
-
 	/*
 	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
 	 * clear the fast string and enhanced fast string CPU capabilities.
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 7ba7f3d7f477..8e13b8cc6bed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP)	+= debug_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
-obj-$(CONFIG_KMEMCHECK)		+= kmemcheck/
-
 KASAN_SANITIZE_kasan_init_$(BITS).o := n
 obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ef94620ceb8a..6fdf91ef130a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -163,12 +163,11 @@ static int page_size_mask;
 static void __init probe_page_size_mask(void)
 {
 	/*
-	 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
-	 * use small pages.
+	 * For pagealloc debugging, identity mapping will use small pages.
 	 * This will simplify cpa(), which otherwise needs to support splitting
 	 * large pages into small in interrupt context, etc.
 	 */
-	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
+	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
 		page_size_mask |= 1 << PG_LEVEL_2M;
 	else
 		direct_gbpages = 0;
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
deleted file mode 100644
index 520b3bce4095..000000000000
--- a/arch/x86/mm/kmemcheck/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 872ec4159a68..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -1,228 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/interrupt.h>
-#include <linux/kdebug.h>
-#include <linux/kmemcheck.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-
-#include "error.h"
-#include "shadow.h"
-
-enum kmemcheck_error_type {
-	KMEMCHECK_ERROR_INVALID_ACCESS,
-	KMEMCHECK_ERROR_BUG,
-};
-
-#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
-
-struct kmemcheck_error {
-	enum kmemcheck_error_type type;
-
-	union {
-		/* KMEMCHECK_ERROR_INVALID_ACCESS */
-		struct {
-			/* Kind of access that caused the error */
-			enum kmemcheck_shadow state;
-			/* Address and size of the erroneous read */
-			unsigned long	address;
-			unsigned int	size;
-		};
-	};
-
-	struct pt_regs		regs;
-	struct stack_trace	trace;
-	unsigned long		trace_entries[32];
-
-	/* We compress it to a char. */
-	unsigned char		shadow_copy[SHADOW_COPY_SIZE];
-	unsigned char		memory_copy[SHADOW_COPY_SIZE];
-};
-
-/*
- * Create a ring queue of errors to output. We can't call printk() directly
- * from the kmemcheck traps, since this may call the console drivers and
- * result in a recursive fault.
- */
-static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
-static unsigned int error_count;
-static unsigned int error_rd;
-static unsigned int error_wr;
-static unsigned int error_missed_count;
-
-static struct kmemcheck_error *error_next_wr(void)
-{
-	struct kmemcheck_error *e;
-
-	if (error_count == ARRAY_SIZE(error_fifo)) {
-		++error_missed_count;
-		return NULL;
-	}
-
-	e = &error_fifo[error_wr];
-	if (++error_wr == ARRAY_SIZE(error_fifo))
-		error_wr = 0;
-	++error_count;
-	return e;
-}
-
-static struct kmemcheck_error *error_next_rd(void)
-{
-	struct kmemcheck_error *e;
-
-	if (error_count == 0)
-		return NULL;
-
-	e = &error_fifo[error_rd];
-	if (++error_rd == ARRAY_SIZE(error_fifo))
-		error_rd = 0;
-	--error_count;
-	return e;
-}
-
-void kmemcheck_error_recall(void)
-{
-	static const char *desc[] = {
-		[KMEMCHECK_SHADOW_UNALLOCATED]		= "unallocated",
-		[KMEMCHECK_SHADOW_UNINITIALIZED]	= "uninitialized",
-		[KMEMCHECK_SHADOW_INITIALIZED]		= "initialized",
-		[KMEMCHECK_SHADOW_FREED]		= "freed",
-	};
-
-	static const char short_desc[] = {
-		[KMEMCHECK_SHADOW_UNALLOCATED]		= 'a',
-		[KMEMCHECK_SHADOW_UNINITIALIZED]	= 'u',
-		[KMEMCHECK_SHADOW_INITIALIZED]		= 'i',
-		[KMEMCHECK_SHADOW_FREED]		= 'f',
-	};
-
-	struct kmemcheck_error *e;
-	unsigned int i;
-
-	e = error_next_rd();
-	if (!e)
-		return;
-
-	switch (e->type) {
-	case KMEMCHECK_ERROR_INVALID_ACCESS:
-		printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
-			8 * e->size, e->state < ARRAY_SIZE(desc) ?
-				desc[e->state] : "(invalid shadow state)",
-			(void *) e->address);
-
-		printk(KERN_WARNING);
-		for (i = 0; i < SHADOW_COPY_SIZE; ++i)
-			printk(KERN_CONT "%02x", e->memory_copy[i]);
-		printk(KERN_CONT "\n");
-
-		printk(KERN_WARNING);
-		for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
-			if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
-				printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
-			else
-				printk(KERN_CONT " ?");
-		}
-		printk(KERN_CONT "\n");
-		printk(KERN_WARNING "%*c\n", 2 + 2
-			* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
-		break;
-	case KMEMCHECK_ERROR_BUG:
-		printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
-		break;
-	}
-
-	__show_regs(&e->regs, 1);
-	print_stack_trace(&e->trace, 0);
-}
-
-static void do_wakeup(unsigned long data)
-{
-	while (error_count > 0)
-		kmemcheck_error_recall();
-
-	if (error_missed_count > 0) {
-		printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
-			"the queue was too small\n", error_missed_count);
-		error_missed_count = 0;
-	}
-}
-
-static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
-
-/*
- * Save the context of an error report.
- */
-void kmemcheck_error_save(enum kmemcheck_shadow state,
-	unsigned long address, unsigned int size, struct pt_regs *regs)
-{
-	static unsigned long prev_ip;
-
-	struct kmemcheck_error *e;
-	void *shadow_copy;
-	void *memory_copy;
-
-	/* Don't report several adjacent errors from the same EIP. */
-	if (regs->ip == prev_ip)
-		return;
-	prev_ip = regs->ip;
-
-	e = error_next_wr();
-	if (!e)
-		return;
-
-	e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
-
-	e->state = state;
-	e->address = address;
-	e->size = size;
-
-	/* Save regs */
-	memcpy(&e->regs, regs, sizeof(*regs));
-
-	/* Save stack trace */
-	e->trace.nr_entries = 0;
-	e->trace.entries = e->trace_entries;
-	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
-	e->trace.skip = 0;
-	save_stack_trace_regs(regs, &e->trace);
-
-	/* Round address down to nearest 16 bytes */
-	shadow_copy = kmemcheck_shadow_lookup(address
-		& ~(SHADOW_COPY_SIZE - 1));
-	BUG_ON(!shadow_copy);
-
-	memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
-
-	kmemcheck_show_addr(address);
-	memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
-	memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
-	kmemcheck_hide_addr(address);
-
-	tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
-
-/*
- * Save the context of a kmemcheck bug.
- */
-void kmemcheck_error_save_bug(struct pt_regs *regs)
-{
-	struct kmemcheck_error *e;
-
-	e = error_next_wr();
-	if (!e)
-		return;
-
-	e->type = KMEMCHECK_ERROR_BUG;
-
-	memcpy(&e->regs, regs, sizeof(*regs));
-
-	e->trace.nr_entries = 0;
-	e->trace.entries = e->trace_entries;
-	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
-	e->trace.skip = 1;
-	save_stack_trace(&e->trace);
-
-	tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
index 39f80d7a874d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/error.h
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -1,16 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
-#define ARCH__X86__MM__KMEMCHECK__ERROR_H
-
-#include <linux/ptrace.h>
-
-#include "shadow.h"
-
-void kmemcheck_error_save(enum kmemcheck_shadow state,
-	unsigned long address, unsigned int size, struct pt_regs *regs);
-
-void kmemcheck_error_save_bug(struct pt_regs *regs);
-
-void kmemcheck_error_recall(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
deleted file mode 100644
index 4515bae36bbe..000000000000
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/**
- * kmemcheck - a heavyweight memory checker for the linux kernel
- * Copyright (C) 2007, 2008  Vegard Nossum <vegardno@ifi.uio.no>
- * (With a lot of help from Ingo Molnar and Pekka Enberg.)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License (version 2) as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <asm/cacheflush.h>
-#include <asm/kmemcheck.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-#include "error.h"
-#include "opcode.h"
-#include "pte.h"
-#include "selftest.h"
-#include "shadow.h"
-
-
-#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 0
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 1
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 2
-#endif
-
-int kmemcheck_enabled = KMEMCHECK_ENABLED;
-
-int __init kmemcheck_init(void)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * Limit SMP to use a single CPU. We rely on the fact that this code
-	 * runs before SMP is set up.
-	 */
-	if (setup_max_cpus > 1) {
-		printk(KERN_INFO
-			"kmemcheck: Limiting number of CPUs to 1.\n");
-		setup_max_cpus = 1;
-	}
-#endif
-
-	if (!kmemcheck_selftest()) {
-		printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
-		kmemcheck_enabled = 0;
-		return -EINVAL;
-	}
-
-	printk(KERN_INFO "kmemcheck: Initialized\n");
-	return 0;
-}
-
-early_initcall(kmemcheck_init);
-
-/*
- * We need to parse the kmemcheck= option before any memory is allocated.
- */
-static int __init param_kmemcheck(char *str)
-{
-	int val;
-	int ret;
-
-	if (!str)
-		return -EINVAL;
-
-	ret = kstrtoint(str, 0, &val);
-	if (ret)
-		return ret;
-	kmemcheck_enabled = val;
-	return 0;
-}
-
-early_param("kmemcheck", param_kmemcheck);
-
-int kmemcheck_show_addr(unsigned long address)
-{
-	pte_t *pte;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return 0;
-
-	set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
-	__flush_tlb_one(address);
-	return 1;
-}
-
-int kmemcheck_hide_addr(unsigned long address)
-{
-	pte_t *pte;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return 0;
-
-	set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
-	__flush_tlb_one(address);
-	return 1;
-}
-
-struct kmemcheck_context {
-	bool busy;
-	int balance;
-
-	/*
-	 * There can be at most two memory operands to an instruction, but
-	 * each address can cross a page boundary -- so we may need up to
-	 * four addresses that must be hidden/revealed for each fault.
-	 */
-	unsigned long addr[4];
-	unsigned long n_addrs;
-	unsigned long flags;
-
-	/* Data size of the instruction that caused a fault. */
-	unsigned int size;
-};
-
-static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
-
-bool kmemcheck_active(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	return data->balance > 0;
-}
-
-/* Save an address that needs to be shown/hidden */
-static void kmemcheck_save_addr(unsigned long addr)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
-	data->addr[data->n_addrs++] = addr;
-}
-
-static unsigned int kmemcheck_show_all(void)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	unsigned int i;
-	unsigned int n;
-
-	n = 0;
-	for (i = 0; i < data->n_addrs; ++i)
-		n += kmemcheck_show_addr(data->addr[i]);
-
-	return n;
-}
-
-static unsigned int kmemcheck_hide_all(void)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	unsigned int i;
-	unsigned int n;
-
-	n = 0;
-	for (i = 0; i < data->n_addrs; ++i)
-		n += kmemcheck_hide_addr(data->addr[i]);
-
-	return n;
-}
-
-/*
- * Called from the #PF handler.
- */
-void kmemcheck_show(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	BUG_ON(!irqs_disabled());
-
-	if (unlikely(data->balance != 0)) {
-		kmemcheck_show_all();
-		kmemcheck_error_save_bug(regs);
-		data->balance = 0;
-		return;
-	}
-
-	/*
-	 * None of the addresses actually belonged to kmemcheck. Note that
-	 * this is not an error.
-	 */
-	if (kmemcheck_show_all() == 0)
-		return;
-
-	++data->balance;
-
-	/*
-	 * The IF needs to be cleared as well, so that the faulting
-	 * instruction can run "uninterrupted". Otherwise, we might take
-	 * an interrupt and start executing that before we've had a chance
-	 * to hide the page again.
-	 *
-	 * NOTE: In the rare case of multiple faults, we must not override
-	 * the original flags:
-	 */
-	if (!(regs->flags & X86_EFLAGS_TF))
-		data->flags = regs->flags;
-
-	regs->flags |= X86_EFLAGS_TF;
-	regs->flags &= ~X86_EFLAGS_IF;
-}
-
-/*
- * Called from the #DB handler.
- */
-void kmemcheck_hide(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	int n;
-
-	BUG_ON(!irqs_disabled());
-
-	if (unlikely(data->balance != 1)) {
-		kmemcheck_show_all();
-		kmemcheck_error_save_bug(regs);
-		data->n_addrs = 0;
-		data->balance = 0;
-
-		if (!(data->flags & X86_EFLAGS_TF))
-			regs->flags &= ~X86_EFLAGS_TF;
-		if (data->flags & X86_EFLAGS_IF)
-			regs->flags |= X86_EFLAGS_IF;
-		return;
-	}
-
-	if (kmemcheck_enabled)
-		n = kmemcheck_hide_all();
-	else
-		n = kmemcheck_show_all();
-
-	if (n == 0)
-		return;
-
-	--data->balance;
-
-	data->n_addrs = 0;
-
-	if (!(data->flags & X86_EFLAGS_TF))
-		regs->flags &= ~X86_EFLAGS_TF;
-	if (data->flags & X86_EFLAGS_IF)
-		regs->flags |= X86_EFLAGS_IF;
-}
-
-void kmemcheck_show_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i) {
-		unsigned long address;
-		pte_t *pte;
-		unsigned int level;
-
-		address = (unsigned long) page_address(&p[i]);
-		pte = lookup_address(address, &level);
-		BUG_ON(!pte);
-		BUG_ON(level != PG_LEVEL_4K);
-
-		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
-		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
-		__flush_tlb_one(address);
-	}
-}
-
-bool kmemcheck_page_is_tracked(struct page *p)
-{
-	/* This will also check the "hidden" flag of the PTE. */
-	return kmemcheck_pte_lookup((unsigned long) page_address(p));
-}
-
-void kmemcheck_hide_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i) {
-		unsigned long address;
-		pte_t *pte;
-		unsigned int level;
-
-		address = (unsigned long) page_address(&p[i]);
-		pte = lookup_address(address, &level);
-		BUG_ON(!pte);
-		BUG_ON(level != PG_LEVEL_4K);
-
-		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
-		set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
-		__flush_tlb_one(address);
-	}
-}
-
-/* Access may NOT cross page boundary */
-static void kmemcheck_read_strict(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	void *shadow;
-	enum kmemcheck_shadow status;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return;
-
-	kmemcheck_save_addr(addr);
-	status = kmemcheck_shadow_test(shadow, size);
-	if (status == KMEMCHECK_SHADOW_INITIALIZED)
-		return;
-
-	if (kmemcheck_enabled)
-		kmemcheck_error_save(status, addr, size, regs);
-
-	if (kmemcheck_enabled == 2)
-		kmemcheck_enabled = 0;
-
-	/* Don't warn about it again. */
-	kmemcheck_shadow_set(shadow, size);
-}
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
-	enum kmemcheck_shadow status;
-	void *shadow;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return true;
-
-	status = kmemcheck_shadow_test_all(shadow, size);
-
-	return status == KMEMCHECK_SHADOW_INITIALIZED;
-}
-
-/* Access may cross page boundary */
-static void kmemcheck_read(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long next_addr = addr + size - 1;
-	unsigned long next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		kmemcheck_read_strict(regs, addr, size);
-		return;
-	}
-
-	/*
-	 * What we do is basically to split the access across the
-	 * two pages and handle each part separately. Yes, this means
-	 * that we may now see reads that are 3 + 5 bytes, for
-	 * example (and if both are uninitialized, there will be two
-	 * reports), but it makes the code a lot simpler.
-	 */
-	kmemcheck_read_strict(regs, addr, next_page - addr);
-	kmemcheck_read_strict(regs, next_page, next_addr - next_page);
-}
-
-static void kmemcheck_write_strict(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	void *shadow;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return;
-
-	kmemcheck_save_addr(addr);
-	kmemcheck_shadow_set(shadow, size);
-}
-
-static void kmemcheck_write(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long next_addr = addr + size - 1;
-	unsigned long next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		kmemcheck_write_strict(regs, addr, size);
-		return;
-	}
-
-	/* See comment in kmemcheck_read(). */
-	kmemcheck_write_strict(regs, addr, next_page - addr);
-	kmemcheck_write_strict(regs, next_page, next_addr - next_page);
-}
-
-/*
- * Copying is hard. We have two addresses, each of which may be split across
- * a page (and each page will have different shadow addresses).
- */
-static void kmemcheck_copy(struct pt_regs *regs,
-	unsigned long src_addr, unsigned long dst_addr, unsigned int size)
-{
-	uint8_t shadow[8];
-	enum kmemcheck_shadow status;
-
-	unsigned long page;
-	unsigned long next_addr;
-	unsigned long next_page;
-
-	uint8_t *x;
-	unsigned int i;
-	unsigned int n;
-
-	BUG_ON(size > sizeof(shadow));
-
-	page = src_addr & PAGE_MASK;
-	next_addr = src_addr + size - 1;
-	next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		/* Same page */
-		x = kmemcheck_shadow_lookup(src_addr);
-		if (x) {
-			kmemcheck_save_addr(src_addr);
-			for (i = 0; i < size; ++i)
-				shadow[i] = x[i];
-		} else {
-			for (i = 0; i < size; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-	} else {
-		n = next_page - src_addr;
-		BUG_ON(n > sizeof(shadow));
-
-		/* First page */
-		x = kmemcheck_shadow_lookup(src_addr);
-		if (x) {
-			kmemcheck_save_addr(src_addr);
-			for (i = 0; i < n; ++i)
-				shadow[i] = x[i];
-		} else {
-			/* Not tracked */
-			for (i = 0; i < n; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-
-		/* Second page */
-		x = kmemcheck_shadow_lookup(next_page);
-		if (x) {
-			kmemcheck_save_addr(next_page);
-			for (i = n; i < size; ++i)
-				shadow[i] = x[i - n];
-		} else {
-			/* Not tracked */
-			for (i = n; i < size; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-	}
-
-	page = dst_addr & PAGE_MASK;
-	next_addr = dst_addr + size - 1;
-	next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		/* Same page */
-		x = kmemcheck_shadow_lookup(dst_addr);
-		if (x) {
-			kmemcheck_save_addr(dst_addr);
-			for (i = 0; i < size; ++i) {
-				x[i] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-	} else {
-		n = next_page - dst_addr;
-		BUG_ON(n > sizeof(shadow));
-
-		/* First page */
-		x = kmemcheck_shadow_lookup(dst_addr);
-		if (x) {
-			kmemcheck_save_addr(dst_addr);
-			for (i = 0; i < n; ++i) {
-				x[i] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-
-		/* Second page */
-		x = kmemcheck_shadow_lookup(next_page);
-		if (x) {
-			kmemcheck_save_addr(next_page);
-			for (i = n; i < size; ++i) {
-				x[i - n] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-	}
-
-	status = kmemcheck_shadow_test(shadow, size);
-	if (status == KMEMCHECK_SHADOW_INITIALIZED)
-		return;
-
-	if (kmemcheck_enabled)
-		kmemcheck_error_save(status, src_addr, size, regs);
-
-	if (kmemcheck_enabled == 2)
-		kmemcheck_enabled = 0;
-}
-
-enum kmemcheck_method {
-	KMEMCHECK_READ,
-	KMEMCHECK_WRITE,
-};
-
-static void kmemcheck_access(struct pt_regs *regs,
-	unsigned long fallback_address, enum kmemcheck_method fallback_method)
-{
-	const uint8_t *insn;
-	const uint8_t *insn_primary;
-	unsigned int size;
-
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	/* Recursive fault -- ouch. */
-	if (data->busy) {
-		kmemcheck_show_addr(fallback_address);
-		kmemcheck_error_save_bug(regs);
-		return;
-	}
-
-	data->busy = true;
-
-	insn = (const uint8_t *) regs->ip;
-	insn_primary = kmemcheck_opcode_get_primary(insn);
-
-	kmemcheck_opcode_decode(insn, &size);
-
-	switch (insn_primary[0]) {
-#ifdef CONFIG_KMEMCHECK_BITOPS_OK
-		/* AND, OR, XOR */
-		/*
-		 * Unfortunately, these instructions have to be excluded from
-		 * our regular checking since they access only some (and not
-		 * all) bits. This clears out "bogus" bitfield-access warnings.
-		 */
-	case 0x80:
-	case 0x81:
-	case 0x82:
-	case 0x83:
-		switch ((insn_primary[1] >> 3) & 7) {
-			/* OR */
-		case 1:
-			/* AND */
-		case 4:
-			/* XOR */
-		case 6:
-			kmemcheck_write(regs, fallback_address, size);
-			goto out;
-
-			/* ADD */
-		case 0:
-			/* ADC */
-		case 2:
-			/* SBB */
-		case 3:
-			/* SUB */
-		case 5:
-			/* CMP */
-		case 7:
-			break;
-		}
-		break;
-#endif
-
-		/* MOVS, MOVSB, MOVSW, MOVSD */
-	case 0xa4:
-	case 0xa5:
-		/*
-		 * These instructions are special because they take two
-		 * addresses, but we only get one page fault.
-		 */
-		kmemcheck_copy(regs, regs->si, regs->di, size);
-		goto out;
-
-		/* CMPS, CMPSB, CMPSW, CMPSD */
-	case 0xa6:
-	case 0xa7:
-		kmemcheck_read(regs, regs->si, size);
-		kmemcheck_read(regs, regs->di, size);
-		goto out;
-	}
-
-	/*
-	 * If the opcode isn't special in any way, we use the data from the
-	 * page fault handler to determine the address and type of memory
-	 * access.
-	 */
-	switch (fallback_method) {
-	case KMEMCHECK_READ:
-		kmemcheck_read(regs, fallback_address, size);
-		goto out;
-	case KMEMCHECK_WRITE:
-		kmemcheck_write(regs, fallback_address, size);
-		goto out;
-	}
-
-out:
-	data->busy = false;
-}
-
-bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
-	unsigned long error_code)
-{
-	pte_t *pte;
-
-	/*
-	 * XXX: Is it safe to assume that memory accesses from virtual 86
-	 * mode or non-kernel code segments will _never_ access kernel
-	 * memory (e.g. tracked pages)? For now, we need this to avoid
-	 * invoking kmemcheck for PnP BIOS calls.
-	 */
-	if (regs->flags & X86_VM_MASK)
-		return false;
-	if (regs->cs != __KERNEL_CS)
-		return false;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return false;
-
-	WARN_ON_ONCE(in_nmi());
-
-	if (error_code & 2)
-		kmemcheck_access(regs, address, KMEMCHECK_WRITE);
-	else
-		kmemcheck_access(regs, address, KMEMCHECK_READ);
-
-	kmemcheck_show(regs);
-	return true;
-}
-
-bool kmemcheck_trap(struct pt_regs *regs)
-{
-	if (!kmemcheck_active(regs))
-		return false;
-
-	/* We're done. */
-	kmemcheck_hide(regs);
-	return true;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index df8109ddf7fe..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -1,107 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/types.h>
-
-#include "opcode.h"
-
-static bool opcode_is_prefix(uint8_t b)
-{
-	return
-		/* Group 1 */
-		b == 0xf0 || b == 0xf2 || b == 0xf3
-		/* Group 2 */
-		|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
-		|| b == 0x64 || b == 0x65
-		/* Group 3 */
-		|| b == 0x66
-		/* Group 4 */
-		|| b == 0x67;
-}
-
-#ifdef CONFIG_X86_64
-static bool opcode_is_rex_prefix(uint8_t b)
-{
-	return (b & 0xf0) == 0x40;
-}
-#else
-static bool opcode_is_rex_prefix(uint8_t b)
-{
-	return false;
-}
-#endif
-
-#define REX_W (1 << 3)
-
-/*
- * This is a VERY crude opcode decoder. We only need to find the size of the
- * load/store that caused our #PF and this should work for all the opcodes
- * that we care about. Moreover, the ones who invented this instruction set
- * should be shot.
- */
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
-{
-	/* Default operand size */
-	int operand_size_override = 4;
-
-	/* prefixes */
-	for (; opcode_is_prefix(*op); ++op) {
-		if (*op == 0x66)
-			operand_size_override = 2;
-	}
-
-	/* REX prefix */
-	if (opcode_is_rex_prefix(*op)) {
-		uint8_t rex = *op;
-
-		++op;
-		if (rex & REX_W) {
-			switch (*op) {
-			case 0x63:
-				*size = 4;
-				return;
-			case 0x0f:
-				++op;
-
-				switch (*op) {
-				case 0xb6:
-				case 0xbe:
-					*size = 1;
-					return;
-				case 0xb7:
-				case 0xbf:
-					*size = 2;
-					return;
-				}
-
-				break;
-			}
-
-			*size = 8;
-			return;
-		}
-	}
-
-	/* escape opcode */
-	if (*op == 0x0f) {
-		++op;
-
-		/*
-		 * This is move with zero-extend and sign-extend, respectively;
-		 * we don't have to think about 0xb6/0xbe, because this is
-		 * already handled in the conditional below.
-		 */
-		if (*op == 0xb7 || *op == 0xbf)
-			operand_size_override = 2;
-	}
-
-	*size = (*op & 1) ? operand_size_override : 1;
-}
-
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
-{
-	/* skip prefixes */
-	while (opcode_is_prefix(*op))
-		++op;
-	if (opcode_is_rex_prefix(*op))
-		++op;
-	return op;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
index 51a1ce94c24a..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -1,10 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
-#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
-
-#include <linux/types.h>
-
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
index 8a03be90272a..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/pte.c
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -1,23 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-#include "pte.h"
-
-pte_t *kmemcheck_pte_lookup(unsigned long address)
-{
-	pte_t *pte;
-	unsigned int level;
-
-	pte = lookup_address(address, &level);
-	if (!pte)
-		return NULL;
-	if (level != PG_LEVEL_4K)
-		return NULL;
-	if (!pte_hidden(*pte))
-		return NULL;
-
-	return pte;
-}
-
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
index b595612382c2..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/pte.h
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -1,11 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
-#define ARCH__X86__MM__KMEMCHECK__PTE_H
-
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-pte_t *kmemcheck_pte_lookup(unsigned long address);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 7ce0be1f99eb..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,71 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/bug.h>
-#include <linux/kernel.h>
-
-#include "opcode.h"
-#include "selftest.h"
-
-struct selftest_opcode {
-	unsigned int expected_size;
-	const uint8_t *insn;
-	const char *desc;
-};
-
-static const struct selftest_opcode selftest_opcodes[] = {
-	/* REP MOVS */
-	{1, "\xf3\xa4", 		"rep movsb <mem8>, <mem8>"},
-	{4, "\xf3\xa5",			"rep movsl <mem32>, <mem32>"},
-
-	/* MOVZX / MOVZXD */
-	{1, "\x66\x0f\xb6\x51\xf8",	"movzwq <mem8>, <reg16>"},
-	{1, "\x0f\xb6\x51\xf8",		"movzwq <mem8>, <reg32>"},
-
-	/* MOVSX / MOVSXD */
-	{1, "\x66\x0f\xbe\x51\xf8",	"movswq <mem8>, <reg16>"},
-	{1, "\x0f\xbe\x51\xf8",		"movswq <mem8>, <reg32>"},
-
-#ifdef CONFIG_X86_64
-	/* MOVZX / MOVZXD */
-	{1, "\x49\x0f\xb6\x51\xf8",	"movzbq <mem8>, <reg64>"},
-	{2, "\x49\x0f\xb7\x51\xf8",	"movzbq <mem16>, <reg64>"},
-
-	/* MOVSX / MOVSXD */
-	{1, "\x49\x0f\xbe\x51\xf8",	"movsbq <mem8>, <reg64>"},
-	{2, "\x49\x0f\xbf\x51\xf8",	"movsbq <mem16>, <reg64>"},
-	{4, "\x49\x63\x51\xf8",		"movslq <mem32>, <reg64>"},
-#endif
-};
-
-static bool selftest_opcode_one(const struct selftest_opcode *op)
-{
-	unsigned size;
-
-	kmemcheck_opcode_decode(op->insn, &size);
-
-	if (size == op->expected_size)
-		return true;
-
-	printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
-		op->desc, op->expected_size, size);
-	return false;
-}
-
-static bool selftest_opcodes_all(void)
-{
-	bool pass = true;
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
-		pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
-
-	return pass;
-}
-
-bool kmemcheck_selftest(void)
-{
-	bool pass = true;
-
-	pass = pass && selftest_opcodes_all();
-
-	return pass;
-}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
index 8d759aae453d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -1,7 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-
-bool kmemcheck_selftest(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
deleted file mode 100644
index c2638a7d2c10..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <linux/kmemcheck.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#include "pte.h"
-#include "shadow.h"
-
-/*
- * Return the shadow address for the given address. Returns NULL if the
- * address is not tracked.
- *
- * We need to be extremely careful not to follow any invalid pointers,
- * because this function can be called for *any* possible address.
- */
-void *kmemcheck_shadow_lookup(unsigned long address)
-{
-	pte_t *pte;
-	struct page *page;
-
-	if (!virt_addr_valid(address))
-		return NULL;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return NULL;
-
-	page = virt_to_page(address);
-	if (!page->shadow)
-		return NULL;
-	return page->shadow + (address & (PAGE_SIZE - 1));
-}
-
-static void mark_shadow(void *address, unsigned int n,
-	enum kmemcheck_shadow status)
-{
-	unsigned long addr = (unsigned long) address;
-	unsigned long last_addr = addr + n - 1;
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long last_page = last_addr & PAGE_MASK;
-	unsigned int first_n;
-	void *shadow;
-
-	/* If the memory range crosses a page boundary, stop there. */
-	if (page == last_page)
-		first_n = n;
-	else
-		first_n = page + PAGE_SIZE - addr;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (shadow)
-		memset(shadow, status, first_n);
-
-	addr += first_n;
-	n -= first_n;
-
-	/* Do full-page memset()s. */
-	while (n >= PAGE_SIZE) {
-		shadow = kmemcheck_shadow_lookup(addr);
-		if (shadow)
-			memset(shadow, status, PAGE_SIZE);
-
-		addr += PAGE_SIZE;
-		n -= PAGE_SIZE;
-	}
-
-	/* Do the remaining page, if any. */
-	if (n > 0) {
-		shadow = kmemcheck_shadow_lookup(addr);
-		if (shadow)
-			memset(shadow, status, n);
-	}
-}
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
-}
-
-void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
-}
-
-/*
- * Fill the shadow memory of the given address such that the memory at that
- * address is marked as being initialized.
- */
-void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
-}
-EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
-
-void kmemcheck_mark_freed(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
-}
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
-{
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-
-	/*
-	 * Make sure _some_ bytes are initialized. Gcc frequently generates
-	 * code to access neighboring bytes.
-	 */
-	for (i = 0; i < size; ++i) {
-		if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
-			return x[i];
-	}
-
-	return x[0];
-#else
-	return kmemcheck_shadow_test_all(shadow, size);
-#endif
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
-{
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-
-	/* All bytes must be initialized. */
-	for (i = 0; i < size; ++i) {
-		if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
-			return x[i];
-	}
-
-	return x[0];
-}
-
-void kmemcheck_shadow_set(void *shadow, unsigned int size)
-{
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-	for (i = 0; i < size; ++i)
-		x[i] = KMEMCHECK_SHADOW_INITIALIZED;
-}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index 49768dc18664..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -1,19 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
-#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
-
-enum kmemcheck_shadow {
-	KMEMCHECK_SHADOW_UNALLOCATED,
-	KMEMCHECK_SHADOW_UNINITIALIZED,
-	KMEMCHECK_SHADOW_INITIALIZED,
-	KMEMCHECK_SHADOW_FREED,
-};
-
-void *kmemcheck_shadow_lookup(unsigned long address);
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
-						unsigned int size);
-void kmemcheck_shadow_set(void *shadow, unsigned int size);
-
-#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index baeb872283d9..69c238210325 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -594,21 +594,6 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t)
 		__tasklet_hi_schedule(t);
 }
 
-extern void __tasklet_hi_schedule_first(struct tasklet_struct *t);
-
-/*
- * This version avoids touching any other tasklets. Needed for kmemcheck
- * in order not to take any page faults while enqueueing this tasklet;
- * consider VERY carefully whether you really need this or
- * tasklet_hi_schedule()...
- */
-static inline void tasklet_hi_schedule_first(struct tasklet_struct *t)
-{
-	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
-		__tasklet_hi_schedule_first(t);
-}
-
-
 static inline void tasklet_disable_nosync(struct tasklet_struct *t)
 {
 	atomic_inc(&t->count);
diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
index 7b1d7bead7d9..ea32a7d3cf1b 100644
--- a/include/linux/kmemcheck.h
+++ b/include/linux/kmemcheck.h
@@ -1,172 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_KMEMCHECK_H
-#define LINUX_KMEMCHECK_H
-
-#include <linux/mm_types.h>
-#include <linux/types.h>
-
-#ifdef CONFIG_KMEMCHECK
-extern int kmemcheck_enabled;
-
-/* The slab-related functions. */
-void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node);
-void kmemcheck_free_shadow(struct page *page, int order);
-void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
-			  size_t size);
-void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size);
-
-void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order,
-			       gfp_t gfpflags);
-
-void kmemcheck_show_pages(struct page *p, unsigned int n);
-void kmemcheck_hide_pages(struct page *p, unsigned int n);
-
-bool kmemcheck_page_is_tracked(struct page *p);
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n);
-void kmemcheck_mark_uninitialized(void *address, unsigned int n);
-void kmemcheck_mark_initialized(void *address, unsigned int n);
-void kmemcheck_mark_freed(void *address, unsigned int n);
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n);
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n);
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n);
-
-int kmemcheck_show_addr(unsigned long address);
-int kmemcheck_hide_addr(unsigned long address);
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size);
-
-/*
- * Bitfield annotations
- *
- * How to use: If you have a struct using bitfields, for example
- *
- *     struct a {
- *             int x:8, y:8;
- *     };
- *
- * then this should be rewritten as
- *
- *     struct a {
- *             kmemcheck_bitfield_begin(flags);
- *             int x:8, y:8;
- *             kmemcheck_bitfield_end(flags);
- *     };
- *
- * Now the "flags_begin" and "flags_end" members may be used to refer to the
- * beginning and end, respectively, of the bitfield (and things like
- * &x.flags_begin is allowed). As soon as the struct is allocated, the bit-
- * fields should be annotated:
- *
- *     struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL);
- *     kmemcheck_annotate_bitfield(a, flags);
- */
-#define kmemcheck_bitfield_begin(name)	\
-	int name##_begin[0];
-
-#define kmemcheck_bitfield_end(name)	\
-	int name##_end[0];
-
-#define kmemcheck_annotate_bitfield(ptr, name)				\
-	do {								\
-		int _n;							\
-									\
-		if (!ptr)						\
-			break;						\
-									\
-		_n = (long) &((ptr)->name##_end)			\
-			- (long) &((ptr)->name##_begin);		\
-		BUILD_BUG_ON(_n < 0);					\
-									\
-		kmemcheck_mark_initialized(&((ptr)->name##_begin), _n);	\
-	} while (0)
-
-#define kmemcheck_annotate_variable(var)				\
-	do {								\
-		kmemcheck_mark_initialized(&(var), sizeof(var));	\
-	} while (0)							\
-
-#else
-#define kmemcheck_enabled 0
-
-static inline void
-kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
-{
-}
-
-static inline void
-kmemcheck_free_shadow(struct page *page, int order)
-{
-}
-
-static inline void
-kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
-		     size_t size)
-{
-}
-
-static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object,
-				       size_t size)
-{
-}
-
-static inline void kmemcheck_pagealloc_alloc(struct page *p,
-	unsigned int order, gfp_t gfpflags)
-{
-}
-
-static inline bool kmemcheck_page_is_tracked(struct page *p)
-{
-	return false;
-}
-
-static inline void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_freed(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_unallocated_pages(struct page *p,
-						    unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_uninitialized_pages(struct page *p,
-						      unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_initialized_pages(struct page *p,
-						    unsigned int n)
-{
-}
-
-static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
-	return true;
-}
-
-#define kmemcheck_bitfield_begin(name)
-#define kmemcheck_bitfield_end(name)
-#define kmemcheck_annotate_bitfield(ptr, name)	\
-	do {					\
-	} while (0)
-
-#define kmemcheck_annotate_variable(var)	\
-	do {					\
-	} while (0)
-
-#endif /* CONFIG_KMEMCHECK */
-
-#endif /* LINUX_KMEMCHECK_H */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 662f7b1b7a78..2f5e87f1bae2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
 }
 EXPORT_SYMBOL(__tasklet_hi_schedule);
 
-void __tasklet_hi_schedule_first(struct tasklet_struct *t)
-{
-	lockdep_assert_irqs_disabled();
-
-	t->next = __this_cpu_read(tasklet_hi_vec.head);
-	__this_cpu_write(tasklet_hi_vec.head, t);
-	__raise_softirq_irqoff(HI_SOFTIRQ);
-}
-EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-
 static __latent_entropy void tasklet_action(struct softirq_action *a)
 {
 	struct tasklet_struct *list;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9576bd582d4a..7638e2f7fff8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,7 +30,6 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
-#include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -1173,15 +1172,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_thousand,
 	},
-#endif
-#ifdef CONFIG_KMEMCHECK
-	{
-		.procname	= "kmemcheck",
-		.data		= &kmemcheck_enabled,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #endif
 	{
 		.procname	= "panic_on_warn",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 07ce7449765a..5402e3954659 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -504,7 +504,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
 
 config DEBUG_SLAB
 	bool "Debug slab memory allocations"
-	depends on DEBUG_KERNEL && SLAB && !KMEMCHECK
+	depends on DEBUG_KERNEL && SLAB
 	help
 	  Say Y here to have the kernel do limited verification on memory
 	  allocation as well as poisoning memory on free to catch use of freed
@@ -516,7 +516,7 @@ config DEBUG_SLAB_LEAK
 
 config SLUB_DEBUG_ON
 	bool "SLUB debugging on by default"
-	depends on SLUB && SLUB_DEBUG && !KMEMCHECK
+	depends on SLUB && SLUB_DEBUG
 	default n
 	help
 	  Boot with debugging on by default. SLUB boots by default with
@@ -730,8 +730,6 @@ config DEBUG_STACKOVERFLOW
 
 	  If in doubt, say "N".
 
-source "lib/Kconfig.kmemcheck"
-
 source "lib/Kconfig.kasan"
 
 endmenu # "Memory Debugging"
diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck
deleted file mode 100644
index 846e039a86b4..000000000000
--- a/lib/Kconfig.kmemcheck
+++ /dev/null
@@ -1,94 +0,0 @@
-config HAVE_ARCH_KMEMCHECK
-	bool
-
-if HAVE_ARCH_KMEMCHECK
-
-menuconfig KMEMCHECK
-	bool "kmemcheck: trap use of uninitialized memory"
-	depends on DEBUG_KERNEL
-	depends on !X86_USE_3DNOW
-	depends on SLUB || SLAB
-	depends on !CC_OPTIMIZE_FOR_SIZE
-	depends on !FUNCTION_TRACER
-	select FRAME_POINTER
-	select STACKTRACE
-	default n
-	help
-	  This option enables tracing of dynamically allocated kernel memory
-	  to see if memory is used before it has been given an initial value.
-	  Be aware that this requires half of your memory for bookkeeping and
-	  will insert extra code at *every* read and write to tracked memory
-	  thus slow down the kernel code (but user code is unaffected).
-
-	  The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable
-	  or enable kmemcheck at boot-time. If the kernel is started with
-	  kmemcheck=0, the large memory and CPU overhead is not incurred.
-
-choice
-	prompt "kmemcheck: default mode at boot"
-	depends on KMEMCHECK
-	default KMEMCHECK_ONESHOT_BY_DEFAULT
-	help
-	  This option controls the default behaviour of kmemcheck when the
-	  kernel boots and no kmemcheck= parameter is given.
-
-config KMEMCHECK_DISABLED_BY_DEFAULT
-	bool "disabled"
-	depends on KMEMCHECK
-
-config KMEMCHECK_ENABLED_BY_DEFAULT
-	bool "enabled"
-	depends on KMEMCHECK
-
-config KMEMCHECK_ONESHOT_BY_DEFAULT
-	bool "one-shot"
-	depends on KMEMCHECK
-	help
-	  In one-shot mode, only the first error detected is reported before
-	  kmemcheck is disabled.
-
-endchoice
-
-config KMEMCHECK_QUEUE_SIZE
-	int "kmemcheck: error queue size"
-	depends on KMEMCHECK
-	default 64
-	help
-	  Select the maximum number of errors to store in the queue. Since
-	  errors can occur virtually anywhere and in any context, we need a
-	  temporary storage area which is guarantueed not to generate any
-	  other faults. The queue will be emptied as soon as a tasklet may
-	  be scheduled. If the queue is full, new error reports will be
-	  lost.
-
-config KMEMCHECK_SHADOW_COPY_SHIFT
-	int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)"
-	depends on KMEMCHECK
-	range 2 8
-	default 5
-	help
-	  Select the number of shadow bytes to save along with each entry of
-	  the queue. These bytes indicate what parts of an allocation are
-	  initialized, uninitialized, etc. and will be displayed when an
-	  error is detected to help the debugging of a particular problem.
-
-config KMEMCHECK_PARTIAL_OK
-	bool "kmemcheck: allow partially uninitialized memory"
-	depends on KMEMCHECK
-	default y
-	help
-	  This option works around certain GCC optimizations that produce
-	  32-bit reads from 16-bit variables where the upper 16 bits are
-	  thrown away afterwards. This may of course also hide some real
-	  bugs.
-
-config KMEMCHECK_BITOPS_OK
-	bool "kmemcheck: allow bit-field manipulation"
-	depends on KMEMCHECK
-	default n
-	help
-	  This option silences warnings that would be generated for bit-field
-	  accesses where not all the bits are initialized at the same time.
-	  This may also hide some real bugs.
-
-endif
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5b0adf1435de..e5e606ee5f71 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
 	depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
-	depends on !KMEMCHECK
 	select PAGE_EXTENSION
 	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
diff --git a/mm/Makefile b/mm/Makefile
index 4659b93cba43..e7ebd176fb93 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n
 KCOV_INSTRUMENT_page_alloc.o := n
 KCOV_INSTRUMENT_debug-pagealloc.o := n
 KCOV_INSTRUMENT_kmemleak.o := n
-KCOV_INSTRUMENT_kmemcheck.o := n
 KCOV_INSTRUMENT_memcontrol.o := n
 KCOV_INSTRUMENT_mmzone.o := n
 KCOV_INSTRUMENT_vmstat.o := n
@@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
-obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
 obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index b3a4d61d341c..cec594032515 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -1,126 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/gfp.h>
-#include <linux/mm_types.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include "slab.h"
-#include <linux/kmemcheck.h>
-
-void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
-{
-	struct page *shadow;
-	int pages;
-	int i;
-
-	pages = 1 << order;
-
-	/*
-	 * With kmemcheck enabled, we need to allocate a memory area for the
-	 * shadow bits as well.
-	 */
-	shadow = alloc_pages_node(node, flags, order);
-	if (!shadow) {
-		if (printk_ratelimit())
-			pr_err("kmemcheck: failed to allocate shadow bitmap\n");
-		return;
-	}
-
-	for(i = 0; i < pages; ++i)
-		page[i].shadow = page_address(&shadow[i]);
-
-	/*
-	 * Mark it as non-present for the MMU so that our accesses to
-	 * this memory will trigger a page fault and let us analyze
-	 * the memory accesses.
-	 */
-	kmemcheck_hide_pages(page, pages);
-}
-
-void kmemcheck_free_shadow(struct page *page, int order)
-{
-	struct page *shadow;
-	int pages;
-	int i;
-
-	if (!kmemcheck_page_is_tracked(page))
-		return;
-
-	pages = 1 << order;
-
-	kmemcheck_show_pages(page, pages);
-
-	shadow = virt_to_page(page[0].shadow);
-
-	for(i = 0; i < pages; ++i)
-		page[i].shadow = NULL;
-
-	__free_pages(shadow, order);
-}
-
-void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
-			  size_t size)
-{
-	if (unlikely(!object)) /* Skip object if allocation failed */
-		return;
-
-	/*
-	 * Has already been memset(), which initializes the shadow for us
-	 * as well.
-	 */
-	if (gfpflags & __GFP_ZERO)
-		return;
-
-	/* No need to initialize the shadow of a non-tracked slab. */
-	if (s->flags & SLAB_NOTRACK)
-		return;
-
-	if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
-		/*
-		 * Allow notracked objects to be allocated from
-		 * tracked caches. Note however that these objects
-		 * will still get page faults on access, they just
-		 * won't ever be flagged as uninitialized. If page
-		 * faults are not acceptable, the slab cache itself
-		 * should be marked NOTRACK.
-		 */
-		kmemcheck_mark_initialized(object, size);
-	} else if (!s->ctor) {
-		/*
-		 * New objects should be marked uninitialized before
-		 * they're returned to the called.
-		 */
-		kmemcheck_mark_uninitialized(object, size);
-	}
-}
-
-void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
-{
-	/* TODO: RCU freeing is unsupported for now; hide false positives. */
-	if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
-		kmemcheck_mark_freed(object, size);
-}
-
-void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
-			       gfp_t gfpflags)
-{
-	int pages;
-
-	if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
-		return;
-
-	pages = 1 << order;
-
-	/*
-	 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
-	 * can become uninitialized by copying uninitialized memory
-	 * into them.
-	 */
-
-	/* XXX: Can use zone->node for node? */
-	kmemcheck_alloc_shadow(page, order, gfpflags, -1);
-
-	if (gfpflags & __GFP_ZERO)
-		kmemcheck_mark_initialized_pages(page, pages);
-	else
-		kmemcheck_mark_uninitialized_pages(page, pages);
-}
diff --git a/mm/slub.c b/mm/slub.c
index c2c41e178acf..cfd56e5a35fb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1371,7 +1371,7 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x)
 	 * So in order to make the debug calls that expect irqs to be
 	 * disabled we need to disable interrupts temporarily.
 	 */
-#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+#ifdef CONFIG_LOCKDEP
 	{
 		unsigned long flags;
 
@@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
  * Compiler cannot detect this function can be removed if slab_free_hook()
  * evaluates to nothing.  Thus, catch all relevant config debug options here.
  */
-#if defined(CONFIG_KMEMCHECK) ||		\
-	defined(CONFIG_LOCKDEP)	||		\
+#if defined(CONFIG_LOCKDEP)	||		\
 	defined(CONFIG_DEBUG_KMEMLEAK) ||	\
 	defined(CONFIG_DEBUG_OBJECTS_FREE) ||	\
 	defined(CONFIG_KASAN)
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 67d051edd615..7bd52b8f63d4 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -2182,8 +2182,6 @@ sub dump_struct($$) {
 	# strip comments:
 	$members =~ s/\/\*.*?\*\///gos;
 	$nested =~ s/\/\*.*?\*\///gos;
-	# strip kmemcheck_bitfield_{begin,end}.*;
-	$members =~ s/kmemcheck_bitfield_.*?;//gos;
 	# strip attributes
 	$members =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i;
 	$members =~ s/__aligned\s*\([^;]*\)//gos;
diff --git a/tools/include/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h
index 2bccd2c7b897..ea32a7d3cf1b 100644
--- a/tools/include/linux/kmemcheck.h
+++ b/tools/include/linux/kmemcheck.h
@@ -1,9 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LIBLOCKDEP_LINUX_KMEMCHECK_H_
-#define _LIBLOCKDEP_LINUX_KMEMCHECK_H_
-
-static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
-}
-
-#endif
-- 
cgit v1.2.3


From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:38:03 -0800
Subject: mm: remove __GFP_COLD

As the page free path makes no distinction between cache hot and cold
pages, there is no real useful ordering of pages in the free list that
allocation requests can take advantage of.  Juding from the users of
__GFP_COLD, it is likely that a number of them are the result of copying
other sites instead of actually measuring the impact.  Remove the
__GFP_COLD parameter which simplifies a number of paths in the page
allocator.

This is potentially controversial but bear in mind that the size of the
per-cpu pagelists versus modern cache sizes means that the whole per-cpu
list can often fit in the L3 cache.  Hence, there is only a potential
benefit for microbenchmarks that alloc/free pages in a tight loop.  It's
even worse when THP is taken into account which has little or no chance
of getting a cache-hot page as the per-cpu list is bypassed and the
zeroing of multiple pages will thrash the cache anyway.

The truncate microbenchmarks are not shown as this patch affects the
allocation path and not the free path.  A page fault microbenchmark was
tested but it showed no sigificant difference which is not surprising
given that the __GFP_COLD branches are a miniscule percentage of the
fault path.

Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c         |  2 +-
 drivers/net/ethernet/amd/xgbe/xgbe-desc.c            |  2 +-
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c     |  3 +--
 .../net/ethernet/cavium/liquidio/octeon_network.h    |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c           |  5 ++---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c  |  4 ++--
 drivers/net/ethernet/qlogic/qlge/qlge_main.c         |  3 +--
 drivers/net/ethernet/sfc/falcon/rx.c                 |  2 +-
 drivers/net/ethernet/sfc/rx.c                        |  2 +-
 drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c      |  2 +-
 drivers/net/ethernet/ti/netcp_core.c                 |  2 +-
 drivers/net/virtio_net.c                             |  1 -
 drivers/staging/lustre/lustre/mdc/mdc_request.c      |  2 +-
 fs/cachefiles/rdwr.c                                 |  6 ++----
 include/linux/gfp.h                                  |  5 -----
 include/linux/pagemap.h                              |  8 +-------
 include/linux/skbuff.h                               |  2 +-
 include/linux/slab.h                                 |  3 ---
 include/trace/events/mmflags.h                       |  1 -
 kernel/power/snapshot.c                              |  4 ++--
 mm/filemap.c                                         |  6 +++---
 mm/page_alloc.c                                      | 20 ++++++--------------
 mm/percpu-vm.c                                       |  2 +-
 net/core/skbuff.c                                    |  4 ++--
 tools/perf/builtin-kmem.c                            |  1 -
 25 files changed, 32 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index c6bd5e24005d..fbbbd8b3eb45 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -517,7 +517,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num)
 
 
 		rc = ena_alloc_rx_page(rx_ring, rx_info,
-				       __GFP_COLD | GFP_ATOMIC | __GFP_COMP);
+				       GFP_ATOMIC | __GFP_COMP);
 		if (unlikely(rc < 0)) {
 			netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev,
 				   "failed to alloc buffer for rx queue %d\n",
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
index 45d92304068e..cc1e4f820e64 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
@@ -295,7 +295,7 @@ again:
 	order = alloc_order;
 
 	/* Try to obtain pages, decreasing order if necessary */
-	gfp = GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_NOWARN;
+	gfp = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN;
 	while (order >= 0) {
 		pages = alloc_pages_node(node, gfp, order);
 		if (pages)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index 0654e0c76bc2..519ca6534b85 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -304,8 +304,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self)
 		buff->flags = 0U;
 		buff->len = AQ_CFG_RX_FRAME_MAX;
 
-		buff->page = alloc_pages(GFP_ATOMIC | __GFP_COLD |
-					 __GFP_COMP, pages_order);
+		buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order);
 		if (!buff->page) {
 			err = -ENOMEM;
 			goto err_exit;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index 9e36319cead6..57853eead4b5 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -195,7 +195,7 @@ static inline void
 	struct sk_buff *skb;
 	struct octeon_skb_page_info *skb_pg_info;
 
-	page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+	page = alloc_page(GFP_ATOMIC);
 	if (unlikely(!page))
 		return NULL;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b97a55c827eb..ffead38cf5da 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -193,7 +193,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
 
 			if (mlx4_en_prepare_rx_desc(priv, ring,
 						    ring->actual_size,
-						    GFP_KERNEL | __GFP_COLD)) {
+						    GFP_KERNEL)) {
 				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
 					en_err(priv, "Failed to allocate enough rx buffers\n");
 					return -ENOMEM;
@@ -552,8 +552,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
 	do {
 		if (mlx4_en_prepare_rx_desc(priv, ring,
 					    ring->prod & ring->size_mask,
-					    GFP_ATOMIC | __GFP_COLD |
-					    __GFP_MEMALLOC))
+					    GFP_ATOMIC | __GFP_MEMALLOC))
 			break;
 		ring->prod++;
 	} while (likely(--missing));
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index e118b5f23996..ffb12dc13a5a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1185,7 +1185,7 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
 	} else {
 		struct page *page;
 
-		page = alloc_page(GFP_KERNEL | __GFP_COLD);
+		page = alloc_page(GFP_KERNEL);
 		frag = page ? page_address(page) : NULL;
 	}
 	if (!frag) {
@@ -1212,7 +1212,7 @@ static void *nfp_net_napi_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
 	} else {
 		struct page *page;
 
-		page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+		page = alloc_page(GFP_ATOMIC);
 		frag = page ? page_address(page) : NULL;
 	}
 	if (!frag) {
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 29fea74bff2e..7b97a9969046 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -1092,8 +1092,7 @@ static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring,
 {
 	if (!rx_ring->pg_chunk.page) {
 		u64 map;
-		rx_ring->pg_chunk.page = alloc_pages(__GFP_COLD | __GFP_COMP |
-						GFP_ATOMIC,
+		rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC,
 						qdev->lbq_buf_order);
 		if (unlikely(!rx_ring->pg_chunk.page)) {
 			netif_err(qdev, drv, qdev->ndev,
diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c
index 6a8406dc0c2b..91097aea6c41 100644
--- a/drivers/net/ethernet/sfc/falcon/rx.c
+++ b/drivers/net/ethernet/sfc/falcon/rx.c
@@ -163,7 +163,7 @@ static int ef4_init_rx_buffers(struct ef4_rx_queue *rx_queue, bool atomic)
 	do {
 		page = ef4_reuse_page(rx_queue);
 		if (page == NULL) {
-			page = alloc_pages(__GFP_COLD | __GFP_COMP |
+			page = alloc_pages(__GFP_COMP |
 					   (atomic ? GFP_ATOMIC : GFP_KERNEL),
 					   efx->rx_buffer_order);
 			if (unlikely(page == NULL))
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 42443f434569..0004c50d3c83 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -163,7 +163,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic)
 	do {
 		page = efx_reuse_page(rx_queue);
 		if (page == NULL) {
-			page = alloc_pages(__GFP_COLD | __GFP_COMP |
+			page = alloc_pages(__GFP_COMP |
 					   (atomic ? GFP_ATOMIC : GFP_KERNEL),
 					   efx->rx_buffer_order);
 			if (unlikely(page == NULL))
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
index e9672b1f9968..031cf9c3435a 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
@@ -335,7 +335,7 @@ static int xlgmac_alloc_pages(struct xlgmac_pdata *pdata,
 	dma_addr_t pages_dma;
 
 	/* Try to obtain pages, decreasing order if necessary */
-	gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN;
+	gfp |= __GFP_COMP | __GFP_NOWARN;
 	while (order >= 0) {
 		pages = alloc_pages(gfp, order);
 		if (pages)
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 437d36289786..50d2b76771b5 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -906,7 +906,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
 		sw_data[0] = (u32)bufptr;
 	} else {
 		/* Allocate a secondary receive queue entry */
-		page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD);
+		page = alloc_page(GFP_ATOMIC | GFP_DMA);
 		if (unlikely(!page)) {
 			dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n");
 			goto fail;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 511f8339fa96..5eec09d63fc0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -988,7 +988,6 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
 	int err;
 	bool oom;
 
-	gfp |= __GFP_COLD;
 	do {
 		if (vi->mergeable_rx_bufs)
 			err = add_recvbuf_mergeable(vi, rq, gfp);
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
index 9e538a59f09d..03e55bca4ada 100644
--- a/drivers/staging/lustre/lustre/mdc/mdc_request.c
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -1152,7 +1152,7 @@ static int mdc_read_page_remote(void *data, struct page *page0)
 	}
 
 	for (npages = 1; npages < max_pages; npages++) {
-		page = page_cache_alloc_cold(inode->i_mapping);
+		page = page_cache_alloc(inode->i_mapping);
 		if (!page)
 			break;
 		page_pool[npages] = page;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 23097cca2674..883bc7bb12c5 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 			goto backing_page_already_present;
 
 		if (!newpage) {
-			newpage = __page_cache_alloc(cachefiles_gfp |
-						     __GFP_COLD);
+			newpage = __page_cache_alloc(cachefiles_gfp);
 			if (!newpage)
 				goto nomem_monitor;
 		}
@@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 				goto backing_page_already_present;
 
 			if (!newpage) {
-				newpage = __page_cache_alloc(cachefiles_gfp |
-							     __GFP_COLD);
+				newpage = __page_cache_alloc(cachefiles_gfp);
 				if (!newpage)
 					goto nomem;
 			}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f7e62d9096fe..1a4582b44d32 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -24,7 +24,6 @@ struct vm_area_struct;
 #define ___GFP_HIGH		0x20u
 #define ___GFP_IO		0x40u
 #define ___GFP_FS		0x80u
-#define ___GFP_COLD		0x100u
 #define ___GFP_NOWARN		0x200u
 #define ___GFP_RETRY_MAYFAIL	0x400u
 #define ___GFP_NOFAIL		0x800u
@@ -192,16 +191,12 @@ struct vm_area_struct;
 /*
  * Action modifiers
  *
- * __GFP_COLD indicates that the caller does not expect to be used in the near
- *   future. Where possible, a cache-cold page will be returned.
- *
  * __GFP_NOWARN suppresses allocation failure reports.
  *
  * __GFP_COMP address compound page metadata.
  *
  * __GFP_ZERO returns a zeroed page on success.
  */
-#define __GFP_COLD	((__force gfp_t)___GFP_COLD)
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4c6790bb7afb..34ce3ebf97d5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -234,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x)
 	return __page_cache_alloc(mapping_gfp_mask(x));
 }
 
-static inline struct page *page_cache_alloc_cold(struct address_space *x)
-{
-	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
-}
-
 static inline gfp_t readahead_gfp_mask(struct address_space *x)
 {
-	return mapping_gfp_mask(x) |
-				  __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
+	return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
 }
 
 typedef int filler_t(void *, struct page *);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index aa1341474916..7c46fd0b8b64 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2672,7 +2672,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
 	 * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
 	 *     code in gfp_to_alloc_flags that should be enforcing this.
 	 */
-	gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC;
+	gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
 
 	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
 }
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 79f6532f8a0b..50697a1d6621 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -467,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  * Also it is possible to set different flags by OR'ing
  * in one or more of the following additional @flags:
  *
- * %__GFP_COLD - Request cache-cold pages instead of
- *   trying to return cache-warm pages.
- *
  * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
  *
  * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 72162f3a03fa..dbe1bb058c09 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -32,7 +32,6 @@
 	{(unsigned long)__GFP_ATOMIC,		"__GFP_ATOMIC"},	\
 	{(unsigned long)__GFP_IO,		"__GFP_IO"},		\
 	{(unsigned long)__GFP_FS,		"__GFP_FS"},		\
-	{(unsigned long)__GFP_COLD,		"__GFP_COLD"},		\
 	{(unsigned long)__GFP_NOWARN,		"__GFP_NOWARN"},	\
 	{(unsigned long)__GFP_RETRY_MAYFAIL,	"__GFP_RETRY_MAYFAIL"},	\
 	{(unsigned long)__GFP_NOFAIL,		"__GFP_NOFAIL"},	\
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a917a301e201..bce0464524d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
  */
 static inline int get_highmem_buffer(int safe_needed)
 {
-	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+	buffer = get_image_page(GFP_ATOMIC, safe_needed);
 	return buffer ? 0 : -ENOMEM;
 }
 
@@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm,
 		while (nr_pages-- > 0) {
 			struct page *page;
 
-			page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+			page = alloc_image_page(GFP_ATOMIC);
 			if (!page)
 				goto err_out;
 			memory_bm_set_bit(copy_bm, page_to_pfn(page));
diff --git a/mm/filemap.c b/mm/filemap.c
index 90a9f261f85f..923fc2ebd74a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2272,7 +2272,7 @@ no_cached_page:
 		 * Ok, it wasn't cached, so we need to create a new
 		 * page..
 		 */
-		page = page_cache_alloc_cold(mapping);
+		page = page_cache_alloc(mapping);
 		if (!page) {
 			error = -ENOMEM;
 			goto out;
@@ -2384,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 	int ret;
 
 	do {
-		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
+		page = __page_cache_alloc(gfp_mask);
 		if (!page)
 			return -ENOMEM;
 
@@ -2788,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
 repeat:
 	page = find_get_page(mapping, index);
 	if (!page) {
-		page = __page_cache_alloc(gfp | __GFP_COLD);
+		page = __page_cache_alloc(gfp);
 		if (!page)
 			return ERR_PTR(-ENOMEM);
 		err = add_to_page_cache_lru(page, mapping, index, gfp);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f265d37b3152..370b64d03e3f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2336,7 +2336,7 @@ retry:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype, bool cold)
+			int migratetype)
 {
 	int i, alloced = 0;
 
@@ -2358,10 +2358,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * merge IO requests if the physical pages are ordered
 		 * properly.
 		 */
-		if (likely(!cold))
-			list_add(&page->lru, list);
-		else
-			list_add_tail(&page->lru, list);
+		list_add(&page->lru, list);
 		list = &page->lru;
 		alloced++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
@@ -2795,7 +2792,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 
 /* Remove page from the per-cpu list, caller must protect the list */
 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
-			bool cold, struct per_cpu_pages *pcp,
+			struct per_cpu_pages *pcp,
 			struct list_head *list)
 {
 	struct page *page;
@@ -2804,16 +2801,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, list,
-					migratetype, cold);
+					migratetype);
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
 
-		if (cold)
-			page = list_last_entry(list, struct page, lru);
-		else
-			page = list_first_entry(list, struct page, lru);
-
+		page = list_first_entry(list, struct page, lru);
 		list_del(&page->lru);
 		pcp->count--;
 	} while (check_new_pcp(page));
@@ -2828,14 +2821,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 {
 	struct per_cpu_pages *pcp;
 	struct list_head *list;
-	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
 	unsigned long flags;
 
 	local_irq_save(flags);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
-	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
+	page = __rmqueue_pcplist(zone,  migratetype, pcp, list);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 15dab691ea70..9158e5a81391 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
 			    struct page **pages, int page_start, int page_end)
 {
-	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
 	unsigned int cpu, tcpu;
 	int i;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6cd057b41f34..9c68555bb906 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -353,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
  */
 void *netdev_alloc_frag(unsigned int fragsz)
 {
-	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+	return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
 }
 EXPORT_SYMBOL(netdev_alloc_frag);
 
@@ -366,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 
 void *napi_alloc_frag(unsigned int fragsz)
 {
-	return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
 }
 EXPORT_SYMBOL(napi_alloc_frag);
 
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index cbf70738ef5f..ae11e4c3516a 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -641,7 +641,6 @@ static const struct {
 	{ "__GFP_ATOMIC",		"_A" },
 	{ "__GFP_IO",			"I" },
 	{ "__GFP_FS",			"F" },
-	{ "__GFP_COLD",			"CO" },
 	{ "__GFP_NOWARN",		"NWR" },
 	{ "__GFP_RETRY_MAYFAIL",	"R" },
 	{ "__GFP_NOFAIL",		"NF" },
-- 
cgit v1.2.3


From 4518085e127dff97e74f74a8780d7564e273bec8 Mon Sep 17 00:00:00 2001
From: Kemi Wang <kemi.wang@intel.com>
Date: Wed, 15 Nov 2017 17:38:22 -0800
Subject: mm, sysctl: make NUMA stats configurable

This is the second step which introduces a tunable interface that allow
numa stats configurable for optimizing zone_statistics(), as suggested
by Dave Hansen and Ying Huang.

=========================================================================

When page allocation performance becomes a bottleneck and you can
tolerate some possible tool breakage and decreased numa counter
precision, you can do:

	echo 0 > /proc/sys/vm/numa_stat

In this case, numa counter update is ignored.  We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and
reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315)
drop of cpu cycles per single page allocation and reclaim on Jesper's
page_bench03 (88 threads) running on a 2-Socket Broadwell-based server
(88 threads, 126G memory).

Benchmark link provided by Jesper D Brouer (increase loop times to
10000000):

  https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

=========================================================================

When page allocation performance is not a bottleneck and you want all
tooling to work, you can do:

	echo 1 > /proc/sys/vm/numa_stat

This is system default setting.

Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka
for comments to help improve the original patch.

[keescook@chromium.org: make sure mutex is a global static]
  Link: http://lkml.kernel.org/r/20171107213809.GA4314@beast
Link: http://lkml.kernel.org/r/1508290927-8518-1-git-send-email-kemi.wang@intel.com
Signed-off-by: Kemi Wang <kemi.wang@intel.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Suggested-by: Ying Huang <ying.huang@intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt | 16 ++++++++++
 include/linux/vmstat.h      | 10 +++++++
 kernel/sysctl.c             |  9 ++++++
 mm/mempolicy.c              |  3 ++
 mm/page_alloc.c             |  6 ++++
 mm/vmstat.c                 | 71 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 115 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 3e579740b49f..055c8b3e1018 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm:
 - percpu_pagelist_fraction
 - stat_interval
 - stat_refresh
+- numa_stat
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -799,6 +800,21 @@ with no ill effects: errors and warnings on these stats are suppressed.)
 
 ==============================================================
 
+numa_stat
+
+This interface allows runtime configuration of numa statistics.
+
+When page allocation performance becomes a bottleneck and you can tolerate
+some possible tool breakage and decreased numa counter precision, you can
+do:
+	echo 0 > /proc/sys/vm/numa_stat
+
+When page allocation performance is not a bottleneck and you want all
+tooling to work, you can do:
+	echo 1 > /proc/sys/vm/numa_stat
+
+==============================================================
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1e0cb72e0598..1779c9817b39 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -7,9 +7,19 @@
 #include <linux/mmzone.h>
 #include <linux/vm_event_item.h>
 #include <linux/atomic.h>
+#include <linux/static_key.h>
 
 extern int sysctl_stat_interval;
 
+#ifdef CONFIG_NUMA
+#define ENABLE_NUMA_STAT   1
+#define DISABLE_NUMA_STAT   0
+extern int sysctl_vm_numa_stat;
+DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
+extern int sysctl_vm_numa_stat_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *length, loff_t *ppos);
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7638e2f7fff8..4a13a389e99b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1356,6 +1356,15 @@ static struct ctl_table vm_table[] = {
 		.mode           = 0644,
 		.proc_handler   = &hugetlb_mempolicy_sysctl_handler,
 	},
+	{
+		.procname		= "numa_stat",
+		.data			= &sysctl_vm_numa_stat,
+		.maxlen			= sizeof(int),
+		.mode			= 0644,
+		.proc_handler	= sysctl_vm_numa_stat_handler,
+		.extra1			= &zero,
+		.extra2			= &one,
+	},
 #endif
 	 {
 		.procname	= "hugetlb_shm_group",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index dad166b736ba..4ce44d3ff03d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1915,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 	struct page *page;
 
 	page = __alloc_pages(gfp, order, nid);
+	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
+	if (!static_branch_likely(&vm_numa_stat_key))
+		return page;
 	if (page && page_to_nid(page) == nid) {
 		preempt_disable();
 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7ca668e946e5..67f523c4711a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -82,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
 #endif
 
+DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
+
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 /*
  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -2777,6 +2779,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 #ifdef CONFIG_NUMA
 	enum numa_stat_item local_stat = NUMA_LOCAL;
 
+	/* skip numa counters update if numa stats is disabled */
+	if (!static_branch_likely(&vm_numa_stat_key))
+		return;
+
 	if (z->node != numa_node_id())
 		local_stat = NUMA_OTHER;
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7d11554861e4..40b2db6db6b1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -32,6 +32,77 @@
 
 #define NUMA_STATS_THRESHOLD (U16_MAX - 2)
 
+#ifdef CONFIG_NUMA
+int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
+
+/* zero numa counters within a zone */
+static void zero_zone_numa_counters(struct zone *zone)
+{
+	int item, cpu;
+
+	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
+		atomic_long_set(&zone->vm_numa_stat[item], 0);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
+						= 0;
+	}
+}
+
+/* zero numa counters of all the populated zones */
+static void zero_zones_numa_counters(void)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone)
+		zero_zone_numa_counters(zone);
+}
+
+/* zero global numa counters */
+static void zero_global_numa_counters(void)
+{
+	int item;
+
+	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
+		atomic_long_set(&vm_numa_stat[item], 0);
+}
+
+static void invalid_numa_statistics(void)
+{
+	zero_zones_numa_counters();
+	zero_global_numa_counters();
+}
+
+static DEFINE_MUTEX(vm_numa_stat_lock);
+
+int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret, oldval;
+
+	mutex_lock(&vm_numa_stat_lock);
+	if (write)
+		oldval = sysctl_vm_numa_stat;
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret || !write)
+		goto out;
+
+	if (oldval == sysctl_vm_numa_stat)
+		goto out;
+	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
+		static_branch_enable(&vm_numa_stat_key);
+		pr_info("enable numa statistics\n");
+	} else {
+		static_branch_disable(&vm_numa_stat_key);
+		invalid_numa_statistics();
+		pr_info("disable numa statistics, and clear numa counters\n");
+	}
+
+out:
+	mutex_unlock(&vm_numa_stat_lock);
+	return ret;
+}
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
-- 
cgit v1.2.3


From b1fca27d384e8418aac84b39f6f5179aecc1b64f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 17 Nov 2017 15:27:03 -0800
Subject: kernel debug: support resetting WARN*_ONCE

I like _ONCE warnings because it's guaranteed that they don't flood the
log.

During testing I find it useful to reset the state of the once warnings,
so that I can rerun tests and see if they trigger again, or can
guarantee that a test run always hits the same warnings.

This patch adds a debugfs interface to reset all the _ONCE warnings so
that they appear again:

  echo 1 > /sys/kernel/debug/clear_warn_once

This is implemented by putting all the warning booleans into a special
section, and clearing it.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20171017221455.6740-1-andi@firstfloor.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/clearing-warn-once.txt |  7 +++++++
 include/asm-generic/bug.h            |  6 +++---
 include/asm-generic/sections.h       |  1 +
 include/asm-generic/vmlinux.lds.h    |  3 +++
 kernel/panic.c                       | 28 ++++++++++++++++++++++++++++
 5 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/clearing-warn-once.txt

(limited to 'kernel')

diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt
new file mode 100644
index 000000000000..5b1f5d547be1
--- /dev/null
+++ b/Documentation/clearing-warn-once.txt
@@ -0,0 +1,7 @@
+
+WARN_ONCE / WARN_ON_ONCE only print a warning once.
+
+echo 1 > /sys/kernel/debug/clear_warn_once
+
+clears the state and allows the warnings to print once again.
+This can be useful after test suite runs to reproduce problems.
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index af2cc94a61bf..7844b0df88cd 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -130,7 +130,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 
 #ifndef WARN_ON_ONCE
 #define WARN_ON_ONCE(condition)	({				\
-	static bool __section(.data.unlikely) __warned;		\
+	static bool __section(.data.once) __warned;		\
 	int __ret_warn_once = !!(condition);			\
 								\
 	if (unlikely(__ret_warn_once && !__warned)) {		\
@@ -142,7 +142,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 #endif
 
 #define WARN_ONCE(condition, format...)	({			\
-	static bool __section(.data.unlikely) __warned;		\
+	static bool __section(.data.once) __warned;		\
 	int __ret_warn_once = !!(condition);			\
 								\
 	if (unlikely(__ret_warn_once && !__warned)) {		\
@@ -153,7 +153,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 })
 
 #define WARN_TAINT_ONCE(condition, taint, format...)	({	\
-	static bool __section(.data.unlikely) __warned;		\
+	static bool __section(.data.once) __warned;		\
 	int __ret_warn_once = !!(condition);			\
 								\
 	if (unlikely(__ret_warn_once && !__warned)) {		\
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 6d9576931084..03cc5f9bba71 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -44,6 +44,7 @@ extern char __entry_text_start[], __entry_text_end[];
 extern char __start_rodata[], __end_rodata[];
 extern char __irqentry_text_start[], __irqentry_text_end[];
 extern char __softirqentry_text_start[], __softirqentry_text_end[];
+extern char __start_once[], __end_once[];
 
 /* Start and end of .ctors section - used for constructor calls. */
 extern char __ctors_start[], __ctors_end[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index bdcd1caae092..ee8b707d9fa9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -223,6 +223,9 @@
 	MEM_KEEP(init.data)						\
 	MEM_KEEP(exit.data)						\
 	*(.data.unlikely)						\
+	VMLINUX_SYMBOL(__start_once) = .;				\
+	*(.data.once)							\
+	VMLINUX_SYMBOL(__end_once) = .;					\
 	STRUCT_ALIGN();							\
 	*(__tracepoints)						\
 	/* implement dynamic printk debug */				\
diff --git a/kernel/panic.c b/kernel/panic.c
index bdd18afa19a4..672a91dc20fe 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,6 +27,8 @@
 #include <linux/console.h>
 #include <linux/bug.h>
 #include <linux/ratelimit.h>
+#include <linux/debugfs.h>
+#include <asm/sections.h>
 
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
@@ -587,6 +589,32 @@ void warn_slowpath_null(const char *file, int line)
 EXPORT_SYMBOL(warn_slowpath_null);
 #endif
 
+#ifdef CONFIG_BUG
+
+/* Support resetting WARN*_ONCE state */
+
+static int clear_warn_once_set(void *data, u64 val)
+{
+	memset(__start_once, 0, __end_once - __start_once);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
+			NULL,
+			clear_warn_once_set,
+			"%lld\n");
+
+static __init int register_warn_debugfs(void)
+{
+	/* Don't care about failure */
+	debugfs_create_file("clear_warn_once", 0644, NULL,
+			    NULL, &clear_warn_once_fops);
+	return 0;
+}
+
+device_initcall(register_warn_debugfs);
+#endif
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 
 /*
-- 
cgit v1.2.3


From aaf5dcfb223617ac2d16113e4b500199c65689de Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 17 Nov 2017 15:27:06 -0800
Subject: kernel debug: support resetting WARN_ONCE for all architectures

Some architectures store the WARN_ONCE state in the flags field of the
bug_entry.  Clear that one too when resetting once state through
/sys/kernel/debug/clear_warn_once

Pointed out by Michael Ellerman

Improves the earlier patch that add clear_warn_once.

[ak@linux.intel.com: add a missing ifdef CONFIG_MODULES]
  Link: http://lkml.kernel.org/r/20171020170633.9593-1-andi@firstfloor.org
[akpm@linux-foundation.org: fix unused var warning]
[akpm@linux-foundation.org: Use 0200 for clear_warn_once file, per mpe]
[akpm@linux-foundation.org: clear BUGFLAG_DONE in clear_once_table(), per mpe]
Link: http://lkml.kernel.org/r/20171019204642.7404-1-andi@firstfloor.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bug.h |  5 +++++
 kernel/panic.c      |  3 ++-
 lib/bug.c           | 23 +++++++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bug.h b/include/linux/bug.h
index da4231c905c8..fe5916550da8 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -43,6 +43,8 @@ enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
 /* These are defined by the architecture */
 int is_valid_bugaddr(unsigned long addr);
 
+void generic_bug_clear_once(void);
+
 #else	/* !CONFIG_GENERIC_BUG */
 
 static inline enum bug_trap_type report_bug(unsigned long bug_addr,
@@ -51,6 +53,9 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
 	return BUG_TRAP_TYPE_BUG;
 }
 
+
+static inline void generic_bug_clear_once(void) {}
+
 #endif	/* CONFIG_GENERIC_BUG */
 
 /*
diff --git a/kernel/panic.c b/kernel/panic.c
index 672a91dc20fe..67cebf2a3b67 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -595,6 +595,7 @@ EXPORT_SYMBOL(warn_slowpath_null);
 
 static int clear_warn_once_set(void *data, u64 val)
 {
+	generic_bug_clear_once();
 	memset(__start_once, 0, __end_once - __start_once);
 	return 0;
 }
@@ -607,7 +608,7 @@ DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
 static __init int register_warn_debugfs(void)
 {
 	/* Don't care about failure */
-	debugfs_create_file("clear_warn_once", 0644, NULL,
+	debugfs_create_file("clear_warn_once", 0200, NULL,
 			    NULL, &clear_warn_once_fops);
 	return 0;
 }
diff --git a/lib/bug.c b/lib/bug.c
index 1e094408c893..f66be6bf6206 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -196,3 +196,26 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 
 	return BUG_TRAP_TYPE_BUG;
 }
+
+static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
+{
+	struct bug_entry *bug;
+
+	for (bug = start; bug < end; bug++)
+		bug->flags &= ~BUGFLAG_DONE;
+}
+
+void generic_bug_clear_once(void)
+{
+#ifdef CONFIG_MODULES
+	struct module *mod;
+
+	rcu_read_lock_sched();
+	list_for_each_entry_rcu(mod, &module_bug_list, bug_list)
+		clear_once_table(mod->bug_table,
+				 mod->bug_table + mod->num_bugs);
+	rcu_read_unlock_sched();
+#endif
+
+	clear_once_table(__start___bug_table, __stop___bug_table);
+}
-- 
cgit v1.2.3


From 2a8358d8a339540f00ec596526690e8eeca931a3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 17 Nov 2017 15:27:21 -0800
Subject: bug: define the "cut here" string in a single place

The "cut here" string is used in a few paths.  Define it in a single
place.

Link: http://lkml.kernel.org/r/1510100869-73751-3-git-send-email-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mn10300/mm/fault.c   | 2 +-
 include/asm-generic/bug.h | 2 ++
 kernel/panic.c            | 2 +-
 lib/bug.c                 | 2 +-
 4 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index f23781d6bbb3..f0bfa1448744 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -60,7 +60,7 @@ void bust_spinlocks(int yes)
 void do_BUG(const char *file, int line)
 {
 	bust_spinlocks(1);
-	printk(KERN_EMERG "------------[ cut here ]------------\n");
+	printk(KERN_EMERG CUT_HERE);
 	printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
 }
 
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 7844b0df88cd..1283473f234e 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -4,6 +4,8 @@
 
 #include <linux/compiler.h>
 
+#define CUT_HERE		"------------[ cut here ]------------\n"
+
 #ifdef CONFIG_GENERIC_BUG
 #define BUGFLAG_WARNING		(1 << 0)
 #define BUGFLAG_ONCE		(1 << 1)
diff --git a/kernel/panic.c b/kernel/panic.c
index 67cebf2a3b67..89df5fa2f798 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -520,7 +520,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 {
 	disable_trace_on_warning();
 
-	pr_warn("------------[ cut here ]------------\n");
+	pr_warn(CUT_HERE);
 
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
diff --git a/lib/bug.c b/lib/bug.c
index f66be6bf6206..c1b0fad31b10 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -186,7 +186,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 		return BUG_TRAP_TYPE_WARN;
 	}
 
-	printk(KERN_DEFAULT "------------[ cut here ]------------\n");
+	printk(KERN_DEFAULT CUT_HERE);
 
 	if (file)
 		pr_crit("kernel BUG at %s:%u!\n", file, line);
-- 
cgit v1.2.3


From a7bed27af194aa3f67915688039d93188ed95e2a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 17 Nov 2017 15:27:24 -0800
Subject: bug: fix "cut here" location for __WARN_TAINT architectures

Prior to v4.11, x86 used warn_slowpath_fmt() for handling WARN()s.
After WARN() was moved to using UD0 on x86, the warning text started
appearing _before_ the "cut here" line.  This appears to have been a
long-standing bug on architectures that used __WARN_TAINT, but it didn't
get fixed.

v4.11 and earlier on x86:

  ------------[ cut here ]------------
  WARNING: CPU: 0 PID: 2956 at drivers/misc/lkdtm_bugs.c:65 lkdtm_WARNING+0x21/0x30
  This is a warning message
  Modules linked in:

v4.12 and later on x86:

  This is a warning message
  ------------[ cut here ]------------
  WARNING: CPU: 1 PID: 2982 at drivers/misc/lkdtm_bugs.c:68 lkdtm_WARNING+0x15/0x20
  Modules linked in:

With this fix:

  ------------[ cut here ]------------
  This is a warning message
  WARNING: CPU: 3 PID: 3009 at drivers/misc/lkdtm_bugs.c:67 lkdtm_WARNING+0x15/0x20

Since the __FILE__ reporting happens as part of the UD0 handler, it
isn't trivial to move the message to after the WARNING line, but at
least we can fix the position of the "cut here" line so all the various
logging tools will start including the actual runtime warning message
again, when they follow the instruction and "cut here".

Link: http://lkml.kernel.org/r/1510100869-73751-4-git-send-email-keescook@chromium.org
Fixes: 9a93848fe787 ("x86/debug: Implement __WARN() using UD0")
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/bug.h |  5 +++--
 kernel/panic.c            | 16 +++++++++++++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 1283473f234e..963b755d19b0 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -92,10 +92,11 @@ extern void warn_slowpath_null(const char *file, const int line);
 #define __WARN_printf_taint(taint, arg...)				\
 	warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg)
 #else
+extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 #define __WARN()		__WARN_TAINT(TAINT_WARN)
-#define __WARN_printf(arg...)	do { printk(arg); __WARN(); } while (0)
+#define __WARN_printf(arg...)	do { __warn_printk(arg); __WARN(); } while (0)
 #define __WARN_printf_taint(taint, arg...)				\
-	do { printk(arg); __WARN_TAINT(taint); } while (0)
+	do { __warn_printk(arg); __WARN_TAINT(taint); } while (0)
 #endif
 
 /* used internally by panic.c */
diff --git a/kernel/panic.c b/kernel/panic.c
index 89df5fa2f798..3242b64b1956 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -520,7 +520,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 {
 	disable_trace_on_warning();
 
-	pr_warn(CUT_HERE);
+	if (args)
+		pr_warn(CUT_HERE);
 
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
@@ -584,9 +585,22 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint);
 
 void warn_slowpath_null(const char *file, int line)
 {
+	pr_warn(CUT_HERE);
 	__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
 }
 EXPORT_SYMBOL(warn_slowpath_null);
+#else
+void __warn_printk(const char *fmt, ...)
+{
+	va_list args;
+
+	pr_warn(CUT_HERE);
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL(__warn_printk);
 #endif
 
 #ifdef CONFIG_BUG
-- 
cgit v1.2.3


From 8c703d660450c4df72ff24f79a335dc7973a9dc8 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Fri, 17 Nov 2017 15:27:32 -0800
Subject: kernel/umh.c: optimize 'proc_cap_handler()'

If 'write' is 0, we can avoid a call to spin_lock/spin_unlock.

Link: http://lkml.kernel.org/r/20171020193331.7233-1-christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Luis R. Rodriguez <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/umh.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/umh.c b/kernel/umh.c
index 6ff9905250ff..18e5fa4b0e71 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	/*
 	 * Drop everything not in the new_cap (but don't add things)
 	 */
-	spin_lock(&umh_sysctl_lock);
 	if (write) {
+		spin_lock(&umh_sysctl_lock);
 		if (table->data == CAP_BSET)
 			usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
 		if (table->data == CAP_PI)
 			usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+		spin_unlock(&umh_sysctl_lock);
 	}
-	spin_unlock(&umh_sysctl_lock);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 98159d977f71c3b3dee898d1c34e56f520b094e7 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 17 Nov 2017 15:29:17 -0800
Subject: pipe: match pipe_max_size data type with procfs

Patch series "A few round_pipe_size() and pipe-max-size fixups", v3.

While backporting Michael's "pipe: fix limit handling" patchset to a
distro-kernel, Mikulas noticed that current upstream pipe limit handling
contains a few problems:

  1 - procfs signed wrap: echo'ing a large number into
      /proc/sys/fs/pipe-max-size and then cat'ing it back out shows a
      negative value.

  2 - round_pipe_size() nr_pages overflow on 32bit:  this would
      subsequently try roundup_pow_of_two(0), which is undefined.

  3 - visible non-rounded pipe-max-size value: there is no mutual
      exclusion or protection between the time pipe_max_size is assigned
      a raw value from proc_dointvec_minmax() and when it is rounded.

  4 - unsigned long -> unsigned int conversion makes for potential odd
      return errors from do_proc_douintvec_minmax_conv() and
      do_proc_dopipe_max_size_conv().

This version underwent the same testing as v1:
https://marc.info/?l=linux-kernel&m=150643571406022&w=2

This patch (of 4):

pipe_max_size is defined as an unsigned int:

  unsigned int pipe_max_size = 1048576;

but its procfs/sysctl representation is an integer:

  static struct ctl_table fs_table[] = {
          ...
          {
                  .procname       = "pipe-max-size",
                  .data           = &pipe_max_size,
                  .maxlen         = sizeof(int),
                  .mode           = 0644,
                  .proc_handler   = &pipe_proc_fn,
                  .extra1         = &pipe_min_size,
          },
          ...

that is signed:

  int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
                   size_t *lenp, loff_t *ppos)
  {
          ...
          ret = proc_dointvec_minmax(table, write, buf, lenp, ppos)

This leads to signed results via procfs for large values of pipe_max_size:

  % echo 2147483647 >/proc/sys/fs/pipe-max-size
  % cat /proc/sys/fs/pipe-max-size
  -2147483648

Use unsigned operations on this variable to avoid such negative values.

Link: http://lkml.kernel.org/r/1507658689-11669-2-git-send-email-joe.lawrence@redhat.com
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c       | 2 +-
 kernel/sysctl.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/fs/pipe.c b/fs/pipe.c
index 349c9d56d4b3..3909c55ed389 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1125,7 +1125,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
 {
 	int ret;
 
-	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	ret = proc_douintvec_minmax(table, write, buf, lenp, ppos);
 	if (ret < 0 || !write)
 		return ret;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4a13a389e99b..2d42183b4c98 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1816,7 +1816,7 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "pipe-max-size",
 		.data		= &pipe_max_size,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(pipe_max_size),
 		.mode		= 0644,
 		.proc_handler	= &pipe_proc_fn,
 		.extra1		= &pipe_min_size,
-- 
cgit v1.2.3


From 7a8d181949fb2c16be00f8cdb354794a30e46b39 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 17 Nov 2017 15:29:24 -0800
Subject: pipe: add proc_dopipe_max_size() to safely assign pipe_max_size

pipe_max_size is assigned directly via procfs sysctl:

  static struct ctl_table fs_table[] = {
          ...
          {
                  .procname       = "pipe-max-size",
                  .data           = &pipe_max_size,
                  .maxlen         = sizeof(int),
                  .mode           = 0644,
                  .proc_handler   = &pipe_proc_fn,
                  .extra1         = &pipe_min_size,
          },
          ...

  int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
                   size_t *lenp, loff_t *ppos)
  {
          ...
          ret = proc_dointvec_minmax(table, write, buf, lenp, ppos)
          ...

and then later rounded in-place a few statements later:

          ...
          pipe_max_size = round_pipe_size(pipe_max_size);
          ...

This leaves a window of time between initial assignment and rounding
that may be visible to other threads.  (For example, one thread sets a
non-rounded value to pipe_max_size while another reads its value.)

Similar reads of pipe_max_size are potentially racy:

  pipe.c :: alloc_pipe_info()
  pipe.c :: pipe_set_size()

Add a new proc_dopipe_max_size() that consolidates reading the new value
from the user buffer, verifying bounds, and calling round_pipe_size()
with a single assignment to pipe_max_size.

Link: http://lkml.kernel.org/r/1507658689-11669-4-git-send-email-joe.lawrence@redhat.com
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c                 | 18 +++--------------
 include/linux/pipe_fs_i.h |  1 +
 include/linux/sysctl.h    |  3 +++
 kernel/sysctl.c           | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/fs/pipe.c b/fs/pipe.c
index f0f4ab36c444..6d98566201ef 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1020,7 +1020,7 @@ const struct file_operations pipefifo_fops = {
  * Currently we rely on the pipe array holding a power-of-2 number
  * of pages. Returns 0 on error.
  */
-static inline unsigned int round_pipe_size(unsigned int size)
+unsigned int round_pipe_size(unsigned int size)
 {
 	unsigned long nr_pages;
 
@@ -1125,25 +1125,13 @@ out_revert_acct:
 }
 
 /*
- * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size
  * will return an error.
  */
 int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
 		 size_t *lenp, loff_t *ppos)
 {
-	unsigned int rounded_pipe_max_size;
-	int ret;
-
-	ret = proc_douintvec_minmax(table, write, buf, lenp, ppos);
-	if (ret < 0 || !write)
-		return ret;
-
-	rounded_pipe_max_size = round_pipe_size(pipe_max_size);
-	if (rounded_pipe_max_size == 0)
-		return -EINVAL;
-
-	pipe_max_size = rounded_pipe_max_size;
-	return ret;
+	return proc_dopipe_max_size(table, write, buf, lenp, ppos);
 }
 
 /*
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 6a80cfc63e0c..2dc5e9870fcd 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -191,5 +191,6 @@ long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
 struct pipe_inode_info *get_pipe_info(struct file *file);
 
 int create_pipe_files(struct file **, int);
+unsigned int round_pipe_size(unsigned int size);
 
 #endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b769ecfcc3bd..992bc9948232 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -51,6 +51,9 @@ extern int proc_dointvec_minmax(struct ctl_table *, int,
 extern int proc_douintvec_minmax(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos);
+extern int proc_dopipe_max_size(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos);
 extern int proc_dointvec_jiffies(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
 extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d42183b4c98..138b6484f277 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -66,6 +66,7 @@
 #include <linux/kexec.h>
 #include <linux/bpf.h>
 #include <linux/mount.h>
+#include <linux/pipe_fs_i.h>
 
 #include <linux/uaccess.h>
 #include <asm/processor.h>
@@ -2620,6 +2621,47 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 				 do_proc_douintvec_minmax_conv, &param);
 }
 
+struct do_proc_dopipe_max_size_conv_param {
+	unsigned int *min;
+};
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+					unsigned int *valp,
+					int write, void *data)
+{
+	struct do_proc_dopipe_max_size_conv_param *param = data;
+
+	if (write) {
+		unsigned int val = round_pipe_size(*lvalp);
+
+		if (val == 0)
+			return -EINVAL;
+
+		if (param->min && *param->min > val)
+			return -ERANGE;
+
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+
+		*valp = val;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long) val;
+	}
+
+	return 0;
+}
+
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct do_proc_dopipe_max_size_conv_param param = {
+		.min = (unsigned int *) table->extra1,
+	};
+	return do_proc_douintvec(table, write, buffer, lenp, ppos,
+				 do_proc_dopipe_max_size_conv, &param);
+}
+
 static void validate_coredump_safety(void)
 {
 #ifdef CONFIG_COREDUMP
@@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
+EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
 EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
 EXPORT_SYMBOL(proc_dostring);
-- 
cgit v1.2.3


From fb910c42ccebf853c29296185c45c11164a56098 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 17 Nov 2017 15:29:28 -0800
Subject: sysctl: check for UINT_MAX before unsigned int min/max

Mikulas noticed in the existing do_proc_douintvec_minmax_conv() and
do_proc_dopipe_max_size_conv() introduced in this patchset, that they
inconsistently handle overflow and min/max range inputs:

For example:

  0 ... param->min - 1 ---> ERANGE
  param->min ... param->max ---> the value is accepted
  param->max + 1 ... 0x100000000L + param->min - 1 ---> ERANGE
  0x100000000L + param->min ... 0x100000000L + param->max ---> EINVAL
  0x100000000L + param->max + 1, 0x200000000L + param->min - 1 ---> ERANGE
  0x200000000L + param->min ... 0x200000000L + param->max ---> EINVAL
  0x200000000L + param->max + 1, 0x300000000L + param->min - 1 ---> ERANGE

In do_proc_do*() routines which store values into unsigned int variables
(4 bytes wide for 64-bit builds), first validate that the input unsigned
long value (8 bytes wide for 64-bit builds) will fit inside the smaller
unsigned int variable.  Then check that the unsigned int value falls
inside the specified parameter min, max range.  Otherwise the unsigned
long -> unsigned int conversion drops leading bits from the input value,
leading to the inconsistent pattern Mikulas documented above.

Link: http://lkml.kernel.org/r/1507658689-11669-5-git-send-email-joe.lawrence@redhat.com
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 138b6484f277..dd25d90896fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2576,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
 	if (write) {
 		unsigned int val = *lvalp;
 
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+
 		if ((param->min && *param->min > val) ||
 		    (param->max && *param->max < val))
 			return -ERANGE;
 
-		if (*lvalp > UINT_MAX)
-			return -EINVAL;
 		*valp = val;
 	} else {
 		unsigned int val = *valp;
@@ -2632,17 +2633,18 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
 	struct do_proc_dopipe_max_size_conv_param *param = data;
 
 	if (write) {
-		unsigned int val = round_pipe_size(*lvalp);
+		unsigned int val;
 
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+
+		val = round_pipe_size(*lvalp);
 		if (val == 0)
 			return -EINVAL;
 
 		if (param->min && *param->min > val)
 			return -ERANGE;
 
-		if (*lvalp > UINT_MAX)
-			return -EINVAL;
-
 		*valp = val;
 	} else {
 		unsigned int val = *valp;
-- 
cgit v1.2.3


From 628c1bcba204052d19b686b5bac149a644cdb72e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 17 Nov 2017 15:30:01 -0800
Subject: kernel/signal.c: protect the traced SIGNAL_UNKILLABLE tasks from
 SIGKILL

The comment in sig_ignored() says "Tracers may want to know about even
ignored signals" but SIGKILL can not be reported to debugger and it is
just wrong to return 0 in this case: SIGKILL should only kill the
SIGNAL_UNKILLABLE task if it comes from the parent ns.

Change sig_ignored() to ignore ->ptrace if sig == SIGKILL and rely on
sig_task_ignored().

SISGTOP coming from within the namespace is not really right too but at
least debugger can intercept it, and we can't drop it here because this
will break "gdb -p 1": ptrace_attach() won't work.  Perhaps we will add
another ->ptrace check later, we will see.

Link: http://lkml.kernel.org/r/20171103184206.GB21036@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Tested-by: Kyle Huey <me@kylehuey.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index aa1fb9f905db..be5913134742 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	if (!sig_task_ignored(t, sig, force))
-		return 0;
-
 	/*
-	 * Tracers may want to know about even ignored signals.
+	 * Tracers may want to know about even ignored signal unless it
+	 * is SIGKILL which can't be reported anyway but can be ignored
+	 * by SIGNAL_UNKILLABLE task.
 	 */
-	return !t->ptrace;
+	if (t->ptrace && sig != SIGKILL)
+		return 0;
+
+	return sig_task_ignored(t, sig, force);
 }
 
 /*
-- 
cgit v1.2.3


From ac25385089f673560867eb5179228a44ade0cfc1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 17 Nov 2017 15:30:04 -0800
Subject: kernel/signal.c: protect the SIGNAL_UNKILLABLE tasks from
 !sig_kernel_only() signals

Change sig_task_ignored() to drop the SIG_DFL && !sig_kernel_only()
signals even if force == T.  This simplifies the next change and this
matches the same check in get_signal() which will drop these signals
anyway.

Link: http://lkml.kernel.org/r/20171103184227.GC21036@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Tested-by: Kyle Huey <me@kylehuey.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index be5913134742..01ba166a5e3a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
 	handler = sig_handler(t, sig);
 
 	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
-			handler == SIG_DFL && !force)
+	    handler == SIG_DFL && !(force && sig_kernel_only(sig)))
 		return 1;
 
 	return sig_handler_ignored(handler, sig);
-- 
cgit v1.2.3


From 426915796ccaf9c2bd9bb06dc5702225957bc2e5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 17 Nov 2017 15:30:08 -0800
Subject: kernel/signal.c: remove the no longer needed SIGNAL_UNKILLABLE check
 in complete_signal()

complete_signal() checks SIGNAL_UNKILLABLE before it starts to destroy
the thread group, today this is wrong in many ways.

If nothing else, fatal_signal_pending() should always imply that the
whole thread group (except ->group_exit_task if it is not NULL) is
killed, this check breaks the rule.

After the previous changes we can rely on sig_task_ignored();
sig_fatal(sig) && SIGNAL_UNKILLABLE can only be true if we actually want
to kill this task and sig == SIGKILL OR it is traced and debugger can
intercept the signal.

This should hopefully fix the problem reported by Dmitry.  This
test-case

	static int init(void *arg)
	{
		for (;;)
			pause();
	}

	int main(void)
	{
		char stack[16 * 1024];

		for (;;) {
			int pid = clone(init, stack + sizeof(stack)/2,
					CLONE_NEWPID | SIGCHLD, NULL);
			assert(pid > 0);

			assert(ptrace(PTRACE_ATTACH, pid, 0, 0) == 0);
			assert(waitpid(-1, NULL, WSTOPPED) == pid);

			assert(ptrace(PTRACE_DETACH, pid, 0, SIGSTOP) == 0);
			assert(syscall(__NR_tkill, pid, SIGKILL) == 0);
			assert(pid == wait(NULL));
		}
	}

triggers the WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)) in
task_participate_group_stop().  do_signal_stop()->signal_group_exit()
checks SIGNAL_GROUP_EXIT and return false, but task_set_jobctl_pending()
checks fatal_signal_pending() and does not set JOBCTL_STOP_PENDING.

And his should fix the minor security problem reported by Kyle,
SECCOMP_RET_TRACE can miss fatal_signal_pending() the same way if the
task is the root of a pid namespace.

Link: http://lkml.kernel.org/r/20171103184246.GD21036@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reported-by: Kyle Huey <me@kylehuey.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kyle Huey <me@kylehuey.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 01ba166a5e3a..6895f6bb98a7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -931,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	 * then start taking the whole group down immediately.
 	 */
 	if (sig_fatal(p, sig) &&
-	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+	    !(signal->flags & SIGNAL_GROUP_EXIT) &&
 	    !sigismember(&t->real_blocked, sig) &&
-	    (sig == SIGKILL || !t->ptrace)) {
+	    (sig == SIGKILL || !p->ptrace)) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
-- 
cgit v1.2.3


From de40ccefd1f19180c0a43e4d9b9d2f4dc8856c8b Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 17 Nov 2017 15:30:12 -0800
Subject: kdump: print a message in case parse_crashkernel_mem resulted in zero
 bytes

parse_crashkernel_mem() silently returns if we get zero bytes in the
parsing function.  It is useful for debugging to add a message,
especially if the kernel cannot boot correctly.

Add a pr_info instead of pr_warn because it is expected behavior for
size = 0, eg.  crashkernel=2G-4G:128M, size will be 0 in case system
memory is less than 2G.

Link: http://lkml.kernel.org/r/20171114080129.GA6115@dhcp-128-65.nay.redhat.com
Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/crash_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 6db80fc0810b..b3663896278e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline,
 				return -EINVAL;
 			}
 		}
-	}
+	} else
+		pr_info("crashkernel size resulted in zero bytes\n");
 
 	return 0;
 }
-- 
cgit v1.2.3


From f9eb2fdd04d4e68fbea18970bbf65ace716d25b6 Mon Sep 17 00:00:00 2001
From: "Ola N. Kaldestad" <mail@okal.no>
Date: Fri, 17 Nov 2017 15:30:26 -0800
Subject: kernel/sysctl.c: code cleanups

Remove unnecessary else block, remove redundant return and call to kfree
in if block.

Link: http://lkml.kernel.org/r/1510238435-1655-1-git-send-email-mail@okal.no
Signed-off-by: Ola N. Kaldestad <mail@okal.no>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dd25d90896fc..557d46728577 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3127,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 			else
 				bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
 		}
-		kfree(tmp_bitmap);
 		*lenp -= left;
 		*ppos += *lenp;
-		return 0;
-	} else {
-		kfree(tmp_bitmap);
-		return err;
 	}
+
+	kfree(tmp_bitmap);
+	return err;
 }
 
 #else /* CONFIG_PROC_SYSCTL */
-- 
cgit v1.2.3


From 95846ecf9dac5089aed4b144d912225f8ef86ae4 Mon Sep 17 00:00:00 2001
From: Gargi Sharma <gs051095@gmail.com>
Date: Fri, 17 Nov 2017 15:30:30 -0800
Subject: pid: replace pid bitmap implementation with IDR API

Patch series "Replacing PID bitmap implementation with IDR API", v4.

This series replaces kernel bitmap implementation of PID allocation with
IDR API.  These patches are written to simplify the kernel by replacing
custom code with calls to generic code.

The following are the stats for pid and pid_namespace object files
before and after the replacement.  There is a noteworthy change between
the IDR and bitmap implementation.

Before
   text       data        bss        dec        hex    filename
   8447       3894         64      12405       3075    kernel/pid.o
After
   text       data        bss        dec        hex    filename
   3397        304          0       3701        e75    kernel/pid.o

Before
   text       data        bss        dec        hex    filename
   5692       1842        192       7726       1e2e    kernel/pid_namespace.o
After
   text       data        bss        dec        hex    filename
   2854        216         16       3086        c0e    kernel/pid_namespace.o

The following are the stats for ps, pstree and calling readdir on /proc
for 10,000 processes.

ps:
        With IDR API    With bitmap
real    0m1.479s        0m2.319s
user    0m0.070s        0m0.060s
sys     0m0.289s        0m0.516s

pstree:
        With IDR API    With bitmap
real    0m1.024s        0m1.794s
user    0m0.348s        0m0.612s
sys     0m0.184s        0m0.264s

proc:
        With IDR API    With bitmap
real    0m0.059s        0m0.074s
user    0m0.000s        0m0.004s
sys     0m0.016s        0m0.016s

This patch (of 2):

Replace the current bitmap implementation for Process ID allocation.
Functions that are no longer required, for example, free_pidmap(),
alloc_pidmap(), etc.  are removed.  The rest of the functions are
modified to use the IDR API.  The change was made to make the PID
allocation less complex by replacing custom code with calls to generic
API.

[gs051095@gmail.com: v6]
  Link: http://lkml.kernel.org/r/1507760379-21662-2-git-send-email-gs051095@gmail.com
[avagin@openvz.org: restore the old behaviour of the ns_last_pid sysctl]
  Link: http://lkml.kernel.org/r/20171106183144.16368-1-avagin@openvz.org
Link: http://lkml.kernel.org/r/1507583624-22146-2-git-send-email-gs051095@gmail.com
Signed-off-by: Gargi Sharma <gs051095@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Julia Lawall <julia.lawall@lip6.fr>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/cell/spufs/sched.c |   2 +-
 fs/proc/loadavg.c                         |   2 +-
 include/linux/pid_namespace.h             |  14 +--
 init/main.c                               |   2 +-
 kernel/pid.c                              | 201 ++++++------------------------
 kernel/pid_namespace.c                    |  53 ++++----
 6 files changed, 65 insertions(+), 209 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 1fbb5da17dd2..e47761cdcb98 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 		LOAD_INT(c), LOAD_FRAC(c),
 		count_active_contexts(),
 		atomic_read(&nr_spu_contexts),
-		task_active_pid_ns(current)->last_pid);
+		idr_get_cursor(&task_active_pid_ns(current)->idr));
 	return 0;
 }
 
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bc5c58c00ee..a000d7547479 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
 		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
 		nr_running(), nr_threads,
-		task_active_pid_ns(current)->last_pid);
+		idr_get_cursor(&task_active_pid_ns(current)->idr));
 	return 0;
 }
 
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c78af6061644..92c6aa509d2e 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -10,15 +10,8 @@
 #include <linux/nsproxy.h>
 #include <linux/kref.h>
 #include <linux/ns_common.h>
+#include <linux/idr.h>
 
-struct pidmap {
-       atomic_t nr_free;
-       void *page;
-};
-
-#define BITS_PER_PAGE		(PAGE_SIZE * 8)
-#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
-#define PIDMAP_ENTRIES		((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
 
 struct fs_pin;
 
@@ -30,9 +23,8 @@ enum { /* definitions for pid_namespace's hide_pid field */
 
 struct pid_namespace {
 	struct kref kref;
-	struct pidmap pidmap[PIDMAP_ENTRIES];
+	struct idr idr;
 	struct rcu_head rcu;
-	int last_pid;
 	unsigned int nr_hashed;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
@@ -106,6 +98,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 
 extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
 void pidhash_init(void);
-void pidmap_init(void);
+void pid_idr_init(void);
 
 #endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index 859a786f7c0a..d0cbcfc06124 100644
--- a/init/main.c
+++ b/init/main.c
@@ -669,7 +669,7 @@ asmlinkage __visible void __init start_kernel(void)
 	if (late_time_init)
 		late_time_init();
 	calibrate_delay();
-	pidmap_init();
+	pid_idr_init();
 	anon_vma_init();
 #ifdef CONFIG_X86
 	if (efi_enabled(EFI_RUNTIME_SERVICES))
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedbdf066..0ce59369632f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,6 +39,7 @@
 #include <linux/proc_ns.h>
 #include <linux/proc_fs.h>
 #include <linux/sched/task.h>
+#include <linux/idr.h>
 
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
 
-static inline int mk_pid(struct pid_namespace *pid_ns,
-		struct pidmap *map, int off)
-{
-	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
-#define find_next_offset(map, off)					\
-		find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
  */
 struct pid_namespace init_pid_ns = {
 	.kref = KREF_INIT(2),
-	.pidmap = {
-		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
-	},
-	.last_pid = 0,
+	.idr = IDR_INIT,
 	.nr_hashed = PIDNS_HASH_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
@@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-static void free_pidmap(struct upid *upid)
-{
-	int nr = upid->nr;
-	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
-	int offset = nr & BITS_PER_PAGE_MASK;
-
-	clear_bit(offset, map->page);
-	atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
-	/*
-	 * This is the same as saying
-	 *
-	 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
-	 * and that mapping orders 'a' and 'b' with respect to 'base'.
-	 */
-	return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value.  We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
-	int prev;
-	int last_write = base;
-	do {
-		prev = last_write;
-		last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
-	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
-	int i, offset, max_scan, pid, last = pid_ns->last_pid;
-	struct pidmap *map;
-
-	pid = last + 1;
-	if (pid >= pid_max)
-		pid = RESERVED_PIDS;
-	offset = pid & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
-	/*
-	 * If last_pid points into the middle of the map->page we
-	 * want to scan this bitmap block twice, the second time
-	 * we start with offset == 0 (or RESERVED_PIDS).
-	 */
-	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
-	for (i = 0; i <= max_scan; ++i) {
-		if (unlikely(!map->page)) {
-			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-			/*
-			 * Free the page if someone raced with us
-			 * installing it:
-			 */
-			spin_lock_irq(&pidmap_lock);
-			if (!map->page) {
-				map->page = page;
-				page = NULL;
-			}
-			spin_unlock_irq(&pidmap_lock);
-			kfree(page);
-			if (unlikely(!map->page))
-				return -ENOMEM;
-		}
-		if (likely(atomic_read(&map->nr_free))) {
-			for ( ; ; ) {
-				if (!test_and_set_bit(offset, map->page)) {
-					atomic_dec(&map->nr_free);
-					set_last_pid(pid_ns, last, pid);
-					return pid;
-				}
-				offset = find_next_offset(map, offset);
-				if (offset >= BITS_PER_PAGE)
-					break;
-				pid = mk_pid(pid_ns, map, offset);
-				if (pid >= pid_max)
-					break;
-			}
-		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
-			++map;
-			offset = 0;
-		} else {
-			map = &pid_ns->pidmap[0];
-			offset = RESERVED_PIDS;
-			if (unlikely(last == offset))
-				break;
-		}
-		pid = mk_pid(pid_ns, map, offset);
-	}
-	return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
-	int offset;
-	struct pidmap *map, *end;
-
-	if (last >= PID_MAX_LIMIT)
-		return -1;
-
-	offset = (last + 1) & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
-	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
-	for (; map < end; map++, offset = 0) {
-		if (unlikely(!map->page))
-			continue;
-		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
-		if (offset < BITS_PER_PAGE)
-			return mk_pid(pid_ns, map, offset);
-	}
-	return -1;
-}
-
 void put_pid(struct pid *pid)
 {
 	struct pid_namespace *ns;
@@ -266,7 +124,7 @@ void free_pid(struct pid *pid)
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
 		hlist_del_rcu(&upid->pid_chain);
-		switch(--ns->nr_hashed) {
+		switch (--ns->nr_hashed) {
 		case 2:
 		case 1:
 			/* When all that is left in the pid namespace
@@ -284,12 +142,11 @@ void free_pid(struct pid *pid)
 			schedule_work(&ns->proc_work);
 			break;
 		}
+
+		idr_remove(&ns->idr, upid->nr);
 	}
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
-	for (i = 0; i <= pid->level; i++)
-		free_pidmap(pid->numbers + i);
-
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
@@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	tmp = ns;
 	pid->level = ns->level;
+
 	for (i = ns->level; i >= 0; i--) {
-		nr = alloc_pidmap(tmp);
+		int pid_min = 1;
+
+		idr_preload(GFP_KERNEL);
+		spin_lock_irq(&pidmap_lock);
+
+		/*
+		 * init really needs pid 1, but after reaching the maximum
+		 * wrap back to RESERVED_PIDS
+		 */
+		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+			pid_min = RESERVED_PIDS;
+
+		/*
+		 * Store a null pointer so find_pid_ns does not find
+		 * a partially initialized PID (see below).
+		 */
+		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+				      pid_max, GFP_ATOMIC);
+		spin_unlock_irq(&pidmap_lock);
+		idr_preload_end();
+
 		if (nr < 0) {
 			retval = nr;
 			goto out_free;
@@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	for ( ; upid >= pid->numbers; --upid) {
 		hlist_add_head_rcu(&upid->pid_chain,
 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+		/* Make the PID visible to find_pid_ns. */
+		idr_replace(&upid->ns->idr, pid, upid->nr);
 		upid->ns->nr_hashed++;
 	}
 	spin_unlock_irq(&pidmap_lock);
@@ -350,8 +230,11 @@ out_unlock:
 	put_pid_ns(ns);
 
 out_free:
+	spin_lock_irq(&pidmap_lock);
 	while (++i <= ns->level)
-		free_pidmap(pid->numbers + i);
+		idr_remove(&ns->idr, (pid->numbers + i)->nr);
+
+	spin_unlock_irq(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);
 	return ERR_PTR(retval);
@@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
  */
 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
-	struct pid *pid;
-
-	do {
-		pid = find_pid_ns(nr, ns);
-		if (pid)
-			break;
-		nr = next_pidmap(ns, nr);
-	} while (nr > 0);
-
-	return pid;
+	return idr_get_next(&ns->idr, &nr);
 }
 
 /*
@@ -578,7 +452,7 @@ void __init pidhash_init(void)
 					   0, 4096);
 }
 
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
@@ -590,10 +464,7 @@ void __init pidmap_init(void)
 				PIDS_PER_CPU_MIN * num_possible_cpus());
 	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 
-	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	/* Reserve PID 0. We never call free_pidmap(0) */
-	set_bit(0, init_pid_ns.pidmap[0].page);
-	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+	idr_init(&init_pid_ns.idr);
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314893bc..ca7c8a8823b1 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
 #include <linux/export.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
+#include <linux/idr.h>
 
 struct pid_cache {
 	int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
 	struct ucounts *ucounts;
-	int i;
 	int err;
 
 	err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	if (ns == NULL)
 		goto out_dec;
 
-	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!ns->pidmap[0].page)
-		goto out_free;
+	idr_init(&ns->idr);
 
 	ns->pid_cachep = create_pid_cachep(level + 1);
 	if (ns->pid_cachep == NULL)
-		goto out_free_map;
+		goto out_free_idr;
 
 	err = ns_alloc_inum(&ns->ns);
 	if (err)
-		goto out_free_map;
+		goto out_free_idr;
 	ns->ns.ops = &pidns_operations;
 
 	kref_init(&ns->kref);
@@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->nr_hashed = PIDNS_HASH_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
-	set_bit(0, ns->pidmap[0].page);
-	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
-	for (i = 1; i < PIDMAP_ENTRIES; i++)
-		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
 	return ns;
 
-out_free_map:
-	kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+	idr_destroy(&ns->idr);
 	kmem_cache_free(pid_ns_cachep, ns);
 out_dec:
 	dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
 {
-	int i;
-
 	ns_free_inum(&ns->ns);
-	for (i = 0; i < PIDMAP_ENTRIES; i++)
-		kfree(ns->pidmap[i].page);
+
+	idr_destroy(&ns->idr);
 	call_rcu(&ns->rcu, delayed_free_pidns);
 }
 
@@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	int rc;
 	struct task_struct *task, *me = current;
 	int init_pids = thread_group_leader(me) ? 1 : 2;
+	struct pid *pid;
 
 	/* Don't allow any more processes into the pid namespace */
 	disable_pid_allocation(pid_ns);
@@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * 	  maintain a tasklist for each pid namespace.
 	 *
 	 */
+	rcu_read_lock();
 	read_lock(&tasklist_lock);
-	nr = next_pidmap(pid_ns, 1);
-	while (nr > 0) {
-		rcu_read_lock();
-
-		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+	nr = 2;
+	idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+		task = pid_task(pid, PIDTYPE_PID);
 		if (task && !__fatal_signal_pending(task))
 			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
-
-		rcu_read_unlock();
-
-		nr = next_pidmap(pid_ns, nr);
 	}
 	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	/*
 	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
@@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(current);
 	struct ctl_table tmp = *table;
+	int ret, next;
 
 	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
@@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 * it should synchronize its usage with external means.
 	 */
 
-	tmp.data = &pid_ns->last_pid;
-	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	next = idr_get_cursor(&pid_ns->idr) - 1;
+
+	tmp.data = &next;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (!ret && write)
+		idr_set_cursor(&pid_ns->idr, next + 1);
+
+	return ret;
 }
 
 extern int pid_max;
-- 
cgit v1.2.3


From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001
From: Gargi Sharma <gs051095@gmail.com>
Date: Fri, 17 Nov 2017 15:30:34 -0800
Subject: pid: remove pidhash

pidhash is no longer required as all the information can be looked up
from idr tree.  nr_hashed represented the number of pids that had been
hashed.  Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant,
it has been renamed to pid_allocated and PIDNS_ADDING respectively.

[gs051095@gmail.com: v6]
  Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com
Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com
Signed-off-by: Gargi Sharma <gs051095@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>	[ia64]
Cc: Julia Lawall <julia.lawall@lip6.fr>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/asm-offsets.c |  4 ++--
 include/linux/init_task.h      |  1 -
 include/linux/pid.h            |  2 --
 include/linux/pid_namespace.h  |  4 ++--
 init/main.c                    |  1 -
 kernel/fork.c                  |  2 +-
 kernel/pid.c                   | 48 +++++++++---------------------------------
 kernel/pid_namespace.c         |  6 +++---
 8 files changed, 18 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index f7693f49c573..f4db2168d1b8 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -31,8 +31,8 @@ void foo(void)
 	DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe));
 	DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info));
 
-	BUILD_BUG_ON(sizeof(struct upid) != 32);
-	DEFINE(IA64_UPID_SHIFT, 5);
+	BUILD_BUG_ON(sizeof(struct upid) != 16);
+	DEFINE(IA64_UPID_SHIFT, 4);
 
 	BLANK();
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 8062e6cc607c..6a532629c983 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -105,7 +105,6 @@ extern struct group_info init_groups;
 	.numbers	= { {						\
 		.nr		= 0,					\
 		.ns		= &init_pid_ns,				\
-		.pid_chain	= { .next = NULL, .pprev = NULL },	\
 	}, }								\
 }
 
diff --git a/include/linux/pid.h b/include/linux/pid.h
index dfd684ce0787..7633d55d9a24 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -51,10 +51,8 @@ enum pid_type
  */
 
 struct upid {
-	/* Try to keep pid_chain in the same cacheline as nr for find_vpid */
 	int nr;
 	struct pid_namespace *ns;
-	struct hlist_node pid_chain;
 };
 
 struct pid
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 92c6aa509d2e..49538b172483 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -25,7 +25,7 @@ struct pid_namespace {
 	struct kref kref;
 	struct idr idr;
 	struct rcu_head rcu;
-	unsigned int nr_hashed;
+	unsigned int pid_allocated;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
@@ -49,7 +49,7 @@ struct pid_namespace {
 
 extern struct pid_namespace init_pid_ns;
 
-#define PIDNS_HASH_ADDING (1U << 31)
+#define PIDNS_ADDING (1U << 31)
 
 #ifdef CONFIG_PID_NS
 static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
diff --git a/init/main.c b/init/main.c
index d0cbcfc06124..dfec3809e740 100644
--- a/init/main.c
+++ b/init/main.c
@@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void)
 	 * kmem_cache_init()
 	 */
 	setup_log_buf(0);
-	pidhash_init();
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 4e55eedba8d6..432eadf6b58c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process(
 		retval = -ERESTARTNOINTR;
 		goto bad_fork_cancel_cgroup;
 	}
-	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
 		retval = -ENOMEM;
 		goto bad_fork_cancel_cgroup;
 	}
diff --git a/kernel/pid.c b/kernel/pid.c
index 0ce59369632f..b13b624e2c49 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,10 +41,6 @@
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 
-#define pid_hashfn(nr, ns)	\
-	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
 int pid_max = PID_MAX_DEFAULT;
@@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
 
-
 /*
  * PID-map pages start out as NULL, they get allocated upon
  * first use and are never deallocated. This way a low pid_max
@@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT;
 struct pid_namespace init_pid_ns = {
 	.kref = KREF_INIT(2),
 	.idr = IDR_INIT,
-	.nr_hashed = PIDNS_HASH_ADDING,
+	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
@@ -123,8 +118,7 @@ void free_pid(struct pid *pid)
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
-		hlist_del_rcu(&upid->pid_chain);
-		switch (--ns->nr_hashed) {
+		switch (--ns->pid_allocated) {
 		case 2:
 		case 1:
 			/* When all that is left in the pid namespace
@@ -133,10 +127,10 @@ void free_pid(struct pid *pid)
 			 */
 			wake_up_process(ns->child_reaper);
 			break;
-		case PIDNS_HASH_ADDING:
+		case PIDNS_ADDING:
 			/* Handle a fork failure of the first process */
 			WARN_ON(ns->child_reaper);
-			ns->nr_hashed = 0;
+			ns->pid_allocated = 0;
 			/* fall through */
 		case 0:
 			schedule_work(&ns->proc_work);
@@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
-	if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
 	for ( ; upid >= pid->numbers; --upid) {
-		hlist_add_head_rcu(&upid->pid_chain,
-				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
-		upid->ns->nr_hashed++;
+		upid->ns->pid_allocated++;
 	}
 	spin_unlock_irq(&pidmap_lock);
 
@@ -243,21 +235,13 @@ out_free:
 void disable_pid_allocation(struct pid_namespace *ns)
 {
 	spin_lock_irq(&pidmap_lock);
-	ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+	ns->pid_allocated &= ~PIDNS_ADDING;
 	spin_unlock_irq(&pidmap_lock);
 }
 
 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
 {
-	struct upid *pnr;
-
-	hlist_for_each_entry_rcu(pnr,
-			&pid_hash[pid_hashfn(nr, ns)], pid_chain)
-		if (pnr->nr == nr && pnr->ns == ns)
-			return container_of(pnr, struct pid,
-					numbers[ns->level]);
-
-	return NULL;
+	return idr_find(&ns->idr, nr);
 }
 EXPORT_SYMBOL_GPL(find_pid_ns);
 
@@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 		if (type != PIDTYPE_PID) {
 			if (type == __PIDTYPE_TGID)
 				type = PIDTYPE_PID;
+
 			task = task->group_leader;
 		}
 		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 	return idr_get_next(&ns->idr, &nr);
 }
 
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
-	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
-					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
-					   &pidhash_shift, NULL,
-					   0, 4096);
-}
-
 void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
-	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
 	pid_max = min(pid_max_max, max_t(int, pid_max,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ca7c8a8823b1..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
-	ns->nr_hashed = PIDNS_HASH_ADDING;
+	ns->pid_allocated = PIDNS_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
 	return ns;
@@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
 	 * really care, we could reparent them to the global init. We could
 	 * exit and reap ->child_reaper even if it is not the last thread in
-	 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
 	 * pid_ns can not go away until proc_kill_sb() drops the reference.
 	 *
 	 * But this ns can also have other tasks injected by setns()+fork().
@@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 */
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (pid_ns->nr_hashed == init_pids)
+		if (pid_ns->pid_allocated == init_pids)
 			break;
 		schedule();
 	}
-- 
cgit v1.2.3


From 4efb442cc12eb66535b7c7ed06005fd7889c1d77 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Fri, 17 Nov 2017 15:30:38 -0800
Subject: kernel/panic.c: add TAINT_AUX

This is the gist of a patch which we've been forward-porting in our
kernels for a long time now and it probably would make a good sense to
have such TAINT_AUX flag upstream which can be used by each distro etc,
how they see fit.  This way, we won't need to forward-port a distro-only
version indefinitely.

Add an auxiliary taint flag to be used by distros and others.  This
obviates the need to forward-port whatever internal solutions people
have in favor of a single flag which they can map arbitrarily to a
definition of their pleasing.

The "X" mnemonic could also mean eXternal, which would be taint from a
distro or something else but not the upstream kernel.  We will use it to
mark modules for which we don't provide support.  I.e., a really
eXternal module.

Link: http://lkml.kernel.org/r/20170911134533.dp5mtyku5bongx4c@pd.tnic
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 3 ++-
 kernel/panic.c         | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4b484ab9e163..ce51455e2adf 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -549,7 +549,8 @@ extern enum system_states {
 #define TAINT_UNSIGNED_MODULE		13
 #define TAINT_SOFTLOCKUP		14
 #define TAINT_LIVEPATCH			15
-#define TAINT_FLAGS_COUNT		16
+#define TAINT_AUX			16
+#define TAINT_FLAGS_COUNT		17
 
 struct taint_flag {
 	char c_true;	/* character printed when tainted */
diff --git a/kernel/panic.c b/kernel/panic.c
index 3242b64b1956..2cfef408fec9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -324,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
 	{ 'E', ' ', true },	/* TAINT_UNSIGNED_MODULE */
 	{ 'L', ' ', false },	/* TAINT_SOFTLOCKUP */
 	{ 'K', ' ', true },	/* TAINT_LIVEPATCH */
+	{ 'X', ' ', true },	/* TAINT_AUX */
 };
 
 /**
@@ -345,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
  *  'E' - Unsigned module has been loaded.
  *  'L' - A soft lockup has previously occurred.
  *  'K' - Kernel has been live patched.
+ *  'X' - Auxiliary taint, for distros' use.
  *
  *	The string is overwritten by the next call to print_tainted().
  */
-- 
cgit v1.2.3


From fcf4edac049a8bca41658970292e2dfdbc9d5f62 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 17 Nov 2017 15:30:42 -0800
Subject: kcov: remove pointless current != NULL check

__sanitizer_cov_trace_pc() is a hot code, so it's worth to remove
pointless '!current' check.  Current is never NULL.

Link: http://lkml.kernel.org/r/20170929162221.32500-1-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index fc6af9e1308b..d9f9fa9cacc6 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -62,7 +62,7 @@ void notrace __sanitizer_cov_trace_pc(void)
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
 	 */
-	if (!t || !in_task())
+	if (!in_task())
 		return;
 	mode = READ_ONCE(t->kcov_mode);
 	if (mode == KCOV_MODE_TRACE) {
-- 
cgit v1.2.3


From ded97d2c2b2c5f1dcced0bc57133f7753b037dfc Mon Sep 17 00:00:00 2001
From: Victor Chibotaru <tchibo@google.com>
Date: Fri, 17 Nov 2017 15:30:46 -0800
Subject: kcov: support comparison operands collection

Enables kcov to collect comparison operands from instrumented code.
This is done by using Clang's -fsanitize=trace-cmp instrumentation
(currently not available for GCC).

The comparison operands help a lot in fuzz testing.  E.g.  they are used
in Syzkaller to cover the interiors of conditional statements with way
less attempts and thus make previously unreachable code reachable.

To allow separate collection of coverage and comparison operands two
different work modes are implemented.  Mode selection is now done via a
KCOV_ENABLE ioctl call with corresponding argument value.

Link: http://lkml.kernel.org/r/20171011095459.70721-1-glider@google.com
Signed-off-by: Victor Chibotaru <tchibo@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: <syzkaller@googlegroups.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kcov.h      |  12 ++-
 include/uapi/linux/kcov.h |  24 ++++++
 kernel/kcov.c             | 214 ++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 211 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kcov.h b/include/linux/kcov.h
index f5d8ce4f4f86..3ecf6f5e3a5f 100644
--- a/include/linux/kcov.h
+++ b/include/linux/kcov.h
@@ -8,19 +8,23 @@ struct task_struct;
 
 #ifdef CONFIG_KCOV
 
-void kcov_task_init(struct task_struct *t);
-void kcov_task_exit(struct task_struct *t);
-
 enum kcov_mode {
 	/* Coverage collection is not enabled yet. */
 	KCOV_MODE_DISABLED = 0,
+	/* KCOV was initialized, but tracing mode hasn't been chosen yet. */
+	KCOV_MODE_INIT = 1,
 	/*
 	 * Tracing coverage collection mode.
 	 * Covered PCs are collected in a per-task buffer.
 	 */
-	KCOV_MODE_TRACE = 1,
+	KCOV_MODE_TRACE_PC = 2,
+	/* Collecting comparison operands mode. */
+	KCOV_MODE_TRACE_CMP = 3,
 };
 
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
 #else
 
 static inline void kcov_task_init(struct task_struct *t) {}
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
index 33eabbb8ada1..9529867717a8 100644
--- a/include/uapi/linux/kcov.h
+++ b/include/uapi/linux/kcov.h
@@ -8,4 +8,28 @@
 #define KCOV_ENABLE			_IO('c', 100)
 #define KCOV_DISABLE			_IO('c', 101)
 
+enum {
+	/*
+	 * Tracing coverage collection mode.
+	 * Covered PCs are collected in a per-task buffer.
+	 * In new KCOV version the mode is chosen by calling
+	 * ioctl(fd, KCOV_ENABLE, mode). In older versions the mode argument
+	 * was supposed to be 0 in such a call. So, for reasons of backward
+	 * compatibility, we have chosen the value KCOV_TRACE_PC to be 0.
+	 */
+	KCOV_TRACE_PC = 0,
+	/* Collecting comparison operands mode. */
+	KCOV_TRACE_CMP = 1,
+};
+
+/*
+ * The format for the types of collected comparisons.
+ *
+ * Bit 0 shows whether one of the arguments is a compile-time constant.
+ * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes.
+ */
+#define KCOV_CMP_CONST          (1 << 0)
+#define KCOV_CMP_SIZE(n)        ((n) << 1)
+#define KCOV_CMP_MASK           KCOV_CMP_SIZE(3)
+
 #endif /* _LINUX_KCOV_IOCTLS_H */
diff --git a/kernel/kcov.c b/kernel/kcov.c
index d9f9fa9cacc6..15f33faf4013 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -22,13 +22,21 @@
 #include <linux/kcov.h>
 #include <asm/setup.h>
 
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
 /*
  * kcov descriptor (one per opened debugfs file).
  * State transitions of the descriptor:
  *  - initial state after open()
  *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
  *  - then, mmap() call (several calls are allowed but not useful)
- *  - then, repeated enable/disable for a task (only one task a time allowed)
+ *  - then, ioctl(KCOV_ENABLE, arg), where arg is
+ *	KCOV_TRACE_PC - to trace only the PCs
+ *	or
+ *	KCOV_TRACE_CMP - to trace only the comparison operands
+ *  - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
  */
 struct kcov {
 	/*
@@ -48,51 +56,176 @@ struct kcov {
 	struct task_struct	*t;
 };
 
-/*
- * Entry point from instrumented code.
- * This is called once per basic-block/edge.
- */
-void notrace __sanitizer_cov_trace_pc(void)
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
 {
-	struct task_struct *t;
 	enum kcov_mode mode;
 
-	t = current;
 	/*
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
 	 */
 	if (!in_task())
-		return;
+		return false;
 	mode = READ_ONCE(t->kcov_mode);
-	if (mode == KCOV_MODE_TRACE) {
-		unsigned long *area;
-		unsigned long pos;
-		unsigned long ip = _RET_IP_;
+	/*
+	 * There is some code that runs in interrupts but for which
+	 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+	 * READ_ONCE()/barrier() effectively provides load-acquire wrt
+	 * interrupts, there are paired barrier()/WRITE_ONCE() in
+	 * kcov_ioctl_locked().
+	 */
+	barrier();
+	return mode == needed_mode;
+}
 
+static unsigned long canonicalize_ip(unsigned long ip)
+{
 #ifdef CONFIG_RANDOMIZE_BASE
-		ip -= kaslr_offset();
+	ip -= kaslr_offset();
 #endif
+	return ip;
+}
 
-		/*
-		 * There is some code that runs in interrupts but for which
-		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
-		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
-		 * interrupts, there are paired barrier()/WRITE_ONCE() in
-		 * kcov_ioctl_locked().
-		 */
-		barrier();
-		area = t->kcov_area;
-		/* The first word is number of subsequent PCs. */
-		pos = READ_ONCE(area[0]) + 1;
-		if (likely(pos < t->kcov_size)) {
-			area[pos] = ip;
-			WRITE_ONCE(area[0], pos);
-		}
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+	struct task_struct *t;
+	unsigned long *area;
+	unsigned long ip = canonicalize_ip(_RET_IP_);
+	unsigned long pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+		return;
+
+	area = t->kcov_area;
+	/* The first 64-bit word is the number of subsequent PCs. */
+	pos = READ_ONCE(area[0]) + 1;
+	if (likely(pos < t->kcov_size)) {
+		area[pos] = ip;
+		WRITE_ONCE(area[0], pos);
 	}
 }
 EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
 
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+	struct task_struct *t;
+	u64 *area;
+	u64 count, start_index, end_pos, max_pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+		return;
+
+	ip = canonicalize_ip(ip);
+
+	/*
+	 * We write all comparison arguments and types as u64.
+	 * The buffer was allocated for t->kcov_size unsigned longs.
+	 */
+	area = (u64 *)t->kcov_area;
+	max_pos = t->kcov_size * sizeof(unsigned long);
+
+	count = READ_ONCE(area[0]);
+
+	/* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+	start_index = 1 + count * KCOV_WORDS_PER_CMP;
+	end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+	if (likely(end_pos <= max_pos)) {
+		area[start_index] = type;
+		area[start_index + 1] = arg1;
+		area[start_index + 2] = arg2;
+		area[start_index + 3] = ip;
+		WRITE_ONCE(area[0], count + 1);
+	}
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+	u64 i;
+	u64 count = cases[0];
+	u64 size = cases[1];
+	u64 type = KCOV_CMP_CONST;
+
+	switch (size) {
+	case 8:
+		type |= KCOV_CMP_SIZE(0);
+		break;
+	case 16:
+		type |= KCOV_CMP_SIZE(1);
+		break;
+	case 32:
+		type |= KCOV_CMP_SIZE(2);
+		break;
+	case 64:
+		type |= KCOV_CMP_SIZE(3);
+		break;
+	default:
+		return;
+	}
+	for (i = 0; i < count; i++)
+		write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
 static void kcov_get(struct kcov *kcov)
 {
 	atomic_inc(&kcov->refcount);
@@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
 	/* Just to not leave dangling references behind. */
 	kcov_task_init(t);
 	kcov->t = NULL;
+	kcov->mode = KCOV_MODE_INIT;
 	spin_unlock(&kcov->lock);
 	kcov_put(kcov);
 }
@@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 
 	spin_lock(&kcov->lock);
 	size = kcov->size * sizeof(unsigned long);
-	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+	if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
 	    vma->vm_end - vma->vm_start != size) {
 		res = -EINVAL;
 		goto exit;
@@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
 	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
 	if (!kcov)
 		return -ENOMEM;
+	kcov->mode = KCOV_MODE_DISABLED;
 	atomic_set(&kcov->refcount, 1);
 	spin_lock_init(&kcov->lock);
 	filep->private_data = kcov;
@@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
 			return -EINVAL;
 		kcov->size = size;
-		kcov->mode = KCOV_MODE_TRACE;
+		kcov->mode = KCOV_MODE_INIT;
 		return 0;
 	case KCOV_ENABLE:
 		/*
@@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 		 * at task exit or voluntary by KCOV_DISABLE. After that it can
 		 * be enabled for another task.
 		 */
-		unused = arg;
-		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
-		    kcov->area == NULL)
+		if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
 			return -EINVAL;
 		if (kcov->t != NULL)
 			return -EBUSY;
+		if (arg == KCOV_TRACE_PC)
+			kcov->mode = KCOV_MODE_TRACE_PC;
+		else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+			kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+		return -ENOTSUPP;
+#endif
+		else
+			return -EINVAL;
 		t = current;
 		/* Cache in task struct for performance. */
 		t->kcov_size = kcov->size;
 		t->kcov_area = kcov->area;
-		/* See comment in __sanitizer_cov_trace_pc(). */
+		/* See comment in check_kcov_mode(). */
 		barrier();
 		WRITE_ONCE(t->kcov_mode, kcov->mode);
 		t->kcov = kcov;
@@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 			return -EINVAL;
 		kcov_task_init(t);
 		kcov->t = NULL;
+		kcov->mode = KCOV_MODE_INIT;
 		kcov_put(kcov);
 		return 0;
 	default:
-- 
cgit v1.2.3


From 2d8364bae4db144df75ba85e92d2b8619ba8eedc Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Fri, 17 Nov 2017 15:30:57 -0800
Subject: kernel/reboot.c: add devm_register_reboot_notifier()

Add devm_* wrapper around register_reboot_notifier to simplify device
specific reboot notifier registration/unregistration.

[akpm@linux-foundation.org: move `struct device' forward decl to top-of-file]
Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com
Signed-off-by: Andrey Smirnov <andrew.smirnov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/reboot.h |  4 ++++
 kernel/reboot.c        | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index d03da0eb95ca..e63799a6e895 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -6,6 +6,8 @@
 #include <linux/notifier.h>
 #include <uapi/linux/reboot.h>
 
+struct device;
+
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN
 #define SYS_HALT	0x0002	/* Notify of system halt */
@@ -39,6 +41,8 @@ extern int reboot_force;
 extern int register_reboot_notifier(struct notifier_block *);
 extern int unregister_reboot_notifier(struct notifier_block *);
 
+extern int devm_register_reboot_notifier(struct device *, struct notifier_block *);
+
 extern int register_restart_handler(struct notifier_block *);
 extern int unregister_restart_handler(struct notifier_block *);
 extern void do_kernel_restart(char *cmd);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..e4ced883d8de 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+	WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct notifier_block **rcnb;
+	int ret;
+
+	rcnb = devres_alloc(devm_unregister_reboot_notifier,
+			    sizeof(*rcnb), GFP_KERNEL);
+	if (!rcnb)
+		return -ENOMEM;
+
+	ret = register_reboot_notifier(nb);
+	if (!ret) {
+		*rcnb = nb;
+		devres_add(dev, rcnb);
+	} else {
+		devres_free(rcnb);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
 /*
  *	Notifier list for kernel code which wants to be called
  *	to restart the system.
-- 
cgit v1.2.3


From 24ed960abf1d50cb7834e99a0cfc081bc0656712 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 28 Aug 2017 11:28:21 -0700
Subject: treewide: Switch DEFINE_TIMER callbacks to struct timer_list *

This changes all DEFINE_TIMER() callbacks to use a struct timer_list
pointer instead of unsigned long. Since the data argument has already been
removed, none of these callbacks are using their argument currently, so
this renames the argument to "unused".

Done using the following semantic patch:

@match_define_timer@
declarer name DEFINE_TIMER;
identifier _timer, _callback;
@@

 DEFINE_TIMER(_timer, _callback);

@change_callback depends on match_define_timer@
identifier match_define_timer._callback;
type _origtype;
identifier _origarg;
@@

 void
-_callback(_origtype _origarg)
+_callback(struct timer_list *unused)
 { ... }

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm/mach-ixp4xx/dsmg600-setup.c      |  4 ++--
 arch/arm/mach-ixp4xx/nas100d-setup.c      |  4 ++--
 arch/m68k/amiga/amisound.c                |  4 ++--
 arch/m68k/mac/macboing.c                  |  4 ++--
 arch/mips/mti-malta/malta-display.c       |  4 ++--
 arch/parisc/kernel/pdc_cons.c             |  4 ++--
 drivers/atm/idt77105.c                    |  8 ++++----
 drivers/atm/iphase.c                      |  4 ++--
 drivers/block/ataflop.c                   | 16 ++++++++--------
 drivers/char/dtlk.c                       |  4 ++--
 drivers/char/hangcheck-timer.c            |  4 ++--
 drivers/char/nwbutton.c                   |  4 ++--
 drivers/char/nwbutton.h                   |  2 +-
 drivers/char/rtc.c                        |  4 ++--
 drivers/input/touchscreen/s3c2410_ts.c    |  2 +-
 drivers/net/wireless/atmel/at76c50x-usb.c |  4 ++--
 drivers/staging/speakup/main.c            |  4 ++--
 drivers/staging/speakup/synth.c           |  2 +-
 drivers/tty/cyclades.c                    |  4 ++--
 drivers/tty/isicom.c                      |  4 ++--
 drivers/tty/moxa.c                        |  4 ++--
 drivers/tty/rocket.c                      |  4 ++--
 drivers/tty/vt/keyboard.c                 |  2 +-
 drivers/tty/vt/vt.c                       |  4 ++--
 drivers/watchdog/alim7101_wdt.c           |  4 ++--
 drivers/watchdog/machzwd.c                |  4 ++--
 drivers/watchdog/mixcomwd.c               |  4 ++--
 drivers/watchdog/sbc60xxwdt.c             |  4 ++--
 drivers/watchdog/sc520_wdt.c              |  4 ++--
 drivers/watchdog/via_wdt.c                |  4 ++--
 drivers/watchdog/w83877f_wdt.c            |  4 ++--
 drivers/xen/grant-table.c                 |  4 ++--
 fs/pstore/platform.c                      |  4 ++--
 kernel/irq/spurious.c                     |  4 ++--
 lib/random32.c                            |  4 ++--
 net/decnet/dn_route.c                     |  4 ++--
 net/ipv6/ip6_flowlabel.c                  |  4 ++--
 net/netrom/nr_loopback.c                  |  4 ++--
 security/keys/gc.c                        |  4 ++--
 39 files changed, 82 insertions(+), 82 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c
index ac97a4599034..0f5c99941a7d 100644
--- a/arch/arm/mach-ixp4xx/dsmg600-setup.c
+++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c
@@ -179,10 +179,10 @@ static int power_button_countdown;
 /* Must hold the button down for at least this many counts to be processed */
 #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */
 
-static void dsmg600_power_handler(unsigned long data);
+static void dsmg600_power_handler(struct timer_list *unused);
 static DEFINE_TIMER(dsmg600_power_timer, dsmg600_power_handler);
 
-static void dsmg600_power_handler(unsigned long data)
+static void dsmg600_power_handler(struct timer_list *unused)
 {
 	/* This routine is called twice per second to check the
 	 * state of the power button.
diff --git a/arch/arm/mach-ixp4xx/nas100d-setup.c b/arch/arm/mach-ixp4xx/nas100d-setup.c
index 435602085408..76dfff03cb71 100644
--- a/arch/arm/mach-ixp4xx/nas100d-setup.c
+++ b/arch/arm/mach-ixp4xx/nas100d-setup.c
@@ -202,10 +202,10 @@ static int power_button_countdown;
 /* Must hold the button down for at least this many counts to be processed */
 #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */
 
-static void nas100d_power_handler(unsigned long data);
+static void nas100d_power_handler(struct timer_list *unused);
 static DEFINE_TIMER(nas100d_power_timer, nas100d_power_handler);
 
-static void nas100d_power_handler(unsigned long data)
+static void nas100d_power_handler(struct timer_list *unused)
 {
 	/* This routine is called twice per second to check the
 	 * state of the power button.
diff --git a/arch/m68k/amiga/amisound.c b/arch/m68k/amiga/amisound.c
index a23f48181fd6..442bdeee6bd7 100644
--- a/arch/m68k/amiga/amisound.c
+++ b/arch/m68k/amiga/amisound.c
@@ -65,7 +65,7 @@ void __init amiga_init_sound(void)
 #endif
 }
 
-static void nosound( unsigned long ignored );
+static void nosound(struct timer_list *unused);
 static DEFINE_TIMER(sound_timer, nosound);
 
 void amiga_mksound( unsigned int hz, unsigned int ticks )
@@ -107,7 +107,7 @@ void amiga_mksound( unsigned int hz, unsigned int ticks )
 }
 
 
-static void nosound( unsigned long ignored )
+static void nosound(struct timer_list *unused)
 {
 	/* turn off DMA for audio channel 2 */
 	custom.dmacon = DMAF_AUD2;
diff --git a/arch/m68k/mac/macboing.c b/arch/m68k/mac/macboing.c
index d17668649641..135a87bbd1a2 100644
--- a/arch/m68k/mac/macboing.c
+++ b/arch/m68k/mac/macboing.c
@@ -48,7 +48,7 @@ static unsigned long mac_bell_phasepersample;
  * some function protos
  */
 static void mac_init_asc( void );
-static void mac_nosound( unsigned long );
+static void mac_nosound(struct timer_list *);
 static void mac_quadra_start_bell( unsigned int, unsigned int, unsigned int );
 static void mac_quadra_ring_bell( unsigned long );
 static void mac_av_start_bell( unsigned int, unsigned int, unsigned int );
@@ -216,7 +216,7 @@ void mac_mksound( unsigned int freq, unsigned int length )
 /*
  * regular ASC: stop whining ..
  */
-static void mac_nosound( unsigned long ignored )
+static void mac_nosound(struct timer_list *unused)
 {
 	mac_asc_regs[ ASC_ENABLE ] = 0;
 }
diff --git a/arch/mips/mti-malta/malta-display.c b/arch/mips/mti-malta/malta-display.c
index 063de44675ce..ee0bd50f754b 100644
--- a/arch/mips/mti-malta/malta-display.c
+++ b/arch/mips/mti-malta/malta-display.c
@@ -36,10 +36,10 @@ void mips_display_message(const char *str)
 	}
 }
 
-static void scroll_display_message(unsigned long unused);
+static void scroll_display_message(struct timer_list *unused);
 static DEFINE_TIMER(mips_scroll_timer, scroll_display_message);
 
-static void scroll_display_message(unsigned long unused)
+static void scroll_display_message(struct timer_list *unused)
 {
 	mips_display_message(&display_string[display_count++]);
 	if (display_count == max_display_count)
diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c
index 27a2dd616a7d..c46bf29ae412 100644
--- a/arch/parisc/kernel/pdc_cons.c
+++ b/arch/parisc/kernel/pdc_cons.c
@@ -91,7 +91,7 @@ static int pdc_console_setup(struct console *co, char *options)
 
 #define PDC_CONS_POLL_DELAY (30 * HZ / 1000)
 
-static void pdc_console_poll(unsigned long unused);
+static void pdc_console_poll(struct timer_list *unused);
 static DEFINE_TIMER(pdc_console_timer, pdc_console_poll);
 static struct tty_port tty_port;
 
@@ -135,7 +135,7 @@ static const struct tty_operations pdc_console_tty_ops = {
 	.chars_in_buffer = pdc_console_tty_chars_in_buffer,
 };
 
-static void pdc_console_poll(unsigned long unused)
+static void pdc_console_poll(struct timer_list *unused)
 {
 	int data, count = 0;
 
diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c
index 909744eb7bab..0a67487c0b1d 100644
--- a/drivers/atm/idt77105.c
+++ b/drivers/atm/idt77105.c
@@ -45,8 +45,8 @@ static DEFINE_SPINLOCK(idt77105_priv_lock);
 #define PUT(val,reg) dev->ops->phy_put(dev,val,IDT77105_##reg)
 #define GET(reg) dev->ops->phy_get(dev,IDT77105_##reg)
 
-static void idt77105_stats_timer_func(unsigned long);
-static void idt77105_restart_timer_func(unsigned long);
+static void idt77105_stats_timer_func(struct timer_list *);
+static void idt77105_restart_timer_func(struct timer_list *);
 
 
 static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func);
@@ -80,7 +80,7 @@ static u16 get_counter(struct atm_dev *dev, int counter)
  * a separate copy of the stats allows implementation of
  * an ioctl which gathers the stats *without* zero'ing them.
  */
-static void idt77105_stats_timer_func(unsigned long dummy)
+static void idt77105_stats_timer_func(struct timer_list *unused)
 {
 	struct idt77105_priv *walk;
 	struct atm_dev *dev;
@@ -109,7 +109,7 @@ static void idt77105_stats_timer_func(unsigned long dummy)
  * interrupts need to be disabled when the cable is pulled out
  * to avoid lots of spurious cell error interrupts.
  */
-static void idt77105_restart_timer_func(unsigned long dummy)
+static void idt77105_restart_timer_func(struct timer_list *unused)
 {
 	struct idt77105_priv *walk;
 	struct atm_dev *dev;
diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index 12f646760b68..98a3a43484c8 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -75,7 +75,7 @@ static void desc_dbg(IADEV *iadev);
 static IADEV *ia_dev[8];
 static struct atm_dev *_ia_dev[8];
 static int iadev_count;
-static void ia_led_timer(unsigned long arg);
+static void ia_led_timer(struct timer_list *unused);
 static DEFINE_TIMER(ia_timer, ia_led_timer);
 static int IA_TX_BUF = DFL_TX_BUFFERS, IA_TX_BUF_SZ = DFL_TX_BUF_SZ;
 static int IA_RX_BUF = DFL_RX_BUFFERS, IA_RX_BUF_SZ = DFL_RX_BUF_SZ;
@@ -2432,7 +2432,7 @@ static void ia_update_stats(IADEV *iadev) {
     return;
 }
   
-static void ia_led_timer(unsigned long arg) {
+static void ia_led_timer(struct timer_list *unused) {
  	unsigned long flags;
   	static u_char blinking[8] = {0, 0, 0, 0, 0, 0, 0, 0};
         u_char i;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index ae596e55bcb6..8bc3b9fd8dd2 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -342,8 +342,8 @@ static int NeedSeek = 0;
 static void fd_select_side( int side );
 static void fd_select_drive( int drive );
 static void fd_deselect( void );
-static void fd_motor_off_timer( unsigned long dummy );
-static void check_change( unsigned long dummy );
+static void fd_motor_off_timer(struct timer_list *unused);
+static void check_change(struct timer_list *unused);
 static irqreturn_t floppy_irq (int irq, void *dummy);
 static void fd_error( void );
 static int do_format(int drive, int type, struct atari_format_descr *desc);
@@ -353,12 +353,12 @@ static void fd_calibrate_done( int status );
 static void fd_seek( void );
 static void fd_seek_done( int status );
 static void fd_rwsec( void );
-static void fd_readtrack_check( unsigned long dummy );
+static void fd_readtrack_check(struct timer_list *unused);
 static void fd_rwsec_done( int status );
 static void fd_rwsec_done1(int status);
 static void fd_writetrack( void );
 static void fd_writetrack_done( int status );
-static void fd_times_out( unsigned long dummy );
+static void fd_times_out(struct timer_list *unused);
 static void finish_fdc( void );
 static void finish_fdc_done( int dummy );
 static void setup_req_params( int drive );
@@ -479,7 +479,7 @@ static void fd_deselect( void )
  * counts the index signals, which arrive only if one drive is selected.
  */
 
-static void fd_motor_off_timer( unsigned long dummy )
+static void fd_motor_off_timer(struct timer_list *unused)
 {
 	unsigned char status;
 
@@ -515,7 +515,7 @@ static void fd_motor_off_timer( unsigned long dummy )
  * as possible) and keep track of the current state of the write protection.
  */
 
-static void check_change( unsigned long dummy )
+static void check_change(struct timer_list *unused)
 {
 	static int    drive = 0;
 
@@ -966,7 +966,7 @@ static void fd_rwsec( void )
 }
 
     
-static void fd_readtrack_check( unsigned long dummy )
+static void fd_readtrack_check(struct timer_list *unused)
 {
 	unsigned long flags, addr, addr2;
 
@@ -1237,7 +1237,7 @@ static void fd_writetrack_done( int status )
 	fd_error();
 }
 
-static void fd_times_out( unsigned long dummy )
+static void fd_times_out(struct timer_list *unused)
 {
 	atari_disable_irq( IRQ_MFP_FDC );
 	if (!FloppyIRQHandler) goto end; /* int occurred after timer was fired, but
diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c
index 1a0385ed6417..839ee61d352a 100644
--- a/drivers/char/dtlk.c
+++ b/drivers/char/dtlk.c
@@ -74,7 +74,7 @@
 #endif				/* TRACING */
 
 static DEFINE_MUTEX(dtlk_mutex);
-static void dtlk_timer_tick(unsigned long data);
+static void dtlk_timer_tick(struct timer_list *unused);
 
 static int dtlk_major;
 static int dtlk_port_lpc;
@@ -259,7 +259,7 @@ static unsigned int dtlk_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
-static void dtlk_timer_tick(unsigned long data)
+static void dtlk_timer_tick(struct timer_list *unused)
 {
 	TRACE_TEXT(" dtlk_timer_tick");
 	wake_up_interruptible(&dtlk_process_list);
diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 5b8db2ed844d..7700280717f2 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -122,11 +122,11 @@ __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
 /* Last time scheduled */
 static unsigned long long hangcheck_tsc, hangcheck_tsc_margin;
 
-static void hangcheck_fire(unsigned long);
+static void hangcheck_fire(struct timer_list *);
 
 static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire);
 
-static void hangcheck_fire(unsigned long data)
+static void hangcheck_fire(struct timer_list *unused)
 {
 	unsigned long long cur_tsc, tsc_diff;
 
diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c
index 44006ed9558f..a7113b78251a 100644
--- a/drivers/char/nwbutton.c
+++ b/drivers/char/nwbutton.c
@@ -23,7 +23,7 @@
 #define __NWBUTTON_C		/* Tell the header file who we are */
 #include "nwbutton.h"
 
-static void button_sequence_finished (unsigned long parameters);
+static void button_sequence_finished(struct timer_list *unused);
 
 static int button_press_count;		/* The count of button presses */
 /* Times for the end of a sequence */
@@ -127,7 +127,7 @@ static void button_consume_callbacks (int bpcount)
  * any matching registered function callbacks, initiate reboot, etc.).
  */
 
-static void button_sequence_finished (unsigned long parameters)
+static void button_sequence_finished(struct timer_list *unused)
 {
 	if (IS_ENABLED(CONFIG_NWBUTTON_REBOOT) &&
 	    button_press_count == reboot_count)
diff --git a/drivers/char/nwbutton.h b/drivers/char/nwbutton.h
index abee3ca74801..9dedfd7adc0e 100644
--- a/drivers/char/nwbutton.h
+++ b/drivers/char/nwbutton.h
@@ -25,7 +25,7 @@ struct button_callback {
 
 /* Function prototypes: */
 
-static void button_sequence_finished (unsigned long parameters);
+static void button_sequence_finished(struct timer_list *unused);
 static irqreturn_t button_handler (int irq, void *dev_id);
 int button_init (void);
 int button_add_callback (void (*callback) (void), int count);
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 616871e68e09..5542a438bbd0 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -135,7 +135,7 @@ static struct fasync_struct *rtc_async_queue;
 static DECLARE_WAIT_QUEUE_HEAD(rtc_wait);
 
 #ifdef RTC_IRQ
-static void rtc_dropped_irq(unsigned long data);
+static void rtc_dropped_irq(struct timer_list *unused);
 
 static DEFINE_TIMER(rtc_irq_timer, rtc_dropped_irq);
 #endif
@@ -1171,7 +1171,7 @@ module_exit(rtc_exit);
  *	for something that requires a steady > 1KHz signal anyways.)
  */
 
-static void rtc_dropped_irq(unsigned long data)
+static void rtc_dropped_irq(struct timer_list *unused)
 {
 	unsigned long freq;
 
diff --git a/drivers/input/touchscreen/s3c2410_ts.c b/drivers/input/touchscreen/s3c2410_ts.c
index d3265b6b58b8..1173890f6719 100644
--- a/drivers/input/touchscreen/s3c2410_ts.c
+++ b/drivers/input/touchscreen/s3c2410_ts.c
@@ -102,7 +102,7 @@ static inline bool get_down(unsigned long data0, unsigned long data1)
 		!(data1 & S3C2410_ADCDAT0_UPDOWN));
 }
 
-static void touch_timer_fire(unsigned long data)
+static void touch_timer_fire(struct timer_list *unused)
 {
 	unsigned long data0;
 	unsigned long data1;
diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c
index ede89d4ffc88..e99e766a3028 100644
--- a/drivers/net/wireless/atmel/at76c50x-usb.c
+++ b/drivers/net/wireless/atmel/at76c50x-usb.c
@@ -518,11 +518,11 @@ exit:
 
 /* LED trigger */
 static int tx_activity;
-static void at76_ledtrig_tx_timerfunc(unsigned long data);
+static void at76_ledtrig_tx_timerfunc(struct timer_list *unused);
 static DEFINE_TIMER(ledtrig_tx_timer, at76_ledtrig_tx_timerfunc);
 DEFINE_LED_TRIGGER(ledtrig_tx);
 
-static void at76_ledtrig_tx_timerfunc(unsigned long data)
+static void at76_ledtrig_tx_timerfunc(struct timer_list *unused)
 {
 	static int tx_lastactivity;
 
diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c
index 16497202473f..aae868509e13 100644
--- a/drivers/staging/speakup/main.c
+++ b/drivers/staging/speakup/main.c
@@ -1164,7 +1164,7 @@ static void spkup_write(const u16 *in_buf, int count)
 static const int NUM_CTL_LABELS = (MSG_CTL_END - MSG_CTL_START + 1);
 
 static void read_all_doc(struct vc_data *vc);
-static void cursor_done(u_long data);
+static void cursor_done(struct timer_list *unused);
 static DEFINE_TIMER(cursor_timer, cursor_done);
 
 static void do_handle_shift(struct vc_data *vc, u_char value, char up_flag)
@@ -1682,7 +1682,7 @@ static int speak_highlight(struct vc_data *vc)
 	return 0;
 }
 
-static void cursor_done(u_long data)
+static void cursor_done(struct timer_list *unused)
 {
 	struct vc_data *vc = vc_cons[cursor_con].d;
 	unsigned long flags;
diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c
index 6ddd3fc3f08d..aac29c816d09 100644
--- a/drivers/staging/speakup/synth.c
+++ b/drivers/staging/speakup/synth.c
@@ -153,7 +153,7 @@ int spk_synth_is_alive_restart(struct spk_synth *synth)
 }
 EXPORT_SYMBOL_GPL(spk_synth_is_alive_restart);
 
-static void thread_wake_up(u_long data)
+static void thread_wake_up(struct timer_list *unused)
 {
 	wake_up_interruptible_all(&speakup_event);
 }
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index 5d442469c95e..cf0bde3bb927 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -279,7 +279,7 @@ static unsigned detect_isa_irq(void __iomem *);
 #endif				/* CONFIG_ISA */
 
 #ifndef CONFIG_CYZ_INTR
-static void cyz_poll(unsigned long);
+static void cyz_poll(struct timer_list *);
 
 /* The Cyclades-Z polling cycle is defined by this variable */
 static long cyz_polling_cycle = CZ_DEF_POLL;
@@ -1214,7 +1214,7 @@ static void cyz_rx_restart(struct timer_list *t)
 
 #else				/* CONFIG_CYZ_INTR */
 
-static void cyz_poll(unsigned long arg)
+static void cyz_poll(struct timer_list *unused)
 {
 	struct cyclades_card *cinfo;
 	struct cyclades_port *info;
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
index ee7958ab269f..015686ff4825 100644
--- a/drivers/tty/isicom.c
+++ b/drivers/tty/isicom.c
@@ -170,7 +170,7 @@ static struct pci_driver isicom_driver = {
 static int prev_card = 3;	/*	start servicing isi_card[0]	*/
 static struct tty_driver *isicom_normal;
 
-static void isicom_tx(unsigned long _data);
+static void isicom_tx(struct timer_list *unused);
 static void isicom_start(struct tty_struct *tty);
 
 static DEFINE_TIMER(tx, isicom_tx);
@@ -394,7 +394,7 @@ static inline int __isicom_paranoia_check(struct isi_port const *port,
  *	will do the rest of the work for us.
  */
 
-static void isicom_tx(unsigned long _data)
+static void isicom_tx(struct timer_list *unused)
 {
 	unsigned long flags, base;
 	unsigned int retries;
diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c
index 65a70f3c7cde..68cbc03aab4b 100644
--- a/drivers/tty/moxa.c
+++ b/drivers/tty/moxa.c
@@ -198,7 +198,7 @@ static void moxa_hangup(struct tty_struct *);
 static int moxa_tiocmget(struct tty_struct *tty);
 static int moxa_tiocmset(struct tty_struct *tty,
 			 unsigned int set, unsigned int clear);
-static void moxa_poll(unsigned long);
+static void moxa_poll(struct timer_list *);
 static void moxa_set_tty_param(struct tty_struct *, struct ktermios *);
 static void moxa_shutdown(struct tty_port *);
 static int moxa_carrier_raised(struct tty_port *);
@@ -1429,7 +1429,7 @@ put:
 	return 0;
 }
 
-static void moxa_poll(unsigned long ignored)
+static void moxa_poll(struct timer_list *unused)
 {
 	struct moxa_board_conf *brd;
 	u16 __iomem *ip;
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
index f7dc9b1ea806..bdd17d2aaafd 100644
--- a/drivers/tty/rocket.c
+++ b/drivers/tty/rocket.c
@@ -86,7 +86,7 @@
 
 /****** RocketPort Local Variables ******/
 
-static void rp_do_poll(unsigned long dummy);
+static void rp_do_poll(struct timer_list *unused);
 
 static struct tty_driver *rocket_driver;
 
@@ -525,7 +525,7 @@ static void rp_handle_port(struct r_port *info)
 /*
  *  The top level polling routine.  Repeats every 1/100 HZ (10ms).
  */
-static void rp_do_poll(unsigned long dummy)
+static void rp_do_poll(struct timer_list *unused)
 {
 	CONTROLLER_t *ctlp;
 	int ctrl, aiop, ch, line;
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index c8d90d7e7e37..5d412df8e943 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -244,7 +244,7 @@ static int kd_sound_helper(struct input_handle *handle, void *data)
 	return 0;
 }
 
-static void kd_nosound(unsigned long ignored)
+static void kd_nosound(struct timer_list *unused)
 {
 	static unsigned int zero;
 
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index bce4c71cb338..88b902c525d7 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -158,7 +158,7 @@ static void set_cursor(struct vc_data *vc);
 static void hide_cursor(struct vc_data *vc);
 static void console_callback(struct work_struct *ignored);
 static void con_driver_unregister_callback(struct work_struct *ignored);
-static void blank_screen_t(unsigned long dummy);
+static void blank_screen_t(struct timer_list *unused);
 static void set_palette(struct vc_data *vc);
 
 #define vt_get_kmsg_redirect() vt_kmsg_redirect(-1)
@@ -3929,7 +3929,7 @@ void unblank_screen(void)
  * (console operations can still happen at irq time, but only from printk which
  * has the console mutex. Not perfect yet, but better than no locking
  */
-static void blank_screen_t(unsigned long dummy)
+static void blank_screen_t(struct timer_list *unused)
 {
 	blank_timer_expired = 1;
 	schedule_work(&console_work);
diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c
index 18e896eeca62..12f7ea62dddd 100644
--- a/drivers/watchdog/alim7101_wdt.c
+++ b/drivers/watchdog/alim7101_wdt.c
@@ -70,7 +70,7 @@ module_param(use_gpio, int, 0);
 MODULE_PARM_DESC(use_gpio,
 		"Use the gpio watchdog (required by old cobalt boards).");
 
-static void wdt_timer_ping(unsigned long);
+static void wdt_timer_ping(struct timer_list *);
 static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
@@ -87,7 +87,7 @@ MODULE_PARM_DESC(nowayout,
  *	Whack the dog
  */
 
-static void wdt_timer_ping(unsigned long unused)
+static void wdt_timer_ping(struct timer_list *unused)
 {
 	/* If we got a heartbeat pulse within the WDT_US_INTERVAL
 	 * we agree to ping the WDT
diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c
index 8a616a57bb90..88d823d87a4b 100644
--- a/drivers/watchdog/machzwd.c
+++ b/drivers/watchdog/machzwd.c
@@ -121,7 +121,7 @@ module_param(action, int, 0);
 MODULE_PARM_DESC(action, "after watchdog resets, generate: "
 				"0 = RESET(*)  1 = SMI  2 = NMI  3 = SCI");
 
-static void zf_ping(unsigned long data);
+static void zf_ping(struct timer_list *unused);
 
 static int zf_action = GEN_RESET;
 static unsigned long zf_is_open;
@@ -237,7 +237,7 @@ static void zf_timer_on(void)
 }
 
 
-static void zf_ping(unsigned long data)
+static void zf_ping(struct timer_list *unused)
 {
 	unsigned int ctrl_reg = 0;
 	unsigned long flags;
diff --git a/drivers/watchdog/mixcomwd.c b/drivers/watchdog/mixcomwd.c
index c9e38096ea91..3cc07447c655 100644
--- a/drivers/watchdog/mixcomwd.c
+++ b/drivers/watchdog/mixcomwd.c
@@ -99,7 +99,7 @@ static struct {
 	{0x0000, 0},
 };
 
-static void mixcomwd_timerfun(unsigned long d);
+static void mixcomwd_timerfun(struct timer_list *unused);
 
 static unsigned long mixcomwd_opened; /* long req'd for setbit --RR */
 
@@ -120,7 +120,7 @@ static void mixcomwd_ping(void)
 	return;
 }
 
-static void mixcomwd_timerfun(unsigned long d)
+static void mixcomwd_timerfun(struct timer_list *unused)
 {
 	mixcomwd_ping();
 	mod_timer(&mixcomwd_timer, jiffies + 5 * HZ);
diff --git a/drivers/watchdog/sbc60xxwdt.c b/drivers/watchdog/sbc60xxwdt.c
index 8d589939bc84..87333a41f753 100644
--- a/drivers/watchdog/sbc60xxwdt.c
+++ b/drivers/watchdog/sbc60xxwdt.c
@@ -112,7 +112,7 @@ MODULE_PARM_DESC(nowayout,
 	"Watchdog cannot be stopped once started (default="
 				__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
-static void wdt_timer_ping(unsigned long);
+static void wdt_timer_ping(struct timer_list *);
 static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
@@ -122,7 +122,7 @@ static char wdt_expect_close;
  *	Whack the dog
  */
 
-static void wdt_timer_ping(unsigned long data)
+static void wdt_timer_ping(struct timer_list *unused)
 {
 	/* If we got a heartbeat pulse within the WDT_US_INTERVAL
 	 * we agree to ping the WDT
diff --git a/drivers/watchdog/sc520_wdt.c b/drivers/watchdog/sc520_wdt.c
index 3e9bbaa37bf4..6aadb56e7faa 100644
--- a/drivers/watchdog/sc520_wdt.c
+++ b/drivers/watchdog/sc520_wdt.c
@@ -123,7 +123,7 @@ MODULE_PARM_DESC(nowayout,
 
 static __u16 __iomem *wdtmrctl;
 
-static void wdt_timer_ping(unsigned long);
+static void wdt_timer_ping(struct timer_list *);
 static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
@@ -134,7 +134,7 @@ static DEFINE_SPINLOCK(wdt_spinlock);
  *	Whack the dog
  */
 
-static void wdt_timer_ping(unsigned long data)
+static void wdt_timer_ping(struct timer_list *unused)
 {
 	/* If we got a heartbeat pulse within the WDT_US_INTERVAL
 	 * we agree to ping the WDT
diff --git a/drivers/watchdog/via_wdt.c b/drivers/watchdog/via_wdt.c
index ad3c3be13b40..b085ef1084ec 100644
--- a/drivers/watchdog/via_wdt.c
+++ b/drivers/watchdog/via_wdt.c
@@ -67,7 +67,7 @@ static struct watchdog_device wdt_dev;
 static struct resource wdt_res;
 static void __iomem *wdt_mem;
 static unsigned int mmio;
-static void wdt_timer_tick(unsigned long data);
+static void wdt_timer_tick(struct timer_list *unused);
 static DEFINE_TIMER(timer, wdt_timer_tick);
 					/* The timer that pings the watchdog */
 static unsigned long next_heartbeat;	/* the next_heartbeat for the timer */
@@ -88,7 +88,7 @@ static inline void wdt_reset(void)
  *     then the external/userspace heartbeat).
  *  2) the watchdog timer has been stopped by userspace.
  */
-static void wdt_timer_tick(unsigned long data)
+static void wdt_timer_tick(struct timer_list *unused)
 {
 	if (time_before(jiffies, next_heartbeat) ||
 	   (!watchdog_active(&wdt_dev))) {
diff --git a/drivers/watchdog/w83877f_wdt.c b/drivers/watchdog/w83877f_wdt.c
index ba6b680af100..05658ecc0aa4 100644
--- a/drivers/watchdog/w83877f_wdt.c
+++ b/drivers/watchdog/w83877f_wdt.c
@@ -97,7 +97,7 @@ MODULE_PARM_DESC(nowayout,
 		"Watchdog cannot be stopped once started (default="
 				__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
-static void wdt_timer_ping(unsigned long);
+static void wdt_timer_ping(struct timer_list *);
 static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
@@ -108,7 +108,7 @@ static DEFINE_SPINLOCK(wdt_spinlock);
  *	Whack the dog
  */
 
-static void wdt_timer_ping(unsigned long data)
+static void wdt_timer_ping(struct timer_list *unused)
 {
 	/* If we got a heartbeat pulse within the WDT_US_INTERVAL
 	 * we agree to ping the WDT
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 139e018a82b0..f45114fd8e1e 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -358,10 +358,10 @@ struct deferred_entry {
 	struct page *page;
 };
 static LIST_HEAD(deferred_list);
-static void gnttab_handle_deferred(unsigned long);
+static void gnttab_handle_deferred(struct timer_list *);
 static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
 
-static void gnttab_handle_deferred(unsigned long unused)
+static void gnttab_handle_deferred(struct timer_list *unused)
 {
 	unsigned int nr = 10;
 	struct deferred_entry *first = NULL;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 423159abd501..691032107f8c 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -61,7 +61,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content "
 
 static int pstore_new_entry;
 
-static void pstore_timefunc(unsigned long);
+static void pstore_timefunc(struct timer_list *);
 static DEFINE_TIMER(pstore_timer, pstore_timefunc);
 
 static void pstore_dowork(struct work_struct *);
@@ -890,7 +890,7 @@ static void pstore_dowork(struct work_struct *work)
 	pstore_get_records(1);
 }
 
-static void pstore_timefunc(unsigned long dummy)
+static void pstore_timefunc(struct timer_list *unused)
 {
 	if (pstore_new_entry) {
 		pstore_new_entry = 0;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 1215229d1c12..ef2a47e0eab6 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -20,7 +20,7 @@
 static int irqfixup __read_mostly;
 
 #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
-static void poll_spurious_irqs(unsigned long dummy);
+static void poll_spurious_irqs(struct timer_list *unused);
 static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
 static int irq_poll_cpu;
 static atomic_t irq_poll_active;
@@ -143,7 +143,7 @@ out:
 	return ok;
 }
 
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_spurious_irqs(struct timer_list *unused)
 {
 	struct irq_desc *desc;
 	int i;
diff --git a/lib/random32.c b/lib/random32.c
index 65cc018fef40..4aaa76404d56 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -213,11 +213,11 @@ static int __init prandom_init(void)
 }
 core_initcall(prandom_init);
 
-static void __prandom_timer(unsigned long dontcare);
+static void __prandom_timer(struct timer_list *unused);
 
 static DEFINE_TIMER(seed_timer, __prandom_timer);
 
-static void __prandom_timer(unsigned long dontcare)
+static void __prandom_timer(struct timer_list *unused)
 {
 	u32 entropy;
 	unsigned long expires;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index b36dceab0dc1..de4a0cafb19f 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -125,7 +125,7 @@ static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
 					     struct sk_buff *skb,
 					     const void *daddr);
 static int dn_route_input(struct sk_buff *);
-static void dn_run_flush(unsigned long dummy);
+static void dn_run_flush(struct timer_list *unused);
 
 static struct dn_rt_hash_bucket *dn_rt_hash_table;
 static unsigned int dn_rt_hash_mask;
@@ -357,7 +357,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
 	return 0;
 }
 
-static void dn_run_flush(unsigned long dummy)
+static void dn_run_flush(struct timer_list *unused)
 {
 	int i;
 	struct dn_route *rt, *next;
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 9f2e73c71768..7f59c8fabeeb 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -46,7 +46,7 @@
 static atomic_t fl_size = ATOMIC_INIT(0);
 static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];
 
-static void ip6_fl_gc(unsigned long dummy);
+static void ip6_fl_gc(struct timer_list *unused);
 static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc);
 
 /* FL hash table lock: it protects only of GC */
@@ -127,7 +127,7 @@ static void fl_release(struct ip6_flowlabel *fl)
 	spin_unlock_bh(&ip6_fl_lock);
 }
 
-static void ip6_fl_gc(unsigned long dummy)
+static void ip6_fl_gc(struct timer_list *unused)
 {
 	int i;
 	unsigned long now = jiffies;
diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c
index 989ae647825e..215ad22a9647 100644
--- a/net/netrom/nr_loopback.c
+++ b/net/netrom/nr_loopback.c
@@ -15,7 +15,7 @@
 #include <net/netrom.h>
 #include <linux/init.h>
 
-static void nr_loopback_timer(unsigned long);
+static void nr_loopback_timer(struct timer_list *);
 
 static struct sk_buff_head loopback_queue;
 static DEFINE_TIMER(loopback_timer, nr_loopback_timer);
@@ -48,7 +48,7 @@ int nr_loopback_queue(struct sk_buff *skb)
 	return 1;
 }
 
-static void nr_loopback_timer(unsigned long param)
+static void nr_loopback_timer(struct timer_list *unused)
 {
 	struct sk_buff *skb;
 	ax25_address *nr_dest;
diff --git a/security/keys/gc.c b/security/keys/gc.c
index afb3a9175d76..b93603724b8c 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -29,7 +29,7 @@ DECLARE_WORK(key_gc_work, key_garbage_collector);
 /*
  * Reaper for links from keyrings to dead keys.
  */
-static void key_gc_timer_func(unsigned long);
+static void key_gc_timer_func(struct timer_list *);
 static DEFINE_TIMER(key_gc_timer, key_gc_timer_func);
 
 static time_t key_gc_next_run = LONG_MAX;
@@ -84,7 +84,7 @@ void key_schedule_gc_links(void)
  * Some key's cleanup time was met after it expired, so we need to get the
  * reaper to go through a cycle finding expired keys.
  */
-static void key_gc_timer_func(unsigned long data)
+static void key_gc_timer_func(struct timer_list *unused)
 {
 	kenter("");
 	key_gc_next_run = LONG_MAX;
-- 
cgit v1.2.3


From b9eaf18722221ef8b2bd6a67240ebe668622152a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 16 Oct 2017 13:15:39 -0700
Subject: treewide: init_timer() -> setup_timer()

This mechanically converts all remaining cases of ancient open-coded timer
setup with the old setup_timer() API, which is the first step in timer
conversions. This has no behavioral changes, since it ultimately just
changes the order of assignment to fields of struct timer_list when
finding variations of:

    init_timer(&t);
    f.function = timer_callback;
    t.data = timer_callback_arg;

to be converted into:

    setup_timer(&t, timer_callback, timer_callback_arg);

The conversion is done with the following Coccinelle script, which
is an improved version of scripts/cocci/api/setup_timer.cocci, in the
following ways:
 - assignments-before-init_timer() cases
 - limit the .data case removal to the specific struct timer_list instance
 - handling calls by dereference (timer->field vs timer.field)

spatch --very-quiet --all-includes --include-headers \
	-I ./arch/x86/include -I ./arch/x86/include/generated \
	-I ./include -I ./arch/x86/include/uapi \
	-I ./arch/x86/include/generated/uapi -I ./include/uapi \
	-I ./include/generated/uapi --include ./include/linux/kconfig.h \
	--dir . \
	--cocci-file ~/src/data/setup_timer.cocci

@fix_address_of@
expression e;
@@

 init_timer(
-&(e)
+&e
 , ...)

// Match the common cases first to avoid Coccinelle parsing loops with
// "... when" clauses.

@match_immediate_function_data_after_init_timer@
expression e, func, da;
@@

-init_timer
+setup_timer
 ( \(&e\|e\)
+, func, da
 );
(
-\(e.function\|e->function\) = func;
-\(e.data\|e->data\) = da;
|
-\(e.data\|e->data\) = da;
-\(e.function\|e->function\) = func;
)

@match_immediate_function_data_before_init_timer@
expression e, func, da;
@@

(
-\(e.function\|e->function\) = func;
-\(e.data\|e->data\) = da;
|
-\(e.data\|e->data\) = da;
-\(e.function\|e->function\) = func;
)
-init_timer
+setup_timer
 ( \(&e\|e\)
+, func, da
 );

@match_function_and_data_after_init_timer@
expression e, e2, e3, e4, e5, func, da;
@@

-init_timer
+setup_timer
 ( \(&e\|e\)
+, func, da
 );
 ... when != func = e2
     when != da = e3
(
-e.function = func;
... when != da = e4
-e.data = da;
|
-e->function = func;
... when != da = e4
-e->data = da;
|
-e.data = da;
... when != func = e5
-e.function = func;
|
-e->data = da;
... when != func = e5
-e->function = func;
)

@match_function_and_data_before_init_timer@
expression e, e2, e3, e4, e5, func, da;
@@
(
-e.function = func;
... when != da = e4
-e.data = da;
|
-e->function = func;
... when != da = e4
-e->data = da;
|
-e.data = da;
... when != func = e5
-e.function = func;
|
-e->data = da;
... when != func = e5
-e->function = func;
)
... when != func = e2
    when != da = e3
-init_timer
+setup_timer
 ( \(&e\|e\)
+, func, da
 );

@r1 exists@
expression t;
identifier f;
position p;
@@

f(...) { ... when any
  init_timer@p(\(&t\|t\))
  ... when any
}

@r2 exists@
expression r1.t;
identifier g != r1.f;
expression e8;
@@

g(...) { ... when any
  \(t.data\|t->data\) = e8
  ... when any
}

// It is dangerous to use setup_timer if data field is initialized
// in another function.
@script:python depends on r2@
p << r1.p;
@@

cocci.include_match(False)

@r3@
expression r1.t, func, e7;
position r1.p;
@@

(
-init_timer@p(&t);
+setup_timer(&t, func, 0UL);
... when != func = e7
-t.function = func;
|
-t.function = func;
... when != func = e7
-init_timer@p(&t);
+setup_timer(&t, func, 0UL);
|
-init_timer@p(t);
+setup_timer(t, func, 0UL);
... when != func = e7
-t->function = func;
|
-t->function = func;
... when != func = e7
-init_timer@p(t);
+setup_timer(t, func, 0UL);
)

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm/mach-iop32x/n2100.c                     |  3 +--
 arch/blackfin/kernel/nmi.c                       |  3 +--
 arch/sh/drivers/pci/common.c                     | 10 ++++------
 arch/sh/drivers/push-switch.c                    |  5 +----
 drivers/atm/firestream.c                         |  4 +---
 drivers/atm/lanai.c                              |  4 +---
 drivers/atm/nicstar.c                            |  4 +---
 drivers/block/DAC960.c                           |  5 ++---
 drivers/block/umem.c                             |  3 +--
 drivers/gpu/drm/omapdrm/dss/dsi.c                |  4 +---
 drivers/infiniband/hw/mthca/mthca_catas.c        |  4 +---
 drivers/isdn/i4l/isdn_common.c                   |  3 +--
 drivers/isdn/i4l/isdn_net.c                      |  6 +++---
 drivers/media/platform/s5p-mfc/s5p_mfc.c         |  5 ++---
 drivers/media/usb/au0828/au0828-dvb.c            |  5 ++---
 drivers/net/wireless/intersil/hostap/hostap_ap.c |  4 +---
 drivers/net/wireless/intersil/hostap/hostap_hw.c | 11 ++++-------
 drivers/nfc/pn533/pn533.c                        |  5 ++---
 drivers/nfc/st-nci/ndlc.c                        |  9 ++-------
 drivers/nfc/st-nci/se.c                          | 11 ++++-------
 drivers/nfc/st21nfca/se.c                        | 10 ++++------
 drivers/s390/block/dasd.c                        |  9 +++------
 drivers/s390/net/fsm.c                           |  4 +---
 drivers/scsi/arcmsr/arcmsr_hba.c                 | 10 ++++------
 drivers/scsi/arm/fas216.c                        |  4 +---
 drivers/scsi/bfa/bfad.c                          |  4 +---
 drivers/scsi/esas2r/esas2r_main.c                |  4 +---
 drivers/scsi/ncr53c8xx.c                         |  4 +---
 drivers/scsi/sym53c8xx_2/sym_glue.c              |  4 +---
 drivers/usb/gadget/udc/omap_udc.c                |  4 +---
 kernel/time/clocksource.c                        |  3 +--
 31 files changed, 55 insertions(+), 113 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c
index c1cd80ecc219..4a64a11ba63c 100644
--- a/arch/arm/mach-iop32x/n2100.c
+++ b/arch/arm/mach-iop32x/n2100.c
@@ -336,8 +336,7 @@ static int __init n2100_request_gpios(void)
 			pr_err("could not set power GPIO as input\n");
 	}
 	/* Set up power button poll timer */
-	init_timer(&power_button_poll_timer);
-	power_button_poll_timer.function = power_button_poll;
+	setup_timer(&power_button_poll_timer, power_button_poll, 0UL);
 	power_button_poll_timer.expires = jiffies + (HZ / 10);
 	add_timer(&power_button_poll_timer);
 	return 0;
diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c
index 1e714329fe8a..828f4fbdb58a 100644
--- a/arch/blackfin/kernel/nmi.c
+++ b/arch/blackfin/kernel/nmi.c
@@ -180,8 +180,7 @@ static int __init init_nmi_wdt(void)
 	nmi_wdt_start();
 	nmi_active = true;
 
-	init_timer(&ntimer);
-	ntimer.function = nmi_wdt_timer;
+	setup_timer(&ntimer, nmi_wdt_timer, 0UL);
 	ntimer.expires = jiffies + NMI_CHECK_TIMEOUT;
 	add_timer(&ntimer);
 
diff --git a/arch/sh/drivers/pci/common.c b/arch/sh/drivers/pci/common.c
index cae707f3472d..0d7eb7b5ac8d 100644
--- a/arch/sh/drivers/pci/common.c
+++ b/arch/sh/drivers/pci/common.c
@@ -106,15 +106,13 @@ static void pcibios_enable_serr(unsigned long __data)
 void pcibios_enable_timers(struct pci_channel *hose)
 {
 	if (hose->err_irq) {
-		init_timer(&hose->err_timer);
-		hose->err_timer.data = (unsigned long)hose;
-		hose->err_timer.function = pcibios_enable_err;
+		setup_timer(&hose->err_timer, pcibios_enable_err,
+			    (unsigned long)hose);
 	}
 
 	if (hose->serr_irq) {
-		init_timer(&hose->serr_timer);
-		hose->serr_timer.data = (unsigned long)hose;
-		hose->serr_timer.function = pcibios_enable_serr;
+		setup_timer(&hose->serr_timer, pcibios_enable_serr,
+			    (unsigned long)hose);
 	}
 }
 
diff --git a/arch/sh/drivers/push-switch.c b/arch/sh/drivers/push-switch.c
index 5bfb341cc5c4..2dc791507968 100644
--- a/arch/sh/drivers/push-switch.c
+++ b/arch/sh/drivers/push-switch.c
@@ -78,10 +78,7 @@ static int switch_drv_probe(struct platform_device *pdev)
 	}
 
 	INIT_WORK(&psw->work, switch_work_handler);
-	init_timer(&psw->debounce);
-
-	psw->debounce.function = switch_timer;
-	psw->debounce.data = (unsigned long)psw;
+	setup_timer(&psw->debounce, switch_timer, (unsigned long)psw);
 
 	/* Workqueue API brain-damage */
 	psw->pdev = pdev;
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index 6b6368a56526..534001270be5 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -1885,9 +1885,7 @@ static int fs_init(struct fs_dev *dev)
 	}
 
 #ifdef FS_POLL_FREQ
-	init_timer (&dev->timer);
-	dev->timer.data = (unsigned long) dev;
-	dev->timer.function = fs_poll;
+	setup_timer (&dev->timer, fs_poll, (unsigned long)dev);
 	dev->timer.expires = jiffies + FS_POLL_FREQ;
 	add_timer (&dev->timer);
 #endif
diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index 2351dad78ff5..87e8b5dfac39 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -1790,10 +1790,8 @@ static void lanai_timed_poll(unsigned long arg)
 
 static inline void lanai_timed_poll_start(struct lanai_dev *lanai)
 {
-	init_timer(&lanai->timer);
+	setup_timer(&lanai->timer, lanai_timed_poll, (unsigned long)lanai);
 	lanai->timer.expires = jiffies + LANAI_POLL_PERIOD;
-	lanai->timer.data = (unsigned long) lanai;
-	lanai->timer.function = lanai_timed_poll;
 	add_timer(&lanai->timer);
 }
 
diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
index a9702836cbae..335447ed0ba4 100644
--- a/drivers/atm/nicstar.c
+++ b/drivers/atm/nicstar.c
@@ -284,10 +284,8 @@ static int __init nicstar_init(void)
 	XPRINTK("nicstar: nicstar_init() returned.\n");
 
 	if (!error) {
-		init_timer(&ns_timer);
+		setup_timer(&ns_timer, ns_poll, 0UL);
 		ns_timer.expires = jiffies + NS_POLL_PERIOD;
-		ns_timer.data = 0UL;
-		ns_timer.function = ns_poll;
 		add_timer(&ns_timer);
 	}
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 255591ab3716..6f14cdd6015b 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3079,11 +3079,10 @@ DAC960_InitializeController(DAC960_Controller_T *Controller)
       /*
 	Initialize the Monitoring Timer.
       */
-      init_timer(&Controller->MonitoringTimer);
+      setup_timer(&Controller->MonitoringTimer,
+                  DAC960_MonitoringTimerFunction, (unsigned long)Controller);
       Controller->MonitoringTimer.expires =
 	jiffies + DAC960_MonitoringTimerInterval;
-      Controller->MonitoringTimer.data = (unsigned long) Controller;
-      Controller->MonitoringTimer.function = DAC960_MonitoringTimerFunction;
       add_timer(&Controller->MonitoringTimer);
       Controller->ControllerInitialized = true;
       return true;
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 0677d2514665..b4d4ccfe7582 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -738,8 +738,7 @@ static void check_all_batteries(unsigned long ptr)
 
 static void init_battery_timer(void)
 {
-	init_timer(&battery_timer);
-	battery_timer.function = check_all_batteries;
+	setup_timer(&battery_timer, check_all_batteries, 0UL);
 	battery_timer.expires = jiffies + (HZ * 60);
 	add_timer(&battery_timer);
 }
diff --git a/drivers/gpu/drm/omapdrm/dss/dsi.c b/drivers/gpu/drm/omapdrm/dss/dsi.c
index b56a05730314..cea744e4d9bd 100644
--- a/drivers/gpu/drm/omapdrm/dss/dsi.c
+++ b/drivers/gpu/drm/omapdrm/dss/dsi.c
@@ -5449,9 +5449,7 @@ static int dsi_bind(struct device *dev, struct device *master, void *data)
 			     dsi_framedone_timeout_work_callback);
 
 #ifdef DSI_CATCH_MISSING_TE
-	init_timer(&dsi->te_timer);
-	dsi->te_timer.function = dsi_te_timeout;
-	dsi->te_timer.data = 0;
+	setup_timer(&dsi->te_timer, dsi_te_timeout, 0);
 #endif
 
 	dsi_mem = platform_get_resource_byname(dsidev, IORESOURCE_MEM, "proto");
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c
index f6474c24f193..23cc08d5c24e 100644
--- a/drivers/infiniband/hw/mthca/mthca_catas.c
+++ b/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -149,7 +149,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
 {
 	phys_addr_t addr;
 
-	init_timer(&dev->catas_err.timer);
+	setup_timer(&dev->catas_err.timer, poll_catas, (unsigned long)dev);
 	dev->catas_err.map  = NULL;
 
 	addr = pci_resource_start(dev->pdev, 0) +
@@ -164,8 +164,6 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
 		return;
 	}
 
-	dev->catas_err.timer.data     = (unsigned long) dev;
-	dev->catas_err.timer.function = poll_catas;
 	dev->catas_err.timer.expires  = jiffies + MTHCA_CATAS_POLL_INTERVAL;
 	INIT_LIST_HEAD(&dev->catas_err.list);
 	add_timer(&dev->catas_err.timer);
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 38a5bb764c7b..3fa2f7b31131 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -2294,8 +2294,7 @@ static int __init isdn_init(void)
 		printk(KERN_WARNING "isdn: Could not allocate device-struct.\n");
 		return -EIO;
 	}
-	init_timer(&dev->timer);
-	dev->timer.function = isdn_timer_funct;
+	setup_timer(&dev->timer, isdn_timer_funct, 0UL);
 	spin_lock_init(&dev->lock);
 	spin_lock_init(&dev->timerlock);
 #ifdef MODULE
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index f63a110b7bcb..59d40160cab2 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -1615,9 +1615,9 @@ isdn_net_ciscohdlck_connected(isdn_net_local *lp)
 	/* send slarp request because interface/seq.no.s reset */
 	isdn_net_ciscohdlck_slarp_send_request(lp);
 
-	init_timer(&lp->cisco_timer);
-	lp->cisco_timer.data = (unsigned long) lp;
-	lp->cisco_timer.function = isdn_net_ciscohdlck_slarp_send_keepalive;
+	setup_timer(&lp->cisco_timer,
+		    isdn_net_ciscohdlck_slarp_send_keepalive,
+		    (unsigned long)lp);
 	lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ;
 	add_timer(&lp->cisco_timer);
 }
diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c
index 1839a86cc2a5..e179b33d3775 100644
--- a/drivers/media/platform/s5p-mfc/s5p_mfc.c
+++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c
@@ -1314,9 +1314,8 @@ static int s5p_mfc_probe(struct platform_device *pdev)
 	dev->hw_lock = 0;
 	INIT_WORK(&dev->watchdog_work, s5p_mfc_watchdog_worker);
 	atomic_set(&dev->watchdog_cnt, 0);
-	init_timer(&dev->watchdog_timer);
-	dev->watchdog_timer.data = (unsigned long)dev;
-	dev->watchdog_timer.function = s5p_mfc_watchdog;
+	setup_timer(&dev->watchdog_timer, s5p_mfc_watchdog,
+		    (unsigned long)dev);
 
 	ret = v4l2_device_register(&pdev->dev, &dev->v4l2_dev);
 	if (ret)
diff --git a/drivers/media/usb/au0828/au0828-dvb.c b/drivers/media/usb/au0828/au0828-dvb.c
index 34dc7e062471..d701c04b3783 100644
--- a/drivers/media/usb/au0828/au0828-dvb.c
+++ b/drivers/media/usb/au0828/au0828-dvb.c
@@ -648,9 +648,8 @@ int au0828_dvb_register(struct au0828_dev *dev)
 		return ret;
 	}
 
-	dev->bulk_timeout.function = au0828_bulk_timeout;
-	dev->bulk_timeout.data = (unsigned long) dev;
-	init_timer(&dev->bulk_timeout);
+	setup_timer(&dev->bulk_timeout, au0828_bulk_timeout,
+		    (unsigned long)dev);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c
index 1a8d8db80b05..f9d047314692 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ap.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c
@@ -1189,10 +1189,8 @@ static struct sta_info * ap_add_sta(struct ap_data *ap, u8 *addr)
 	}
 
 #ifndef PRISM2_NO_KERNEL_IEEE80211_MGMT
-	init_timer(&sta->timer);
+	setup_timer(&sta->timer, ap_handle_timer, (unsigned long)sta);
 	sta->timer.expires = jiffies + ap->max_inactivity;
-	sta->timer.data = (unsigned long) sta;
-	sta->timer.function = ap_handle_timer;
 	if (!ap->local->hostapd)
 		add_timer(&sta->timer);
 #endif /* PRISM2_NO_KERNEL_IEEE80211_MGMT */
diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c
index 72b46eaf3de2..8177fd6f65c1 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_hw.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c
@@ -3225,13 +3225,10 @@ while (0)
 
 	lib80211_crypt_info_init(&local->crypt_info, dev->name, &local->lock);
 
-	init_timer(&local->passive_scan_timer);
-	local->passive_scan_timer.data = (unsigned long) local;
-	local->passive_scan_timer.function = hostap_passive_scan;
-
-	init_timer(&local->tick_timer);
-	local->tick_timer.data = (unsigned long) local;
-	local->tick_timer.function = hostap_tick_timer;
+	setup_timer(&local->passive_scan_timer, hostap_passive_scan,
+		    (unsigned long)local);
+	setup_timer(&local->tick_timer, hostap_tick_timer,
+		    (unsigned long)local);
 	local->tick_timer.expires = jiffies + 2 * HZ;
 	add_timer(&local->tick_timer);
 
diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c
index c05cb637ba92..2effa5ff7082 100644
--- a/drivers/nfc/pn533/pn533.c
+++ b/drivers/nfc/pn533/pn533.c
@@ -2632,9 +2632,8 @@ struct pn533 *pn533_register_device(u32 device_type,
 	if (priv->wq == NULL)
 		goto error;
 
-	init_timer(&priv->listen_timer);
-	priv->listen_timer.data = (unsigned long) priv;
-	priv->listen_timer.function = pn533_listen_mode_timer;
+	setup_timer(&priv->listen_timer, pn533_listen_mode_timer,
+		    (unsigned long)priv);
 
 	skb_queue_head_init(&priv->resp_q);
 	skb_queue_head_init(&priv->fragment_skb);
diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c
index 9477994cf975..93a7536a9af9 100644
--- a/drivers/nfc/st-nci/ndlc.c
+++ b/drivers/nfc/st-nci/ndlc.c
@@ -282,13 +282,8 @@ int ndlc_probe(void *phy_id, struct nfc_phy_ops *phy_ops, struct device *dev,
 	*ndlc_id = ndlc;
 
 	/* initialize timers */
-	init_timer(&ndlc->t1_timer);
-	ndlc->t1_timer.data = (unsigned long)ndlc;
-	ndlc->t1_timer.function = ndlc_t1_timeout;
-
-	init_timer(&ndlc->t2_timer);
-	ndlc->t2_timer.data = (unsigned long)ndlc;
-	ndlc->t2_timer.function = ndlc_t2_timeout;
+	setup_timer(&ndlc->t1_timer, ndlc_t1_timeout, (unsigned long)ndlc);
+	setup_timer(&ndlc->t2_timer, ndlc_t2_timeout, (unsigned long)ndlc);
 
 	skb_queue_head_init(&ndlc->rcv_q);
 	skb_queue_head_init(&ndlc->send_q);
diff --git a/drivers/nfc/st-nci/se.c b/drivers/nfc/st-nci/se.c
index 56f2112e0cd8..bd7c1e83169c 100644
--- a/drivers/nfc/st-nci/se.c
+++ b/drivers/nfc/st-nci/se.c
@@ -725,15 +725,12 @@ int st_nci_se_init(struct nci_dev *ndev, struct st_nci_se_status *se_status)
 
 	init_completion(&info->se_info.req_completion);
 	/* initialize timers */
-	init_timer(&info->se_info.bwi_timer);
-	info->se_info.bwi_timer.data = (unsigned long)info;
-	info->se_info.bwi_timer.function = st_nci_se_wt_timeout;
+	setup_timer(&info->se_info.bwi_timer, st_nci_se_wt_timeout,
+		    (unsigned long)info);
 	info->se_info.bwi_active = false;
 
-	init_timer(&info->se_info.se_active_timer);
-	info->se_info.se_active_timer.data = (unsigned long)info;
-	info->se_info.se_active_timer.function =
-			st_nci_se_activation_timeout;
+	setup_timer(&info->se_info.se_active_timer,
+		    st_nci_se_activation_timeout, (unsigned long)info);
 	info->se_info.se_active = false;
 
 	info->se_info.xch_error = false;
diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c
index 3a98563d4a12..b2285455f77d 100644
--- a/drivers/nfc/st21nfca/se.c
+++ b/drivers/nfc/st21nfca/se.c
@@ -392,14 +392,12 @@ void st21nfca_se_init(struct nfc_hci_dev *hdev)
 
 	init_completion(&info->se_info.req_completion);
 	/* initialize timers */
-	init_timer(&info->se_info.bwi_timer);
-	info->se_info.bwi_timer.data = (unsigned long)info;
-	info->se_info.bwi_timer.function = st21nfca_se_wt_timeout;
+	setup_timer(&info->se_info.bwi_timer, st21nfca_se_wt_timeout,
+		    (unsigned long)info);
 	info->se_info.bwi_active = false;
 
-	init_timer(&info->se_info.se_active_timer);
-	info->se_info.se_active_timer.data = (unsigned long)info;
-	info->se_info.se_active_timer.function = st21nfca_se_activation_timeout;
+	setup_timer(&info->se_info.se_active_timer,
+		    st21nfca_se_activation_timeout, (unsigned long)info);
 	info->se_info.se_active = false;
 
 	info->se_info.count_pipes = 0;
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 29f35e29d480..adba91318768 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -119,9 +119,8 @@ struct dasd_device *dasd_alloc_device(void)
 		     (void (*)(unsigned long)) dasd_device_tasklet,
 		     (unsigned long) device);
 	INIT_LIST_HEAD(&device->ccw_queue);
-	init_timer(&device->timer);
-	device->timer.function = dasd_device_timeout;
-	device->timer.data = (unsigned long) device;
+	setup_timer(&device->timer, dasd_device_timeout,
+		    (unsigned long)device);
 	INIT_WORK(&device->kick_work, do_kick_device);
 	INIT_WORK(&device->restore_device, do_restore_device);
 	INIT_WORK(&device->reload_device, do_reload_device);
@@ -163,9 +162,7 @@ struct dasd_block *dasd_alloc_block(void)
 		     (unsigned long) block);
 	INIT_LIST_HEAD(&block->ccw_queue);
 	spin_lock_init(&block->queue_lock);
-	init_timer(&block->timer);
-	block->timer.function = dasd_block_timeout;
-	block->timer.data = (unsigned long) block;
+	setup_timer(&block->timer, dasd_block_timeout, (unsigned long)block);
 	spin_lock_init(&block->profile.lock);
 
 	return block;
diff --git a/drivers/s390/net/fsm.c b/drivers/s390/net/fsm.c
index 8c14c6c3ad3d..16b81be1f07a 100644
--- a/drivers/s390/net/fsm.c
+++ b/drivers/s390/net/fsm.c
@@ -142,13 +142,11 @@ void
 fsm_settimer(fsm_instance *fi, fsm_timer *this)
 {
 	this->fi = fi;
-	this->tl.function = (void *)fsm_expire_timer;
-	this->tl.data = (long)this;
 #if FSM_TIMER_DEBUG
 	printk(KERN_DEBUG "fsm(%s): Create timer %p\n", fi->name,
 	       this);
 #endif
-	init_timer(&this->tl);
+	setup_timer(&this->tl, (void *)fsm_expire_timer, (long)this);
 }
 
 void
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index af032c46ec0e..a54b6c11b505 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -837,10 +837,9 @@ static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	atomic_set(&acb->rq_map_token, 16);
 	atomic_set(&acb->ante_token_value, 16);
 	acb->fw_flag = FW_NORMAL;
-	init_timer(&acb->eternal_timer);
+	setup_timer(&acb->eternal_timer, &arcmsr_request_device_map,
+		    (unsigned long)acb);
 	acb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ);
-	acb->eternal_timer.data = (unsigned long) acb;
-	acb->eternal_timer.function = &arcmsr_request_device_map;
 	add_timer(&acb->eternal_timer);
 	if(arcmsr_alloc_sysfs_attr(acb))
 		goto out_free_sysfs;
@@ -930,10 +929,9 @@ static int arcmsr_resume(struct pci_dev *pdev)
 	atomic_set(&acb->rq_map_token, 16);
 	atomic_set(&acb->ante_token_value, 16);
 	acb->fw_flag = FW_NORMAL;
-	init_timer(&acb->eternal_timer);
+	setup_timer(&acb->eternal_timer, &arcmsr_request_device_map,
+		    (unsigned long)acb);
 	acb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ);
-	acb->eternal_timer.data = (unsigned long) acb;
-	acb->eternal_timer.function = &arcmsr_request_device_map;
 	add_timer(&acb->eternal_timer);
 	return 0;
 controller_stop:
diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c
index 24388795ee9a..7304d5a4fc4f 100644
--- a/drivers/scsi/arm/fas216.c
+++ b/drivers/scsi/arm/fas216.c
@@ -2849,9 +2849,7 @@ int fas216_init(struct Scsi_Host *host)
 	info->rst_dev_status = -1;
 	info->rst_bus_status = -1;
 	init_waitqueue_head(&info->eh_wait);
-	init_timer(&info->eh_timer);
-	info->eh_timer.data  = (unsigned long)info;
-	info->eh_timer.function = fas216_eh_timer;
+	setup_timer(&info->eh_timer, fas216_eh_timer, (unsigned long)info);
 	
 	spin_lock_init(&info->host_lock);
 
diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c
index 5caf5f3ff642..d10826a69725 100644
--- a/drivers/scsi/bfa/bfad.c
+++ b/drivers/scsi/bfa/bfad.c
@@ -719,9 +719,7 @@ bfad_bfa_tmo(unsigned long data)
 void
 bfad_init_timer(struct bfad_s *bfad)
 {
-	init_timer(&bfad->hal_tmo);
-	bfad->hal_tmo.function = bfad_bfa_tmo;
-	bfad->hal_tmo.data = (unsigned long)bfad;
+	setup_timer(&bfad->hal_tmo, bfad_bfa_tmo, (unsigned long)bfad);
 
 	mod_timer(&bfad->hal_tmo,
 		  jiffies + msecs_to_jiffies(BFA_TIMER_FREQ));
diff --git a/drivers/scsi/esas2r/esas2r_main.c b/drivers/scsi/esas2r/esas2r_main.c
index 81f226be3e3b..af4af504a97f 100644
--- a/drivers/scsi/esas2r/esas2r_main.c
+++ b/drivers/scsi/esas2r/esas2r_main.c
@@ -1635,10 +1635,8 @@ static void esas2r_timer_callback(unsigned long context);
 
 void esas2r_kickoff_timer(struct esas2r_adapter *a)
 {
-	init_timer(&a->timer);
+	setup_timer(&a->timer, esas2r_timer_callback, (unsigned long)a);
 
-	a->timer.function = esas2r_timer_callback;
-	a->timer.data = (unsigned long)a;
 	a->timer.expires = jiffies +
 			   msecs_to_jiffies(100);
 
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index 5b93ed810f6e..017216f5e919 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -8357,9 +8357,7 @@ struct Scsi_Host * __init ncr_attach(struct scsi_host_template *tpnt,
 	if (!np->scripth0)
 		goto attach_error;
 
-	init_timer(&np->timer);
-	np->timer.data     = (unsigned long) np;
-	np->timer.function = ncr53c8xx_timeout;
+	setup_timer(&np->timer, ncr53c8xx_timeout, (unsigned long)np);
 
 	/* Try to map the controller chip to virtual and physical memory. */
 
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index d32e3ba8863e..285397d42558 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -1351,9 +1351,7 @@ static struct Scsi_Host *sym_attach(struct scsi_host_template *tpnt, int unit,
 	/*
 	 *  Start the timer daemon
 	 */
-	init_timer(&np->s.timer);
-	np->s.timer.data     = (unsigned long) np;
-	np->s.timer.function = sym53c8xx_timer;
+	setup_timer(&np->s.timer, sym53c8xx_timer, (unsigned long)np);
 	np->s.lasttime=0;
 	sym_timer (np);
 
diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c
index fc7f810baef7..fb8c4bff584c 100644
--- a/drivers/usb/gadget/udc/omap_udc.c
+++ b/drivers/usb/gadget/udc/omap_udc.c
@@ -2542,9 +2542,7 @@ omap_ep_setup(char *name, u8 addr, u8 type,
 		}
 		if (dbuf && addr)
 			epn_rxtx |= UDC_EPN_RX_DB;
-		init_timer(&ep->timer);
-		ep->timer.function = pio_out_timer;
-		ep->timer.data = (unsigned long) ep;
+		setup_timer(&ep->timer, pio_out_timer, (unsigned long)ep);
 	}
 	if (addr)
 		epn_rxtx |= UDC_EPN_RX_VALID;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 03918a19cf2d..5b51d5ba2a85 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -290,8 +290,7 @@ static inline void clocksource_start_watchdog(void)
 {
 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
 		return;
-	init_timer(&watchdog_timer);
-	watchdog_timer.function = clocksource_watchdog;
+	setup_timer(&watchdog_timer, clocksource_watchdog, 0UL);
 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
 	watchdog_running = 1;
-- 
cgit v1.2.3


From e99e88a9d2b067465adaa9c111ada99a041bef9a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 16 Oct 2017 14:43:17 -0700
Subject: treewide: setup_timer() -> timer_setup()

This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.

Casting from unsigned long:

    void my_callback(unsigned long data)
    {
        struct something *ptr = (struct something *)data;
    ...
    }
    ...
    setup_timer(&ptr->my_timer, my_callback, ptr);

and forced object casts:

    void my_callback(struct something *ptr)
    {
    ...
    }
    ...
    setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);

become:

    void my_callback(struct timer_list *t)
    {
        struct something *ptr = from_timer(ptr, t, my_timer);
    ...
    }
    ...
    timer_setup(&ptr->my_timer, my_callback, 0);

Direct function assignments:

    void my_callback(unsigned long data)
    {
        struct something *ptr = (struct something *)data;
    ...
    }
    ...
    ptr->my_timer.function = my_callback;

have a temporary cast added, along with converting the args:

    void my_callback(struct timer_list *t)
    {
        struct something *ptr = from_timer(ptr, t, my_timer);
    ...
    }
    ...
    ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;

And finally, callbacks without a data assignment:

    void my_callback(unsigned long data)
    {
    ...
    }
    ...
    setup_timer(&ptr->my_timer, my_callback, 0);

have their argument renamed to verify they're unused during conversion:

    void my_callback(struct timer_list *unused)
    {
    ...
    }
    ...
    timer_setup(&ptr->my_timer, my_callback, 0);

The conversion is done with the following Coccinelle script:

spatch --very-quiet --all-includes --include-headers \
	-I ./arch/x86/include -I ./arch/x86/include/generated \
	-I ./include -I ./arch/x86/include/uapi \
	-I ./arch/x86/include/generated/uapi -I ./include/uapi \
	-I ./include/generated/uapi --include ./include/linux/kconfig.h \
	--dir . \
	--cocci-file ~/src/data/timer_setup.cocci

@fix_address_of@
expression e;
@@

 setup_timer(
-&(e)
+&e
 , ...)

// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@

(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)

@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@

(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
 _E->_timer@_stl.function = _callback;
|
 _E->_timer@_stl.function = &_callback;
|
 _E->_timer@_stl.function = (_cast_func)_callback;
|
 _E->_timer@_stl.function = (_cast_func)&_callback;
|
 _E._timer@_stl.function = _callback;
|
 _E._timer@_stl.function = &_callback;
|
 _E._timer@_stl.function = (_cast_func)_callback;
|
 _E._timer@_stl.function = (_cast_func)&_callback;
)

// callback(unsigned long arg)
@change_callback_handle_cast
 depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@

 void _callback(
-_origtype _origarg
+struct timer_list *t
 )
 {
(
	... when != _origarg
	_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
	... when != _origarg
|
	... when != _origarg
	_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
	... when != _origarg
|
	... when != _origarg
	_handletype *_handle;
	... when != _handle
	_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
	... when != _origarg
|
	... when != _origarg
	_handletype *_handle;
	... when != _handle
	_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
	... when != _origarg
)
 }

// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
 depends on change_timer_function_usage &&
                     !change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@

 void _callback(
-_origtype _origarg
+struct timer_list *t
 )
 {
+	_handletype *_origarg = from_timer(_origarg, t, _timer);
+
	... when != _origarg
-	(_handletype *)_origarg
+	_origarg
	... when != _origarg
 }

// Avoid already converted callbacks.
@match_callback_converted
 depends on change_timer_function_usage &&
            !change_callback_handle_cast &&
	    !change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@

 void _callback(struct timer_list *t)
 { ... }

// callback(struct something *handle)
@change_callback_handle_arg
 depends on change_timer_function_usage &&
	    !match_callback_converted &&
            !change_callback_handle_cast &&
            !change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@

 void _callback(
-_handletype *_handle
+struct timer_list *t
 )
 {
+	_handletype *_handle = from_timer(_handle, t, _timer);
	...
 }

// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
 depends on change_timer_function_usage &&
	    change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@

 void _callback(struct timer_list *t)
 {
-	_handletype *_handle = from_timer(_handle, t, _timer);
 }

// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
 depends on change_timer_function_usage &&
            !change_callback_handle_cast &&
            !change_callback_handle_cast_no_arg &&
	    !change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@

(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)

// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
 depends on change_timer_function_usage &&
            (change_callback_handle_cast ||
             change_callback_handle_cast_no_arg ||
             change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@

(
 _E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
 ;
)

// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
 depends on change_timer_function_usage &&
            (change_callback_handle_cast ||
             change_callback_handle_cast_no_arg ||
             change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@

 _callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
 )

// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@

(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)

@change_callback_unused_data
 depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@

 void _callback(
-_origtype _origarg
+struct timer_list *unused
 )
 {
	... when != _origarg
 }

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/alpha/kernel/srmcons.c                        |  7 ++---
 arch/arm/mach-iop32x/n2100.c                       |  4 +--
 arch/arm/mach-orion5x/db88f5281-setup.c            |  4 +--
 arch/blackfin/kernel/nmi.c                         |  4 +--
 arch/mips/lasat/picvue_proc.c                      |  4 +--
 arch/powerpc/kernel/tau_6xx.c                      |  4 +--
 arch/powerpc/oprofile/op_model_cell.c              |  8 +++---
 arch/powerpc/platforms/cell/spufs/sched.c          |  8 +++---
 arch/powerpc/platforms/powermac/low_i2c.c          |  6 ++--
 arch/s390/kernel/time.c                            |  4 +--
 arch/sh/drivers/heartbeat.c                        |  6 ++--
 arch/sh/drivers/pci/common.c                       | 14 ++++-----
 arch/sh/drivers/push-switch.c                      |  6 ++--
 block/blk-stat.c                                   |  6 ++--
 block/blk-throttle.c                               |  9 +++---
 drivers/atm/ambassador.c                           |  9 +++---
 drivers/atm/firestream.c                           |  6 ++--
 drivers/atm/horizon.c                              |  8 +++---
 drivers/atm/idt77252.c                             |  6 ++--
 drivers/atm/lanai.c                                |  6 ++--
 drivers/atm/nicstar.c                              |  6 ++--
 drivers/block/DAC960.c                             |  8 +++---
 drivers/block/DAC960.h                             |  2 +-
 drivers/block/rsxx/dma.c                           |  7 ++---
 drivers/block/skd_main.c                           |  6 ++--
 drivers/block/sunvdc.c                             |  9 +++---
 drivers/block/umem.c                               |  4 +--
 drivers/block/xsysace.c                            |  6 ++--
 drivers/char/ipmi/bt-bmc.c                         |  7 ++---
 drivers/char/ipmi/ipmi_msghandler.c                |  4 +--
 drivers/char/ipmi/ipmi_si_intf.c                   |  6 ++--
 drivers/char/ipmi/ipmi_ssif.c                      |  7 ++---
 drivers/char/tpm/tpm-dev-common.c                  |  7 ++---
 drivers/gpu/drm/drm_vblank.c                       | 11 ++++----
 drivers/gpu/drm/exynos/exynos_drm_vidi.c           |  6 ++--
 drivers/gpu/drm/i2c/tda998x_drv.c                  |  7 ++---
 drivers/gpu/drm/msm/adreno/a5xx_preempt.c          |  7 ++---
 drivers/gpu/drm/msm/msm_gpu.c                      |  7 ++---
 drivers/gpu/drm/omapdrm/dss/dsi.c                  |  4 +--
 drivers/gpu/drm/rockchip/rockchip_drm_psr.c        |  6 ++--
 drivers/gpu/drm/vgem/vgem_fence.c                  |  6 ++--
 drivers/gpu/drm/via/via_dmablit.c                  |  7 ++---
 drivers/hid/hid-appleir.c                          |  7 ++---
 drivers/hid/hid-prodikeys.c                        |  7 ++---
 drivers/hid/hid-wiimote-core.c                     |  6 ++--
 drivers/iio/common/ssp_sensors/ssp_dev.c           |  6 ++--
 drivers/infiniband/hw/mlx5/mr.c                    |  6 ++--
 drivers/input/gameport/gameport.c                  |  7 ++---
 drivers/input/joystick/db9.c                       |  6 ++--
 drivers/input/joystick/gamecon.c                   |  6 ++--
 drivers/input/joystick/turbografx.c                |  6 ++--
 drivers/iommu/iova.c                               |  8 +++---
 drivers/isdn/capi/capidrv.c                        |  6 ++--
 drivers/isdn/divert/isdn_divert.c                  |  9 +++---
 drivers/isdn/hardware/eicon/divasi.c               |  9 +++---
 drivers/isdn/hardware/mISDN/hfcmulti.c             |  8 ++----
 drivers/isdn/hardware/mISDN/hfcpci.c               |  5 ++--
 drivers/isdn/hardware/mISDN/mISDNisar.c            | 10 +++----
 drivers/isdn/i4l/isdn_common.c                     |  4 +--
 drivers/isdn/i4l/isdn_net.c                        |  9 +++---
 drivers/isdn/i4l/isdn_ppp.c                        |  9 +++---
 drivers/isdn/i4l/isdn_tty.c                        |  7 ++---
 drivers/media/platform/s5p-mfc/s5p_mfc.c           |  7 ++---
 .../media/platform/sti/c8sectpfe/c8sectpfe-core.c  |  7 ++---
 drivers/media/platform/vim2m.c                     |  6 ++--
 drivers/media/usb/au0828/au0828-dvb.c              |  7 ++---
 drivers/media/usb/au0828/au0828-video.c            | 14 ++++-----
 drivers/memstick/core/ms_block.c                   |  7 ++---
 drivers/mfd/rtsx_usb.c                             |  6 ++--
 drivers/mmc/core/host.c                            |  6 ++--
 drivers/mtd/sm_ftl.c                               |  6 ++--
 drivers/net/caif/caif_hsi.c                        | 21 ++++++--------
 drivers/net/dsa/mv88e6xxx/phy.c                    |  7 ++---
 drivers/net/eql.c                                  |  6 ++--
 drivers/net/ethernet/adi/bfin_mac.c                |  9 +++---
 drivers/net/ethernet/agere/et131x.c                |  7 ++---
 drivers/net/ethernet/amazon/ena/ena_netdev.c       |  7 ++---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c    | 14 ++++-----
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c    |  8 +++---
 drivers/net/ethernet/atheros/atl1e/atl1e_main.c    |  8 +++---
 drivers/net/ethernet/atheros/atlx/atl1.c           |  8 +++---
 drivers/net/ethernet/atheros/atlx/atl2.c           | 15 +++++-----
 drivers/net/ethernet/broadcom/b44.c                |  6 ++--
 drivers/net/ethernet/broadcom/bnx2.c               |  6 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   |  6 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |  6 ++--
 drivers/net/ethernet/broadcom/tg3.c                |  6 ++--
 drivers/net/ethernet/cisco/enic/enic_main.c        |  7 ++---
 drivers/net/ethernet/marvell/mv643xx_eth.c         | 13 ++++-----
 drivers/net/ethernet/marvell/pxa168_eth.c          |  7 ++---
 drivers/net/ethernet/marvell/skge.c                |  6 ++--
 drivers/net/ethernet/marvell/sky2.c                |  6 ++--
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c   |  7 ++---
 .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c   |  8 +++---
 drivers/net/ethernet/pasemi/pasemi_mac.c           |  7 ++---
 drivers/net/ethernet/qlogic/qla3xxx.c              |  6 ++--
 drivers/net/ethernet/rocker/rocker_ofdpa.c         |  7 ++---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 15 +++++-----
 drivers/net/ethernet/synopsys/dwc-xlgmac-net.c     |  7 ++---
 drivers/net/ethernet/ti/cpsw_ale.c                 |  6 ++--
 drivers/net/ethernet/ti/netcp_ethss.c              |  7 ++---
 drivers/net/ethernet/toshiba/spider_net.c          | 16 +++++------
 drivers/net/slip/slip.c                            | 16 +++++------
 drivers/net/tun.c                                  |  8 ++++--
 drivers/net/wan/hdlc_ppp.c                         |  6 ++--
 .../wireless/broadcom/brcm80211/brcmfmac/btcoex.c  |  6 ++--
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |  7 ++---
 .../wireless/broadcom/brcm80211/brcmfmac/sdio.c    |  7 ++---
 drivers/net/wireless/intel/iwlwifi/dvm/main.c      | 14 ++++-----
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c       |  7 ++---
 drivers/net/wireless/intersil/hostap/hostap_ap.c   |  6 ++--
 drivers/net/wireless/intersil/hostap/hostap_hw.c   | 14 ++++-----
 .../net/wireless/intersil/orinoco/orinoco_usb.c    |  6 ++--
 drivers/net/wireless/quantenna/qtnfmac/core.c      |  2 +-
 drivers/net/wireless/ti/wlcore/main.c              |  7 ++---
 drivers/net/xen-netfront.c                         |  7 ++---
 drivers/nfc/pn533/pn533.c                          |  7 ++---
 drivers/nfc/st-nci/ndlc.c                          | 12 ++++----
 drivers/ntb/test/ntb_pingpong.c                    |  8 +++---
 drivers/platform/x86/sony-laptop.c                 |  4 +--
 drivers/pps/clients/pps-ktimer.c                   |  4 +--
 drivers/rtc/rtc-dev.c                              |  6 ++--
 drivers/s390/block/dasd.c                          | 17 ++++++-----
 drivers/s390/net/fsm.c                             |  9 +++---
 drivers/scsi/arcmsr/arcmsr_hba.c                   | 12 ++++----
 drivers/scsi/arm/fas216.c                          |  6 ++--
 drivers/scsi/bfa/bfad.c                            |  6 ++--
 drivers/scsi/bfa/bfad_drv.h                        |  2 +-
 drivers/scsi/bnx2fc/bnx2fc_tgt.c                   | 16 +++++------
 drivers/scsi/esas2r/esas2r_main.c                  |  8 +++---
 drivers/scsi/fcoe/fcoe_ctlr.c                      |  8 +++---
 drivers/scsi/fnic/fnic_main.c                      | 14 ++++-----
 drivers/scsi/ncr53c8xx.c                           |  6 ++--
 drivers/staging/greybus/operation.c                |  7 ++---
 drivers/staging/lustre/lnet/lnet/net_fault.c       |  6 ++--
 drivers/staging/lustre/lustre/ptlrpc/service.c     |  9 +++---
 drivers/staging/media/imx/imx-ic-prpencvf.c        |  7 ++---
 drivers/staging/media/imx/imx-media-csi.c          |  7 ++---
 drivers/staging/most/hdm-usb/hdm_usb.c             |  7 ++---
 .../staging/rtl8192u/ieee80211/ieee80211_softmac.c | 16 +++++------
 drivers/staging/rtl8712/recv_linux.c               |  9 +++---
 drivers/staging/rtl8712/rtl8712_led.c              |  9 +++---
 drivers/staging/unisys/visorbus/visorbus_main.c    |  6 ++--
 drivers/staging/unisys/visornic/visornic_main.c    |  8 +++---
 drivers/staging/wilc1000/wilc_wfi_cfgoperations.c  |  8 +++---
 drivers/target/target_core_user.c                  |  7 ++---
 drivers/tty/ipwireless/hardware.c                  | 11 ++++----
 drivers/tty/n_gsm.c                                | 12 ++++----
 drivers/tty/n_r3964.c                              |  8 +++---
 drivers/tty/serial/crisv10.c                       |  4 +--
 drivers/tty/serial/fsl_lpuart.c                    |  7 ++---
 drivers/tty/serial/ifx6x60.c                       |  7 ++---
 drivers/tty/serial/imx.c                           |  6 ++--
 drivers/tty/serial/kgdb_nmi.c                      |  6 ++--
 drivers/tty/serial/max3100.c                       |  7 ++---
 drivers/tty/serial/mux.c                           |  4 +--
 drivers/tty/serial/pnx8xxx_uart.c                  |  7 ++---
 drivers/tty/serial/sa1100.c                        |  7 ++---
 drivers/tty/serial/sh-sci.c                        | 16 +++++------
 drivers/tty/serial/sn_console.c                    |  6 ++--
 drivers/tty/synclink.c                             |  8 +++---
 drivers/tty/synclink_gt.c                          | 16 +++++------
 drivers/tty/synclinkmp.c                           | 17 ++++++-----
 drivers/usb/core/hcd.c                             |  8 ++++--
 drivers/usb/dwc2/hcd.c                             |  7 ++---
 drivers/usb/dwc2/hcd_queue.c                       |  7 ++---
 drivers/usb/gadget/udc/at91_udc.c                  |  7 ++---
 drivers/usb/gadget/udc/dummy_hcd.c                 |  8 +++---
 drivers/usb/gadget/udc/m66592-udc.c                |  6 ++--
 drivers/usb/gadget/udc/omap_udc.c                  |  6 ++--
 drivers/usb/gadget/udc/pxa25x_udc.c                |  6 ++--
 drivers/usb/gadget/udc/r8a66597-udc.c              |  6 ++--
 drivers/usb/host/ohci-hcd.c                        |  9 +++---
 drivers/usb/host/oxu210hp-hcd.c                    |  6 ++--
 drivers/usb/host/r8a66597-hcd.c                    |  7 ++---
 drivers/usb/host/sl811-hcd.c                       |  6 ++--
 drivers/usb/host/uhci-hcd.c                        |  3 +-
 drivers/usb/host/uhci-q.c                          |  4 +--
 drivers/usb/host/xhci.c                            |  8 +++---
 drivers/usb/serial/mos7840.c                       | 15 +++++-----
 drivers/usb/storage/realtek_cr.c                   |  7 ++---
 drivers/uwb/drp.c                                  |  6 ++--
 drivers/uwb/neh.c                                  |  8 +++---
 drivers/uwb/rsv.c                                  | 15 +++++-----
 drivers/uwb/uwb-internal.h                         |  2 +-
 drivers/watchdog/at91sam9_wdt.c                    |  6 ++--
 drivers/watchdog/bcm47xx_wdt.c                     |  9 +++---
 drivers/watchdog/bcm63xx_wdt.c                     |  4 +--
 drivers/watchdog/cpu5wdt.c                         |  4 +--
 drivers/watchdog/mpc8xxx_wdt.c                     |  7 ++---
 drivers/watchdog/mtx-1_wdt.c                       |  4 +--
 drivers/watchdog/nuc900_wdt.c                      |  4 +--
 drivers/watchdog/pcwd.c                            |  4 +--
 drivers/watchdog/pika_wdt.c                        |  4 +--
 drivers/watchdog/rdc321x_wdt.c                     |  4 +--
 drivers/watchdog/shwdt.c                           |  6 ++--
 fs/ocfs2/cluster/tcp.c                             |  9 +++---
 kernel/padata.c                                    |  6 ++--
 kernel/time/clocksource.c                          |  4 +--
 net/802/garp.c                                     |  6 ++--
 net/802/mrp.c                                      | 13 ++++-----
 net/appletalk/aarp.c                               |  4 +--
 net/appletalk/ddp.c                                |  7 ++---
 net/batman-adv/tp_meter.c                          | 14 ++++-----
 net/bluetooth/hidp/core.c                          |  7 ++---
 net/bluetooth/rfcomm/core.c                        | 12 ++++----
 net/bluetooth/sco.c                                |  6 ++--
 net/core/drop_monitor.c                            |  7 ++---
 net/core/gen_estimator.c                           |  6 ++--
 net/core/neighbour.c                               | 14 ++++-----
 net/decnet/dn_route.c                              |  4 +--
 net/decnet/dn_timer.c                              |  8 +++---
 net/ipv4/igmp.c                                    | 20 ++++++-------
 net/ipv4/ipmr.c                                    |  9 +++---
 net/ipv6/addrconf.c                                |  9 +++---
 net/ipv6/ip6mr.c                                   |  9 +++---
 net/ipv6/mcast.c                                   | 33 ++++++++++------------
 net/ncsi/ncsi-manage.c                             |  8 ++----
 net/netfilter/nf_conntrack_expect.c                |  7 ++---
 net/netfilter/nfnetlink_log.c                      |  8 +++---
 net/netfilter/xt_IDLETIMER.c                       |  7 ++---
 net/netfilter/xt_LED.c                             |  8 +++---
 net/nfc/nci/core.c                                 | 14 ++++-----
 net/rxrpc/call_object.c                            |  7 ++---
 net/wireless/lib80211.c                            | 11 ++++----
 net/x25/x25_link.c                                 |  8 +++---
 net/xfrm/xfrm_state.c                              |  9 +++---
 227 files changed, 824 insertions(+), 937 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c
index 5da0aec8ce90..438b10c44d73 100644
--- a/arch/alpha/kernel/srmcons.c
+++ b/arch/alpha/kernel/srmcons.c
@@ -65,9 +65,9 @@ srmcons_do_receive_chars(struct tty_port *port)
 }
 
 static void
-srmcons_receive_chars(unsigned long data)
+srmcons_receive_chars(struct timer_list *t)
 {
-	struct srmcons_private *srmconsp = (struct srmcons_private *)data;
+	struct srmcons_private *srmconsp = from_timer(srmconsp, t, timer);
 	struct tty_port *port = &srmconsp->port;
 	unsigned long flags;
 	int incr = 10;
@@ -206,8 +206,7 @@ static const struct tty_operations srmcons_ops = {
 static int __init
 srmcons_init(void)
 {
-	setup_timer(&srmcons_singleton.timer, srmcons_receive_chars,
-			(unsigned long)&srmcons_singleton);
+	timer_setup(&srmcons_singleton.timer, srmcons_receive_chars, 0);
 	if (srm_is_registered_console) {
 		struct tty_driver *driver;
 		int err;
diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c
index 4a64a11ba63c..3b73813c6b04 100644
--- a/arch/arm/mach-iop32x/n2100.c
+++ b/arch/arm/mach-iop32x/n2100.c
@@ -305,7 +305,7 @@ static void n2100_restart(enum reboot_mode mode, const char *cmd)
 
 static struct timer_list power_button_poll_timer;
 
-static void power_button_poll(unsigned long dummy)
+static void power_button_poll(struct timer_list *unused)
 {
 	if (gpio_get_value(N2100_POWER_BUTTON) == 0) {
 		ctrl_alt_del();
@@ -336,7 +336,7 @@ static int __init n2100_request_gpios(void)
 			pr_err("could not set power GPIO as input\n");
 	}
 	/* Set up power button poll timer */
-	setup_timer(&power_button_poll_timer, power_button_poll, 0UL);
+	timer_setup(&power_button_poll_timer, power_button_poll, 0);
 	power_button_poll_timer.expires = jiffies + (HZ / 10);
 	add_timer(&power_button_poll_timer);
 	return 0;
diff --git a/arch/arm/mach-orion5x/db88f5281-setup.c b/arch/arm/mach-orion5x/db88f5281-setup.c
index 3f5863de766a..39eae10ac8de 100644
--- a/arch/arm/mach-orion5x/db88f5281-setup.c
+++ b/arch/arm/mach-orion5x/db88f5281-setup.c
@@ -172,7 +172,7 @@ static struct platform_device db88f5281_nand_flash = {
 static void __iomem *db88f5281_7seg;
 static struct timer_list db88f5281_timer;
 
-static void db88f5281_7seg_event(unsigned long data)
+static void db88f5281_7seg_event(struct timer_list *unused)
 {
 	static int count = 0;
 	writel(0, db88f5281_7seg + (count << 4));
@@ -189,7 +189,7 @@ static int __init db88f5281_7seg_init(void)
 			printk(KERN_ERR "Failed to ioremap db88f5281_7seg\n");
 			return -EIO;
 		}
-		setup_timer(&db88f5281_timer, db88f5281_7seg_event, 0);
+		timer_setup(&db88f5281_timer, db88f5281_7seg_event, 0);
 		mod_timer(&db88f5281_timer, jiffies + 2 * HZ);
 	}
 
diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c
index 828f4fbdb58a..8a211d95821f 100644
--- a/arch/blackfin/kernel/nmi.c
+++ b/arch/blackfin/kernel/nmi.c
@@ -166,7 +166,7 @@ int check_nmi_wdt_touched(void)
 	return 1;
 }
 
-static void nmi_wdt_timer(unsigned long data)
+static void nmi_wdt_timer(struct timer_list *unused)
 {
 	if (check_nmi_wdt_touched())
 		nmi_wdt_keepalive();
@@ -180,7 +180,7 @@ static int __init init_nmi_wdt(void)
 	nmi_wdt_start();
 	nmi_active = true;
 
-	setup_timer(&ntimer, nmi_wdt_timer, 0UL);
+	timer_setup(&ntimer, nmi_wdt_timer, 0);
 	ntimer.expires = jiffies + NMI_CHECK_TIMEOUT;
 	add_timer(&ntimer);
 
diff --git a/arch/mips/lasat/picvue_proc.c b/arch/mips/lasat/picvue_proc.c
index a8103f6972cd..5d89e1ec5fcc 100644
--- a/arch/mips/lasat/picvue_proc.c
+++ b/arch/mips/lasat/picvue_proc.c
@@ -156,7 +156,7 @@ static const struct file_operations pvc_scroll_proc_fops = {
 	.write		= pvc_scroll_proc_write,
 };
 
-void pvc_proc_timerfunc(unsigned long data)
+void pvc_proc_timerfunc(struct timer_list *unused)
 {
 	if (scroll_dir < 0)
 		pvc_move(DISPLAY|RIGHT);
@@ -197,7 +197,7 @@ static int __init pvc_proc_init(void)
 	if (proc_entry == NULL)
 		goto error;
 
-	setup_timer(&timer, pvc_proc_timerfunc, 0UL);
+	timer_setup(&timer, pvc_proc_timerfunc, 0);
 
 	return 0;
 error:
diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index e3c5f75d137c..8cdd852aedd1 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -188,7 +188,7 @@ static void tau_timeout(void * info)
 	local_irq_restore(flags);
 }
 
-static void tau_timeout_smp(unsigned long unused)
+static void tau_timeout_smp(struct timer_list *unused)
 {
 
 	/* schedule ourselves to be run again */
@@ -230,7 +230,7 @@ int __init TAU_init(void)
 
 
 	/* first, set up the window shrinking timer */
-	setup_timer(&tau_timer, tau_timeout_smp, 0UL);
+	timer_setup(&tau_timer, tau_timeout_smp, 0);
 	tau_timer.expires = jiffies + shrink_timer;
 	add_timer(&tau_timer);
 
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index 264b6ab11978..b90a21bc2f3f 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -451,7 +451,7 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 *pm07_cntrl)
  * This routine will alternate loading the virtual counters for
  * virtual CPUs
  */
-static void cell_virtual_cntr(unsigned long data)
+static void cell_virtual_cntr(struct timer_list *unused)
 {
 	int i, prev_hdw_thread, next_hdw_thread;
 	u32 cpu;
@@ -555,7 +555,7 @@ static void cell_virtual_cntr(unsigned long data)
 
 static void start_virt_cntrs(void)
 {
-	setup_timer(&timer_virt_cntr, cell_virtual_cntr, 0UL);
+	timer_setup(&timer_virt_cntr, cell_virtual_cntr, 0);
 	timer_virt_cntr.expires = jiffies + HZ / 10;
 	add_timer(&timer_virt_cntr);
 }
@@ -587,7 +587,7 @@ static int cell_reg_setup_spu_cycles(struct op_counter_config *ctr,
  * periodically based on kernel timer to switch which SPU is
  * being monitored in a round robbin fashion.
  */
-static void spu_evnt_swap(unsigned long data)
+static void spu_evnt_swap(struct timer_list *unused)
 {
 	int node;
 	int cur_phys_spu, nxt_phys_spu, cur_spu_evnt_phys_spu_indx;
@@ -677,7 +677,7 @@ static void spu_evnt_swap(unsigned long data)
 
 static void start_spu_event_swap(void)
 {
-	setup_timer(&timer_spu_event_swap, spu_evnt_swap, 0UL);
+	timer_setup(&timer_spu_event_swap, spu_evnt_swap, 0);
 	timer_spu_event_swap.expires = jiffies + HZ / 25;
 	add_timer(&timer_spu_event_swap);
 }
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index e47761cdcb98..9033c8194eda 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -992,13 +992,13 @@ static void spu_calc_load(void)
 	CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
 }
 
-static void spusched_wake(unsigned long data)
+static void spusched_wake(struct timer_list *unused)
 {
 	mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 	wake_up_process(spusched_task);
 }
 
-static void spuloadavg_wake(unsigned long data)
+static void spuloadavg_wake(struct timer_list *unused)
 {
 	mod_timer(&spuloadavg_timer, jiffies + LOAD_FREQ);
 	spu_calc_load();
@@ -1124,8 +1124,8 @@ int __init spu_sched_init(void)
 	}
 	spin_lock_init(&spu_prio->runq_lock);
 
-	setup_timer(&spusched_timer, spusched_wake, 0);
-	setup_timer(&spuloadavg_timer, spuloadavg_wake, 0);
+	timer_setup(&spusched_timer, spusched_wake, 0);
+	timer_setup(&spuloadavg_timer, spuloadavg_wake, 0);
 
 	spusched_task = kthread_run(spusched_thread, NULL, "spusched");
 	if (IS_ERR(spusched_task)) {
diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
index 39a1d4225e0f..3408f315ef48 100644
--- a/arch/powerpc/platforms/powermac/low_i2c.c
+++ b/arch/powerpc/platforms/powermac/low_i2c.c
@@ -361,9 +361,9 @@ static irqreturn_t kw_i2c_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static void kw_i2c_timeout(unsigned long data)
+static void kw_i2c_timeout(struct timer_list *t)
 {
-	struct pmac_i2c_host_kw *host = (struct pmac_i2c_host_kw *)data;
+	struct pmac_i2c_host_kw *host = from_timer(host, t, timeout_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&host->lock, flags);
@@ -513,7 +513,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np)
 	mutex_init(&host->mutex);
 	init_completion(&host->complete);
 	spin_lock_init(&host->lock);
-	setup_timer(&host->timeout_timer, kw_i2c_timeout, (unsigned long)host);
+	timer_setup(&host->timeout_timer, kw_i2c_timeout, 0);
 
 	psteps = of_get_property(np, "AAPL,address-step", NULL);
 	steps = psteps ? (*psteps) : 0x10;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 5cbd52169348..be6198193ec2 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -523,7 +523,7 @@ static void __init stp_reset(void)
 	}
 }
 
-static void stp_timeout(unsigned long dummy)
+static void stp_timeout(struct timer_list *unused)
 {
 	queue_work(time_sync_wq, &stp_work);
 }
@@ -532,7 +532,7 @@ static int __init stp_init(void)
 {
 	if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
 		return 0;
-	setup_timer(&stp_timer, stp_timeout, 0UL);
+	timer_setup(&stp_timer, stp_timeout, 0);
 	time_init_wq();
 	if (!stp_online)
 		return 0;
diff --git a/arch/sh/drivers/heartbeat.c b/arch/sh/drivers/heartbeat.c
index c6d96049a0bb..e8af2ff29bc3 100644
--- a/arch/sh/drivers/heartbeat.c
+++ b/arch/sh/drivers/heartbeat.c
@@ -59,9 +59,9 @@ static inline void heartbeat_toggle_bit(struct heartbeat_data *hd,
 	}
 }
 
-static void heartbeat_timer(unsigned long data)
+static void heartbeat_timer(struct timer_list *t)
 {
-	struct heartbeat_data *hd = (struct heartbeat_data *)data;
+	struct heartbeat_data *hd = from_timer(hd, t, timer);
 	static unsigned bit = 0, up = 1;
 
 	heartbeat_toggle_bit(hd, bit, hd->flags & HEARTBEAT_INVERTED);
@@ -133,7 +133,7 @@ static int heartbeat_drv_probe(struct platform_device *pdev)
 		}
 	}
 
-	setup_timer(&hd->timer, heartbeat_timer, (unsigned long)hd);
+	timer_setup(&hd->timer, heartbeat_timer, 0);
 	platform_set_drvdata(pdev, hd);
 
 	return mod_timer(&hd->timer, jiffies + 1);
diff --git a/arch/sh/drivers/pci/common.c b/arch/sh/drivers/pci/common.c
index 0d7eb7b5ac8d..fe163ecd0719 100644
--- a/arch/sh/drivers/pci/common.c
+++ b/arch/sh/drivers/pci/common.c
@@ -85,18 +85,18 @@ int __init pci_is_66mhz_capable(struct pci_channel *hose,
 	return cap66 > 0;
 }
 
-static void pcibios_enable_err(unsigned long __data)
+static void pcibios_enable_err(struct timer_list *t)
 {
-	struct pci_channel *hose = (struct pci_channel *)__data;
+	struct pci_channel *hose = from_timer(hose, t, err_timer);
 
 	del_timer(&hose->err_timer);
 	printk(KERN_DEBUG "PCI: re-enabling error IRQ.\n");
 	enable_irq(hose->err_irq);
 }
 
-static void pcibios_enable_serr(unsigned long __data)
+static void pcibios_enable_serr(struct timer_list *t)
 {
-	struct pci_channel *hose = (struct pci_channel *)__data;
+	struct pci_channel *hose = from_timer(hose, t, serr_timer);
 
 	del_timer(&hose->serr_timer);
 	printk(KERN_DEBUG "PCI: re-enabling system error IRQ.\n");
@@ -106,13 +106,11 @@ static void pcibios_enable_serr(unsigned long __data)
 void pcibios_enable_timers(struct pci_channel *hose)
 {
 	if (hose->err_irq) {
-		setup_timer(&hose->err_timer, pcibios_enable_err,
-			    (unsigned long)hose);
+		timer_setup(&hose->err_timer, pcibios_enable_err, 0);
 	}
 
 	if (hose->serr_irq) {
-		setup_timer(&hose->serr_timer, pcibios_enable_serr,
-			    (unsigned long)hose);
+		timer_setup(&hose->serr_timer, pcibios_enable_serr, 0);
 	}
 }
 
diff --git a/arch/sh/drivers/push-switch.c b/arch/sh/drivers/push-switch.c
index 2dc791507968..a17181160233 100644
--- a/arch/sh/drivers/push-switch.c
+++ b/arch/sh/drivers/push-switch.c
@@ -26,9 +26,9 @@ static ssize_t switch_show(struct device *dev,
 }
 static DEVICE_ATTR(switch, S_IRUGO, switch_show, NULL);
 
-static void switch_timer(unsigned long data)
+static void switch_timer(struct timer_list *t)
 {
-	struct push_switch *psw = (struct push_switch *)data;
+	struct push_switch *psw = from_timer(psw, t, debounce);
 
 	schedule_work(&psw->work);
 }
@@ -78,7 +78,7 @@ static int switch_drv_probe(struct platform_device *pdev)
 	}
 
 	INIT_WORK(&psw->work, switch_work_handler);
-	setup_timer(&psw->debounce, switch_timer, (unsigned long)psw);
+	timer_setup(&psw->debounce, switch_timer, 0);
 
 	/* Workqueue API brain-damage */
 	psw->pdev = pdev;
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 3a2f3c96f367..28003bf9941c 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -79,9 +79,9 @@ void blk_stat_add(struct request *rq)
 	rcu_read_unlock();
 }
 
-static void blk_stat_timer_fn(unsigned long data)
+static void blk_stat_timer_fn(struct timer_list *t)
 {
-	struct blk_stat_callback *cb = (void *)data;
+	struct blk_stat_callback *cb = from_timer(cb, t, timer);
 	unsigned int bucket;
 	int cpu;
 
@@ -130,7 +130,7 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
 	cb->bucket_fn = bucket_fn;
 	cb->data = data;
 	cb->buckets = buckets;
-	setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+	timer_setup(&cb->timer, blk_stat_timer_fn, 0);
 
 	return cb;
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 96ad32623427..825bc29767e6 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -225,7 +225,7 @@ struct throtl_data
 	bool track_bio_latency;
 };
 
-static void throtl_pending_timer_fn(unsigned long arg);
+static void throtl_pending_timer_fn(struct timer_list *t);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
 {
@@ -478,8 +478,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
 	INIT_LIST_HEAD(&sq->queued[0]);
 	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
-	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
-		    (unsigned long)sq);
+	timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
 }
 
 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
@@ -1249,9 +1248,9 @@ static bool throtl_can_upgrade(struct throtl_data *td,
  * the top-level service_tree is reached, throtl_data->dispatch_work is
  * kicked so that the ready bio's are issued.
  */
-static void throtl_pending_timer_fn(unsigned long arg)
+static void throtl_pending_timer_fn(struct timer_list *t)
 {
-	struct throtl_service_queue *sq = (void *)arg;
+	struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
 	struct throtl_grp *tg = sq_to_tg(sq);
 	struct throtl_data *td = sq_to_td(sq);
 	struct request_queue *q = td->queue;
diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c
index acf16c323e38..dd286ad404f8 100644
--- a/drivers/atm/ambassador.c
+++ b/drivers/atm/ambassador.c
@@ -293,7 +293,7 @@ static inline void __init show_version (void) {
   
 */
 
-static void do_housekeeping (unsigned long arg);
+static void do_housekeeping (struct timer_list *t);
 /********** globals **********/
 
 static unsigned short debug = 0;
@@ -1493,8 +1493,8 @@ static const struct atmdev_ops amb_ops = {
 };
 
 /********** housekeeping **********/
-static void do_housekeeping (unsigned long arg) {
-  amb_dev * dev = (amb_dev *) arg;
+static void do_housekeeping (struct timer_list *t) {
+  amb_dev * dev = from_timer(dev, t, housekeeping);
   
   // could collect device-specific (not driver/atm-linux) stats here
       
@@ -2267,8 +2267,7 @@ static int amb_probe(struct pci_dev *pci_dev,
 	dev->atm_dev->ci_range.vpi_bits = NUM_VPI_BITS;
 	dev->atm_dev->ci_range.vci_bits = NUM_VCI_BITS;
 
-	setup_timer(&dev->housekeeping, do_housekeeping,
-		    (unsigned long)dev);
+	timer_setup(&dev->housekeeping, do_housekeeping, 0);
 	mod_timer(&dev->housekeeping, jiffies);
 
 	// enable host interrupts
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index 534001270be5..d97c05690faa 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -1656,9 +1656,9 @@ static irqreturn_t fs_irq (int irq, void *dev_id)
 
 
 #ifdef FS_POLL_FREQ
-static void fs_poll (unsigned long data)
+static void fs_poll (struct timer_list *t)
 {
-	struct fs_dev *dev = (struct fs_dev *) data;
+	struct fs_dev *dev = from_timer(dev, t, timer);
   
 	fs_irq (0, dev);
 	dev->timer.expires = jiffies + FS_POLL_FREQ;
@@ -1885,7 +1885,7 @@ static int fs_init(struct fs_dev *dev)
 	}
 
 #ifdef FS_POLL_FREQ
-	setup_timer (&dev->timer, fs_poll, (unsigned long)dev);
+	timer_setup(&dev->timer, fs_poll, 0);
 	dev->timer.expires = jiffies + FS_POLL_FREQ;
 	add_timer (&dev->timer);
 #endif
diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c
index e121b8485731..5ddc203206b8 100644
--- a/drivers/atm/horizon.c
+++ b/drivers/atm/horizon.c
@@ -357,7 +357,7 @@ static inline void __init show_version (void) {
 
 /********** globals **********/
 
-static void do_housekeeping (unsigned long arg);
+static void do_housekeeping (struct timer_list *t);
 
 static unsigned short debug = 0;
 static unsigned short vpi_bits = 0;
@@ -1418,9 +1418,9 @@ static irqreturn_t interrupt_handler(int irq, void *dev_id)
 
 /********** housekeeping **********/
 
-static void do_housekeeping (unsigned long arg) {
+static void do_housekeeping (struct timer_list *t) {
   // just stats at the moment
-  hrz_dev * dev = (hrz_dev *) arg;
+  hrz_dev * dev = from_timer(dev, t, housekeeping);
 
   // collect device-specific (not driver/atm-linux) stats here
   dev->tx_cell_count += rd_regw (dev, TX_CELL_COUNT_OFF);
@@ -2796,7 +2796,7 @@ static int hrz_probe(struct pci_dev *pci_dev,
 	dev->atm_dev->ci_range.vpi_bits = vpi_bits;
 	dev->atm_dev->ci_range.vci_bits = 10-vpi_bits;
 
-	setup_timer(&dev->housekeeping, do_housekeeping, (unsigned long) dev);
+	timer_setup(&dev->housekeeping, do_housekeeping, 0);
 	mod_timer(&dev->housekeeping, jiffies);
 
 out:
diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c
index 0e3b9c44c808..0277f36be85b 100644
--- a/drivers/atm/idt77252.c
+++ b/drivers/atm/idt77252.c
@@ -1528,9 +1528,9 @@ idt77252_tx(struct idt77252_dev *card)
 
 
 static void
-tst_timer(unsigned long data)
+tst_timer(struct timer_list *t)
 {
-	struct idt77252_dev *card = (struct idt77252_dev *)data;
+	struct idt77252_dev *card = from_timer(card, t, tst_timer);
 	unsigned long base, idle, jump;
 	unsigned long flags;
 	u32 pc;
@@ -3634,7 +3634,7 @@ static int idt77252_init_one(struct pci_dev *pcidev,
 	spin_lock_init(&card->cmd_lock);
 	spin_lock_init(&card->tst_lock);
 
-	setup_timer(&card->tst_timer, tst_timer, (unsigned long)card);
+	timer_setup(&card->tst_timer, tst_timer, 0);
 
 	/* Do the I/O remapping... */
 	card->membase = ioremap(membase, 1024);
diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index 87e8b5dfac39..6664aa50789e 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -1761,9 +1761,9 @@ static void iter_dequeue(struct lanai_dev *lanai, vci_t vci)
 }
 #endif /* !DEBUG_RW */
 
-static void lanai_timed_poll(unsigned long arg)
+static void lanai_timed_poll(struct timer_list *t)
 {
-	struct lanai_dev *lanai = (struct lanai_dev *) arg;
+	struct lanai_dev *lanai = from_timer(lanai, t, timer);
 #ifndef DEBUG_RW
 	unsigned long flags;
 #ifdef USE_POWERDOWN
@@ -1790,7 +1790,7 @@ static void lanai_timed_poll(unsigned long arg)
 
 static inline void lanai_timed_poll_start(struct lanai_dev *lanai)
 {
-	setup_timer(&lanai->timer, lanai_timed_poll, (unsigned long)lanai);
+	timer_setup(&lanai->timer, lanai_timed_poll, 0);
 	lanai->timer.expires = jiffies + LANAI_POLL_PERIOD;
 	add_timer(&lanai->timer);
 }
diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
index 335447ed0ba4..cbec9adc01c7 100644
--- a/drivers/atm/nicstar.c
+++ b/drivers/atm/nicstar.c
@@ -145,7 +145,7 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user * arg);
 #ifdef EXTRA_DEBUG
 static void which_list(ns_dev * card, struct sk_buff *skb);
 #endif
-static void ns_poll(unsigned long arg);
+static void ns_poll(struct timer_list *unused);
 static void ns_phy_put(struct atm_dev *dev, unsigned char value,
 		       unsigned long addr);
 static unsigned char ns_phy_get(struct atm_dev *dev, unsigned long addr);
@@ -284,7 +284,7 @@ static int __init nicstar_init(void)
 	XPRINTK("nicstar: nicstar_init() returned.\n");
 
 	if (!error) {
-		setup_timer(&ns_timer, ns_poll, 0UL);
+		timer_setup(&ns_timer, ns_poll, 0);
 		ns_timer.expires = jiffies + NS_POLL_PERIOD;
 		add_timer(&ns_timer);
 	}
@@ -2679,7 +2679,7 @@ static void which_list(ns_dev * card, struct sk_buff *skb)
 }
 #endif /* EXTRA_DEBUG */
 
-static void ns_poll(unsigned long arg)
+static void ns_poll(struct timer_list *unused)
 {
 	int i;
 	ns_dev *card;
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 6f14cdd6015b..442e777bdfb2 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3079,8 +3079,8 @@ DAC960_InitializeController(DAC960_Controller_T *Controller)
       /*
 	Initialize the Monitoring Timer.
       */
-      setup_timer(&Controller->MonitoringTimer,
-                  DAC960_MonitoringTimerFunction, (unsigned long)Controller);
+      timer_setup(&Controller->MonitoringTimer,
+                  DAC960_MonitoringTimerFunction, 0);
       Controller->MonitoringTimer.expires =
 	jiffies + DAC960_MonitoringTimerInterval;
       add_timer(&Controller->MonitoringTimer);
@@ -5619,9 +5619,9 @@ static void DAC960_V2_QueueMonitoringCommand(DAC960_Command_T *Command)
   the status of DAC960 Controllers.
 */
 
-static void DAC960_MonitoringTimerFunction(unsigned long TimerData)
+static void DAC960_MonitoringTimerFunction(struct timer_list *t)
 {
-  DAC960_Controller_T *Controller = (DAC960_Controller_T *) TimerData;
+  DAC960_Controller_T *Controller = from_timer(Controller, t, MonitoringTimer);
   DAC960_Command_T *Command;
   unsigned long flags;
 
diff --git a/drivers/block/DAC960.h b/drivers/block/DAC960.h
index 85fa9bb63759..6a6226a2b932 100644
--- a/drivers/block/DAC960.h
+++ b/drivers/block/DAC960.h
@@ -4406,7 +4406,7 @@ static irqreturn_t DAC960_PD_InterruptHandler(int, void *);
 static irqreturn_t DAC960_P_InterruptHandler(int, void *);
 static void DAC960_V1_QueueMonitoringCommand(DAC960_Command_T *);
 static void DAC960_V2_QueueMonitoringCommand(DAC960_Command_T *);
-static void DAC960_MonitoringTimerFunction(unsigned long);
+static void DAC960_MonitoringTimerFunction(struct timer_list *);
 static void DAC960_Message(DAC960_MessageLevel_T, unsigned char *,
 			   DAC960_Controller_T *, ...);
 static void DAC960_CreateProcEntries(DAC960_Controller_T *);
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 6a1b2177951c..beaccf197a5a 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -354,9 +354,9 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
 		rsxx_complete_dma(ctrl, dma, status);
 }
 
-static void dma_engine_stalled(unsigned long data)
+static void dma_engine_stalled(struct timer_list *t)
 {
-	struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
+	struct rsxx_dma_ctrl *ctrl = from_timer(ctrl, t, activity_timer);
 	int cnt;
 
 	if (atomic_read(&ctrl->stats.hw_q_depth) == 0 ||
@@ -838,8 +838,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
 	mutex_init(&ctrl->work_lock);
 	INIT_LIST_HEAD(&ctrl->queue);
 
-	setup_timer(&ctrl->activity_timer, dma_engine_stalled,
-					(unsigned long)ctrl);
+	timer_setup(&ctrl->activity_timer, dma_engine_stalled, 0);
 
 	ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0);
 	if (!ctrl->issue_wq)
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 2819f23e8bf2..de0d08133c7e 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -707,9 +707,9 @@ static void skd_start_queue(struct work_struct *work)
 	blk_mq_start_hw_queues(skdev->queue);
 }
 
-static void skd_timer_tick(ulong arg)
+static void skd_timer_tick(struct timer_list *t)
 {
-	struct skd_device *skdev = (struct skd_device *)arg;
+	struct skd_device *skdev = from_timer(skdev, t, timer);
 	unsigned long reqflags;
 	u32 state;
 
@@ -857,7 +857,7 @@ static int skd_start_timer(struct skd_device *skdev)
 {
 	int rc;
 
-	setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev);
+	timer_setup(&skdev->timer, skd_timer_tick, 0);
 
 	rc = mod_timer(&skdev->timer, (jiffies + HZ));
 	if (rc)
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index ad9749463d4f..5ca56bfae63c 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -81,7 +81,7 @@ struct vdc_port {
 
 static void vdc_ldc_reset(struct vdc_port *port);
 static void vdc_ldc_reset_work(struct work_struct *work);
-static void vdc_ldc_reset_timer(unsigned long _arg);
+static void vdc_ldc_reset_timer(struct timer_list *t);
 
 static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
 {
@@ -974,8 +974,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	 */
 	ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL);
 	port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0;
-	setup_timer(&port->ldc_reset_timer, vdc_ldc_reset_timer,
-		    (unsigned long)port);
+	timer_setup(&port->ldc_reset_timer, vdc_ldc_reset_timer, 0);
 	INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work);
 
 	err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
@@ -1087,9 +1086,9 @@ static void vdc_queue_drain(struct vdc_port *port)
 		__blk_end_request_all(req, BLK_STS_IOERR);
 }
 
-static void vdc_ldc_reset_timer(unsigned long _arg)
+static void vdc_ldc_reset_timer(struct timer_list *t)
 {
-	struct vdc_port *port = (struct vdc_port *) _arg;
+	struct vdc_port *port = from_timer(port, t, ldc_reset_timer);
 	struct vio_driver_state *vio = &port->vio;
 	unsigned long flags;
 
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index b4d4ccfe7582..8077123678ad 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -718,7 +718,7 @@ static void check_batteries(struct cardinfo *card)
 		set_fault_to_battery_status(card);
 }
 
-static void check_all_batteries(unsigned long ptr)
+static void check_all_batteries(struct timer_list *unused)
 {
 	int i;
 
@@ -738,7 +738,7 @@ static void check_all_batteries(unsigned long ptr)
 
 static void init_battery_timer(void)
 {
-	setup_timer(&battery_timer, check_all_batteries, 0UL);
+	timer_setup(&battery_timer, check_all_batteries, 0);
 	battery_timer.expires = jiffies + (HZ * 60);
 	add_timer(&battery_timer);
 }
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 14459d66ef0c..c24589414c75 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -770,9 +770,9 @@ static void ace_fsm_tasklet(unsigned long data)
 	spin_unlock_irqrestore(&ace->lock, flags);
 }
 
-static void ace_stall_timer(unsigned long data)
+static void ace_stall_timer(struct timer_list *t)
 {
-	struct ace_device *ace = (void *)data;
+	struct ace_device *ace = from_timer(ace, t, stall_timer);
 	unsigned long flags;
 
 	dev_warn(ace->dev,
@@ -984,7 +984,7 @@ static int ace_setup(struct ace_device *ace)
 	 * Initialize the state machine tasklet and stall timer
 	 */
 	tasklet_init(&ace->fsm_tasklet, ace_fsm_tasklet, (unsigned long)ace);
-	setup_timer(&ace->stall_timer, ace_stall_timer, (unsigned long)ace);
+	timer_setup(&ace->stall_timer, ace_stall_timer, 0);
 
 	/*
 	 * Initialize the request queue
diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c
index c4ef73c6f455..6edfaa72b98b 100644
--- a/drivers/char/ipmi/bt-bmc.c
+++ b/drivers/char/ipmi/bt-bmc.c
@@ -367,9 +367,9 @@ static const struct file_operations bt_bmc_fops = {
 	.unlocked_ioctl	= bt_bmc_ioctl,
 };
 
-static void poll_timer(unsigned long data)
+static void poll_timer(struct timer_list *t)
 {
-	struct bt_bmc *bt_bmc = (void *)data;
+	struct bt_bmc *bt_bmc = from_timer(bt_bmc, t, poll_timer);
 
 	bt_bmc->poll_timer.expires += msecs_to_jiffies(500);
 	wake_up(&bt_bmc->queue);
@@ -487,8 +487,7 @@ static int bt_bmc_probe(struct platform_device *pdev)
 		dev_info(dev, "Using IRQ %d\n", bt_bmc->irq);
 	} else {
 		dev_info(dev, "No IRQ; using timer\n");
-		setup_timer(&bt_bmc->poll_timer, poll_timer,
-			    (unsigned long)bt_bmc);
+		timer_setup(&bt_bmc->poll_timer, poll_timer, 0);
 		bt_bmc->poll_timer.expires = jiffies + msecs_to_jiffies(10);
 		add_timer(&bt_bmc->poll_timer);
 	}
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 9de189db2cc3..f45732a2cb3e 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -4766,7 +4766,7 @@ static struct timer_list ipmi_timer;
 
 static atomic_t stop_operation;
 
-static void ipmi_timeout(unsigned long data)
+static void ipmi_timeout(struct timer_list *unused)
 {
 	ipmi_smi_t intf;
 	int nt = 0;
@@ -5172,7 +5172,7 @@ static int ipmi_init_msghandler(void)
 
 #endif /* CONFIG_IPMI_PROC_INTERFACE */
 
-	setup_timer(&ipmi_timer, ipmi_timeout, 0);
+	timer_setup(&ipmi_timer, ipmi_timeout, 0);
 	mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES);
 
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 71d33a1807e4..779869ed32b1 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1091,9 +1091,9 @@ static void set_need_watch(void *send_info, bool enable)
 	spin_unlock_irqrestore(&smi_info->si_lock, flags);
 }
 
-static void smi_timeout(unsigned long data)
+static void smi_timeout(struct timer_list *t)
 {
-	struct smi_info   *smi_info = (struct smi_info *) data;
+	struct smi_info   *smi_info = from_timer(smi_info, t, si_timer);
 	enum si_sm_result smi_result;
 	unsigned long     flags;
 	unsigned long     jiffies_now;
@@ -1166,7 +1166,7 @@ static int smi_start_processing(void       *send_info,
 	new_smi->intf = intf;
 
 	/* Set up the timer that drives the interface. */
-	setup_timer(&new_smi->si_timer, smi_timeout, (long)new_smi);
+	timer_setup(&new_smi->si_timer, smi_timeout, 0);
 	smi_mod_timer(new_smi, jiffies + SI_TIMEOUT_JIFFIES);
 
 	/* Try to claim any interrupts. */
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 466b3a1c0adf..3cfaec728604 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -551,9 +551,9 @@ static void start_get(struct ssif_info *ssif_info)
 	}
 }
 
-static void retry_timeout(unsigned long data)
+static void retry_timeout(struct timer_list *t)
 {
-	struct ssif_info *ssif_info = (void *) data;
+	struct ssif_info *ssif_info = from_timer(ssif_info, t, retry_timer);
 	unsigned long oflags, *flags;
 	bool waiting;
 
@@ -1691,8 +1691,7 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id)
 
 	spin_lock_init(&ssif_info->lock);
 	ssif_info->ssif_state = SSIF_NORMAL;
-	setup_timer(&ssif_info->retry_timer, retry_timeout,
-		    (unsigned long)ssif_info);
+	timer_setup(&ssif_info->retry_timer, retry_timeout, 0);
 
 	for (i = 0; i < SSIF_NUM_STATS; i++)
 		atomic_set(&ssif_info->stats[i], 0);
diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c
index 461bf0b8a094..230b99288024 100644
--- a/drivers/char/tpm/tpm-dev-common.c
+++ b/drivers/char/tpm/tpm-dev-common.c
@@ -22,9 +22,9 @@
 #include "tpm.h"
 #include "tpm-dev.h"
 
-static void user_reader_timeout(unsigned long ptr)
+static void user_reader_timeout(struct timer_list *t)
 {
-	struct file_priv *priv = (struct file_priv *)ptr;
+	struct file_priv *priv = from_timer(priv, t, user_read_timer);
 
 	pr_warn("TPM user space timeout is deprecated (pid=%d)\n",
 		task_tgid_nr(current));
@@ -48,8 +48,7 @@ void tpm_common_open(struct file *file, struct tpm_chip *chip,
 	priv->chip = chip;
 	atomic_set(&priv->data_pending, 0);
 	mutex_init(&priv->buffer_mutex);
-	setup_timer(&priv->user_read_timer, user_reader_timeout,
-			(unsigned long)priv);
+	timer_setup(&priv->user_read_timer, user_reader_timeout, 0);
 	INIT_WORK(&priv->work, timeout_work);
 
 	file->private_data = priv;
diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c
index 09c1c4ff93ca..3717b3df34a4 100644
--- a/drivers/gpu/drm/drm_vblank.c
+++ b/drivers/gpu/drm/drm_vblank.c
@@ -367,9 +367,9 @@ void drm_vblank_disable_and_save(struct drm_device *dev, unsigned int pipe)
 	spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags);
 }
 
-static void vblank_disable_fn(unsigned long arg)
+static void vblank_disable_fn(struct timer_list *t)
 {
-	struct drm_vblank_crtc *vblank = (void *)arg;
+	struct drm_vblank_crtc *vblank = from_timer(vblank, t, disable_timer);
 	struct drm_device *dev = vblank->dev;
 	unsigned int pipe = vblank->pipe;
 	unsigned long irqflags;
@@ -436,8 +436,7 @@ int drm_vblank_init(struct drm_device *dev, unsigned int num_crtcs)
 		vblank->dev = dev;
 		vblank->pipe = i;
 		init_waitqueue_head(&vblank->queue);
-		setup_timer(&vblank->disable_timer, vblank_disable_fn,
-			    (unsigned long)vblank);
+		timer_setup(&vblank->disable_timer, vblank_disable_fn, 0);
 		seqlock_init(&vblank->seqlock);
 	}
 
@@ -1019,7 +1018,7 @@ static void drm_vblank_put(struct drm_device *dev, unsigned int pipe)
 		if (drm_vblank_offdelay == 0)
 			return;
 		else if (drm_vblank_offdelay < 0)
-			vblank_disable_fn((unsigned long)vblank);
+			vblank_disable_fn(&vblank->disable_timer);
 		else if (!dev->vblank_disable_immediate)
 			mod_timer(&vblank->disable_timer,
 				  jiffies + ((drm_vblank_offdelay * HZ)/1000));
@@ -1650,7 +1649,7 @@ bool drm_handle_vblank(struct drm_device *dev, unsigned int pipe)
 	spin_unlock_irqrestore(&dev->event_lock, irqflags);
 
 	if (disable_irq)
-		vblank_disable_fn((unsigned long)vblank);
+		vblank_disable_fn(&vblank->disable_timer);
 
 	return true;
 }
diff --git a/drivers/gpu/drm/exynos/exynos_drm_vidi.c b/drivers/gpu/drm/exynos/exynos_drm_vidi.c
index 53e03f8af3d5..e6b0940b1ac2 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_vidi.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_vidi.c
@@ -161,9 +161,9 @@ static const struct exynos_drm_crtc_ops vidi_crtc_ops = {
 	.atomic_flush = exynos_crtc_handle_event,
 };
 
-static void vidi_fake_vblank_timer(unsigned long arg)
+static void vidi_fake_vblank_timer(struct timer_list *t)
 {
-	struct vidi_context *ctx = (void *)arg;
+	struct vidi_context *ctx = from_timer(ctx, t, timer);
 
 	if (drm_crtc_handle_vblank(&ctx->crtc->base))
 		mod_timer(&ctx->timer,
@@ -449,7 +449,7 @@ static int vidi_probe(struct platform_device *pdev)
 
 	ctx->pdev = pdev;
 
-	setup_timer(&ctx->timer, vidi_fake_vblank_timer, (unsigned long)ctx);
+	timer_setup(&ctx->timer, vidi_fake_vblank_timer, 0);
 
 	mutex_init(&ctx->lock);
 
diff --git a/drivers/gpu/drm/i2c/tda998x_drv.c b/drivers/gpu/drm/i2c/tda998x_drv.c
index 4d1f45acf2cd..127815253a84 100644
--- a/drivers/gpu/drm/i2c/tda998x_drv.c
+++ b/drivers/gpu/drm/i2c/tda998x_drv.c
@@ -601,9 +601,9 @@ tda998x_reset(struct tda998x_priv *priv)
  * we have seen a HPD inactive->active transition.  This code implements
  * that delay.
  */
-static void tda998x_edid_delay_done(unsigned long data)
+static void tda998x_edid_delay_done(struct timer_list *t)
 {
-	struct tda998x_priv *priv = (struct tda998x_priv *)data;
+	struct tda998x_priv *priv = from_timer(priv, t, edid_delay_timer);
 
 	priv->edid_delay_active = false;
 	wake_up(&priv->edid_delay_waitq);
@@ -1492,8 +1492,7 @@ static int tda998x_create(struct i2c_client *client, struct tda998x_priv *priv)
 
 	mutex_init(&priv->mutex);	/* protect the page access */
 	init_waitqueue_head(&priv->edid_delay_waitq);
-	setup_timer(&priv->edid_delay_timer, tda998x_edid_delay_done,
-		    (unsigned long)priv);
+	timer_setup(&priv->edid_delay_timer, tda998x_edid_delay_done, 0);
 	INIT_WORK(&priv->detect_work, tda998x_detect_work);
 
 	/* wake up the device: */
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
index 40f4840ef98e..970c7963ae29 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
@@ -82,9 +82,9 @@ static struct msm_ringbuffer *get_next_ring(struct msm_gpu *gpu)
 	return NULL;
 }
 
-static void a5xx_preempt_timer(unsigned long data)
+static void a5xx_preempt_timer(struct timer_list *t)
 {
-	struct a5xx_gpu *a5xx_gpu = (struct a5xx_gpu *) data;
+	struct a5xx_gpu *a5xx_gpu = from_timer(a5xx_gpu, t, preempt_timer);
 	struct msm_gpu *gpu = &a5xx_gpu->base.base;
 	struct drm_device *dev = gpu->dev;
 	struct msm_drm_private *priv = dev->dev_private;
@@ -300,6 +300,5 @@ void a5xx_preempt_init(struct msm_gpu *gpu)
 		}
 	}
 
-	setup_timer(&a5xx_gpu->preempt_timer, a5xx_preempt_timer,
-		(unsigned long) a5xx_gpu);
+	timer_setup(&a5xx_gpu->preempt_timer, a5xx_preempt_timer, 0);
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 8d4477818ec2..232201403439 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -353,9 +353,9 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
 			round_jiffies_up(jiffies + DRM_MSM_HANGCHECK_JIFFIES));
 }
 
-static void hangcheck_handler(unsigned long data)
+static void hangcheck_handler(struct timer_list *t)
 {
-	struct msm_gpu *gpu = (struct msm_gpu *)data;
+	struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
 	struct drm_device *dev = gpu->dev;
 	struct msm_drm_private *priv = dev->dev_private;
 	struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);
@@ -703,8 +703,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	INIT_WORK(&gpu->recover_work, recover_worker);
 
 
-	setup_timer(&gpu->hangcheck_timer, hangcheck_handler,
-			(unsigned long)gpu);
+	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
 
 	spin_lock_init(&gpu->perf_lock);
 
diff --git a/drivers/gpu/drm/omapdrm/dss/dsi.c b/drivers/gpu/drm/omapdrm/dss/dsi.c
index cea744e4d9bd..c2cf6d98e577 100644
--- a/drivers/gpu/drm/omapdrm/dss/dsi.c
+++ b/drivers/gpu/drm/omapdrm/dss/dsi.c
@@ -4095,7 +4095,7 @@ static void dsi_update_screen_dispc(struct platform_device *dsidev)
 }
 
 #ifdef DSI_CATCH_MISSING_TE
-static void dsi_te_timeout(unsigned long arg)
+static void dsi_te_timeout(struct timer_list *unused)
 {
 	DSSERR("TE not received for 250ms!\n");
 }
@@ -5449,7 +5449,7 @@ static int dsi_bind(struct device *dev, struct device *master, void *data)
 			     dsi_framedone_timeout_work_callback);
 
 #ifdef DSI_CATCH_MISSING_TE
-	setup_timer(&dsi->te_timer, dsi_te_timeout, 0);
+	timer_setup(&dsi->te_timer, dsi_te_timeout, 0);
 #endif
 
 	dsi_mem = platform_get_resource_byname(dsidev, IORESOURCE_MEM, "proto");
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_psr.c b/drivers/gpu/drm/rockchip/rockchip_drm_psr.c
index a553e182ff53..3acfd576b7df 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_psr.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_psr.c
@@ -101,9 +101,9 @@ static void psr_set_state(struct psr_drv *psr, enum psr_state state)
 	spin_unlock_irqrestore(&psr->lock, flags);
 }
 
-static void psr_flush_handler(unsigned long data)
+static void psr_flush_handler(struct timer_list *t)
 {
-	struct psr_drv *psr = (struct psr_drv *)data;
+	struct psr_drv *psr = from_timer(psr, t, flush_timer);
 	unsigned long flags;
 
 	/* If the state has changed since we initiated the flush, do nothing */
@@ -232,7 +232,7 @@ int rockchip_drm_psr_register(struct drm_encoder *encoder,
 	if (!psr)
 		return -ENOMEM;
 
-	setup_timer(&psr->flush_timer, psr_flush_handler, (unsigned long)psr);
+	timer_setup(&psr->flush_timer, psr_flush_handler, 0);
 	spin_lock_init(&psr->lock);
 
 	psr->active = true;
diff --git a/drivers/gpu/drm/vgem/vgem_fence.c b/drivers/gpu/drm/vgem/vgem_fence.c
index 8fd52f211e9d..b28876c222b4 100644
--- a/drivers/gpu/drm/vgem/vgem_fence.c
+++ b/drivers/gpu/drm/vgem/vgem_fence.c
@@ -85,9 +85,9 @@ static const struct dma_fence_ops vgem_fence_ops = {
 	.timeline_value_str = vgem_fence_timeline_value_str,
 };
 
-static void vgem_fence_timeout(unsigned long data)
+static void vgem_fence_timeout(struct timer_list *t)
 {
-	struct vgem_fence *fence = (struct vgem_fence *)data;
+	struct vgem_fence *fence = from_timer(fence, t, timer);
 
 	dma_fence_signal(&fence->base);
 }
@@ -105,7 +105,7 @@ static struct dma_fence *vgem_fence_create(struct vgem_file *vfile,
 	dma_fence_init(&fence->base, &vgem_fence_ops, &fence->lock,
 		       dma_fence_context_alloc(1), 1);
 
-	setup_timer(&fence->timer, vgem_fence_timeout, (unsigned long)fence);
+	timer_setup(&fence->timer, vgem_fence_timeout, 0);
 
 	/* We force the fence to expire within 10s to prevent driver hangs */
 	mod_timer(&fence->timer, jiffies + VGEM_FENCE_TIMEOUT);
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index 32c9938e1e1e..d6e84a589ef1 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -452,9 +452,9 @@ via_dmablit_sync(struct drm_device *dev, uint32_t handle, int engine)
 
 
 static void
-via_dmablit_timer(unsigned long data)
+via_dmablit_timer(struct timer_list *t)
 {
-	drm_via_blitq_t *blitq = (drm_via_blitq_t *) data;
+	drm_via_blitq_t *blitq = from_timer(blitq, t, poll_timer);
 	struct drm_device *dev = blitq->dev;
 	int engine = (int)
 		(blitq - ((drm_via_private_t *)dev->dev_private)->blit_queues);
@@ -559,8 +559,7 @@ via_init_dmablit(struct drm_device *dev)
 			init_waitqueue_head(blitq->blit_queue + j);
 		init_waitqueue_head(&blitq->busy_queue);
 		INIT_WORK(&blitq->wq, via_dmablit_workqueue);
-		setup_timer(&blitq->poll_timer, via_dmablit_timer,
-				(unsigned long)blitq);
+		timer_setup(&blitq->poll_timer, via_dmablit_timer, 0);
 	}
 }
 
diff --git a/drivers/hid/hid-appleir.c b/drivers/hid/hid-appleir.c
index 07cbc70f00e7..eae7d52cf1a8 100644
--- a/drivers/hid/hid-appleir.c
+++ b/drivers/hid/hid-appleir.c
@@ -173,9 +173,9 @@ static void battery_flat(struct appleir *appleir)
 	dev_err(&appleir->input_dev->dev, "possible flat battery?\n");
 }
 
-static void key_up_tick(unsigned long data)
+static void key_up_tick(struct timer_list *t)
 {
-	struct appleir *appleir = (struct appleir *)data;
+	struct appleir *appleir = from_timer(appleir, t, key_up_timer);
 	struct hid_device *hid = appleir->hid;
 	unsigned long flags;
 
@@ -303,8 +303,7 @@ static int appleir_probe(struct hid_device *hid, const struct hid_device_id *id)
 	hid->quirks |= HID_QUIRK_HIDINPUT_FORCE;
 
 	spin_lock_init(&appleir->lock);
-	setup_timer(&appleir->key_up_timer,
-		    key_up_tick, (unsigned long) appleir);
+	timer_setup(&appleir->key_up_timer, key_up_tick, 0);
 
 	hid_set_drvdata(hid, appleir);
 
diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c
index 49c4bd34b3c5..87eda34ea2f8 100644
--- a/drivers/hid/hid-prodikeys.c
+++ b/drivers/hid/hid-prodikeys.c
@@ -239,9 +239,9 @@ drop_note:
 	return;
 }
 
-static void pcmidi_sustained_note_release(unsigned long data)
+static void pcmidi_sustained_note_release(struct timer_list *t)
 {
-	struct pcmidi_sustain *pms = (struct pcmidi_sustain *)data;
+	struct pcmidi_sustain *pms = from_timer(pms, t, timer);
 
 	pcmidi_send_note(pms->pm, pms->status, pms->note, pms->velocity);
 	pms->in_use = 0;
@@ -256,8 +256,7 @@ static void init_sustain_timers(struct pcmidi_snd *pm)
 		pms = &pm->sustained_notes[i];
 		pms->in_use = 0;
 		pms->pm = pm;
-		setup_timer(&pms->timer, pcmidi_sustained_note_release,
-			(unsigned long)pms);
+		timer_setup(&pms->timer, pcmidi_sustained_note_release, 0);
 	}
 }
 
diff --git a/drivers/hid/hid-wiimote-core.c b/drivers/hid/hid-wiimote-core.c
index d00391418d1a..579884ebd94d 100644
--- a/drivers/hid/hid-wiimote-core.c
+++ b/drivers/hid/hid-wiimote-core.c
@@ -1226,9 +1226,9 @@ static void wiimote_schedule(struct wiimote_data *wdata)
 	spin_unlock_irqrestore(&wdata->state.lock, flags);
 }
 
-static void wiimote_init_timeout(unsigned long arg)
+static void wiimote_init_timeout(struct timer_list *t)
 {
-	struct wiimote_data *wdata = (void*)arg;
+	struct wiimote_data *wdata = from_timer(wdata, t, timer);
 
 	wiimote_schedule(wdata);
 }
@@ -1740,7 +1740,7 @@ static struct wiimote_data *wiimote_create(struct hid_device *hdev)
 	wdata->state.cmd_battery = 0xff;
 
 	INIT_WORK(&wdata->init_worker, wiimote_init_worker);
-	setup_timer(&wdata->timer, wiimote_init_timeout, (long)wdata);
+	timer_setup(&wdata->timer, wiimote_init_timeout, 0);
 
 	return wdata;
 }
diff --git a/drivers/iio/common/ssp_sensors/ssp_dev.c b/drivers/iio/common/ssp_sensors/ssp_dev.c
index ea7adb638d99..2ba2ff5e59c4 100644
--- a/drivers/iio/common/ssp_sensors/ssp_dev.c
+++ b/drivers/iio/common/ssp_sensors/ssp_dev.c
@@ -175,9 +175,9 @@ static void ssp_wdt_work_func(struct work_struct *work)
 	data->timeout_cnt = 0;
 }
 
-static void ssp_wdt_timer_func(unsigned long ptr)
+static void ssp_wdt_timer_func(struct timer_list *t)
 {
-	struct ssp_data *data = (struct ssp_data *)ptr;
+	struct ssp_data *data = from_timer(data, t, wdt_timer);
 
 	switch (data->fw_dl_state) {
 	case SSP_FW_DL_STATE_FAIL:
@@ -571,7 +571,7 @@ static int ssp_probe(struct spi_device *spi)
 	INIT_WORK(&data->work_wdt, ssp_wdt_work_func);
 	INIT_DELAYED_WORK(&data->work_refresh, ssp_refresh_task);
 
-	setup_timer(&data->wdt_timer, ssp_wdt_timer_func, (unsigned long)data);
+	timer_setup(&data->wdt_timer, ssp_wdt_timer_func, 0);
 
 	ret = request_threaded_irq(data->spi->irq, NULL,
 				   ssp_irq_thread_fn,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9beee9cef137..ee0ee1f9994b 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -642,9 +642,9 @@ err:
 	return -ENOMEM;
 }
 
-static void delay_time_func(unsigned long ctx)
+static void delay_time_func(struct timer_list *t)
 {
-	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
+	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 
 	dev->fill_delay = 0;
 }
@@ -663,7 +663,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		return -ENOMEM;
 	}
 
-	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
+	timer_setup(&dev->delay_timer, delay_time_func, 0);
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		INIT_LIST_HEAD(&ent->head);
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index cedc665364cd..73862a836062 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -202,9 +202,9 @@ void gameport_stop_polling(struct gameport *gameport)
 }
 EXPORT_SYMBOL(gameport_stop_polling);
 
-static void gameport_run_poll_handler(unsigned long d)
+static void gameport_run_poll_handler(struct timer_list *t)
 {
-	struct gameport *gameport = (struct gameport *)d;
+	struct gameport *gameport = from_timer(gameport, t, poll_timer);
 
 	gameport->poll_handler(gameport);
 	if (gameport->poll_cnt)
@@ -542,8 +542,7 @@ static void gameport_init_port(struct gameport *gameport)
 
 	INIT_LIST_HEAD(&gameport->node);
 	spin_lock_init(&gameport->timer_lock);
-	setup_timer(&gameport->poll_timer, gameport_run_poll_handler,
-		    (unsigned long)gameport);
+	timer_setup(&gameport->poll_timer, gameport_run_poll_handler, 0);
 }
 
 /*
diff --git a/drivers/input/joystick/db9.c b/drivers/input/joystick/db9.c
index f4ad83eab67f..de0dd4756c84 100644
--- a/drivers/input/joystick/db9.c
+++ b/drivers/input/joystick/db9.c
@@ -364,9 +364,9 @@ static int db9_saturn(int mode, struct parport *port, struct input_dev *devs[])
 	return 0;
 }
 
-static void db9_timer(unsigned long private)
+static void db9_timer(struct timer_list *t)
 {
-	struct db9 *db9 = (void *) private;
+	struct db9 *db9 = from_timer(db9, t, timer);
 	struct parport *port = db9->pd->port;
 	struct input_dev *dev = db9->dev[0];
 	struct input_dev *dev2 = db9->dev[1];
@@ -609,7 +609,7 @@ static void db9_attach(struct parport *pp)
 	db9->pd = pd;
 	db9->mode = mode;
 	db9->parportno = pp->number;
-	setup_timer(&db9->timer, db9_timer, (long)db9);
+	timer_setup(&db9->timer, db9_timer, 0);
 
 	for (i = 0; i < (min(db9_mode->n_pads, DB9_MAX_DEVICES)); i++) {
 
diff --git a/drivers/input/joystick/gamecon.c b/drivers/input/joystick/gamecon.c
index ca734ea97e53..2ffb2e8bdc3b 100644
--- a/drivers/input/joystick/gamecon.c
+++ b/drivers/input/joystick/gamecon.c
@@ -743,9 +743,9 @@ static void gc_psx_process_packet(struct gc *gc)
  * gc_timer() initiates reads of console pads data.
  */
 
-static void gc_timer(unsigned long private)
+static void gc_timer(struct timer_list *t)
 {
-	struct gc *gc = (void *) private;
+	struct gc *gc = from_timer(gc, t, timer);
 
 /*
  * N64 pads - must be read first, any read confuses them for 200 us
@@ -974,7 +974,7 @@ static void gc_attach(struct parport *pp)
 	mutex_init(&gc->mutex);
 	gc->pd = pd;
 	gc->parportno = pp->number;
-	setup_timer(&gc->timer, gc_timer, (long) gc);
+	timer_setup(&gc->timer, gc_timer, 0);
 
 	for (i = 0; i < n_pads && i < GC_MAX_DEVICES; i++) {
 		if (!pads[i])
diff --git a/drivers/input/joystick/turbografx.c b/drivers/input/joystick/turbografx.c
index a1fdc75a438d..e2685753e460 100644
--- a/drivers/input/joystick/turbografx.c
+++ b/drivers/input/joystick/turbografx.c
@@ -89,9 +89,9 @@ static struct tgfx {
  * tgfx_timer() reads and analyzes TurboGraFX joystick data.
  */
 
-static void tgfx_timer(unsigned long private)
+static void tgfx_timer(struct timer_list *t)
 {
-	struct tgfx *tgfx = (void *) private;
+	struct tgfx *tgfx = from_timer(tgfx, t, timer);
 	struct input_dev *dev;
 	int data1, data2, i;
 
@@ -200,7 +200,7 @@ static void tgfx_attach(struct parport *pp)
 	mutex_init(&tgfx->sem);
 	tgfx->pd = pd;
 	tgfx->parportno = pp->number;
-	setup_timer(&tgfx->timer, tgfx_timer, (long)tgfx);
+	timer_setup(&tgfx->timer, tgfx_timer, 0);
 
 	for (i = 0; i < n_devs; i++) {
 		if (n_buttons[i] < 1)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 466aaa8ba841..83fe2621effe 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -36,7 +36,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
 static void init_iova_rcaches(struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 static void fq_destroy_all_entries(struct iova_domain *iovad);
-static void fq_flush_timeout(unsigned long data);
+static void fq_flush_timeout(struct timer_list *t);
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -107,7 +107,7 @@ int init_iova_flush_queue(struct iova_domain *iovad,
 		spin_lock_init(&fq->lock);
 	}
 
-	setup_timer(&iovad->fq_timer, fq_flush_timeout, (unsigned long)iovad);
+	timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
 	atomic_set(&iovad->fq_timer_on, 0);
 
 	return 0;
@@ -519,9 +519,9 @@ static void fq_destroy_all_entries(struct iova_domain *iovad)
 	}
 }
 
-static void fq_flush_timeout(unsigned long data)
+static void fq_flush_timeout(struct timer_list *t)
 {
-	struct iova_domain *iovad = (struct iova_domain *)data;
+	struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
 	int cpu;
 
 	atomic_set(&iovad->fq_timer_on, 0);
diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c
index 89dd1303a98a..49fef08858c5 100644
--- a/drivers/isdn/capi/capidrv.c
+++ b/drivers/isdn/capi/capidrv.c
@@ -2235,9 +2235,9 @@ static void send_listen(capidrv_contr *card)
 	send_message(card, &cmdcmsg);
 }
 
-static void listentimerfunc(unsigned long x)
+static void listentimerfunc(struct timer_list *t)
 {
-	capidrv_contr *card = (capidrv_contr *)x;
+	capidrv_contr *card = from_timer(card, t, listentimer);
 	if (card->state != ST_LISTEN_NONE && card->state != ST_LISTEN_ACTIVE)
 		printk(KERN_ERR "%s: controller dead ??\n", card->name);
 	send_listen(card);
@@ -2264,7 +2264,7 @@ static int capidrv_addcontr(u16 contr, struct capi_profile *profp)
 		return -1;
 	}
 	card->owner = THIS_MODULE;
-	setup_timer(&card->listentimer, listentimerfunc, (unsigned long)card);
+	timer_setup(&card->listentimer, listentimerfunc, 0);
 	strcpy(card->name, id);
 	card->contrnr = contr;
 	card->nbchan = profp->nbchannel;
diff --git a/drivers/isdn/divert/isdn_divert.c b/drivers/isdn/divert/isdn_divert.c
index 6f423bc49d0d..5620fd2c6009 100644
--- a/drivers/isdn/divert/isdn_divert.c
+++ b/drivers/isdn/divert/isdn_divert.c
@@ -55,10 +55,10 @@ DEFINE_SPINLOCK(divert_lock);
 /***************************/
 /* timer callback function */
 /***************************/
-static void deflect_timer_expire(ulong arg)
+static void deflect_timer_expire(struct timer_list *t)
 {
 	unsigned long flags;
-	struct call_struc *cs = (struct call_struc *) arg;
+	struct call_struc *cs = from_timer(cs, t, timer);
 
 	spin_lock_irqsave(&divert_lock, flags);
 	del_timer(&cs->timer); /* delete active timer */
@@ -157,7 +157,7 @@ int cf_command(int drvid, int mode,
 	/* allocate mem for information struct */
 	if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC)))
 		return (-ENOMEM); /* no memory */
-	setup_timer(&cs->timer, deflect_timer_expire, (ulong)cs);
+	timer_setup(&cs->timer, deflect_timer_expire, 0);
 	cs->info[0] = '\0';
 	cs->ics.driver = drvid;
 	cs->ics.command = ISDN_CMD_PROT_IO; /* protocol specific io */
@@ -450,8 +450,7 @@ static int isdn_divert_icall(isdn_ctrl *ic)
 					return (0); /* no external deflection needed */
 			if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC)))
 				return (0); /* no memory */
-			setup_timer(&cs->timer, deflect_timer_expire,
-				    (ulong)cs);
+			timer_setup(&cs->timer, deflect_timer_expire, 0);
 			cs->info[0] = '\0';
 
 			cs->ics = *ic; /* copy incoming data */
diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c
index c61049585cbd..0033d74a7291 100644
--- a/drivers/isdn/hardware/eicon/divasi.c
+++ b/drivers/isdn/hardware/eicon/divasi.c
@@ -78,7 +78,7 @@ static unsigned int um_idi_poll(struct file *file, poll_table *wait);
 static int um_idi_open(struct inode *inode, struct file *file);
 static int um_idi_release(struct inode *inode, struct file *file);
 static int remove_entity(void *entity);
-static void diva_um_timer_function(unsigned long data);
+static void diva_um_timer_function(struct timer_list *t);
 
 /*
  * proc entry
@@ -300,8 +300,7 @@ static int um_idi_open_adapter(struct file *file, int adapter_nr)
 	p_os = (diva_um_idi_os_context_t *) diva_um_id_get_os_context(e);
 	init_waitqueue_head(&p_os->read_wait);
 	init_waitqueue_head(&p_os->close_wait);
-	setup_timer(&p_os->diva_timer_id, (void *)diva_um_timer_function,
-		    (unsigned long)p_os);
+	timer_setup(&p_os->diva_timer_id, diva_um_timer_function, 0);
 	p_os->aborted = 0;
 	p_os->adapter_nr = adapter_nr;
 	return (1);
@@ -457,9 +456,9 @@ void diva_os_wakeup_close(void *os_context)
 }
 
 static
-void diva_um_timer_function(unsigned long data)
+void diva_um_timer_function(struct timer_list *t)
 {
-	diva_um_idi_os_context_t *p_os = (diva_um_idi_os_context_t *) data;
+	diva_um_idi_os_context_t *p_os = from_timer(p_os, t, diva_timer_id);
 
 	p_os->aborted = 1;
 	wake_up_interruptible(&p_os->read_wait);
diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c
index 3cf07b8ced1c..4d85645c87f7 100644
--- a/drivers/isdn/hardware/mISDN/hfcmulti.c
+++ b/drivers/isdn/hardware/mISDN/hfcmulti.c
@@ -2855,7 +2855,7 @@ irq_notforus:
  */
 
 static void
-hfcmulti_dbusy_timer(struct hfc_multi *hc)
+hfcmulti_dbusy_timer(struct timer_list *t)
 {
 }
 
@@ -3877,8 +3877,7 @@ hfcmulti_initmode(struct dchannel *dch)
 		if (hc->dnum[pt]) {
 			mode_hfcmulti(hc, dch->slot, dch->dev.D.protocol,
 				      -1, 0, -1, 0);
-			setup_timer(&dch->timer, (void *)hfcmulti_dbusy_timer,
-				    (long)dch);
+			timer_setup(&dch->timer, hfcmulti_dbusy_timer, 0);
 		}
 		for (i = 1; i <= 31; i++) {
 			if (!((1 << i) & hc->bmask[pt])) /* skip unused chan */
@@ -3984,8 +3983,7 @@ hfcmulti_initmode(struct dchannel *dch)
 		hc->chan[i].slot_rx = -1;
 		hc->chan[i].conf = -1;
 		mode_hfcmulti(hc, i, dch->dev.D.protocol, -1, 0, -1, 0);
-		setup_timer(&dch->timer, (void *)hfcmulti_dbusy_timer,
-			    (long)dch);
+		timer_setup(&dch->timer, hfcmulti_dbusy_timer, 0);
 		hc->chan[i - 2].slot_tx = -1;
 		hc->chan[i - 2].slot_rx = -1;
 		hc->chan[i - 2].conf = -1;
diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c
index e4ebbee863a1..ba3fe14bbe00 100644
--- a/drivers/isdn/hardware/mISDN/hfcpci.c
+++ b/drivers/isdn/hardware/mISDN/hfcpci.c
@@ -1241,7 +1241,7 @@ hfcpci_int(int intno, void *dev_id)
  * timer callback for D-chan busy resolution. Currently no function
  */
 static void
-hfcpci_dbusy_timer(struct hfc_pci *hc)
+hfcpci_dbusy_timer(struct timer_list *t)
 {
 }
 
@@ -1717,8 +1717,7 @@ static void
 inithfcpci(struct hfc_pci *hc)
 {
 	printk(KERN_DEBUG "inithfcpci: entered\n");
-	setup_timer(&hc->dch.timer, (void *)hfcpci_dbusy_timer,
-		    (long)&hc->dch);
+	timer_setup(&hc->dch.timer, hfcpci_dbusy_timer, 0);
 	hc->chanlimit = 2;
 	mode_hfcpci(&hc->bch[0], 1, -1);
 	mode_hfcpci(&hc->bch[1], 2, -1);
diff --git a/drivers/isdn/hardware/mISDN/mISDNisar.c b/drivers/isdn/hardware/mISDN/mISDNisar.c
index 5b078591b6ee..b791688d0228 100644
--- a/drivers/isdn/hardware/mISDN/mISDNisar.c
+++ b/drivers/isdn/hardware/mISDN/mISDNisar.c
@@ -1146,9 +1146,9 @@ mISDNisar_irq(struct isar_hw *isar)
 EXPORT_SYMBOL(mISDNisar_irq);
 
 static void
-ftimer_handler(unsigned long data)
+ftimer_handler(struct timer_list *t)
 {
-	struct isar_ch *ch = (struct isar_ch *)data;
+	struct isar_ch *ch = from_timer(ch, t, ftimer);
 
 	pr_debug("%s: ftimer flags %lx\n", ch->is->name, ch->bch.Flags);
 	test_and_clear_bit(FLG_FTI_RUN, &ch->bch.Flags);
@@ -1635,11 +1635,9 @@ init_isar(struct isar_hw *isar)
 	}
 	if (isar->version != 1)
 		return -EINVAL;
-	setup_timer(&isar->ch[0].ftimer, &ftimer_handler,
-		    (long)&isar->ch[0]);
+	timer_setup(&isar->ch[0].ftimer, ftimer_handler, 0);
 	test_and_set_bit(FLG_INITIALIZED, &isar->ch[0].bch.Flags);
-	setup_timer(&isar->ch[1].ftimer, &ftimer_handler,
-		    (long)&isar->ch[1]);
+	timer_setup(&isar->ch[1].ftimer, ftimer_handler, 0);
 	test_and_set_bit(FLG_INITIALIZED, &isar->ch[1].bch.Flags);
 	return 0;
 }
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 3fa2f7b31131..8b03d618185e 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -231,7 +231,7 @@ static int isdn_timer_cnt2 = 0;
 static int isdn_timer_cnt3 = 0;
 
 static void
-isdn_timer_funct(ulong dummy)
+isdn_timer_funct(struct timer_list *unused)
 {
 	int tf = dev->tflags;
 	if (tf & ISDN_TIMER_FAST) {
@@ -2294,7 +2294,7 @@ static int __init isdn_init(void)
 		printk(KERN_WARNING "isdn: Could not allocate device-struct.\n");
 		return -EIO;
 	}
-	setup_timer(&dev->timer, isdn_timer_funct, 0UL);
+	timer_setup(&dev->timer, isdn_timer_funct, 0);
 	spin_lock_init(&dev->lock);
 	spin_lock_init(&dev->timerlock);
 #ifdef MODULE
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index 59d40160cab2..c138f66f2659 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -1509,9 +1509,9 @@ static int isdn_net_ioctl(struct net_device *dev,
 
 /* called via cisco_timer.function */
 static void
-isdn_net_ciscohdlck_slarp_send_keepalive(unsigned long data)
+isdn_net_ciscohdlck_slarp_send_keepalive(struct timer_list *t)
 {
-	isdn_net_local *lp = (isdn_net_local *) data;
+	isdn_net_local *lp = from_timer(lp, t, cisco_timer);
 	struct sk_buff *skb;
 	unsigned char *p;
 	unsigned long last_cisco_myseq = lp->cisco_myseq;
@@ -1615,9 +1615,8 @@ isdn_net_ciscohdlck_connected(isdn_net_local *lp)
 	/* send slarp request because interface/seq.no.s reset */
 	isdn_net_ciscohdlck_slarp_send_request(lp);
 
-	setup_timer(&lp->cisco_timer,
-		    isdn_net_ciscohdlck_slarp_send_keepalive,
-		    (unsigned long)lp);
+	timer_setup(&lp->cisco_timer,
+		    isdn_net_ciscohdlck_slarp_send_keepalive, 0);
 	lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ;
 	add_timer(&lp->cisco_timer);
 }
diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c
index cd2b3c69771a..e07aefb9151d 100644
--- a/drivers/isdn/i4l/isdn_ppp.c
+++ b/drivers/isdn/i4l/isdn_ppp.c
@@ -50,7 +50,7 @@ static struct ippp_ccp_reset *isdn_ppp_ccp_reset_alloc(struct ippp_struct *is);
 static void isdn_ppp_ccp_reset_free(struct ippp_struct *is);
 static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is,
 					  unsigned char id);
-static void isdn_ppp_ccp_timer_callback(unsigned long closure);
+static void isdn_ppp_ccp_timer_callback(struct timer_list *t);
 static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_struct *is,
 								   unsigned char id);
 static void isdn_ppp_ccp_reset_trans(struct ippp_struct *is,
@@ -2327,10 +2327,10 @@ static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is,
 
 /* The timer callback function which is called when a ResetReq has timed out,
    aka has never been answered by a ResetAck */
-static void isdn_ppp_ccp_timer_callback(unsigned long closure)
+static void isdn_ppp_ccp_timer_callback(struct timer_list *t)
 {
 	struct ippp_ccp_reset_state *rs =
-		(struct ippp_ccp_reset_state *)closure;
+		from_timer(rs, t, timer);
 
 	if (!rs) {
 		printk(KERN_ERR "ippp_ccp: timer cb with zero closure.\n");
@@ -2376,8 +2376,7 @@ static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_s
 		rs->state = CCPResetIdle;
 		rs->is = is;
 		rs->id = id;
-		setup_timer(&rs->timer, isdn_ppp_ccp_timer_callback,
-			    (unsigned long)rs);
+		timer_setup(&rs->timer, isdn_ppp_ccp_timer_callback, 0);
 		is->reset->rs[id] = rs;
 	}
 	return rs;
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index d30130c8d0f3..960f26348bb5 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -541,9 +541,9 @@ isdn_tty_senddown(modem_info *info)
  * into the tty's buffer.
  */
 static void
-isdn_tty_modem_do_ncarrier(unsigned long data)
+isdn_tty_modem_do_ncarrier(struct timer_list *t)
 {
-	modem_info *info = (modem_info *) data;
+	modem_info *info = from_timer(info, t, nc_timer);
 	isdn_tty_modem_result(RESULT_NO_CARRIER, info);
 }
 
@@ -1812,8 +1812,7 @@ isdn_tty_modem_init(void)
 		info->isdn_channel = -1;
 		info->drv_index = -1;
 		info->xmit_size = ISDN_SERIAL_XMIT_SIZE;
-		setup_timer(&info->nc_timer, isdn_tty_modem_do_ncarrier,
-			    (unsigned long)info);
+		timer_setup(&info->nc_timer, isdn_tty_modem_do_ncarrier, 0);
 		skb_queue_head_init(&info->xmit_queue);
 #ifdef CONFIG_ISDN_AUDIO
 		skb_queue_head_init(&info->dtmf_queue);
diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c
index e179b33d3775..bc68dbbcaec1 100644
--- a/drivers/media/platform/s5p-mfc/s5p_mfc.c
+++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c
@@ -145,9 +145,9 @@ void s5p_mfc_cleanup_queue(struct list_head *lh, struct vb2_queue *vq)
 	}
 }
 
-static void s5p_mfc_watchdog(unsigned long arg)
+static void s5p_mfc_watchdog(struct timer_list *t)
 {
-	struct s5p_mfc_dev *dev = (struct s5p_mfc_dev *)arg;
+	struct s5p_mfc_dev *dev = from_timer(dev, t, watchdog_timer);
 
 	if (test_bit(0, &dev->hw_lock))
 		atomic_inc(&dev->watchdog_cnt);
@@ -1314,8 +1314,7 @@ static int s5p_mfc_probe(struct platform_device *pdev)
 	dev->hw_lock = 0;
 	INIT_WORK(&dev->watchdog_work, s5p_mfc_watchdog_worker);
 	atomic_set(&dev->watchdog_cnt, 0);
-	setup_timer(&dev->watchdog_timer, s5p_mfc_watchdog,
-		    (unsigned long)dev);
+	timer_setup(&dev->watchdog_timer, s5p_mfc_watchdog, 0);
 
 	ret = v4l2_device_register(&pdev->dev, &dev->v4l2_dev);
 	if (ret)
diff --git a/drivers/media/platform/sti/c8sectpfe/c8sectpfe-core.c b/drivers/media/platform/sti/c8sectpfe/c8sectpfe-core.c
index 59280ac31937..a0acee7671b1 100644
--- a/drivers/media/platform/sti/c8sectpfe/c8sectpfe-core.c
+++ b/drivers/media/platform/sti/c8sectpfe/c8sectpfe-core.c
@@ -61,9 +61,9 @@ static int load_c8sectpfe_fw(struct c8sectpfei *fei);
 
 #define FIFO_LEN 1024
 
-static void c8sectpfe_timer_interrupt(unsigned long ac8sectpfei)
+static void c8sectpfe_timer_interrupt(struct timer_list *t)
 {
-	struct c8sectpfei *fei = (struct c8sectpfei *)ac8sectpfei;
+	struct c8sectpfei *fei = from_timer(fei, t, timer);
 	struct channel_info *channel;
 	int chan_num;
 
@@ -865,8 +865,7 @@ static int c8sectpfe_probe(struct platform_device *pdev)
 	}
 
 	/* Setup timer interrupt */
-	setup_timer(&fei->timer, c8sectpfe_timer_interrupt,
-		    (unsigned long)fei);
+	timer_setup(&fei->timer, c8sectpfe_timer_interrupt, 0);
 
 	mutex_init(&fei->lock);
 
diff --git a/drivers/media/platform/vim2m.c b/drivers/media/platform/vim2m.c
index b01fba020d5f..7bf9fa2f8534 100644
--- a/drivers/media/platform/vim2m.c
+++ b/drivers/media/platform/vim2m.c
@@ -388,9 +388,9 @@ static void device_run(void *priv)
 	schedule_irq(dev, ctx->transtime);
 }
 
-static void device_isr(unsigned long priv)
+static void device_isr(struct timer_list *t)
 {
-	struct vim2m_dev *vim2m_dev = (struct vim2m_dev *)priv;
+	struct vim2m_dev *vim2m_dev = from_timer(vim2m_dev, t, timer);
 	struct vim2m_ctx *curr_ctx;
 	struct vb2_v4l2_buffer *src_vb, *dst_vb;
 	unsigned long flags;
@@ -1024,7 +1024,7 @@ static int vim2m_probe(struct platform_device *pdev)
 	v4l2_info(&dev->v4l2_dev,
 			"Device registered as /dev/video%d\n", vfd->num);
 
-	setup_timer(&dev->timer, device_isr, (long)dev);
+	timer_setup(&dev->timer, device_isr, 0);
 	platform_set_drvdata(pdev, dev);
 
 	dev->m2m_dev = v4l2_m2m_init(&m2m_ops);
diff --git a/drivers/media/usb/au0828/au0828-dvb.c b/drivers/media/usb/au0828/au0828-dvb.c
index d701c04b3783..d9093a3c57c5 100644
--- a/drivers/media/usb/au0828/au0828-dvb.c
+++ b/drivers/media/usb/au0828/au0828-dvb.c
@@ -105,9 +105,9 @@ static struct tda18271_config hauppauge_woodbury_tunerconfig = {
 
 static void au0828_restart_dvb_streaming(struct work_struct *work);
 
-static void au0828_bulk_timeout(unsigned long data)
+static void au0828_bulk_timeout(struct timer_list *t)
 {
-	struct au0828_dev *dev = (struct au0828_dev *) data;
+	struct au0828_dev *dev = from_timer(dev, t, bulk_timeout);
 
 	dprintk(1, "%s called\n", __func__);
 	dev->bulk_timeout_running = 0;
@@ -648,8 +648,7 @@ int au0828_dvb_register(struct au0828_dev *dev)
 		return ret;
 	}
 
-	setup_timer(&dev->bulk_timeout, au0828_bulk_timeout,
-		    (unsigned long)dev);
+	timer_setup(&dev->bulk_timeout, au0828_bulk_timeout, 0);
 
 	return 0;
 }
diff --git a/drivers/media/usb/au0828/au0828-video.c b/drivers/media/usb/au0828/au0828-video.c
index 654f67c25863..a240153821e0 100644
--- a/drivers/media/usb/au0828/au0828-video.c
+++ b/drivers/media/usb/au0828/au0828-video.c
@@ -954,9 +954,9 @@ int au0828_analog_unregister(struct au0828_dev *dev)
 /* This function ensures that video frames continue to be delivered even if
    the ITU-656 input isn't receiving any data (thereby preventing applications
    such as tvtime from hanging) */
-static void au0828_vid_buffer_timeout(unsigned long data)
+static void au0828_vid_buffer_timeout(struct timer_list *t)
 {
-	struct au0828_dev *dev = (struct au0828_dev *) data;
+	struct au0828_dev *dev = from_timer(dev, t, vid_timeout);
 	struct au0828_dmaqueue *dma_q = &dev->vidq;
 	struct au0828_buffer *buf;
 	unsigned char *vid_data;
@@ -978,9 +978,9 @@ static void au0828_vid_buffer_timeout(unsigned long data)
 	spin_unlock_irqrestore(&dev->slock, flags);
 }
 
-static void au0828_vbi_buffer_timeout(unsigned long data)
+static void au0828_vbi_buffer_timeout(struct timer_list *t)
 {
-	struct au0828_dev *dev = (struct au0828_dev *) data;
+	struct au0828_dev *dev = from_timer(dev, t, vbi_timeout);
 	struct au0828_dmaqueue *dma_q = &dev->vbiq;
 	struct au0828_buffer *buf;
 	unsigned char *vbi_data;
@@ -1953,10 +1953,8 @@ int au0828_analog_register(struct au0828_dev *dev,
 	INIT_LIST_HEAD(&dev->vidq.active);
 	INIT_LIST_HEAD(&dev->vbiq.active);
 
-	setup_timer(&dev->vid_timeout, au0828_vid_buffer_timeout,
-		    (unsigned long)dev);
-	setup_timer(&dev->vbi_timeout, au0828_vbi_buffer_timeout,
-		    (unsigned long)dev);
+	timer_setup(&dev->vid_timeout, au0828_vid_buffer_timeout, 0);
+	timer_setup(&dev->vbi_timeout, au0828_vbi_buffer_timeout, 0);
 
 	dev->width = NTSC_STD_W;
 	dev->height = NTSC_STD_H;
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 22de7f5ed032..57b13dfbd21e 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -1492,9 +1492,9 @@ static int msb_ftl_scan(struct msb_data *msb)
 	return 0;
 }
 
-static void msb_cache_flush_timer(unsigned long data)
+static void msb_cache_flush_timer(struct timer_list *t)
 {
-	struct msb_data *msb = (struct msb_data *)data;
+	struct msb_data *msb = from_timer(msb, t, cache_flush_timer);
 	msb->need_flush_cache = true;
 	queue_work(msb->io_queue, &msb->io_work);
 }
@@ -1514,8 +1514,7 @@ static void msb_cache_discard(struct msb_data *msb)
 
 static int msb_cache_init(struct msb_data *msb)
 {
-	setup_timer(&msb->cache_flush_timer, msb_cache_flush_timer,
-		(unsigned long)msb);
+	timer_setup(&msb->cache_flush_timer, msb_cache_flush_timer, 0);
 
 	if (!msb->cache)
 		msb->cache = kzalloc(msb->block_size, GFP_KERNEL);
diff --git a/drivers/mfd/rtsx_usb.c b/drivers/mfd/rtsx_usb.c
index 691dab791f7a..59d61b04c197 100644
--- a/drivers/mfd/rtsx_usb.c
+++ b/drivers/mfd/rtsx_usb.c
@@ -40,9 +40,9 @@ static const struct mfd_cell rtsx_usb_cells[] = {
 	},
 };
 
-static void rtsx_usb_sg_timed_out(unsigned long data)
+static void rtsx_usb_sg_timed_out(struct timer_list *t)
 {
-	struct rtsx_ucr *ucr = (struct rtsx_ucr *)data;
+	struct rtsx_ucr *ucr = from_timer(ucr, t, sg_timer);
 
 	dev_dbg(&ucr->pusb_intf->dev, "%s: sg transfer timed out", __func__);
 	usb_sg_cancel(&ucr->current_sg);
@@ -663,7 +663,7 @@ static int rtsx_usb_probe(struct usb_interface *intf,
 		goto out_init_fail;
 
 	/* initialize USB SG transfer timer */
-	setup_timer(&ucr->sg_timer, rtsx_usb_sg_timed_out, (unsigned long) ucr);
+	timer_setup(&ucr->sg_timer, rtsx_usb_sg_timed_out, 0);
 
 	ret = mfd_add_hotplug_devices(&intf->dev, rtsx_usb_cells,
 				      ARRAY_SIZE(rtsx_usb_cells));
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 35a9e4fd1a9f..64b03d6eaf18 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -160,9 +160,9 @@ out:
 	return err;
 }
 
-static void mmc_retune_timer(unsigned long data)
+static void mmc_retune_timer(struct timer_list *t)
 {
-	struct mmc_host *host = (struct mmc_host *)data;
+	struct mmc_host *host = from_timer(host, t, retune_timer);
 
 	mmc_retune_needed(host);
 }
@@ -389,7 +389,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
 	init_waitqueue_head(&host->wq);
 	INIT_DELAYED_WORK(&host->detect, mmc_rescan);
 	INIT_DELAYED_WORK(&host->sdio_irq_work, sdio_irq_work);
-	setup_timer(&host->retune_timer, mmc_retune_timer, (unsigned long)host);
+	timer_setup(&host->retune_timer, mmc_retune_timer, 0);
 
 	/*
 	 * By default, hosts do not support SGIO or large requests.
diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c
index 3692dd547879..4237c7cebf02 100644
--- a/drivers/mtd/sm_ftl.c
+++ b/drivers/mtd/sm_ftl.c
@@ -989,9 +989,9 @@ restart:
 
 
 /* flush timer, runs a second after last write */
-static void sm_cache_flush_timer(unsigned long data)
+static void sm_cache_flush_timer(struct timer_list *t)
 {
-	struct sm_ftl *ftl = (struct sm_ftl *)data;
+	struct sm_ftl *ftl = from_timer(ftl, t, timer);
 	queue_work(cache_flush_workqueue, &ftl->flush_work);
 }
 
@@ -1139,7 +1139,7 @@ static void sm_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
 
 
 	mutex_init(&ftl->mutex);
-	setup_timer(&ftl->timer, sm_cache_flush_timer, (unsigned long)ftl);
+	timer_setup(&ftl->timer, sm_cache_flush_timer, 0);
 	INIT_WORK(&ftl->flush_work, sm_cache_flush_work);
 	init_completion(&ftl->erase_completion);
 
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
index fed75e75207a..b8029ea03307 100644
--- a/drivers/net/caif/caif_hsi.c
+++ b/drivers/net/caif/caif_hsi.c
@@ -66,9 +66,9 @@ static const struct cfhsi_config  hsi_default_config = {
 
 static LIST_HEAD(cfhsi_list);
 
-static void cfhsi_inactivity_tout(unsigned long arg)
+static void cfhsi_inactivity_tout(struct timer_list *t)
 {
-	struct cfhsi *cfhsi = (struct cfhsi *)arg;
+	struct cfhsi *cfhsi = from_timer(cfhsi, t, inactivity_timer);
 
 	netdev_dbg(cfhsi->ndev, "%s.\n",
 		__func__);
@@ -737,9 +737,9 @@ out_of_sync:
 	schedule_work(&cfhsi->out_of_sync_work);
 }
 
-static void cfhsi_rx_slowpath(unsigned long arg)
+static void cfhsi_rx_slowpath(struct timer_list *t)
 {
-	struct cfhsi *cfhsi = (struct cfhsi *)arg;
+	struct cfhsi *cfhsi = from_timer(cfhsi, t, rx_slowpath_timer);
 
 	netdev_dbg(cfhsi->ndev, "%s.\n",
 		__func__);
@@ -997,9 +997,9 @@ static void cfhsi_wake_down_cb(struct cfhsi_cb_ops *cb_ops)
 	wake_up_interruptible(&cfhsi->wake_down_wait);
 }
 
-static void cfhsi_aggregation_tout(unsigned long arg)
+static void cfhsi_aggregation_tout(struct timer_list *t)
 {
-	struct cfhsi *cfhsi = (struct cfhsi *)arg;
+	struct cfhsi *cfhsi = from_timer(cfhsi, t, aggregation_timer);
 
 	netdev_dbg(cfhsi->ndev, "%s.\n",
 		__func__);
@@ -1211,14 +1211,11 @@ static int cfhsi_open(struct net_device *ndev)
 	init_waitqueue_head(&cfhsi->flush_fifo_wait);
 
 	/* Setup the inactivity timer. */
-	setup_timer(&cfhsi->inactivity_timer, cfhsi_inactivity_tout,
-		    (unsigned long)cfhsi);
+	timer_setup(&cfhsi->inactivity_timer, cfhsi_inactivity_tout, 0);
 	/* Setup the slowpath RX timer. */
-	setup_timer(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath,
-		    (unsigned long)cfhsi);
+	timer_setup(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath, 0);
 	/* Setup the aggregation timer. */
-	setup_timer(&cfhsi->aggregation_timer, cfhsi_aggregation_tout,
-		    (unsigned long)cfhsi);
+	timer_setup(&cfhsi->aggregation_timer, cfhsi_aggregation_tout, 0);
 
 	/* Activate HSI interface. */
 	res = cfhsi->ops->cfhsi_up(cfhsi->ops);
diff --git a/drivers/net/dsa/mv88e6xxx/phy.c b/drivers/net/dsa/mv88e6xxx/phy.c
index 436668bd50dc..46af8052e535 100644
--- a/drivers/net/dsa/mv88e6xxx/phy.c
+++ b/drivers/net/dsa/mv88e6xxx/phy.c
@@ -149,9 +149,9 @@ static void mv88e6xxx_phy_ppu_reenable_work(struct work_struct *ugly)
 	mutex_unlock(&chip->reg_lock);
 }
 
-static void mv88e6xxx_phy_ppu_reenable_timer(unsigned long _ps)
+static void mv88e6xxx_phy_ppu_reenable_timer(struct timer_list *t)
 {
-	struct mv88e6xxx_chip *chip = (void *)_ps;
+	struct mv88e6xxx_chip *chip = from_timer(chip, t, ppu_timer);
 
 	schedule_work(&chip->ppu_work);
 }
@@ -193,8 +193,7 @@ static void mv88e6xxx_phy_ppu_state_init(struct mv88e6xxx_chip *chip)
 {
 	mutex_init(&chip->ppu_mutex);
 	INIT_WORK(&chip->ppu_work, mv88e6xxx_phy_ppu_reenable_work);
-	setup_timer(&chip->ppu_timer, mv88e6xxx_phy_ppu_reenable_timer,
-		    (unsigned long)chip);
+	timer_setup(&chip->ppu_timer, mv88e6xxx_phy_ppu_reenable_timer, 0);
 }
 
 static void mv88e6xxx_phy_ppu_state_destroy(struct mv88e6xxx_chip *chip)
diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index fccce4b47778..74263f8efe1a 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -139,9 +139,9 @@ static netdev_tx_t eql_slave_xmit(struct sk_buff *skb, struct net_device *dev);
 
 static void eql_kill_one_slave(slave_queue_t *queue, slave_t *slave);
 
-static void eql_timer(unsigned long param)
+static void eql_timer(struct timer_list *t)
 {
-	equalizer_t *eql = (equalizer_t *) param;
+	equalizer_t *eql = from_timer(eql, t, timer);
 	struct list_head *this, *tmp, *head;
 
 	spin_lock(&eql->queue.lock);
@@ -178,7 +178,7 @@ static void __init eql_setup(struct net_device *dev)
 {
 	equalizer_t *eql = netdev_priv(dev);
 
-	setup_timer(&eql->timer, eql_timer, (unsigned long)eql);
+	timer_setup(&eql->timer, eql_timer, 0);
 	eql->timer.expires  	= jiffies + EQL_DEFAULT_RESCHED_IVAL;
 
 	spin_lock_init(&eql->queue.lock);
diff --git a/drivers/net/ethernet/adi/bfin_mac.c b/drivers/net/ethernet/adi/bfin_mac.c
index 0658cde1586a..7120f2b9c6ef 100644
--- a/drivers/net/ethernet/adi/bfin_mac.c
+++ b/drivers/net/ethernet/adi/bfin_mac.c
@@ -1092,9 +1092,11 @@ static void tx_reclaim_skb(struct bfin_mac_local *lp)
 	return;
 }
 
-static void tx_reclaim_skb_timeout(unsigned long lp)
+static void tx_reclaim_skb_timeout(struct timer_list *t)
 {
-	tx_reclaim_skb((struct bfin_mac_local *)lp);
+	struct bfin_mac_local *lp = from_timer(lp, t, tx_reclaim_timer);
+
+	tx_reclaim_skb(lp);
 }
 
 static int bfin_mac_hard_start_xmit(struct sk_buff *skb,
@@ -1650,8 +1652,7 @@ static int bfin_mac_probe(struct platform_device *pdev)
 	ndev->netdev_ops = &bfin_mac_netdev_ops;
 	ndev->ethtool_ops = &bfin_mac_ethtool_ops;
 
-	setup_timer(&lp->tx_reclaim_timer, tx_reclaim_skb_timeout,
-		    (unsigned long)lp);
+	timer_setup(&lp->tx_reclaim_timer, tx_reclaim_skb_timeout, 0);
 
 	lp->flags = 0;
 	netif_napi_add(ndev, &lp->napi, bfin_mac_poll, CONFIG_BFIN_RX_DESC_NUM);
diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c
index 658e92f79d36..48220b6c600d 100644
--- a/drivers/net/ethernet/agere/et131x.c
+++ b/drivers/net/ethernet/agere/et131x.c
@@ -3080,9 +3080,9 @@ err_out:
  * The routine called when the error timer expires, to track the number of
  * recurring errors.
  */
-static void et131x_error_timer_handler(unsigned long data)
+static void et131x_error_timer_handler(struct timer_list *t)
 {
-	struct et131x_adapter *adapter = (struct et131x_adapter *)data;
+	struct et131x_adapter *adapter = from_timer(adapter, t, error_timer);
 	struct phy_device *phydev = adapter->netdev->phydev;
 
 	if (et1310_in_phy_coma(adapter)) {
@@ -3624,8 +3624,7 @@ static int et131x_open(struct net_device *netdev)
 	int result;
 
 	/* Start the timer to track NIC errors */
-	setup_timer(&adapter->error_timer, et131x_error_timer_handler,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->error_timer, et131x_error_timer_handler, 0);
 	adapter->error_timer.expires = jiffies +
 		msecs_to_jiffies(TX_ERROR_PERIOD);
 	add_timer(&adapter->error_timer);
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 1c1ddd891ca3..97c5a89a9cf7 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2859,9 +2859,9 @@ static void ena_update_host_info(struct ena_admin_host_info *host_info,
 		(netdev->features & GENMASK_ULL(63, 32)) >> 32;
 }
 
-static void ena_timer_service(unsigned long data)
+static void ena_timer_service(struct timer_list *t)
 {
-	struct ena_adapter *adapter = (struct ena_adapter *)data;
+	struct ena_adapter *adapter = from_timer(adapter, t, timer_service);
 	u8 *debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr;
 	struct ena_admin_host_info *host_info =
 		adapter->ena_dev->host_attr.host_info;
@@ -3278,8 +3278,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	ena_update_hints(adapter, &get_feat_ctx.hw_hints);
 
-	setup_timer(&adapter->timer_service, ena_timer_service,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->timer_service, ena_timer_service, 0);
 	mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ));
 
 	dev_info(&pdev->dev, "%s found at mem %lx, mac addr %pM Queues %d\n",
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 483e97691eea..78dfb2ab78ce 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -163,9 +163,9 @@ static int aq_nic_update_link_status(struct aq_nic_s *self)
 	return 0;
 }
 
-static void aq_nic_service_timer_cb(unsigned long param)
+static void aq_nic_service_timer_cb(struct timer_list *t)
 {
-	struct aq_nic_s *self = (struct aq_nic_s *)param;
+	struct aq_nic_s *self = from_timer(self, t, service_timer);
 	struct net_device *ndev = aq_nic_get_ndev(self);
 	int err = 0;
 	unsigned int i = 0U;
@@ -201,9 +201,9 @@ err_exit:
 		  jiffies + AQ_CFG_SERVICE_TIMER_INTERVAL);
 }
 
-static void aq_nic_polling_timer_cb(unsigned long param)
+static void aq_nic_polling_timer_cb(struct timer_list *t)
 {
-	struct aq_nic_s *self = (struct aq_nic_s *)param;
+	struct aq_nic_s *self = from_timer(self, t, polling_timer);
 	struct aq_vec_s *aq_vec = NULL;
 	unsigned int i = 0U;
 
@@ -440,14 +440,12 @@ int aq_nic_start(struct aq_nic_s *self)
 	err = aq_nic_update_interrupt_moderation_settings(self);
 	if (err)
 		goto err_exit;
-	setup_timer(&self->service_timer, &aq_nic_service_timer_cb,
-		    (unsigned long)self);
+	timer_setup(&self->service_timer, aq_nic_service_timer_cb, 0);
 	mod_timer(&self->service_timer, jiffies +
 			AQ_CFG_SERVICE_TIMER_INTERVAL);
 
 	if (self->aq_nic_cfg.is_polling) {
-		setup_timer(&self->polling_timer, &aq_nic_polling_timer_cb,
-			    (unsigned long)self);
+		timer_setup(&self->polling_timer, aq_nic_polling_timer_cb, 0);
 		mod_timer(&self->polling_timer, jiffies +
 			  AQ_CFG_POLLING_TIMER_INTERVAL);
 	} else {
diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 8c9986f3fc01..94270f654b3b 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -222,9 +222,10 @@ static u32 atl1c_wait_until_idle(struct atl1c_hw *hw, u32 modu_ctrl)
  * atl1c_phy_config - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl1c_phy_config(unsigned long data)
+static void atl1c_phy_config(struct timer_list *t)
 {
-	struct atl1c_adapter *adapter = (struct atl1c_adapter *) data;
+	struct atl1c_adapter *adapter = from_timer(adapter, t,
+						   phy_config_timer);
 	struct atl1c_hw *hw = &adapter->hw;
 	unsigned long flags;
 
@@ -2613,8 +2614,7 @@ static int atl1c_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	adapter->mii.phy_id_mask = 0x1f;
 	adapter->mii.reg_num_mask = MDIO_CTRL_REG_MASK;
 	netif_napi_add(netdev, &adapter->napi, atl1c_clean, 64);
-	setup_timer(&adapter->phy_config_timer, atl1c_phy_config,
-			(unsigned long)adapter);
+	timer_setup(&adapter->phy_config_timer, atl1c_phy_config, 0);
 	/* setup the private structure */
 	err = atl1c_sw_init(adapter);
 	if (err) {
diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
index 4f7e195af0bc..9dc6da039a6d 100644
--- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
+++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
@@ -130,9 +130,10 @@ static inline void atl1e_irq_reset(struct atl1e_adapter *adapter)
  * atl1e_phy_config - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl1e_phy_config(unsigned long data)
+static void atl1e_phy_config(struct timer_list *t)
 {
-	struct atl1e_adapter *adapter = (struct atl1e_adapter *) data;
+	struct atl1e_adapter *adapter = from_timer(adapter, t,
+						   phy_config_timer);
 	struct atl1e_hw *hw = &adapter->hw;
 	unsigned long flags;
 
@@ -2361,8 +2362,7 @@ static int atl1e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	netif_napi_add(netdev, &adapter->napi, atl1e_clean, 64);
 
-	setup_timer(&adapter->phy_config_timer, atl1e_phy_config,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->phy_config_timer, atl1e_phy_config, 0);
 
 	/* get user settings */
 	atl1e_check_options(adapter);
diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c
index 83d2db2abb45..b81fbf119bce 100644
--- a/drivers/net/ethernet/atheros/atlx/atl1.c
+++ b/drivers/net/ethernet/atheros/atlx/atl1.c
@@ -2575,9 +2575,10 @@ static irqreturn_t atl1_intr(int irq, void *data)
  * atl1_phy_config - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl1_phy_config(unsigned long data)
+static void atl1_phy_config(struct timer_list *t)
 {
-	struct atl1_adapter *adapter = (struct atl1_adapter *)data;
+	struct atl1_adapter *adapter = from_timer(adapter, t,
+						  phy_config_timer);
 	struct atl1_hw *hw = &adapter->hw;
 	unsigned long flags;
 
@@ -3071,8 +3072,7 @@ static int atl1_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* assume we have no link for now */
 	netif_carrier_off(netdev);
 
-	setup_timer(&adapter->phy_config_timer, atl1_phy_config,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->phy_config_timer, atl1_phy_config, 0);
 	adapter->phy_timer_pending = false;
 
 	INIT_WORK(&adapter->reset_dev_task, atl1_reset_dev_task);
diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c
index 77a1c03255de..db4bcc51023a 100644
--- a/drivers/net/ethernet/atheros/atlx/atl2.c
+++ b/drivers/net/ethernet/atheros/atlx/atl2.c
@@ -1028,9 +1028,9 @@ static void atl2_tx_timeout(struct net_device *netdev)
  * atl2_watchdog - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl2_watchdog(unsigned long data)
+static void atl2_watchdog(struct timer_list *t)
 {
-	struct atl2_adapter *adapter = (struct atl2_adapter *) data;
+	struct atl2_adapter *adapter = from_timer(adapter, t, watchdog_timer);
 
 	if (!test_bit(__ATL2_DOWN, &adapter->flags)) {
 		u32 drop_rxd, drop_rxs;
@@ -1053,9 +1053,10 @@ static void atl2_watchdog(unsigned long data)
  * atl2_phy_config - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl2_phy_config(unsigned long data)
+static void atl2_phy_config(struct timer_list *t)
 {
-	struct atl2_adapter *adapter = (struct atl2_adapter *) data;
+	struct atl2_adapter *adapter = from_timer(adapter, t,
+						  phy_config_timer);
 	struct atl2_hw *hw = &adapter->hw;
 	unsigned long flags;
 
@@ -1434,11 +1435,9 @@ static int atl2_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	atl2_check_options(adapter);
 
-	setup_timer(&adapter->watchdog_timer, atl2_watchdog,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->watchdog_timer, atl2_watchdog, 0);
 
-	setup_timer(&adapter->phy_config_timer, atl2_phy_config,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->phy_config_timer, atl2_phy_config, 0);
 
 	INIT_WORK(&adapter->reset_task, atl2_reset_task);
 	INIT_WORK(&adapter->link_chg_task, atl2_link_chg_task);
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index 42e44fc03a18..e445ab724827 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -599,9 +599,9 @@ static void b44_check_phy(struct b44 *bp)
 	}
 }
 
-static void b44_timer(unsigned long __opaque)
+static void b44_timer(struct timer_list *t)
 {
-	struct b44 *bp = (struct b44 *) __opaque;
+	struct b44 *bp = from_timer(bp, t, timer);
 
 	spin_lock_irq(&bp->lock);
 
@@ -1474,7 +1474,7 @@ static int b44_open(struct net_device *dev)
 		goto out;
 	}
 
-	setup_timer(&bp->timer, b44_timer, (unsigned long)bp);
+	timer_setup(&bp->timer, b44_timer, 0);
 	bp->timer.expires = jiffies + HZ;
 	add_timer(&bp->timer);
 
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index b3055a76dfbf..7919f6112ecf 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6183,9 +6183,9 @@ bnx2_5708_serdes_timer(struct bnx2 *bp)
 }
 
 static void
-bnx2_timer(unsigned long data)
+bnx2_timer(struct timer_list *t)
 {
-	struct bnx2 *bp = (struct bnx2 *) data;
+	struct bnx2 *bp = from_timer(bp, t, timer);
 
 	if (!netif_running(bp->dev))
 		return;
@@ -8462,7 +8462,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 	bnx2_set_default_link(bp);
 	bp->req_flow_ctrl = FLOW_CTRL_RX | FLOW_CTRL_TX;
 
-	setup_timer(&bp->timer, bnx2_timer, (unsigned long)bp);
+	timer_setup(&bp->timer, bnx2_timer, 0);
 	bp->timer.expires = RUN_AT(BNX2_TIMER_INTERVAL);
 
 #ifdef BCM_CNIC
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index be9fd7d184d0..91e2a7560b48 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -5761,9 +5761,9 @@ void bnx2x_drv_pulse(struct bnx2x *bp)
 		 bp->fw_drv_pulse_wr_seq);
 }
 
-static void bnx2x_timer(unsigned long data)
+static void bnx2x_timer(struct timer_list *t)
 {
-	struct bnx2x *bp = (struct bnx2x *) data;
+	struct bnx2x *bp = from_timer(bp, t, timer);
 
 	if (!netif_running(bp->dev))
 		return;
@@ -12421,7 +12421,7 @@ static int bnx2x_init_bp(struct bnx2x *bp)
 
 	bp->current_interval = CHIP_REV_IS_SLOW(bp) ? 5*HZ : HZ;
 
-	setup_timer(&bp->timer, bnx2x_timer, (unsigned long)bp);
+	timer_setup(&bp->timer, bnx2x_timer, 0);
 	bp->timer.expires = jiffies + bp->current_interval;
 
 	if (SHMEM2_HAS(bp, dcbx_lldp_params_offset) &&
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 33c49ad697e4..c5c38d4b7d1c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6962,9 +6962,9 @@ static void bnxt_poll_controller(struct net_device *dev)
 }
 #endif
 
-static void bnxt_timer(unsigned long data)
+static void bnxt_timer(struct timer_list *t)
 {
-	struct bnxt *bp = (struct bnxt *)data;
+	struct bnxt *bp = from_timer(bp, t, timer);
 	struct net_device *dev = bp->dev;
 
 	if (!netif_running(dev))
@@ -7236,7 +7236,7 @@ static int bnxt_init_board(struct pci_dev *pdev, struct net_device *dev)
 
 	bnxt_init_dflt_coal(bp);
 
-	setup_timer(&bp->timer, bnxt_timer, (unsigned long)bp);
+	timer_setup(&bp->timer, bnxt_timer, 0);
 	bp->current_interval = BNXT_TIMER_INTERVAL;
 
 	clear_bit(BNXT_STATE_OPEN, &bp->state);
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index d8d5f207c759..de51c2177d03 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -10931,9 +10931,9 @@ static void tg3_chk_missed_msi(struct tg3 *tp)
 	}
 }
 
-static void tg3_timer(unsigned long __opaque)
+static void tg3_timer(struct timer_list *t)
 {
-	struct tg3 *tp = (struct tg3 *) __opaque;
+	struct tg3 *tp = from_timer(tp, t, timer);
 
 	spin_lock(&tp->lock);
 
@@ -11087,7 +11087,7 @@ static void tg3_timer_init(struct tg3 *tp)
 	tp->asf_multiplier = (HZ / tp->timer_offset) *
 			     TG3_FW_UPDATE_FREQ_SEC;
 
-	setup_timer(&tp->timer, tg3_timer, (unsigned long)tp);
+	timer_setup(&tp->timer, tg3_timer, 0);
 }
 
 static void tg3_timer_start(struct tg3 *tp)
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 4a11baffe02d..e130fb757e7b 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -1676,9 +1676,9 @@ static int enic_poll_msix_rq(struct napi_struct *napi, int budget)
 	return work_done;
 }
 
-static void enic_notify_timer(unsigned long data)
+static void enic_notify_timer(struct timer_list *t)
 {
-	struct enic *enic = (struct enic *)data;
+	struct enic *enic = from_timer(enic, t, notify_timer);
 
 	enic_notify_check(enic);
 
@@ -2846,8 +2846,7 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Setup notification timer, HW reset task, and wq locks
 	 */
 
-	setup_timer(&enic->notify_timer, enic_notify_timer,
-		    (unsigned long)enic);
+	timer_setup(&enic->notify_timer, enic_notify_timer, 0);
 
 	enic_set_rx_coal_setting(enic);
 	INIT_WORK(&enic->reset, enic_reset);
diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 81c1fac00d33..62f204f32316 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -1346,9 +1346,9 @@ static void mib_counters_update(struct mv643xx_eth_private *mp)
 	spin_unlock_bh(&mp->mib_counters_lock);
 }
 
-static void mib_counters_timer_wrapper(unsigned long _mp)
+static void mib_counters_timer_wrapper(struct timer_list *t)
 {
-	struct mv643xx_eth_private *mp = (void *)_mp;
+	struct mv643xx_eth_private *mp = from_timer(mp, t, mib_counters_timer);
 	mib_counters_update(mp);
 	mod_timer(&mp->mib_counters_timer, jiffies + 30 * HZ);
 }
@@ -2321,9 +2321,9 @@ static int mv643xx_eth_poll(struct napi_struct *napi, int budget)
 	return work_done;
 }
 
-static inline void oom_timer_wrapper(unsigned long data)
+static inline void oom_timer_wrapper(struct timer_list *t)
 {
-	struct mv643xx_eth_private *mp = (void *)data;
+	struct mv643xx_eth_private *mp = from_timer(mp, t, rx_oom);
 
 	napi_schedule(&mp->napi);
 }
@@ -3178,8 +3178,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 
 	mib_counters_clear(mp);
 
-	setup_timer(&mp->mib_counters_timer, mib_counters_timer_wrapper,
-		    (unsigned long)mp);
+	timer_setup(&mp->mib_counters_timer, mib_counters_timer_wrapper, 0);
 	mp->mib_counters_timer.expires = jiffies + 30 * HZ;
 
 	spin_lock_init(&mp->mib_counters_lock);
@@ -3188,7 +3187,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 
 	netif_napi_add(dev, &mp->napi, mv643xx_eth_poll, NAPI_POLL_WEIGHT);
 
-	setup_timer(&mp->rx_oom, oom_timer_wrapper, (unsigned long)mp);
+	timer_setup(&mp->rx_oom, oom_timer_wrapper, 0);
 
 
 	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c
index 91b1c154fd29..7bbd86f08e5f 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c
@@ -362,9 +362,9 @@ static void rxq_refill(struct net_device *dev)
 	}
 }
 
-static inline void rxq_refill_timer_wrapper(unsigned long data)
+static inline void rxq_refill_timer_wrapper(struct timer_list *t)
 {
-	struct pxa168_eth_private *pep = (void *)data;
+	struct pxa168_eth_private *pep = from_timer(pep, t, timeout);
 	napi_schedule(&pep->napi);
 }
 
@@ -1496,8 +1496,7 @@ static int pxa168_eth_probe(struct platform_device *pdev)
 	netif_napi_add(dev, &pep->napi, pxa168_rx_poll, pep->rx_ring_size);
 
 	memset(&pep->timeout, 0, sizeof(struct timer_list));
-	setup_timer(&pep->timeout, rxq_refill_timer_wrapper,
-		    (unsigned long)pep);
+	timer_setup(&pep->timeout, rxq_refill_timer_wrapper, 0);
 
 	pep->smi_bus = mdiobus_alloc();
 	if (!pep->smi_bus) {
diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index eef35bf3e849..6e423f098a60 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -1495,9 +1495,9 @@ static int xm_check_link(struct net_device *dev)
  * get an interrupt when carrier is detected, need to poll for
  * link coming up.
  */
-static void xm_link_timer(unsigned long arg)
+static void xm_link_timer(struct timer_list *t)
 {
-	struct skge_port *skge = (struct skge_port *) arg;
+	struct skge_port *skge = from_timer(skge, t, link_timer);
 	struct net_device *dev = skge->netdev;
 	struct skge_hw *hw = skge->hw;
 	int port = skge->port;
@@ -3897,7 +3897,7 @@ static struct net_device *skge_devinit(struct skge_hw *hw, int port,
 
 	/* Only used for Genesis XMAC */
 	if (is_genesis(hw))
-	    setup_timer(&skge->link_timer, xm_link_timer, (unsigned long) skge);
+	    timer_setup(&skge->link_timer, xm_link_timer, 0);
 	else {
 		dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_SG |
 		                   NETIF_F_RXCSUM;
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 1145cde2274a..9efe1771423c 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -2974,9 +2974,9 @@ static int sky2_rx_hung(struct net_device *dev)
 	}
 }
 
-static void sky2_watchdog(unsigned long arg)
+static void sky2_watchdog(struct timer_list *t)
 {
-	struct sky2_hw *hw = (struct sky2_hw *) arg;
+	struct sky2_hw *hw = from_timer(hw, t, watchdog_timer);
 
 	/* Check for lost IRQ once a second */
 	if (sky2_read32(hw, B0_ISRC)) {
@@ -5083,7 +5083,7 @@ static int sky2_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		sky2_show_addr(dev1);
 	}
 
-	setup_timer(&hw->watchdog_timer, sky2_watchdog, (unsigned long) hw);
+	timer_setup(&hw->watchdog_timer, sky2_watchdog, 0);
 	INIT_WORK(&hw->restart_work, sky2_restart);
 
 	pci_set_drvdata(pdev, hw);
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index b171ed2015fe..2521c8c40015 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -3501,7 +3501,7 @@ static void myri10ge_watchdog(struct work_struct *work)
  * cannot detect a NIC with a parity error in a timely fashion if the
  * NIC is lightly loaded.
  */
-static void myri10ge_watchdog_timer(unsigned long arg)
+static void myri10ge_watchdog_timer(struct timer_list *t)
 {
 	struct myri10ge_priv *mgp;
 	struct myri10ge_slice_state *ss;
@@ -3509,7 +3509,7 @@ static void myri10ge_watchdog_timer(unsigned long arg)
 	u32 rx_pause_cnt;
 	u16 cmd;
 
-	mgp = (struct myri10ge_priv *)arg;
+	mgp = from_timer(mgp, t, watchdog_timer);
 
 	rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
 	busy_slice_cnt = 0;
@@ -3930,8 +3930,7 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	pci_save_state(pdev);
 
 	/* Setup the watchdog timer */
-	setup_timer(&mgp->watchdog_timer, myri10ge_watchdog_timer,
-		    (unsigned long)mgp);
+	timer_setup(&mgp->watchdog_timer, myri10ge_watchdog_timer, 0);
 
 	netdev->ethtool_ops = &myri10ge_ethtool_ops;
 	INIT_WORK(&mgp->watchdog_work, myri10ge_watchdog);
diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index 457ee80307ea..40e52ffb732f 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -1089,9 +1089,10 @@ static void pch_gbe_set_mode(struct pch_gbe_adapter *adapter, u16 speed,
  * pch_gbe_watchdog - Watchdog process
  * @data:  Board private structure
  */
-static void pch_gbe_watchdog(unsigned long data)
+static void pch_gbe_watchdog(struct timer_list *t)
 {
-	struct pch_gbe_adapter *adapter = (struct pch_gbe_adapter *)data;
+	struct pch_gbe_adapter *adapter = from_timer(adapter, t,
+						     watchdog_timer);
 	struct net_device *netdev = adapter->netdev;
 	struct pch_gbe_hw *hw = &adapter->hw;
 
@@ -2644,8 +2645,7 @@ static int pch_gbe_probe(struct pci_dev *pdev,
 		dev_err(&pdev->dev, "Invalid MAC address, "
 		                    "interface disabled.\n");
 	}
-	setup_timer(&adapter->watchdog_timer, pch_gbe_watchdog,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->watchdog_timer, pch_gbe_watchdog, 0);
 
 	INIT_WORK(&adapter->reset_task, pch_gbe_reset_task);
 
diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c
index 49591d9c2e1b..c9a55b774935 100644
--- a/drivers/net/ethernet/pasemi/pasemi_mac.c
+++ b/drivers/net/ethernet/pasemi/pasemi_mac.c
@@ -943,9 +943,9 @@ static irqreturn_t pasemi_mac_rx_intr(int irq, void *data)
 
 #define TX_CLEAN_INTERVAL HZ
 
-static void pasemi_mac_tx_timer(unsigned long data)
+static void pasemi_mac_tx_timer(struct timer_list *t)
 {
-	struct pasemi_mac_txring *txring = (struct pasemi_mac_txring *)data;
+	struct pasemi_mac_txring *txring = from_timer(txring, t, clean_timer);
 	struct pasemi_mac *mac = txring->mac;
 
 	pasemi_mac_clean_tx(txring);
@@ -1199,8 +1199,7 @@ static int pasemi_mac_open(struct net_device *dev)
 	if (dev->phydev)
 		phy_start(dev->phydev);
 
-	setup_timer(&mac->tx->clean_timer, pasemi_mac_tx_timer,
-		    (unsigned long)mac->tx);
+	timer_setup(&mac->tx->clean_timer, pasemi_mac_tx_timer, 0);
 	mod_timer(&mac->tx->clean_timer, jiffies + HZ);
 
 	return 0;
diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c
index 05479d435469..9e5264d8773b 100644
--- a/drivers/net/ethernet/qlogic/qla3xxx.c
+++ b/drivers/net/ethernet/qlogic/qla3xxx.c
@@ -3749,9 +3749,9 @@ static void ql_get_board_info(struct ql3_adapter *qdev)
 	qdev->pci_slot = (u8) PCI_SLOT(qdev->pdev->devfn);
 }
 
-static void ql3xxx_timer(unsigned long ptr)
+static void ql3xxx_timer(struct timer_list *t)
 {
-	struct ql3_adapter *qdev = (struct ql3_adapter *)ptr;
+	struct ql3_adapter *qdev = from_timer(qdev, t, adapter_timer);
 	queue_delayed_work(qdev->workqueue, &qdev->link_state_work, 0);
 }
 
@@ -3891,7 +3891,7 @@ static int ql3xxx_probe(struct pci_dev *pdev,
 	INIT_DELAYED_WORK(&qdev->tx_timeout_work, ql_tx_timeout_work);
 	INIT_DELAYED_WORK(&qdev->link_state_work, ql_link_state_machine_work);
 
-	setup_timer(&qdev->adapter_timer, ql3xxx_timer, (unsigned long)qdev);
+	timer_setup(&qdev->adapter_timer, ql3xxx_timer, 0);
 	qdev->adapter_timer.expires = jiffies + HZ * 2;	/* two second delay */
 
 	if (!cards_found) {
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 0653b70723a3..6d6fb8cf3e7c 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -1983,9 +1983,9 @@ err_out:
 	return err;
 }
 
-static void ofdpa_fdb_cleanup(unsigned long data)
+static void ofdpa_fdb_cleanup(struct timer_list *t)
 {
-	struct ofdpa *ofdpa = (struct ofdpa *)data;
+	struct ofdpa *ofdpa = from_timer(ofdpa, t, fdb_cleanup_timer);
 	struct ofdpa_port *ofdpa_port;
 	struct ofdpa_fdb_tbl_entry *entry;
 	struct hlist_node *tmp;
@@ -2368,8 +2368,7 @@ static int ofdpa_init(struct rocker *rocker)
 	hash_init(ofdpa->neigh_tbl);
 	spin_lock_init(&ofdpa->neigh_tbl_lock);
 
-	setup_timer(&ofdpa->fdb_cleanup_timer, ofdpa_fdb_cleanup,
-		    (unsigned long) ofdpa);
+	timer_setup(&ofdpa->fdb_cleanup_timer, ofdpa_fdb_cleanup, 0);
 	mod_timer(&ofdpa->fdb_cleanup_timer, jiffies);
 
 	ofdpa->ageing_time = BR_DEFAULT_AGEING_TIME;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ff4fb5eae1af..f63c2ddced3c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -345,9 +345,9 @@ void stmmac_disable_eee_mode(struct stmmac_priv *priv)
  *  if there is no data transfer and if we are not in LPI state,
  *  then MAC Transmitter can be moved to LPI state.
  */
-static void stmmac_eee_ctrl_timer(unsigned long arg)
+static void stmmac_eee_ctrl_timer(struct timer_list *t)
 {
-	struct stmmac_priv *priv = (struct stmmac_priv *)arg;
+	struct stmmac_priv *priv = from_timer(priv, t, eee_ctrl_timer);
 
 	stmmac_enable_eee_mode(priv);
 	mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(eee_timer));
@@ -401,9 +401,8 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 		spin_lock_irqsave(&priv->lock, flags);
 		if (!priv->eee_active) {
 			priv->eee_active = 1;
-			setup_timer(&priv->eee_ctrl_timer,
-				    stmmac_eee_ctrl_timer,
-				    (unsigned long)priv);
+			timer_setup(&priv->eee_ctrl_timer,
+				    stmmac_eee_ctrl_timer, 0);
 			mod_timer(&priv->eee_ctrl_timer,
 				  STMMAC_LPI_T(eee_timer));
 
@@ -2221,9 +2220,9 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
  * Description:
  * This is the timer handler to directly invoke the stmmac_tx_clean.
  */
-static void stmmac_tx_timer(unsigned long data)
+static void stmmac_tx_timer(struct timer_list *t)
 {
-	struct stmmac_priv *priv = (struct stmmac_priv *)data;
+	struct stmmac_priv *priv = from_timer(priv, t, txtimer);
 	u32 tx_queues_count = priv->plat->tx_queues_to_use;
 	u32 queue;
 
@@ -2244,7 +2243,7 @@ static void stmmac_init_tx_coalesce(struct stmmac_priv *priv)
 {
 	priv->tx_coal_frames = STMMAC_TX_FRAMES;
 	priv->tx_coal_timer = STMMAC_COAL_TX_TIMER;
-	setup_timer(&priv->txtimer, stmmac_tx_timer, (unsigned long)priv);
+	timer_setup(&priv->txtimer, stmmac_tx_timer, 0);
 	priv->txtimer.expires = STMMAC_COAL_TIMER(priv->tx_coal_timer);
 	add_timer(&priv->txtimer);
 }
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
index e1b55b8fb8e0..1f8e9601592a 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
@@ -358,9 +358,9 @@ static irqreturn_t xlgmac_dma_isr(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static void xlgmac_tx_timer(unsigned long data)
+static void xlgmac_tx_timer(struct timer_list *t)
 {
-	struct xlgmac_channel *channel = (struct xlgmac_channel *)data;
+	struct xlgmac_channel *channel = from_timer(channel, t, tx_timer);
 	struct xlgmac_pdata *pdata = channel->pdata;
 	struct napi_struct *napi;
 
@@ -391,8 +391,7 @@ static void xlgmac_init_timers(struct xlgmac_pdata *pdata)
 		if (!channel->tx_ring)
 			break;
 
-		setup_timer(&channel->tx_timer, xlgmac_tx_timer,
-			    (unsigned long)channel);
+		timer_setup(&channel->tx_timer, xlgmac_tx_timer, 0);
 	}
 }
 
diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index cd1185e66133..b432a75fb874 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -765,9 +765,9 @@ int cpsw_ale_control_get(struct cpsw_ale *ale, int port, int control)
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_control_get);
 
-static void cpsw_ale_timer(unsigned long arg)
+static void cpsw_ale_timer(struct timer_list *t)
 {
-	struct cpsw_ale *ale = (struct cpsw_ale *)arg;
+	struct cpsw_ale *ale = from_timer(ale, t, timer);
 
 	cpsw_ale_control_set(ale, 0, ALE_AGEOUT, 1);
 
@@ -859,7 +859,7 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 	cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1);
 	cpsw_ale_control_set(ale, 0, ALE_CLEAR, 1);
 
-	setup_timer(&ale->timer, cpsw_ale_timer, (unsigned long)ale);
+	timer_setup(&ale->timer, cpsw_ale_timer, 0);
 	if (ale->ageout) {
 		ale->timer.expires = jiffies + ale->ageout;
 		add_timer(&ale->timer);
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 4ad821655e51..e831c49713ee 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -2745,9 +2745,9 @@ static int gbe_ioctl(void *intf_priv, struct ifreq *req, int cmd)
 	return -EOPNOTSUPP;
 }
 
-static void netcp_ethss_timer(unsigned long arg)
+static void netcp_ethss_timer(struct timer_list *t)
 {
-	struct gbe_priv *gbe_dev = (struct gbe_priv *)arg;
+	struct gbe_priv *gbe_dev = from_timer(gbe_dev, t, timer);
 	struct gbe_intf *gbe_intf;
 	struct gbe_slave *slave;
 
@@ -3616,8 +3616,7 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev,
 	}
 	spin_unlock_bh(&gbe_dev->hw_stats_lock);
 
-	setup_timer(&gbe_dev->timer, netcp_ethss_timer,
-		    (unsigned long)gbe_dev);
+	timer_setup(&gbe_dev->timer, netcp_ethss_timer, 0);
 	gbe_dev->timer.expires	 = jiffies + GBE_TIMER_INTERVAL;
 	add_timer(&gbe_dev->timer);
 	*inst_priv = gbe_dev;
diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c
index a913538d3213..d925b8203996 100644
--- a/drivers/net/ethernet/toshiba/spider_net.c
+++ b/drivers/net/ethernet/toshiba/spider_net.c
@@ -912,8 +912,9 @@ spider_net_xmit(struct sk_buff *skb, struct net_device *netdev)
  * packets, including updating the queue tail pointer.
  */
 static void
-spider_net_cleanup_tx_ring(struct spider_net_card *card)
+spider_net_cleanup_tx_ring(struct timer_list *t)
 {
+	struct spider_net_card *card = from_timer(card, t, tx_timer);
 	if ((spider_net_release_tx_chain(card, 0) != 0) &&
 	    (card->netdev->flags & IFF_UP)) {
 		spider_net_kick_tx_dma(card);
@@ -1265,7 +1266,7 @@ static int spider_net_poll(struct napi_struct *napi, int budget)
 	spider_net_refill_rx_chain(card);
 	spider_net_enable_rxdmac(card);
 
-	spider_net_cleanup_tx_ring(card);
+	spider_net_cleanup_tx_ring(&card->tx_timer);
 
 	/* if all packets are in the stack, enable interrupts and return 0 */
 	/* if not, return 1 */
@@ -1977,9 +1978,9 @@ init_firmware_failed:
  * @data: used for pointer to card structure
  *
  */
-static void spider_net_link_phy(unsigned long data)
+static void spider_net_link_phy(struct timer_list *t)
 {
-	struct spider_net_card *card = (struct spider_net_card *)data;
+	struct spider_net_card *card = from_timer(card, t, aneg_timer);
 	struct mii_phy *phy = &card->phy;
 
 	/* if link didn't come up after SPIDER_NET_ANEG_TIMEOUT tries, setup phy again */
@@ -2256,14 +2257,11 @@ spider_net_setup_netdev(struct spider_net_card *card)
 
 	pci_set_drvdata(card->pdev, netdev);
 
-	setup_timer(&card->tx_timer,
-		    (void(*)(unsigned long))spider_net_cleanup_tx_ring,
-		    (unsigned long)card);
+	timer_setup(&card->tx_timer, spider_net_cleanup_tx_ring, 0);
 	netdev->irq = card->pdev->irq;
 
 	card->aneg_count = 0;
-	setup_timer(&card->aneg_timer, spider_net_link_phy,
-		    (unsigned long)card);
+	timer_setup(&card->aneg_timer, spider_net_link_phy, 0);
 
 	netif_napi_add(netdev, &card->napi,
 		       spider_net_poll, SPIDER_NET_NAPI_WEIGHT);
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index eb8a18991d8c..cc63102ca96e 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -106,8 +106,8 @@ static int slip_esc6(unsigned char *p, unsigned char *d, int len);
 static void slip_unesc6(struct slip *sl, unsigned char c);
 #endif
 #ifdef CONFIG_SLIP_SMART
-static void sl_keepalive(unsigned long sls);
-static void sl_outfill(unsigned long sls);
+static void sl_keepalive(struct timer_list *t);
+static void sl_outfill(struct timer_list *t);
 static int sl_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 #endif
 
@@ -763,8 +763,8 @@ static struct slip *sl_alloc(dev_t line)
 	sl->mode        = SL_MODE_DEFAULT;
 #ifdef CONFIG_SLIP_SMART
 	/* initialize timer_list struct */
-	setup_timer(&sl->keepalive_timer, sl_keepalive, (unsigned long)sl);
-	setup_timer(&sl->outfill_timer, sl_outfill, (unsigned long)sl);
+	timer_setup(&sl->keepalive_timer, sl_keepalive, 0);
+	timer_setup(&sl->outfill_timer, sl_outfill, 0);
 #endif
 	slip_devs[i] = dev;
 	return sl;
@@ -1388,9 +1388,9 @@ module_exit(slip_exit);
  * added by Stanislav Voronyi. All changes before marked VSV
  */
 
-static void sl_outfill(unsigned long sls)
+static void sl_outfill(struct timer_list *t)
 {
-	struct slip *sl = (struct slip *)sls;
+	struct slip *sl = from_timer(sl, t, outfill_timer);
 
 	spin_lock(&sl->lock);
 
@@ -1419,9 +1419,9 @@ out:
 	spin_unlock(&sl->lock);
 }
 
-static void sl_keepalive(unsigned long sls)
+static void sl_keepalive(struct timer_list *t)
 {
-	struct slip *sl = (struct slip *)sls;
+	struct slip *sl = from_timer(sl, t, keepalive_timer);
 
 	spin_lock(&sl->lock);
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5a2ea78a008f..c3af08f24679 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -444,9 +444,9 @@ static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
 	spin_unlock_bh(&tun->lock);
 }
 
-static void tun_flow_cleanup(unsigned long data)
+static void tun_flow_cleanup(struct timer_list *t)
 {
-	struct tun_struct *tun = (struct tun_struct *)data;
+	struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
 	unsigned long delay = tun->ageing_time;
 	unsigned long next_timer = jiffies + delay;
 	unsigned long count = 0;
@@ -1196,7 +1196,9 @@ static void tun_flow_init(struct tun_struct *tun)
 		INIT_HLIST_HEAD(&tun->flows[i]);
 
 	tun->ageing_time = TUN_FLOW_EXPIRE;
-	setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
+	timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
+	mod_timer(&tun->flow_gc_timer,
+		  round_jiffies_up(jiffies + tun->ageing_time));
 }
 
 static void tun_flow_uninit(struct tun_struct *tun)
diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c
index c7721c729541..afeca6bcdade 100644
--- a/drivers/net/wan/hdlc_ppp.c
+++ b/drivers/net/wan/hdlc_ppp.c
@@ -558,9 +558,9 @@ out:
 	return NET_RX_DROP;
 }
 
-static void ppp_timer(unsigned long arg)
+static void ppp_timer(struct timer_list *t)
 {
-	struct proto *proto = (struct proto *)arg;
+	struct proto *proto = from_timer(proto, t, timer);
 	struct ppp *ppp = get_ppp(proto->dev);
 	unsigned long flags;
 
@@ -610,7 +610,7 @@ static void ppp_start(struct net_device *dev)
 	for (i = 0; i < IDX_COUNT; i++) {
 		struct proto *proto = &ppp->protos[i];
 		proto->dev = dev;
-		setup_timer(&proto->timer, ppp_timer, (unsigned long)proto);
+		timer_setup(&proto->timer, ppp_timer, 0);
 		proto->state = CLOSED;
 	}
 	ppp->protos[IDX_LCP].pid = PID_LCP;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
index 3559fb5b8fb0..03aae6bc1838 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
@@ -280,9 +280,9 @@ static void brcmf_btcoex_restore_part1(struct brcmf_btcoex_info *btci)
 /**
  * brcmf_btcoex_timerfunc() - BT coex timer callback
  */
-static void brcmf_btcoex_timerfunc(ulong data)
+static void brcmf_btcoex_timerfunc(struct timer_list *t)
 {
-	struct brcmf_btcoex_info *bt_local = (struct brcmf_btcoex_info *)data;
+	struct brcmf_btcoex_info *bt_local = from_timer(bt_local, t, timer);
 	brcmf_dbg(TRACE, "enter\n");
 
 	bt_local->timer_on = false;
@@ -380,7 +380,7 @@ int brcmf_btcoex_attach(struct brcmf_cfg80211_info *cfg)
 	/* Set up timer for BT  */
 	btci->timer_on = false;
 	btci->timeout = BRCMF_BTCOEX_OPPR_WIN_TIME;
-	setup_timer(&btci->timer, brcmf_btcoex_timerfunc, (ulong)btci);
+	timer_setup(&btci->timer, brcmf_btcoex_timerfunc, 0);
 	btci->cfg = cfg;
 	btci->saved_regs_part1 = false;
 	btci->saved_regs_part2 = false;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 6e70df978159..15fa00d79fc6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -2983,10 +2983,10 @@ static void brcmf_cfg80211_escan_timeout_worker(struct work_struct *work)
 	brcmf_notify_escan_complete(cfg, cfg->escan_info.ifp, true, true);
 }
 
-static void brcmf_escan_timeout(unsigned long data)
+static void brcmf_escan_timeout(struct timer_list *t)
 {
 	struct brcmf_cfg80211_info *cfg =
-			(struct brcmf_cfg80211_info *)data;
+			from_timer(cfg, t, escan_timeout);
 
 	if (cfg->int_escan_map || cfg->scan_request) {
 		brcmf_err("timer expired\n");
@@ -3150,8 +3150,7 @@ static void brcmf_init_escan(struct brcmf_cfg80211_info *cfg)
 			    brcmf_cfg80211_escan_handler);
 	cfg->escan_info.escan_state = WL_ESCAN_STATE_IDLE;
 	/* Init scan_timeout timer */
-	setup_timer(&cfg->escan_timeout, brcmf_escan_timeout,
-		    (unsigned long)cfg);
+	timer_setup(&cfg->escan_timeout, brcmf_escan_timeout, 0);
 	INIT_WORK(&cfg->escan_timeout_work,
 		  brcmf_cfg80211_escan_timeout_worker);
 }
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index e3495ea95553..310c4e2746aa 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -3972,9 +3972,9 @@ brcmf_sdio_watchdog_thread(void *data)
 }
 
 static void
-brcmf_sdio_watchdog(unsigned long data)
+brcmf_sdio_watchdog(struct timer_list *t)
 {
-	struct brcmf_sdio *bus = (struct brcmf_sdio *)data;
+	struct brcmf_sdio *bus = from_timer(bus, t, timer);
 
 	if (bus->watchdog_tsk) {
 		complete(&bus->watchdog_wait);
@@ -4169,8 +4169,7 @@ struct brcmf_sdio *brcmf_sdio_probe(struct brcmf_sdio_dev *sdiodev)
 	init_waitqueue_head(&bus->dcmd_resp_wait);
 
 	/* Set up the watchdog timer */
-	setup_timer(&bus->timer, brcmf_sdio_watchdog,
-		    (unsigned long)bus);
+	timer_setup(&bus->timer, brcmf_sdio_watchdog, 0);
 	/* Initialize watchdog thread */
 	init_completion(&bus->watchdog_wait);
 	bus->watchdog_tsk = kthread_run(brcmf_sdio_watchdog_thread,
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/main.c b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
index 2acd94da9efe..d11d72615de2 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/main.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
@@ -399,9 +399,9 @@ int iwl_send_statistics_request(struct iwl_priv *priv, u8 flags, bool clear)
  * was received.  We need to ensure we receive the statistics in order
  * to update the temperature used for calibrating the TXPOWER.
  */
-static void iwl_bg_statistics_periodic(unsigned long data)
+static void iwl_bg_statistics_periodic(struct timer_list *t)
 {
-	struct iwl_priv *priv = (struct iwl_priv *)data;
+	struct iwl_priv *priv = from_timer(priv, t, statistics_periodic);
 
 	if (test_bit(STATUS_EXIT_PENDING, &priv->status))
 		return;
@@ -556,9 +556,9 @@ static void iwl_continuous_event_trace(struct iwl_priv *priv)
  * this function is to perform continuous uCode event logging operation
  * if enabled
  */
-static void iwl_bg_ucode_trace(unsigned long data)
+static void iwl_bg_ucode_trace(struct timer_list *t)
 {
-	struct iwl_priv *priv = (struct iwl_priv *)data;
+	struct iwl_priv *priv = from_timer(priv, t, ucode_trace);
 
 	if (test_bit(STATUS_EXIT_PENDING, &priv->status))
 		return;
@@ -1085,11 +1085,9 @@ static void iwl_setup_deferred_work(struct iwl_priv *priv)
 	if (priv->lib->bt_params)
 		iwlagn_bt_setup_deferred_work(priv);
 
-	setup_timer(&priv->statistics_periodic, iwl_bg_statistics_periodic,
-		    (unsigned long)priv);
+	timer_setup(&priv->statistics_periodic, iwl_bg_statistics_periodic, 0);
 
-	setup_timer(&priv->ucode_trace, iwl_bg_ucode_trace,
-		    (unsigned long)priv);
+	timer_setup(&priv->ucode_trace, iwl_bg_ucode_trace, 0);
 }
 
 void iwl_cancel_deferred_work(struct iwl_priv *priv)
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index b5c459cd70ce..fed6d842a5e1 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -147,9 +147,9 @@ void iwl_pcie_free_dma_ptr(struct iwl_trans *trans, struct iwl_dma_ptr *ptr)
 	memset(ptr, 0, sizeof(*ptr));
 }
 
-static void iwl_pcie_txq_stuck_timer(unsigned long data)
+static void iwl_pcie_txq_stuck_timer(struct timer_list *t)
 {
-	struct iwl_txq *txq = (void *)data;
+	struct iwl_txq *txq = from_timer(txq, t, stuck_timer);
 	struct iwl_trans_pcie *trans_pcie = txq->trans_pcie;
 	struct iwl_trans *trans = iwl_trans_pcie_get_trans(trans_pcie);
 
@@ -495,8 +495,7 @@ int iwl_pcie_txq_alloc(struct iwl_trans *trans, struct iwl_txq *txq,
 	if (WARN_ON(txq->entries || txq->tfds))
 		return -EINVAL;
 
-	setup_timer(&txq->stuck_timer, iwl_pcie_txq_stuck_timer,
-		    (unsigned long)txq);
+	timer_setup(&txq->stuck_timer, iwl_pcie_txq_stuck_timer, 0);
 	txq->trans_pcie = trans_pcie;
 
 	txq->n_window = slots_num;
diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c
index f9d047314692..b4dfe1893d18 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ap.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c
@@ -185,9 +185,9 @@ static void hostap_event_expired_sta(struct net_device *dev,
 
 #ifndef PRISM2_NO_KERNEL_IEEE80211_MGMT
 
-static void ap_handle_timer(unsigned long data)
+static void ap_handle_timer(struct timer_list *t)
 {
-	struct sta_info *sta = (struct sta_info *) data;
+	struct sta_info *sta = from_timer(sta, t, timer);
 	local_info_t *local;
 	struct ap_data *ap;
 	unsigned long next_time = 0;
@@ -1189,7 +1189,7 @@ static struct sta_info * ap_add_sta(struct ap_data *ap, u8 *addr)
 	}
 
 #ifndef PRISM2_NO_KERNEL_IEEE80211_MGMT
-	setup_timer(&sta->timer, ap_handle_timer, (unsigned long)sta);
+	timer_setup(&sta->timer, ap_handle_timer, 0);
 	sta->timer.expires = jiffies + ap->max_inactivity;
 	if (!ap->local->hostapd)
 		add_timer(&sta->timer);
diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c
index 8177fd6f65c1..5c4a17a18968 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_hw.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c
@@ -2794,9 +2794,9 @@ static void prism2_check_sta_fw_version(local_info_t *local)
 }
 
 
-static void hostap_passive_scan(unsigned long data)
+static void hostap_passive_scan(struct timer_list *t)
 {
-	local_info_t *local = (local_info_t *) data;
+	local_info_t *local = from_timer(local, t, passive_scan_timer);
 	struct net_device *dev = local->dev;
 	u16 chan;
 
@@ -2869,10 +2869,10 @@ static void handle_comms_qual_update(struct work_struct *work)
  * used to monitor that local->last_tick_timer is being updated. If not,
  * interrupt busy-loop is assumed and driver tries to recover by masking out
  * some events. */
-static void hostap_tick_timer(unsigned long data)
+static void hostap_tick_timer(struct timer_list *t)
 {
 	static unsigned long last_inquire = 0;
-	local_info_t *local = (local_info_t *) data;
+	local_info_t *local = from_timer(local, t, tick_timer);
 	local->last_tick_timer = jiffies;
 
 	/* Inquire CommTallies every 10 seconds to keep the statistics updated
@@ -3225,10 +3225,8 @@ while (0)
 
 	lib80211_crypt_info_init(&local->crypt_info, dev->name, &local->lock);
 
-	setup_timer(&local->passive_scan_timer, hostap_passive_scan,
-		    (unsigned long)local);
-	setup_timer(&local->tick_timer, hostap_tick_timer,
-		    (unsigned long)local);
+	timer_setup(&local->passive_scan_timer, hostap_passive_scan, 0);
+	timer_setup(&local->tick_timer, hostap_tick_timer, 0);
 	local->tick_timer.expires = jiffies + 2 * HZ;
 	add_timer(&local->tick_timer);
 
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
index 501180584b4b..94ad6fe29e69 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
@@ -319,9 +319,9 @@ static inline void ezusb_mod_timer(struct ezusb_priv *upriv,
 	mod_timer(timer, expire);
 }
 
-static void ezusb_request_timerfn(u_long _ctx)
+static void ezusb_request_timerfn(struct timer_list *t)
 {
-	struct request_context *ctx = (void *) _ctx;
+	struct request_context *ctx = from_timer(ctx, t, timer);
 
 	ctx->outurb->transfer_flags |= URB_ASYNC_UNLINK;
 	if (usb_unlink_urb(ctx->outurb) == -EINPROGRESS) {
@@ -365,7 +365,7 @@ static struct request_context *ezusb_alloc_ctx(struct ezusb_priv *upriv,
 	refcount_set(&ctx->refcount, 1);
 	init_completion(&ctx->done);
 
-	setup_timer(&ctx->timer, ezusb_request_timerfn, (u_long)ctx);
+	timer_setup(&ctx->timer, ezusb_request_timerfn, 0);
 	return ctx;
 }
 
diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c
index 2d2c1ea65cb2..3423dc51198b 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/core.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/core.c
@@ -288,7 +288,7 @@ static struct qtnf_wmac *qtnf_core_mac_alloc(struct qtnf_bus *bus,
 		mac->iflist[i].vifid = i;
 		qtnf_sta_list_init(&mac->iflist[i].sta_list);
 		mutex_init(&mac->mac_lock);
-		setup_timer(&mac->scan_timeout, NULL, 0);
+		timer_setup(&mac->scan_timeout, NULL, 0);
 	}
 
 	qtnf_mac_init_primary_intf(mac);
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index c346c021b999..d47921a84509 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -196,9 +196,9 @@ out:
 	mutex_unlock(&wl->mutex);
 }
 
-static void wl1271_rx_streaming_timer(unsigned long data)
+static void wl1271_rx_streaming_timer(struct timer_list *t)
 {
-	struct wl12xx_vif *wlvif = (struct wl12xx_vif *)data;
+	struct wl12xx_vif *wlvif = from_timer(wlvif, t, rx_streaming_timer);
 	struct wl1271 *wl = wlvif->wl;
 	ieee80211_queue_work(wl->hw, &wlvif->rx_streaming_disable_work);
 }
@@ -2279,8 +2279,7 @@ static int wl12xx_init_vif_data(struct wl1271 *wl, struct ieee80211_vif *vif)
 			  wlcore_pending_auth_complete_work);
 	INIT_LIST_HEAD(&wlvif->list);
 
-	setup_timer(&wlvif->rx_streaming_timer, wl1271_rx_streaming_timer,
-		    (unsigned long) wlvif);
+	timer_setup(&wlvif->rx_streaming_timer, wl1271_rx_streaming_timer, 0);
 	return 0;
 }
 
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 8b8689c6d887..18c85e55e76a 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -228,9 +228,9 @@ static bool xennet_can_sg(struct net_device *dev)
 }
 
 
-static void rx_refill_timeout(unsigned long data)
+static void rx_refill_timeout(struct timer_list *t)
 {
-	struct netfront_queue *queue = (struct netfront_queue *)data;
+	struct netfront_queue *queue = from_timer(queue, t, rx_refill_timer);
 	napi_schedule(&queue->napi);
 }
 
@@ -1605,8 +1605,7 @@ static int xennet_init_queue(struct netfront_queue *queue)
 	spin_lock_init(&queue->tx_lock);
 	spin_lock_init(&queue->rx_lock);
 
-	setup_timer(&queue->rx_refill_timer, rx_refill_timeout,
-		    (unsigned long)queue);
+	timer_setup(&queue->rx_refill_timer, rx_refill_timeout, 0);
 
 	snprintf(queue->name, sizeof(queue->name), "%s-q%u",
 		 queue->info->netdev->name, queue->id);
diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c
index 2effa5ff7082..a0cc1cc45292 100644
--- a/drivers/nfc/pn533/pn533.c
+++ b/drivers/nfc/pn533/pn533.c
@@ -1232,9 +1232,9 @@ static int pn533_init_target_complete(struct pn533 *dev, struct sk_buff *resp)
 	return 0;
 }
 
-static void pn533_listen_mode_timer(unsigned long data)
+static void pn533_listen_mode_timer(struct timer_list *t)
 {
-	struct pn533 *dev = (struct pn533 *)data;
+	struct pn533 *dev = from_timer(dev, t, listen_timer);
 
 	dev_dbg(dev->dev, "Listen mode timeout\n");
 
@@ -2632,8 +2632,7 @@ struct pn533 *pn533_register_device(u32 device_type,
 	if (priv->wq == NULL)
 		goto error;
 
-	setup_timer(&priv->listen_timer, pn533_listen_mode_timer,
-		    (unsigned long)priv);
+	timer_setup(&priv->listen_timer, pn533_listen_mode_timer, 0);
 
 	skb_queue_head_init(&priv->resp_q);
 	skb_queue_head_init(&priv->fragment_skb);
diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c
index 93a7536a9af9..f26d938d240f 100644
--- a/drivers/nfc/st-nci/ndlc.c
+++ b/drivers/nfc/st-nci/ndlc.c
@@ -246,18 +246,18 @@ void ndlc_recv(struct llt_ndlc *ndlc, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ndlc_recv);
 
-static void ndlc_t1_timeout(unsigned long data)
+static void ndlc_t1_timeout(struct timer_list *t)
 {
-	struct llt_ndlc *ndlc = (struct llt_ndlc *)data;
+	struct llt_ndlc *ndlc = from_timer(ndlc, t, t1_timer);
 
 	pr_debug("\n");
 
 	schedule_work(&ndlc->sm_work);
 }
 
-static void ndlc_t2_timeout(unsigned long data)
+static void ndlc_t2_timeout(struct timer_list *t)
 {
-	struct llt_ndlc *ndlc = (struct llt_ndlc *)data;
+	struct llt_ndlc *ndlc = from_timer(ndlc, t, t2_timer);
 
 	pr_debug("\n");
 
@@ -282,8 +282,8 @@ int ndlc_probe(void *phy_id, struct nfc_phy_ops *phy_ops, struct device *dev,
 	*ndlc_id = ndlc;
 
 	/* initialize timers */
-	setup_timer(&ndlc->t1_timer, ndlc_t1_timeout, (unsigned long)ndlc);
-	setup_timer(&ndlc->t2_timer, ndlc_t2_timeout, (unsigned long)ndlc);
+	timer_setup(&ndlc->t1_timer, ndlc_t1_timeout, 0);
+	timer_setup(&ndlc->t2_timer, ndlc_t2_timeout, 0);
 
 	skb_queue_head_init(&ndlc->rcv_q);
 	skb_queue_head_init(&ndlc->send_q);
diff --git a/drivers/ntb/test/ntb_pingpong.c b/drivers/ntb/test/ntb_pingpong.c
index 938a18bcfc3f..3f5a92bae6f8 100644
--- a/drivers/ntb/test/ntb_pingpong.c
+++ b/drivers/ntb/test/ntb_pingpong.c
@@ -107,9 +107,9 @@ struct pp_ctx {
 
 static struct dentry *pp_debugfs_dir;
 
-static void pp_ping(unsigned long ctx)
+static void pp_ping(struct timer_list *t)
 {
-	struct pp_ctx *pp = (void *)ctx;
+	struct pp_ctx *pp = from_timer(pp, t, db_timer);
 	unsigned long irqflags;
 	u64 db_bits, db_mask;
 	u32 spad_rd, spad_wr;
@@ -153,7 +153,7 @@ static void pp_link_event(void *ctx)
 
 	if (ntb_link_is_up(pp->ntb, NULL, NULL) == 1) {
 		dev_dbg(&pp->ntb->dev, "link is up\n");
-		pp_ping((unsigned long)pp);
+		pp_ping(&pp->db_timer);
 	} else {
 		dev_dbg(&pp->ntb->dev, "link is down\n");
 		del_timer(&pp->db_timer);
@@ -252,7 +252,7 @@ static int pp_probe(struct ntb_client *client,
 	pp->db_bits = 0;
 	atomic_set(&pp->count, 0);
 	spin_lock_init(&pp->db_lock);
-	setup_timer(&pp->db_timer, pp_ping, (unsigned long)pp);
+	timer_setup(&pp->db_timer, pp_ping, 0);
 	pp->db_delay = msecs_to_jiffies(delay_ms);
 
 	rc = ntb_set_ctx(ntb, pp, &pp_ops);
diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c
index 62aa2c37b8d2..935121814c97 100644
--- a/drivers/platform/x86/sony-laptop.c
+++ b/drivers/platform/x86/sony-laptop.c
@@ -363,7 +363,7 @@ static int sony_laptop_input_keycode_map[] = {
 };
 
 /* release buttons after a short delay if pressed */
-static void do_sony_laptop_release_key(unsigned long unused)
+static void do_sony_laptop_release_key(struct timer_list *unused)
 {
 	struct sony_laptop_keypress kp;
 	unsigned long flags;
@@ -470,7 +470,7 @@ static int sony_laptop_setup_input(struct acpi_device *acpi_device)
 		goto err_dec_users;
 	}
 
-	setup_timer(&sony_laptop_input.release_key_timer,
+	timer_setup(&sony_laptop_input.release_key_timer,
 		    do_sony_laptop_release_key, 0);
 
 	/* input keys */
diff --git a/drivers/pps/clients/pps-ktimer.c b/drivers/pps/clients/pps-ktimer.c
index 436b4e4e71a1..04735649052a 100644
--- a/drivers/pps/clients/pps-ktimer.c
+++ b/drivers/pps/clients/pps-ktimer.c
@@ -39,7 +39,7 @@ static struct timer_list ktimer;
  * The kernel timer
  */
 
-static void pps_ktimer_event(unsigned long ptr)
+static void pps_ktimer_event(struct timer_list *unused)
 {
 	struct pps_event_time ts;
 
@@ -85,7 +85,7 @@ static int __init pps_ktimer_init(void)
 		return -ENOMEM;
 	}
 
-	setup_timer(&ktimer, pps_ktimer_event, 0);
+	timer_setup(&ktimer, pps_ktimer_event, 0);
 	mod_timer(&ktimer, jiffies + HZ);
 
 	dev_info(pps->dev, "ktimer PPS source registered\n");
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index 00efe24a6063..215eac68ae2d 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -71,9 +71,9 @@ static void rtc_uie_task(struct work_struct *work)
 	if (num)
 		rtc_handle_legacy_irq(rtc, num, RTC_UF);
 }
-static void rtc_uie_timer(unsigned long data)
+static void rtc_uie_timer(struct timer_list *t)
 {
-	struct rtc_device *rtc = (struct rtc_device *)data;
+	struct rtc_device *rtc = from_timer(rtc, t, uie_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&rtc->irq_lock, flags);
@@ -460,7 +460,7 @@ void rtc_dev_prepare(struct rtc_device *rtc)
 
 #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
 	INIT_WORK(&rtc->uie_task, rtc_uie_task);
-	setup_timer(&rtc->uie_timer, rtc_uie_timer, (unsigned long)rtc);
+	timer_setup(&rtc->uie_timer, rtc_uie_timer, 0);
 #endif
 
 	cdev_init(&rtc->char_dev, &rtc_dev_fops);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index adba91318768..0f1ff0813493 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -70,8 +70,8 @@ static void do_restore_device(struct work_struct *);
 static void do_reload_device(struct work_struct *);
 static void do_requeue_requests(struct work_struct *);
 static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *);
-static void dasd_device_timeout(unsigned long);
-static void dasd_block_timeout(unsigned long);
+static void dasd_device_timeout(struct timer_list *);
+static void dasd_block_timeout(struct timer_list *);
 static void __dasd_process_erp(struct dasd_device *, struct dasd_ccw_req *);
 static void dasd_profile_init(struct dasd_profile *, struct dentry *);
 static void dasd_profile_exit(struct dasd_profile *);
@@ -119,8 +119,7 @@ struct dasd_device *dasd_alloc_device(void)
 		     (void (*)(unsigned long)) dasd_device_tasklet,
 		     (unsigned long) device);
 	INIT_LIST_HEAD(&device->ccw_queue);
-	setup_timer(&device->timer, dasd_device_timeout,
-		    (unsigned long)device);
+	timer_setup(&device->timer, dasd_device_timeout, 0);
 	INIT_WORK(&device->kick_work, do_kick_device);
 	INIT_WORK(&device->restore_device, do_restore_device);
 	INIT_WORK(&device->reload_device, do_reload_device);
@@ -162,7 +161,7 @@ struct dasd_block *dasd_alloc_block(void)
 		     (unsigned long) block);
 	INIT_LIST_HEAD(&block->ccw_queue);
 	spin_lock_init(&block->queue_lock);
-	setup_timer(&block->timer, dasd_block_timeout, (unsigned long)block);
+	timer_setup(&block->timer, dasd_block_timeout, 0);
 	spin_lock_init(&block->profile.lock);
 
 	return block;
@@ -1557,12 +1556,12 @@ EXPORT_SYMBOL(dasd_start_IO);
  * The head of the ccw queue will have status DASD_CQR_IN_IO for 1),
  * DASD_CQR_QUEUED for 2) and 3).
  */
-static void dasd_device_timeout(unsigned long ptr)
+static void dasd_device_timeout(struct timer_list *t)
 {
 	unsigned long flags;
 	struct dasd_device *device;
 
-	device = (struct dasd_device *) ptr;
+	device = from_timer(device, t, timer);
 	spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
 	/* re-activate request queue */
 	dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING);
@@ -2625,12 +2624,12 @@ EXPORT_SYMBOL(dasd_cancel_req);
  * is waiting for something that may not come reliably, (e.g. a state
  * change interrupt)
  */
-static void dasd_block_timeout(unsigned long ptr)
+static void dasd_block_timeout(struct timer_list *t)
 {
 	unsigned long flags;
 	struct dasd_block *block;
 
-	block = (struct dasd_block *) ptr;
+	block = from_timer(block, t, timer);
 	spin_lock_irqsave(get_ccwdev_lock(block->base->cdev), flags);
 	/* re-activate request queue */
 	dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING);
diff --git a/drivers/s390/net/fsm.c b/drivers/s390/net/fsm.c
index 16b81be1f07a..c81adf8042d7 100644
--- a/drivers/s390/net/fsm.c
+++ b/drivers/s390/net/fsm.c
@@ -129,8 +129,9 @@ fsm_getstate_str(fsm_instance *fi)
 }
 
 static void
-fsm_expire_timer(fsm_timer *this)
+fsm_expire_timer(struct timer_list *t)
 {
+	fsm_timer *this = from_timer(this, t, tl);
 #if FSM_TIMER_DEBUG
 	printk(KERN_DEBUG "fsm(%s): Timer %p expired\n",
 	       this->fi->name, this);
@@ -146,7 +147,7 @@ fsm_settimer(fsm_instance *fi, fsm_timer *this)
 	printk(KERN_DEBUG "fsm(%s): Create timer %p\n", fi->name,
 	       this);
 #endif
-	setup_timer(&this->tl, (void *)fsm_expire_timer, (long)this);
+	timer_setup(&this->tl, fsm_expire_timer, 0);
 }
 
 void
@@ -168,7 +169,7 @@ fsm_addtimer(fsm_timer *this, int millisec, int event, void *arg)
 	       this->fi->name, this, millisec);
 #endif
 
-	setup_timer(&this->tl, (void *)fsm_expire_timer, (long)this);
+	timer_setup(&this->tl, fsm_expire_timer, 0);
 	this->expire_event = event;
 	this->event_arg = arg;
 	this->tl.expires = jiffies + (millisec * HZ) / 1000;
@@ -187,7 +188,7 @@ fsm_modtimer(fsm_timer *this, int millisec, int event, void *arg)
 #endif
 
 	del_timer(&this->tl);
-	setup_timer(&this->tl, (void *)fsm_expire_timer, (long)this);
+	timer_setup(&this->tl, fsm_expire_timer, 0);
 	this->expire_event = event;
 	this->event_arg = arg;
 	this->tl.expires = jiffies + (millisec * HZ) / 1000;
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index a54b6c11b505..21f6421536a0 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -101,7 +101,7 @@ static void arcmsr_enable_outbound_ints(struct AdapterControlBlock *acb,
 static void arcmsr_stop_adapter_bgrb(struct AdapterControlBlock *acb);
 static void arcmsr_hbaA_flush_cache(struct AdapterControlBlock *acb);
 static void arcmsr_hbaB_flush_cache(struct AdapterControlBlock *acb);
-static void arcmsr_request_device_map(unsigned long pacb);
+static void arcmsr_request_device_map(struct timer_list *t);
 static void arcmsr_hbaA_request_device_map(struct AdapterControlBlock *acb);
 static void arcmsr_hbaB_request_device_map(struct AdapterControlBlock *acb);
 static void arcmsr_hbaC_request_device_map(struct AdapterControlBlock *acb);
@@ -837,8 +837,7 @@ static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	atomic_set(&acb->rq_map_token, 16);
 	atomic_set(&acb->ante_token_value, 16);
 	acb->fw_flag = FW_NORMAL;
-	setup_timer(&acb->eternal_timer, &arcmsr_request_device_map,
-		    (unsigned long)acb);
+	timer_setup(&acb->eternal_timer, arcmsr_request_device_map, 0);
 	acb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ);
 	add_timer(&acb->eternal_timer);
 	if(arcmsr_alloc_sysfs_attr(acb))
@@ -929,8 +928,7 @@ static int arcmsr_resume(struct pci_dev *pdev)
 	atomic_set(&acb->rq_map_token, 16);
 	atomic_set(&acb->ante_token_value, 16);
 	acb->fw_flag = FW_NORMAL;
-	setup_timer(&acb->eternal_timer, &arcmsr_request_device_map,
-		    (unsigned long)acb);
+	timer_setup(&acb->eternal_timer, arcmsr_request_device_map, 0);
 	acb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ);
 	add_timer(&acb->eternal_timer);
 	return 0;
@@ -3457,9 +3455,9 @@ static void arcmsr_hbaD_request_device_map(struct AdapterControlBlock *acb)
 	}
 }
 
-static void arcmsr_request_device_map(unsigned long pacb)
+static void arcmsr_request_device_map(struct timer_list *t)
 {
-	struct AdapterControlBlock *acb = (struct AdapterControlBlock *)pacb;
+	struct AdapterControlBlock *acb = from_timer(acb, t, eternal_timer);
 	switch (acb->adapter_type) {
 		case ACB_ADAPTER_TYPE_A: {
 			arcmsr_hbaA_request_device_map(acb);
diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c
index 7304d5a4fc4f..f4775ca70bab 100644
--- a/drivers/scsi/arm/fas216.c
+++ b/drivers/scsi/arm/fas216.c
@@ -2318,9 +2318,9 @@ DEF_SCSI_QCMD(fas216_noqueue_command)
  * Error handler timeout function.  Indicate that we timed out,
  * and wake up any error handler process so it can continue.
  */
-static void fas216_eh_timer(unsigned long data)
+static void fas216_eh_timer(struct timer_list *t)
 {
-	FAS216_Info *info = (FAS216_Info *)data;
+	FAS216_Info *info = from_timer(info, t, eh_timer);
 
 	fas216_log(info, LOG_ERROR, "error handling timed out\n");
 
@@ -2849,7 +2849,7 @@ int fas216_init(struct Scsi_Host *host)
 	info->rst_dev_status = -1;
 	info->rst_bus_status = -1;
 	init_waitqueue_head(&info->eh_wait);
-	setup_timer(&info->eh_timer, fas216_eh_timer, (unsigned long)info);
+	timer_setup(&info->eh_timer, fas216_eh_timer, 0);
 	
 	spin_lock_init(&info->host_lock);
 
diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c
index d10826a69725..cf0466686804 100644
--- a/drivers/scsi/bfa/bfad.c
+++ b/drivers/scsi/bfa/bfad.c
@@ -692,9 +692,9 @@ ext:
 }
 
 void
-bfad_bfa_tmo(unsigned long data)
+bfad_bfa_tmo(struct timer_list *t)
 {
-	struct bfad_s	      *bfad = (struct bfad_s *) data;
+	struct bfad_s	      *bfad = from_timer(bfad, t, hal_tmo);
 	unsigned long	flags;
 	struct list_head	       doneq;
 
@@ -719,7 +719,7 @@ bfad_bfa_tmo(unsigned long data)
 void
 bfad_init_timer(struct bfad_s *bfad)
 {
-	setup_timer(&bfad->hal_tmo, bfad_bfa_tmo, (unsigned long)bfad);
+	timer_setup(&bfad->hal_tmo, bfad_bfa_tmo, 0);
 
 	mod_timer(&bfad->hal_tmo,
 		  jiffies + msecs_to_jiffies(BFA_TIMER_FREQ));
diff --git a/drivers/scsi/bfa/bfad_drv.h b/drivers/scsi/bfa/bfad_drv.h
index cfcfff48e8e1..4fe980a6441f 100644
--- a/drivers/scsi/bfa/bfad_drv.h
+++ b/drivers/scsi/bfa/bfad_drv.h
@@ -314,7 +314,7 @@ int		bfad_setup_intr(struct bfad_s *bfad);
 void		bfad_remove_intr(struct bfad_s *bfad);
 void		bfad_update_hal_cfg(struct bfa_iocfc_cfg_s *bfa_cfg);
 bfa_status_t	bfad_hal_mem_alloc(struct bfad_s *bfad);
-void		bfad_bfa_tmo(unsigned long data);
+void		bfad_bfa_tmo(struct timer_list *t);
 void		bfad_init_timer(struct bfad_s *bfad);
 int		bfad_pci_init(struct pci_dev *pdev, struct bfad_s *bfad);
 void		bfad_pci_uninit(struct pci_dev *pdev, struct bfad_s *bfad);
diff --git a/drivers/scsi/bnx2fc/bnx2fc_tgt.c b/drivers/scsi/bnx2fc/bnx2fc_tgt.c
index 59a2dfbcbc69..a8ae1a019eea 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_tgt.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_tgt.c
@@ -14,8 +14,8 @@
  */
 
 #include "bnx2fc.h"
-static void bnx2fc_upld_timer(unsigned long data);
-static void bnx2fc_ofld_timer(unsigned long data);
+static void bnx2fc_upld_timer(struct timer_list *t);
+static void bnx2fc_ofld_timer(struct timer_list *t);
 static int bnx2fc_init_tgt(struct bnx2fc_rport *tgt,
 			   struct fcoe_port *port,
 			   struct fc_rport_priv *rdata);
@@ -27,10 +27,10 @@ static void bnx2fc_free_session_resc(struct bnx2fc_hba *hba,
 			      struct bnx2fc_rport *tgt);
 static void bnx2fc_free_conn_id(struct bnx2fc_hba *hba, u32 conn_id);
 
-static void bnx2fc_upld_timer(unsigned long data)
+static void bnx2fc_upld_timer(struct timer_list *t)
 {
 
-	struct bnx2fc_rport *tgt = (struct bnx2fc_rport *)data;
+	struct bnx2fc_rport *tgt = from_timer(tgt, t, upld_timer);
 
 	BNX2FC_TGT_DBG(tgt, "upld_timer - Upload compl not received!!\n");
 	/* fake upload completion */
@@ -40,10 +40,10 @@ static void bnx2fc_upld_timer(unsigned long data)
 	wake_up_interruptible(&tgt->upld_wait);
 }
 
-static void bnx2fc_ofld_timer(unsigned long data)
+static void bnx2fc_ofld_timer(struct timer_list *t)
 {
 
-	struct bnx2fc_rport *tgt = (struct bnx2fc_rport *)data;
+	struct bnx2fc_rport *tgt = from_timer(tgt, t, ofld_timer);
 
 	BNX2FC_TGT_DBG(tgt, "entered bnx2fc_ofld_timer\n");
 	/* NOTE: This function should never be called, as
@@ -65,7 +65,7 @@ static void bnx2fc_ofld_timer(unsigned long data)
 
 static void bnx2fc_ofld_wait(struct bnx2fc_rport *tgt)
 {
-	setup_timer(&tgt->ofld_timer, bnx2fc_ofld_timer, (unsigned long)tgt);
+	timer_setup(&tgt->ofld_timer, bnx2fc_ofld_timer, 0);
 	mod_timer(&tgt->ofld_timer, jiffies + BNX2FC_FW_TIMEOUT);
 
 	wait_event_interruptible(tgt->ofld_wait,
@@ -277,7 +277,7 @@ void bnx2fc_flush_active_ios(struct bnx2fc_rport *tgt)
 
 static void bnx2fc_upld_wait(struct bnx2fc_rport *tgt)
 {
-	setup_timer(&tgt->upld_timer, bnx2fc_upld_timer, (unsigned long)tgt);
+	timer_setup(&tgt->upld_timer, bnx2fc_upld_timer, 0);
 	mod_timer(&tgt->upld_timer, jiffies + BNX2FC_FW_TIMEOUT);
 	wait_event_interruptible(tgt->upld_wait,
 				 (test_bit(
diff --git a/drivers/scsi/esas2r/esas2r_main.c b/drivers/scsi/esas2r/esas2r_main.c
index af4af504a97f..4eb14301a497 100644
--- a/drivers/scsi/esas2r/esas2r_main.c
+++ b/drivers/scsi/esas2r/esas2r_main.c
@@ -1631,11 +1631,11 @@ void esas2r_adapter_tasklet(unsigned long context)
 	}
 }
 
-static void esas2r_timer_callback(unsigned long context);
+static void esas2r_timer_callback(struct timer_list *t);
 
 void esas2r_kickoff_timer(struct esas2r_adapter *a)
 {
-	setup_timer(&a->timer, esas2r_timer_callback, (unsigned long)a);
+	timer_setup(&a->timer, esas2r_timer_callback, 0);
 
 	a->timer.expires = jiffies +
 			   msecs_to_jiffies(100);
@@ -1643,9 +1643,9 @@ void esas2r_kickoff_timer(struct esas2r_adapter *a)
 	add_timer(&a->timer);
 }
 
-static void esas2r_timer_callback(unsigned long context)
+static void esas2r_timer_callback(struct timer_list *t)
 {
-	struct esas2r_adapter *a = (struct esas2r_adapter *)context;
+	struct esas2r_adapter *a = from_timer(a, t, timer);
 
 	set_bit(AF2_TIMER_TICK, &a->flags2);
 
diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
index fff6f1851dc1..097f37de6ce9 100644
--- a/drivers/scsi/fcoe/fcoe_ctlr.c
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
@@ -49,7 +49,7 @@
 #define	FCOE_CTLR_MIN_FKA	500		/* min keep alive (mS) */
 #define	FCOE_CTLR_DEF_FKA	FIP_DEF_FKA	/* default keep alive (mS) */
 
-static void fcoe_ctlr_timeout(unsigned long);
+static void fcoe_ctlr_timeout(struct timer_list *);
 static void fcoe_ctlr_timer_work(struct work_struct *);
 static void fcoe_ctlr_recv_work(struct work_struct *);
 static int fcoe_ctlr_flogi_retry(struct fcoe_ctlr *);
@@ -156,7 +156,7 @@ void fcoe_ctlr_init(struct fcoe_ctlr *fip, enum fip_state mode)
 	mutex_init(&fip->ctlr_mutex);
 	spin_lock_init(&fip->ctlr_lock);
 	fip->flogi_oxid = FC_XID_UNKNOWN;
-	setup_timer(&fip->timer, fcoe_ctlr_timeout, (unsigned long)fip);
+	timer_setup(&fip->timer, fcoe_ctlr_timeout, 0);
 	INIT_WORK(&fip->timer_work, fcoe_ctlr_timer_work);
 	INIT_WORK(&fip->recv_work, fcoe_ctlr_recv_work);
 	skb_queue_head_init(&fip->fip_recv_list);
@@ -1786,9 +1786,9 @@ unlock:
  * fcoe_ctlr_timeout() - FIP timeout handler
  * @arg: The FCoE controller that timed out
  */
-static void fcoe_ctlr_timeout(unsigned long arg)
+static void fcoe_ctlr_timeout(struct timer_list *t)
 {
-	struct fcoe_ctlr *fip = (struct fcoe_ctlr *)arg;
+	struct fcoe_ctlr *fip = from_timer(fip, t, timer);
 
 	schedule_work(&fip->timer_work);
 }
diff --git a/drivers/scsi/fnic/fnic_main.c b/drivers/scsi/fnic/fnic_main.c
index aacadbf20b69..e52599f44170 100644
--- a/drivers/scsi/fnic/fnic_main.c
+++ b/drivers/scsi/fnic/fnic_main.c
@@ -407,18 +407,18 @@ static int fnic_notify_set(struct fnic *fnic)
 	return err;
 }
 
-static void fnic_notify_timer(unsigned long data)
+static void fnic_notify_timer(struct timer_list *t)
 {
-	struct fnic *fnic = (struct fnic *)data;
+	struct fnic *fnic = from_timer(fnic, t, notify_timer);
 
 	fnic_handle_link_event(fnic);
 	mod_timer(&fnic->notify_timer,
 		  round_jiffies(jiffies + FNIC_NOTIFY_TIMER_PERIOD));
 }
 
-static void fnic_fip_notify_timer(unsigned long data)
+static void fnic_fip_notify_timer(struct timer_list *t)
 {
-	struct fnic *fnic = (struct fnic *)data;
+	struct fnic *fnic = from_timer(fnic, t, fip_timer);
 
 	fnic_handle_fip_timer(fnic);
 }
@@ -777,8 +777,7 @@ static int fnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		vnic_dev_add_addr(fnic->vdev, fnic->ctlr.ctl_src_addr);
 		fnic->set_vlan = fnic_set_vlan;
 		fcoe_ctlr_init(&fnic->ctlr, FIP_MODE_AUTO);
-		setup_timer(&fnic->fip_timer, fnic_fip_notify_timer,
-							(unsigned long)fnic);
+		timer_setup(&fnic->fip_timer, fnic_fip_notify_timer, 0);
 		spin_lock_init(&fnic->vlans_lock);
 		INIT_WORK(&fnic->fip_frame_work, fnic_handle_fip_frame);
 		INIT_WORK(&fnic->event_work, fnic_handle_event);
@@ -809,8 +808,7 @@ static int fnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	/* Setup notify timer when using MSI interrupts */
 	if (vnic_dev_get_intr_mode(fnic->vdev) == VNIC_DEV_INTR_MODE_MSI)
-		setup_timer(&fnic->notify_timer,
-			    fnic_notify_timer, (unsigned long)fnic);
+		timer_setup(&fnic->notify_timer, fnic_notify_timer, 0);
 
 	/* allocate RQ buffers and post them to RQ*/
 	for (i = 0; i < fnic->rq_count; i++) {
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index 017216f5e919..dc4e801b2cef 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -8093,9 +8093,9 @@ irqreturn_t ncr53c8xx_intr(int irq, void *dev_id)
      return IRQ_HANDLED;
 }
 
-static void ncr53c8xx_timeout(unsigned long npref)
+static void ncr53c8xx_timeout(struct timer_list *t)
 {
-	struct ncb *np = (struct ncb *) npref;
+	struct ncb *np = from_timer(np, t, timer);
 	unsigned long flags;
 	struct scsi_cmnd *done_list;
 
@@ -8357,7 +8357,7 @@ struct Scsi_Host * __init ncr_attach(struct scsi_host_template *tpnt,
 	if (!np->scripth0)
 		goto attach_error;
 
-	setup_timer(&np->timer, ncr53c8xx_timeout, (unsigned long)np);
+	timer_setup(&np->timer, ncr53c8xx_timeout, 0);
 
 	/* Try to map the controller chip to virtual and physical memory. */
 
diff --git a/drivers/staging/greybus/operation.c b/drivers/staging/greybus/operation.c
index 609332b3e15b..c462b1c046cd 100644
--- a/drivers/staging/greybus/operation.c
+++ b/drivers/staging/greybus/operation.c
@@ -293,9 +293,9 @@ static void gb_operation_work(struct work_struct *work)
 	gb_operation_put(operation);
 }
 
-static void gb_operation_timeout(unsigned long arg)
+static void gb_operation_timeout(struct timer_list *t)
 {
-	struct gb_operation *operation = (void *)arg;
+	struct gb_operation *operation = from_timer(operation, t, timer);
 
 	if (gb_operation_result_set(operation, -ETIMEDOUT)) {
 		/*
@@ -540,8 +540,7 @@ gb_operation_create_common(struct gb_connection *connection, u8 type,
 			goto err_request;
 		}
 
-		setup_timer(&operation->timer, gb_operation_timeout,
-			    (unsigned long)operation);
+		timer_setup(&operation->timer, gb_operation_timeout, 0);
 	}
 
 	operation->flags = op_flags;
diff --git a/drivers/staging/lustre/lnet/lnet/net_fault.c b/drivers/staging/lustre/lnet/lnet/net_fault.c
index 3c83aa31e2c2..5a5d1811ffbe 100644
--- a/drivers/staging/lustre/lnet/lnet/net_fault.c
+++ b/drivers/staging/lustre/lnet/lnet/net_fault.c
@@ -700,9 +700,9 @@ lnet_delay_rule_daemon(void *arg)
 }
 
 static void
-delay_timer_cb(unsigned long arg)
+delay_timer_cb(struct timer_list *t)
 {
-	struct lnet_delay_rule *rule = (struct lnet_delay_rule *)arg;
+	struct lnet_delay_rule *rule = from_timer(rule, t, dl_timer);
 
 	spin_lock_bh(&delay_dd.dd_lock);
 	if (list_empty(&rule->dl_sched_link) && delay_dd.dd_running) {
@@ -762,7 +762,7 @@ lnet_delay_rule_add(struct lnet_fault_attr *attr)
 		wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_running);
 	}
 
-	setup_timer(&rule->dl_timer, delay_timer_cb, (unsigned long)rule);
+	timer_setup(&rule->dl_timer, delay_timer_cb, 0);
 
 	spin_lock_init(&rule->dl_lock);
 	INIT_LIST_HEAD(&rule->dl_msg_list);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
index 23cdb7c4476c..63be6e7273f3 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -329,11 +329,11 @@ ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
 	return -1;
 }
 
-static void ptlrpc_at_timer(unsigned long castmeharder)
+static void ptlrpc_at_timer(struct timer_list *t)
 {
 	struct ptlrpc_service_part *svcpt;
 
-	svcpt = (struct ptlrpc_service_part *)castmeharder;
+	svcpt = from_timer(svcpt, t, scp_at_timer);
 
 	svcpt->scp_at_check = 1;
 	svcpt->scp_at_checktime = cfs_time_current();
@@ -506,8 +506,7 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc,
 	if (!array->paa_reqs_count)
 		goto free_reqs_array;
 
-	setup_timer(&svcpt->scp_at_timer, ptlrpc_at_timer,
-		    (unsigned long)svcpt);
+	timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer, 0);
 
 	/* At SOW, service time should be quick; 10s seems generous. If client
 	 * timeout is less than this, we'll be sending an early reply.
@@ -926,7 +925,7 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
 	next = (__s32)(array->paa_deadline - ktime_get_real_seconds() -
 		       at_early_margin);
 	if (next <= 0) {
-		ptlrpc_at_timer((unsigned long)svcpt);
+		ptlrpc_at_timer(&svcpt->scp_at_timer);
 	} else {
 		mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next));
 		CDEBUG(D_INFO, "armed %s at %+ds\n",
diff --git a/drivers/staging/media/imx/imx-ic-prpencvf.c b/drivers/staging/media/imx/imx-ic-prpencvf.c
index 0790b3d9e255..143038c6c403 100644
--- a/drivers/staging/media/imx/imx-ic-prpencvf.c
+++ b/drivers/staging/media/imx/imx-ic-prpencvf.c
@@ -293,9 +293,9 @@ static irqreturn_t prp_nfb4eof_interrupt(int irq, void *dev_id)
  * EOF timeout timer function. This is an unrecoverable condition
  * without a stream restart.
  */
-static void prp_eof_timeout(unsigned long data)
+static void prp_eof_timeout(struct timer_list *t)
 {
-	struct prp_priv *priv = (struct prp_priv *)data;
+	struct prp_priv *priv = from_timer(priv, t, eof_timeout_timer);
 	struct imx_media_video_dev *vdev = priv->vdev;
 	struct imx_ic_priv *ic_priv = priv->ic_priv;
 
@@ -1292,8 +1292,7 @@ static int prp_init(struct imx_ic_priv *ic_priv)
 	priv->ic_priv = ic_priv;
 
 	spin_lock_init(&priv->irqlock);
-	setup_timer(&priv->eof_timeout_timer, prp_eof_timeout,
-		    (unsigned long)priv);
+	timer_setup(&priv->eof_timeout_timer, prp_eof_timeout, 0);
 
 	priv->vdev = imx_media_capture_device_init(&ic_priv->sd,
 						   PRPENCVF_SRC_PAD);
diff --git a/drivers/staging/media/imx/imx-media-csi.c b/drivers/staging/media/imx/imx-media-csi.c
index 6d856118c223..bb1d6dafca83 100644
--- a/drivers/staging/media/imx/imx-media-csi.c
+++ b/drivers/staging/media/imx/imx-media-csi.c
@@ -254,9 +254,9 @@ static irqreturn_t csi_idmac_nfb4eof_interrupt(int irq, void *dev_id)
  * EOF timeout timer function. This is an unrecoverable condition
  * without a stream restart.
  */
-static void csi_idmac_eof_timeout(unsigned long data)
+static void csi_idmac_eof_timeout(struct timer_list *t)
 {
-	struct csi_priv *priv = (struct csi_priv *)data;
+	struct csi_priv *priv = from_timer(priv, t, eof_timeout_timer);
 	struct imx_media_video_dev *vdev = priv->vdev;
 
 	v4l2_err(&priv->sd, "EOF timeout\n");
@@ -1739,8 +1739,7 @@ static int imx_csi_probe(struct platform_device *pdev)
 	priv->csi_id = pdata->csi;
 	priv->smfc_id = (priv->csi_id == 0) ? 0 : 2;
 
-	setup_timer(&priv->eof_timeout_timer, csi_idmac_eof_timeout,
-		    (unsigned long)priv);
+	timer_setup(&priv->eof_timeout_timer, csi_idmac_eof_timeout, 0);
 	spin_lock_init(&priv->irqlock);
 
 	v4l2_subdev_init(&priv->sd, &csi_subdev_ops);
diff --git a/drivers/staging/most/hdm-usb/hdm_usb.c b/drivers/staging/most/hdm-usb/hdm_usb.c
index 85775da293fb..667dacac81f0 100644
--- a/drivers/staging/most/hdm-usb/hdm_usb.c
+++ b/drivers/staging/most/hdm-usb/hdm_usb.c
@@ -744,9 +744,9 @@ static void hdm_request_netinfo(struct most_interface *iface, int channel,
  * The handler runs in interrupt context. That's why we need to defer the
  * tasks to a work queue.
  */
-static void link_stat_timer_handler(unsigned long data)
+static void link_stat_timer_handler(struct timer_list *t)
 {
-	struct most_dev *mdev = (struct most_dev *)data;
+	struct most_dev *mdev = from_timer(mdev, t, link_stat_timer);
 
 	schedule_work(&mdev->poll_work_obj);
 	mdev->link_stat_timer.expires = jiffies + (2 * HZ);
@@ -1138,8 +1138,7 @@ hdm_probe(struct usb_interface *interface, const struct usb_device_id *id)
 	num_endpoints = usb_iface_desc->desc.bNumEndpoints;
 	mutex_init(&mdev->io_mutex);
 	INIT_WORK(&mdev->poll_work_obj, wq_netinfo);
-	setup_timer(&mdev->link_stat_timer, link_stat_timer_handler,
-		    (unsigned long)mdev);
+	timer_setup(&mdev->link_stat_timer, link_stat_timer_handler, 0);
 
 	mdev->usb_device = usb_dev;
 	mdev->link_stat_timer.expires = jiffies + (2 * HZ);
diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
index 4e7908322d77..f56fdc7a4b61 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
@@ -391,10 +391,10 @@ static void ieee80211_send_beacon(struct ieee80211_device *ieee)
 }
 
 
-static void ieee80211_send_beacon_cb(unsigned long _ieee)
+static void ieee80211_send_beacon_cb(struct timer_list *t)
 {
 	struct ieee80211_device *ieee =
-		(struct ieee80211_device *) _ieee;
+		from_timer(ieee, t, beacon_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ieee->beacon_lock, flags);
@@ -1251,9 +1251,11 @@ void ieee80211_associate_abort(struct ieee80211_device *ieee)
 	spin_unlock_irqrestore(&ieee->lock, flags);
 }
 
-static void ieee80211_associate_abort_cb(unsigned long dev)
+static void ieee80211_associate_abort_cb(struct timer_list *t)
 {
-	ieee80211_associate_abort((struct ieee80211_device *) dev);
+	struct ieee80211_device *dev = from_timer(dev, t, associate_timer);
+
+	ieee80211_associate_abort(dev);
 }
 
 
@@ -2718,11 +2720,9 @@ void ieee80211_softmac_init(struct ieee80211_device *ieee)
 	ieee->enable_rx_imm_BA = true;
 	ieee->tx_pending.txb = NULL;
 
-	setup_timer(&ieee->associate_timer, ieee80211_associate_abort_cb,
-		    (unsigned long)ieee);
+	timer_setup(&ieee->associate_timer, ieee80211_associate_abort_cb, 0);
 
-	setup_timer(&ieee->beacon_timer, ieee80211_send_beacon_cb,
-		    (unsigned long)ieee);
+	timer_setup(&ieee->beacon_timer, ieee80211_send_beacon_cb, 0);
 
 
 	INIT_DELAYED_WORK(&ieee->start_ibss_wq, ieee80211_start_ibss_wq);
diff --git a/drivers/staging/rtl8712/recv_linux.c b/drivers/staging/rtl8712/recv_linux.c
index 576c15d25a0f..986a55bb9877 100644
--- a/drivers/staging/rtl8712/recv_linux.c
+++ b/drivers/staging/rtl8712/recv_linux.c
@@ -138,17 +138,16 @@ _recv_indicatepkt_drop:
 	precvpriv->rx_drop++;
 }
 
-static void _r8712_reordering_ctrl_timeout_handler (unsigned long data)
+static void _r8712_reordering_ctrl_timeout_handler (struct timer_list *t)
 {
 	struct recv_reorder_ctrl *preorder_ctrl =
-			 (struct recv_reorder_ctrl *)data;
+			 from_timer(preorder_ctrl, t, reordering_ctrl_timer);
 
 	r8712_reordering_ctrl_timeout_handler(preorder_ctrl);
 }
 
 void r8712_init_recv_timer(struct recv_reorder_ctrl *preorder_ctrl)
 {
-	setup_timer(&preorder_ctrl->reordering_ctrl_timer,
-		     _r8712_reordering_ctrl_timeout_handler,
-		     (unsigned long)preorder_ctrl);
+	timer_setup(&preorder_ctrl->reordering_ctrl_timer,
+		    _r8712_reordering_ctrl_timeout_handler, 0);
 }
diff --git a/drivers/staging/rtl8712/rtl8712_led.c b/drivers/staging/rtl8712/rtl8712_led.c
index da1d4a641dcd..455fba721135 100644
--- a/drivers/staging/rtl8712/rtl8712_led.c
+++ b/drivers/staging/rtl8712/rtl8712_led.c
@@ -74,7 +74,7 @@ enum _LED_STATE_871x {
  *	Prototype of protected function.
  *===========================================================================
  */
-static void BlinkTimerCallback(unsigned long data);
+static void BlinkTimerCallback(struct timer_list *t);
 
 static void BlinkWorkItemCallback(struct work_struct *work);
 /*===========================================================================
@@ -99,8 +99,7 @@ static void InitLed871x(struct _adapter *padapter, struct LED_871x *pLed,
 	pLed->bLedBlinkInProgress = false;
 	pLed->BlinkTimes = 0;
 	pLed->BlinkingLedState = LED_UNKNOWN;
-	setup_timer(&pLed->BlinkTimer, BlinkTimerCallback,
-		    (unsigned long)pLed);
+	timer_setup(&pLed->BlinkTimer, BlinkTimerCallback, 0);
 	INIT_WORK(&pLed->BlinkWorkItem, BlinkWorkItemCallback);
 }
 
@@ -825,9 +824,9 @@ static void SwLedBlink6(struct LED_871x *pLed)
  *		Callback function of LED BlinkTimer,
  *		it just schedules to corresponding BlinkWorkItem.
  */
-static void BlinkTimerCallback(unsigned long data)
+static void BlinkTimerCallback(struct timer_list *t)
 {
-	struct LED_871x  *pLed = (struct LED_871x *)data;
+	struct LED_871x  *pLed = from_timer(pLed, t, BlinkTimer);
 
 	/* This fixed the crash problem on Fedora 12 when trying to do the
 	 * insmod;ifconfig up;rmmod commands.
diff --git a/drivers/staging/unisys/visorbus/visorbus_main.c b/drivers/staging/unisys/visorbus/visorbus_main.c
index b604d0cccef1..6cb6eb0673c6 100644
--- a/drivers/staging/unisys/visorbus/visorbus_main.c
+++ b/drivers/staging/unisys/visorbus/visorbus_main.c
@@ -493,9 +493,9 @@ static const struct file_operations bus_info_debugfs_fops = {
 	.release = single_release,
 };
 
-static void dev_periodic_work(unsigned long __opaque)
+static void dev_periodic_work(struct timer_list *t)
 {
-	struct visor_device *dev = (struct visor_device *)__opaque;
+	struct visor_device *dev = from_timer(dev, t, timer);
 	struct visor_driver *drv = to_visor_driver(dev->device.driver);
 
 	drv->channel_interrupt(dev);
@@ -667,7 +667,7 @@ int create_visor_device(struct visor_device *dev)
 	dev->device.release = visorbus_release_device;
 	/* keep a reference just for us (now 2) */
 	get_device(&dev->device);
-	setup_timer(&dev->timer, dev_periodic_work, (unsigned long)dev);
+	timer_setup(&dev->timer, dev_periodic_work, 0);
 	/*
 	 * bus_id must be a unique name with respect to this bus TYPE (NOT bus
 	 * instance).  That's why we need to include the bus number within the
diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c
index 735d7e5fa86b..6d8239163ba5 100644
--- a/drivers/staging/unisys/visornic/visornic_main.c
+++ b/drivers/staging/unisys/visornic/visornic_main.c
@@ -1766,9 +1766,10 @@ static int visornic_poll(struct napi_struct *napi, int budget)
  * Main function of the vnic_incoming thread. Periodically check the response
  * queue and drain it if needed.
  */
-static void poll_for_irq(unsigned long v)
+static void poll_for_irq(struct timer_list *t)
 {
-	struct visornic_devdata *devdata = (struct visornic_devdata *)v;
+	struct visornic_devdata *devdata = from_timer(devdata, t,
+						      irq_poll_timer);
 
 	if (!visorchannel_signalempty(
 				   devdata->dev->visorchannel,
@@ -1899,8 +1900,7 @@ static int visornic_probe(struct visor_device *dev)
 	/* Let's start our threads to get responses */
 	netif_napi_add(netdev, &devdata->napi, visornic_poll, NAPI_WEIGHT);
 
-	setup_timer(&devdata->irq_poll_timer, poll_for_irq,
-		    (unsigned long)devdata);
+	timer_setup(&devdata->irq_poll_timer, poll_for_irq, 0);
 	/* Note: This time has to start running before the while
 	 * loop below because the napi routine is responsible for
 	 * setting enab_dis_acked
diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
index 8a275996d4e6..028da1dc1b81 100644
--- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
+++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
@@ -267,7 +267,7 @@ static void update_scan_time(void)
 		last_scanned_shadow[i].time_scan = jiffies;
 }
 
-static void remove_network_from_shadow(unsigned long unused)
+static void remove_network_from_shadow(struct timer_list *unused)
 {
 	unsigned long now = jiffies;
 	int i, j;
@@ -292,7 +292,7 @@ static void remove_network_from_shadow(unsigned long unused)
 	}
 }
 
-static void clear_duringIP(unsigned long arg)
+static void clear_duringIP(struct timer_list *unused)
 {
 	wilc_optaining_ip = false;
 }
@@ -2278,8 +2278,8 @@ int wilc_init_host_int(struct net_device *net)
 
 	priv = wdev_priv(net->ieee80211_ptr);
 	if (op_ifcs == 0) {
-		setup_timer(&hAgingTimer, remove_network_from_shadow, 0);
-		setup_timer(&wilc_during_ip_timer, clear_duringIP, 0);
+		timer_setup(&hAgingTimer, remove_network_from_shadow, 0);
+		timer_setup(&wilc_during_ip_timer, clear_duringIP, 0);
 	}
 	op_ifcs++;
 
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 9469695f5871..a8eaed2c211a 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1044,9 +1044,9 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data)
 	return 0;
 }
 
-static void tcmu_device_timedout(unsigned long data)
+static void tcmu_device_timedout(struct timer_list *t)
 {
-	struct tcmu_dev *udev = (struct tcmu_dev *)data;
+	struct tcmu_dev *udev = from_timer(udev, t, timeout);
 	unsigned long flags;
 
 	spin_lock_irqsave(&udev->commands_lock, flags);
@@ -1106,8 +1106,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	idr_init(&udev->commands);
 	spin_lock_init(&udev->commands_lock);
 
-	setup_timer(&udev->timeout, tcmu_device_timedout,
-		(unsigned long)udev);
+	timer_setup(&udev->timeout, tcmu_device_timedout, 0);
 
 	init_waitqueue_head(&udev->nl_cmd_wq);
 	spin_lock_init(&udev->nl_cmd_lock);
diff --git a/drivers/tty/ipwireless/hardware.c b/drivers/tty/ipwireless/hardware.c
index a6b8240af6cd..b0baa4ce10f9 100644
--- a/drivers/tty/ipwireless/hardware.c
+++ b/drivers/tty/ipwireless/hardware.c
@@ -33,7 +33,7 @@ static void handle_received_SETUP_packet(struct ipw_hardware *ipw,
 					 unsigned int address,
 					 const unsigned char *data, int len,
 					 int is_last);
-static void ipwireless_setup_timer(unsigned long data);
+static void ipwireless_setup_timer(struct timer_list *t);
 static void handle_received_CTRL_packet(struct ipw_hardware *hw,
 		unsigned int channel_idx, const unsigned char *data, int len);
 
@@ -1635,8 +1635,7 @@ struct ipw_hardware *ipwireless_hardware_create(void)
 	spin_lock_init(&hw->lock);
 	tasklet_init(&hw->tasklet, ipwireless_do_tasklet, (unsigned long) hw);
 	INIT_WORK(&hw->work_rx, ipw_receive_data_work);
-	setup_timer(&hw->setup_timer, ipwireless_setup_timer,
-			(unsigned long) hw);
+	timer_setup(&hw->setup_timer, ipwireless_setup_timer, 0);
 
 	return hw;
 }
@@ -1670,12 +1669,12 @@ void ipwireless_init_hardware_v2_v3(struct ipw_hardware *hw)
 	hw->init_loops = 0;
 	printk(KERN_INFO IPWIRELESS_PCCARD_NAME
 	       ": waiting for card to start up...\n");
-	ipwireless_setup_timer((unsigned long) hw);
+	ipwireless_setup_timer(&hw->setup_timer);
 }
 
-static void ipwireless_setup_timer(unsigned long data)
+static void ipwireless_setup_timer(struct timer_list *t)
 {
-	struct ipw_hardware *hw = (struct ipw_hardware *) data;
+	struct ipw_hardware *hw = from_timer(hw, t, setup_timer);
 
 	hw->init_loops++;
 
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 3a39eb685c69..5131bdc9e765 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -1310,9 +1310,9 @@ static void gsm_control_transmit(struct gsm_mux *gsm, struct gsm_control *ctrl)
  *	gsm->pending_cmd will be NULL and we just let the timer expire.
  */
 
-static void gsm_control_retransmit(unsigned long data)
+static void gsm_control_retransmit(struct timer_list *t)
 {
-	struct gsm_mux *gsm = (struct gsm_mux *)data;
+	struct gsm_mux *gsm = from_timer(gsm, t, t2_timer);
 	struct gsm_control *ctrl;
 	unsigned long flags;
 	spin_lock_irqsave(&gsm->control_lock, flags);
@@ -1453,9 +1453,9 @@ static void gsm_dlci_open(struct gsm_dlci *dlci)
  *	end will get a DM response)
  */
 
-static void gsm_dlci_t1(unsigned long data)
+static void gsm_dlci_t1(struct timer_list *t)
 {
-	struct gsm_dlci *dlci = (struct gsm_dlci *)data;
+	struct gsm_dlci *dlci = from_timer(dlci, t, t1);
 	struct gsm_mux *gsm = dlci->gsm;
 
 	switch (dlci->state) {
@@ -1634,7 +1634,7 @@ static struct gsm_dlci *gsm_dlci_alloc(struct gsm_mux *gsm, int addr)
 	}
 
 	skb_queue_head_init(&dlci->skb_list);
-	setup_timer(&dlci->t1, gsm_dlci_t1, (unsigned long)dlci);
+	timer_setup(&dlci->t1, gsm_dlci_t1, 0);
 	tty_port_init(&dlci->port);
 	dlci->port.ops = &gsm_port_ops;
 	dlci->gsm = gsm;
@@ -2088,7 +2088,7 @@ static int gsm_activate_mux(struct gsm_mux *gsm)
 	struct gsm_dlci *dlci;
 	int i = 0;
 
-	setup_timer(&gsm->t2_timer, gsm_control_retransmit, (unsigned long)gsm);
+	timer_setup(&gsm->t2_timer, gsm_control_retransmit, 0);
 	init_waitqueue_head(&gsm->event);
 	spin_lock_init(&gsm->control_lock);
 	spin_lock_init(&gsm->tx_lock);
diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c
index 9f246d4db3ca..30bb0900cd2f 100644
--- a/drivers/tty/n_r3964.c
+++ b/drivers/tty/n_r3964.c
@@ -115,7 +115,7 @@ static void retry_transmit(struct r3964_info *pInfo);
 static void transmit_block(struct r3964_info *pInfo);
 static void receive_char(struct r3964_info *pInfo, const unsigned char c);
 static void receive_error(struct r3964_info *pInfo, const char flag);
-static void on_timeout(unsigned long priv);
+static void on_timeout(struct timer_list *t);
 static int enable_signals(struct r3964_info *pInfo, struct pid *pid, int arg);
 static int read_telegram(struct r3964_info *pInfo, struct pid *pid,
 		unsigned char __user * buf);
@@ -688,9 +688,9 @@ static void receive_error(struct r3964_info *pInfo, const char flag)
 	}
 }
 
-static void on_timeout(unsigned long priv)
+static void on_timeout(struct timer_list *t)
 {
-	struct r3964_info *pInfo = (void *)priv;
+	struct r3964_info *pInfo = from_timer(pInfo, t, tmr);
 
 	switch (pInfo->state) {
 	case R3964_TX_REQUEST:
@@ -993,7 +993,7 @@ static int r3964_open(struct tty_struct *tty)
 	tty->disc_data = pInfo;
 	tty->receive_room = 65536;
 
-	setup_timer(&pInfo->tmr, on_timeout, (unsigned long)pInfo);
+	timer_setup(&pInfo->tmr, on_timeout, 0);
 
 	return 0;
 }
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index 1421804975e0..c9458a033e3c 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -2059,7 +2059,7 @@ static void flush_timeout_function(unsigned long data)
 static struct timer_list flush_timer;
 
 static void
-timed_flush_handler(unsigned long ptr)
+timed_flush_handler(struct timer_list *unused)
 {
 	struct e100_serial *info;
 	int i;
@@ -4137,7 +4137,7 @@ static int __init rs_init(void)
 	/* Setup the timed flush handler system */
 
 #if !defined(CONFIG_ETRAX_SERIAL_FAST_TIMER)
-	setup_timer(&flush_timer, timed_flush_handler, 0);
+	timer_setup(&flush_timer, timed_flush_handler, 0);
 	mod_timer(&flush_timer, jiffies + 5);
 #endif
 
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index c84e6f0db54e..1c4d3f387138 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -966,9 +966,9 @@ static void lpuart_dma_rx_complete(void *arg)
 	lpuart_copy_rx_to_tty(sport);
 }
 
-static void lpuart_timer_func(unsigned long data)
+static void lpuart_timer_func(struct timer_list *t)
 {
-	struct lpuart_port *sport = (struct lpuart_port *)data;
+	struct lpuart_port *sport = from_timer(sport, t, lpuart_timer);
 
 	lpuart_copy_rx_to_tty(sport);
 }
@@ -1263,8 +1263,7 @@ static void lpuart32_setup_watermark(struct lpuart_port *sport)
 
 static void rx_dma_timer_init(struct lpuart_port *sport)
 {
-		setup_timer(&sport->lpuart_timer, lpuart_timer_func,
-				(unsigned long)sport);
+		timer_setup(&sport->lpuart_timer, lpuart_timer_func, 0);
 		sport->lpuart_timer.expires = jiffies + sport->dma_rx_timeout;
 		add_timer(&sport->lpuart_timer);
 }
diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c
index 473f4f81d690..ffefd218761e 100644
--- a/drivers/tty/serial/ifx6x60.c
+++ b/drivers/tty/serial/ifx6x60.c
@@ -263,9 +263,9 @@ static void mrdy_assert(struct ifx_spi_device *ifx_dev)
  *	The SPI has timed out: hang up the tty. Users will then see a hangup
  *	and error events.
  */
-static void ifx_spi_timeout(unsigned long arg)
+static void ifx_spi_timeout(struct timer_list *t)
 {
-	struct ifx_spi_device *ifx_dev = (struct ifx_spi_device *)arg;
+	struct ifx_spi_device *ifx_dev = from_timer(ifx_dev, t, spi_timer);
 
 	dev_warn(&ifx_dev->spi_dev->dev, "*** SPI Timeout ***");
 	tty_port_tty_hangup(&ifx_dev->tty_port, false);
@@ -1016,8 +1016,7 @@ static int ifx_spi_spi_probe(struct spi_device *spi)
 	spin_lock_init(&ifx_dev->write_lock);
 	spin_lock_init(&ifx_dev->power_lock);
 	ifx_dev->power_status = 0;
-	setup_timer(&ifx_dev->spi_timer, ifx_spi_timeout,
-		    (unsigned long)ifx_dev);
+	timer_setup(&ifx_dev->spi_timer, ifx_spi_timeout, 0);
 	ifx_dev->modem = pl_data->modem_type;
 	ifx_dev->use_dma = pl_data->use_dma;
 	ifx_dev->max_hz = pl_data->max_hz;
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index a67a606c38eb..e4b3d9123a03 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -906,9 +906,9 @@ static void imx_break_ctl(struct uart_port *port, int break_state)
  * This is our per-port timeout handler, for checking the
  * modem status signals.
  */
-static void imx_timeout(unsigned long data)
+static void imx_timeout(struct timer_list *t)
 {
-	struct imx_port *sport = (struct imx_port *)data;
+	struct imx_port *sport = from_timer(sport, t, timer);
 	unsigned long flags;
 
 	if (sport->port.state) {
@@ -2082,7 +2082,7 @@ static int serial_imx_probe(struct platform_device *pdev)
 	sport->port.rs485_config = imx_rs485_config;
 	sport->port.rs485.flags |= SER_RS485_RTS_ON_SEND;
 	sport->port.flags = UPF_BOOT_AUTOCONF;
-	setup_timer(&sport->timer, imx_timeout, (unsigned long)sport);
+	timer_setup(&sport->timer, imx_timeout, 0);
 
 	sport->gpios = mctrl_gpio_init(&sport->port, 0);
 	if (IS_ERR(sport->gpios))
diff --git a/drivers/tty/serial/kgdb_nmi.c b/drivers/tty/serial/kgdb_nmi.c
index ed2b03058627..4029272891f9 100644
--- a/drivers/tty/serial/kgdb_nmi.c
+++ b/drivers/tty/serial/kgdb_nmi.c
@@ -188,9 +188,9 @@ bool kgdb_nmi_poll_knock(void)
  * The tasklet is cheap, it does not cause wakeups when reschedules itself,
  * instead it waits for the next tick.
  */
-static void kgdb_nmi_tty_receiver(unsigned long data)
+static void kgdb_nmi_tty_receiver(struct timer_list *t)
 {
-	struct kgdb_nmi_tty_priv *priv = (void *)data;
+	struct kgdb_nmi_tty_priv *priv = from_timer(priv, t, timer);
 	char ch;
 
 	priv->timer.expires = jiffies + (HZ/100);
@@ -241,7 +241,7 @@ static int kgdb_nmi_tty_install(struct tty_driver *drv, struct tty_struct *tty)
 		return -ENOMEM;
 
 	INIT_KFIFO(priv->fifo);
-	setup_timer(&priv->timer, kgdb_nmi_tty_receiver, (unsigned long)priv);
+	timer_setup(&priv->timer, kgdb_nmi_tty_receiver, 0);
 	tty_port_init(&priv->port);
 	priv->port.ops = &kgdb_nmi_tty_port_ops;
 	tty->driver_data = priv;
diff --git a/drivers/tty/serial/max3100.c b/drivers/tty/serial/max3100.c
index 27d6049eb6a9..371569a0fd00 100644
--- a/drivers/tty/serial/max3100.c
+++ b/drivers/tty/serial/max3100.c
@@ -178,9 +178,9 @@ static void max3100_dowork(struct max3100_port *s)
 		queue_work(s->workqueue, &s->work);
 }
 
-static void max3100_timeout(unsigned long data)
+static void max3100_timeout(struct timer_list *t)
 {
-	struct max3100_port *s = (struct max3100_port *)data;
+	struct max3100_port *s = from_timer(s, t, timer);
 
 	if (s->port.state) {
 		max3100_dowork(s);
@@ -780,8 +780,7 @@ static int max3100_probe(struct spi_device *spi)
 		max3100s[i]->poll_time = 1;
 	max3100s[i]->max3100_hw_suspend = pdata->max3100_hw_suspend;
 	max3100s[i]->minor = i;
-	setup_timer(&max3100s[i]->timer, max3100_timeout,
-		    (unsigned long)max3100s[i]);
+	timer_setup(&max3100s[i]->timer, max3100_timeout, 0);
 
 	dev_dbg(&spi->dev, "%s: adding port %d\n", __func__, i);
 	max3100s[i]->port.irq = max3100s[i]->irq;
diff --git a/drivers/tty/serial/mux.c b/drivers/tty/serial/mux.c
index 3b74369c262f..00ce31e8d19a 100644
--- a/drivers/tty/serial/mux.c
+++ b/drivers/tty/serial/mux.c
@@ -371,7 +371,7 @@ static int mux_verify_port(struct uart_port *port, struct serial_struct *ser)
  *
  * This function periodically polls the Serial MUX to check for new data.
  */
-static void mux_poll(unsigned long unused)
+static void mux_poll(struct timer_list *unused)
 {  
 	int i;
 
@@ -572,7 +572,7 @@ static int __init mux_init(void)
 
 	if(port_cnt > 0) {
 		/* Start the Mux timer */
-		setup_timer(&mux_timer, mux_poll, 0UL);
+		timer_setup(&mux_timer, mux_poll, 0);
 		mod_timer(&mux_timer, jiffies + MUX_POLL_DELAY);
 
 #ifdef CONFIG_SERIAL_MUX_CONSOLE
diff --git a/drivers/tty/serial/pnx8xxx_uart.c b/drivers/tty/serial/pnx8xxx_uart.c
index f8812389b8a8..223a9499104e 100644
--- a/drivers/tty/serial/pnx8xxx_uart.c
+++ b/drivers/tty/serial/pnx8xxx_uart.c
@@ -103,9 +103,9 @@ static void pnx8xxx_mctrl_check(struct pnx8xxx_port *sport)
  * This is our per-port timeout handler, for checking the
  * modem status signals.
  */
-static void pnx8xxx_timeout(unsigned long data)
+static void pnx8xxx_timeout(struct timer_list *t)
 {
-	struct pnx8xxx_port *sport = (struct pnx8xxx_port *)data;
+	struct pnx8xxx_port *sport = from_timer(sport, t, timer);
 	unsigned long flags;
 
 	if (sport->port.state) {
@@ -662,8 +662,7 @@ static void __init pnx8xxx_init_ports(void)
 	first = 0;
 
 	for (i = 0; i < NR_PORTS; i++) {
-		setup_timer(&pnx8xxx_ports[i].timer, pnx8xxx_timeout,
-			    (unsigned long)&pnx8xxx_ports[i]);
+		timer_setup(&pnx8xxx_ports[i].timer, pnx8xxx_timeout, 0);
 		pnx8xxx_ports[i].port.ops = &pnx8xxx_pops;
 	}
 }
diff --git a/drivers/tty/serial/sa1100.c b/drivers/tty/serial/sa1100.c
index 4e3f169b30cf..a399772be3fc 100644
--- a/drivers/tty/serial/sa1100.c
+++ b/drivers/tty/serial/sa1100.c
@@ -110,9 +110,9 @@ static void sa1100_mctrl_check(struct sa1100_port *sport)
  * This is our per-port timeout handler, for checking the
  * modem status signals.
  */
-static void sa1100_timeout(unsigned long data)
+static void sa1100_timeout(struct timer_list *t)
 {
-	struct sa1100_port *sport = (struct sa1100_port *)data;
+	struct sa1100_port *sport = from_timer(sport, t, timer);
 	unsigned long flags;
 
 	if (sport->port.state) {
@@ -627,8 +627,7 @@ static void __init sa1100_init_ports(void)
 		sa1100_ports[i].port.fifosize  = 8;
 		sa1100_ports[i].port.line      = i;
 		sa1100_ports[i].port.iotype    = UPIO_MEM;
-		setup_timer(&sa1100_ports[i].timer, sa1100_timeout,
-			    (unsigned long)&sa1100_ports[i]);
+		timer_setup(&sa1100_ports[i].timer, sa1100_timeout, 0);
 	}
 
 	/*
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index 31fcc7072a90..d9f399c4e90c 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -1058,9 +1058,9 @@ static int scif_rtrg_enabled(struct uart_port *port)
 			(SCFCR_RTRG0 | SCFCR_RTRG1)) != 0;
 }
 
-static void rx_fifo_timer_fn(unsigned long arg)
+static void rx_fifo_timer_fn(struct timer_list *t)
 {
-	struct sci_port *s = (struct sci_port *)arg;
+	struct sci_port *s = from_timer(s, t, rx_fifo_timer);
 	struct uart_port *port = &s->port;
 
 	dev_dbg(port->dev, "Rx timed out\n");
@@ -1138,8 +1138,7 @@ static ssize_t rx_fifo_timeout_store(struct device *dev,
 		sci->rx_fifo_timeout = r;
 		scif_set_rtrg(port, 1);
 		if (r > 0)
-			setup_timer(&sci->rx_fifo_timer, rx_fifo_timer_fn,
-				    (unsigned long)sci);
+			timer_setup(&sci->rx_fifo_timer, rx_fifo_timer_fn, 0);
 	}
 
 	return count;
@@ -1392,9 +1391,9 @@ static void work_fn_tx(struct work_struct *work)
 	dma_async_issue_pending(chan);
 }
 
-static void rx_timer_fn(unsigned long arg)
+static void rx_timer_fn(struct timer_list *t)
 {
-	struct sci_port *s = (struct sci_port *)arg;
+	struct sci_port *s = from_timer(s, t, rx_timer);
 	struct dma_chan *chan = s->chan_rx;
 	struct uart_port *port = &s->port;
 	struct dma_tx_state state;
@@ -1572,7 +1571,7 @@ static void sci_request_dma(struct uart_port *port)
 			dma += s->buf_len_rx;
 		}
 
-		setup_timer(&s->rx_timer, rx_timer_fn, (unsigned long)s);
+		timer_setup(&s->rx_timer, rx_timer_fn, 0);
 
 		if (port->type == PORT_SCIFA || port->type == PORT_SCIFB)
 			sci_submit_rx(s);
@@ -2238,8 +2237,7 @@ static void sci_reset(struct uart_port *port)
 	if (s->rx_trigger > 1) {
 		if (s->rx_fifo_timeout) {
 			scif_set_rtrg(port, 1);
-			setup_timer(&s->rx_fifo_timer, rx_fifo_timer_fn,
-				    (unsigned long)s);
+			timer_setup(&s->rx_fifo_timer, rx_fifo_timer_fn, 0);
 		} else {
 			if (port->type == PORT_SCIFA ||
 			    port->type == PORT_SCIFB)
diff --git a/drivers/tty/serial/sn_console.c b/drivers/tty/serial/sn_console.c
index ed78542c4c37..42b9aded4eb1 100644
--- a/drivers/tty/serial/sn_console.c
+++ b/drivers/tty/serial/sn_console.c
@@ -612,9 +612,9 @@ static irqreturn_t sn_sal_interrupt(int irq, void *dev_id)
  * Obviously not used in interrupt mode
  *
  */
-static void sn_sal_timer_poll(unsigned long data)
+static void sn_sal_timer_poll(struct timer_list *t)
 {
-	struct sn_cons_port *port = (struct sn_cons_port *)data;
+	struct sn_cons_port *port = from_timer(port, t, sc_timer);
 	unsigned long flags;
 
 	if (!port)
@@ -668,7 +668,7 @@ static void __init sn_sal_switch_to_asynch(struct sn_cons_port *port)
 	 * timer to poll for input and push data from the console
 	 * buffer.
 	 */
-	setup_timer(&port->sc_timer, sn_sal_timer_poll, (unsigned long)port);
+	timer_setup(&port->sc_timer, sn_sal_timer_poll, 0);
 
 	if (IS_RUNNING_ON_SIMULATOR())
 		port->sc_interrupt_timeout = 6;
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index f2c34d656144..3c4ad71f261d 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -700,7 +700,7 @@ static void usc_enable_async_clock( struct mgsl_struct *info, u32 DataRate );
 
 static void usc_loopback_frame( struct mgsl_struct *info );
 
-static void mgsl_tx_timeout(unsigned long context);
+static void mgsl_tx_timeout(struct timer_list *t);
 
 
 static void usc_loopmode_cancel_transmit( struct mgsl_struct * info );
@@ -1768,7 +1768,7 @@ static int startup(struct mgsl_struct * info)
 	
 	memset(&info->icount, 0, sizeof(info->icount));
 
-	setup_timer(&info->tx_timer, mgsl_tx_timeout, (unsigned long)info);
+	timer_setup(&info->tx_timer, mgsl_tx_timeout, 0);
 	
 	/* Allocate and claim adapter resources */
 	retval = mgsl_claim_resources(info);
@@ -7517,9 +7517,9 @@ static void mgsl_trace_block(struct mgsl_struct *info,const char* data, int coun
  * Arguments:	context		pointer to device instance data
  * Return Value:	None
  */
-static void mgsl_tx_timeout(unsigned long context)
+static void mgsl_tx_timeout(struct timer_list *t)
 {
-	struct mgsl_struct *info = (struct mgsl_struct*)context;
+	struct mgsl_struct *info = from_timer(info, t, tx_timer);
 	unsigned long flags;
 	
 	if ( debug_level >= DEBUG_LEVEL_INFO )
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 06a03731bba7..255c49687877 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -493,8 +493,8 @@ static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count)
 static int  alloc_tmp_rbuf(struct slgt_info *info);
 static void free_tmp_rbuf(struct slgt_info *info);
 
-static void tx_timeout(unsigned long context);
-static void rx_timeout(unsigned long context);
+static void tx_timeout(struct timer_list *t);
+static void rx_timeout(struct timer_list *t);
 
 /*
  * ioctl handlers
@@ -3597,8 +3597,8 @@ static struct slgt_info *alloc_dev(int adapter_num, int port_num, struct pci_dev
 		info->adapter_num = adapter_num;
 		info->port_num = port_num;
 
-		setup_timer(&info->tx_timer, tx_timeout, (unsigned long)info);
-		setup_timer(&info->rx_timer, rx_timeout, (unsigned long)info);
+		timer_setup(&info->tx_timer, tx_timeout, 0);
+		timer_setup(&info->rx_timer, rx_timeout, 0);
 
 		/* Copy configuration info to device instance data */
 		info->pdev = pdev;
@@ -5112,9 +5112,9 @@ static int adapter_test(struct slgt_info *info)
 /*
  * transmit timeout handler
  */
-static void tx_timeout(unsigned long context)
+static void tx_timeout(struct timer_list *t)
 {
-	struct slgt_info *info = (struct slgt_info*)context;
+	struct slgt_info *info = from_timer(info, t, tx_timer);
 	unsigned long flags;
 
 	DBGINFO(("%s tx_timeout\n", info->device_name));
@@ -5136,9 +5136,9 @@ static void tx_timeout(unsigned long context)
 /*
  * receive buffer polling timer
  */
-static void rx_timeout(unsigned long context)
+static void rx_timeout(struct timer_list *t)
 {
-	struct slgt_info *info = (struct slgt_info*)context;
+	struct slgt_info *info = from_timer(info, t, rx_timer);
 	unsigned long flags;
 
 	DBGINFO(("%s rx_timeout\n", info->device_name));
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index d45f234e1914..75f11ce1f0a1 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -615,8 +615,8 @@ static void free_tmp_rx_buf(SLMP_INFO *info);
 
 static void load_pci_memory(SLMP_INFO *info, char* dest, const char* src, unsigned short count);
 static void trace_block(SLMP_INFO *info, const char* data, int count, int xmit);
-static void tx_timeout(unsigned long context);
-static void status_timeout(unsigned long context);
+static void tx_timeout(struct timer_list *t);
+static void status_timeout(struct timer_list *t);
 
 static unsigned char read_reg(SLMP_INFO *info, unsigned char addr);
 static void write_reg(SLMP_INFO *info, unsigned char addr, unsigned char val);
@@ -3782,9 +3782,8 @@ static SLMP_INFO *alloc_dev(int adapter_num, int port_num, struct pci_dev *pdev)
 		info->bus_type = MGSL_BUS_TYPE_PCI;
 		info->irq_flags = IRQF_SHARED;
 
-		setup_timer(&info->tx_timer, tx_timeout, (unsigned long)info);
-		setup_timer(&info->status_timer, status_timeout,
-				(unsigned long)info);
+		timer_setup(&info->tx_timer, tx_timeout, 0);
+		timer_setup(&info->status_timer, status_timeout, 0);
 
 		/* Store the PCI9050 misc control register value because a flaw
 		 * in the PCI9050 prevents LCR registers from being read if
@@ -5468,9 +5467,9 @@ static void trace_block(SLMP_INFO *info,const char* data, int count, int xmit)
 /* called when HDLC frame times out
  * update stats and do tx completion processing
  */
-static void tx_timeout(unsigned long context)
+static void tx_timeout(struct timer_list *t)
 {
-	SLMP_INFO *info = (SLMP_INFO*)context;
+	SLMP_INFO *info = from_timer(info, t, tx_timer);
 	unsigned long flags;
 
 	if ( debug_level >= DEBUG_LEVEL_INFO )
@@ -5495,10 +5494,10 @@ static void tx_timeout(unsigned long context)
 
 /* called to periodically check the DSR/RI modem signal input status
  */
-static void status_timeout(unsigned long context)
+static void status_timeout(struct timer_list *t)
 {
 	u16 status = 0;
-	SLMP_INFO *info = (SLMP_INFO*)context;
+	SLMP_INFO *info = from_timer(info, t, status_timer);
 	unsigned long flags;
 	unsigned char delta;
 
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 19b5c4afeef2..fc32391a34d5 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -788,9 +788,11 @@ void usb_hcd_poll_rh_status(struct usb_hcd *hcd)
 EXPORT_SYMBOL_GPL(usb_hcd_poll_rh_status);
 
 /* timer callback */
-static void rh_timer_func (unsigned long _hcd)
+static void rh_timer_func (struct timer_list *t)
 {
-	usb_hcd_poll_rh_status((struct usb_hcd *) _hcd);
+	struct usb_hcd *_hcd = from_timer(_hcd, t, rh_timer);
+
+	usb_hcd_poll_rh_status(_hcd);
 }
 
 /*-------------------------------------------------------------------------*/
@@ -2545,7 +2547,7 @@ struct usb_hcd *__usb_create_hcd(const struct hc_driver *driver,
 	hcd->self.bus_name = bus_name;
 	hcd->self.uses_dma = (sysdev->dma_mask != NULL);
 
-	setup_timer(&hcd->rh_timer, rh_timer_func, (unsigned long)hcd);
+	timer_setup(&hcd->rh_timer, rh_timer_func, 0);
 #ifdef CONFIG_PM
 	INIT_WORK(&hcd->wakeup_work, hcd_resume_work);
 #endif
diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c
index 69eb40cd1b47..7b6eb0ad513b 100644
--- a/drivers/usb/dwc2/hcd.c
+++ b/drivers/usb/dwc2/hcd.c
@@ -3314,9 +3314,9 @@ host:
 	}
 }
 
-static void dwc2_wakeup_detected(unsigned long data)
+static void dwc2_wakeup_detected(struct timer_list *t)
 {
-	struct dwc2_hsotg *hsotg = (struct dwc2_hsotg *)data;
+	struct dwc2_hsotg *hsotg = from_timer(hsotg, t, wkp_timer);
 	u32 hprt0;
 
 	dev_dbg(hsotg->dev, "%s()\n", __func__);
@@ -5155,8 +5155,7 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg)
 	}
 	INIT_WORK(&hsotg->wf_otg, dwc2_conn_id_status_change);
 
-	setup_timer(&hsotg->wkp_timer, dwc2_wakeup_detected,
-		    (unsigned long)hsotg);
+	timer_setup(&hsotg->wkp_timer, dwc2_wakeup_detected, 0);
 
 	/* Initialize the non-periodic schedule */
 	INIT_LIST_HEAD(&hsotg->non_periodic_sched_inactive);
diff --git a/drivers/usb/dwc2/hcd_queue.c b/drivers/usb/dwc2/hcd_queue.c
index f472de238ac2..fcd1676c7f0b 100644
--- a/drivers/usb/dwc2/hcd_queue.c
+++ b/drivers/usb/dwc2/hcd_queue.c
@@ -1275,9 +1275,9 @@ static void dwc2_do_unreserve(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh)
  *
  * @work: Pointer to a qh unreserve_work.
  */
-static void dwc2_unreserve_timer_fn(unsigned long data)
+static void dwc2_unreserve_timer_fn(struct timer_list *t)
 {
-	struct dwc2_qh *qh = (struct dwc2_qh *)data;
+	struct dwc2_qh *qh = from_timer(qh, t, unreserve_timer);
 	struct dwc2_hsotg *hsotg = qh->hsotg;
 	unsigned long flags;
 
@@ -1467,8 +1467,7 @@ static void dwc2_qh_init(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh,
 
 	/* Initialize QH */
 	qh->hsotg = hsotg;
-	setup_timer(&qh->unreserve_timer, dwc2_unreserve_timer_fn,
-		    (unsigned long)qh);
+	timer_setup(&qh->unreserve_timer, dwc2_unreserve_timer_fn, 0);
 	qh->ep_type = ep_type;
 	qh->ep_is_in = ep_is_in;
 
diff --git a/drivers/usb/gadget/udc/at91_udc.c b/drivers/usb/gadget/udc/at91_udc.c
index bfe278294e88..ad743a8493be 100644
--- a/drivers/usb/gadget/udc/at91_udc.c
+++ b/drivers/usb/gadget/udc/at91_udc.c
@@ -1550,9 +1550,9 @@ static void at91_vbus_timer_work(struct work_struct *work)
 		mod_timer(&udc->vbus_timer, jiffies + VBUS_POLL_TIMEOUT);
 }
 
-static void at91_vbus_timer(unsigned long data)
+static void at91_vbus_timer(struct timer_list *t)
 {
-	struct at91_udc *udc = (struct at91_udc *)data;
+	struct at91_udc *udc = from_timer(udc, t, vbus_timer);
 
 	/*
 	 * If we are polling vbus it is likely that the gpio is on an
@@ -1918,8 +1918,7 @@ static int at91udc_probe(struct platform_device *pdev)
 
 		if (udc->board.vbus_polled) {
 			INIT_WORK(&udc->vbus_timer_work, at91_vbus_timer_work);
-			setup_timer(&udc->vbus_timer, at91_vbus_timer,
-				    (unsigned long)udc);
+			timer_setup(&udc->vbus_timer, at91_vbus_timer, 0);
 			mod_timer(&udc->vbus_timer,
 				  jiffies + VBUS_POLL_TIMEOUT);
 		} else {
diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c
index 4f1b1809472c..d0128f92ec5a 100644
--- a/drivers/usb/gadget/udc/dummy_hcd.c
+++ b/drivers/usb/gadget/udc/dummy_hcd.c
@@ -1771,9 +1771,9 @@ static int handle_control_request(struct dummy_hcd *dum_hcd, struct urb *urb,
 /* drive both sides of the transfers; looks like irq handlers to
  * both drivers except the callbacks aren't in_irq().
  */
-static void dummy_timer(unsigned long _dum_hcd)
+static void dummy_timer(struct timer_list *t)
 {
-	struct dummy_hcd	*dum_hcd = (struct dummy_hcd *) _dum_hcd;
+	struct dummy_hcd	*dum_hcd = from_timer(dum_hcd, t, timer);
 	struct dummy		*dum = dum_hcd->dum;
 	struct urbp		*urbp, *tmp;
 	unsigned long		flags;
@@ -2445,7 +2445,7 @@ static DEVICE_ATTR_RO(urbs);
 
 static int dummy_start_ss(struct dummy_hcd *dum_hcd)
 {
-	setup_timer(&dum_hcd->timer, dummy_timer, (unsigned long)dum_hcd);
+	timer_setup(&dum_hcd->timer, dummy_timer, 0);
 	dum_hcd->rh_state = DUMMY_RH_RUNNING;
 	dum_hcd->stream_en_ep = 0;
 	INIT_LIST_HEAD(&dum_hcd->urbp_list);
@@ -2474,7 +2474,7 @@ static int dummy_start(struct usb_hcd *hcd)
 		return dummy_start_ss(dum_hcd);
 
 	spin_lock_init(&dum_hcd->dum->lock);
-	setup_timer(&dum_hcd->timer, dummy_timer, (unsigned long)dum_hcd);
+	timer_setup(&dum_hcd->timer, dummy_timer, 0);
 	dum_hcd->rh_state = DUMMY_RH_RUNNING;
 
 	INIT_LIST_HEAD(&dum_hcd->urbp_list);
diff --git a/drivers/usb/gadget/udc/m66592-udc.c b/drivers/usb/gadget/udc/m66592-udc.c
index f19e6282a688..a8288df6aadf 100644
--- a/drivers/usb/gadget/udc/m66592-udc.c
+++ b/drivers/usb/gadget/udc/m66592-udc.c
@@ -1259,9 +1259,9 @@ static irqreturn_t m66592_irq(int irq, void *_m66592)
 	return IRQ_HANDLED;
 }
 
-static void m66592_timer(unsigned long _m66592)
+static void m66592_timer(struct timer_list *t)
 {
-	struct m66592 *m66592 = (struct m66592 *)_m66592;
+	struct m66592 *m66592 = from_timer(m66592, t, timer);
 	unsigned long flags;
 	u16 tmp;
 
@@ -1589,7 +1589,7 @@ static int m66592_probe(struct platform_device *pdev)
 	m66592->gadget.max_speed = USB_SPEED_HIGH;
 	m66592->gadget.name = udc_name;
 
-	setup_timer(&m66592->timer, m66592_timer, (unsigned long)m66592);
+	timer_setup(&m66592->timer, m66592_timer, 0);
 	m66592->reg = reg;
 
 	ret = request_irq(ires->start, m66592_irq, IRQF_SHARED,
diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c
index fb8c4bff584c..dc35a54bad90 100644
--- a/drivers/usb/gadget/udc/omap_udc.c
+++ b/drivers/usb/gadget/udc/omap_udc.c
@@ -1854,9 +1854,9 @@ static irqreturn_t omap_udc_irq(int irq, void *_udc)
 #define PIO_OUT_TIMEOUT	(jiffies + HZ/3)
 #define HALF_FULL(f)	(!((f)&(UDC_NON_ISO_FIFO_FULL|UDC_NON_ISO_FIFO_EMPTY)))
 
-static void pio_out_timer(unsigned long _ep)
+static void pio_out_timer(struct timer_list *t)
 {
-	struct omap_ep	*ep = (void *) _ep;
+	struct omap_ep	*ep = from_timer(ep, t, timer);
 	unsigned long	flags;
 	u16		stat_flg;
 
@@ -2542,7 +2542,7 @@ omap_ep_setup(char *name, u8 addr, u8 type,
 		}
 		if (dbuf && addr)
 			epn_rxtx |= UDC_EPN_RX_DB;
-		setup_timer(&ep->timer, pio_out_timer, (unsigned long)ep);
+		timer_setup(&ep->timer, pio_out_timer, 0);
 	}
 	if (addr)
 		epn_rxtx |= UDC_EPN_RX_VALID;
diff --git a/drivers/usb/gadget/udc/pxa25x_udc.c b/drivers/usb/gadget/udc/pxa25x_udc.c
index 8f135d9fa245..0e3f5faa000e 100644
--- a/drivers/usb/gadget/udc/pxa25x_udc.c
+++ b/drivers/usb/gadget/udc/pxa25x_udc.c
@@ -1624,9 +1624,9 @@ static inline void clear_ep_state (struct pxa25x_udc *dev)
 		nuke(&dev->ep[i], -ECONNABORTED);
 }
 
-static void udc_watchdog(unsigned long _dev)
+static void udc_watchdog(struct timer_list *t)
 {
-	struct pxa25x_udc	*dev = (void *)_dev;
+	struct pxa25x_udc	*dev = from_timer(dev, t, timer);
 
 	local_irq_disable();
 	if (dev->ep0state == EP0_STALL
@@ -2413,7 +2413,7 @@ static int pxa25x_udc_probe(struct platform_device *pdev)
 		gpio_direction_output(dev->mach->gpio_pullup, 0);
 	}
 
-	setup_timer(&dev->timer, udc_watchdog, (unsigned long)dev);
+	timer_setup(&dev->timer, udc_watchdog, 0);
 
 	the_controller = dev;
 	platform_set_drvdata(pdev, dev);
diff --git a/drivers/usb/gadget/udc/r8a66597-udc.c b/drivers/usb/gadget/udc/r8a66597-udc.c
index 143122ed3c66..a3ecce62662b 100644
--- a/drivers/usb/gadget/udc/r8a66597-udc.c
+++ b/drivers/usb/gadget/udc/r8a66597-udc.c
@@ -1514,9 +1514,9 @@ static irqreturn_t r8a66597_irq(int irq, void *_r8a66597)
 	return IRQ_HANDLED;
 }
 
-static void r8a66597_timer(unsigned long _r8a66597)
+static void r8a66597_timer(struct timer_list *t)
 {
-	struct r8a66597 *r8a66597 = (struct r8a66597 *)_r8a66597;
+	struct r8a66597 *r8a66597 = from_timer(r8a66597, t, timer);
 	unsigned long flags;
 	u16 tmp;
 
@@ -1874,7 +1874,7 @@ static int r8a66597_probe(struct platform_device *pdev)
 	r8a66597->gadget.max_speed = USB_SPEED_HIGH;
 	r8a66597->gadget.name = udc_name;
 
-	setup_timer(&r8a66597->timer, r8a66597_timer, (unsigned long)r8a66597);
+	timer_setup(&r8a66597->timer, r8a66597_timer, 0);
 	r8a66597->reg = reg;
 
 	if (r8a66597->pdata->on_chip) {
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index 10887e09e9bc..ee9676349333 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -80,7 +80,7 @@ static const char	hcd_name [] = "ohci_hcd";
 
 static void ohci_dump(struct ohci_hcd *ohci);
 static void ohci_stop(struct usb_hcd *hcd);
-static void io_watchdog_func(unsigned long _ohci);
+static void io_watchdog_func(struct timer_list *t);
 
 #include "ohci-hub.c"
 #include "ohci-dbg.c"
@@ -500,8 +500,7 @@ static int ohci_init (struct ohci_hcd *ohci)
 	if (ohci->hcca)
 		return 0;
 
-	setup_timer(&ohci->io_watchdog, io_watchdog_func,
-			(unsigned long) ohci);
+	timer_setup(&ohci->io_watchdog, io_watchdog_func, 0);
 
 	ohci->hcca = dma_alloc_coherent (hcd->self.controller,
 			sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL);
@@ -723,9 +722,9 @@ static int ohci_start(struct usb_hcd *hcd)
  * the unlink list.  As a result, URBs could never be dequeued and
  * endpoints could never be released.
  */
-static void io_watchdog_func(unsigned long _ohci)
+static void io_watchdog_func(struct timer_list *t)
 {
-	struct ohci_hcd	*ohci = (struct ohci_hcd *) _ohci;
+	struct ohci_hcd	*ohci = from_timer(ohci, t, io_watchdog);
 	bool		takeback_all_pending = false;
 	u32		status;
 	u32		head;
diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c
index 0bf7759aae78..c5e6e8d0b5ef 100644
--- a/drivers/usb/host/oxu210hp-hcd.c
+++ b/drivers/usb/host/oxu210hp-hcd.c
@@ -2539,9 +2539,9 @@ static irqreturn_t oxu_irq(struct usb_hcd *hcd)
 	return ret;
 }
 
-static void oxu_watchdog(unsigned long param)
+static void oxu_watchdog(struct timer_list *t)
 {
-	struct oxu_hcd	*oxu = (struct oxu_hcd *) param;
+	struct oxu_hcd	*oxu = from_timer(oxu, t, watchdog);
 	unsigned long flags;
 
 	spin_lock_irqsave(&oxu->lock, flags);
@@ -2577,7 +2577,7 @@ static int oxu_hcd_init(struct usb_hcd *hcd)
 
 	spin_lock_init(&oxu->lock);
 
-	setup_timer(&oxu->watchdog, oxu_watchdog, (unsigned long)oxu);
+	timer_setup(&oxu->watchdog, oxu_watchdog, 0);
 
 	/*
 	 * hw default: 1K periodic list heads, one per frame.
diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c
index f3d9ba420a97..984892dd72f5 100644
--- a/drivers/usb/host/r8a66597-hcd.c
+++ b/drivers/usb/host/r8a66597-hcd.c
@@ -1798,9 +1798,9 @@ static void r8a66597_td_timer(struct timer_list *t)
 	spin_unlock_irqrestore(&r8a66597->lock, flags);
 }
 
-static void r8a66597_timer(unsigned long _r8a66597)
+static void r8a66597_timer(struct timer_list *t)
 {
-	struct r8a66597 *r8a66597 = (struct r8a66597 *)_r8a66597;
+	struct r8a66597 *r8a66597 = from_timer(r8a66597, t, rh_timer);
 	unsigned long flags;
 	int port;
 
@@ -2472,8 +2472,7 @@ static int r8a66597_probe(struct platform_device *pdev)
 		r8a66597->max_root_hub = 2;
 
 	spin_lock_init(&r8a66597->lock);
-	setup_timer(&r8a66597->rh_timer, r8a66597_timer,
-		    (unsigned long)r8a66597);
+	timer_setup(&r8a66597->rh_timer, r8a66597_timer, 0);
 	r8a66597->reg = reg;
 
 	/* make sure no interrupts are pending */
diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c
index 601fb00603cc..fa88a903fa2e 100644
--- a/drivers/usb/host/sl811-hcd.c
+++ b/drivers/usb/host/sl811-hcd.c
@@ -1119,9 +1119,9 @@ sl811h_hub_descriptor (
 }
 
 static void
-sl811h_timer(unsigned long _sl811)
+sl811h_timer(struct timer_list *t)
 {
-	struct sl811 	*sl811 = (void *) _sl811;
+	struct sl811 	*sl811 = from_timer(sl811, t, timer);
 	unsigned long	flags;
 	u8		irqstat;
 	u8		signaling = sl811->ctrl1 & SL11H_CTL1MASK_FORCE;
@@ -1692,7 +1692,7 @@ sl811h_probe(struct platform_device *dev)
 	spin_lock_init(&sl811->lock);
 	INIT_LIST_HEAD(&sl811->async);
 	sl811->board = dev_get_platdata(&dev->dev);
-	setup_timer(&sl811->timer, sl811h_timer, (unsigned long)sl811);
+	timer_setup(&sl811->timer, sl811h_timer, 0);
 	sl811->addr_reg = addr_reg;
 	sl811->data_reg = data_reg;
 
diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c
index babeefd84ffd..f5c90217777a 100644
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@@ -585,8 +585,7 @@ static int uhci_start(struct usb_hcd *hcd)
 		hcd->self.sg_tablesize = ~0;
 
 	spin_lock_init(&uhci->lock);
-	setup_timer(&uhci->fsbr_timer, uhci_fsbr_timeout,
-			(unsigned long) uhci);
+	timer_setup(&uhci->fsbr_timer, uhci_fsbr_timeout, 0);
 	INIT_LIST_HEAD(&uhci->idle_qh_list);
 	init_waitqueue_head(&uhci->waitqh);
 
diff --git a/drivers/usb/host/uhci-q.c b/drivers/usb/host/uhci-q.c
index 49d4edc03cc2..d40438238938 100644
--- a/drivers/usb/host/uhci-q.c
+++ b/drivers/usb/host/uhci-q.c
@@ -90,9 +90,9 @@ static void uhci_urbp_wants_fsbr(struct uhci_hcd *uhci, struct urb_priv *urbp)
 	}
 }
 
-static void uhci_fsbr_timeout(unsigned long _uhci)
+static void uhci_fsbr_timeout(struct timer_list *t)
 {
-	struct uhci_hcd *uhci = (struct uhci_hcd *) _uhci;
+	struct uhci_hcd *uhci = from_timer(uhci, t, fsbr_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&uhci->lock, flags);
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 327ba8b8a98b..2424d3020ca3 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -395,14 +395,14 @@ static inline void xhci_msix_sync_irqs(struct xhci_hcd *xhci)
 
 #endif
 
-static void compliance_mode_recovery(unsigned long arg)
+static void compliance_mode_recovery(struct timer_list *t)
 {
 	struct xhci_hcd *xhci;
 	struct usb_hcd *hcd;
 	u32 temp;
 	int i;
 
-	xhci = (struct xhci_hcd *)arg;
+	xhci = from_timer(xhci, t, comp_mode_recovery_timer);
 
 	for (i = 0; i < xhci->num_usb3_ports; i++) {
 		temp = readl(xhci->usb3_ports[i]);
@@ -443,8 +443,8 @@ static void compliance_mode_recovery(unsigned long arg)
 static void compliance_mode_recovery_timer_init(struct xhci_hcd *xhci)
 {
 	xhci->port_status_u0 = 0;
-	setup_timer(&xhci->comp_mode_recovery_timer,
-		    compliance_mode_recovery, (unsigned long)xhci);
+	timer_setup(&xhci->comp_mode_recovery_timer, compliance_mode_recovery,
+		    0);
 	xhci->comp_mode_recovery_timer.expires = jiffies +
 			msecs_to_jiffies(COMP_MODE_RCVRY_MSECS);
 
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index a859c2d33c29..fdceb46d9fc6 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -555,9 +555,9 @@ static void mos7840_set_led_sync(struct usb_serial_port *port, __u16 reg,
 			val, reg, NULL, 0, MOS_WDR_TIMEOUT);
 }
 
-static void mos7840_led_off(unsigned long arg)
+static void mos7840_led_off(struct timer_list *t)
 {
-	struct moschip_port *mcs = (struct moschip_port *) arg;
+	struct moschip_port *mcs = from_timer(mcs, t, led_timer1);
 
 	/* Turn off LED */
 	mos7840_set_led_async(mcs, 0x0300, MODEM_CONTROL_REGISTER);
@@ -565,9 +565,9 @@ static void mos7840_led_off(unsigned long arg)
 				jiffies + msecs_to_jiffies(LED_OFF_MS));
 }
 
-static void mos7840_led_flag_off(unsigned long arg)
+static void mos7840_led_flag_off(struct timer_list *t)
 {
-	struct moschip_port *mcs = (struct moschip_port *) arg;
+	struct moschip_port *mcs = from_timer(mcs, t, led_timer2);
 
 	clear_bit_unlock(MOS7840_FLAG_LED_BUSY, &mcs->flags);
 }
@@ -2289,12 +2289,11 @@ static int mos7840_port_probe(struct usb_serial_port *port)
 			goto error;
 		}
 
-		setup_timer(&mos7840_port->led_timer1, mos7840_led_off,
-			    (unsigned long)mos7840_port);
+		timer_setup(&mos7840_port->led_timer1, mos7840_led_off, 0);
 		mos7840_port->led_timer1.expires =
 			jiffies + msecs_to_jiffies(LED_ON_MS);
-		setup_timer(&mos7840_port->led_timer2, mos7840_led_flag_off,
-			    (unsigned long)mos7840_port);
+		timer_setup(&mos7840_port->led_timer2, mos7840_led_flag_off,
+			    0);
 		mos7840_port->led_timer2.expires =
 			jiffies + msecs_to_jiffies(LED_OFF_MS);
 
diff --git a/drivers/usb/storage/realtek_cr.c b/drivers/usb/storage/realtek_cr.c
index 48e2e32c97e8..31b024441938 100644
--- a/drivers/usb/storage/realtek_cr.c
+++ b/drivers/usb/storage/realtek_cr.c
@@ -751,9 +751,9 @@ static void rts51x_modi_suspend_timer(struct rts51x_chip *chip)
 	mod_timer(&chip->rts51x_suspend_timer, chip->timer_expires);
 }
 
-static void rts51x_suspend_timer_fn(unsigned long data)
+static void rts51x_suspend_timer_fn(struct timer_list *t)
 {
-	struct rts51x_chip *chip = (struct rts51x_chip *)data;
+	struct rts51x_chip *chip = from_timer(chip, t, rts51x_suspend_timer);
 	struct us_data *us = chip->us;
 
 	switch (rts51x_get_stat(chip)) {
@@ -917,8 +917,7 @@ static int realtek_cr_autosuspend_setup(struct us_data *us)
 	us->proto_handler = rts51x_invoke_transport;
 
 	chip->timer_expires = 0;
-	setup_timer(&chip->rts51x_suspend_timer, rts51x_suspend_timer_fn,
-			(unsigned long)chip);
+	timer_setup(&chip->rts51x_suspend_timer, rts51x_suspend_timer_fn, 0);
 	fw5895_init(us);
 
 	/* enable autosuspend function of the usb device */
diff --git a/drivers/uwb/drp.c b/drivers/uwb/drp.c
index 38d0504a1bbc..625f706b8160 100644
--- a/drivers/uwb/drp.c
+++ b/drivers/uwb/drp.c
@@ -603,9 +603,9 @@ static void uwb_cnflt_update_work(struct work_struct *work)
 	mutex_unlock(&rc->rsvs_mutex);
 }
 
-static void uwb_cnflt_timer(unsigned long arg)
+static void uwb_cnflt_timer(struct timer_list *t)
 {
-	struct uwb_cnflt_alien *cnflt = (struct uwb_cnflt_alien *)arg;
+	struct uwb_cnflt_alien *cnflt = from_timer(cnflt, t, timer);
 
 	queue_work(cnflt->rc->rsv_workq, &cnflt->cnflt_update_work);
 }
@@ -642,7 +642,7 @@ static void uwb_drp_handle_alien_drp(struct uwb_rc *rc, struct uwb_ie_drp *drp_i
 	}
 
 	INIT_LIST_HEAD(&cnflt->rc_node);
-	setup_timer(&cnflt->timer, uwb_cnflt_timer, (unsigned long)cnflt);
+	timer_setup(&cnflt->timer, uwb_cnflt_timer, 0);
 
 	cnflt->rc = rc;
 	INIT_WORK(&cnflt->cnflt_update_work, uwb_cnflt_update_work);
diff --git a/drivers/uwb/neh.c b/drivers/uwb/neh.c
index 36b5cb62c15d..fbdca728bd9f 100644
--- a/drivers/uwb/neh.c
+++ b/drivers/uwb/neh.c
@@ -115,7 +115,7 @@ struct uwb_rc_neh {
 	struct list_head list_node;
 };
 
-static void uwb_rc_neh_timer(unsigned long arg);
+static void uwb_rc_neh_timer(struct timer_list *t);
 
 static void uwb_rc_neh_release(struct kref *kref)
 {
@@ -223,7 +223,7 @@ struct uwb_rc_neh *uwb_rc_neh_add(struct uwb_rc *rc, struct uwb_rccb *cmd,
 
 	kref_init(&neh->kref);
 	INIT_LIST_HEAD(&neh->list_node);
-	setup_timer(&neh->timer, uwb_rc_neh_timer, (unsigned long)neh);
+	timer_setup(&neh->timer, uwb_rc_neh_timer, 0);
 
 	neh->rc = rc;
 	neh->evt_type = expected_type;
@@ -565,9 +565,9 @@ void uwb_rc_neh_error(struct uwb_rc *rc, int error)
 EXPORT_SYMBOL_GPL(uwb_rc_neh_error);
 
 
-static void uwb_rc_neh_timer(unsigned long arg)
+static void uwb_rc_neh_timer(struct timer_list *t)
 {
-	struct uwb_rc_neh *neh = (struct uwb_rc_neh *)arg;
+	struct uwb_rc_neh *neh = from_timer(neh, t, timer);
 	struct uwb_rc *rc = neh->rc;
 	unsigned long flags;
 
diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c
index f5e27247a38f..fe25a8cc6fa1 100644
--- a/drivers/uwb/rsv.c
+++ b/drivers/uwb/rsv.c
@@ -23,7 +23,7 @@
 
 #include "uwb-internal.h"
 
-static void uwb_rsv_timer(unsigned long arg);
+static void uwb_rsv_timer(struct timer_list *t);
 
 static const char *rsv_states[] = {
 	[UWB_RSV_STATE_NONE]                 = "none            ",
@@ -198,9 +198,9 @@ static void uwb_rsv_put_stream(struct uwb_rsv *rsv)
 	dev_dbg(dev, "put stream %d\n", rsv->stream);
 }
 
-void uwb_rsv_backoff_win_timer(unsigned long arg)
+void uwb_rsv_backoff_win_timer(struct timer_list *t)
 {
-	struct uwb_drp_backoff_win *bow = (struct uwb_drp_backoff_win *)arg;
+	struct uwb_drp_backoff_win *bow = from_timer(bow, t, timer);
 	struct uwb_rc *rc = container_of(bow, struct uwb_rc, bow);
 	struct device *dev = &rc->uwb_dev.dev;
 
@@ -470,7 +470,7 @@ static struct uwb_rsv *uwb_rsv_alloc(struct uwb_rc *rc)
 	INIT_LIST_HEAD(&rsv->rc_node);
 	INIT_LIST_HEAD(&rsv->pal_node);
 	kref_init(&rsv->kref);
-	setup_timer(&rsv->timer, uwb_rsv_timer, (unsigned long)rsv);
+	timer_setup(&rsv->timer, uwb_rsv_timer, 0);
 
 	rsv->rc = rc;
 	INIT_WORK(&rsv->handle_timeout_work, uwb_rsv_handle_timeout_work);
@@ -939,9 +939,9 @@ static void uwb_rsv_alien_bp_work(struct work_struct *work)
 	mutex_unlock(&rc->rsvs_mutex);
 }
 
-static void uwb_rsv_timer(unsigned long arg)
+static void uwb_rsv_timer(struct timer_list *t)
 {
-	struct uwb_rsv *rsv = (struct uwb_rsv *)arg;
+	struct uwb_rsv *rsv = from_timer(rsv, t, timer);
 
 	queue_work(rsv->rc->rsv_workq, &rsv->handle_timeout_work);
 }
@@ -987,8 +987,7 @@ void uwb_rsv_init(struct uwb_rc *rc)
 	rc->bow.can_reserve_extra_mases = true;
 	rc->bow.total_expired = 0;
 	rc->bow.window = UWB_DRP_BACKOFF_WIN_MIN >> 1;
-	setup_timer(&rc->bow.timer, uwb_rsv_backoff_win_timer,
-			(unsigned long)&rc->bow);
+	timer_setup(&rc->bow.timer, uwb_rsv_backoff_win_timer, 0);
 
 	bitmap_complement(rc->uwb_dev.streams, rc->uwb_dev.streams, UWB_NUM_STREAMS);
 }
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index 353c0555a1f5..91326ce093a7 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -329,7 +329,7 @@ void uwb_rsv_put(struct uwb_rsv *rsv);
 bool uwb_rsv_has_two_drp_ies(struct uwb_rsv *rsv);
 void uwb_rsv_dump(char *text, struct uwb_rsv *rsv);
 int uwb_rsv_try_move(struct uwb_rsv *rsv, struct uwb_mas_bm *available);
-void uwb_rsv_backoff_win_timer(unsigned long arg);
+void uwb_rsv_backoff_win_timer(struct timer_list *t);
 void uwb_rsv_backoff_win_increment(struct uwb_rc *rc);
 int uwb_rsv_status(struct uwb_rsv *rsv);
 int uwb_rsv_companion_status(struct uwb_rsv *rsv);
diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index 7e6acaf3ece4..88c05d0448b2 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -120,9 +120,9 @@ static inline void at91_wdt_reset(struct at91wdt *wdt)
 /*
  * Timer tick
  */
-static void at91_ping(unsigned long data)
+static void at91_ping(struct timer_list *t)
 {
-	struct at91wdt *wdt = (struct at91wdt *)data;
+	struct at91wdt *wdt = from_timer(wdt, t, timer);
 	if (time_before(jiffies, wdt->next_heartbeat) ||
 	    !watchdog_active(&wdt->wdd)) {
 		at91_wdt_reset(wdt);
@@ -222,7 +222,7 @@ static int at91_wdt_init(struct platform_device *pdev, struct at91wdt *wdt)
 			 "watchdog already configured differently (mr = %x expecting %x)\n",
 			 tmp & wdt->mr_mask, wdt->mr & wdt->mr_mask);
 
-	setup_timer(&wdt->timer, at91_ping, (unsigned long)wdt);
+	timer_setup(&wdt->timer, at91_ping, 0);
 
 	/*
 	 * Use min_heartbeat the first time to avoid spurious watchdog reset:
diff --git a/drivers/watchdog/bcm47xx_wdt.c b/drivers/watchdog/bcm47xx_wdt.c
index 236582809336..f41b756d6dd5 100644
--- a/drivers/watchdog/bcm47xx_wdt.c
+++ b/drivers/watchdog/bcm47xx_wdt.c
@@ -106,9 +106,9 @@ static const struct watchdog_ops bcm47xx_wdt_hard_ops = {
 	.restart        = bcm47xx_wdt_restart,
 };
 
-static void bcm47xx_wdt_soft_timer_tick(unsigned long data)
+static void bcm47xx_wdt_soft_timer_tick(struct timer_list *t)
 {
-	struct bcm47xx_wdt *wdt = (struct bcm47xx_wdt *)data;
+	struct bcm47xx_wdt *wdt = from_timer(wdt, t, soft_timer);
 	u32 next_tick = min(wdt->wdd.timeout * 1000, wdt->max_timer_ms);
 
 	if (!atomic_dec_and_test(&wdt->soft_ticks)) {
@@ -133,7 +133,7 @@ static int bcm47xx_wdt_soft_start(struct watchdog_device *wdd)
 	struct bcm47xx_wdt *wdt = bcm47xx_wdt_get(wdd);
 
 	bcm47xx_wdt_soft_keepalive(wdd);
-	bcm47xx_wdt_soft_timer_tick((unsigned long)wdt);
+	bcm47xx_wdt_soft_timer_tick(&wdt->soft_timer);
 
 	return 0;
 }
@@ -190,8 +190,7 @@ static int bcm47xx_wdt_probe(struct platform_device *pdev)
 
 	if (soft) {
 		wdt->wdd.ops = &bcm47xx_wdt_soft_ops;
-		setup_timer(&wdt->soft_timer, bcm47xx_wdt_soft_timer_tick,
-			    (long unsigned int)wdt);
+		timer_setup(&wdt->soft_timer, bcm47xx_wdt_soft_timer_tick, 0);
 	} else {
 		wdt->wdd.ops = &bcm47xx_wdt_hard_ops;
 	}
diff --git a/drivers/watchdog/bcm63xx_wdt.c b/drivers/watchdog/bcm63xx_wdt.c
index ab26fd90729e..8555afc70f9b 100644
--- a/drivers/watchdog/bcm63xx_wdt.c
+++ b/drivers/watchdog/bcm63xx_wdt.c
@@ -77,7 +77,7 @@ static void bcm63xx_wdt_isr(void *data)
 	die(PFX " fire", regs);
 }
 
-static void bcm63xx_timer_tick(unsigned long unused)
+static void bcm63xx_timer_tick(struct timer_list *unused)
 {
 	if (!atomic_dec_and_test(&bcm63xx_wdt_device.ticks)) {
 		bcm63xx_wdt_hw_start();
@@ -240,7 +240,7 @@ static int bcm63xx_wdt_probe(struct platform_device *pdev)
 	int ret;
 	struct resource *r;
 
-	setup_timer(&bcm63xx_wdt_device.timer, bcm63xx_timer_tick, 0L);
+	timer_setup(&bcm63xx_wdt_device.timer, bcm63xx_timer_tick, 0);
 
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!r) {
diff --git a/drivers/watchdog/cpu5wdt.c b/drivers/watchdog/cpu5wdt.c
index 6c3f78e45c26..6cfb102c397c 100644
--- a/drivers/watchdog/cpu5wdt.c
+++ b/drivers/watchdog/cpu5wdt.c
@@ -69,7 +69,7 @@ static struct {
 
 /* generic helper functions */
 
-static void cpu5wdt_trigger(unsigned long unused)
+static void cpu5wdt_trigger(struct timer_list *unused)
 {
 	if (verbose > 2)
 		pr_debug("trigger at %i ticks\n", ticks);
@@ -224,7 +224,7 @@ static int cpu5wdt_init(void)
 
 	init_completion(&cpu5wdt_device.stop);
 	cpu5wdt_device.queue = 0;
-	setup_timer(&cpu5wdt_device.timer, cpu5wdt_trigger, 0);
+	timer_setup(&cpu5wdt_device.timer, cpu5wdt_trigger, 0);
 	cpu5wdt_device.default_ticks = ticks;
 
 	if (!request_region(port, CPU5WDT_EXTENT, PFX)) {
diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
index 366e5c7e650b..6610e9217dbc 100644
--- a/drivers/watchdog/mpc8xxx_wdt.c
+++ b/drivers/watchdog/mpc8xxx_wdt.c
@@ -80,9 +80,9 @@ static void mpc8xxx_wdt_keepalive(struct mpc8xxx_wdt_ddata *ddata)
 	spin_unlock(&ddata->lock);
 }
 
-static void mpc8xxx_wdt_timer_ping(unsigned long arg)
+static void mpc8xxx_wdt_timer_ping(struct timer_list *t)
 {
-	struct mpc8xxx_wdt_ddata *ddata = (void *)arg;
+	struct mpc8xxx_wdt_ddata *ddata = from_timer(ddata, t, timer);
 
 	mpc8xxx_wdt_keepalive(ddata);
 	/* We're pinging it twice faster than needed, just to be sure. */
@@ -173,8 +173,7 @@ static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
 	}
 
 	spin_lock_init(&ddata->lock);
-	setup_timer(&ddata->timer, mpc8xxx_wdt_timer_ping,
-		    (unsigned long)ddata);
+	timer_setup(&ddata->timer, mpc8xxx_wdt_timer_ping, 0);
 
 	ddata->wdd.info = &mpc8xxx_wdt_info,
 	ddata->wdd.ops = &mpc8xxx_wdt_ops,
diff --git a/drivers/watchdog/mtx-1_wdt.c b/drivers/watchdog/mtx-1_wdt.c
index ff27c4ac96e4..ca360d204548 100644
--- a/drivers/watchdog/mtx-1_wdt.c
+++ b/drivers/watchdog/mtx-1_wdt.c
@@ -68,7 +68,7 @@ static struct {
 	unsigned int gstate;
 } mtx1_wdt_device;
 
-static void mtx1_wdt_trigger(unsigned long unused)
+static void mtx1_wdt_trigger(struct timer_list *unused)
 {
 	spin_lock(&mtx1_wdt_device.lock);
 	if (mtx1_wdt_device.running)
@@ -219,7 +219,7 @@ static int mtx1_wdt_probe(struct platform_device *pdev)
 	init_completion(&mtx1_wdt_device.stop);
 	mtx1_wdt_device.queue = 0;
 	clear_bit(0, &mtx1_wdt_device.inuse);
-	setup_timer(&mtx1_wdt_device.timer, mtx1_wdt_trigger, 0L);
+	timer_setup(&mtx1_wdt_device.timer, mtx1_wdt_trigger, 0);
 	mtx1_wdt_device.default_ticks = ticks;
 
 	ret = misc_register(&mtx1_wdt_misc);
diff --git a/drivers/watchdog/nuc900_wdt.c b/drivers/watchdog/nuc900_wdt.c
index d5bed78c4d9f..830bd04ff911 100644
--- a/drivers/watchdog/nuc900_wdt.c
+++ b/drivers/watchdog/nuc900_wdt.c
@@ -216,7 +216,7 @@ static ssize_t nuc900_wdt_write(struct file *file, const char __user *data,
 	return len;
 }
 
-static void nuc900_wdt_timer_ping(unsigned long data)
+static void nuc900_wdt_timer_ping(struct timer_list *unused)
 {
 	if (time_before(jiffies, nuc900_wdt->next_heartbeat)) {
 		nuc900_wdt_keepalive();
@@ -267,7 +267,7 @@ static int nuc900wdt_probe(struct platform_device *pdev)
 
 	clk_enable(nuc900_wdt->wdt_clock);
 
-	setup_timer(&nuc900_wdt->timer, nuc900_wdt_timer_ping, 0);
+	timer_setup(&nuc900_wdt->timer, nuc900_wdt_timer_ping, 0);
 
 	ret = misc_register(&nuc900wdt_miscdev);
 	if (ret) {
diff --git a/drivers/watchdog/pcwd.c b/drivers/watchdog/pcwd.c
index 3ad5206d7935..b72ce68eacd3 100644
--- a/drivers/watchdog/pcwd.c
+++ b/drivers/watchdog/pcwd.c
@@ -367,7 +367,7 @@ static void pcwd_show_card_info(void)
 		pr_info("No previous trip detected - Cold boot or reset\n");
 }
 
-static void pcwd_timer_ping(unsigned long data)
+static void pcwd_timer_ping(struct timer_list *unused)
 {
 	int wdrst_stat;
 
@@ -893,7 +893,7 @@ static int pcwd_isa_probe(struct device *dev, unsigned int id)
 	/* clear the "card caused reboot" flag */
 	pcwd_clear_status();
 
-	setup_timer(&pcwd_private.timer, pcwd_timer_ping, 0);
+	timer_setup(&pcwd_private.timer, pcwd_timer_ping, 0);
 
 	/*  Disable the board  */
 	pcwd_stop();
diff --git a/drivers/watchdog/pika_wdt.c b/drivers/watchdog/pika_wdt.c
index e35cf5e87907..e0a6f8c0f03c 100644
--- a/drivers/watchdog/pika_wdt.c
+++ b/drivers/watchdog/pika_wdt.c
@@ -85,7 +85,7 @@ static inline void pikawdt_reset(void)
 /*
  * Timer tick
  */
-static void pikawdt_ping(unsigned long data)
+static void pikawdt_ping(struct timer_list *unused)
 {
 	if (time_before(jiffies, pikawdt_private.next_heartbeat) ||
 			(!nowayout && !pikawdt_private.open)) {
@@ -269,7 +269,7 @@ static int __init pikawdt_init(void)
 
 	iounmap(fpga);
 
-	setup_timer(&pikawdt_private.timer, pikawdt_ping, 0);
+	timer_setup(&pikawdt_private.timer, pikawdt_ping, 0);
 
 	ret = misc_register(&pikawdt_miscdev);
 	if (ret) {
diff --git a/drivers/watchdog/rdc321x_wdt.c b/drivers/watchdog/rdc321x_wdt.c
index 47a8f1b1087d..a281aa84bfb1 100644
--- a/drivers/watchdog/rdc321x_wdt.c
+++ b/drivers/watchdog/rdc321x_wdt.c
@@ -67,7 +67,7 @@ static struct {
 
 /* generic helper functions */
 
-static void rdc321x_wdt_trigger(unsigned long unused)
+static void rdc321x_wdt_trigger(struct timer_list *unused)
 {
 	unsigned long flags;
 	u32 val;
@@ -262,7 +262,7 @@ static int rdc321x_wdt_probe(struct platform_device *pdev)
 
 	clear_bit(0, &rdc321x_wdt_device.inuse);
 
-	setup_timer(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0);
+	timer_setup(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0);
 
 	rdc321x_wdt_device.default_ticks = ticks;
 
diff --git a/drivers/watchdog/shwdt.c b/drivers/watchdog/shwdt.c
index 517a733175ef..a7d6425db807 100644
--- a/drivers/watchdog/shwdt.c
+++ b/drivers/watchdog/shwdt.c
@@ -175,9 +175,9 @@ static int sh_wdt_set_heartbeat(struct watchdog_device *wdt_dev, unsigned t)
 	return 0;
 }
 
-static void sh_wdt_ping(unsigned long data)
+static void sh_wdt_ping(struct timer_list *t)
 {
-	struct sh_wdt *wdt = (struct sh_wdt *)data;
+	struct sh_wdt *wdt = from_timer(wdt, t, timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&wdt->lock, flags);
@@ -275,7 +275,7 @@ static int sh_wdt_probe(struct platform_device *pdev)
 		return rc;
 	}
 
-	setup_timer(&wdt->timer, sh_wdt_ping, (unsigned long)wdt);
+	timer_setup(&wdt->timer, sh_wdt_ping, 0);
 	wdt->timer.expires	= next_ping_period(clock_division_ratio);
 
 	dev_info(&pdev->dev, "initialized.\n");
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 8d779227370a..bebe59feca58 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -140,7 +140,7 @@ static void o2net_rx_until_empty(struct work_struct *work);
 static void o2net_shutdown_sc(struct work_struct *work);
 static void o2net_listen_data_ready(struct sock *sk);
 static void o2net_sc_send_keep_req(struct work_struct *work);
-static void o2net_idle_timer(unsigned long data);
+static void o2net_idle_timer(struct timer_list *t);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
 
@@ -450,8 +450,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 	INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
 	INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
 
-	setup_timer(&sc->sc_idle_timeout, o2net_idle_timer,
-		    (unsigned long)sc);
+	timer_setup(&sc->sc_idle_timeout, o2net_idle_timer, 0);
 
 	sclog(sc, "alloced\n");
 
@@ -1517,9 +1516,9 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
 /* socket shutdown does a del_timer_sync against this as it tears down.
  * we can't start this timer until we've got to the point in sc buildup
  * where shutdown is going to be involved */
-static void o2net_idle_timer(unsigned long data)
+static void o2net_idle_timer(struct timer_list *t)
 {
-	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+	struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout);
 	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
 #ifdef CONFIG_DEBUG_FS
 	unsigned long msecs = ktime_to_ms(ktime_get()) -
diff --git a/kernel/padata.c b/kernel/padata.c
index f262c9a4e70a..57c0074d50cc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -288,9 +288,9 @@ static void invoke_padata_reorder(struct work_struct *work)
 	local_bh_enable();
 }
 
-static void padata_reorder_timer(unsigned long arg)
+static void padata_reorder_timer(struct timer_list *t)
 {
-	struct parallel_data *pd = (struct parallel_data *)arg;
+	struct parallel_data *pd = from_timer(pd, t, timer);
 	unsigned int weight;
 	int target_cpu, cpu;
 
@@ -485,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 
 	padata_init_pqueues(pd);
 	padata_init_squeues(pd);
-	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
+	timer_setup(&pd->timer, padata_reorder_timer, 0);
 	atomic_set(&pd->seq_nr, -1);
 	atomic_set(&pd->reorder_objects, 0);
 	atomic_set(&pd->refcnt, 0);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5b51d5ba2a85..65f9e3f24dde 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -171,7 +171,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-static void clocksource_watchdog(unsigned long data)
+static void clocksource_watchdog(struct timer_list *unused)
 {
 	struct clocksource *cs;
 	u64 csnow, wdnow, cslast, wdlast, delta;
@@ -290,7 +290,7 @@ static inline void clocksource_start_watchdog(void)
 {
 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
 		return;
-	setup_timer(&watchdog_timer, clocksource_watchdog, 0UL);
+	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
 	watchdog_running = 1;
diff --git a/net/802/garp.c b/net/802/garp.c
index 2dac647ff420..7f50d47470bd 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -401,9 +401,9 @@ static void garp_join_timer_arm(struct garp_applicant *app)
 	mod_timer(&app->join_timer, jiffies + delay);
 }
 
-static void garp_join_timer(unsigned long data)
+static void garp_join_timer(struct timer_list *t)
 {
-	struct garp_applicant *app = (struct garp_applicant *)data;
+	struct garp_applicant *app = from_timer(app, t, join_timer);
 
 	spin_lock(&app->lock);
 	garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
@@ -584,7 +584,7 @@ int garp_init_applicant(struct net_device *dev, struct garp_application *appl)
 	spin_lock_init(&app->lock);
 	skb_queue_head_init(&app->queue);
 	rcu_assign_pointer(dev->garp_port->applicants[appl->type], app);
-	setup_timer(&app->join_timer, garp_join_timer, (unsigned long)app);
+	timer_setup(&app->join_timer, garp_join_timer, 0);
 	garp_join_timer_arm(app);
 	return 0;
 
diff --git a/net/802/mrp.c b/net/802/mrp.c
index be4dd3165347..a808dd5bbb27 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -586,9 +586,9 @@ static void mrp_join_timer_arm(struct mrp_applicant *app)
 	mod_timer(&app->join_timer, jiffies + delay);
 }
 
-static void mrp_join_timer(unsigned long data)
+static void mrp_join_timer(struct timer_list *t)
 {
-	struct mrp_applicant *app = (struct mrp_applicant *)data;
+	struct mrp_applicant *app = from_timer(app, t, join_timer);
 
 	spin_lock(&app->lock);
 	mrp_mad_event(app, MRP_EVENT_TX);
@@ -605,9 +605,9 @@ static void mrp_periodic_timer_arm(struct mrp_applicant *app)
 		  jiffies + msecs_to_jiffies(mrp_periodic_time));
 }
 
-static void mrp_periodic_timer(unsigned long data)
+static void mrp_periodic_timer(struct timer_list *t)
 {
-	struct mrp_applicant *app = (struct mrp_applicant *)data;
+	struct mrp_applicant *app = from_timer(app, t, periodic_timer);
 
 	spin_lock(&app->lock);
 	mrp_mad_event(app, MRP_EVENT_PERIODIC);
@@ -865,10 +865,9 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl)
 	spin_lock_init(&app->lock);
 	skb_queue_head_init(&app->queue);
 	rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app);
-	setup_timer(&app->join_timer, mrp_join_timer, (unsigned long)app);
+	timer_setup(&app->join_timer, mrp_join_timer, 0);
 	mrp_join_timer_arm(app);
-	setup_timer(&app->periodic_timer, mrp_periodic_timer,
-		    (unsigned long)app);
+	timer_setup(&app->periodic_timer, mrp_periodic_timer, 0);
 	mrp_periodic_timer_arm(app);
 	return 0;
 
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 8ad3ec2610b6..309d7dbb36e8 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -310,7 +310,7 @@ static void __aarp_expire_device(struct aarp_entry **n, struct net_device *dev)
 }
 
 /* Handle the timer event */
-static void aarp_expire_timeout(unsigned long unused)
+static void aarp_expire_timeout(struct timer_list *unused)
 {
 	int ct;
 
@@ -884,7 +884,7 @@ void __init aarp_proto_init(void)
 	aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv);
 	if (!aarp_dl)
 		printk(KERN_CRIT "Unable to register AARP with SNAP.\n");
-	setup_timer(&aarp_timer, aarp_expire_timeout, 0);
+	timer_setup(&aarp_timer, aarp_expire_timeout, 0);
 	aarp_timer.expires  = jiffies + sysctl_aarp_expiry_time;
 	add_timer(&aarp_timer);
 	register_netdevice_notifier(&aarp_notifier);
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 5d035c1f1156..03a9fc0771c0 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -158,9 +158,9 @@ found:
 	return s;
 }
 
-static void atalk_destroy_timer(unsigned long data)
+static void atalk_destroy_timer(struct timer_list *t)
 {
-	struct sock *sk = (struct sock *)data;
+	struct sock *sk = from_timer(sk, t, sk_timer);
 
 	if (sk_has_allocations(sk)) {
 		sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
@@ -175,8 +175,7 @@ static inline void atalk_destroy_socket(struct sock *sk)
 	skb_queue_purge(&sk->sk_receive_queue);
 
 	if (sk_has_allocations(sk)) {
-		setup_timer(&sk->sk_timer, atalk_destroy_timer,
-				(unsigned long)sk);
+		timer_setup(&sk->sk_timer, atalk_destroy_timer, 0);
 		sk->sk_timer.expires	= jiffies + SOCK_DESTROY_TIME;
 		add_timer(&sk->sk_timer);
 	} else
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 4b90033f35a8..15cd2139381e 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -488,9 +488,9 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars)
  * Switch to Slow Start, set the ss_threshold to half of the current cwnd and
  * reset the cwnd to 3*MSS
  */
-static void batadv_tp_sender_timeout(unsigned long arg)
+static void batadv_tp_sender_timeout(struct timer_list *t)
 {
-	struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg;
+	struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer);
 	struct batadv_priv *bat_priv = tp_vars->bat_priv;
 
 	if (atomic_read(&tp_vars->sending) == 0)
@@ -1020,8 +1020,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
 	atomic64_set(&tp_vars->tot_sent, 0);
 
 	kref_get(&tp_vars->refcount);
-	setup_timer(&tp_vars->timer, batadv_tp_sender_timeout,
-		    (unsigned long)tp_vars);
+	timer_setup(&tp_vars->timer, batadv_tp_sender_timeout, 0);
 
 	tp_vars->bat_priv = bat_priv;
 	tp_vars->start_time = jiffies;
@@ -1109,9 +1108,9 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars)
  *  reached without received ack
  * @arg: address of the related tp_vars
  */
-static void batadv_tp_receiver_shutdown(unsigned long arg)
+static void batadv_tp_receiver_shutdown(struct timer_list *t)
 {
-	struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg;
+	struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer);
 	struct batadv_tp_unacked *un, *safe;
 	struct batadv_priv *bat_priv;
 
@@ -1373,8 +1372,7 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv,
 	hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list);
 
 	kref_get(&tp_vars->refcount);
-	setup_timer(&tp_vars->timer, batadv_tp_receiver_shutdown,
-		    (unsigned long)tp_vars);
+	timer_setup(&tp_vars->timer, batadv_tp_receiver_shutdown, 0);
 
 	batadv_tp_reset_receiver_timer(tp_vars);
 
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 8112893037bd..f2cec70d520c 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -398,9 +398,9 @@ static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum,
 	}
 }
 
-static void hidp_idle_timeout(unsigned long arg)
+static void hidp_idle_timeout(struct timer_list *t)
 {
-	struct hidp_session *session = (struct hidp_session *) arg;
+	struct hidp_session *session = from_timer(session, t, timer);
 
 	/* The HIDP user-space API only contains calls to add and remove
 	 * devices. There is no way to forward events of any kind. Therefore,
@@ -944,8 +944,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr,
 
 	/* device management */
 	INIT_WORK(&session->dev_init, hidp_session_dev_work);
-	setup_timer(&session->timer, hidp_idle_timeout,
-		    (unsigned long)session);
+	timer_setup(&session->timer, hidp_idle_timeout, 0);
 
 	/* session data */
 	mutex_init(&session->report_mutex);
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 4a0b41d75c84..b98225d65e87 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -233,9 +233,9 @@ static int rfcomm_check_security(struct rfcomm_dlc *d)
 				 d->out);
 }
 
-static void rfcomm_session_timeout(unsigned long arg)
+static void rfcomm_session_timeout(struct timer_list *t)
 {
-	struct rfcomm_session *s = (void *) arg;
+	struct rfcomm_session *s = from_timer(s, t, timer);
 
 	BT_DBG("session %p state %ld", s, s->state);
 
@@ -258,9 +258,9 @@ static void rfcomm_session_clear_timer(struct rfcomm_session *s)
 }
 
 /* ---- RFCOMM DLCs ---- */
-static void rfcomm_dlc_timeout(unsigned long arg)
+static void rfcomm_dlc_timeout(struct timer_list *t)
 {
-	struct rfcomm_dlc *d = (void *) arg;
+	struct rfcomm_dlc *d = from_timer(d, t, timer);
 
 	BT_DBG("dlc %p state %ld", d, d->state);
 
@@ -307,7 +307,7 @@ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio)
 	if (!d)
 		return NULL;
 
-	setup_timer(&d->timer, rfcomm_dlc_timeout, (unsigned long)d);
+	timer_setup(&d->timer, rfcomm_dlc_timeout, 0);
 
 	skb_queue_head_init(&d->tx_queue);
 	mutex_init(&d->lock);
@@ -650,7 +650,7 @@ static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state)
 
 	BT_DBG("session %p sock %p", s, sock);
 
-	setup_timer(&s->timer, rfcomm_session_timeout, (unsigned long) s);
+	timer_setup(&s->timer, rfcomm_session_timeout, 0);
 
 	INIT_LIST_HEAD(&s->dlcs);
 	s->state = state;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 795e920a3281..08df57665e1f 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -73,9 +73,9 @@ struct sco_pinfo {
 #define SCO_CONN_TIMEOUT	(HZ * 40)
 #define SCO_DISCONN_TIMEOUT	(HZ * 2)
 
-static void sco_sock_timeout(unsigned long arg)
+static void sco_sock_timeout(struct timer_list *t)
 {
-	struct sock *sk = (struct sock *)arg;
+	struct sock *sk = from_timer(sk, t, sk_timer);
 
 	BT_DBG("sock %p state %d", sk, sk->sk_state);
 
@@ -487,7 +487,7 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock,
 
 	sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT;
 
-	setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk);
+	timer_setup(&sk->sk_timer, sco_sock_timeout, 0);
 
 	bt_sock_link(&sco_sk_list, sk);
 	return sk;
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 70ccda233bd1..c7785efeea57 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -144,9 +144,9 @@ static void send_dm_alert(struct work_struct *work)
  * in the event that more drops will arrive during the
  * hysteresis period.
  */
-static void sched_send_work(unsigned long _data)
+static void sched_send_work(struct timer_list *t)
 {
-	struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
+	struct per_cpu_dm_data *data = from_timer(data, t, send_timer);
 
 	schedule_work(&data->dm_alert_work);
 }
@@ -412,8 +412,7 @@ static int __init init_net_drop_monitor(void)
 	for_each_possible_cpu(cpu) {
 		data = &per_cpu(dm_cpu_data, cpu);
 		INIT_WORK(&data->dm_alert_work, send_dm_alert);
-		setup_timer(&data->send_timer, sched_send_work,
-			    (unsigned long)data);
+		timer_setup(&data->send_timer, sched_send_work, 0);
 		spin_lock_init(&data->lock);
 		reset_per_cpu_data(data);
 	}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 7c1ffd6f9501..9834cfa21b21 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -76,9 +76,9 @@ static void est_fetch_counters(struct net_rate_estimator *e,
 
 }
 
-static void est_timer(unsigned long arg)
+static void est_timer(struct timer_list *t)
 {
-	struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+	struct net_rate_estimator *est = from_timer(est, t, timer);
 	struct gnet_stats_basic_packed b;
 	u64 rate, brate;
 
@@ -170,7 +170,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	}
 
 	est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
-	setup_timer(&est->timer, est_timer, (unsigned long)est);
+	timer_setup(&est->timer, est_timer, 0);
 	mod_timer(&est->timer, est->next_jiffies);
 
 	rcu_assign_pointer(*rate_est, est);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6ea3a1a7f36a..d1f5fe986edd 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -51,7 +51,7 @@ do {						\
 
 #define PNEIGH_HASHMASK		0xF
 
-static void neigh_timer_handler(unsigned long arg);
+static void neigh_timer_handler(struct timer_list *t);
 static void __neigh_notify(struct neighbour *n, int type, int flags,
 			   u32 pid);
 static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
@@ -331,7 +331,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 	n->output	  = neigh_blackhole;
 	seqlock_init(&n->hh.hh_lock);
 	n->parms	  = neigh_parms_clone(&tbl->parms);
-	setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
+	timer_setup(&n->timer, neigh_timer_handler, 0);
 
 	NEIGH_CACHE_STAT_INC(tbl, allocs);
 	n->tbl		  = tbl;
@@ -903,10 +903,10 @@ static void neigh_probe(struct neighbour *neigh)
 
 /* Called when a timer expires for a neighbour entry. */
 
-static void neigh_timer_handler(unsigned long arg)
+static void neigh_timer_handler(struct timer_list *t)
 {
 	unsigned long now, next;
-	struct neighbour *neigh = (struct neighbour *)arg;
+	struct neighbour *neigh = from_timer(neigh, t, timer);
 	unsigned int state;
 	int notify = 0;
 
@@ -1391,9 +1391,9 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(neigh_direct_output);
 
-static void neigh_proxy_process(unsigned long arg)
+static void neigh_proxy_process(struct timer_list *t)
 {
-	struct neigh_table *tbl = (struct neigh_table *)arg;
+	struct neigh_table *tbl = from_timer(tbl, t, proxy_timer);
 	long sched_next = 0;
 	unsigned long now = jiffies;
 	struct sk_buff *skb, *n;
@@ -1573,7 +1573,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 	INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
 	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
 			tbl->parms.reachable_time);
-	setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
+	timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0);
 	skb_queue_head_init_class(&tbl->proxy_queue,
 			&neigh_table_proxy_queue_class);
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index de4a0cafb19f..324cb9f2f551 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -183,7 +183,7 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst)
 	return dn_rt_hash_mask & (unsigned int)tmp;
 }
 
-static void dn_dst_check_expire(unsigned long dummy)
+static void dn_dst_check_expire(struct timer_list *unused)
 {
 	int i;
 	struct dn_route *rt;
@@ -1875,7 +1875,7 @@ void __init dn_route_init(void)
 		kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	dst_entries_init(&dn_dst_ops);
-	setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
+	timer_setup(&dn_route_timer, dn_dst_check_expire, 0);
 	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
 	add_timer(&dn_route_timer);
 
diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c
index f430daed24a0..aa4155875ca8 100644
--- a/net/decnet/dn_timer.c
+++ b/net/decnet/dn_timer.c
@@ -34,11 +34,11 @@
 
 #define SLOW_INTERVAL (HZ/2)
 
-static void dn_slow_timer(unsigned long arg);
+static void dn_slow_timer(struct timer_list *t);
 
 void dn_start_slow_timer(struct sock *sk)
 {
-	setup_timer(&sk->sk_timer, dn_slow_timer, (unsigned long)sk);
+	timer_setup(&sk->sk_timer, dn_slow_timer, 0);
 	sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL);
 }
 
@@ -47,9 +47,9 @@ void dn_stop_slow_timer(struct sock *sk)
 	sk_stop_timer(sk, &sk->sk_timer);
 }
 
-static void dn_slow_timer(unsigned long arg)
+static void dn_slow_timer(struct timer_list *t)
 {
-	struct sock *sk = (struct sock *)arg;
+	struct sock *sk = from_timer(sk, t, sk_timer);
 	struct dn_scp *scp = DN_SK(sk);
 
 	bh_lock_sock(sk);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index ab183af0b5b6..d1f8f302dbf3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -752,18 +752,18 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	return ip_local_out(net, skb->sk, skb);
 }
 
-static void igmp_gq_timer_expire(unsigned long data)
+static void igmp_gq_timer_expire(struct timer_list *t)
 {
-	struct in_device *in_dev = (struct in_device *)data;
+	struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer);
 
 	in_dev->mr_gq_running = 0;
 	igmpv3_send_report(in_dev, NULL);
 	in_dev_put(in_dev);
 }
 
-static void igmp_ifc_timer_expire(unsigned long data)
+static void igmp_ifc_timer_expire(struct timer_list *t)
 {
-	struct in_device *in_dev = (struct in_device *)data;
+	struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
 
 	igmpv3_send_cr(in_dev);
 	if (in_dev->mr_ifc_count) {
@@ -784,9 +784,9 @@ static void igmp_ifc_event(struct in_device *in_dev)
 }
 
 
-static void igmp_timer_expire(unsigned long data)
+static void igmp_timer_expire(struct timer_list *t)
 {
-	struct ip_mc_list *im = (struct ip_mc_list *)data;
+	struct ip_mc_list *im = from_timer(im, t, timer);
 	struct in_device *in_dev = im->interface;
 
 	spin_lock(&im->lock);
@@ -1385,7 +1385,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	refcount_set(&im->refcnt, 1);
 	spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
-	setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
+	timer_setup(&im->timer, igmp_timer_expire, 0);
 	im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
 #endif
 
@@ -1695,10 +1695,8 @@ void ip_mc_init_dev(struct in_device *in_dev)
 	ASSERT_RTNL();
 
 #ifdef CONFIG_IP_MULTICAST
-	setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
-			(unsigned long)in_dev);
-	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
-			(unsigned long)in_dev);
+	timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
+	timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
 	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
 #endif
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 40a43ad294cb..fd5f19c988e4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -112,7 +112,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 				 int cmd);
 static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
 static void mroute_clean_tables(struct mr_table *mrt, bool all);
-static void ipmr_expire_process(unsigned long arg);
+static void ipmr_expire_process(struct timer_list *t);
 
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
 #define ipmr_for_each_table(mrt, net) \
@@ -375,8 +375,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
-	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
-		    (unsigned long)mrt);
+	timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
 
 	mrt->mroute_reg_vif_num = -1;
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -804,9 +803,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 }
 
 /* Timer process for the unresolved queue. */
-static void ipmr_expire_process(unsigned long arg)
+static void ipmr_expire_process(struct timer_list *t)
 {
-	struct mr_table *mrt = (struct mr_table *)arg;
+	struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
 	unsigned long now;
 	unsigned long expires;
 	struct mfc_cache *c, *next;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a0ae1c9d37df..f49bd7897e95 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -188,7 +188,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp);
 static void addrconf_dad_work(struct work_struct *w);
 static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id);
 static void addrconf_dad_run(struct inet6_dev *idev);
-static void addrconf_rs_timer(unsigned long data);
+static void addrconf_rs_timer(struct timer_list *t);
 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
 
@@ -388,8 +388,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	rwlock_init(&ndev->lock);
 	ndev->dev = dev;
 	INIT_LIST_HEAD(&ndev->addr_list);
-	setup_timer(&ndev->rs_timer, addrconf_rs_timer,
-		    (unsigned long)ndev);
+	timer_setup(&ndev->rs_timer, addrconf_rs_timer, 0);
 	memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
 
 	if (ndev->cnf.stable_secret.initialized)
@@ -3741,9 +3740,9 @@ restart:
 	return 0;
 }
 
-static void addrconf_rs_timer(unsigned long data)
+static void addrconf_rs_timer(struct timer_list *t)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct inet6_dev *idev = from_timer(idev, t, rs_timer);
 	struct net_device *dev = idev->dev;
 	struct in6_addr lladdr;
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 9c24b85949c1..a2e1a864eb46 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -120,7 +120,7 @@ static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt);
 static int ip6mr_rtm_dumproute(struct sk_buff *skb,
 			       struct netlink_callback *cb);
 static void mroute_clean_tables(struct mr6_table *mrt, bool all);
-static void ipmr_expire_process(unsigned long arg);
+static void ipmr_expire_process(struct timer_list *t);
 
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
 #define ip6mr_for_each_table(mrt, net) \
@@ -320,8 +320,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 
 	INIT_LIST_HEAD(&mrt->mfc6_unres_queue);
 
-	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
-		    (unsigned long)mrt);
+	timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
 
 #ifdef CONFIG_IPV6_PIMSM_V2
 	mrt->mroute_reg_vif_num = -1;
@@ -888,9 +887,9 @@ static void ipmr_do_expire_process(struct mr6_table *mrt)
 		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
 }
 
-static void ipmr_expire_process(unsigned long arg)
+static void ipmr_expire_process(struct timer_list *t)
 {
-	struct mr6_table *mrt = (struct mr6_table *)arg;
+	struct mr6_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
 
 	if (!spin_trylock(&mfc_unres_lock)) {
 		mod_timer(&mrt->ipmr_expire_timer, jiffies + 1);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 12b7c27ce5ce..fc6d7d143f2c 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -75,10 +75,10 @@ static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;
 
 static void igmp6_join_group(struct ifmcaddr6 *ma);
 static void igmp6_leave_group(struct ifmcaddr6 *ma);
-static void igmp6_timer_handler(unsigned long data);
+static void igmp6_timer_handler(struct timer_list *t);
 
-static void mld_gq_timer_expire(unsigned long data);
-static void mld_ifc_timer_expire(unsigned long data);
+static void mld_gq_timer_expire(struct timer_list *t);
+static void mld_ifc_timer_expire(struct timer_list *t);
 static void mld_ifc_event(struct inet6_dev *idev);
 static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
 static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
@@ -839,7 +839,7 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
 	if (!mc)
 		return NULL;
 
-	setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc);
+	timer_setup(&mc->mca_timer, igmp6_timer_handler, 0);
 
 	mc->mca_addr = *addr;
 	mc->idev = idev; /* reference taken by caller */
@@ -2083,9 +2083,9 @@ void ipv6_mc_dad_complete(struct inet6_dev *idev)
 	}
 }
 
-static void mld_dad_timer_expire(unsigned long data)
+static void mld_dad_timer_expire(struct timer_list *t)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct inet6_dev *idev = from_timer(idev, t, mc_dad_timer);
 
 	mld_send_initial_cr(idev);
 	if (idev->mc_dad_count) {
@@ -2432,18 +2432,18 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma)
 	}
 }
 
-static void mld_gq_timer_expire(unsigned long data)
+static void mld_gq_timer_expire(struct timer_list *t)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct inet6_dev *idev = from_timer(idev, t, mc_gq_timer);
 
 	idev->mc_gq_running = 0;
 	mld_send_report(idev, NULL);
 	in6_dev_put(idev);
 }
 
-static void mld_ifc_timer_expire(unsigned long data)
+static void mld_ifc_timer_expire(struct timer_list *t)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct inet6_dev *idev = from_timer(idev, t, mc_ifc_timer);
 
 	mld_send_cr(idev);
 	if (idev->mc_ifc_count) {
@@ -2462,9 +2462,9 @@ static void mld_ifc_event(struct inet6_dev *idev)
 	mld_ifc_start_timer(idev, 1);
 }
 
-static void igmp6_timer_handler(unsigned long data)
+static void igmp6_timer_handler(struct timer_list *t)
 {
-	struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
+	struct ifmcaddr6 *ma = from_timer(ma, t, mca_timer);
 
 	if (mld_in_v1_mode(ma->idev))
 		igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
@@ -2552,14 +2552,11 @@ void ipv6_mc_init_dev(struct inet6_dev *idev)
 	write_lock_bh(&idev->lock);
 	spin_lock_init(&idev->mc_lock);
 	idev->mc_gq_running = 0;
-	setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire,
-			(unsigned long)idev);
+	timer_setup(&idev->mc_gq_timer, mld_gq_timer_expire, 0);
 	idev->mc_tomb = NULL;
 	idev->mc_ifc_count = 0;
-	setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire,
-			(unsigned long)idev);
-	setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire,
-		    (unsigned long)idev);
+	timer_setup(&idev->mc_ifc_timer, mld_ifc_timer_expire, 0);
+	timer_setup(&idev->mc_dad_timer, mld_dad_timer_expire, 0);
 	ipv6_mc_reset(idev);
 	write_unlock_bh(&idev->lock);
 }
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index a2b904a718c6..3ccf8dfb233e 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -529,9 +529,9 @@ struct ncsi_dev *ncsi_find_dev(struct net_device *dev)
 	return NULL;
 }
 
-static void ncsi_request_timeout(unsigned long data)
+static void ncsi_request_timeout(struct timer_list *t)
 {
-	struct ncsi_request *nr = (struct ncsi_request *)data;
+	struct ncsi_request *nr = from_timer(nr, t, timer);
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	unsigned long flags;
 
@@ -1577,9 +1577,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 	for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) {
 		ndp->requests[i].id = i;
 		ndp->requests[i].ndp = ndp;
-		setup_timer(&ndp->requests[i].timer,
-			    ncsi_request_timeout,
-			    (unsigned long)&ndp->requests[i]);
+		timer_setup(&ndp->requests[i].timer, ncsi_request_timeout, 0);
 	}
 
 	spin_lock_irqsave(&ncsi_dev_lock, flags);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 64778f9a8548..d6748a8a79c5 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -67,9 +67,9 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
 }
 EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
 
-static void nf_ct_expectation_timed_out(unsigned long ul_expect)
+static void nf_ct_expectation_timed_out(struct timer_list *t)
 {
-	struct nf_conntrack_expect *exp = (void *)ul_expect;
+	struct nf_conntrack_expect *exp = from_timer(exp, t, timeout);
 
 	spin_lock_bh(&nf_conntrack_expect_lock);
 	nf_ct_unlink_expect(exp);
@@ -368,8 +368,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	/* two references : one for hash insert, one for the timer */
 	refcount_add(2, &exp->use);
 
-	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
-		    (unsigned long)exp);
+	timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0);
 	helper = rcu_dereference_protected(master_help->helper,
 					   lockdep_is_held(&nf_conntrack_expect_lock));
 	if (helper) {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cad6498f10b0..e5afab86381c 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -151,7 +151,7 @@ instance_put(struct nfulnl_instance *inst)
 		call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);
 }
 
-static void nfulnl_timer(unsigned long data);
+static void nfulnl_timer(struct timer_list *t);
 
 static struct nfulnl_instance *
 instance_create(struct net *net, u_int16_t group_num,
@@ -184,7 +184,7 @@ instance_create(struct net *net, u_int16_t group_num,
 	/* needs to be two, since we _put() after creation */
 	refcount_set(&inst->use, 2);
 
-	setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
+	timer_setup(&inst->timer, nfulnl_timer, 0);
 
 	inst->net = get_net(net);
 	inst->peer_user_ns = user_ns;
@@ -377,9 +377,9 @@ __nfulnl_flush(struct nfulnl_instance *inst)
 }
 
 static void
-nfulnl_timer(unsigned long data)
+nfulnl_timer(struct timer_list *t)
 {
-	struct nfulnl_instance *inst = (struct nfulnl_instance *)data;
+	struct nfulnl_instance *inst = from_timer(inst, t, timer);
 
 	spin_lock_bh(&inst->lock);
 	if (inst->skb)
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index daf45da448fa..ee3421ad108d 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -107,9 +107,9 @@ static void idletimer_tg_work(struct work_struct *work)
 	sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
 }
 
-static void idletimer_tg_expired(unsigned long data)
+static void idletimer_tg_expired(struct timer_list *t)
 {
-	struct idletimer_tg *timer = (struct idletimer_tg *) data;
+	struct idletimer_tg *timer = from_timer(timer, t, timer);
 
 	pr_debug("timer %s expired\n", timer->attr.attr.name);
 
@@ -143,8 +143,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
 
 	list_add(&info->timer->entry, &idletimer_tg_list);
 
-	setup_timer(&info->timer->timer, idletimer_tg_expired,
-		    (unsigned long) info->timer);
+	timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
 	info->timer->refcnt = 1;
 
 	mod_timer(&info->timer->timer,
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index 3ba31c194cce..0971634e5444 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -85,9 +85,10 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static void led_timeout_callback(unsigned long data)
+static void led_timeout_callback(struct timer_list *t)
 {
-	struct xt_led_info_internal *ledinternal = (struct xt_led_info_internal *)data;
+	struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t,
+							      timer);
 
 	led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF);
 }
@@ -143,8 +144,7 @@ static int led_tg_check(const struct xt_tgchk_param *par)
 
 	/* See if we need to set up a timer */
 	if (ledinfo->delay > 0)
-		setup_timer(&ledinternal->timer, led_timeout_callback,
-			    (unsigned long)ledinternal);
+		timer_setup(&ledinternal->timer, led_timeout_callback, 0);
 
 	list_add_tail(&ledinternal->list, &xt_led_triggers);
 
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index c25e9b4179c3..074960154993 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -591,18 +591,18 @@ static int nci_close_device(struct nci_dev *ndev)
 }
 
 /* NCI command timer function */
-static void nci_cmd_timer(unsigned long arg)
+static void nci_cmd_timer(struct timer_list *t)
 {
-	struct nci_dev *ndev = (void *) arg;
+	struct nci_dev *ndev = from_timer(ndev, t, cmd_timer);
 
 	atomic_set(&ndev->cmd_cnt, 1);
 	queue_work(ndev->cmd_wq, &ndev->cmd_work);
 }
 
 /* NCI data exchange timer function */
-static void nci_data_timer(unsigned long arg)
+static void nci_data_timer(struct timer_list *t)
 {
-	struct nci_dev *ndev = (void *) arg;
+	struct nci_dev *ndev = from_timer(ndev, t, data_timer);
 
 	set_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags);
 	queue_work(ndev->rx_wq, &ndev->rx_work);
@@ -1232,10 +1232,8 @@ int nci_register_device(struct nci_dev *ndev)
 	skb_queue_head_init(&ndev->rx_q);
 	skb_queue_head_init(&ndev->tx_q);
 
-	setup_timer(&ndev->cmd_timer, nci_cmd_timer,
-		    (unsigned long) ndev);
-	setup_timer(&ndev->data_timer, nci_data_timer,
-		    (unsigned long) ndev);
+	timer_setup(&ndev->cmd_timer, nci_cmd_timer, 0);
+	timer_setup(&ndev->data_timer, nci_data_timer, 0);
 
 	mutex_init(&ndev->req_lock);
 	INIT_LIST_HEAD(&ndev->conn_info_list);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 4c7fbc6dcce7..994dc2df57e4 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -45,9 +45,9 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
 
 struct kmem_cache *rxrpc_call_jar;
 
-static void rxrpc_call_timer_expired(unsigned long _call)
+static void rxrpc_call_timer_expired(struct timer_list *t)
 {
-	struct rxrpc_call *call = (struct rxrpc_call *)_call;
+	struct rxrpc_call *call = from_timer(call, t, timer);
 
 	_enter("%d", call->debug_id);
 
@@ -114,8 +114,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
 		goto nomem_2;
 
 	mutex_init(&call->user_mutex);
-	setup_timer(&call->timer, rxrpc_call_timer_expired,
-		    (unsigned long)call);
+	timer_setup(&call->timer, rxrpc_call_timer_expired, 0);
 	INIT_WORK(&call->processor, &rxrpc_process_call);
 	INIT_LIST_HEAD(&call->link);
 	INIT_LIST_HEAD(&call->chan_wait_link);
diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c
index 459611577d3d..801d4781a73b 100644
--- a/net/wireless/lib80211.c
+++ b/net/wireless/lib80211.c
@@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(lib80211_crypto_lock);
 static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info,
 					  int force);
 static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info);
-static void lib80211_crypt_deinit_handler(unsigned long data);
+static void lib80211_crypt_deinit_handler(struct timer_list *t);
 
 int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name,
 				spinlock_t *lock)
@@ -55,8 +55,8 @@ int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name,
 	info->lock = lock;
 
 	INIT_LIST_HEAD(&info->crypt_deinit_list);
-	setup_timer(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler,
-			(unsigned long)info);
+	timer_setup(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler,
+		    0);
 
 	return 0;
 }
@@ -116,9 +116,10 @@ static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info)
 	spin_unlock_irqrestore(info->lock, flags);
 }
 
-static void lib80211_crypt_deinit_handler(unsigned long data)
+static void lib80211_crypt_deinit_handler(struct timer_list *t)
 {
-	struct lib80211_crypt_info *info = (struct lib80211_crypt_info *)data;
+	struct lib80211_crypt_info *info = from_timer(info, t,
+						      crypt_deinit_timer);
 	unsigned long flags;
 
 	lib80211_crypt_deinit_entries(info, 0);
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index e0cd04d28352..a6a8ab09b914 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -36,7 +36,7 @@
 LIST_HEAD(x25_neigh_list);
 DEFINE_RWLOCK(x25_neigh_list_lock);
 
-static void x25_t20timer_expiry(unsigned long);
+static void x25_t20timer_expiry(struct timer_list *);
 
 static void x25_transmit_restart_confirmation(struct x25_neigh *nb);
 static void x25_transmit_restart_request(struct x25_neigh *nb);
@@ -49,9 +49,9 @@ static inline void x25_start_t20timer(struct x25_neigh *nb)
 	mod_timer(&nb->t20timer, jiffies + nb->t20);
 }
 
-static void x25_t20timer_expiry(unsigned long param)
+static void x25_t20timer_expiry(struct timer_list *t)
 {
-	struct x25_neigh *nb = (struct x25_neigh *)param;
+	struct x25_neigh *nb = from_timer(nb, t, t20timer);
 
 	x25_transmit_restart_request(nb);
 
@@ -252,7 +252,7 @@ void x25_link_device_up(struct net_device *dev)
 		return;
 
 	skb_queue_head_init(&nb->queue);
-	setup_timer(&nb->t20timer, x25_t20timer_expiry, (unsigned long)nb);
+	timer_setup(&nb->t20timer, x25_t20timer_expiry, 0);
 
 	dev_hold(dev);
 	nb->dev      = dev;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1f5cee2269af..065d89606888 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -556,7 +556,7 @@ out:
 	return HRTIMER_NORESTART;
 }
 
-static void xfrm_replay_timer_handler(unsigned long data);
+static void xfrm_replay_timer_handler(struct timer_list *t);
 
 struct xfrm_state *xfrm_state_alloc(struct net *net)
 {
@@ -574,8 +574,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 		INIT_HLIST_NODE(&x->byspi);
 		tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
 					CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
-		setup_timer(&x->rtimer, xfrm_replay_timer_handler,
-				(unsigned long)x);
+		timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0);
 		x->curlft.add_time = get_seconds();
 		x->lft.soft_byte_limit = XFRM_INF;
 		x->lft.soft_packet_limit = XFRM_INF;
@@ -1879,9 +1878,9 @@ void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net)
 }
 EXPORT_SYMBOL(xfrm_state_walk_done);
 
-static void xfrm_replay_timer_handler(unsigned long data)
+static void xfrm_replay_timer_handler(struct timer_list *t)
 {
-	struct xfrm_state *x = (struct xfrm_state *)data;
+	struct xfrm_state *x = from_timer(x, t, rtimer);
 
 	spin_lock(&x->lock);
 
-- 
cgit v1.2.3


From c1eba5bcb6430868427e0b9d1cd1205a07302f06 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sun, 22 Oct 2017 18:18:19 -0700
Subject: timer: Pass timer_list pointer to callbacks unconditionally

Now that all timer callbacks are already taking their struct timer_list
pointer as the callback argument, just do this unconditionally and remove
the .data field.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/timer.h |  4 ----
 kernel/time/timer.c   | 17 +++++++----------
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 47615dca4c5c..20a6e7af5fd6 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -18,7 +18,6 @@ struct timer_list {
 	struct hlist_node	entry;
 	unsigned long		expires;
 	void			(*function)(unsigned long);
-	unsigned long		data;
 	u32			flags;
 
 #ifdef CONFIG_LOCKDEP
@@ -70,7 +69,6 @@ struct timer_list {
 #define __TIMER_INITIALIZER(_function, _data, _flags) {		\
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
-		.data = (_data),				\
 		.flags = (_flags),				\
 		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
 			__FILE__ ":" __stringify(__LINE__))	\
@@ -121,14 +119,12 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 	do {								\
 		__init_timer((_timer), (_flags));			\
 		(_timer)->function = (_fn);				\
-		(_timer)->data = (_data);				\
 	} while (0)
 
 #define __setup_timer_on_stack(_timer, _fn, _data, _flags)		\
 	do {								\
 		__init_timer_on_stack((_timer), (_flags));		\
 		(_timer)->function = (_fn);				\
-		(_timer)->data = (_data);				\
 	} while (0)
 
 #ifndef CONFIG_LOCKDEP
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index af0b8bae4502..a07eb124332f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1107,12 +1107,12 @@ EXPORT_SYMBOL(timer_reduce);
  * add_timer - start a timer
  * @timer: the timer to be added
  *
- * The kernel will do a ->function(->data) callback from the
+ * The kernel will do a ->function(@timer) callback from the
  * timer interrupt at the ->expires point in the future. The
  * current time is 'jiffies'.
  *
- * The timer's ->expires, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
+ * The timer's ->expires, ->function fields must be set prior calling this
+ * function.
  *
  * Timers with an ->expires field in the past will be executed in the next
  * timer tick.
@@ -1284,8 +1284,7 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
-			  unsigned long data)
+static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long))
 {
 	int count = preempt_count();
 
@@ -1309,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 	lock_map_acquire(&lockdep_map);
 
 	trace_timer_expire_entry(timer);
-	fn(data);
+	fn((TIMER_DATA_TYPE)timer);
 	trace_timer_expire_exit(timer);
 
 	lock_map_release(&lockdep_map);
@@ -1332,7 +1331,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 	while (!hlist_empty(head)) {
 		struct timer_list *timer;
 		void (*fn)(unsigned long);
-		unsigned long data;
 
 		timer = hlist_entry(head->first, struct timer_list, entry);
 
@@ -1340,15 +1338,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 		detach_timer(timer, true);
 
 		fn = timer->function;
-		data = timer->data;
 
 		if (timer->flags & TIMER_IRQSAFE) {
 			raw_spin_unlock(&base->lock);
-			call_timer_fn(timer, fn, data);
+			call_timer_fn(timer, fn);
 			raw_spin_lock(&base->lock);
 		} else {
 			raw_spin_unlock_irq(&base->lock);
-			call_timer_fn(timer, fn, data);
+			call_timer_fn(timer, fn);
 			raw_spin_lock_irq(&base->lock);
 		}
 	}
-- 
cgit v1.2.3


From 354b46b1a0adda1dd5b7f0bc2a5604cca091be5f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sun, 22 Oct 2017 19:15:40 -0700
Subject: timer: Switch callback prototype to take struct timer_list * argument

Since all callbacks have been converted, we can switch the core
prototype to "struct timer_list *" now too.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/timer.h | 4 ++--
 kernel/time/timer.c   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 20a6e7af5fd6..a6d04fb72c9e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -17,7 +17,7 @@ struct timer_list {
 	 */
 	struct hlist_node	entry;
 	unsigned long		expires;
-	void			(*function)(unsigned long);
+	void			(*function)(struct timer_list *);
 	u32			flags;
 
 #ifdef CONFIG_LOCKDEP
@@ -63,7 +63,7 @@ struct timer_list {
 
 #define TIMER_TRACE_FLAGMASK	(TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
 
-#define TIMER_DATA_TYPE		unsigned long
+#define TIMER_DATA_TYPE		struct timer_list *
 #define TIMER_FUNC_TYPE		void (*)(TIMER_DATA_TYPE)
 
 #define __TIMER_INITIALIZER(_function, _data, _flags) {		\
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a07eb124332f..0f0d49a02d04 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1284,7 +1284,7 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long))
+static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
 {
 	int count = preempt_count();
 
@@ -1308,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long))
 	lock_map_acquire(&lockdep_map);
 
 	trace_timer_expire_entry(timer);
-	fn((TIMER_DATA_TYPE)timer);
+	fn(timer);
 	trace_timer_expire_exit(timer);
 
 	lock_map_release(&lockdep_map);
@@ -1330,7 +1330,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 {
 	while (!hlist_empty(head)) {
 		struct timer_list *timer;
-		void (*fn)(unsigned long);
+		void (*fn)(struct timer_list *);
 
 		timer = hlist_entry(head->first, struct timer_list, entry);
 
-- 
cgit v1.2.3


From 188665b2d67db8953899551d1a9d4481b2a0ac60 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sun, 22 Oct 2017 18:14:46 -0700
Subject: timer: Pass function down to initialization routines

In preparation for removing more macros, pass the function down to the
initialization routines instead of doing it in macros.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/timer.h | 33 ++++++++++++++++++---------------
 kernel/time/timer.c   | 21 +++++++++++++++------
 2 files changed, 33 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index e6bab51db13d..aff73b1c8f7b 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -78,53 +78,56 @@ struct timer_list {
 	struct timer_list _name =				\
 		__TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0)
 
-void init_timer_key(struct timer_list *timer, unsigned int flags,
+void init_timer_key(struct timer_list *timer,
+		    void (*func)(struct timer_list *), unsigned int flags,
 		    const char *name, struct lock_class_key *key);
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void init_timer_on_stack_key(struct timer_list *timer,
+				    void (*func)(struct timer_list *),
 				    unsigned int flags, const char *name,
 				    struct lock_class_key *key);
 extern void destroy_timer_on_stack(struct timer_list *timer);
 #else
 static inline void destroy_timer_on_stack(struct timer_list *timer) { }
 static inline void init_timer_on_stack_key(struct timer_list *timer,
-					   unsigned int flags, const char *name,
+					   void (*func)(struct timer_list *),
+					   unsigned int flags,
+					   const char *name,
 					   struct lock_class_key *key)
 {
-	init_timer_key(timer, flags, name, key);
+	init_timer_key(timer, func, flags, name, key);
 }
 #endif
 
 #ifdef CONFIG_LOCKDEP
-#define __init_timer(_timer, _flags)					\
+#define __init_timer(_timer, _fn, _flags)				\
 	do {								\
 		static struct lock_class_key __key;			\
-		init_timer_key((_timer), (_flags), #_timer, &__key);	\
+		init_timer_key((_timer), (_fn), (_flags), #_timer, &__key);\
 	} while (0)
 
-#define __init_timer_on_stack(_timer, _flags)				\
+#define __init_timer_on_stack(_timer, _fn, _flags)			\
 	do {								\
 		static struct lock_class_key __key;			\
-		init_timer_on_stack_key((_timer), (_flags), #_timer, &__key); \
+		init_timer_on_stack_key((_timer), (_fn), (_flags),	\
+					#_timer, &__key);		 \
 	} while (0)
 #else
-#define __init_timer(_timer, _flags)					\
-	init_timer_key((_timer), (_flags), NULL, NULL)
-#define __init_timer_on_stack(_timer, _flags)				\
-	init_timer_on_stack_key((_timer), (_flags), NULL, NULL)
+#define __init_timer(_timer, _fn, _flags)				\
+	init_timer_key((_timer), (_fn), (_flags), NULL, NULL)
+#define __init_timer_on_stack(_timer, _fn, _flags)			\
+	init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL)
 #endif
 
 #define __setup_timer(_timer, _fn, _flags)				\
 	do {								\
-		__init_timer((_timer), (_flags));			\
-		(_timer)->function = (_fn);				\
+		__init_timer((_timer), (_fn), (_flags));		\
 	} while (0)
 
 #define __setup_timer_on_stack(_timer, _fn, _flags)			\
 	do {								\
-		__init_timer_on_stack((_timer), (_flags));		\
-		(_timer)->function = (_fn);				\
+		__init_timer_on_stack((_timer), (_fn), (_flags));	\
 	} while (0)
 
 #ifndef CONFIG_LOCKDEP
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 0f0d49a02d04..ffebcf878fba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -707,14 +707,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
 	debug_object_assert_init(timer, &timer_debug_descr);
 }
 
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+			  void (*func)(struct timer_list *),
+			  unsigned int flags,
 			  const char *name, struct lock_class_key *key);
 
-void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+void init_timer_on_stack_key(struct timer_list *timer,
+			     void (*func)(struct timer_list *),
+			     unsigned int flags,
 			     const char *name, struct lock_class_key *key)
 {
 	debug_object_init_on_stack(timer, &timer_debug_descr);
-	do_init_timer(timer, flags, name, key);
+	do_init_timer(timer, func, flags, name, key);
 }
 EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
 
@@ -755,10 +759,13 @@ static inline void debug_assert_init(struct timer_list *timer)
 	debug_timer_assert_init(timer);
 }
 
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+			  void (*func)(struct timer_list *),
+			  unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
 	timer->entry.pprev = NULL;
+	timer->function = func;
 	timer->flags = flags | raw_smp_processor_id();
 	lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
@@ -766,6 +773,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
 /**
  * init_timer_key - initialize a timer
  * @timer: the timer to be initialized
+ * @func: timer callback function
  * @flags: timer flags
  * @name: name of the timer
  * @key: lockdep class key of the fake lock used for tracking timer
@@ -774,11 +782,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
  * init_timer_key() must be done to a timer prior calling *any* of the
  * other timer functions.
  */
-void init_timer_key(struct timer_list *timer, unsigned int flags,
+void init_timer_key(struct timer_list *timer,
+		    void (*func)(struct timer_list *), unsigned int flags,
 		    const char *name, struct lock_class_key *key)
 {
 	debug_init(timer);
-	do_init_timer(timer, flags, name, key);
+	do_init_timer(timer, func, flags, name, key);
 }
 EXPORT_SYMBOL(init_timer_key);
 
-- 
cgit v1.2.3


From 841b86f3289dbe858daeceec36423d4ea286fac2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 23 Oct 2017 09:40:42 +0200
Subject: treewide: Remove TIMER_FUNC_TYPE and TIMER_DATA_TYPE casts

With all callbacks converted, and the timer callback prototype
switched over, the TIMER_FUNC_TYPE cast is no longer needed,
so remove it. Conversion was done with the following scripts:

    perl -pi -e 's|\(TIMER_FUNC_TYPE\)||g' \
        $(git grep TIMER_FUNC_TYPE | cut -d: -f1 | sort -u)

    perl -pi -e 's|\(TIMER_DATA_TYPE\)||g' \
        $(git grep TIMER_DATA_TYPE | cut -d: -f1 | sort -u)

The now unused macros are also dropped from include/linux/timer.h.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/base/power/wakeup.c                       |  2 +-
 drivers/block/aoe/aoecmd.c                        |  2 +-
 drivers/block/swim3.c                             |  2 +-
 drivers/infiniband/hw/nes/nes_verbs.c             |  2 +-
 drivers/input/input.c                             |  2 +-
 drivers/media/common/saa7146/saa7146_vbi.c        |  2 +-
 drivers/net/ethernet/ti/tlan.c                    |  6 +++---
 drivers/net/hamradio/scc.c                        |  8 ++++----
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c |  2 +-
 drivers/net/wireless/ray_cs.c                     | 12 ++++++------
 drivers/s390/char/sclp.c                          |  4 ++--
 drivers/s390/scsi/zfcp_fsf.c                      |  4 ++--
 drivers/scsi/aic94xx/aic94xx_hwi.c                |  2 +-
 drivers/scsi/aic94xx/aic94xx_tmf.c                |  2 +-
 drivers/scsi/be2iscsi/be_main.c                   |  4 ++--
 drivers/scsi/cxgbi/cxgb3i/cxgb3i.c                |  4 ++--
 drivers/scsi/cxgbi/cxgb4i/cxgb4i.c                |  4 ++--
 drivers/scsi/hisi_sas/hisi_sas_main.c             |  4 ++--
 drivers/scsi/hisi_sas/hisi_sas_v2_hw.c            |  6 +++---
 drivers/scsi/ipr.c                                |  8 ++++----
 drivers/scsi/libfc/fc_fcp.c                       |  6 +++---
 drivers/scsi/libsas/sas_expander.c                |  2 +-
 drivers/scsi/libsas/sas_scsi_host.c               |  2 +-
 drivers/scsi/mvsas/mv_sas.c                       |  4 ++--
 drivers/scsi/pm8001/pm8001_sas.c                  |  4 ++--
 drivers/scsi/pmcraid.c                            | 10 +++++-----
 drivers/staging/irda/include/net/irda/timer.h     |  2 +-
 drivers/tty/serial/8250/8250_core.c               |  4 ++--
 include/linux/kthread.h                           |  2 +-
 include/linux/timer.h                             |  5 +----
 include/linux/workqueue.h                         |  2 +-
 kernel/kthread.c                                  |  2 +-
 kernel/workqueue.c                                |  2 +-
 net/atm/lec.c                                     |  6 +++---
 net/can/proc.c                                    |  4 ++--
 net/lapb/lapb_timer.c                             |  4 ++--
 net/netrom/af_netrom.c                            |  2 +-
 net/netrom/nr_timer.c                             |  2 +-
 net/rose/rose_link.c                              |  4 ++--
 net/rose/rose_timer.c                             | 12 ++++++------
 net/sunrpc/svc_xprt.c                             |  2 +-
 net/x25/af_x25.c                                  |  2 +-
 net/x25/x25_timer.c                               |  2 +-
 sound/usb/line6/driver.c                          |  2 +-
 44 files changed, 84 insertions(+), 87 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 680ee1d36ac9..38559f04db2c 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -481,7 +481,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws)
 	 * Use timer struct to check if the given source is initialized
 	 * by wakeup_source_add.
 	 */
-	return ws->timer.function != (TIMER_FUNC_TYPE)pm_wakeup_timer_fn;
+	return ws->timer.function != pm_wakeup_timer_fn;
 }
 
 /*
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 55ab25f79a08..812fed069708 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1429,7 +1429,7 @@ aoecmd_ata_id(struct aoedev *d)
 
 	d->rttavg = RTTAVG_INIT;
 	d->rttdev = RTTDEV_INIT;
-	d->timer.function = (TIMER_FUNC_TYPE)rexmit_timer;
+	d->timer.function = rexmit_timer;
 
 	skb = skb_clone(skb, GFP_ATOMIC);
 	if (skb) {
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index e620e423102b..af51015d056e 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -397,7 +397,7 @@ static void set_timeout(struct floppy_state *fs, int nticks,
 	if (fs->timeout_pending)
 		del_timer(&fs->timeout);
 	fs->timeout.expires = jiffies + nticks;
-	fs->timeout.function = (TIMER_FUNC_TYPE)proc;
+	fs->timeout.function = proc;
 	add_timer(&fs->timeout);
 	fs->timeout_pending = 1;
 }
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index db46b7b53fb4..162475aeeedd 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -3819,7 +3819,7 @@ void  nes_port_ibevent(struct nes_vnic *nesvnic)
 	if (!nesvnic->event_timer.function) {
 		ib_dispatch_event(&event);
 		nesvnic->last_dispatched_event = event.event;
-		nesvnic->event_timer.function = (TIMER_FUNC_TYPE)nes_handle_delayed_event;
+		nesvnic->event_timer.function = nes_handle_delayed_event;
 		nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY;
 		add_timer(&nesvnic->event_timer);
 	} else {
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 44916ef4a424..e30642db50d5 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -2047,7 +2047,7 @@ static void devm_input_device_unregister(struct device *dev, void *res)
  */
 void input_enable_softrepeat(struct input_dev *dev, int delay, int period)
 {
-	dev->timer.function = (TIMER_FUNC_TYPE)input_repeat_key;
+	dev->timer.function = input_repeat_key;
 	dev->rep[REP_DELAY] = delay;
 	dev->rep[REP_PERIOD] = period;
 }
diff --git a/drivers/media/common/saa7146/saa7146_vbi.c b/drivers/media/common/saa7146/saa7146_vbi.c
index ce8d78c137f0..e1d369b976ed 100644
--- a/drivers/media/common/saa7146/saa7146_vbi.c
+++ b/drivers/media/common/saa7146/saa7146_vbi.c
@@ -402,7 +402,7 @@ static int vbi_open(struct saa7146_dev *dev, struct file *file)
 			    sizeof(struct saa7146_buf),
 			    file, &dev->v4l2_lock);
 
-	vv->vbi_read_timeout.function = (TIMER_FUNC_TYPE)vbi_read_timeout;
+	vv->vbi_read_timeout.function = vbi_read_timeout;
 	vv->vbi_read_timeout_file = file;
 
 	/* initialize the brs */
diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c
index 8f53d762fbc4..5a4e78fde530 100644
--- a/drivers/net/ethernet/ti/tlan.c
+++ b/drivers/net/ethernet/ti/tlan.c
@@ -254,7 +254,7 @@ tlan_set_timer(struct net_device *dev, u32 ticks, u32 type)
 			spin_unlock_irqrestore(&priv->lock, flags);
 		return;
 	}
-	priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer;
+	priv->timer.function = tlan_timer;
 	if (!in_irq())
 		spin_unlock_irqrestore(&priv->lock, flags);
 
@@ -1425,7 +1425,7 @@ static u32 tlan_handle_tx_eof(struct net_device *dev, u16 host_int)
 		tlan_dio_write8(dev->base_addr,
 				TLAN_LED_REG, TLAN_LED_LINK | TLAN_LED_ACT);
 		if (priv->timer.function == NULL) {
-			priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer;
+			priv->timer.function = tlan_timer;
 			priv->timer.expires = jiffies + TLAN_TIMER_ACT_DELAY;
 			priv->timer_set_at = jiffies;
 			priv->timer_type = TLAN_TIMER_ACTIVITY;
@@ -1576,7 +1576,7 @@ drop_and_reuse:
 		tlan_dio_write8(dev->base_addr,
 				TLAN_LED_REG, TLAN_LED_LINK | TLAN_LED_ACT);
 		if (priv->timer.function == NULL)  {
-			priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer;
+			priv->timer.function = tlan_timer;
 			priv->timer.expires = jiffies + TLAN_TIMER_ACT_DELAY;
 			priv->timer_set_at = jiffies;
 			priv->timer_type = TLAN_TIMER_ACTIVITY;
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
index c9f7215c5dc2..3de272959090 100644
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -1005,7 +1005,7 @@ static void __scc_start_tx_timer(struct scc_channel *scc,
 	} else 
 	if (when != TIMER_OFF)
 	{
-		scc->tx_t.function = (TIMER_FUNC_TYPE)handler;
+		scc->tx_t.function = handler;
 		scc->tx_t.expires = jiffies + (when*HZ)/100;
 		add_timer(&scc->tx_t);
 	}
@@ -1031,7 +1031,7 @@ static void scc_start_defer(struct scc_channel *scc)
 	
 	if (scc->kiss.maxdefer != 0 && scc->kiss.maxdefer != TIMER_OFF)
 	{
-		scc->tx_wdog.function = (TIMER_FUNC_TYPE)t_busy;
+		scc->tx_wdog.function = t_busy;
 		scc->tx_wdog.expires = jiffies + HZ*scc->kiss.maxdefer;
 		add_timer(&scc->tx_wdog);
 	}
@@ -1047,7 +1047,7 @@ static void scc_start_maxkeyup(struct scc_channel *scc)
 	
 	if (scc->kiss.maxkeyup != 0 && scc->kiss.maxkeyup != TIMER_OFF)
 	{
-		scc->tx_wdog.function = (TIMER_FUNC_TYPE)t_maxkeyup;
+		scc->tx_wdog.function = t_maxkeyup;
 		scc->tx_wdog.expires = jiffies + HZ*scc->kiss.maxkeyup;
 		add_timer(&scc->tx_wdog);
 	}
@@ -1428,7 +1428,7 @@ scc_start_calibrate(struct scc_channel *scc, int duration, unsigned char pattern
 
 	del_timer(&scc->tx_wdog);
 
-	scc->tx_wdog.function = (TIMER_FUNC_TYPE)scc_stop_calibrate;
+	scc->tx_wdog.function = scc_stop_calibrate;
 	scc->tx_wdog.expires = jiffies + HZ*duration;
 	add_timer(&scc->tx_wdog);
 
diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 7d6dc76c930a..6711e7fb6926 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -554,7 +554,7 @@ qtnf_scan(struct wiphy *wiphy, struct cfg80211_scan_request *request)
 		return -EFAULT;
 	}
 
-	mac->scan_timeout.function = (TIMER_FUNC_TYPE)qtnf_scan_timeout;
+	mac->scan_timeout.function = qtnf_scan_timeout;
 	mod_timer(&mac->scan_timeout,
 		  jiffies + QTNF_SCAN_TIMEOUT_SEC * HZ);
 
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index d8afcdfca1ed..0133fcd4601b 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -569,7 +569,7 @@ static int dl_startup_params(struct net_device *dev)
 	local->card_status = CARD_DL_PARAM;
 	/* Start kernel timer to wait for dl startup to complete. */
 	local->timer.expires = jiffies + HZ / 2;
-	local->timer.function = (TIMER_FUNC_TYPE)verify_dl_startup;
+	local->timer.function = verify_dl_startup;
 	add_timer(&local->timer);
 	dev_dbg(&link->dev,
 	      "ray_cs dl_startup_params started timer for verify_dl_startup\n");
@@ -1947,12 +1947,12 @@ static irqreturn_t ray_interrupt(int irq, void *dev_id)
 					dev_dbg(&link->dev,
 					      "ray_cs interrupt network \"%s\" start failed\n",
 					      memtmp);
-					local->timer.function = (TIMER_FUNC_TYPE)start_net;
+					local->timer.function = start_net;
 				} else {
 					dev_dbg(&link->dev,
 					      "ray_cs interrupt network \"%s\" join failed\n",
 					      memtmp);
-					local->timer.function = (TIMER_FUNC_TYPE)join_net;
+					local->timer.function = join_net;
 				}
 				add_timer(&local->timer);
 			}
@@ -2417,9 +2417,9 @@ static void authenticate(ray_dev_t *local)
 
 	del_timer(&local->timer);
 	if (build_auth_frame(local, local->bss_id, OPEN_AUTH_REQUEST)) {
-		local->timer.function = (TIMER_FUNC_TYPE)join_net;
+		local->timer.function = join_net;
 	} else {
-		local->timer.function = (TIMER_FUNC_TYPE)authenticate_timeout;
+		local->timer.function = authenticate_timeout;
 	}
 	local->timer.expires = jiffies + HZ * 2;
 	add_timer(&local->timer);
@@ -2502,7 +2502,7 @@ static void associate(ray_dev_t *local)
 
 		del_timer(&local->timer);
 		local->timer.expires = jiffies + HZ * 2;
-		local->timer.function = (TIMER_FUNC_TYPE)join_net;
+		local->timer.function = join_net;
 		add_timer(&local->timer);
 		local->card_status = CARD_ASSOC_FAILED;
 		return;
diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
index 9b4c61c1e309..e4e2df7a478e 100644
--- a/drivers/s390/char/sclp.c
+++ b/drivers/s390/char/sclp.c
@@ -158,7 +158,7 @@ static inline void
 __sclp_set_request_timer(unsigned long time, void (*cb)(struct timer_list *))
 {
 	del_timer(&sclp_request_timer);
-	sclp_request_timer.function = (TIMER_FUNC_TYPE)cb;
+	sclp_request_timer.function = cb;
 	sclp_request_timer.expires = jiffies + time;
 	add_timer(&sclp_request_timer);
 }
@@ -566,7 +566,7 @@ sclp_sync_wait(void)
 		if (timer_pending(&sclp_request_timer) &&
 		    get_tod_clock_fast() > timeout &&
 		    del_timer(&sclp_request_timer))
-			sclp_request_timer.function((TIMER_DATA_TYPE)&sclp_request_timer);
+			sclp_request_timer.function(&sclp_request_timer);
 		cpu_relax();
 	}
 	local_irq_disable();
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index 51b81c0a0652..b12cb81ad8a2 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -34,7 +34,7 @@ static void zfcp_fsf_request_timeout_handler(struct timer_list *t)
 static void zfcp_fsf_start_timer(struct zfcp_fsf_req *fsf_req,
 				 unsigned long timeout)
 {
-	fsf_req->timer.function = (TIMER_FUNC_TYPE)zfcp_fsf_request_timeout_handler;
+	fsf_req->timer.function = zfcp_fsf_request_timeout_handler;
 	fsf_req->timer.expires = jiffies + timeout;
 	add_timer(&fsf_req->timer);
 }
@@ -42,7 +42,7 @@ static void zfcp_fsf_start_timer(struct zfcp_fsf_req *fsf_req,
 static void zfcp_fsf_start_erp_timer(struct zfcp_fsf_req *fsf_req)
 {
 	BUG_ON(!fsf_req->erp_action);
-	fsf_req->timer.function = (TIMER_FUNC_TYPE)zfcp_erp_timeout_handler;
+	fsf_req->timer.function = zfcp_erp_timeout_handler;
 	fsf_req->timer.expires = jiffies + 30 * HZ;
 	add_timer(&fsf_req->timer);
 }
diff --git a/drivers/scsi/aic94xx/aic94xx_hwi.c b/drivers/scsi/aic94xx/aic94xx_hwi.c
index 5402b85b0bdc..2dbc8330d7d3 100644
--- a/drivers/scsi/aic94xx/aic94xx_hwi.c
+++ b/drivers/scsi/aic94xx/aic94xx_hwi.c
@@ -1175,7 +1175,7 @@ static void asd_start_scb_timers(struct list_head *list)
 	struct asd_ascb *ascb;
 	list_for_each_entry(ascb, list, list) {
 		if (!ascb->uldd_timer) {
-			ascb->timer.function = (TIMER_FUNC_TYPE)asd_ascb_timedout;
+			ascb->timer.function = asd_ascb_timedout;
 			ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT;
 			add_timer(&ascb->timer);
 		}
diff --git a/drivers/scsi/aic94xx/aic94xx_tmf.c b/drivers/scsi/aic94xx/aic94xx_tmf.c
index 4637119c09d8..2a01702d5ba7 100644
--- a/drivers/scsi/aic94xx/aic94xx_tmf.c
+++ b/drivers/scsi/aic94xx/aic94xx_tmf.c
@@ -42,7 +42,7 @@ static int asd_enqueue_internal(struct asd_ascb *ascb,
 	ascb->tasklet_complete = tasklet_complete;
 	ascb->uldd_timer = 1;
 
-	ascb->timer.function = (TIMER_FUNC_TYPE)timed_out;
+	ascb->timer.function = timed_out;
 	ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT;
 
 	add_timer(&ascb->timer);
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index be96aa1e5077..b3cfdd5f4d1c 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -5279,7 +5279,7 @@ static void beiscsi_hw_health_check(struct timer_list *t)
 		if (!test_bit(BEISCSI_HBA_UER_SUPP, &phba->state))
 			return;
 		/* modify this timer to check TPE */
-		phba->hw_check.function = (TIMER_FUNC_TYPE)beiscsi_hw_tpe_check;
+		phba->hw_check.function = beiscsi_hw_tpe_check;
 	}
 
 	mod_timer(&phba->hw_check,
@@ -5367,7 +5367,7 @@ static int beiscsi_enable_port(struct beiscsi_hba *phba)
 	 * Timer function gets modified for TPE detection.
 	 * Always reinit to do health check first.
 	 */
-	phba->hw_check.function = (TIMER_FUNC_TYPE)beiscsi_hw_health_check;
+	phba->hw_check.function = beiscsi_hw_health_check;
 	mod_timer(&phba->hw_check,
 		  jiffies + msecs_to_jiffies(BEISCSI_UE_DETECT_INTERVAL));
 	return 0;
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
index babd79361a46..bf07735275a4 100644
--- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
@@ -586,8 +586,8 @@ static int do_act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 	cxgbi_sock_get(csk);
 	spin_lock_bh(&csk->lock);
 	if (rpl->status == CPL_ERR_CONN_EXIST &&
-	    csk->retry_timer.function != (TIMER_FUNC_TYPE)act_open_retry_timer) {
-		csk->retry_timer.function = (TIMER_FUNC_TYPE)act_open_retry_timer;
+	    csk->retry_timer.function != act_open_retry_timer) {
+		csk->retry_timer.function = act_open_retry_timer;
 		mod_timer(&csk->retry_timer, jiffies + HZ / 2);
 	} else
 		cxgbi_sock_fail_act_open(csk,
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index 266eddf17a99..406e94312d4e 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -963,8 +963,8 @@ static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb)
 	spin_lock_bh(&csk->lock);
 
 	if (status == CPL_ERR_CONN_EXIST &&
-	    csk->retry_timer.function != (TIMER_FUNC_TYPE)csk_act_open_retry_timer) {
-		csk->retry_timer.function = (TIMER_FUNC_TYPE)csk_act_open_retry_timer;
+	    csk->retry_timer.function != csk_act_open_retry_timer) {
+		csk->retry_timer.function = csk_act_open_retry_timer;
 		mod_timer(&csk->retry_timer, jiffies + HZ / 2);
 	} else
 		cxgbi_sock_fail_act_open(csk,
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c
index 61a85ff8e459..5f503cb09508 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_main.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_main.c
@@ -839,7 +839,7 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device,
 		}
 		task->task_done = hisi_sas_task_done;
 
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)hisi_sas_tmf_timedout;
+		task->slow_task->timer.function = hisi_sas_tmf_timedout;
 		task->slow_task->timer.expires = jiffies + TASK_TIMEOUT*HZ;
 		add_timer(&task->slow_task->timer);
 
@@ -1451,7 +1451,7 @@ hisi_sas_internal_task_abort(struct hisi_hba *hisi_hba,
 	task->dev = device;
 	task->task_proto = device->tproto;
 	task->task_done = hisi_sas_task_done;
-	task->slow_task->timer.function = (TIMER_FUNC_TYPE)hisi_sas_tmf_timedout;
+	task->slow_task->timer.function = hisi_sas_tmf_timedout;
 	task->slow_task->timer.expires = jiffies + msecs_to_jiffies(110);
 	add_timer(&task->slow_task->timer);
 
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
index d02c2a791981..5d3467fd728d 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
@@ -1268,7 +1268,7 @@ static void link_timeout_enable_link(struct timer_list *t)
 		}
 	}
 
-	hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_disable_link;
+	hisi_hba->timer.function = link_timeout_disable_link;
 	mod_timer(&hisi_hba->timer, jiffies + msecs_to_jiffies(900));
 }
 
@@ -1289,13 +1289,13 @@ static void link_timeout_disable_link(struct timer_list *t)
 		}
 	}
 
-	hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_enable_link;
+	hisi_hba->timer.function = link_timeout_enable_link;
 	mod_timer(&hisi_hba->timer, jiffies + msecs_to_jiffies(100));
 }
 
 static void set_link_timer_quirk(struct hisi_hba *hisi_hba)
 {
-	hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_disable_link;
+	hisi_hba->timer.function = link_timeout_disable_link;
 	hisi_hba->timer.expires = jiffies + msecs_to_jiffies(1000);
 	add_timer(&hisi_hba->timer);
 }
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index d53429371127..cc0187965eee 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -997,7 +997,7 @@ static void ipr_do_req(struct ipr_cmnd *ipr_cmd,
 	ipr_cmd->done = done;
 
 	ipr_cmd->timer.expires = jiffies + timeout;
-	ipr_cmd->timer.function = (TIMER_FUNC_TYPE)timeout_func;
+	ipr_cmd->timer.function = timeout_func;
 
 	add_timer(&ipr_cmd->timer);
 
@@ -8312,7 +8312,7 @@ static void ipr_reset_start_timer(struct ipr_cmnd *ipr_cmd,
 	ipr_cmd->done = ipr_reset_ioa_job;
 
 	ipr_cmd->timer.expires = jiffies + timeout;
-	ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_reset_timer_done;
+	ipr_cmd->timer.function = ipr_reset_timer_done;
 	add_timer(&ipr_cmd->timer);
 }
 
@@ -8397,7 +8397,7 @@ static int ipr_reset_next_stage(struct ipr_cmnd *ipr_cmd)
 	}
 
 	ipr_cmd->timer.expires = jiffies + stage_time * HZ;
-	ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_oper_timeout;
+	ipr_cmd->timer.function = ipr_oper_timeout;
 	ipr_cmd->done = ipr_reset_ioa_job;
 	add_timer(&ipr_cmd->timer);
 
@@ -8468,7 +8468,7 @@ static int ipr_reset_enable_ioa(struct ipr_cmnd *ipr_cmd)
 	}
 
 	ipr_cmd->timer.expires = jiffies + (ioa_cfg->transop_timeout * HZ);
-	ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_oper_timeout;
+	ipr_cmd->timer.function = ipr_oper_timeout;
 	ipr_cmd->done = ipr_reset_ioa_job;
 	add_timer(&ipr_cmd->timer);
 	list_add_tail(&ipr_cmd->queue, &ipr_cmd->hrrq->hrrq_pending_q);
diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c
index 1a4e701a8449..4fae253d4f3d 100644
--- a/drivers/scsi/libfc/fc_fcp.c
+++ b/drivers/scsi/libfc/fc_fcp.c
@@ -1214,7 +1214,7 @@ static int fc_fcp_cmd_send(struct fc_lport *lport, struct fc_fcp_pkt *fsp,
 	fsp->seq_ptr = seq;
 	fc_fcp_pkt_hold(fsp);	/* hold for fc_fcp_pkt_destroy */
 
-	fsp->timer.function = (TIMER_FUNC_TYPE)fc_fcp_timeout;
+	fsp->timer.function = fc_fcp_timeout;
 	if (rpriv->flags & FC_RP_FLAGS_REC_SUPPORTED)
 		fc_fcp_timer_set(fsp, get_fsp_rec_tov(fsp));
 
@@ -1307,7 +1307,7 @@ static void fc_lun_reset_send(struct timer_list *t)
 			return;
 		if (fc_fcp_lock_pkt(fsp))
 			return;
-		fsp->timer.function = (TIMER_FUNC_TYPE)fc_lun_reset_send;
+		fsp->timer.function = fc_lun_reset_send;
 		fc_fcp_timer_set(fsp, get_fsp_rec_tov(fsp));
 		fc_fcp_unlock_pkt(fsp);
 	}
@@ -1445,7 +1445,7 @@ static void fc_fcp_timeout(struct timer_list *t)
 	if (fsp->lp->qfull) {
 		FC_FCP_DBG(fsp, "fcp timeout, resetting timer delay %d\n",
 			   fsp->timer_delay);
-		fsp->timer.function = (TIMER_FUNC_TYPE)fc_fcp_timeout;
+		fsp->timer.function = fc_fcp_timeout;
 		fc_fcp_timer_set(fsp, fsp->timer_delay);
 		goto unlock;
 	}
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 174e5eff6155..ca1566237ae7 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -92,7 +92,7 @@ static int smp_execute_task_sg(struct domain_device *dev,
 
 		task->task_done = smp_task_done;
 
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)smp_task_timedout;
+		task->slow_task->timer.function = smp_task_timedout;
 		task->slow_task->timer.expires = jiffies + SMP_TIMEOUT*HZ;
 		add_timer(&task->slow_task->timer);
 
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 91795eb56206..58476b728c57 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -919,7 +919,7 @@ void sas_task_abort(struct sas_task *task)
 			return;
 		if (!del_timer(&slow->timer))
 			return;
-		slow->timer.function((TIMER_DATA_TYPE)&slow->timer);
+		slow->timer.function(&slow->timer);
 		return;
 	}
 
diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c
index cff1c37b8d2e..cff43bd9f675 100644
--- a/drivers/scsi/mvsas/mv_sas.c
+++ b/drivers/scsi/mvsas/mv_sas.c
@@ -1310,7 +1310,7 @@ static int mvs_exec_internal_tmf_task(struct domain_device *dev,
 		memcpy(&task->ssp_task, parameter, para_len);
 		task->task_done = mvs_task_done;
 
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)mvs_tmf_timedout;
+		task->slow_task->timer.function = mvs_tmf_timedout;
 		task->slow_task->timer.expires = jiffies + MVS_TASK_TIMEOUT*HZ;
 		add_timer(&task->slow_task->timer);
 
@@ -2020,7 +2020,7 @@ void mvs_int_port(struct mvs_info *mvi, int phy_no, u32 events)
 		MVS_CHIP_DISP->write_port_irq_mask(mvi, phy_no,
 					tmp | PHYEV_SIG_FIS);
 		if (phy->timer.function == NULL) {
-			phy->timer.function = (TIMER_FUNC_TYPE)mvs_sig_time_out;
+			phy->timer.function = mvs_sig_time_out;
 			phy->timer.expires = jiffies + 5*HZ;
 			add_timer(&phy->timer);
 		}
diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index 0e294e80c169..947d6017d004 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -695,7 +695,7 @@ static int pm8001_exec_internal_tmf_task(struct domain_device *dev,
 		task->task_proto = dev->tproto;
 		memcpy(&task->ssp_task, parameter, para_len);
 		task->task_done = pm8001_task_done;
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)pm8001_tmf_timedout;
+		task->slow_task->timer.function = pm8001_tmf_timedout;
 		task->slow_task->timer.expires = jiffies + PM8001_TASK_TIMEOUT*HZ;
 		add_timer(&task->slow_task->timer);
 
@@ -781,7 +781,7 @@ pm8001_exec_internal_task_abort(struct pm8001_hba_info *pm8001_ha,
 		task->dev = dev;
 		task->task_proto = dev->tproto;
 		task->task_done = pm8001_task_done;
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)pm8001_tmf_timedout;
+		task->slow_task->timer.function = pm8001_tmf_timedout;
 		task->slow_task->timer.expires = jiffies + PM8001_TASK_TIMEOUT * HZ;
 		add_timer(&task->slow_task->timer);
 
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index 4f9f115fb6a0..e58be98430b0 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -604,7 +604,7 @@ static void pmcraid_start_bist(struct pmcraid_cmd *cmd)
 
 	cmd->time_left = msecs_to_jiffies(PMCRAID_BIST_TIMEOUT);
 	cmd->timer.expires = jiffies + msecs_to_jiffies(PMCRAID_BIST_TIMEOUT);
-	cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_bist_done;
+	cmd->timer.function = pmcraid_bist_done;
 	add_timer(&cmd->timer);
 }
 
@@ -636,7 +636,7 @@ static void pmcraid_reset_alert_done(struct timer_list *t)
 		/* restart timer if some more time is available to wait */
 		cmd->time_left -= PMCRAID_CHECK_FOR_RESET_TIMEOUT;
 		cmd->timer.expires = jiffies + PMCRAID_CHECK_FOR_RESET_TIMEOUT;
-		cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_reset_alert_done;
+		cmd->timer.function = pmcraid_reset_alert_done;
 		add_timer(&cmd->timer);
 	}
 }
@@ -673,7 +673,7 @@ static void pmcraid_reset_alert(struct pmcraid_cmd *cmd)
 		 */
 		cmd->time_left = PMCRAID_RESET_TIMEOUT;
 		cmd->timer.expires = jiffies + PMCRAID_CHECK_FOR_RESET_TIMEOUT;
-		cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_reset_alert_done;
+		cmd->timer.function = pmcraid_reset_alert_done;
 		add_timer(&cmd->timer);
 
 		iowrite32(DOORBELL_IOA_RESET_ALERT,
@@ -923,7 +923,7 @@ static void pmcraid_send_cmd(
 	if (timeout_func) {
 		/* setup timeout handler */
 		cmd->timer.expires = jiffies + timeout;
-		cmd->timer.function = (TIMER_FUNC_TYPE)timeout_func;
+		cmd->timer.function = timeout_func;
 		add_timer(&cmd->timer);
 	}
 
@@ -1951,7 +1951,7 @@ static void pmcraid_soft_reset(struct pmcraid_cmd *cmd)
 	cmd->cmd_done = pmcraid_ioa_reset;
 	cmd->timer.expires = jiffies +
 			     msecs_to_jiffies(PMCRAID_TRANSOP_TIMEOUT);
-	cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_timeout_handler;
+	cmd->timer.function = pmcraid_timeout_handler;
 
 	if (!timer_pending(&cmd->timer))
 		add_timer(&cmd->timer);
diff --git a/drivers/staging/irda/include/net/irda/timer.h b/drivers/staging/irda/include/net/irda/timer.h
index a6635f0afae9..6dab15f5dae1 100644
--- a/drivers/staging/irda/include/net/irda/timer.h
+++ b/drivers/staging/irda/include/net/irda/timer.h
@@ -75,7 +75,7 @@ struct lap_cb;
 static inline void irda_start_timer(struct timer_list *ptimer, int timeout,
 				    void (*callback)(struct timer_list *))
 {
-	ptimer->function = (TIMER_FUNC_TYPE) callback;
+	ptimer->function =  callback;
 
 	/* Set new value for timer (update or add timer).
 	 * We use mod_timer() because it's more efficient and also
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index d64afdd93872..9342fc2ee7df 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -325,7 +325,7 @@ static int univ8250_setup_irq(struct uart_8250_port *up)
 	if (up->bugs & UART_BUG_THRE) {
 		pr_debug("ttyS%d - using backup timer\n", serial_index(port));
 
-		up->timer.function = (TIMER_FUNC_TYPE)serial8250_backup_timeout;
+		up->timer.function = serial8250_backup_timeout;
 		mod_timer(&up->timer, jiffies +
 			  uart_poll_timeout(port) + HZ / 5);
 	}
@@ -348,7 +348,7 @@ static void univ8250_release_irq(struct uart_8250_port *up)
 	struct uart_port *port = &up->port;
 
 	del_timer_sync(&up->timer);
-	up->timer.function = (TIMER_FUNC_TYPE)serial8250_timeout;
+	up->timer.function = serial8250_timeout;
 	if (port->irq)
 		serial_unlink_irq_chain(up);
 }
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index dc850d257ea2..c1961761311d 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -118,7 +118,7 @@ struct kthread_delayed_work {
 
 #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) {				\
 	.work = KTHREAD_WORK_INIT((dwork).work, (fn)),			\
-	.timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\
+	.timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn,\
 				     TIMER_IRQSAFE),			\
 	}
 
diff --git a/include/linux/timer.h b/include/linux/timer.h
index b1ae64b112c2..04af640ea95b 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -63,9 +63,6 @@ struct timer_list {
 
 #define TIMER_TRACE_FLAGMASK	(TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
 
-#define TIMER_DATA_TYPE		struct timer_list *
-#define TIMER_FUNC_TYPE		void (*)(TIMER_DATA_TYPE)
-
 #define __TIMER_INITIALIZER(_function, _flags) {		\
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
@@ -76,7 +73,7 @@ struct timer_list {
 
 #define DEFINE_TIMER(_name, _function)				\
 	struct timer_list _name =				\
-		__TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0)
+		__TIMER_INITIALIZER(_function, 0)
 
 /*
  * LOCKDEP and DEBUG timer interfaces.
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bff39faba793..4a54ef96aff5 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -176,7 +176,7 @@ struct execute_work {
 
 #define __DELAYED_WORK_INITIALIZER(n, f, tflags) {			\
 	.work = __WORK_INITIALIZER((n).work, (f)),			\
-	.timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)delayed_work_timer_fn,\
+	.timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\
 				     (tflags) | TIMER_IRQSAFE),		\
 	}
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8af313081b0d..cd50e99202b0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -843,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
 	struct timer_list *timer = &dwork->timer;
 	struct kthread_work *work = &dwork->work;
 
-	WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
+	WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
 
 	/*
 	 * If @delay is 0, queue @dwork->work immediately.  This is for
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dde6298f6b22..8fdb710bfdd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	struct work_struct *work = &dwork->work;
 
 	WARN_ON_ONCE(!wq);
-	WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
+	WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
 	WARN_ON_ONCE(timer_pending(timer));
 	WARN_ON_ONCE(!list_empty(&work->entry));
 
diff --git a/net/atm/lec.c b/net/atm/lec.c
index c976196da3ea..6676e3433261 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1798,7 +1798,7 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
 		else
 			send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL);
 		entry->timer.expires = jiffies + (1 * HZ);
-		entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_arp;
+		entry->timer.function = lec_arp_expire_arp;
 		add_timer(&entry->timer);
 		found = priv->mcast_vcc;
 	}
@@ -1998,7 +1998,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
 		entry->old_recv_push = old_push;
 		entry->status = ESI_UNKNOWN;
 		entry->timer.expires = jiffies + priv->vcc_timeout_period;
-		entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc;
+		entry->timer.function = lec_arp_expire_vcc;
 		hlist_add_head(&entry->next, &priv->lec_no_forward);
 		add_timer(&entry->timer);
 		dump_arp_table(priv);
@@ -2082,7 +2082,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
 	entry->status = ESI_UNKNOWN;
 	hlist_add_head(&entry->next, &priv->lec_arp_empty_ones);
 	entry->timer.expires = jiffies + priv->vcc_timeout_period;
-	entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc;
+	entry->timer.function = lec_arp_expire_vcc;
 	add_timer(&entry->timer);
 	pr_debug("After vcc was added\n");
 	dump_arp_table(priv);
diff --git a/net/can/proc.c b/net/can/proc.c
index d979b3dc49a6..0c59f876fe6f 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -221,7 +221,7 @@ static int can_stats_proc_show(struct seq_file *m, void *v)
 
 	seq_putc(m, '\n');
 
-	if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) {
+	if (net->can.can_stattimer.function == can_stat_update) {
 		seq_printf(m, " %8ld %% total match ratio (RXMR)\n",
 				can_stats->total_rx_match_ratio);
 
@@ -291,7 +291,7 @@ static int can_reset_stats_proc_show(struct seq_file *m, void *v)
 
 	user_reset = 1;
 
-	if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) {
+	if (net->can.can_stattimer.function == can_stat_update) {
 		seq_printf(m, "Scheduled statistic reset #%ld.\n",
 				can_pstats->stats_reset + 1);
 	} else {
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 8bb469cb3abe..5d4ae01951b5 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -42,7 +42,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb)
 {
 	del_timer(&lapb->t1timer);
 
-	lapb->t1timer.function = (TIMER_FUNC_TYPE)lapb_t1timer_expiry;
+	lapb->t1timer.function = lapb_t1timer_expiry;
 	lapb->t1timer.expires  = jiffies + lapb->t1;
 
 	add_timer(&lapb->t1timer);
@@ -52,7 +52,7 @@ void lapb_start_t2timer(struct lapb_cb *lapb)
 {
 	del_timer(&lapb->t2timer);
 
-	lapb->t2timer.function = (TIMER_FUNC_TYPE)lapb_t2timer_expiry;
+	lapb->t2timer.function = lapb_t2timer_expiry;
 	lapb->t2timer.expires  = jiffies + lapb->t2;
 
 	add_timer(&lapb->t2timer);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 2dec3583c97d..7ed9d4422a73 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -284,7 +284,7 @@ void nr_destroy_socket(struct sock *sk)
 
 	if (sk_has_allocations(sk)) {
 		/* Defer: outstanding buffers */
-		sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_destroy_timer;
+		sk->sk_timer.function = nr_destroy_timer;
 		sk->sk_timer.expires  = jiffies + 2 * HZ;
 		add_timer(&sk->sk_timer);
 	} else
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index 43569aea0f5e..cbd51ed5a2d7 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -45,7 +45,7 @@ void nr_init_timers(struct sock *sk)
 	timer_setup(&nr->idletimer, nr_idletimer_expiry, 0);
 
 	/* initialized by sock_init_data */
-	sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_heartbeat_expiry;
+	sk->sk_timer.function = nr_heartbeat_expiry;
 }
 
 void nr_start_t1timer(struct sock *sk)
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
index cda4c6678ef1..62055d3069d2 100644
--- a/net/rose/rose_link.c
+++ b/net/rose/rose_link.c
@@ -37,7 +37,7 @@ void rose_start_ftimer(struct rose_neigh *neigh)
 {
 	del_timer(&neigh->ftimer);
 
-	neigh->ftimer.function = (TIMER_FUNC_TYPE)rose_ftimer_expiry;
+	neigh->ftimer.function = rose_ftimer_expiry;
 	neigh->ftimer.expires  =
 		jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout);
 
@@ -48,7 +48,7 @@ static void rose_start_t0timer(struct rose_neigh *neigh)
 {
 	del_timer(&neigh->t0timer);
 
-	neigh->t0timer.function = (TIMER_FUNC_TYPE)rose_t0timer_expiry;
+	neigh->t0timer.function = rose_t0timer_expiry;
 	neigh->t0timer.expires  =
 		jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout);
 
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index ea613b2a9735..74555fb95615 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -36,7 +36,7 @@ void rose_start_heartbeat(struct sock *sk)
 {
 	del_timer(&sk->sk_timer);
 
-	sk->sk_timer.function = (TIMER_FUNC_TYPE)rose_heartbeat_expiry;
+	sk->sk_timer.function = rose_heartbeat_expiry;
 	sk->sk_timer.expires  = jiffies + 5 * HZ;
 
 	add_timer(&sk->sk_timer);
@@ -48,7 +48,7 @@ void rose_start_t1timer(struct sock *sk)
 
 	del_timer(&rose->timer);
 
-	rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry;
+	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t1;
 
 	add_timer(&rose->timer);
@@ -60,7 +60,7 @@ void rose_start_t2timer(struct sock *sk)
 
 	del_timer(&rose->timer);
 
-	rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry;
+	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t2;
 
 	add_timer(&rose->timer);
@@ -72,7 +72,7 @@ void rose_start_t3timer(struct sock *sk)
 
 	del_timer(&rose->timer);
 
-	rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry;
+	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t3;
 
 	add_timer(&rose->timer);
@@ -84,7 +84,7 @@ void rose_start_hbtimer(struct sock *sk)
 
 	del_timer(&rose->timer);
 
-	rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry;
+	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->hb;
 
 	add_timer(&rose->timer);
@@ -97,7 +97,7 @@ void rose_start_idletimer(struct sock *sk)
 	del_timer(&rose->idletimer);
 
 	if (rose->idle > 0) {
-		rose->idletimer.function = (TIMER_FUNC_TYPE)rose_idletimer_expiry;
+		rose->idletimer.function = rose_idletimer_expiry;
 		rose->idletimer.expires  = jiffies + rose->idle;
 
 		add_timer(&rose->idletimer);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e8e0831229cf..f9307bd6644b 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -745,7 +745,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt
 	serv->sv_tmpcnt++;
 	if (serv->sv_temptimer.function == NULL) {
 		/* setup timer to age temp transports */
-		serv->sv_temptimer.function = (TIMER_FUNC_TYPE)svc_age_temp_xprts;
+		serv->sv_temptimer.function = svc_age_temp_xprts;
 		mod_timer(&serv->sv_temptimer,
 			  jiffies + svc_conn_age_period * HZ);
 	}
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index ea87143314f3..562cc11131f6 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -415,7 +415,7 @@ static void __x25_destroy_socket(struct sock *sk)
 	if (sk_has_allocations(sk)) {
 		/* Defer: outstanding buffers */
 		sk->sk_timer.expires  = jiffies + 10 * HZ;
-		sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_destroy_timer;
+		sk->sk_timer.function = x25_destroy_timer;
 		add_timer(&sk->sk_timer);
 	} else {
 		/* drop last reference so sock_put will free */
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
index 1dfba3c23459..fa3461002b3e 100644
--- a/net/x25/x25_timer.c
+++ b/net/x25/x25_timer.c
@@ -36,7 +36,7 @@ void x25_init_timers(struct sock *sk)
 	timer_setup(&x25->timer, x25_timer_expiry, 0);
 
 	/* initialized by sock_init_data */
-	sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_heartbeat_expiry;
+	sk->sk_timer.function = x25_heartbeat_expiry;
 }
 
 void x25_start_heartbeat(struct sock *sk)
diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c
index 4f9613e5fc9e..c1376bfdc90b 100644
--- a/sound/usb/line6/driver.c
+++ b/sound/usb/line6/driver.c
@@ -201,7 +201,7 @@ static int line6_send_raw_message_async_part(struct message *msg,
 void line6_start_timer(struct timer_list *timer, unsigned long msecs,
 		       void (*function)(struct timer_list *t))
 {
-	timer->function = (TIMER_FUNC_TYPE)function;
+	timer->function = function;
 	mod_timer(timer, jiffies + msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL_GPL(line6_start_timer);
-- 
cgit v1.2.3