From 93aa6c7bbc4124c8361c26a8b2c5c40afb185619 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 3 Jul 2015 09:28:09 -0400
Subject: SUNRPC: Don't reencode message if transmission failed with ENOBUFS

If we're running out of buffer memory when transmitting data, then
we want to just delay for a moment, and then continue transmitting
the remainder of the message.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/clnt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index cbc6af923dd1..23608eb0ded2 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1902,6 +1902,7 @@ call_transmit_status(struct rpc_task *task)
 
 	switch (task->tk_status) {
 	case -EAGAIN:
+	case -ENOBUFS:
 		break;
 	default:
 		dprint_status(task);
@@ -1928,7 +1929,6 @@ call_transmit_status(struct rpc_task *task)
 	case -ECONNABORTED:
 	case -EADDRINUSE:
 	case -ENOTCONN:
-	case -ENOBUFS:
 	case -EPIPE:
 		rpc_task_force_reencode(task);
 	}
@@ -2057,12 +2057,13 @@ call_status(struct rpc_task *task)
 	case -ECONNABORTED:
 		rpc_force_rebind(clnt);
 	case -EADDRINUSE:
-	case -ENOBUFS:
 		rpc_delay(task, 3*HZ);
 	case -EPIPE:
 	case -ENOTCONN:
 		task->tk_action = call_bind;
 		break;
+	case -ENOBUFS:
+		rpc_delay(task, HZ>>2);
 	case -EAGAIN:
 		task->tk_action = call_transmit;
 		break;
-- 
cgit v1.2.3


From b5872f0c67edf3714dd46f04d73c3644f3addaf9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 3 Jul 2015 09:32:23 -0400
Subject: SUNRPC: Don't confuse ENOBUFS with a write_space issue

ENOBUFS means that memory allocations are failing due to an actual
low memory situation. It should not be confused with being out of
socket buffer space.

Handle the problem by just punting to the delay in call_status.

Reported-by: Neil Brown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index ee359fc7af16..44c1927b68c7 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -539,6 +539,7 @@ static int xs_local_send_request(struct rpc_task *task)
 
 	switch (status) {
 	case -ENOBUFS:
+		break;
 	case -EAGAIN:
 		status = xs_nospace(task);
 		break;
@@ -692,7 +693,6 @@ static int xs_tcp_send_request(struct rpc_task *task)
 		status = -ENOTCONN;
 		/* Should we call xs_close() here? */
 		break;
-	case -ENOBUFS:
 	case -EAGAIN:
 		status = xs_nospace(task);
 		break;
@@ -703,6 +703,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	case -ECONNREFUSED:
 	case -ENOTCONN:
 	case -EADDRINUSE:
+	case -ENOBUFS:
 	case -EPIPE:
 		clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
 	}
-- 
cgit v1.2.3


From 95dd8653de658143770cb0e55a58d2aab97c79d2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 9 Jul 2015 22:56:00 +0200
Subject: netfilter: ctnetlink: put back references to master ct and expect
 objects

We have to put back the references to the master conntrack and the expectation
that we just created, otherwise we'll leak them.

Fixes: 0ef71ee1a5b9 ("netfilter: ctnetlink: refactor ctnetlink_create_expect")
Reported-by: Tim Wiess <Tim.Wiess@watchguard.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index d1c23940a86a..6b8b0abbfab4 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2995,11 +2995,6 @@ ctnetlink_create_expect(struct net *net, u16 zone,
 	}
 
 	err = nf_ct_expect_related_report(exp, portid, report);
-	if (err < 0)
-		goto err_exp;
-
-	return 0;
-err_exp:
 	nf_ct_expect_put(exp);
 err_ct:
 	nf_ct_put(ct);
-- 
cgit v1.2.3


From 484836ec2de24d9a7c6471f022b746d947698725 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor@google.com>
Date: Thu, 9 Jul 2015 17:15:01 -0700
Subject: netfilter: IDLETIMER: fix lockdep warning

Dynamically allocated sysfs attributes should be initialized with
sysfs_attr_init() otherwise lockdep will be angry with us:

[   45.468653] BUG: key ffffffc030fad4e0 not in .data!
[   45.468655] ------------[ cut here ]------------
[   45.468666] WARNING: CPU: 0 PID: 1176 at /mnt/host/source/src/third_party/kernel/v3.18/kernel/locking/lockdep.c:2991 lockdep_init_map+0x12c/0x490()
[   45.468672] DEBUG_LOCKS_WARN_ON(1)
[   45.468672] CPU: 0 PID: 1176 Comm: iptables Tainted: G     U  W 3.18.0 #43
[   45.468674] Hardware name: XXX
[   45.468675] Call trace:
[   45.468680] [<ffffffc0002072b4>] dump_backtrace+0x0/0x10c
[   45.468683] [<ffffffc0002073d0>] show_stack+0x10/0x1c
[   45.468688] [<ffffffc000a86cd4>] dump_stack+0x74/0x94
[   45.468692] [<ffffffc000217ae0>] warn_slowpath_common+0x84/0xb0
[   45.468694] [<ffffffc000217b84>] warn_slowpath_fmt+0x4c/0x58
[   45.468697] [<ffffffc0002530a4>] lockdep_init_map+0x128/0x490
[   45.468701] [<ffffffc000367ef0>] __kernfs_create_file+0x80/0xe4
[   45.468704] [<ffffffc00036862c>] sysfs_add_file_mode_ns+0x104/0x170
[   45.468706] [<ffffffc00036870c>] sysfs_create_file_ns+0x58/0x64
[   45.468711] [<ffffffc000930430>] idletimer_tg_checkentry+0x14c/0x324
[   45.468714] [<ffffffc00092a728>] xt_check_target+0x170/0x198
[   45.468717] [<ffffffc000993efc>] check_target+0x58/0x6c
[   45.468720] [<ffffffc000994c64>] translate_table+0x30c/0x424
[   45.468723] [<ffffffc00099529c>] do_ipt_set_ctl+0x144/0x1d0
[   45.468728] [<ffffffc0009079f0>] nf_setsockopt+0x50/0x60
[   45.468732] [<ffffffc000946870>] ip_setsockopt+0x8c/0xb4
[   45.468735] [<ffffffc0009661c0>] raw_setsockopt+0x10/0x50
[   45.468739] [<ffffffc0008c1550>] sock_common_setsockopt+0x14/0x20
[   45.468742] [<ffffffc0008bd190>] SyS_setsockopt+0x88/0xb8
[   45.468744] ---[ end trace 41d156354d18c039 ]---

Signed-off-by: Dmitry Torokhov <dtor@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_IDLETIMER.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index f407ebc13481..29d2c31f406c 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -126,6 +126,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
 		goto out;
 	}
 
+	sysfs_attr_init(&info->timer->attr.attr);
 	info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
 	if (!info->timer->attr.attr.name) {
 		ret = -ENOMEM;
-- 
cgit v1.2.3


From ea52bf8eda9832ad30e9f059c5ead8d44f882a53 Mon Sep 17 00:00:00 2001
From: Pierre Morel <pmorel@linux.vnet.ibm.com>
Date: Thu, 9 Jul 2015 14:58:26 +0200
Subject: 9p/trans_virtio: reset virtio device on remove

On device shutdown/removal, virtio drivers need to trigger a reset on
the device; if this is neglected, the virtio core will complain about
non-zero device status.

This patch resets the status when the 9p virtio driver is removed
from the system by calling vdev->config->reset on the virtio_device
to send a reset to the host virtio device.

Signed-off-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 net/9p/trans_virtio.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 9dd49ca67dbc..6e70ddb158b4 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -704,6 +704,7 @@ static void p9_virtio_remove(struct virtio_device *vdev)
 
 	mutex_unlock(&virtio_9p_lock);
 
+	vdev->config->reset(vdev);
 	vdev->config->del_vqs(vdev);
 
 	sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
-- 
cgit v1.2.3


From 326bf17ea5d4f8f17b54cbf167b8cb504c606ee9 Mon Sep 17 00:00:00 2001
From: Alex Gartrell <agartrell@fb.com>
Date: Fri, 26 Jun 2015 03:18:45 -0700
Subject: ipvs: fix ipv6 route unreach panic

Previously there was a trivial panic

unshare -n /bin/bash <<EOF
ip addr add dev lo face::1/128
ipvsadm -A -t [face::1]:15213
ipvsadm -a -t [face::1]:15213 -r b00c::1
echo boom | nc face::1 15213
EOF

This patch allows us to replicate the net logic above and simply capture
the skb_dst(skb)->dev and use that for the purpose of the invocation.

Signed-off-by: Alex Gartrell <agartrell@fb.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index bf66a8657a5f..b99d80695b1f 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -505,6 +505,13 @@ err_put:
 	return -1;
 
 err_unreach:
+	/* The ip6_link_failure function requires the dev field to be set
+	 * in order to get the net (further for the sake of fwmark
+	 * reflection).
+	 */
+	if (!skb->dev)
+		skb->dev = skb_dst(skb)->dev;
+
 	dst_link_failure(skb);
 	return -1;
 }
-- 
cgit v1.2.3


From 4754957f04f5f368792a0eb7dab0ae89fb93dcfd Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Sat, 27 Jun 2015 14:39:30 +0300
Subject: ipvs: do not use random local source address for tunnels

Michael Vallaly reports about wrong source address used
in rare cases for tunneled traffic. Looks like
__ip_vs_get_out_rt in 3.10+ is providing uninitialized
dest_dst->dst_saddr.ip because ip_vs_dest_dst_alloc uses
kmalloc. While we retry after seeing EINVAL from routing
for data that does not look like valid local address, it
still succeeded when this memory was previously used from
other dests and with different local addresses. As result,
we can use valid local address that is not suitable for
our real server.

Fix it by providing 0.0.0.0 every time our cache is refreshed.
By this way we will get preferred source address from routing.

Reported-by: Michael Vallaly <lvs@nolatency.com>
Fixes: 026ace060dfe ("ipvs: optimize dst usage for real server")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index b99d80695b1f..ec30d68ccc0b 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -130,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
 
 	memset(&fl4, 0, sizeof(fl4));
 	fl4.daddr = daddr;
-	fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
 	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
 			   FLOWI_FLAG_KNOWN_NH : 0;
 
-- 
cgit v1.2.3


From 05f00505a89acd21f5d0d20f5797dfbc4cf85243 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 29 Jun 2015 21:51:40 +0300
Subject: ipvs: fix crash if scheduler is changed

I overlooked the svc->sched_data usage from schedulers
when the services were converted to RCU in 3.10. Now
the rare ipvsadm -E command can change the scheduler
but due to the reverse order of ip_vs_bind_scheduler
and ip_vs_unbind_scheduler we provide new sched_data
to the old scheduler resulting in a crash.

To fix it without changing the scheduler methods we
have to use synchronize_rcu() only for the editing case.
It means all svc->scheduler readers should expect a
NULL value. To avoid breakage for the service listing
and ipvsadm -R we can use the "none" name to indicate
that scheduler is not assigned, a state when we drop
new connections.

Reported-by: Alexander Vasiliev <a.vasylev@404-group.com>
Fixes: ceec4c381681 ("ipvs: convert services to rcu")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c  | 16 +++++++--
 net/netfilter/ipvs/ip_vs_ctl.c   | 78 +++++++++++++++++++++++++---------------
 net/netfilter/ipvs/ip_vs_sched.c | 12 +++----
 3 files changed, 69 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 5d2b806a862e..38fbc194b9cb 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -319,7 +319,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		 * return *ignored=0 i.e. ICMP and NF_DROP
 		 */
 		sched = rcu_dereference(svc->scheduler);
-		dest = sched->schedule(svc, skb, iph);
+		if (sched) {
+			/* read svc->sched_data after svc->scheduler */
+			smp_rmb();
+			dest = sched->schedule(svc, skb, iph);
+		} else {
+			dest = NULL;
+		}
 		if (!dest) {
 			IP_VS_DBG(1, "p-schedule: no dest found.\n");
 			kfree(param.pe_data);
@@ -467,7 +473,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	}
 
 	sched = rcu_dereference(svc->scheduler);
-	dest = sched->schedule(svc, skb, iph);
+	if (sched) {
+		/* read svc->sched_data after svc->scheduler */
+		smp_rmb();
+		dest = sched->schedule(svc, skb, iph);
+	} else {
+		dest = NULL;
+	}
 	if (dest == NULL) {
 		IP_VS_DBG(1, "Schedule: no dest found.\n");
 		return NULL;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 285eae3a1454..24c554201a76 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -842,15 +842,16 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	__ip_vs_dst_cache_reset(dest);
 	spin_unlock_bh(&dest->dst_lock);
 
-	sched = rcu_dereference_protected(svc->scheduler, 1);
 	if (add) {
 		ip_vs_start_estimator(svc->net, &dest->stats);
 		list_add_rcu(&dest->n_list, &svc->destinations);
 		svc->num_dests++;
-		if (sched->add_dest)
+		sched = rcu_dereference_protected(svc->scheduler, 1);
+		if (sched && sched->add_dest)
 			sched->add_dest(svc, dest);
 	} else {
-		if (sched->upd_dest)
+		sched = rcu_dereference_protected(svc->scheduler, 1);
+		if (sched && sched->upd_dest)
 			sched->upd_dest(svc, dest);
 	}
 }
@@ -1084,7 +1085,7 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
 		struct ip_vs_scheduler *sched;
 
 		sched = rcu_dereference_protected(svc->scheduler, 1);
-		if (sched->del_dest)
+		if (sched && sched->del_dest)
 			sched->del_dest(svc, dest);
 	}
 }
@@ -1175,11 +1176,14 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 	ip_vs_use_count_inc();
 
 	/* Lookup the scheduler by 'u->sched_name' */
-	sched = ip_vs_scheduler_get(u->sched_name);
-	if (sched == NULL) {
-		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
-		ret = -ENOENT;
-		goto out_err;
+	if (strcmp(u->sched_name, "none")) {
+		sched = ip_vs_scheduler_get(u->sched_name);
+		if (!sched) {
+			pr_info("Scheduler module ip_vs_%s not found\n",
+				u->sched_name);
+			ret = -ENOENT;
+			goto out_err;
+		}
 	}
 
 	if (u->pe_name && *u->pe_name) {
@@ -1240,10 +1244,12 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 	spin_lock_init(&svc->stats.lock);
 
 	/* Bind the scheduler */
-	ret = ip_vs_bind_scheduler(svc, sched);
-	if (ret)
-		goto out_err;
-	sched = NULL;
+	if (sched) {
+		ret = ip_vs_bind_scheduler(svc, sched);
+		if (ret)
+			goto out_err;
+		sched = NULL;
+	}
 
 	/* Bind the ct retriever */
 	RCU_INIT_POINTER(svc->pe, pe);
@@ -1291,17 +1297,20 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 static int
 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 {
-	struct ip_vs_scheduler *sched, *old_sched;
+	struct ip_vs_scheduler *sched = NULL, *old_sched;
 	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
 	int ret = 0;
 
 	/*
 	 * Lookup the scheduler, by 'u->sched_name'
 	 */
-	sched = ip_vs_scheduler_get(u->sched_name);
-	if (sched == NULL) {
-		pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
-		return -ENOENT;
+	if (strcmp(u->sched_name, "none")) {
+		sched = ip_vs_scheduler_get(u->sched_name);
+		if (!sched) {
+			pr_info("Scheduler module ip_vs_%s not found\n",
+				u->sched_name);
+			return -ENOENT;
+		}
 	}
 	old_sched = sched;
 
@@ -1329,14 +1338,20 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 
 	old_sched = rcu_dereference_protected(svc->scheduler, 1);
 	if (sched != old_sched) {
+		if (old_sched) {
+			ip_vs_unbind_scheduler(svc, old_sched);
+			RCU_INIT_POINTER(svc->scheduler, NULL);
+			/* Wait all svc->sched_data users */
+			synchronize_rcu();
+		}
 		/* Bind the new scheduler */
-		ret = ip_vs_bind_scheduler(svc, sched);
-		if (ret) {
-			old_sched = sched;
-			goto out;
+		if (sched) {
+			ret = ip_vs_bind_scheduler(svc, sched);
+			if (ret) {
+				ip_vs_scheduler_put(sched);
+				goto out;
+			}
 		}
-		/* Unbind the old scheduler on success */
-		ip_vs_unbind_scheduler(svc, old_sched);
 	}
 
 	/*
@@ -1982,6 +1997,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 		const struct ip_vs_iter *iter = seq->private;
 		const struct ip_vs_dest *dest;
 		struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
+		char *sched_name = sched ? sched->name : "none";
 
 		if (iter->table == ip_vs_svc_table) {
 #ifdef CONFIG_IP_VS_IPV6
@@ -1990,18 +2006,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
 					   ip_vs_proto_name(svc->protocol),
 					   &svc->addr.in6,
 					   ntohs(svc->port),
-					   sched->name);
+					   sched_name);
 			else
 #endif
 				seq_printf(seq, "%s  %08X:%04X %s %s ",
 					   ip_vs_proto_name(svc->protocol),
 					   ntohl(svc->addr.ip),
 					   ntohs(svc->port),
-					   sched->name,
+					   sched_name,
 					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
 		} else {
 			seq_printf(seq, "FWM  %08X %s %s",
-				   svc->fwmark, sched->name,
+				   svc->fwmark, sched_name,
 				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
 		}
 
@@ -2427,13 +2443,15 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
 {
 	struct ip_vs_scheduler *sched;
 	struct ip_vs_kstats kstats;
+	char *sched_name;
 
 	sched = rcu_dereference_protected(src->scheduler, 1);
+	sched_name = sched ? sched->name : "none";
 	dst->protocol = src->protocol;
 	dst->addr = src->addr.ip;
 	dst->port = src->port;
 	dst->fwmark = src->fwmark;
-	strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
+	strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
 	dst->flags = src->flags;
 	dst->timeout = src->timeout / HZ;
 	dst->netmask = src->netmask;
@@ -2892,6 +2910,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 	struct ip_vs_flags flags = { .flags = svc->flags,
 				     .mask = ~0 };
 	struct ip_vs_kstats kstats;
+	char *sched_name;
 
 	nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
 	if (!nl_service)
@@ -2910,8 +2929,9 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
 	}
 
 	sched = rcu_dereference_protected(svc->scheduler, 1);
+	sched_name = sched ? sched->name : "none";
 	pe = rcu_dereference_protected(svc->pe, 1);
-	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
 	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
 	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
 	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index 199760c71f39..7e8141647943 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -74,7 +74,7 @@ void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
 
 	if (sched->done_service)
 		sched->done_service(svc);
-	/* svc->scheduler can not be set to NULL */
+	/* svc->scheduler can be set to NULL only by caller */
 }
 
 
@@ -147,21 +147,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
 
 void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
 {
-	struct ip_vs_scheduler *sched;
+	struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
+	char *sched_name = sched ? sched->name : "none";
 
-	sched = rcu_dereference(svc->scheduler);
 	if (svc->fwmark) {
 		IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
-			     sched->name, svc->fwmark, svc->fwmark, msg);
+			     sched_name, svc->fwmark, svc->fwmark, msg);
 #ifdef CONFIG_IP_VS_IPV6
 	} else if (svc->af == AF_INET6) {
 		IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
-			     sched->name, ip_vs_proto_name(svc->protocol),
+			     sched_name, ip_vs_proto_name(svc->protocol),
 			     &svc->addr.in6, ntohs(svc->port), msg);
 #endif
 	} else {
 		IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
-			     sched->name, ip_vs_proto_name(svc->protocol),
+			     sched_name, ip_vs_proto_name(svc->protocol),
 			     &svc->addr.ip, ntohs(svc->port), msg);
 	}
 }
-- 
cgit v1.2.3


From 71563f3414e917c62acd8e0fb0edf8ed6af63e4b Mon Sep 17 00:00:00 2001
From: Alex Gartrell <agartrell@fb.com>
Date: Sun, 5 Jul 2015 14:28:26 -0700
Subject: ipvs: skb_orphan in case of forwarding

It is possible that we bind against a local socket in early_demux when we
are actually going to want to forward it.  In this case, the socket serves
no purpose and only serves to confuse things (particularly functions which
implicitly expect sk_fullsock to be true, like ip_local_out).
Additionally, skb_set_owner_w is totally broken for non full-socks.

Signed-off-by: Alex Gartrell <agartrell@fb.com>
Fixes: 41063e9dd119 ("ipv4: Early TCP socket demux.")
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index ec30d68ccc0b..34dc1429ebdb 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -533,6 +533,21 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 	return ret;
 }
 
+/* In the event of a remote destination, it's possible that we would have
+ * matches against an old socket (particularly a TIME-WAIT socket). This
+ * causes havoc down the line (ip_local_out et. al. expect regular sockets
+ * and invalid memory accesses will happen) so simply drop the association
+ * in this case.
+*/
+static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
+{
+	/* If dev is set, the packet came from the LOCAL_IN callback and
+	 * not from a local TCP socket.
+	 */
+	if (skb->dev)
+		skb_orphan(skb);
+}
+
 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
 static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
 					 struct ip_vs_conn *cp, int local)
@@ -544,12 +559,21 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
 		ip_vs_notrack(skb);
 	else
 		ip_vs_update_conntrack(skb, cp, 1);
+
+	/* Remove the early_demux association unless it's bound for the
+	 * exact same port and address on this host after translation.
+	 */
+	if (!local || cp->vport != cp->dport ||
+	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
+		ip_vs_drop_early_demux_sk(skb);
+
 	if (!local) {
 		skb_forward_csum(skb);
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output_sk);
 	} else
 		ret = NF_ACCEPT;
+
 	return ret;
 }
 
@@ -563,6 +587,7 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
 	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
 		ip_vs_notrack(skb);
 	if (!local) {
+		ip_vs_drop_early_demux_sk(skb);
 		skb_forward_csum(skb);
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output_sk);
@@ -851,6 +876,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 	struct ipv6hdr *old_ipv6h = NULL;
 #endif
 
+	ip_vs_drop_early_demux_sk(skb);
+
 	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
 		new_skb = skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb)
-- 
cgit v1.2.3


From 56184858d1fc95c46723436b455cb7261cd8be6f Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 8 Jul 2015 08:31:33 +0300
Subject: ipvs: fix crash with sync protocol v0 and FTP

Fix crash in 3.5+ if FTP is used after switching
sync_version to 0.

Fixes: 749c42b620a9 ("ipvs: reduce sync rate with time thresholds")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index b08ba9538d12..d99ad93eb855 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -612,7 +612,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
 			pkts = atomic_add_return(1, &cp->in_pkts);
 		else
 			pkts = sysctl_sync_threshold(ipvs);
-		ip_vs_sync_conn(net, cp->control, pkts);
+		ip_vs_sync_conn(net, cp, pkts);
 	}
 }
 
-- 
cgit v1.2.3


From e3895c0334d0ef46e80f22eaf2a52401ff6d5a67 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Thu, 9 Jul 2015 11:15:27 +0300
Subject: ipvs: call skb_sender_cpu_clear

Reset XPS's sender_cpu on forwarding.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices")
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 34dc1429ebdb..258a0b0e82a2 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -529,6 +529,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
 	if (ret == NF_ACCEPT) {
 		nf_reset(skb);
 		skb_forward_csum(skb);
+		if (!skb->sk)
+			skb_sender_cpu_clear(skb);
 	}
 	return ret;
 }
@@ -569,6 +571,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
 
 	if (!local) {
 		skb_forward_csum(skb);
+		if (!skb->sk)
+			skb_sender_cpu_clear(skb);
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output_sk);
 	} else
@@ -589,6 +593,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
 	if (!local) {
 		ip_vs_drop_early_demux_sk(skb);
 		skb_forward_csum(skb);
+		if (!skb->sk)
+			skb_sender_cpu_clear(skb);
 		NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
 			NULL, skb_dst(skb)->dev, dst_output_sk);
 	} else
-- 
cgit v1.2.3


From 035d210f928ce083435b4fd351a26d126c02c927 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 13 Jul 2015 00:06:02 +0200
Subject: rtnetlink: reject non-IFLA_VF_PORT attributes inside IFLA_VF_PORTS

Similarly as in commit 4f7d2cdfdde7 ("rtnetlink: verify IFLA_VF_INFO
attributes before passing them to driver"), we have a double nesting
of netlink attributes, i.e. IFLA_VF_PORTS only contains IFLA_VF_PORT
that is nested itself. While IFLA_VF_PORTS is a verified attribute
from ifla_policy[], we only check if the IFLA_VF_PORTS container has
IFLA_VF_PORT attributes and then pass the attribute's content itself
via nla_parse_nested(). It would be more correct to reject inner types
other than IFLA_VF_PORT instead of continuing parsing and also similarly
as in commit 4f7d2cdfdde7, to check for a minimum of NLA_HDRLEN.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Roopa Prabhu <roopa@cumulusnetworks.com>
Cc: Scott Feldman <sfeldma@gmail.com>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 9e433d58d265..dc004b1e1f85 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1804,10 +1804,13 @@ static int do_setlink(const struct sk_buff *skb,
 			goto errout;
 
 		nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
-			if (nla_type(attr) != IFLA_VF_PORT)
-				continue;
-			err = nla_parse_nested(port, IFLA_PORT_MAX,
-				attr, ifla_port_policy);
+			if (nla_type(attr) != IFLA_VF_PORT ||
+			    nla_len(attr) < NLA_HDRLEN) {
+				err = -EINVAL;
+				goto errout;
+			}
+			err = nla_parse_nested(port, IFLA_PORT_MAX, attr,
+					       ifla_port_policy);
 			if (err < 0)
 				goto errout;
 			if (!port[IFLA_PORT_VF]) {
-- 
cgit v1.2.3


From 738ac1ebb96d02e0d23bc320302a6ea94c612dec Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 13 Jul 2015 16:04:13 +0800
Subject: net: Clone skb before setting peeked flag

Shared skbs must not be modified and this is crucial for broadcast
and/or multicast paths where we use it as an optimisation to avoid
unnecessary cloning.

The function skb_recv_datagram breaks this rule by setting peeked
without cloning the skb first.  This causes funky races which leads
to double-free.

This patch fixes this by cloning the skb and replacing the skb
in the list when setting skb->peeked.

Fixes: a59322be07c9 ("[UDP]: Only increment counter on first peek/recv")
Reported-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index b80fb91bb3f7..4e9a3f690b7e 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -131,6 +131,35 @@ out_noerr:
 	goto out;
 }
 
+static int skb_set_peeked(struct sk_buff *skb)
+{
+	struct sk_buff *nskb;
+
+	if (skb->peeked)
+		return 0;
+
+	/* We have to unshare an skb before modifying it. */
+	if (!skb_shared(skb))
+		goto done;
+
+	nskb = skb_clone(skb, GFP_ATOMIC);
+	if (!nskb)
+		return -ENOMEM;
+
+	skb->prev->next = nskb;
+	skb->next->prev = nskb;
+	nskb->prev = skb->prev;
+	nskb->next = skb->next;
+
+	consume_skb(skb);
+	skb = nskb;
+
+done:
+	skb->peeked = 1;
+
+	return 0;
+}
+
 /**
  *	__skb_recv_datagram - Receive a datagram skbuff
  *	@sk: socket
@@ -165,7 +194,9 @@ out_noerr:
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 				    int *peeked, int *off, int *err)
 {
+	struct sk_buff_head *queue = &sk->sk_receive_queue;
 	struct sk_buff *skb, *last;
+	unsigned long cpu_flags;
 	long timeo;
 	/*
 	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
@@ -184,8 +215,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 		 * Look at current nfs client by the way...
 		 * However, this function was correct in any case. 8)
 		 */
-		unsigned long cpu_flags;
-		struct sk_buff_head *queue = &sk->sk_receive_queue;
 		int _off = *off;
 
 		last = (struct sk_buff *)queue;
@@ -199,7 +228,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 					_off -= skb->len;
 					continue;
 				}
-				skb->peeked = 1;
+
+				error = skb_set_peeked(skb);
+				if (error)
+					goto unlock_err;
+
 				atomic_inc(&skb->users);
 			} else
 				__skb_unlink(skb, queue);
@@ -223,6 +256,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 
 	return NULL;
 
+unlock_err:
+	spin_unlock_irqrestore(&queue->lock, cpu_flags);
 no_packet:
 	*err = error;
 	return NULL;
-- 
cgit v1.2.3


From da278622bf04f8ddb14519a2b8214e108ef26101 Mon Sep 17 00:00:00 2001
From: Richard Stearn <richard@rns-stearn.demon.co.uk>
Date: Mon, 13 Jul 2015 11:38:24 +0200
Subject: NET: AX.25: Stop heartbeat timer on disconnect.

This may result in a kernel panic.  The bug has always existed but
somehow we've run out of luck now and it bites.

Signed-off-by: Richard Stearn <richard@rns-stearn.demon.co.uk>
Cc: stable@vger.kernel.org	# all branches
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ax25/ax25_subr.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 1997538a5d23..3b78e8473a01 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -264,6 +264,7 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
 {
 	ax25_clear_queues(ax25);
 
+	ax25_stop_heartbeat(ax25);
 	ax25_stop_t1timer(ax25);
 	ax25_stop_t2timer(ax25);
 	ax25_stop_t3timer(ax25);
-- 
cgit v1.2.3


From 89c22d8c3b278212eef6a8cc66b570bc840a6f5a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 13 Jul 2015 20:01:42 +0800
Subject: net: Fix skb csum races when peeking

When we calculate the checksum on the recv path, we store the
result in the skb as an optimisation in case we need the checksum
again down the line.

This is in fact bogus for the MSG_PEEK case as this is done without
any locking.  So multiple threads can peek and then store the result
to the same skb, potentially resulting in bogus skb states.

This patch fixes this by only storing the result if the skb is not
shared.  This preserves the optimisations for the few cases where
it can be done safely due to locking or other reasons, e.g., SIOCINQ.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 4e9a3f690b7e..4967262b2707 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -657,7 +657,8 @@ __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
 		    !skb->csum_complete_sw)
 			netdev_rx_csum_fault(skb->dev);
 	}
-	skb->csum_valid = !sum;
+	if (!skb_shared(skb))
+		skb->csum_valid = !sum;
 	return sum;
 }
 EXPORT_SYMBOL(__skb_checksum_complete_head);
@@ -677,11 +678,13 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
 			netdev_rx_csum_fault(skb->dev);
 	}
 
-	/* Save full packet checksum */
-	skb->csum = csum;
-	skb->ip_summed = CHECKSUM_COMPLETE;
-	skb->csum_complete_sw = 1;
-	skb->csum_valid = !sum;
+	if (!skb_shared(skb)) {
+		/* Save full packet checksum */
+		skb->csum = csum;
+		skb->ip_summed = CHECKSUM_COMPLETE;
+		skb->csum_complete_sw = 1;
+		skb->csum_valid = !sum;
+	}
 
 	return sum;
 }
-- 
cgit v1.2.3


From bc8c20acaea154efc558f5f4122ed65d396f6156 Mon Sep 17 00:00:00 2001
From: Satish Ashok <sashok@cumulusnetworks.com>
Date: Mon, 13 Jul 2015 05:28:37 -0700
Subject: bridge: multicast: treat igmpv3 report with INCLUDE and no sources as
 a leave

A report with INCLUDE/Change_to_include and empty source list should be
treated as a leave, specified by RFC 3376, section 3.1:
"If the requested filter mode is INCLUDE *and* the requested source
 list is empty, then the entry corresponding to the requested
 interface and multicast address is deleted if present.  If no such
 entry is present, the request is ignored."

Signed-off-by: Satish Ashok <sashok@cumulusnetworks.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 742a6c27d7a2..79db489cdade 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -39,6 +39,16 @@ static void br_multicast_start_querier(struct net_bridge *br,
 				       struct bridge_mcast_own_query *query);
 static void br_multicast_add_router(struct net_bridge *br,
 				    struct net_bridge_port *port);
+static void br_ip4_multicast_leave_group(struct net_bridge *br,
+					 struct net_bridge_port *port,
+					 __be32 group,
+					 __u16 vid);
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_leave_group(struct net_bridge *br,
+					 struct net_bridge_port *port,
+					 const struct in6_addr *group,
+					 __u16 vid);
+#endif
 unsigned int br_mdb_rehash_seq;
 
 static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b)
@@ -1010,9 +1020,15 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
 			continue;
 		}
 
-		err = br_ip4_multicast_add_group(br, port, group, vid);
-		if (err)
-			break;
+		if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
+		     type == IGMPV3_MODE_IS_INCLUDE) &&
+		    ntohs(grec->grec_nsrcs) == 0) {
+			br_ip4_multicast_leave_group(br, port, group, vid);
+		} else {
+			err = br_ip4_multicast_add_group(br, port, group, vid);
+			if (err)
+				break;
+		}
 	}
 
 	return err;
@@ -1071,10 +1087,17 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 			continue;
 		}
 
-		err = br_ip6_multicast_add_group(br, port, &grec->grec_mca,
-						 vid);
-		if (err)
-			break;
+		if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
+		     grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
+		    ntohs(*nsrcs) == 0) {
+			br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
+						     vid);
+		} else {
+			err = br_ip6_multicast_add_group(br, port,
+							 &grec->grec_mca, vid);
+			if (!err)
+				break;
+		}
 	}
 
 	return err;
-- 
cgit v1.2.3


From 5ebc784625ea68a9570d1f70557e7932988cd1b4 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 13 Jul 2015 06:36:19 -0700
Subject: bridge: mdb: fix double add notification

Since the mdb add/del code was introduced there have been 2 br_mdb_notify
calls when doing br_mdb_add() resulting in 2 notifications on each add.

Example:
 Command: bridge mdb add dev br0 port eth1 grp 239.0.0.1 permanent
 Before patch:
 root@debian:~# bridge monitor all
 [MDB]dev br0 port eth1 grp 239.0.0.1 permanent
 [MDB]dev br0 port eth1 grp 239.0.0.1 permanent

 After patch:
 root@debian:~# bridge monitor all
 [MDB]dev br0 port eth1 grp 239.0.0.1 permanent

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Fixes: cfd567543590 ("bridge: add support of adding and deleting mdb entries")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index c11cf2611db0..1198a3dbad95 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -351,7 +351,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
 	if (state == MDB_TEMPORARY)
 		mod_timer(&p->timer, now + br->multicast_membership_interval);
 
-	br_mdb_notify(br->dev, port, group, RTM_NEWMDB);
 	return 0;
 }
 
-- 
cgit v1.2.3


From f82b681a511f4d61069e9586a9cf97bdef371ef3 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 13 Jul 2015 12:10:20 -0700
Subject: tcp: don't use F-RTO on non-recurring timeouts

Currently F-RTO may repeatedly send new data packets on non-recurring
timeouts in CA_Loss mode. This is a bug because F-RTO (RFC5682)
should only be used on either new recovery or recurring timeouts.

This exacerbates the recovery progress during frequent timeout &
repair, because we prioritize sending new data packets instead of
repairing the holes when the bandwidth is already scarce.

Fix it by correcting the test of a new recovery episode.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 684f095d196e..728f5b3d3c64 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1917,14 +1917,13 @@ void tcp_enter_loss(struct sock *sk)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	bool new_recovery = false;
+	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
 	bool is_reneg;			/* is receiver reneging on SACKs? */
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
 	    !after(tp->high_seq, tp->snd_una) ||
 	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
-		new_recovery = true;
 		tp->prior_ssthresh = tcp_current_ssthresh(sk);
 		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 		tcp_ca_event(sk, CA_EVENT_LOSS);
-- 
cgit v1.2.3


From 052cbda41fdc243a8d40cce7ab3a6327b4b2887e Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Mon, 13 Jul 2015 12:30:07 -0700
Subject: fq_codel: fix a use-after-free

Fixes: 25331d6ce42b ("net: sched: implement qstat helper routines")
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq_codel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index d75993f89fac..06e7c845e24d 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -155,10 +155,10 @@ static unsigned int fq_codel_drop(struct Qdisc *sch)
 	skb = dequeue_head(flow);
 	len = qdisc_pkt_len(skb);
 	q->backlogs[idx] -= len;
-	kfree_skb(skb);
 	sch->q.qlen--;
 	qdisc_qstats_drop(sch);
 	qdisc_qstats_backlog_dec(sch, skb);
+	kfree_skb(skb);
 	flow->dropped++;
 	return idx;
 }
-- 
cgit v1.2.3


From 03645a11a570d52e70631838cb786eb4253eb463 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Jul 2015 08:10:22 +0200
Subject: ipv6: lock socket in ip6_datagram_connect()

ip6_datagram_connect() is doing a lot of socket changes without
socket being locked.

This looks wrong, at least for udp_lib_rehash() which could corrupt
lists because of concurrent udp_sk(sk)->udp_portaddr_hash accesses.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h    |  1 +
 net/ipv4/datagram.c | 16 ++++++++++++----
 net/ipv6/datagram.c | 20 +++++++++++++++-----
 3 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 0750a186ea63..d5fe9f2ab699 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -161,6 +161,7 @@ static inline __u8 get_rtconn_flags(struct ipcm_cookie* ipc, struct sock* sk)
 }
 
 /* datagram.c */
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 
 void ip4_datagram_release_cb(struct sock *sk);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 90c0e8386116..574fad9cca05 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -20,7 +20,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
@@ -39,8 +39,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	sk_dst_reset(sk);
 
-	lock_sock(sk);
-
 	oif = sk->sk_bound_dev_if;
 	saddr = inet->inet_saddr;
 	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
@@ -82,9 +80,19 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk_dst_set(sk, &rt->dst);
 	err = 0;
 out:
-	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL(__ip4_datagram_connect);
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	int res;
+
+	lock_sock(sk);
+	res = __ip4_datagram_connect(sk, uaddr, addr_len);
+	release_sock(sk);
+	return res;
+}
 EXPORT_SYMBOL(ip4_datagram_connect);
 
 /* Because UDP xmit path can manipulate sk_dst_cache without holding
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 62d908e64eeb..b10a88986a98 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -40,7 +40,7 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a)
 	return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
 }
 
-int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
 	struct inet_sock	*inet = inet_sk(sk);
@@ -56,7 +56,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (usin->sin6_family == AF_INET) {
 		if (__ipv6_only_sock(sk))
 			return -EAFNOSUPPORT;
-		err = ip4_datagram_connect(sk, uaddr, addr_len);
+		err = __ip4_datagram_connect(sk, uaddr, addr_len);
 		goto ipv4_connected;
 	}
 
@@ -98,9 +98,9 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		sin.sin_addr.s_addr = daddr->s6_addr32[3];
 		sin.sin_port = usin->sin6_port;
 
-		err = ip4_datagram_connect(sk,
-					   (struct sockaddr *) &sin,
-					   sizeof(sin));
+		err = __ip4_datagram_connect(sk,
+					     (struct sockaddr *) &sin,
+					     sizeof(sin));
 
 ipv4_connected:
 		if (err)
@@ -204,6 +204,16 @@ out:
 	fl6_sock_release(flowlabel);
 	return err;
 }
+
+int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	int res;
+
+	lock_sock(sk);
+	res = __ip6_datagram_connect(sk, uaddr, addr_len);
+	release_sock(sk);
+	return res;
+}
 EXPORT_SYMBOL_GPL(ip6_datagram_connect);
 
 int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr,
-- 
cgit v1.2.3


From e8d092aafd9e68c04d7b468e95ff7a617998a796 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 14 Jul 2015 11:21:57 -0700
Subject: net_sched: fix a use-after-free in sfq

Fixes: 25331d6ce42b ("net: sched: implement qstat helper routines")
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_sfq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 7d1492663360..52f75a5473e1 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -306,10 +306,10 @@ drop:
 		len = qdisc_pkt_len(skb);
 		slot->backlog -= len;
 		sfq_dec(q, x);
-		kfree_skb(skb);
 		sch->q.qlen--;
 		qdisc_qstats_drop(sch);
 		qdisc_qstats_backlog_dec(sch, skb);
+		kfree_skb(skb);
 		return len;
 	}
 
-- 
cgit v1.2.3


From c0afd9ce4d6a646fb6433536f95a418bb348fab1 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 14 Jul 2015 11:21:58 -0700
Subject: fq_codel: fix return value of fq_codel_drop()

The ->drop() is supposed to return the number of bytes it dropped,
however fq_codel_drop() returns the index of the flow where it drops
a packet from.

Fix this by introducing a helper to wrap fq_codel_drop().

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fq_codel.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 06e7c845e24d..21ca33c9f036 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -163,6 +163,15 @@ static unsigned int fq_codel_drop(struct Qdisc *sch)
 	return idx;
 }
 
+static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
+{
+	unsigned int prev_backlog;
+
+	prev_backlog = sch->qstats.backlog;
+	fq_codel_drop(sch);
+	return prev_backlog - sch->qstats.backlog;
+}
+
 static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct fq_codel_sched_data *q = qdisc_priv(sch);
@@ -604,7 +613,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
 	.enqueue	=	fq_codel_enqueue,
 	.dequeue	=	fq_codel_dequeue,
 	.peek		=	qdisc_peek_dequeued,
-	.drop		=	fq_codel_drop,
+	.drop		=	fq_codel_qdisc_drop,
 	.init		=	fq_codel_init,
 	.reset		=	fq_codel_reset,
 	.destroy	=	fq_codel_destroy,
-- 
cgit v1.2.3


From ddf06c1e569a64a44c4c750ae45b2604f19e45f0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Tue, 14 Jul 2015 12:15:19 -0700
Subject: tc: act_bpf: fix memory leak

prog->bpf_ops is populated when act_bpf is used with classic BPF and
prog->bpf_name is optionally used with extended BPF.
Fix memory leak when act_bpf is released.

Fixes: d23b8ad8ab23 ("tc: add BPF based action")
Fixes: a8cb5f556b56 ("act_bpf: add initial eBPF support for actions")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_bpf.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 1d56903fd4c7..1df78289e248 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -339,6 +339,9 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind)
 		bpf_prog_put(prog->filter);
 	else
 		bpf_prog_destroy(prog->filter);
+
+	kfree(prog->bpf_ops);
+	kfree(prog->bpf_name);
 }
 
 static struct tc_action_ops act_bpf_ops __read_mostly = {
-- 
cgit v1.2.3


From 4479004e6409087d1b4986881dc98c6c15dffb28 Mon Sep 17 00:00:00 2001
From: Tom Hughes <tom@compton.nu>
Date: Mon, 29 Jun 2015 19:41:49 +0100
Subject: mac80211: clear subdir_stations when removing debugfs

If we don't do this, and we then fail to recreate the debugfs
directory during a mode change, then we will fail later trying
to add stations to this now bogus directory:

BUG: unable to handle kernel NULL pointer dereference at 0000006c
IP: [<c0a92202>] mutex_lock+0x12/0x30
Call Trace:
[<c0678ab4>] start_creating+0x44/0xc0
[<c0679203>] debugfs_create_dir+0x13/0xf0
[<f8a938ae>] ieee80211_sta_debugfs_add+0x6e/0x490 [mac80211]

Cc: stable@kernel.org
Signed-off-by: Tom Hughes <tom@compton.nu>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/debugfs_netdev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 29236e832e44..c09c0131bfa2 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -723,6 +723,7 @@ void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata)
 
 	debugfs_remove_recursive(sdata->vif.debugfs_dir);
 	sdata->vif.debugfs_dir = NULL;
+	sdata->debugfs.subdir_stations = NULL;
 }
 
 void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
-- 
cgit v1.2.3


From e9de01907e3d1957591113daa3857c0bf01067ef Mon Sep 17 00:00:00 2001
From: Michal Kazior <michal.kazior@tieto.com>
Date: Thu, 2 Jul 2015 09:59:56 +0200
Subject: mac80211: don't clear all tx flags when requeing

When acting as AP and a PS-Poll frame is received
associated station is marked as one in a Service
Period. This state is kept until Tx status for
released frame is reported. While a station is in
Service Period PS-Poll frames are ignored.

However if PS-Poll was received during A-MPDU
teardown it was possible to have the to-be
released frame re-queued back to pending queue.
In such case the frame was stripped of 2 important
flags:

 (a) IEEE80211_TX_CTL_NO_PS_BUFFER
 (b) IEEE80211_TX_STATUS_EOSP

Stripping of (a) led to the frame that was to be
released to be queued back to ps_tx_buf queue. If
station remained to use only PS-Poll frames the
re-queued frame (and new ones) was never actually
transmitted because mac80211 would ignore
subsequent PS-Poll frames due to station being in
Service Period. There was nothing left to clear
the Service Period bit (no xmit -> no tx status ->
no SP end), i.e. the AP would have the station
stuck in Service Period. Beacon TIM would
repeatedly prompt station to poll for frames but
it would get none.

Once (a) is not stripped (b) becomes important
because it's the main condition to clear the
Service Period bit of the station when Tx status
for the released frame is reported back.

This problem was observed with ath9k acting as P2P
GO in some testing scenarios but isn't limited to
it. AP operation with mac80211 based Tx A-MPDU
control combined with clients using PS-Poll frames
is subject to this race.

Signed-off-by: Michal Kazior <michal.kazior@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 8410bb3bf5e8..b8233505bf9f 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1117,7 +1117,9 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
 			queued = true;
 			info->control.vif = &tx->sdata->vif;
 			info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
-			info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
+			info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS |
+					IEEE80211_TX_CTL_NO_PS_BUFFER |
+					IEEE80211_TX_STATUS_EOSP;
 			__skb_queue_tail(&tid_tx->pending, skb);
 			if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER)
 				purge_skb = __skb_dequeue(&tid_tx->pending);
-- 
cgit v1.2.3


From 541b6ed7cee1ec7c8f525f51a0ff097776111aeb Mon Sep 17 00:00:00 2001
From: Chaitanya T K <chaitanya.mgit@gmail.com>
Date: Wed, 10 Jun 2015 19:12:31 +0530
Subject: mac80211: wowlan: enable powersave if suspend while ps-polling

If for any reason we're in the middle of PS-polling or awake after
TX due to dynamic powersave while going to suspend, go back to save
power. This might cause a response frame to get lost, but since we
can't really wait for it while going to suspend that's still better
than not enabling powersave which would cause higher power usage
during (and possibly even after) suspend.

Note that this really only affects the very few drivers that use
the powersave implementation in mac80211.

Signed-off-by: Chaitanya T K <chaitanya.mgit@gmail.com>
[rewrite misleading commit log]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/pm.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index 06b60980c62c..b676b9fa707b 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -76,6 +76,22 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
 			if (sdata->vif.type != NL80211_IFTYPE_STATION)
 				continue;
 			ieee80211_mgd_quiesce(sdata);
+			/* If suspended during TX in progress, and wowlan
+			 * is enabled (connection will be active) there
+			 * can be a race where the driver is put out
+			 * of power-save due to TX and during suspend
+			 * dynamic_ps_timer is cancelled and TX packet
+			 * is flushed, leaving the driver in ACTIVE even
+			 * after resuming until dynamic_ps_timer puts
+			 * driver back in DOZE.
+			 */
+			if (sdata->u.mgd.associated &&
+			    sdata->u.mgd.powersave &&
+			     !(local->hw.conf.flags & IEEE80211_CONF_PS)) {
+				local->hw.conf.flags |= IEEE80211_CONF_PS;
+				ieee80211_hw_config(local,
+						    IEEE80211_CONF_CHANGE_PS);
+			}
 		}
 
 		err = drv_suspend(local, wowlan);
-- 
cgit v1.2.3


From d8d9008cfb919db48d95f96b05e81f84b3774318 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 8 Jul 2015 15:41:50 +0300
Subject: mac80211: shut down interfaces before destroying interface list

If the hardware is unregistered while interfaces are up, mac80211 will
unregister all interfaces, which in turns causes mac80211 to be called
again to remove them all from the driver and eventually shut down the
hardware.

During this shutdown, however, it's currently already unsafe to iterate
the list of interfaces atomically, as the list is manipulated in an
unsafe manner. This puts an undue burden on the driver - it must stop
all its activities before calling ieee80211_unregister_hw(), while in
the normal stop path it can do all cleanup in the stop method. If, for
example, it's using the iteration during RX for some reason, it would
have to stop RX before unregistering to avoid crashes.

Fix this problem by closing all interfaces before unregistering them.
This will cause the driver stop to have completed before we manipulate
the interface list, and after the driver is stopped *and* has called
ieee80211_unregister_hw() it really musn't be iterating any more as
the memory will be freed as well.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index ed1edac14372..553ac6dd4867 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1863,10 +1863,6 @@ void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata)
 	ieee80211_teardown_sdata(sdata);
 }
 
-/*
- * Remove all interfaces, may only be called at hardware unregistration
- * time because it doesn't do RCU-safe list removals.
- */
 void ieee80211_remove_interfaces(struct ieee80211_local *local)
 {
 	struct ieee80211_sub_if_data *sdata, *tmp;
@@ -1875,14 +1871,21 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local)
 
 	ASSERT_RTNL();
 
-	/*
-	 * Close all AP_VLAN interfaces first, as otherwise they
-	 * might be closed while the AP interface they belong to
-	 * is closed, causing unregister_netdevice_many() to crash.
+	/* Before destroying the interfaces, make sure they're all stopped so
+	 * that the hardware is stopped. Otherwise, the driver might still be
+	 * iterating the interfaces during the shutdown, e.g. from a worker
+	 * or from RX processing or similar, and if it does so (using atomic
+	 * iteration) while we're manipulating the list, the iteration will
+	 * crash.
+	 *
+	 * After this, the hardware should be stopped and the driver should
+	 * have stopped all of its activities, so that we can do RCU-unaware
+	 * manipulations of the interface list below.
 	 */
-	list_for_each_entry(sdata, &local->interfaces, list)
-		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
-			dev_close(sdata->dev);
+	cfg80211_shutdown_all_interfaces(local->hw.wiphy);
+
+	WARN(local->open_count, "%s: open count remains %d\n",
+	     wiphy_name(local->hw.wiphy), local->open_count);
 
 	mutex_lock(&local->iflist_mtx);
 	list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) {
-- 
cgit v1.2.3


From 042ab5fc7a80b934032fcc673a125feb36645b33 Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Thu, 9 Jul 2015 15:35:15 +0200
Subject: wireless: regulatory: reduce log level of CRDA related messages

With a basic Linux userspace, the messages "Calling CRDA to update
world regulatory domain" appears 10 times after boot every second or
so, followed by a final "Exceeded CRDA call max attempts. Not calling
CRDA". For those of us not having the corresponding userspace parts,
having those messages repeatedly displayed at boot time is a bit
annoying, so this commit reduces their log level to pr_debug().

Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index d359e0610198..29134c81e73c 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -544,15 +544,15 @@ static int call_crda(const char *alpha2)
 	reg_regdb_query(alpha2);
 
 	if (reg_crda_timeouts > REG_MAX_CRDA_TIMEOUTS) {
-		pr_info("Exceeded CRDA call max attempts. Not calling CRDA\n");
+		pr_debug("Exceeded CRDA call max attempts. Not calling CRDA\n");
 		return -EINVAL;
 	}
 
 	if (!is_world_regdom((char *) alpha2))
-		pr_info("Calling CRDA for country: %c%c\n",
+		pr_debug("Calling CRDA for country: %c%c\n",
 			alpha2[0], alpha2[1]);
 	else
-		pr_info("Calling CRDA to update world regulatory domain\n");
+		pr_debug("Calling CRDA to update world regulatory domain\n");
 
 	return kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env);
 }
-- 
cgit v1.2.3


From 2ea752cd2ce066c5d8c1807b5310ef329885cecb Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Tue, 14 Jul 2015 08:31:55 -0400
Subject: mac80211: correct aid location in peering frames

According to 802.11-2012 8.5.16.3.2 AID comes directly after the
capability bytes in mesh peering confirm frames.  The existing
code, however, was adding a 2 byte offset to this location,
resulting in garbage data going out over the air.  Remove the
offset to fix it.

Signed-off-by: Bob Copeland <me@bobcopeland.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh_plink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 5438d13e2f00..f17127e754c9 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -306,7 +306,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata,
 		if (action == WLAN_SP_MESH_PEERING_CONFIRM) {
 			/* AID */
 			pos = skb_put(skb, 2);
-			put_unaligned_le16(plid, pos + 2);
+			put_unaligned_le16(plid, pos);
 		}
 		if (ieee80211_add_srates_ie(sdata, skb, true, band) ||
 		    ieee80211_add_ext_srates_ie(sdata, skb, true, band) ||
-- 
cgit v1.2.3


From b3e7de873df77c1fa0bc2cfaf3eaff757b80e773 Mon Sep 17 00:00:00 2001
From: Bob Copeland <me@bobcopeland.com>
Date: Tue, 14 Jul 2015 08:31:56 -0400
Subject: mac80211: add missing length check for confirm frames

Although mesh_rx_plink_frame() already checks that frames have enough
bytes for the action code plus another two bytes for capability/reason
code, it doesn't take into account that confirm frames also have an
additional two-byte aid.  As a result, a corrupt frame could cause a
subsequent subtraction to wrap around to ill effect.  Add another
check for this case.

Signed-off-by: Bob Copeland <me@bobcopeland.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh_plink.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index f17127e754c9..3b59099413fb 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -1122,6 +1122,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata,
 						WLAN_SP_MESH_PEERING_CONFIRM) {
 		baseaddr += 4;
 		baselen += 4;
+
+		if (baselen > len)
+			return;
 	}
 	ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems);
 	mesh_process_plink_frame(sdata, mgmt, &elems);
-- 
cgit v1.2.3


From 923b352f19d9ea971ae2536eab55f5fc9e95fedf Mon Sep 17 00:00:00 2001
From: Arik Nemtsov <arik@wizery.com>
Date: Wed, 8 Jul 2015 15:41:44 +0300
Subject: cfg80211: use RTNL locked reg_can_beacon for IR-relaxation

The RTNL is required to check for IR-relaxation conditions that allow
more channels to beacon. Export an RTNL locked version of reg_can_beacon
and use it where possible in AP/STA interface type flows, where
IR-relaxation may be applicable.

Fixes: 06f207fc5418 ("cfg80211: change GO_CONCURRENT to IR_CONCURRENT for STA")
Signed-off-by: Arik Nemtsov <arikx.nemtsov@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 17 +++++++++++++++++
 net/mac80211/tdls.c    |  6 +++---
 net/wireless/chan.c    | 45 ++++++++++++++++++++++++++++++++++-----------
 net/wireless/nl80211.c | 14 ++++++++------
 net/wireless/reg.c     |  2 +-
 net/wireless/trace.h   | 11 +++++++----
 6 files changed, 70 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index a741678f24a2..883fe1e7c5a1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4868,6 +4868,23 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
 			     struct cfg80211_chan_def *chandef,
 			     enum nl80211_iftype iftype);
 
+/**
+ * cfg80211_reg_can_beacon_relax - check if beaconing is allowed with relaxation
+ * @wiphy: the wiphy
+ * @chandef: the channel definition
+ * @iftype: interface type
+ *
+ * Return: %true if there is no secondary channel or the secondary channel(s)
+ * can be used for beaconing (i.e. is not a radar channel etc.). This version
+ * also checks if IR-relaxation conditions apply, to allow beaconing under
+ * more permissive conditions.
+ *
+ * Requires the RTNL to be held.
+ */
+bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
+				   struct cfg80211_chan_def *chandef,
+				   enum nl80211_iftype iftype);
+
 /*
  * cfg80211_ch_switch_notify - update wdev channel and notify userspace
  * @dev: the device which switched channels
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index ad31b2dab4f5..8db6e2994bbc 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -60,6 +60,7 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_channel *ch;
 	struct cfg80211_chan_def chandef;
 	int i, subband_start;
+	struct wiphy *wiphy = sdata->local->hw.wiphy;
 
 	for (i = start; i <= end; i += spacing) {
 		if (!ch_cnt)
@@ -70,9 +71,8 @@ ieee80211_tdls_add_subband(struct ieee80211_sub_if_data *sdata,
 			/* we will be active on the channel */
 			cfg80211_chandef_create(&chandef, ch,
 						NL80211_CHAN_NO_HT);
-			if (cfg80211_reg_can_beacon(sdata->local->hw.wiphy,
-						    &chandef,
-						    sdata->wdev.iftype)) {
+			if (cfg80211_reg_can_beacon_relax(wiphy, &chandef,
+							  sdata->wdev.iftype)) {
 				ch_cnt++;
 				/*
 				 * check if the next channel is also part of
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 915b328b9ac5..59cabc9bce69 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -797,23 +797,18 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
 	return false;
 }
 
-bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
-			     struct cfg80211_chan_def *chandef,
-			     enum nl80211_iftype iftype)
+static bool _cfg80211_reg_can_beacon(struct wiphy *wiphy,
+				     struct cfg80211_chan_def *chandef,
+				     enum nl80211_iftype iftype,
+				     bool check_no_ir)
 {
 	bool res;
 	u32 prohibited_flags = IEEE80211_CHAN_DISABLED |
 			       IEEE80211_CHAN_RADAR;
 
-	trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype);
+	trace_cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);
 
-	/*
-	 * Under certain conditions suggested by some regulatory bodies a
-	 * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag
-	 * only if such relaxations are not enabled and the conditions are not
-	 * met.
-	 */
-	if (!cfg80211_ir_permissive_chan(wiphy, iftype, chandef->chan))
+	if (check_no_ir)
 		prohibited_flags |= IEEE80211_CHAN_NO_IR;
 
 	if (cfg80211_chandef_dfs_required(wiphy, chandef, iftype) > 0 &&
@@ -827,8 +822,36 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
 	trace_cfg80211_return_bool(res);
 	return res;
 }
+
+bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
+			     struct cfg80211_chan_def *chandef,
+			     enum nl80211_iftype iftype)
+{
+	return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, true);
+}
 EXPORT_SYMBOL(cfg80211_reg_can_beacon);
 
+bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
+				   struct cfg80211_chan_def *chandef,
+				   enum nl80211_iftype iftype)
+{
+	bool check_no_ir;
+
+	ASSERT_RTNL();
+
+	/*
+	 * Under certain conditions suggested by some regulatory bodies a
+	 * GO/STA can IR on channels marked with IEEE80211_NO_IR. Set this flag
+	 * only if such relaxations are not enabled and the conditions are not
+	 * met.
+	 */
+	check_no_ir = !cfg80211_ir_permissive_chan(wiphy, iftype,
+						   chandef->chan);
+
+	return _cfg80211_reg_can_beacon(wiphy, chandef, iftype, check_no_ir);
+}
+EXPORT_SYMBOL(cfg80211_reg_can_beacon_relax);
+
 int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev,
 				 struct cfg80211_chan_def *chandef)
 {
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c264effd00a6..76b41578a838 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2003,7 +2003,8 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev,
 	switch (iftype) {
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
-		if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, iftype)) {
+		if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef,
+						   iftype)) {
 			result = -EINVAL;
 			break;
 		}
@@ -3403,8 +3404,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	} else if (!nl80211_get_ap_channel(rdev, &params))
 		return -EINVAL;
 
-	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef,
-				     wdev->iftype))
+	if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef,
+					   wdev->iftype))
 		return -EINVAL;
 
 	if (info->attrs[NL80211_ATTR_ACL_POLICY]) {
@@ -6492,8 +6493,8 @@ skip_beacons:
 	if (err)
 		return err;
 
-	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &params.chandef,
-				     wdev->iftype))
+	if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &params.chandef,
+					   wdev->iftype))
 		return -EINVAL;
 
 	err = cfg80211_chandef_dfs_required(wdev->wiphy,
@@ -10170,7 +10171,8 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb,
 		return -EINVAL;
 
 	/* we will be active on the TDLS link */
-	if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, wdev->iftype))
+	if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, &chandef,
+					   wdev->iftype))
 		return -EINVAL;
 
 	/* don't allow switching to DFS channels */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 29134c81e73c..aa2d75482017 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1589,7 +1589,7 @@ static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
 	case NL80211_IFTYPE_AP:
 	case NL80211_IFTYPE_P2P_GO:
 	case NL80211_IFTYPE_ADHOC:
-		return cfg80211_reg_can_beacon(wiphy, &chandef, iftype);
+		return cfg80211_reg_can_beacon_relax(wiphy, &chandef, iftype);
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_P2P_CLIENT:
 		return cfg80211_chandef_usable(wiphy, &chandef,
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index af3617c9879e..a808279a432a 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2358,20 +2358,23 @@ TRACE_EVENT(cfg80211_cqm_rssi_notify,
 
 TRACE_EVENT(cfg80211_reg_can_beacon,
 	TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef,
-		 enum nl80211_iftype iftype),
-	TP_ARGS(wiphy, chandef, iftype),
+		 enum nl80211_iftype iftype, bool check_no_ir),
+	TP_ARGS(wiphy, chandef, iftype, check_no_ir),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		CHAN_DEF_ENTRY
 		__field(enum nl80211_iftype, iftype)
+		__field(bool, check_no_ir)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		CHAN_DEF_ASSIGN(chandef);
 		__entry->iftype = iftype;
+		__entry->check_no_ir = check_no_ir;
 	),
-	TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d",
-		  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype)
+	TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT ", iftype=%d check_no_ir=%s",
+		  WIPHY_PR_ARG, CHAN_DEF_PR_ARG, __entry->iftype,
+		  BOOL_TO_STR(__entry->check_no_ir))
 );
 
 TRACE_EVENT(cfg80211_chandef_dfs_required,
-- 
cgit v1.2.3


From 0838aa7fcfcd875caa7bcc5dab0c3fd40444553d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 13 Jul 2015 15:11:48 +0200
Subject: netfilter: fix netns dependencies with conntrack templates

Quoting Daniel Borkmann:

"When adding connection tracking template rules to a netns, f.e. to
configure netfilter zones, the kernel will endlessly busy-loop as soon
as we try to delete the given netns in case there's at least one
template present, which is problematic i.e. if there is such bravery that
the priviledged user inside the netns is assumed untrusted.

Minimal example:

  ip netns add foo
  ip netns exec foo iptables -t raw -A PREROUTING -d 1.2.3.4 -j CT --zone 1
  ip netns del foo

What happens is that when nf_ct_iterate_cleanup() is being called from
nf_conntrack_cleanup_net_list() for a provided netns, we always end up
with a net->ct.count > 0 and thus jump back to i_see_dead_people. We
don't get a soft-lockup as we still have a schedule() point, but the
serving CPU spins on 100% from that point onwards.

Since templates are normally allocated with nf_conntrack_alloc(), we
also bump net->ct.count. The issue why they are not yet nf_ct_put() is
because the per netns .exit() handler from x_tables (which would eventually
invoke xt_CT's xt_ct_tg_destroy() that drops reference on info->ct) is
called in the dependency chain at a *later* point in time than the per
netns .exit() handler for the connection tracker.

This is clearly a chicken'n'egg problem: after the connection tracker
.exit() handler, we've teared down all the connection tracking
infrastructure already, so rightfully, xt_ct_tg_destroy() cannot be
invoked at a later point in time during the netns cleanup, as that would
lead to a use-after-free. At the same time, we cannot make x_tables depend
on the connection tracker module, so that the xt_ct_tg_destroy() would
be invoked earlier in the cleanup chain."

Daniel confirms this has to do with the order in which modules are loaded or
having compiled nf_conntrack as modules while x_tables built-in. So we have no
guarantees regarding the order in which netns callbacks are executed.

Fix this by allocating the templates through kmalloc() from the respective
SYNPROXY and CT targets, so they don't depend on the conntrack kmem cache.
Then, release then via nf_ct_tmpl_free() from destroy_conntrack(). This branch
is marked as unlikely since conntrack templates are rarely allocated and only
from the configuration plane path.

Note that templates are not kept in any list to avoid further dependencies with
nf_conntrack anymore, thus, the tmpl larval list is removed.

Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Tested-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/netfilter/nf_conntrack.h |  2 +-
 include/net/netns/conntrack.h        |  1 -
 net/netfilter/nf_conntrack_core.c    | 67 +++++++++++++++++++++++-------------
 net/netfilter/nf_synproxy_core.c     |  7 ++--
 net/netfilter/xt_CT.c                |  8 ++---
 5 files changed, 51 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 095433b8a8b0..37cd3911d5c5 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -291,7 +291,7 @@ extern unsigned int nf_conntrack_max;
 extern unsigned int nf_conntrack_hash_rnd;
 void init_nf_conntrack_hash_rnd(void);
 
-void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl);
+struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags);
 
 #define NF_CT_STAT_INC(net, count)	  __this_cpu_inc((net)->ct.stat->count)
 #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 29d6a94db54d..723b61c82b3f 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -68,7 +68,6 @@ struct ct_pcpu {
 	spinlock_t		lock;
 	struct hlist_nulls_head unconfirmed;
 	struct hlist_nulls_head dying;
-	struct hlist_nulls_head tmpl;
 };
 
 struct netns_ct {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 13fad8668f83..651039ad1681 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -287,6 +287,46 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
 	spin_unlock(&pcpu->lock);
 }
 
+/* Released via destroy_conntrack() */
+struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags)
+{
+	struct nf_conn *tmpl;
+
+	tmpl = kzalloc(sizeof(struct nf_conn), GFP_KERNEL);
+	if (tmpl == NULL)
+		return NULL;
+
+	tmpl->status = IPS_TEMPLATE;
+	write_pnet(&tmpl->ct_net, net);
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+	if (zone) {
+		struct nf_conntrack_zone *nf_ct_zone;
+
+		nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC);
+		if (!nf_ct_zone)
+			goto out_free;
+		nf_ct_zone->id = zone;
+	}
+#endif
+	atomic_set(&tmpl->ct_general.use, 0);
+
+	return tmpl;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+out_free:
+	kfree(tmpl);
+	return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
+
+static void nf_ct_tmpl_free(struct nf_conn *tmpl)
+{
+	nf_ct_ext_destroy(tmpl);
+	nf_ct_ext_free(tmpl);
+	kfree(tmpl);
+}
+
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
@@ -298,6 +338,10 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 	NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
+	if (unlikely(nf_ct_is_template(ct))) {
+		nf_ct_tmpl_free(ct);
+		return;
+	}
 	rcu_read_lock();
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 	if (l4proto && l4proto->destroy)
@@ -540,28 +584,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 
-/* deletion from this larval template list happens via nf_ct_put() */
-void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
-{
-	struct ct_pcpu *pcpu;
-
-	__set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
-	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
-	nf_conntrack_get(&tmpl->ct_general);
-
-	/* add this conntrack to the (per cpu) tmpl list */
-	local_bh_disable();
-	tmpl->cpu = smp_processor_id();
-	pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
-
-	spin_lock(&pcpu->lock);
-	/* Overload tuple linked list to put us in template list. */
-	hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-				 &pcpu->tmpl);
-	spin_unlock_bh(&pcpu->lock);
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
-
 /* Confirm a connection given skb; places it in hash table */
 int
 __nf_conntrack_confirm(struct sk_buff *skb)
@@ -1751,7 +1773,6 @@ int nf_conntrack_init_net(struct net *net)
 		spin_lock_init(&pcpu->lock);
 		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
 		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
-		INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
 	}
 
 	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 789feeae6c44..71f1e9fdfa18 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -349,12 +349,10 @@ static void __net_exit synproxy_proc_exit(struct net *net)
 static int __net_init synproxy_net_init(struct net *net)
 {
 	struct synproxy_net *snet = synproxy_pernet(net);
-	struct nf_conntrack_tuple t;
 	struct nf_conn *ct;
 	int err = -ENOMEM;
 
-	memset(&t, 0, sizeof(t));
-	ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL);
+	ct = nf_ct_tmpl_alloc(net, 0, GFP_KERNEL);
 	if (IS_ERR(ct)) {
 		err = PTR_ERR(ct);
 		goto err1;
@@ -365,7 +363,8 @@ static int __net_init synproxy_net_init(struct net *net)
 	if (!nfct_synproxy_ext_add(ct))
 		goto err2;
 
-	nf_conntrack_tmpl_insert(net, ct);
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	nf_conntrack_get(&ct->ct_general);
 	snet->tmpl = ct;
 
 	snet->stats = alloc_percpu(struct synproxy_stats);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 75747aecdebe..c6630030c912 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -184,7 +184,6 @@ out:
 static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 			  struct xt_ct_target_info_v1 *info)
 {
-	struct nf_conntrack_tuple t;
 	struct nf_conn *ct;
 	int ret = -EOPNOTSUPP;
 
@@ -202,8 +201,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	if (ret < 0)
 		goto err1;
 
-	memset(&t, 0, sizeof(t));
-	ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL);
+	ct = nf_ct_tmpl_alloc(par->net, info->zone, GFP_KERNEL);
 	ret = PTR_ERR(ct);
 	if (IS_ERR(ct))
 		goto err2;
@@ -227,8 +225,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 		if (ret < 0)
 			goto err3;
 	}
-
-	nf_conntrack_tmpl_insert(par->net, ct);
+	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
+	nf_conntrack_get(&ct->ct_general);
 out:
 	info->ct = ct;
 	return 0;
-- 
cgit v1.2.3


From fdbf5b097bbd9693a86c0b8bfdd071a9a2117cfc Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 20 Jul 2015 17:55:38 +0800
Subject: Revert "sit: Add gro callbacks to sit_offload"

This patch reverts 19424e052fb44da2f00d1a868cbb51f3e9f4bbb5 ("sit:
Add gro callbacks to sit_offload") because it generates packets
that cannot be handled even by our own GSO.

Reported-by: Wolfgang Walter <linux@stwm.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_offload.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index e893cd18612f..08b62047c67f 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -292,8 +292,6 @@ static struct packet_offload ipv6_packet_offload __read_mostly = {
 static const struct net_offload sit_offload = {
 	.callbacks = {
 		.gso_segment	= ipv6_gso_segment,
-		.gro_receive	= ipv6_gro_receive,
-		.gro_complete	= ipv6_gro_complete,
 	},
 };
 
-- 
cgit v1.2.3


From b8a23e8d8e31abeda2f6cfa36a772414b2a86ffc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Jul 2015 10:19:23 +0200
Subject: caif: fix leaks and race in caif_queue_rcv_skb()

1) If sk_filter() is applied, skb was leaked (not freed)
2) Testing SOCK_DEAD twice is racy :
   packet could be freed while already queued.
3) Remove obsolete comment about caching skb->len

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/caif_socket.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 3cc71b9f5517..cc858919108e 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -121,12 +121,13 @@ static void caif_flow_ctrl(struct sock *sk, int mode)
  * Copied from sock.c:sock_queue_rcv_skb(), but changed so packets are
  * not dropped, but CAIF is sending flow off instead.
  */
-static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static void caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int err;
 	unsigned long flags;
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 	struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
+	bool queued = false;
 
 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 		(unsigned int)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
@@ -139,7 +140,8 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	err = sk_filter(sk, skb);
 	if (err)
-		return err;
+		goto out;
+
 	if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
 		set_rx_flow_off(cf_sk);
 		net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
@@ -147,21 +149,16 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	}
 	skb->dev = NULL;
 	skb_set_owner_r(skb, sk);
-	/* Cache the SKB length before we tack it onto the receive
-	 * queue. Once it is added it no longer belongs to us and
-	 * may be freed by other threads of control pulling packets
-	 * from the queue.
-	 */
 	spin_lock_irqsave(&list->lock, flags);
-	if (!sock_flag(sk, SOCK_DEAD))
+	queued = !sock_flag(sk, SOCK_DEAD);
+	if (queued)
 		__skb_queue_tail(list, skb);
 	spin_unlock_irqrestore(&list->lock, flags);
-
-	if (!sock_flag(sk, SOCK_DEAD))
+out:
+	if (queued)
 		sk->sk_data_ready(sk);
 	else
 		kfree_skb(skb);
-	return 0;
 }
 
 /* Packet Receive Callback function called from CAIF Stack */
-- 
cgit v1.2.3


From 8bf4ada2e21378816b28205427ee6b0e1ca4c5f1 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Fri, 17 Jul 2015 14:01:11 +0300
Subject: net: ratelimit warnings about dst entry refcount underflow or
 overflow

Kernel generates a lot of warnings when dst entry reference counter
overflows and becomes negative. That bug was seen several times at
machines with outdated 3.10.y kernels. Most like it's already fixed
in upstream. Anyway that flood completely kills machine and makes
further debugging impossible.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dst.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dst.c b/net/core/dst.c
index e956ce6d1378..002144bea935 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -284,7 +284,9 @@ void dst_release(struct dst_entry *dst)
 		int newrefcnt;
 
 		newrefcnt = atomic_dec_return(&dst->__refcnt);
-		WARN_ON(newrefcnt < 0);
+		if (unlikely(newrefcnt < 0))
+			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
+					     __func__, dst, newrefcnt);
 		if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
 			call_rcu(&dst->rcu_head, dst_destroy_rcu);
 	}
-- 
cgit v1.2.3


From f6bfc46da6292b630ba389592123f0dd02066172 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 17 Jul 2015 22:38:43 +0200
Subject: sched: cls_bpf: fix panic on filter replace

The following test case causes a NULL pointer dereference in cls_bpf:

  FOO="1,6 0 0 4294967295,"
  tc filter add dev foo parent 1: bpf bytecode "$FOO" flowid 1:1 action ok
  tc filter replace dev foo parent 1: pref 49152 handle 0x1 \
            bpf bytecode "$FOO" flowid 1:1 action drop

The problem is that commit 1f947bf151e9 ("net: sched: rcu'ify cls_bpf")
accidentally swapped the arguments of list_replace_rcu(), the old
element needs to be the first argument and the new element the second.

Fixes: 1f947bf151e9 ("net: sched: rcu'ify cls_bpf")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index c79ecfd36e0f..e5168f8b9640 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -378,7 +378,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		goto errout;
 
 	if (oldprog) {
-		list_replace_rcu(&prog->link, &oldprog->link);
+		list_replace_rcu(&oldprog->link, &prog->link);
 		tcf_unbind_filter(tp, &oldprog->res);
 		call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
 	} else {
-- 
cgit v1.2.3


From ff3532f2655b79058fec035ca54fced69a483084 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 17 Jul 2015 22:38:44 +0200
Subject: sched: cls_flower: fix panic on filter replace

The following test case causes a NULL pointer dereference in cls_flower:

  tc filter add dev foo parent 1: flower eth_type ipv4 action ok flowid 1:1
  tc filter replace dev foo parent 1: pref 49152 handle 0x1 \
            flower eth_type ipv6 action ok flowid 1:1

The problem is that commit 77b9900ef53a ("tc: introduce Flower classifier")
accidentally swapped the arguments of list_replace_rcu(), the old
element needs to be the first argument and the new element the second.

Fixes: 77b9900ef53a ("tc: introduce Flower classifier")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9d37ccd95062..2f3d03f99487 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -499,7 +499,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	*arg = (unsigned long) fnew;
 
 	if (fold) {
-		list_replace_rcu(&fnew->list, &fold->list);
+		list_replace_rcu(&fold->list, &fnew->list);
 		tcf_unbind_filter(tp, &fold->res);
 		call_rcu(&fold->rcu, fl_destroy_filter);
 	} else {
-- 
cgit v1.2.3


From 32b2f4b196b37695fdb42b31afcbc15399d6ef91 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 17 Jul 2015 22:38:45 +0200
Subject: sched: cls_flow: fix panic on filter replace

The following test case causes a NULL pointer dereference in cls_flow:

  tc filter add dev foo parent 1: handle 0x1 flow hash keys dst action ok
  tc filter replace dev foo parent 1: pref 49152 handle 0x1 \
            flow hash keys mark action drop

To be more precise, actually two different panics are fixed, the first
occurs because tcf_exts_init() is not called on the newly allocated
filter when we do a replace. And the second panic uncovered after that
happens since the arguments of list_replace_rcu() are swapped, the old
element needs to be the first argument and the new element the second.

Fixes: 70da9f0bf999 ("net: sched: cls_flow use RCU")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flow.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 76bc3a20ffdb..bb2a0f529c1f 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -425,6 +425,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 	if (!fnew)
 		goto err2;
 
+	tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
+
 	fold = (struct flow_filter *)*arg;
 	if (fold) {
 		err = -EINVAL;
@@ -486,7 +488,6 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 		fnew->mask  = ~0U;
 		fnew->tp = tp;
 		get_random_bytes(&fnew->hashrnd, 4);
-		tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
 	}
 
 	fnew->perturb_timer.function = flow_perturbation;
@@ -526,7 +527,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 	if (*arg == 0)
 		list_add_tail_rcu(&fnew->list, &head->filters);
 	else
-		list_replace_rcu(&fnew->list, &fold->list);
+		list_replace_rcu(&fold->list, &fnew->list);
 
 	*arg = (unsigned long)fnew;
 
-- 
cgit v1.2.3


From 0848f6428ba3a2e42db124d41ac6f548655735bf Mon Sep 17 00:00:00 2001
From: Edward Hyunkoo Jee <edjee@google.com>
Date: Tue, 21 Jul 2015 09:43:59 +0200
Subject: inet: frags: fix defragmented packet's IP header for af_packet

When ip_frag_queue() computes positions, it assumes that the passed
sk_buff does not contain L2 headers.

However, when PACKET_FANOUT_FLAG_DEFRAG is used, IP reassembly
functions can be called on outgoing packets that contain L2 headers.

Also, IPv4 checksum is not corrected after reassembly.

Fixes: 7736d33f4262 ("packet: Add pre-defragmentation support for ipv4 fanouts.")
Signed-off-by: Edward Hyunkoo Jee <edjee@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Jerry Chu <hkchu@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index a50dc6d408d1..31f71b15cfba 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -351,7 +351,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	ihl = ip_hdrlen(skb);
 
 	/* Determine the position of this fragment. */
-	end = offset + skb->len - ihl;
+	end = offset + skb->len - skb_network_offset(skb) - ihl;
 	err = -EINVAL;
 
 	/* Is this the final fragment? */
@@ -381,7 +381,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		goto err;
 
 	err = -ENOMEM;
-	if (!pskb_pull(skb, ihl))
+	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
 		goto err;
 
 	err = pskb_trim_rcsum(skb, end - offset);
@@ -641,6 +641,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		iph->frag_off = 0;
 	}
 
+	ip_send_check(iph);
+
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
 	qp->q.fragments_tail = NULL;
-- 
cgit v1.2.3


From 89e478a2aa58af2548b7f316e4d5b6bcc9eade5b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 22 Jul 2015 07:02:00 +0200
Subject: tcp: suppress a division by zero warning

Andrew Morton reported following warning on one ARM build
with gcc-4.4 :

net/ipv4/inet_hashtables.c: In function 'inet_ehash_locks_alloc':
net/ipv4/inet_hashtables.c:617: warning: division by zero

Even guarded with a test on sizeof(spinlock_t), compiler does not
like current construct on a !CONFIG_SMP build.

Remove the warning by using a temporary variable.

Fixes: 095dc8e0c368 ("tcp: fix/cleanup inet_ehash_locks_alloc()")
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 5f9b063bbe8a..0cb9165421d4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -624,22 +624,21 @@ EXPORT_SYMBOL_GPL(inet_hashinfo_init);
 
 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
 {
+	unsigned int locksz = sizeof(spinlock_t);
 	unsigned int i, nblocks = 1;
 
-	if (sizeof(spinlock_t) != 0) {
+	if (locksz != 0) {
 		/* allocate 2 cache lines or at least one spinlock per cpu */
-		nblocks = max_t(unsigned int,
-				2 * L1_CACHE_BYTES / sizeof(spinlock_t),
-				1);
+		nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
 		nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
 
 		/* no more locks than number of hash buckets */
 		nblocks = min(nblocks, hashinfo->ehash_mask + 1);
 
-		hashinfo->ehash_locks =	kmalloc_array(nblocks, sizeof(spinlock_t),
+		hashinfo->ehash_locks =	kmalloc_array(nblocks, locksz,
 						      GFP_KERNEL | __GFP_NOWARN);
 		if (!hashinfo->ehash_locks)
-			hashinfo->ehash_locks = vmalloc(nblocks * sizeof(spinlock_t));
+			hashinfo->ehash_locks = vmalloc(nblocks * locksz);
 
 		if (!hashinfo->ehash_locks)
 			return -ENOMEM;
-- 
cgit v1.2.3


From 0470eb99b4721586ccac954faac3fa4472da0845 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 21 Jul 2015 16:33:50 +0200
Subject: netlink: don't hold mutex in rcu callback when releasing mmapd ring

Kirill A. Shutemov says:

This simple test-case trigers few locking asserts in kernel:

int main(int argc, char **argv)
{
        unsigned int block_size = 16 * 4096;
        struct nl_mmap_req req = {
                .nm_block_size          = block_size,
                .nm_block_nr            = 64,
                .nm_frame_size          = 16384,
                .nm_frame_nr            = 64 * block_size / 16384,
        };
        unsigned int ring_size;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
        if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
                exit(1);
        if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
                exit(1);

	ring_size = req.nm_block_nr * req.nm_block_size;
	mmap(NULL, 2 * ring_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	return 0;
}

+++ exited with 0 +++
BUG: sleeping function called from invalid context at /home/kas/git/public/linux-mm/kernel/locking/mutex.c:616
in_atomic(): 1, irqs_disabled(): 0, pid: 1, name: init
3 locks held by init/1:
 #0:  (reboot_mutex){+.+...}, at: [<ffffffff81080959>] SyS_reboot+0xa9/0x220
 #1:  ((reboot_notifier_list).rwsem){.+.+..}, at: [<ffffffff8107f379>] __blocking_notifier_call_chain+0x39/0x70
 #2:  (rcu_callback){......}, at: [<ffffffff810d32e0>] rcu_do_batch.isra.49+0x160/0x10c0
Preemption disabled at:[<ffffffff8145365f>] __delay+0xf/0x20

CPU: 1 PID: 1 Comm: init Not tainted 4.1.0-00009-gbddf4c4818e0 #253
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Debian-1.8.2-1 04/01/2014
 ffff88017b3d8000 ffff88027bc03c38 ffffffff81929ceb 0000000000000102
 0000000000000000 ffff88027bc03c68 ffffffff81085a9d 0000000000000002
 ffffffff81ca2a20 0000000000000268 0000000000000000 ffff88027bc03c98
Call Trace:
 <IRQ>  [<ffffffff81929ceb>] dump_stack+0x4f/0x7b
 [<ffffffff81085a9d>] ___might_sleep+0x16d/0x270
 [<ffffffff81085bed>] __might_sleep+0x4d/0x90
 [<ffffffff8192e96f>] mutex_lock_nested+0x2f/0x430
 [<ffffffff81932fed>] ? _raw_spin_unlock_irqrestore+0x5d/0x80
 [<ffffffff81464143>] ? __this_cpu_preempt_check+0x13/0x20
 [<ffffffff8182fc3d>] netlink_set_ring+0x1ed/0x350
 [<ffffffff8182e000>] ? netlink_undo_bind+0x70/0x70
 [<ffffffff8182fe20>] netlink_sock_destruct+0x80/0x150
 [<ffffffff817e484d>] __sk_free+0x1d/0x160
 [<ffffffff817e49a9>] sk_free+0x19/0x20
[..]

Cong Wang says:

We can't hold mutex lock in a rcu callback, [..]

Thomas Graf says:

The socket should be dead at this point. It might be simpler to
add a netlink_release_ring() function which doesn't require
locking at all.

Reported-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Diagnosed-by: Cong Wang <cwang@twopensource.com>
Suggested-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 79 ++++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 9a0ae7172f92..d8e2e3918ce2 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -357,25 +357,52 @@ err1:
 	return NULL;
 }
 
+
+static void
+__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
+		   unsigned int order)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	struct sk_buff_head *queue;
+	struct netlink_ring *ring;
+
+	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
+	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
+
+	spin_lock_bh(&queue->lock);
+
+	ring->frame_max		= req->nm_frame_nr - 1;
+	ring->head		= 0;
+	ring->frame_size	= req->nm_frame_size;
+	ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
+
+	swap(ring->pg_vec_len, req->nm_block_nr);
+	swap(ring->pg_vec_order, order);
+	swap(ring->pg_vec, pg_vec);
+
+	__skb_queue_purge(queue);
+	spin_unlock_bh(&queue->lock);
+
+	WARN_ON(atomic_read(&nlk->mapped));
+
+	if (pg_vec)
+		free_pg_vec(pg_vec, order, req->nm_block_nr);
+}
+
 static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
-			    bool closing, bool tx_ring)
+			    bool tx_ring)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
 	struct netlink_ring *ring;
-	struct sk_buff_head *queue;
 	void **pg_vec = NULL;
 	unsigned int order = 0;
-	int err;
 
 	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
 
-	if (!closing) {
-		if (atomic_read(&nlk->mapped))
-			return -EBUSY;
-		if (atomic_read(&ring->pending))
-			return -EBUSY;
-	}
+	if (atomic_read(&nlk->mapped))
+		return -EBUSY;
+	if (atomic_read(&ring->pending))
+		return -EBUSY;
 
 	if (req->nm_block_nr) {
 		if (ring->pg_vec != NULL)
@@ -407,31 +434,19 @@ static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
 			return -EINVAL;
 	}
 
-	err = -EBUSY;
 	mutex_lock(&nlk->pg_vec_lock);
-	if (closing || atomic_read(&nlk->mapped) == 0) {
-		err = 0;
-		spin_lock_bh(&queue->lock);
-
-		ring->frame_max		= req->nm_frame_nr - 1;
-		ring->head		= 0;
-		ring->frame_size	= req->nm_frame_size;
-		ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
-
-		swap(ring->pg_vec_len, req->nm_block_nr);
-		swap(ring->pg_vec_order, order);
-		swap(ring->pg_vec, pg_vec);
-
-		__skb_queue_purge(queue);
-		spin_unlock_bh(&queue->lock);
-
-		WARN_ON(atomic_read(&nlk->mapped));
+	if (atomic_read(&nlk->mapped) == 0) {
+		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
+		mutex_unlock(&nlk->pg_vec_lock);
+		return 0;
 	}
+
 	mutex_unlock(&nlk->pg_vec_lock);
 
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->nm_block_nr);
-	return err;
+
+	return -EBUSY;
 }
 
 static void netlink_mm_open(struct vm_area_struct *vma)
@@ -900,10 +915,10 @@ static void netlink_sock_destruct(struct sock *sk)
 
 		memset(&req, 0, sizeof(req));
 		if (nlk->rx_ring.pg_vec)
-			netlink_set_ring(sk, &req, true, false);
+			__netlink_set_ring(sk, &req, false, NULL, 0);
 		memset(&req, 0, sizeof(req));
 		if (nlk->tx_ring.pg_vec)
-			netlink_set_ring(sk, &req, true, true);
+			__netlink_set_ring(sk, &req, true, NULL, 0);
 	}
 #endif /* CONFIG_NETLINK_MMAP */
 
@@ -2223,7 +2238,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 			return -EINVAL;
 		if (copy_from_user(&req, optval, sizeof(req)))
 			return -EFAULT;
-		err = netlink_set_ring(sk, &req, false,
+		err = netlink_set_ring(sk, &req,
 				       optname == NETLINK_TX_RING);
 		break;
 	}
-- 
cgit v1.2.3


From bac541e46319c2dc6ff9fd5ad6df59cd38686bdb Mon Sep 17 00:00:00 2001
From: Chris J Arges <chris.j.arges@canonical.com>
Date: Tue, 21 Jul 2015 12:36:33 -0500
Subject: openvswitch: allocate nr_node_ids flow_stats instead of
 num_possible_nodes

Some architectures like POWER can have a NUMA node_possible_map that
contains sparse entries. This causes memory corruption with openvswitch
since it allocates flow_cache with a multiple of num_possible_nodes() and
assumes the node variable returned by for_each_node will index into
flow->stats[node].

Use nr_node_ids to allocate a maximal sparse array instead of
num_possible_nodes().

The crash was noticed after 3af229f2 was applied as it changed the
node_possible_map to match node_online_map on boot.
Fixes: 3af229f2071f5b5cb31664be6109561fbe19c861

Signed-off-by: Chris J Arges <chris.j.arges@canonical.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow_table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 4613df8c8290..65523948fb95 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -752,7 +752,7 @@ int ovs_flow_init(void)
 	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
 
 	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
-				       + (num_possible_nodes()
+				       + (nr_node_ids
 					  * sizeof(struct flow_stats *)),
 				       0, 0, NULL);
 	if (flow_cache == NULL)
-- 
cgit v1.2.3


From 4b31814d20cbe5cd4ccf18089751e77a04afe4f2 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Tue, 21 Jul 2015 21:37:31 -0700
Subject: netfilter: nf_conntrack: Support expectations in different zones

When zones were originally introduced, the expectation functions were
all extended to perform lookup using the zone. However, insertion was
not modified to check the zone. This means that two expectations which
are intended to apply for different connections that have the same tuple
but exist in different zones cannot both be tracked.

Fixes: 5d0aa2ccd4 (netfilter: nf_conntrack: add support for "conntrack zones")
Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_expect.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 7a17070c5dab..b45a4223cb05 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -219,7 +219,8 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
 			a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
 	}
 
-	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
+	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
+	       nf_ct_zone(a->master) == nf_ct_zone(b->master);
 }
 
 static inline int expect_matches(const struct nf_conntrack_expect *a,
-- 
cgit v1.2.3


From 68514471ceceac63c7fa9ad684d882f41be5b2d8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 22 Jul 2015 16:31:17 -0400
Subject: SUNRPC: Fix a backchannel deadlock

xprt_alloc_bc_request() cannot call xprt_free_bc_request() without
deadlocking, since it already holds the xprt->bc_pa_lock.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Fixes: 0d2a970d0ae55 ("SUNRPC: Fix a backchannel race")
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/backchannel_rqst.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 9825ff0f91d6..5a3b50aec397 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -240,8 +240,8 @@ static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
 		req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
 		if (!req)
 			goto not_found;
-		/* Note: this 'free' request adds it to xprt->bc_pa_list */
-		xprt_free_bc_request(req);
+		list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+		xprt->bc_alloc_count++;
 	}
 	req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
 				rq_bc_pa_list);
-- 
cgit v1.2.3


From 1980bd4d829a87ccd21b949f8a11ff1b426f5b0b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 22 Jul 2015 17:05:32 -0400
Subject: SUNRPC: xprt_complete_bc_request must also decrement the free slot
 count

Calling xprt_complete_bc_request() effectively causes the slot to be allocated,
so it needs to decrement the backchannel free slot count as well.

Fixes: 0d2a970d0ae5 ("SUNRPC: Fix a backchannel race")
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/backchannel_rqst.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 5a3b50aec397..6255d141133b 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -336,7 +336,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
 	spin_lock(&xprt->bc_pa_lock);
 	list_del(&req->rq_bc_pa_list);
-	xprt->bc_alloc_count--;
+	xprt_dec_alloc_count(xprt, 1);
 	spin_unlock(&xprt->bc_pa_lock);
 
 	req->rq_private_buf.len = copied;
-- 
cgit v1.2.3


From 25ba265390c09b0a2b2f3fd9ba82e37248b7a371 Mon Sep 17 00:00:00 2001
From: Johan Hedberg <johan.hedberg@intel.com>
Date: Mon, 20 Jul 2015 20:31:25 +0300
Subject: Bluetooth: Fix NULL pointer dereference in smp_conn_security

The l2cap_conn->smp pointer may be NULL for various valid reasons where SMP has
failed to initialize properly. One such scenario is when crypto support is
missing, another when the adapter has been powered on through a legacy method.
The smp_conn_security() function should have the appropriate check for this
situation to avoid NULL pointer dereferences.

Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Cc: stable@vger.kernel.org # 4.0+
---
 net/bluetooth/smp.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 3d0f7d2a0616..ad82324f710f 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -2312,6 +2312,10 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
 		return 1;
 
 	chan = conn->smp;
+	if (!chan) {
+		BT_ERR("SMP security requested but not available");
+		return 1;
+	}
 
 	if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED))
 		return 1;
-- 
cgit v1.2.3


From 18a912e9a832dcfc7db9e055c7e41701ff5f9e69 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 22 Jul 2015 10:43:22 +0300
Subject: ipv4: fib_select_default should match the prefix

fib_trie starting from 4.1 can link fib aliases from
different prefixes in same list. Make sure the alternative
gateways are in same table and for same prefix (0) by
checking tb_id and fa_slen.

Fixes: 79e5ad2ceb00 ("fib_trie: Remove leaf_info")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c7358ea4ae93..e1079583b8b7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1207,12 +1207,17 @@ void fib_select_default(struct fib_result *res)
 	struct fib_info *fi = NULL, *last_resort = NULL;
 	struct hlist_head *fa_head = res->fa_head;
 	struct fib_table *tb = res->table;
+	u8 slen = 32 - res->prefixlen;
 	int order = -1, last_idx = -1;
 	struct fib_alias *fa;
 
 	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
 
+		if (fa->fa_slen != slen)
+			continue;
+		if (fa->tb_id != tb->tb_id)
+			continue;
 		if (next_fi->fib_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-- 
cgit v1.2.3


From 2392debc2be721a7d5b907cbcbc0ebb858dead01 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Wed, 22 Jul 2015 10:43:23 +0300
Subject: ipv4: consider TOS in fib_select_default

fib_select_default considers alternative routes only when
res->fi is for the first alias in res->fa_head. In the
common case this can happen only when the initial lookup
matches the first alias with highest TOS value. This
prevents the alternative routes to require specific TOS.

This patch solves the problem as follows:

- routes that require specific TOS should be returned by
fib_select_default only when TOS matches, as already done
in fib_table_lookup. This rule implies that depending on the
TOS we can have many different lists of alternative gateways
and we have to keep the last used gateway (fa_default) in first
alias for the TOS instead of using single tb_default value.

- as the aliases are ordered by many keys (TOS desc,
fib_priority asc), we restrict the possible results to
routes with matching TOS and lowest metric (fib_priority)
and routes that match any TOS, again with lowest metric.

For example, packet with TOS 8 can not use gw3 (not lowest
metric), gw4 (different TOS) and gw6 (not lowest metric),
all other gateways can be used:

tos 8 via gw1 metric 2 <--- res->fa_head and res->fi
tos 8 via gw2 metric 2
tos 8 via gw3 metric 3
tos 4 via gw4
tos 0 via gw5
tos 0 via gw6 metric 1

Reported-by: Hagen Paul Pfeifer <hagen@jauu.net>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h     |  3 +--
 net/ipv4/fib_lookup.h    |  1 +
 net/ipv4/fib_semantics.c | 36 +++++++++++++++++++++++++-----------
 net/ipv4/fib_trie.c      |  3 ++-
 net/ipv4/route.c         |  2 +-
 5 files changed, 30 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 49c142bdf01e..5fa643b4e891 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -183,7 +183,6 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
 struct fib_table {
 	struct hlist_node	tb_hlist;
 	u32			tb_id;
-	int			tb_default;
 	int			tb_num_default;
 	struct rcu_head		rcu;
 	unsigned long 		*tb_data;
@@ -290,7 +289,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb);
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 			u8 tos, int oif, struct net_device *dev,
 			struct in_device *idev, u32 *itag);
-void fib_select_default(struct fib_result *res);
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 static inline int fib_num_tclassid_users(struct net *net)
 {
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c6211ed60b03..9c02920725db 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -13,6 +13,7 @@ struct fib_alias {
 	u8			fa_state;
 	u8			fa_slen;
 	u32			tb_id;
+	s16			fa_default;
 	struct rcu_head		rcu;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e1079583b8b7..3a06586b170c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1202,28 +1202,40 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
 }
 
 /* Must be invoked inside of an RCU protected region.  */
-void fib_select_default(struct fib_result *res)
+void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
 {
 	struct fib_info *fi = NULL, *last_resort = NULL;
 	struct hlist_head *fa_head = res->fa_head;
 	struct fib_table *tb = res->table;
 	u8 slen = 32 - res->prefixlen;
 	int order = -1, last_idx = -1;
-	struct fib_alias *fa;
+	struct fib_alias *fa, *fa1 = NULL;
+	u32 last_prio = res->fi->fib_priority;
+	u8 last_tos = 0;
 
 	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
 		struct fib_info *next_fi = fa->fa_info;
 
 		if (fa->fa_slen != slen)
 			continue;
+		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+			continue;
 		if (fa->tb_id != tb->tb_id)
 			continue;
+		if (next_fi->fib_priority > last_prio &&
+		    fa->fa_tos == last_tos) {
+			if (last_tos)
+				continue;
+			break;
+		}
+		if (next_fi->fib_flags & RTNH_F_DEAD)
+			continue;
+		last_tos = fa->fa_tos;
+		last_prio = next_fi->fib_priority;
+
 		if (next_fi->fib_scope != res->scope ||
 		    fa->fa_type != RTN_UNICAST)
 			continue;
-
-		if (next_fi->fib_priority > res->fi->fib_priority)
-			break;
 		if (!next_fi->fib_nh[0].nh_gw ||
 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 			continue;
@@ -1233,10 +1245,11 @@ void fib_select_default(struct fib_result *res)
 		if (!fi) {
 			if (next_fi != res->fi)
 				break;
+			fa1 = fa;
 		} else if (!fib_detect_death(fi, order, &last_resort,
-					     &last_idx, tb->tb_default)) {
+					     &last_idx, fa1->fa_default)) {
 			fib_result_assign(res, fi);
-			tb->tb_default = order;
+			fa1->fa_default = order;
 			goto out;
 		}
 		fi = next_fi;
@@ -1244,20 +1257,21 @@ void fib_select_default(struct fib_result *res)
 	}
 
 	if (order <= 0 || !fi) {
-		tb->tb_default = -1;
+		if (fa1)
+			fa1->fa_default = -1;
 		goto out;
 	}
 
 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
-				tb->tb_default)) {
+			      fa1->fa_default)) {
 		fib_result_assign(res, fi);
-		tb->tb_default = order;
+		fa1->fa_default = order;
 		goto out;
 	}
 
 	if (last_idx >= 0)
 		fib_result_assign(res, last_resort);
-	tb->tb_default = last_idx;
+	fa1->fa_default = last_idx;
 out:
 	return;
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 15d32612e3c6..81797e065b21 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1171,6 +1171,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 			new_fa->fa_state = state & ~FA_S_ACCESSED;
 			new_fa->fa_slen = fa->fa_slen;
 			new_fa->tb_id = tb->tb_id;
+			new_fa->fa_default = -1;
 
 			err = switchdev_fib_ipv4_add(key, plen, fi,
 						     new_fa->fa_tos,
@@ -1222,6 +1223,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	new_fa->fa_state = 0;
 	new_fa->fa_slen = slen;
 	new_fa->tb_id = tb->tb_id;
+	new_fa->fa_default = -1;
 
 	/* (Optionally) offload fib entry to switch hardware. */
 	err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
@@ -1990,7 +1992,6 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
 		return NULL;
 
 	tb->tb_id = id;
-	tb->tb_default = -1;
 	tb->tb_num_default = 0;
 	tb->tb_data = (alias ? alias->__data : tb->__data);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d0362a2de3d3..e681b852ced1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2176,7 +2176,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 	if (!res.prefixlen &&
 	    res.table->tb_num_default > 1 &&
 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
-		fib_select_default(&res);
+		fib_select_default(fl4, &res);
 
 	if (!fl4->saddr)
 		fl4->saddr = FIB_RES_PREFSRC(net, res);
-- 
cgit v1.2.3


From fe6bea7f1f3a09fc06d835446d34d3b3b6a543fb Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 21 Jul 2015 16:31:53 -0700
Subject: sch_plug: purge buffered packets during reset

Otherwise the skbuff related structures are not correctly
refcount'ed.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_plug.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
index 89f8fcf73f18..ade9445a55ab 100644
--- a/net/sched/sch_plug.c
+++ b/net/sched/sch_plug.c
@@ -216,6 +216,7 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
 	.peek        =       qdisc_peek_head,
 	.init        =       plug_init,
 	.change      =       plug_change,
+	.reset       =	     qdisc_reset_queue,
 	.owner       =       THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 77e62da6e60c7772971f813f588d372a7f1a4167 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Tue, 21 Jul 2015 16:52:43 -0700
Subject: sch_choke: drop all packets in queue during reset

Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_choke.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'net')

diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 93d5742dc7e0..6a783afe4960 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -385,6 +385,19 @@ static void choke_reset(struct Qdisc *sch)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
 
+	while (q->head != q->tail) {
+		struct sk_buff *skb = q->tab[q->head];
+
+		q->head = (q->head + 1) & q->tab_mask;
+		if (!skb)
+			continue;
+		qdisc_qstats_backlog_dec(sch, skb);
+		--sch->q.qlen;
+		qdisc_drop(skb, sch);
+	}
+
+	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
+	q->head = q->tail = 0;
 	red_restart(&q->vars);
 }
 
-- 
cgit v1.2.3


From cc9f4daa638e660f7a910b8094122561470ac331 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Wed, 22 Jul 2015 12:23:20 +0300
Subject: cgroup: net_cls: fix false-positive "suspicious RCU usage"

In dev_queue_xmit() net_cls protected with rcu-bh.

[  270.730026] ===============================
[  270.730029] [ INFO: suspicious RCU usage. ]
[  270.730033] 4.2.0-rc3+ #2 Not tainted
[  270.730036] -------------------------------
[  270.730040] include/linux/cgroup.h:353 suspicious rcu_dereference_check() usage!
[  270.730041] other info that might help us debug this:
[  270.730043] rcu_scheduler_active = 1, debug_locks = 1
[  270.730045] 2 locks held by dhclient/748:
[  270.730046]  #0:  (rcu_read_lock_bh){......}, at: [<ffffffff81682b70>] __dev_queue_xmit+0x50/0x960
[  270.730085]  #1:  (&qdisc_tx_lock){+.....}, at: [<ffffffff81682d60>] __dev_queue_xmit+0x240/0x960
[  270.730090] stack backtrace:
[  270.730096] CPU: 0 PID: 748 Comm: dhclient Not tainted 4.2.0-rc3+ #2
[  270.730098] Hardware name: OpenStack Foundation OpenStack Nova, BIOS Bochs 01/01/2011
[  270.730100]  0000000000000001 ffff8800bafeba58 ffffffff817ad487 0000000000000007
[  270.730103]  ffff880232a0a780 ffff8800bafeba88 ffffffff810ca4f2 ffff88022fb23e00
[  270.730105]  ffff880232a0a780 ffff8800bafebb68 ffff8800bafebb68 ffff8800bafebaa8
[  270.730108] Call Trace:
[  270.730121]  [<ffffffff817ad487>] dump_stack+0x4c/0x65
[  270.730148]  [<ffffffff810ca4f2>] lockdep_rcu_suspicious+0xe2/0x120
[  270.730153]  [<ffffffff816a62d2>] task_cls_state+0x92/0xa0
[  270.730158]  [<ffffffffa00b534f>] cls_cgroup_classify+0x4f/0x120 [cls_cgroup]
[  270.730164]  [<ffffffff816aac74>] tc_classify_compat+0x74/0xc0
[  270.730166]  [<ffffffff816ab573>] tc_classify+0x33/0x90
[  270.730170]  [<ffffffffa00bcb0a>] htb_enqueue+0xaa/0x4a0 [sch_htb]
[  270.730172]  [<ffffffff81682e26>] __dev_queue_xmit+0x306/0x960
[  270.730174]  [<ffffffff81682b70>] ? __dev_queue_xmit+0x50/0x960
[  270.730176]  [<ffffffff816834a3>] dev_queue_xmit_sk+0x13/0x20
[  270.730185]  [<ffffffff81787770>] dev_queue_xmit+0x10/0x20
[  270.730187]  [<ffffffff8178b91c>] packet_snd.isra.62+0x54c/0x760
[  270.730190]  [<ffffffff8178be25>] packet_sendmsg+0x2f5/0x3f0
[  270.730203]  [<ffffffff81665245>] ? sock_def_readable+0x5/0x190
[  270.730210]  [<ffffffff817b64bb>] ? _raw_spin_unlock+0x2b/0x40
[  270.730216]  [<ffffffff8173bcbc>] ? unix_dgram_sendmsg+0x5cc/0x640
[  270.730219]  [<ffffffff8165f367>] sock_sendmsg+0x47/0x50
[  270.730221]  [<ffffffff8165f42f>] sock_write_iter+0x7f/0xd0
[  270.730232]  [<ffffffff811fd4c7>] __vfs_write+0xa7/0xf0
[  270.730234]  [<ffffffff811fe5b8>] vfs_write+0xb8/0x190
[  270.730236]  [<ffffffff811fe8c2>] SyS_write+0x52/0xb0
[  270.730239]  [<ffffffff817b6bae>] entry_SYSCALL_64_fastpath+0x12/0x76

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netclassid_cgroup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 1f2a126f4ffa..6441f47b1a8f 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,8 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
 
 struct cgroup_cls_state *task_cls_state(struct task_struct *p)
 {
-	return css_cls_state(task_css(p, net_cls_cgrp_id));
+	return css_cls_state(task_css_check(p, net_cls_cgrp_id,
+					    rcu_read_lock_bh_held()));
 }
 EXPORT_SYMBOL_GPL(task_cls_state);
 
-- 
cgit v1.2.3


From 963ad94853000ab100f5ff19eea80095660d41b4 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 22 Jul 2015 13:03:40 +0200
Subject: bridge: netlink: fix slave_changelink/br_setport race conditions

Since slave_changelink support was added there have been a few race
conditions when using br_setport() since some of the port functions it
uses require the bridge lock. It is very easy to trigger a lockup due to
some internal spin_lock() usage without bh disabled, also it's possible to
get the bridge into an inconsistent state.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Fixes: 3ac636b8591c ("bridge: implement rtnl_link_ops->slave_changelink")
Reviewed-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 364bdc98bd9b..3da5525eb8a2 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -693,9 +693,17 @@ static int br_port_slave_changelink(struct net_device *brdev,
 				    struct nlattr *tb[],
 				    struct nlattr *data[])
 {
+	struct net_bridge *br = netdev_priv(brdev);
+	int ret;
+
 	if (!data)
 		return 0;
-	return br_setport(br_port_get_rtnl(dev), data);
+
+	spin_lock_bh(&br->lock);
+	ret = br_setport(br_port_get_rtnl(dev), data);
+	spin_unlock_bh(&br->lock);
+
+	return ret;
 }
 
 static int br_port_fill_slave_info(struct sk_buff *skb,
-- 
cgit v1.2.3


From 81296fc67319d96ea6f7f43a07494394e1236a19 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 22 Jul 2015 16:31:49 +0200
Subject: net: sctp: stop spamming klog with rfc6458, 5.3.2. deprecation
 warnings

Back then when we added support for SCTP_SNDINFO/SCTP_RCVINFO from
RFC6458 5.3.4/5.3.5, we decided to add a deprecation warning for the
(as per RFC deprecated) SCTP_SNDRCV via commit bbbea41d5e53 ("net:
sctp: deprecate rfc6458, 5.3.2. SCTP_SNDRCV support"), see [1].

Imho, it was not a good idea, and we should just revert that message
for a couple of reasons:

  1) It's uapi and therefore set in stone forever.

  2) To be able to run on older and newer kernels, an SCTP application
     would need to probe for both, SCTP_SNDRCV, but also SCTP_SNDINFO/
     SCTP_RCVINFO support, so that on older kernels, it can make use
     of SCTP_SNDRCV, and on newer kernels SCTP_SNDINFO/SCTP_RCVINFO.
     In my (limited) experience, a lot of SCTP appliances are migrating
     to newer kernels only ve(ee)ry slowly.

  3) Some people don't have the chance to change their applications,
     f.e. due to proprietary legacy stuff. So, they'll hit this warning
     in fast path and are stuck with older kernels.

But i.e. due to point 1) I really fail to see the benefit of a warning.
So just revert that for now, the issue was reported up Jamal.

  [1] http://thread.gmane.org/gmane.linux.network/321960/

Reported-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Michael Tuexen <tuexen@fh-muenster.de>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 1425ec2bbd5a..17bef01b9aa3 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2200,12 +2200,6 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval,
 	if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen))
 		return -EFAULT;
 
-	if (sctp_sk(sk)->subscribe.sctp_data_io_event)
-		pr_warn_ratelimited(DEPRECATED "%s (pid %d) "
-				    "Requested SCTP_SNDRCVINFO event.\n"
-				    "Use SCTP_RCVINFO through SCTP_RECVRCVINFO option instead.\n",
-				    current->comm, task_pid_nr(current));
-
 	/* At the time when a user app subscribes to SCTP_SENDER_DRY_EVENT,
 	 * if there is no data to be sent or retransmit, the stack will
 	 * immediately send up this notification.
-- 
cgit v1.2.3


From d1fe19444d82e399e38c1594c71b850eca8e9de0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 12:05:37 +0200
Subject: inet: frag: don't re-use chainlist for evictor

commit 65ba1f1ec0eff ("inet: frags: fix a race between inet_evict_bucket
and inet_frag_kill") describes the bug, but the fix doesn't work reliably.

Problem is that ->flags member can be set on other cpu without chainlock
being held by that task, i.e. the RMW-Cycle can clear INET_FRAG_EVICTED
bit after we put the element on the evictor private list.

We can crash when walking the 'private' evictor list since an element can
be deleted from list underneath the evictor.

Join work with Nikolay Alexandrov.

Fixes: b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue")
Reported-by: Johan Schuijt <johan@transip.nl>
Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Nikolay Alexandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 2 ++
 net/ipv4/inet_fragment.c | 8 +++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index e1300b3dd597..56a3a5685f76 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -45,6 +45,7 @@ enum {
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
+ * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
  */
 struct inet_frag_queue {
 	spinlock_t		lock;
@@ -59,6 +60,7 @@ struct inet_frag_queue {
 	__u8			flags;
 	u16			max_size;
 	struct netns_frags	*net;
+	struct hlist_node	list_evictor;
 };
 
 #define INETFRAGS_HASHSZ	1024
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5e346a082e5f..172234864fec 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -151,14 +151,13 @@ evict_again:
 		}
 
 		fq->flags |= INET_FRAG_EVICTED;
-		hlist_del(&fq->list);
-		hlist_add_head(&fq->list, &expired);
+		hlist_add_head(&fq->list_evictor, &expired);
 		++evicted;
 	}
 
 	spin_unlock(&hb->chain_lock);
 
-	hlist_for_each_entry_safe(fq, n, &expired, list)
+	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
 		f->frag_expire((unsigned long) fq);
 
 	return evicted;
@@ -284,8 +283,7 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 	struct inet_frag_bucket *hb;
 
 	hb = get_frag_bucket_locked(fq, f);
-	if (!(fq->flags & INET_FRAG_EVICTED))
-		hlist_del(&fq->list);
+	hlist_del(&fq->list);
 	spin_unlock(&hb->chain_lock);
 }
 
-- 
cgit v1.2.3


From 0e60d245a0be7fdbb723607f1d6621007916b252 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 12:05:38 +0200
Subject: inet: frag: change *_frag_mem_limit functions to take netns_frags as
 argument

Followup patch will call it after inet_frag_queue was freed, so q->net
doesn't work anymore (but netf = q->net; free(q); mem_limit(netf) would).

Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 |  8 ++++----
 net/ieee802154/6lowpan/reassembly.c     |  6 +++---
 net/ipv4/inet_fragment.c                |  4 ++--
 net/ipv4/ip_fragment.c                  | 10 +++++-----
 net/ipv6/netfilter/nf_conntrack_reasm.c |  6 +++---
 net/ipv6/reassembly.c                   |  6 +++---
 6 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 56a3a5685f76..e71ca17024f2 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -141,14 +141,14 @@ static inline int frag_mem_limit(struct netns_frags *nf)
 	return percpu_counter_read(&nf->mem);
 }
 
-static inline void sub_frag_mem_limit(struct inet_frag_queue *q, int i)
+static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&q->net->mem, -i, frag_percpu_counter_batch);
+	__percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
 }
 
-static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
+static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&q->net->mem, i, frag_percpu_counter_batch);
+	__percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
 }
 
 static inline void init_frag_mem_limit(struct netns_frags *nf)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index f46e4d1306f2..214d44aef35b 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -207,7 +207,7 @@ found:
 	} else {
 		fq->q.meat += skb->len;
 	}
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    fq->q.meat == fq->q.len) {
@@ -287,7 +287,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 		clone->data_len = clone->len;
 		head->data_len -= clone->len;
 		head->len -= clone->len;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	WARN_ON(head == NULL);
@@ -310,7 +310,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&fq->q, sum_truesize);
+	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 172234864fec..4473232e4e88 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -328,7 +328,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 		fp = xp;
 	}
 	sum = sum_truesize + f->qsize;
-	sub_frag_mem_limit(q, sum);
+	sub_frag_mem_limit(q->net, sum);
 
 	if (f->destructor)
 		f->destructor(q);
@@ -388,7 +388,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 
 	q->net = nf;
 	f->constructor(q, arg);
-	add_frag_mem_limit(q, f->qsize);
+	add_frag_mem_limit(nf, f->qsize);
 
 	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
 	spin_lock_init(&q->lock);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 31f71b15cfba..b4a77d021c0d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -309,7 +309,7 @@ static int ip_frag_reinit(struct ipq *qp)
 		kfree_skb(fp);
 		fp = xp;
 	} while (fp);
-	sub_frag_mem_limit(&qp->q, sum_truesize);
+	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
@@ -455,7 +455,7 @@ found:
 				qp->q.fragments = next;
 
 			qp->q.meat -= free_it->len;
-			sub_frag_mem_limit(&qp->q, free_it->truesize);
+			sub_frag_mem_limit(qp->q.net, free_it->truesize);
 			kfree_skb(free_it);
 		}
 	}
@@ -479,7 +479,7 @@ found:
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
-	add_frag_mem_limit(&qp->q, skb->truesize);
+	add_frag_mem_limit(qp->q.net, skb->truesize);
 	if (offset == 0)
 		qp->q.flags |= INET_FRAG_FIRST_IN;
 
@@ -587,7 +587,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		add_frag_mem_limit(&qp->q, clone->truesize);
+		add_frag_mem_limit(qp->q.net, clone->truesize);
 	}
 
 	skb_push(head, head->data - skb_network_header(head));
@@ -615,7 +615,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&qp->q, sum_truesize);
+	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6f187c8d8a1b..6d02498172c1 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -348,7 +348,7 @@ found:
 	fq->ecn |= ecn;
 	if (payload_len > fq->q.max_size)
 		fq->q.max_size = payload_len;
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
@@ -430,7 +430,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 		clone->ip_summed = head->ip_summed;
 
 		NFCT_FRAG6_CB(clone)->orig = NULL;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
@@ -454,7 +454,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
 			head->csum = csum_add(head->csum, fp->csum);
 		head->truesize += fp->truesize;
 	}
-	sub_frag_mem_limit(&fq->q, head->truesize);
+	sub_frag_mem_limit(fq->q.net, head->truesize);
 
 	head->ignore_df = 1;
 	head->next = NULL;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 8ffa2c8cce77..5c3bbca6a150 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -330,7 +330,7 @@ found:
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
 	fq->ecn |= ecn;
-	add_frag_mem_limit(&fq->q, skb->truesize);
+	add_frag_mem_limit(fq->q.net, skb->truesize);
 
 	/* The first fragment.
 	 * nhoffset is obtained from the first fragment, of course.
@@ -443,7 +443,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		head->len -= clone->len;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
-		add_frag_mem_limit(&fq->q, clone->truesize);
+		add_frag_mem_limit(fq->q.net, clone->truesize);
 	}
 
 	/* We have to remove fragment header from datagram and to relocate
@@ -481,7 +481,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 		}
 		fp = next;
 	}
-	sub_frag_mem_limit(&fq->q, sum_truesize);
+	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
 	head->next = NULL;
 	head->dev = dev;
-- 
cgit v1.2.3


From 5719b296fb81502d0dbbb4e87b3235e5bdcdfc6b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 12:05:39 +0200
Subject: inet: frag: don't wait for timer deletion when evicting

Frank reports 'NMI watchdog: BUG: soft lockup' errors when
load is high.  Instead of (potentially) unbounded restarts of the
eviction process, just skip to the next entry.

One caveat is that, when a netns is exiting, a timer may still be running
by the time inet_evict_bucket returns.

We use the frag memory accounting to wait for outstanding timers,
so that when we free the percpu counter we can be sure no running
timer will trip over it.

Reported-and-tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4473232e4e88..a00ca4c00c35 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -131,24 +131,14 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 	unsigned int evicted = 0;
 	HLIST_HEAD(expired);
 
-evict_again:
 	spin_lock(&hb->chain_lock);
 
 	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
 		if (!inet_fragq_should_evict(fq))
 			continue;
 
-		if (!del_timer(&fq->timer)) {
-			/* q expiring right now thus increment its refcount so
-			 * it won't be freed under us and wait until the timer
-			 * has finished executing then destroy it
-			 */
-			atomic_inc(&fq->refcnt);
-			spin_unlock(&hb->chain_lock);
-			del_timer_sync(&fq->timer);
-			inet_frag_put(fq, f);
-			goto evict_again;
-		}
+		if (!del_timer(&fq->timer))
+			continue;
 
 		fq->flags |= INET_FRAG_EVICTED;
 		hlist_add_head(&fq->list_evictor, &expired);
@@ -239,18 +229,20 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 	int i;
 
 	nf->low_thresh = 0;
-	local_bh_disable();
 
 evict_again:
+	local_bh_disable();
 	seq = read_seqbegin(&f->rnd_seqlock);
 
 	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
 		inet_evict_bucket(f, &f->hash[i]);
 
-	if (read_seqretry(&f->rnd_seqlock, seq))
-		goto evict_again;
-
 	local_bh_enable();
+	cond_resched();
+
+	if (read_seqretry(&f->rnd_seqlock, seq) ||
+	    percpu_counter_sum(&nf->mem))
+		goto evict_again;
 
 	percpu_counter_destroy(&nf->mem);
 }
@@ -284,6 +276,7 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 
 	hb = get_frag_bucket_locked(fq, f);
 	hlist_del(&fq->list);
+	fq->flags |= INET_FRAG_COMPLETE;
 	spin_unlock(&hb->chain_lock);
 }
 
@@ -295,7 +288,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
 		fq_unlink(fq, f);
 		atomic_dec(&fq->refcnt);
-		fq->flags |= INET_FRAG_COMPLETE;
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
@@ -328,11 +320,12 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 		fp = xp;
 	}
 	sum = sum_truesize + f->qsize;
-	sub_frag_mem_limit(q->net, sum);
 
 	if (f->destructor)
 		f->destructor(q);
 	kmem_cache_free(f->frags_cachep, q);
+
+	sub_frag_mem_limit(nf, sum);
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-- 
cgit v1.2.3


From caaecdd3d3f8ec0ea9906c54b1dd8ec8316d26b9 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Thu, 23 Jul 2015 12:05:40 +0200
Subject: inet: frags: remove INET_FRAG_EVICTED and use list_evictor for the
 test

We can simply remove the INET_FRAG_EVICTED flag to avoid all the flags
race conditions with the evictor and use a participation test for the
evictor list, when we're at that point (after inet_frag_kill) in the
timer there're 2 possible cases:

1. The evictor added the entry to its evictor list while the timer was
waiting for the chainlock
or
2. The timer unchained the entry and the evictor won't see it

In both cases we should be able to see list_evictor correctly due
to the sync on the chainlock.

Joint work with Florian Westphal.

Tested-by: Frank Schreuder <fschreuder@transip.nl>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 7 +++++--
 net/ipv4/inet_fragment.c | 1 -
 net/ipv4/ip_fragment.c   | 2 +-
 net/ipv6/reassembly.c    | 2 +-
 4 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index e71ca17024f2..53eead2da743 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -21,13 +21,11 @@ struct netns_frags {
  * @INET_FRAG_FIRST_IN: first fragment has arrived
  * @INET_FRAG_LAST_IN: final fragment has arrived
  * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction
- * @INET_FRAG_EVICTED: frag queue is being evicted
  */
 enum {
 	INET_FRAG_FIRST_IN	= BIT(0),
 	INET_FRAG_LAST_IN	= BIT(1),
 	INET_FRAG_COMPLETE	= BIT(2),
-	INET_FRAG_EVICTED	= BIT(3)
 };
 
 /**
@@ -127,6 +125,11 @@ static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f
 		inet_frag_destroy(q, f);
 }
 
+static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+{
+	return !hlist_unhashed(&q->list_evictor);
+}
+
 /* Memory Tracking Functions. */
 
 /* The default percpu_counter batch size is not big enough to scale to
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index a00ca4c00c35..d0a7c0319e3d 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -140,7 +140,6 @@ inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
 		if (!del_timer(&fq->timer))
 			continue;
 
-		fq->flags |= INET_FRAG_EVICTED;
 		hlist_add_head(&fq->list_evictor, &expired);
 		++evicted;
 	}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b4a77d021c0d..921138f6c97c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -202,7 +202,7 @@ static void ip_expire(unsigned long arg)
 	ipq_kill(qp);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 
-	if (!(qp->q.flags & INET_FRAG_EVICTED)) {
+	if (!inet_frag_evicting(&qp->q)) {
 		struct sk_buff *head = qp->q.fragments;
 		const struct iphdr *iph;
 		int err;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c3bbca6a150..f1159bb76e0a 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -144,7 +144,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 
-	if (fq->q.flags & INET_FRAG_EVICTED)
+	if (inet_frag_evicting(&fq->q))
 		goto out_rcu_unlock;
 
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
-- 
cgit v1.2.3


From dfbafc995304ebb9a9b03f65083e6e9cea143b20 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 24 Jul 2015 18:19:25 +0200
Subject: tcp: fix recv with flags MSG_WAITALL | MSG_PEEK

Currently, tcp_recvmsg enters a busy loop in sk_wait_data if called
with flags = MSG_WAITALL | MSG_PEEK.

sk_wait_data waits for sk_receive_queue not empty, but in this case,
the receive queue is not empty, but does not contain any skb that we
can use.

Add a "last skb seen on receive queue" argument to sk_wait_data, so
that it sleeps until the receive queue has new skbs.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=99461
Link: https://sourceware.org/bugzilla/show_bug.cgi?id=18493
Link: https://bugzilla.redhat.com/show_bug.cgi?id=1205258
Reported-by: Enrico Scholz <rh-bugzilla@ensc.de>
Reported-by: Dan Searle <dan@censornet.com>
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h |  2 +-
 net/core/sock.c    |  5 +++--
 net/dccp/proto.c   |  2 +-
 net/ipv4/tcp.c     | 11 +++++++----
 net/llc/af_llc.c   |  4 ++--
 5 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 05a8c1aea251..f21f0708ec59 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -902,7 +902,7 @@ void sk_stream_kill_queues(struct sock *sk);
 void sk_set_memalloc(struct sock *sk);
 void sk_clear_memalloc(struct sock *sk);
 
-int sk_wait_data(struct sock *sk, long *timeo);
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);
 
 struct request_sock_ops;
 struct timewait_sock_ops;
diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db46070..8a14f1285fc4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1967,20 +1967,21 @@ static void __release_sock(struct sock *sk)
  * sk_wait_data - wait for data to arrive at sk_receive_queue
  * @sk:    sock to wait on
  * @timeo: for how long
+ * @skb:   last skb seen on sk_receive_queue
  *
  * Now socket state including sk->sk_err is changed only under lock,
  * hence we may omit checks after joining wait queue.
  * We check receive queue before schedule() only as optimization;
  * it is very likely that release_sock() added new data.
  */
-int sk_wait_data(struct sock *sk, long *timeo)
+int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
 {
 	int rc;
 	DEFINE_WAIT(wait);
 
 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
-	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
+	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
 	finish_wait(sk_sleep(sk), &wait);
 	return rc;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 52a94016526d..b5cf13a28009 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -886,7 +886,7 @@ verify_sock_status:
 			break;
 		}
 
-		sk_wait_data(sk, &timeo);
+		sk_wait_data(sk, &timeo, NULL);
 		continue;
 	found_ok_skb:
 		if (len > skb->len)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7f4056785acc..45534a5ab430 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -780,7 +780,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 				ret = -EAGAIN;
 				break;
 			}
-			sk_wait_data(sk, &timeo);
+			sk_wait_data(sk, &timeo, NULL);
 			if (signal_pending(current)) {
 				ret = sock_intr_errno(timeo);
 				break;
@@ -1575,7 +1575,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	int target;		/* Read at least this many bytes */
 	long timeo;
 	struct task_struct *user_recv = NULL;
-	struct sk_buff *skb;
+	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
 
 	if (unlikely(flags & MSG_ERRQUEUE))
@@ -1635,7 +1635,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
 		/* Next get a buffer. */
 
+		last = skb_peek_tail(&sk->sk_receive_queue);
 		skb_queue_walk(&sk->sk_receive_queue, skb) {
+			last = skb;
 			/* Now that we have two receive queues this
 			 * shouldn't happen.
 			 */
@@ -1754,8 +1756,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
 			lock_sock(sk);
-		} else
-			sk_wait_data(sk, &timeo);
+		} else {
+			sk_wait_data(sk, &timeo, last);
+		}
 
 		if (user_recv) {
 			int chunk;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 8fd9febaa5ba..8dab4e569571 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -613,7 +613,7 @@ static int llc_wait_data(struct sock *sk, long timeo)
 		if (signal_pending(current))
 			break;
 		rc = 0;
-		if (sk_wait_data(sk, &timeo))
+		if (sk_wait_data(sk, &timeo, NULL))
 			break;
 	}
 	return rc;
@@ -802,7 +802,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			release_sock(sk);
 			lock_sock(sk);
 		} else
-			sk_wait_data(sk, &timeo);
+			sk_wait_data(sk, &timeo, NULL);
 
 		if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) {
 			net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n",
-- 
cgit v1.2.3


From 743c69e7c089ba1bea1b207c5829dd079a4e98f9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 27 Jul 2015 10:55:35 +1000
Subject: sunrpc: translate -EAGAIN to -ENOBUFS when socket is writable.

The networking layer does not reliably report the distinction between
a non-block write failing because:
 1/ the queue is too full already and
 2/ a memory allocation attempt failed.

The distinction is important because in the first case it is
appropriate to retry as soon as the socket reports that it is
writable, and in the second case a small delay is required as the
socket will most likely report as writable but kmalloc could still
fail.

sk_stream_wait_memory() exhibits this distinction nicely, setting
'vm_wait' if a small wait is needed.  However in the non-blocking case
it always returns -EAGAIN no matter the cause of the failure.  This
-EAGAIN call get all the way to sunrpc.

The sunrpc layer expects EAGAIN to indicate the first cause, and
ENOBUFS to indicate the second.  Various documentation suggests that
this is not unreasonable, but does not guarantee the desired error
codes.

The result of getting -EAGAIN when -ENOBUFS is expected is that the
send is tried again in a tight loop and soft lockups are reported.

so: add tests after calls to xs_sendpages() to translate -EAGAIN into
-ENOBUFS if the socket is writable.  This cannot happen inside
xs_sendpages() as the test for "is socket writable" is different
between TCP and UDP.

With this change, the tight loop retrying xs_sendpages() becomes a
loop which only retries every 250ms, and so will not trigger a
soft-lockup warning.

It is possible that the write did fail because the queue was too full
and by the time xs_sendpages() completed, the queue was writable
again.  In this case an extra 250ms delay is inserted that isn't
really needed.  This circumstance suggests a degree of congestion so a
delay is not necessarily a bad thing, and it can only cause a single
250ms delay, not a series of them.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 44c1927b68c7..4f48b1a19e9f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -527,6 +527,10 @@ static int xs_local_send_request(struct rpc_task *task)
 			      true, &sent);
 	dprintk("RPC:       %s(%u) = %d\n",
 			__func__, xdr->len - req->rq_bytes_sent, status);
+
+	if (status == -EAGAIN && sock_writeable(transport->inet))
+		status = -ENOBUFS;
+
 	if (likely(sent > 0) || status == 0) {
 		req->rq_bytes_sent += sent;
 		req->rq_xmit_bytes_sent += sent;
@@ -590,6 +594,9 @@ static int xs_udp_send_request(struct rpc_task *task)
 	if (status == -EPERM)
 		goto process_status;
 
+	if (status == -EAGAIN && sock_writeable(transport->inet))
+		status = -ENOBUFS;
+
 	if (sent > 0 || status == 0) {
 		req->rq_xmit_bytes_sent += sent;
 		if (sent >= req->rq_slen)
@@ -687,6 +694,8 @@ static int xs_tcp_send_request(struct rpc_task *task)
 		status = -EAGAIN;
 		break;
 	}
+	if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
+		status = -ENOBUFS;
 
 	switch (status) {
 	case -ENOTSOCK:
-- 
cgit v1.2.3


From 1513069edcf8dd86cfd8d5daef482b97d6b93df6 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Mon, 27 Jul 2015 13:08:06 -0700
Subject: fib_trie: Drop unnecessary calls to leaf_pull_suffix

It was reported that update_suffix was taking a long time on systems where
a large number of leaves were attached to a single node.  As it turns out
fib_table_flush was calling update_suffix for each leaf that didn't have all
of the aliases stripped from it.  As a result, on this large node removing
one leaf would result in us calling update_suffix for every other leaf on
the node.

The fix is to just remove the calls to leaf_pull_suffix since they are
redundant as we already have a call in resize that will go through and
update the suffix length for the node before we exit out of
fib_table_flush or fib_table_flush_external.

Reported-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Tested-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 81797e065b21..37c4bb89a708 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1793,8 +1793,6 @@ void fib_table_flush_external(struct fib_table *tb)
 		if (hlist_empty(&n->leaf)) {
 			put_child_root(pn, n->key, NULL);
 			node_free(n);
-		} else {
-			leaf_pull_suffix(pn, n);
 		}
 	}
 }
@@ -1864,8 +1862,6 @@ int fib_table_flush(struct fib_table *tb)
 		if (hlist_empty(&n->leaf)) {
 			put_child_root(pn, n->key, NULL);
 			node_free(n);
-		} else {
-			leaf_pull_suffix(pn, n);
 		}
 	}
 
-- 
cgit v1.2.3


From f580dd042823294b5b548e0f8bf1ba7a4b114fa5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 11 Jul 2015 17:48:52 +0200
Subject: SUNRPC: Report TCP errors to the caller

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4f48b1a19e9f..6a21368bdd8e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -677,9 +677,6 @@ static int xs_tcp_send_request(struct rpc_task *task)
 		dprintk("RPC:       xs_tcp_send_request(%u) = %d\n",
 				xdr->len - req->rq_bytes_sent, status);
 
-		if (unlikely(sent == 0 && status < 0))
-			break;
-
 		/* If we've sent the entire packet, immediately
 		 * reset the count of bytes sent. */
 		req->rq_bytes_sent += sent;
@@ -689,10 +686,12 @@ static int xs_tcp_send_request(struct rpc_task *task)
 			return 0;
 		}
 
-		if (sent != 0)
-			continue;
-		status = -EAGAIN;
-		break;
+		if (status < 0)
+			break;
+		if (sent == 0) {
+			status = -EAGAIN;
+			break;
+		}
 	}
 	if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
 		status = -ENOBUFS;
-- 
cgit v1.2.3


From 158cd4af8dedbda0d612d448c724c715d0dda649 Mon Sep 17 00:00:00 2001
From: Lars Westerhoff <lars.westerhoff@newtec.eu>
Date: Tue, 28 Jul 2015 01:32:21 +0300
Subject: packet: missing dev_put() in packet_do_bind()

When binding a PF_PACKET socket, the use count of the bound interface is
always increased with dev_hold in dev_get_by_{index,name}.  However,
when rebound with the same protocol and device as in the previous bind
the use count of the interface was not decreased.  Ultimately, this
caused the deletion of the interface to fail with the following message:

unregister_netdevice: waiting for dummy0 to become free. Usage count = 1

This patch moves the dev_put out of the conditional part that was only
executed when either the protocol or device changed on a bind.

Fixes: 902fefb82ef7 ('packet: improve socket create/bind latency in some cases')
Signed-off-by: Lars Westerhoff <lars.westerhoff@newtec.eu>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c9e8741226c6..c7c42eb617ef 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2784,7 +2784,7 @@ static int packet_release(struct socket *sock)
 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
 {
 	struct packet_sock *po = pkt_sk(sk);
-	const struct net_device *dev_curr;
+	struct net_device *dev_curr;
 	__be16 proto_curr;
 	bool need_rehook;
 
@@ -2808,15 +2808,13 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
 
 		po->num = proto;
 		po->prot_hook.type = proto;
-
-		if (po->prot_hook.dev)
-			dev_put(po->prot_hook.dev);
-
 		po->prot_hook.dev = dev;
 
 		po->ifindex = dev ? dev->ifindex : 0;
 		packet_cached_dev_assign(po, dev);
 	}
+	if (dev_curr)
+		dev_put(dev_curr);
 
 	if (proto == 0 || !need_rehook)
 		goto out_unlock;
-- 
cgit v1.2.3


From 76b91c32dd86f624b5df038dcb68cddd5a18d355 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <razor@blackwall.org>
Date: Thu, 23 Jul 2015 11:01:05 -0700
Subject: bridge: stp: when using userspace stp stop kernel hello and hold
 timers

These should be handled only by the respective STP which is in control.
They become problematic for devices with limited resources with many
ports because the hold_timer is per port and fires each second and the
hello timer fires each 2 seconds even though it's global. While in
user-space STP mode these timers are completely unnecessary so it's better
to keep them off.
Also ensure that when the bridge is up these timers are started only when
running with kernel STP.

Signed-off-by: Satish Ashok <sashok@cumulusnetworks.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_stp.c       |  5 +++--
 net/bridge/br_stp_if.c    | 13 ++++++++++++-
 net/bridge/br_stp_timer.c |  4 +++-
 3 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index b4b6dab9c285..ed74ffaa851f 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -209,8 +209,9 @@ void br_transmit_config(struct net_bridge_port *p)
 		br_send_config_bpdu(p, &bpdu);
 		p->topology_change_ack = 0;
 		p->config_pending = 0;
-		mod_timer(&p->hold_timer,
-			  round_jiffies(jiffies + BR_HOLD_TIME));
+		if (p->br->stp_enabled == BR_KERNEL_STP)
+			mod_timer(&p->hold_timer,
+				  round_jiffies(jiffies + BR_HOLD_TIME));
 	}
 }
 
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index a2730e7196cd..4ca449a16132 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -48,7 +48,8 @@ void br_stp_enable_bridge(struct net_bridge *br)
 	struct net_bridge_port *p;
 
 	spin_lock_bh(&br->lock);
-	mod_timer(&br->hello_timer, jiffies + br->hello_time);
+	if (br->stp_enabled == BR_KERNEL_STP)
+		mod_timer(&br->hello_timer, jiffies + br->hello_time);
 	mod_timer(&br->gc_timer, jiffies + HZ/10);
 
 	br_config_bpdu_generation(br);
@@ -127,6 +128,7 @@ static void br_stp_start(struct net_bridge *br)
 	int r;
 	char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL };
 	char *envp[] = { NULL };
+	struct net_bridge_port *p;
 
 	r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
 
@@ -140,6 +142,10 @@ static void br_stp_start(struct net_bridge *br)
 	if (r == 0) {
 		br->stp_enabled = BR_USER_STP;
 		br_debug(br, "userspace STP started\n");
+		/* Stop hello and hold timers */
+		del_timer(&br->hello_timer);
+		list_for_each_entry(p, &br->port_list, list)
+			del_timer(&p->hold_timer);
 	} else {
 		br->stp_enabled = BR_KERNEL_STP;
 		br_debug(br, "using kernel STP\n");
@@ -156,12 +162,17 @@ static void br_stp_stop(struct net_bridge *br)
 	int r;
 	char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL };
 	char *envp[] = { NULL };
+	struct net_bridge_port *p;
 
 	if (br->stp_enabled == BR_USER_STP) {
 		r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
 		br_info(br, "userspace STP stopped, return code %d\n", r);
 
 		/* To start timers on any ports left in blocking */
+		mod_timer(&br->hello_timer, jiffies + br->hello_time);
+		list_for_each_entry(p, &br->port_list, list)
+			mod_timer(&p->hold_timer,
+				  round_jiffies(jiffies + BR_HOLD_TIME));
 		spin_lock_bh(&br->lock);
 		br_port_state_selection(br);
 		spin_unlock_bh(&br->lock);
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 7caf7fae2d5b..5f0f5af0ec35 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -40,7 +40,9 @@ static void br_hello_timer_expired(unsigned long arg)
 	if (br->dev->flags & IFF_UP) {
 		br_config_bpdu_generation(br);
 
-		mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time));
+		if (br->stp_enabled != BR_USER_STP)
+			mod_timer(&br->hello_timer,
+				  round_jiffies(jiffies + br->hello_time));
 	}
 	spin_unlock(&br->lock);
 }
-- 
cgit v1.2.3


From 865b804244f228e80fb62abe464296399253cce8 Mon Sep 17 00:00:00 2001
From: David Ward <david.ward@ll.mit.edu>
Date: Sun, 26 Jul 2015 12:18:58 -0400
Subject: net/ipv4: suppress NETDEV_UP notification on address lifetime update

This notification causes the FIB to be updated, which is not needed
because the address already exists, and more importantly it may undo
intentional changes that were made to the FIB after the address was
originally added. (As a point of comparison, when an address becomes
deprecated because its preferred lifetime expired, a notification on
this chain is not generated.)

The motivation for this commit is fixing an incompatibility between
DHCP clients which set and update the address lifetime according to
the lease, and a commercial VPN client which replaces kernel routes
in a way that outbound traffic is sent only through the tunnel (and
disconnects if any further route changes are detected via netlink).

Signed-off-by: David Ward <david.ward@ll.mit.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e813196c91c7..2d9cb1748f81 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -882,7 +882,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
 		queue_delayed_work(system_power_efficient_wq,
 				&check_lifetime_work, 0);
 		rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
-		blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 11c91ef98f37cd743098de26160fffd7f9bd40e1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 27 Jul 2015 11:33:50 +0200
Subject: arp: filter NOARP neighbours for SIOCGARP

When arp is off on a device, and ioctl(SIOCGARP) is queried,
a buggy answer is given with MAC address of the device, instead
of the mac address of the destination/gateway.

We filter out NUD_NOARP neighbours for /proc/net/arp,
we must do the same for SIOCGARP ioctl.

Tested:

lpaa23:~# ./arp 10.246.7.190
MAC=00:01:e8:22:cb:1d      // correct answer

lpaa23:~# ip link set dev eth0 arp off
lpaa23:~# cat /proc/net/arp   # check arp table is now 'empty'
IP address       HW type     Flags       HW address    Mask     Device
lpaa23:~# ./arp 10.246.7.190
MAC=00:1a:11:c3:0d:7f   // buggy answer before patch (this is eth0 mac)

After patch :

lpaa23:~# ip link set dev eth0 arp off
lpaa23:~# ./arp 10.246.7.190
ioctl(SIOCGARP) failed: No such device or address

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Vytautas Valancius <valas@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 933a92820d26..6c8b1fbafce8 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1017,14 +1017,16 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
 
 	neigh = neigh_lookup(&arp_tbl, &ip, dev);
 	if (neigh) {
-		read_lock_bh(&neigh->lock);
-		memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
-		r->arp_flags = arp_state_to_flags(neigh);
-		read_unlock_bh(&neigh->lock);
-		r->arp_ha.sa_family = dev->type;
-		strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+		if (!(neigh->nud_state & NUD_NOARP)) {
+			read_lock_bh(&neigh->lock);
+			memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+			r->arp_flags = arp_state_to_flags(neigh);
+			read_unlock_bh(&neigh->lock);
+			r->arp_ha.sa_family = dev->type;
+			strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+			err = 0;
+		}
 		neigh_release(neigh);
-		err = 0;
 	}
 	return err;
 }
-- 
cgit v1.2.3


From dbd46ab412b8fb395f2b0ff6f6a7eec9df311550 Mon Sep 17 00:00:00 2001
From: Alexander Drozdov <al.drozdov@gmail.com>
Date: Tue, 28 Jul 2015 13:57:01 +0300
Subject: packet: tpacket_snd(): fix signed/unsigned comparison

tpacket_fill_skb() can return a negative value (-errno) which
is stored in tp_len variable. In that case the following
condition will be (but shouldn't be) true:

tp_len > dev->mtu + dev->hard_header_len

as dev->mtu and dev->hard_header_len are both unsigned.

That may lead to just returning an incorrect EMSGSIZE errno
to the user.

Fixes: 52f1454f629fa ("packet: allow to transmit +4 byte in TX_RING slot for VLAN case")
Signed-off-by: Alexander Drozdov <al.drozdov@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c7c42eb617ef..ed458b315ef4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2403,7 +2403,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		}
 		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
 					  addr, hlen);
-		if (tp_len > dev->mtu + dev->hard_header_len) {
+		if (likely(tp_len >= 0) &&
+		    tp_len > dev->mtu + dev->hard_header_len) {
 			struct ethhdr *ehdr;
 			/* Earlier code assumed this would be a VLAN pkt,
 			 * double-check this now that we have the actual
-- 
cgit v1.2.3


From df356d5e81b04dd51dd9f23f2bce7d73dd929899 Mon Sep 17 00:00:00 2001
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Date: Tue, 28 Jul 2015 19:05:37 +0900
Subject: bridge: Fix network header pointer for vlan tagged packets

There are several devices that can receive vlan tagged packets with
CHECKSUM_PARTIAL like tap, possibly veth and xennet.
When (multiple) vlan tagged packets with CHECKSUM_PARTIAL are forwarded
by bridge to a device with the IP_CSUM feature, they end up with checksum
error because before entering bridge, the network header is set to
ETH_HLEN (not including vlan header length) in __netif_receive_skb_core(),
get_rps_cpu(), or drivers' rx functions, and nobody fixes the pointer later.

Since the network header is exepected to be ETH_HLEN in flow-dissection
and hash-calculation in RPS in rx path, and since the header pointer fix
is needed only in tx path, set the appropriate network header on forwarding
packets.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_forward.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 0ff6e1bbca91..fa7bfced888e 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -37,15 +37,30 @@ static inline int should_deliver(const struct net_bridge_port *p,
 
 int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb)
 {
-	if (!is_skb_forwardable(skb->dev, skb)) {
-		kfree_skb(skb);
-	} else {
-		skb_push(skb, ETH_HLEN);
-		br_drop_fake_rtable(skb);
-		skb_sender_cpu_clear(skb);
-		dev_queue_xmit(skb);
+	if (!is_skb_forwardable(skb->dev, skb))
+		goto drop;
+
+	skb_push(skb, ETH_HLEN);
+	br_drop_fake_rtable(skb);
+	skb_sender_cpu_clear(skb);
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    (skb->protocol == htons(ETH_P_8021Q) ||
+	     skb->protocol == htons(ETH_P_8021AD))) {
+		int depth;
+
+		if (!__vlan_get_protocol(skb, skb->protocol, &depth))
+			goto drop;
+
+		skb_set_network_header(skb, depth);
 	}
 
+	dev_queue_xmit(skb);
+
+	return 0;
+
+drop:
+	kfree_skb(skb);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
-- 
cgit v1.2.3


From 544586f742b43c6fd5fcb74c794d33b2ef189e64 Mon Sep 17 00:00:00 2001
From: Satish Ashok <sashok@cumulusnetworks.com>
Date: Tue, 28 Jul 2015 03:28:27 -0700
Subject: bridge: mcast: give fast leave precedence over multicast router and
 querier

When fast leave is configured on a bridge port and an IGMP leave is
received for a group, the group is not deleted immediately if there is
a router detected or if multicast querier is configured.
Ideally the group should be deleted immediately when fast leave is
configured.

Signed-off-by: Satish Ashok <sashok@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 50 ++++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 79db489cdade..0b39dcc65b94 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1416,8 +1416,7 @@ br_multicast_leave_group(struct net_bridge *br,
 
 	spin_lock(&br->multicast_lock);
 	if (!netif_running(br->dev) ||
-	    (port && port->state == BR_STATE_DISABLED) ||
-	    timer_pending(&other_query->timer))
+	    (port && port->state == BR_STATE_DISABLED))
 		goto out;
 
 	mdb = mlock_dereference(br->mdb, br);
@@ -1425,6 +1424,31 @@ br_multicast_leave_group(struct net_bridge *br,
 	if (!mp)
 		goto out;
 
+	if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) {
+		struct net_bridge_port_group __rcu **pp;
+
+		for (pp = &mp->ports;
+		     (p = mlock_dereference(*pp, br)) != NULL;
+		     pp = &p->next) {
+			if (p->port != port)
+				continue;
+
+			rcu_assign_pointer(*pp, p->next);
+			hlist_del_init(&p->mglist);
+			del_timer(&p->timer);
+			call_rcu_bh(&p->rcu, br_multicast_free_pg);
+			br_mdb_notify(br->dev, port, group, RTM_DELMDB);
+
+			if (!mp->ports && !mp->mglist &&
+			    netif_running(br->dev))
+				mod_timer(&mp->timer, jiffies);
+		}
+		goto out;
+	}
+
+	if (timer_pending(&other_query->timer))
+		goto out;
+
 	if (br->multicast_querier) {
 		__br_multicast_send_query(br, port, &mp->addr);
 
@@ -1450,28 +1474,6 @@ br_multicast_leave_group(struct net_bridge *br,
 		}
 	}
 
-	if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) {
-		struct net_bridge_port_group __rcu **pp;
-
-		for (pp = &mp->ports;
-		     (p = mlock_dereference(*pp, br)) != NULL;
-		     pp = &p->next) {
-			if (p->port != port)
-				continue;
-
-			rcu_assign_pointer(*pp, p->next);
-			hlist_del_init(&p->mglist);
-			del_timer(&p->timer);
-			call_rcu_bh(&p->rcu, br_multicast_free_pg);
-			br_mdb_notify(br->dev, port, group, RTM_DELMDB);
-
-			if (!mp->ports && !mp->mglist &&
-			    netif_running(br->dev))
-				mod_timer(&mp->timer, jiffies);
-		}
-		goto out;
-	}
-
 	now = jiffies;
 	time = now + br->multicast_last_member_count *
 		     br->multicast_last_member_interval;
-- 
cgit v1.2.3


From 7ae90a4f96486e3e20274afa1b8329802f5e1981 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 28 Jul 2015 13:10:44 +0200
Subject: bridge: mdb: fix delmdb state in the notification

Since mdb states were introduced when deleting an entry the state was
left as it was set in the delete request from the user which leads to
the following output when doing a monitor (for example):
$ bridge mdb add dev br0 port eth3 grp 239.0.0.1 permanent
(monitor) dev br0 port eth3 grp 239.0.0.1 permanent
$ bridge mdb del dev br0 port eth3 grp 239.0.0.1 permanent
(monitor) dev br0 port eth3 grp 239.0.0.1 temp
^^^
Note the "temp" state in the delete notification which is wrong since
the entry was permanent, the state in a delete is always reported as
"temp" regardless of the real state of the entry.

After this patch:
$ bridge mdb add dev br0 port eth3 grp 239.0.0.1 permanent
(monitor) dev br0 port eth3 grp 239.0.0.1 permanent
$ bridge mdb del dev br0 port eth3 grp 239.0.0.1 permanent
(monitor) dev br0 port eth3 grp 239.0.0.1 permanent

There's one important note to make here that the state is actually not
matched when doing a delete, so one can delete a permanent entry by
stating "temp" in the end of the command, I've chosen this fix in order
not to break user-space tools which rely on this (incorrect) behaviour.

So to give an example after this patch and using the wrong state:
$ bridge mdb add dev br0 port eth3 grp 239.0.0.1 permanent
(monitor) dev br0 port eth3 grp 239.0.0.1 permanent
$ bridge mdb del dev br0 port eth3 grp 239.0.0.1 temp
(monitor) dev br0 port eth3 grp 239.0.0.1 permanent

Note the state of the entry that got deleted is correct in the
notification.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Fixes: ccb1c31a7a87 ("bridge: add flags to distinguish permanent mdb entires")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mdb.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 1198a3dbad95..c94321955db7 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -445,6 +445,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
 		if (p->port->state == BR_STATE_DISABLED)
 			goto unlock;
 
+		entry->state = p->state;
 		rcu_assign_pointer(*pp, p->next);
 		hlist_del_init(&p->mglist);
 		del_timer(&p->timer);
-- 
cgit v1.2.3


From c8507fb235bea3314a02a67ddda0d4e6cf01fa78 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 29 Jul 2015 12:01:41 +0200
Subject: ipv6: flush nd cache on IFF_NOARP change

This patch is the IPv6 equivalent of commit
6c8b4e3ff81b ("arp: flush arp cache on IFF_NOARP change")

Without it, we keep buggy neighbours in the cache, with destination
MAC address equal to our own MAC address.

Tested:
 tcpdump -i eth0 -s 0 ip6 -n -e &
 ip link set dev eth0 arp off
 ping6 remote   // sends buggy frames
 ip link set dev eth0 arp on
 ping6 remote   // should work once kernel is patched

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Mario Fanelli <mariofanelli@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0a05b35a90fc..c53331cfed95 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1650,6 +1650,7 @@ int ndisc_rcv(struct sk_buff *skb)
 static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_change_info *change_info;
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
 
@@ -1664,6 +1665,11 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
 			ndisc_send_unsol_na(dev);
 		in6_dev_put(idev);
 		break;
+	case NETDEV_CHANGE:
+		change_info = ptr;
+		if (change_info->flags_changed & IFF_NOARP)
+			neigh_changeaddr(&nd_tbl, dev);
+		break;
 	case NETDEV_DOWN:
 		neigh_ifdown(&nd_tbl, dev);
 		fib6_run_gc(0, net, false);
-- 
cgit v1.2.3


From f4eaed28c7834fc049c754f63e6988bbd73778d9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 29 Jul 2015 18:40:56 +0200
Subject: act_bpf: fix memory leaks when replacing bpf programs

We currently trigger multiple memory leaks when replacing bpf
actions, besides others:

  comm "tc", pid 1909, jiffies 4294851310 (age 1602.796s)
  hex dump (first 32 bytes):
    01 00 00 00 03 00 00 00 00 00 00 00 00 00 00 00  ................
    18 b0 98 6d 00 88 ff ff 00 00 00 00 00 00 00 00  ...m............
  backtrace:
    [<ffffffff817e623e>] kmemleak_alloc+0x4e/0xb0
    [<ffffffff8120a22d>] __vmalloc_node_range+0x1bd/0x2c0
    [<ffffffff8120a37a>] __vmalloc+0x4a/0x50
    [<ffffffff811a8d0a>] bpf_prog_alloc+0x3a/0xa0
    [<ffffffff816c0684>] bpf_prog_create+0x44/0xa0
    [<ffffffffa09ba4eb>] tcf_bpf_init+0x28b/0x3c0 [act_bpf]
    [<ffffffff816d7001>] tcf_action_init_1+0x191/0x1b0
    [<ffffffff816d70a2>] tcf_action_init+0x82/0xf0
    [<ffffffff816d4d12>] tcf_exts_validate+0xb2/0xc0
    [<ffffffffa09b5838>] cls_bpf_modify_existing+0x98/0x340 [cls_bpf]
    [<ffffffffa09b5cd6>] cls_bpf_change+0x1a6/0x274 [cls_bpf]
    [<ffffffff816d56e5>] tc_ctl_tfilter+0x335/0x910
    [<ffffffff816b9145>] rtnetlink_rcv_msg+0x95/0x240
    [<ffffffff816df34f>] netlink_rcv_skb+0xaf/0xc0
    [<ffffffff816b909e>] rtnetlink_rcv+0x2e/0x40
    [<ffffffff816deaaf>] netlink_unicast+0xef/0x1b0

Issue is that the old content from tcf_bpf is allocated and needs
to be released when we replace it. We seem to do that since the
beginning of act_bpf on the filter and insns, later on the name as
well.

Example test case, after patch:

  # FOO="1,6 0 0 4294967295,"
  # BAR="1,6 0 0 4294967294,"
  # tc actions add action bpf bytecode "$FOO" index 2
  # tc actions show action bpf
   action order 0: bpf bytecode '1,6 0 0 4294967295' default-action pipe
   index 2 ref 1 bind 0
  # tc actions replace action bpf bytecode "$BAR" index 2
  # tc actions show action bpf
   action order 0: bpf bytecode '1,6 0 0 4294967294' default-action pipe
   index 2 ref 1 bind 0
  # tc actions replace action bpf bytecode "$FOO" index 2
  # tc actions show action bpf
   action order 0: bpf bytecode '1,6 0 0 4294967295' default-action pipe
   index 2 ref 1 bind 0
  # tc actions del action bpf index 2
  [...]
  # echo "scan" > /sys/kernel/debug/kmemleak
  # cat /sys/kernel/debug/kmemleak | grep "comm \"tc\"" | wc -l
  0

Fixes: d23b8ad8ab23 ("tc: add BPF based action")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_bpf.c | 53 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 1df78289e248..d0edeb7a1950 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -27,9 +27,10 @@
 struct tcf_bpf_cfg {
 	struct bpf_prog *filter;
 	struct sock_filter *bpf_ops;
-	char *bpf_name;
+	const char *bpf_name;
 	u32 bpf_fd;
 	u16 bpf_num_ops;
+	bool is_ebpf;
 };
 
 static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
@@ -207,6 +208,7 @@ static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
 	cfg->bpf_ops = bpf_ops;
 	cfg->bpf_num_ops = bpf_num_ops;
 	cfg->filter = fp;
+	cfg->is_ebpf = false;
 
 	return 0;
 }
@@ -241,18 +243,40 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
 	cfg->bpf_fd = bpf_fd;
 	cfg->bpf_name = name;
 	cfg->filter = fp;
+	cfg->is_ebpf = true;
 
 	return 0;
 }
 
+static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg)
+{
+	if (cfg->is_ebpf)
+		bpf_prog_put(cfg->filter);
+	else
+		bpf_prog_destroy(cfg->filter);
+
+	kfree(cfg->bpf_ops);
+	kfree(cfg->bpf_name);
+}
+
+static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
+				  struct tcf_bpf_cfg *cfg)
+{
+	cfg->is_ebpf = tcf_bpf_is_ebpf(prog);
+	cfg->filter = prog->filter;
+
+	cfg->bpf_ops = prog->bpf_ops;
+	cfg->bpf_name = prog->bpf_name;
+}
+
 static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action *act,
 			int replace, int bind)
 {
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
+	struct tcf_bpf_cfg cfg, old;
 	struct tc_act_bpf *parm;
 	struct tcf_bpf *prog;
-	struct tcf_bpf_cfg cfg;
 	bool is_bpf, is_ebpf;
 	int ret;
 
@@ -301,6 +325,9 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 	prog = to_bpf(act);
 	spin_lock_bh(&prog->tcf_lock);
 
+	if (ret != ACT_P_CREATED)
+		tcf_bpf_prog_fill_cfg(prog, &old);
+
 	prog->bpf_ops = cfg.bpf_ops;
 	prog->bpf_name = cfg.bpf_name;
 
@@ -316,32 +343,22 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 
 	if (ret == ACT_P_CREATED)
 		tcf_hash_insert(act);
+	else
+		tcf_bpf_cfg_cleanup(&old);
 
 	return ret;
 
 destroy_fp:
-	if (is_ebpf)
-		bpf_prog_put(cfg.filter);
-	else
-		bpf_prog_destroy(cfg.filter);
-
-	kfree(cfg.bpf_ops);
-	kfree(cfg.bpf_name);
-
+	tcf_bpf_cfg_cleanup(&cfg);
 	return ret;
 }
 
 static void tcf_bpf_cleanup(struct tc_action *act, int bind)
 {
-	const struct tcf_bpf *prog = act->priv;
-
-	if (tcf_bpf_is_ebpf(prog))
-		bpf_prog_put(prog->filter);
-	else
-		bpf_prog_destroy(prog->filter);
+	struct tcf_bpf_cfg tmp;
 
-	kfree(prog->bpf_ops);
-	kfree(prog->bpf_name);
+	tcf_bpf_prog_fill_cfg(act->priv, &tmp);
+	tcf_bpf_cfg_cleanup(&tmp);
 }
 
 static struct tc_action_ops act_bpf_ops __read_mostly = {
-- 
cgit v1.2.3


From 28e6b67f0b292f557468c139085303b15f1a678f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 29 Jul 2015 23:35:25 +0200
Subject: net: sched: fix refcount imbalance in actions

Since commit 55334a5db5cd ("net_sched: act: refuse to remove bound action
outside"), we end up with a wrong reference count for a tc action.

Test case 1:

  FOO="1,6 0 0 4294967295,"
  BAR="1,6 0 0 4294967294,"
  tc filter add dev foo parent 1: bpf bytecode "$FOO" flowid 1:1 \
     action bpf bytecode "$FOO"
  tc actions show action bpf
    action order 0: bpf bytecode '1,6 0 0 4294967295' default-action pipe
    index 1 ref 1 bind 1
  tc actions replace action bpf bytecode "$BAR" index 1
  tc actions show action bpf
    action order 0: bpf bytecode '1,6 0 0 4294967294' default-action pipe
    index 1 ref 2 bind 1
  tc actions replace action bpf bytecode "$FOO" index 1
  tc actions show action bpf
    action order 0: bpf bytecode '1,6 0 0 4294967295' default-action pipe
    index 1 ref 3 bind 1

Test case 2:

  FOO="1,6 0 0 4294967295,"
  tc filter add dev foo parent 1: bpf bytecode "$FOO" flowid 1:1 action ok
  tc actions show action gact
    action order 0: gact action pass
    random type none pass val 0
     index 1 ref 1 bind 1
  tc actions add action drop index 1
    RTNETLINK answers: File exists [...]
  tc actions show action gact
    action order 0: gact action pass
     random type none pass val 0
     index 1 ref 2 bind 1
  tc actions add action drop index 1
    RTNETLINK answers: File exists [...]
  tc actions show action gact
    action order 0: gact action pass
     random type none pass val 0
     index 1 ref 3 bind 1

What happens is that in tcf_hash_check(), we check tcf_common for a given
index and increase tcfc_refcnt and conditionally tcfc_bindcnt when we've
found an existing action. Now there are the following cases:

  1) We do a late binding of an action. In that case, we leave the
     tcfc_refcnt/tcfc_bindcnt increased and are done with the ->init()
     handler. This is correctly handeled.

  2) We replace the given action, or we try to add one without replacing
     and find out that the action at a specific index already exists
     (thus, we go out with error in that case).

In case of 2), we have to undo the reference count increase from
tcf_hash_check() in the tcf_hash_check() function. Currently, we fail to
do so because of the 'tcfc_bindcnt > 0' check which bails out early with
an -EPERM error.

Now, while commit 55334a5db5cd prevents 'tc actions del action ...' on an
already classifier-bound action to drop the reference count (which could
then become negative, wrap around etc), this restriction only accounts for
invocations outside a specific action's ->init() handler.

One possible solution would be to add a flag thus we possibly trigger
the -EPERM ony in situations where it is indeed relevant.

After the patch, above test cases have correct reference count again.

Fixes: 55334a5db5cd ("net_sched: act: refuse to remove bound action outside")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h |  8 +++++++-
 net/sched/act_api.c   | 11 ++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 3ee4c92afd1b..931738bc5bba 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -99,7 +99,6 @@ struct tc_action_ops {
 
 int tcf_hash_search(struct tc_action *a, u32 index);
 void tcf_hash_destroy(struct tc_action *a);
-int tcf_hash_release(struct tc_action *a, int bind);
 u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo);
 int tcf_hash_check(u32 index, struct tc_action *a, int bind);
 int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
@@ -107,6 +106,13 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
 void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est);
 void tcf_hash_insert(struct tc_action *a);
 
+int __tcf_hash_release(struct tc_action *a, bool bind, bool strict);
+
+static inline int tcf_hash_release(struct tc_action *a, bool bind)
+{
+	return __tcf_hash_release(a, bind, false);
+}
+
 int tcf_register_action(struct tc_action_ops *a, unsigned int mask);
 int tcf_unregister_action(struct tc_action_ops *a);
 int tcf_action_destroy(struct list_head *actions, int bind);
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index af427a3dbcba..43ec92680ae8 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -45,7 +45,7 @@ void tcf_hash_destroy(struct tc_action *a)
 }
 EXPORT_SYMBOL(tcf_hash_destroy);
 
-int tcf_hash_release(struct tc_action *a, int bind)
+int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
 {
 	struct tcf_common *p = a->priv;
 	int ret = 0;
@@ -53,7 +53,7 @@ int tcf_hash_release(struct tc_action *a, int bind)
 	if (p) {
 		if (bind)
 			p->tcfc_bindcnt--;
-		else if (p->tcfc_bindcnt > 0)
+		else if (strict && p->tcfc_bindcnt > 0)
 			return -EPERM;
 
 		p->tcfc_refcnt--;
@@ -64,9 +64,10 @@ int tcf_hash_release(struct tc_action *a, int bind)
 			ret = 1;
 		}
 	}
+
 	return ret;
 }
-EXPORT_SYMBOL(tcf_hash_release);
+EXPORT_SYMBOL(__tcf_hash_release);
 
 static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
 			   struct tc_action *a)
@@ -136,7 +137,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
 		head = &hinfo->htab[tcf_hash(i, hinfo->hmask)];
 		hlist_for_each_entry_safe(p, n, head, tcfc_head) {
 			a->priv = p;
-			ret = tcf_hash_release(a, 0);
+			ret = __tcf_hash_release(a, false, true);
 			if (ret == ACT_P_DELETED) {
 				module_put(a->ops->owner);
 				n_i++;
@@ -408,7 +409,7 @@ int tcf_action_destroy(struct list_head *actions, int bind)
 	int ret = 0;
 
 	list_for_each_entry_safe(a, tmp, actions, list) {
-		ret = tcf_hash_release(a, bind);
+		ret = __tcf_hash_release(a, bind, true);
 		if (ret == ACT_P_DELETED)
 			module_put(a->ops->owner);
 		else if (ret < 0)
-- 
cgit v1.2.3


From 8a68173691f036613e3d4e6bf8dc129d4a7bf383 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 30 Jul 2015 15:50:36 +0200
Subject: net: sk_clone_lock() should only do get_net() if the parent is not a
 kernel socket

The newsk returned by sk_clone_lock should hold a get_net()
reference if, and only if, the parent is not a kernel socket
(making this similar to sk_alloc()).

E.g,. for the SYN_RECV path, tcp_v4_syn_recv_sock->..inet_csk_clone_lock
sets up the syn_recv newsk from sk_clone_lock. When the parent (listen)
socket is a kernel socket (defined in sk_alloc() as having
sk_net_refcnt == 0), then the newsk should also have a 0 sk_net_refcnt
and should not hold a get_net() reference.

Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the
      netns of kernel sockets.")
Acked-by: Eric Dumazet <edumazet@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 8a14f1285fc4..193901d09757 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		sock_copy(newsk, sk);
 
 		/* SANITY */
-		get_net(sock_net(newsk));
+		if (likely(newsk->sk_net_refcnt))
+			get_net(sock_net(newsk));
 		sk_node_init(&newsk->sk_node);
 		sock_lock_init(newsk);
 		bh_lock_sock(newsk);
-- 
cgit v1.2.3


From 5175f7106cc55a1bcf97bf7d5ba0900017ebcef8 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 30 Jul 2015 17:12:21 -0700
Subject: act_pedit: check binding before calling tcf_hash_release()

When we share an action within a filter, the bind refcnt
should increase, therefore we should not call tcf_hash_release().

Fixes: 1a29321ed045 ("net_sched: act: Dont increment refcnt on replace")
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_pedit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 17e6d6669c7f..ff8b466a73f6 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -68,13 +68,12 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		}
 		ret = ACT_P_CREATED;
 	} else {
-		p = to_pedit(a);
-		tcf_hash_release(a, bind);
 		if (bind)
 			return 0;
+		tcf_hash_release(a, bind);
 		if (!ovr)
 			return -EEXIST;
-
+		p = to_pedit(a);
 		if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
 			keys = kmalloc(ksize, GFP_KERNEL);
 			if (keys == NULL)
-- 
cgit v1.2.3