From 719f8bcc883e7992615f4d5625922e24995e2d98 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 13 Aug 2012 17:03:00 -0400
Subject: svcrpc: fix xpt_list traversal locking on shutdown

Server threads are not running at this point, but svc_age_temp_xprts
still may be, so we need this locking.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svc_xprt.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index bac973a31367..e1810b947dea 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -917,16 +917,18 @@ void svc_close_xprt(struct svc_xprt *xprt)
 }
 EXPORT_SYMBOL_GPL(svc_close_xprt);
 
-static void svc_close_list(struct list_head *xprt_list, struct net *net)
+static void svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net)
 {
 	struct svc_xprt *xprt;
 
+	spin_lock(&serv->sv_lock);
 	list_for_each_entry(xprt, xprt_list, xpt_list) {
 		if (xprt->xpt_net != net)
 			continue;
 		set_bit(XPT_CLOSE, &xprt->xpt_flags);
 		set_bit(XPT_BUSY, &xprt->xpt_flags);
 	}
+	spin_unlock(&serv->sv_lock);
 }
 
 static void svc_clear_pools(struct svc_serv *serv, struct net *net)
@@ -949,24 +951,28 @@ static void svc_clear_pools(struct svc_serv *serv, struct net *net)
 	}
 }
 
-static void svc_clear_list(struct list_head *xprt_list, struct net *net)
+static void svc_clear_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net)
 {
 	struct svc_xprt *xprt;
 	struct svc_xprt *tmp;
+	LIST_HEAD(victims);
 
+	spin_lock(&serv->sv_lock);
 	list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
 		if (xprt->xpt_net != net)
 			continue;
-		svc_delete_xprt(xprt);
+		list_move(&xprt->xpt_list, &victims);
 	}
-	list_for_each_entry(xprt, xprt_list, xpt_list)
-		BUG_ON(xprt->xpt_net == net);
+	spin_unlock(&serv->sv_lock);
+
+	list_for_each_entry_safe(xprt, tmp, &victims, xpt_list)
+		svc_delete_xprt(xprt);
 }
 
 void svc_close_net(struct svc_serv *serv, struct net *net)
 {
-	svc_close_list(&serv->sv_tempsocks, net);
-	svc_close_list(&serv->sv_permsocks, net);
+	svc_close_list(serv, &serv->sv_tempsocks, net);
+	svc_close_list(serv, &serv->sv_permsocks, net);
 
 	svc_clear_pools(serv, net);
 	/*
@@ -974,8 +980,8 @@ void svc_close_net(struct svc_serv *serv, struct net *net)
 	 * svc_xprt_enqueue will not add new entries without taking the
 	 * sp_lock and checking XPT_BUSY.
 	 */
-	svc_clear_list(&serv->sv_tempsocks, net);
-	svc_clear_list(&serv->sv_permsocks, net);
+	svc_clear_list(serv, &serv->sv_tempsocks, net);
+	svc_clear_list(serv, &serv->sv_permsocks, net);
 }
 
 /*
-- 
cgit v1.2.3


From 72c3537607e42928f13691d59579ec840014b19e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 13 Aug 2012 17:46:17 -0400
Subject: svcrpc: standardize svc_setup_socket return convention

Use the kernel-standard ptr-or-error return convention instead of
passing a pointer to the error.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 998aa8c1807c..d028b51a69ad 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -59,7 +59,7 @@
 
 
 static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
-					 int *errp, int flags);
+					 int flags);
 static void		svc_udp_data_ready(struct sock *, int);
 static int		svc_udp_recvfrom(struct svc_rqst *);
 static int		svc_udp_sendto(struct svc_rqst *);
@@ -900,8 +900,9 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
 	 */
 	newsock->sk->sk_sndtimeo = HZ*30;
 
-	if (!(newsvsk = svc_setup_socket(serv, newsock, &err,
-				 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY))))
+	newsvsk = svc_setup_socket(serv, newsock,
+				 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY));
+	if (IS_ERR(newsvsk))
 		goto failed;
 	svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
 	err = kernel_getsockname(newsock, sin, &slen);
@@ -1383,29 +1384,29 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
  */
 static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
 						struct socket *sock,
-						int *errp, int flags)
+						int flags)
 {
 	struct svc_sock	*svsk;
 	struct sock	*inet;
 	int		pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
+	int		err = 0;
 
 	dprintk("svc: svc_setup_socket %p\n", sock);
-	if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
-		*errp = -ENOMEM;
-		return NULL;
-	}
+	svsk = kzalloc(sizeof(*svsk), GFP_KERNEL);
+	if (!svsk)
+		return ERR_PTR(-ENOMEM);
 
 	inet = sock->sk;
 
 	/* Register socket with portmapper */
-	if (*errp >= 0 && pmap_register)
-		*errp = svc_register(serv, sock_net(sock->sk), inet->sk_family,
+	if (pmap_register)
+		err = svc_register(serv, sock_net(sock->sk), inet->sk_family,
 				     inet->sk_protocol,
 				     ntohs(inet_sk(inet)->inet_sport));
 
-	if (*errp < 0) {
+	if (err < 0) {
 		kfree(svsk);
-		return NULL;
+		return ERR_PTR(err);
 	}
 
 	inet->sk_user_data = svsk;
@@ -1463,10 +1464,12 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
 	else {
 		if (!try_module_get(THIS_MODULE))
 			err = -ENOENT;
-		else
-			svsk = svc_setup_socket(serv, so, &err,
-						SVC_SOCK_DEFAULTS);
-		if (svsk) {
+		else {
+			svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS);
+			if (IS_ERR(svsk))
+				err = PTR_ERR(svsk);
+		}
+		if (err == 0) {
 			struct sockaddr_storage addr;
 			struct sockaddr *sin = (struct sockaddr *)&addr;
 			int salen;
@@ -1563,11 +1566,12 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 			goto bummer;
 	}
 
-	if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
+	svsk = svc_setup_socket(serv, sock, flags);
+	if (!IS_ERR(svsk)) {
 		svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
 		return (struct svc_xprt *)svsk;
 	}
-
+	error = PTR_ERR(svsk);
 bummer:
 	dprintk("svc: svc_create_socket error = %d\n", -error);
 	sock_release(sock);
-- 
cgit v1.2.3


From a8e10078a87c8a2c3c8d0f9856c0f74272fc0f74 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 13 Aug 2012 18:01:03 -0400
Subject: svcrpc: clean up control flow

Mainly, use the kernel standard

	err = -ERROR;
	if (something_bad)
		goto out;
	normal case;

rather than

	if (something_bad)
		err = -ERROR;
	else {
		normal case;
	}

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 69 ++++++++++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d028b51a69ad..bf10b723f429 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1451,44 +1451,42 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
 	int err = 0;
 	struct socket *so = sockfd_lookup(fd, &err);
 	struct svc_sock *svsk = NULL;
+	struct sockaddr_storage addr;
+	struct sockaddr *sin = (struct sockaddr *)&addr;
+	int salen;
 
 	if (!so)
 		return err;
+	err = -EAFNOSUPPORT;
 	if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6))
-		err =  -EAFNOSUPPORT;
-	else if (so->sk->sk_protocol != IPPROTO_TCP &&
+		goto out;
+	err =  -EPROTONOSUPPORT;
+	if (so->sk->sk_protocol != IPPROTO_TCP &&
 	    so->sk->sk_protocol != IPPROTO_UDP)
-		err =  -EPROTONOSUPPORT;
-	else if (so->state > SS_UNCONNECTED)
-		err = -EISCONN;
-	else {
-		if (!try_module_get(THIS_MODULE))
-			err = -ENOENT;
-		else {
-			svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS);
-			if (IS_ERR(svsk))
-				err = PTR_ERR(svsk);
-		}
-		if (err == 0) {
-			struct sockaddr_storage addr;
-			struct sockaddr *sin = (struct sockaddr *)&addr;
-			int salen;
-			if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
-				svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
-			clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
-			spin_lock_bh(&serv->sv_lock);
-			list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
-			spin_unlock_bh(&serv->sv_lock);
-			svc_xprt_received(&svsk->sk_xprt);
-			err = 0;
-		} else
-			module_put(THIS_MODULE);
-	}
-	if (err) {
-		sockfd_put(so);
-		return err;
+		goto out;
+	err = -EISCONN;
+	if (so->state > SS_UNCONNECTED)
+		goto out;
+	err = -ENOENT;
+	if (!try_module_get(THIS_MODULE))
+		goto out;
+	svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS);
+	if (IS_ERR(svsk)) {
+		module_put(THIS_MODULE);
+		err = PTR_ERR(svsk);
+		goto out;
 	}
+	if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
+		svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
+	clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
+	spin_lock_bh(&serv->sv_lock);
+	list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
+	spin_unlock_bh(&serv->sv_lock);
+	svc_xprt_received(&svsk->sk_xprt);
 	return svc_one_sock_name(svsk, name_return, len);
+out:
+	sockfd_put(so);
+	return err;
 }
 EXPORT_SYMBOL_GPL(svc_addsock);
 
@@ -1567,11 +1565,12 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 	}
 
 	svsk = svc_setup_socket(serv, sock, flags);
-	if (!IS_ERR(svsk)) {
-		svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
-		return (struct svc_xprt *)svsk;
+	if (IS_ERR(svsk)) {
+		error = PTR_ERR(svsk);
+		goto bummer;
 	}
-	error = PTR_ERR(svsk);
+	svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
+	return (struct svc_xprt *)svsk;
 bummer:
 	dprintk("svc: svc_create_socket error = %d\n", -error);
 	sock_release(sock);
-- 
cgit v1.2.3


From c3341966943284ab3618a1814cefd693ad9aa736 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 14 Aug 2012 15:27:23 -0400
Subject: svcrpc: make svc_create_xprt enqueue on clearing XPT_BUSY

Whenever we clear XPT_BUSY we should call svc_xprt_enqueue().  Without
that we may fail to notice any events (such as new connections) that
arrived while XPT_BUSY was set.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svc_xprt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e1810b947dea..4801fdac2c9d 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -238,7 +238,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
 		list_add(&newxprt->xpt_list, &serv->sv_permsocks);
 		spin_unlock_bh(&serv->sv_lock);
 		newport = svc_xprt_local_port(newxprt);
-		clear_bit(XPT_BUSY, &newxprt->xpt_flags);
+		svc_xprt_received(newxprt);
 		return newport;
 	}
  err:
-- 
cgit v1.2.3


From 39b553013719fe6495cf5e496b827b2d712e4265 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 14 Aug 2012 15:50:34 -0400
Subject: svcrpc: share some setup of listening sockets

There's some duplicate code here.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h |  1 +
 net/sunrpc/svc_xprt.c           | 16 ++++++++++------
 net/sunrpc/svcsock.c            |  6 +-----
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index b3f64b12f141..73c7a68667ea 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -124,6 +124,7 @@ struct	svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
 			struct net *net, const sa_family_t af,
 			const unsigned short port);
 int	svc_xprt_names(struct svc_serv *serv, char *buf, const int buflen);
+void	svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *xprt);
 
 static inline void svc_xprt_get(struct svc_xprt *xprt)
 {
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4801fdac2c9d..ee15663798b3 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -208,6 +208,15 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 	return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
 }
 
+void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
+{
+	clear_bit(XPT_TEMP, &new->xpt_flags);
+	spin_lock_bh(&serv->sv_lock);
+	list_add(&new->xpt_list, &serv->sv_permsocks);
+	spin_unlock_bh(&serv->sv_lock);
+	svc_xprt_received(new);
+}
+
 int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
 		    struct net *net, const int family,
 		    const unsigned short port, int flags)
@@ -232,13 +241,8 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
 			module_put(xcl->xcl_owner);
 			return PTR_ERR(newxprt);
 		}
-
-		clear_bit(XPT_TEMP, &newxprt->xpt_flags);
-		spin_lock_bh(&serv->sv_lock);
-		list_add(&newxprt->xpt_list, &serv->sv_permsocks);
-		spin_unlock_bh(&serv->sv_lock);
+		svc_add_new_perm_xprt(serv, newxprt);
 		newport = svc_xprt_local_port(newxprt);
-		svc_xprt_received(newxprt);
 		return newport;
 	}
  err:
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index bf10b723f429..c7a7b14f54ed 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1478,11 +1478,7 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
 	}
 	if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
 		svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
-	clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
-	spin_lock_bh(&serv->sv_lock);
-	list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
-	spin_unlock_bh(&serv->sv_lock);
-	svc_xprt_received(&svsk->sk_xprt);
+	svc_add_new_perm_xprt(serv, &svsk->sk_xprt);
 	return svc_one_sock_name(svsk, name_return, len);
 out:
 	sockfd_put(so);
-- 
cgit v1.2.3


From f23abfdb94fda3108441530cb4a813088d3f9176 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 17 Aug 2012 20:32:27 -0400
Subject: svcrpc: minor udp code cleanup

Order the code in a more boring way.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c7a7b14f54ed..06ae8a755349 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -620,10 +620,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 	if (!svc_udp_get_dest_address(rqstp, cmh)) {
 		net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
 				     cmh->cmsg_level, cmh->cmsg_type);
-out_free:
-		trace_kfree_skb(skb, svc_udp_recvfrom);
-		skb_free_datagram_locked(svsk->sk_sk, skb);
-		return 0;
+		goto out_free;
 	}
 	rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp));
 
@@ -662,6 +659,10 @@ out_free:
 		serv->sv_stats->netudpcnt++;
 
 	return len;
+out_free:
+	trace_kfree_skb(skb, svc_udp_recvfrom);
+	skb_free_datagram_locked(svsk->sk_sk, skb);
+	return 0;
 }
 
 static int
-- 
cgit v1.2.3


From af6d572134b012ca92c4efc8a2f1cadbe5d01064 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 21 Aug 2012 17:22:11 -0400
Subject: svcrpc: don't bother checking bad svc_addr_len result

None of the callers should see an unsupported address family (only one
of them even bothers to check for that case), so just check for the
buggy case in svc_addr_len and don't bother elsewhere.

Acked-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h | 3 +--
 net/sunrpc/svcsock.c            | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 73c7a68667ea..193dddab6511 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -167,8 +167,7 @@ static inline size_t svc_addr_len(const struct sockaddr *sa)
 	case AF_INET6:
 		return sizeof(struct sockaddr_in6);
 	}
-
-	return 0;
+	BUG();
 }
 
 static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 06ae8a755349..406688baac57 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -601,8 +601,6 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 		return -EAGAIN;
 	}
 	len = svc_addr_len(svc_addr(rqstp));
-	if (len == 0)
-		return -EAFNOSUPPORT;
 	rqstp->rq_addrlen = len;
 	if (skb->tstamp.tv64 == 0) {
 		skb->tstamp = ktime_get_real();
-- 
cgit v1.2.3


From 9f9d2ebe693a98d517257e1a39f61120b4473b96 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 17 Aug 2012 21:35:24 -0400
Subject: svcrpc: make xpo_recvfrom return only >=0

The only errors returned from xpo_recvfrom have been -EAGAIN and
-EAFNOSUPPORT.  The latter was removed by a previous patch.  That leaves
only -EAGAIN, which is treated just like 0 by the caller (svc_recv).

So, just ditch -EAGAIN and return 0 instead.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svc_xprt.c | 2 +-
 net/sunrpc/svcsock.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index ee15663798b3..3e317307e288 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -743,7 +743,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 	svc_xprt_received(xprt);
 
 	/* No data, incomplete (TCP) read, or accept() */
-	if (len == 0 || len == -EAGAIN)
+	if (len <= 0)
 		goto out;
 
 	clear_bit(XPT_OLD, &xprt->xpt_flags);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 406688baac57..7aee54c3fe46 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -598,7 +598,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 			dprintk("svc: recvfrom returned error %d\n", -err);
 			set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
 		}
-		return -EAGAIN;
+		return 0;
 	}
 	len = svc_addr_len(svc_addr(rqstp));
 	rqstp->rq_addrlen = len;
@@ -1174,13 +1174,13 @@ error:
 	if (len != -EAGAIN)
 		goto err_other;
 	dprintk("RPC: TCP recvfrom got EAGAIN\n");
-	return -EAGAIN;
+	return 0;
 err_other:
 	printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
 	       svsk->sk_xprt.xpt_server->sv_name, -len);
 	set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
 err_noclose:
-	return -EAGAIN;	/* record not complete */
+	return 0;	/* record not complete */
 }
 
 /*
-- 
cgit v1.2.3


From 6741019c829ecfa6f7a504fae1305dcf5d5cf057 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 17 Aug 2012 22:12:19 -0400
Subject: svcrpc: make svc_xprt_received static

Note this isn't used outside svc_xprt.c.

May as well move it so we don't need a declaration while we're here.

Also remove an outdated comment.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h          |  1 -
 net/sunrpc/svc_xprt.c                    | 41 ++++++++++++++++----------------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  4 ----
 3 files changed, 20 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 193dddab6511..b05963f09ebf 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -114,7 +114,6 @@ void	svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *,
 int	svc_create_xprt(struct svc_serv *, const char *, struct net *,
 			const int, const unsigned short, int);
 void	svc_xprt_enqueue(struct svc_xprt *xprt);
-void	svc_xprt_received(struct svc_xprt *);
 void	svc_xprt_put(struct svc_xprt *xprt);
 void	svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt);
 void	svc_close_xprt(struct svc_xprt *xprt);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 3e317307e288..295e6ed21ca0 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -208,6 +208,26 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
 	return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
 }
 
+/*
+ * svc_xprt_received conditionally queues the transport for processing
+ * by another thread. The caller must hold the XPT_BUSY bit and must
+ * not thereafter touch transport data.
+ *
+ * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
+ * insufficient) data.
+ */
+static void svc_xprt_received(struct svc_xprt *xprt)
+{
+	BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
+	/* As soon as we clear busy, the xprt could be closed and
+	 * 'put', so we need a reference to call svc_xprt_enqueue with:
+	 */
+	svc_xprt_get(xprt);
+	clear_bit(XPT_BUSY, &xprt->xpt_flags);
+	svc_xprt_enqueue(xprt);
+	svc_xprt_put(xprt);
+}
+
 void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
 {
 	clear_bit(XPT_TEMP, &new->xpt_flags);
@@ -398,27 +418,6 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
 	return xprt;
 }
 
-/*
- * svc_xprt_received conditionally queues the transport for processing
- * by another thread. The caller must hold the XPT_BUSY bit and must
- * not thereafter touch transport data.
- *
- * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
- * insufficient) data.
- */
-void svc_xprt_received(struct svc_xprt *xprt)
-{
-	BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
-	/* As soon as we clear busy, the xprt could be closed and
-	 * 'put', so we need a reference to call svc_xprt_enqueue with:
-	 */
-	svc_xprt_get(xprt);
-	clear_bit(XPT_BUSY, &xprt->xpt_flags);
-	svc_xprt_enqueue(xprt);
-	svc_xprt_put(xprt);
-}
-EXPORT_SYMBOL_GPL(svc_xprt_received);
-
 /**
  * svc_reserve - change the space reserved for the reply to a request.
  * @rqstp:  The request in question
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 73b428bef598..62e4f9bcc387 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -578,10 +578,6 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
 	list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
 	spin_unlock_bh(&listen_xprt->sc_lock);
 
-	/*
-	 * Can't use svc_xprt_received here because we are not on a
-	 * rqstp thread
-	*/
 	set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
 	svc_xprt_enqueue(&listen_xprt->sc_xprt);
 }
-- 
cgit v1.2.3


From 6797fa5a018ff916a071c6265fbf043644abcd29 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 18 Aug 2012 15:33:51 -0400
Subject: svcrpc: break up svc_recv

Matter of taste, I suppose, but svc_recv breaks up naturally into:

	allocate pages and setup arg
	dequeue (wait for, if necessary) next socket
	do something with that socket

And I find it easier to read when it doesn't go on for pages and pages.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svc_xprt.c | 103 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 295e6ed21ca0..6ebc9a95bbab 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -568,33 +568,12 @@ static void svc_check_conn_limits(struct svc_serv *serv)
 	}
 }
 
-/*
- * Receive the next request on any transport.  This code is carefully
- * organised not to touch any cachelines in the shared svc_serv
- * structure, only cachelines in the local svc_pool.
- */
-int svc_recv(struct svc_rqst *rqstp, long timeout)
+int svc_alloc_arg(struct svc_rqst *rqstp)
 {
-	struct svc_xprt		*xprt = NULL;
-	struct svc_serv		*serv = rqstp->rq_server;
-	struct svc_pool		*pool = rqstp->rq_pool;
-	int			len, i;
-	int			pages;
-	struct xdr_buf		*arg;
-	DECLARE_WAITQUEUE(wait, current);
-	long			time_left;
-
-	dprintk("svc: server %p waiting for data (to = %ld)\n",
-		rqstp, timeout);
-
-	if (rqstp->rq_xprt)
-		printk(KERN_ERR
-			"svc_recv: service %p, transport not NULL!\n",
-			 rqstp);
-	if (waitqueue_active(&rqstp->rq_wait))
-		printk(KERN_ERR
-			"svc_recv: service %p, wait queue active!\n",
-			 rqstp);
+	struct svc_serv *serv = rqstp->rq_server;
+	struct xdr_buf *arg;
+	int pages;
+	int i;
 
 	/* now allocate needed pages.  If we get a failure, sleep briefly */
 	pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
@@ -624,11 +603,15 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 	arg->page_len = (pages-2)*PAGE_SIZE;
 	arg->len = (pages-1)*PAGE_SIZE;
 	arg->tail[0].iov_len = 0;
+	return 0;
+}
 
-	try_to_freeze();
-	cond_resched();
-	if (signalled() || kthread_should_stop())
-		return -EINTR;
+struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
+{
+	struct svc_xprt *xprt;
+	struct svc_pool		*pool = rqstp->rq_pool;
+	DECLARE_WAITQUEUE(wait, current);
+	long			time_left;
 
 	/* Normally we will wait up to 5 seconds for any required
 	 * cache information to be provided.
@@ -666,7 +649,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 		if (kthread_should_stop()) {
 			set_current_state(TASK_RUNNING);
 			spin_unlock_bh(&pool->sp_lock);
-			return -EINTR;
+			return ERR_PTR(-EINTR);
 		}
 
 		add_wait_queue(&rqstp->rq_wait, &wait);
@@ -687,19 +670,25 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 			spin_unlock_bh(&pool->sp_lock);
 			dprintk("svc: server %p, no data yet\n", rqstp);
 			if (signalled() || kthread_should_stop())
-				return -EINTR;
+				return ERR_PTR(-EINTR);
 			else
-				return -EAGAIN;
+				return ERR_PTR(-EAGAIN);
 		}
 	}
 	spin_unlock_bh(&pool->sp_lock);
+	return xprt;
+}
+
+static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+	struct svc_serv *serv = rqstp->rq_server;
+	int len = 0;
 
-	len = 0;
 	if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
 		dprintk("svc_recv: found XPT_CLOSE\n");
 		svc_delete_xprt(xprt);
 		/* Leave XPT_BUSY set on the dead xprt: */
-		goto out;
+		return 0;
 	}
 	if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
 		struct svc_xprt *newxpt;
@@ -727,8 +716,9 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 			svc_xprt_received(newxpt);
 		}
 	} else if (xprt->xpt_ops->xpo_has_wspace(xprt)) {
+		/* XPT_DATA|XPT_DEFERRED case: */
 		dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
-			rqstp, pool->sp_id, xprt,
+			rqstp, rqstp->rq_pool->sp_id, xprt,
 			atomic_read(&xprt->xpt_ref.refcount));
 		rqstp->rq_deferred = svc_deferred_dequeue(xprt);
 		if (rqstp->rq_deferred)
@@ -739,7 +729,48 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 		rqstp->rq_reserved = serv->sv_max_mesg;
 		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
 	}
+	/* clear XPT_BUSY: */
 	svc_xprt_received(xprt);
+	return len;
+}
+
+/*
+ * Receive the next request on any transport.  This code is carefully
+ * organised not to touch any cachelines in the shared svc_serv
+ * structure, only cachelines in the local svc_pool.
+ */
+int svc_recv(struct svc_rqst *rqstp, long timeout)
+{
+	struct svc_xprt		*xprt = NULL;
+	struct svc_serv		*serv = rqstp->rq_server;
+	int			len, err;
+
+	dprintk("svc: server %p waiting for data (to = %ld)\n",
+		rqstp, timeout);
+
+	if (rqstp->rq_xprt)
+		printk(KERN_ERR
+			"svc_recv: service %p, transport not NULL!\n",
+			 rqstp);
+	if (waitqueue_active(&rqstp->rq_wait))
+		printk(KERN_ERR
+			"svc_recv: service %p, wait queue active!\n",
+			 rqstp);
+
+	err = svc_alloc_arg(rqstp);
+	if (err)
+		return err;
+
+	try_to_freeze();
+	cond_resched();
+	if (signalled() || kthread_should_stop())
+		return -EINTR;
+
+	xprt = svc_get_next_xprt(rqstp, timeout);
+	if (IS_ERR(xprt))
+		return PTR_ERR(xprt);
+
+	len = svc_handle_xprt(rqstp, xprt);
 
 	/* No data, incomplete (TCP) read, or accept() */
 	if (len <= 0)
-- 
cgit v1.2.3


From 65b2e6656bda2ad983727fcc725ac66b6d5035a7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sat, 18 Aug 2012 15:44:33 -0400
Subject: svcrpc: split up svc_handle_xprt

Move initialization of newly accepted socket into a helper.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svc_xprt.c | 47 +++++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 6ebc9a95bbab..194d865fae72 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -679,6 +679,23 @@ struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
 	return xprt;
 }
 
+void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
+{
+	spin_lock_bh(&serv->sv_lock);
+	set_bit(XPT_TEMP, &newxpt->xpt_flags);
+	list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
+	serv->sv_tmpcnt++;
+	if (serv->sv_temptimer.function == NULL) {
+		/* setup timer to age temp transports */
+		setup_timer(&serv->sv_temptimer, svc_age_temp_xprts,
+			    (unsigned long)serv);
+		mod_timer(&serv->sv_temptimer,
+			  jiffies + svc_conn_age_period * HZ);
+	}
+	spin_unlock_bh(&serv->sv_lock);
+	svc_xprt_received(newxpt);
+}
+
 static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 {
 	struct svc_serv *serv = rqstp->rq_server;
@@ -692,29 +709,15 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 	}
 	if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
 		struct svc_xprt *newxpt;
+		/*
+		 * We know this module_get will succeed because the
+		 * listener holds a reference too
+		 */
+		__module_get(xprt->xpt_class->xcl_owner);
+		svc_check_conn_limits(xprt->xpt_server);
 		newxpt = xprt->xpt_ops->xpo_accept(xprt);
-		if (newxpt) {
-			/*
-			 * We know this module_get will succeed because the
-			 * listener holds a reference too
-			 */
-			__module_get(newxpt->xpt_class->xcl_owner);
-			svc_check_conn_limits(xprt->xpt_server);
-			spin_lock_bh(&serv->sv_lock);
-			set_bit(XPT_TEMP, &newxpt->xpt_flags);
-			list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
-			serv->sv_tmpcnt++;
-			if (serv->sv_temptimer.function == NULL) {
-				/* setup timer to age temp transports */
-				setup_timer(&serv->sv_temptimer,
-					    svc_age_temp_xprts,
-					    (unsigned long)serv);
-				mod_timer(&serv->sv_temptimer,
-					  jiffies + svc_conn_age_period * HZ);
-			}
-			spin_unlock_bh(&serv->sv_lock);
-			svc_xprt_received(newxpt);
-		}
+		if (newxpt)
+			svc_add_new_temp_xprt(serv, newxpt);
 	} else if (xprt->xpt_ops->xpo_has_wspace(xprt)) {
 		/* XPT_DATA|XPT_DEFERRED case: */
 		dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
-- 
cgit v1.2.3


From 43def35c1030d91a7414936c7c1b416828b20afb Mon Sep 17 00:00:00 2001
From: Simon Derr <simon.derr@bull.net>
Date: Fri, 10 Aug 2012 15:52:06 +0200
Subject: net/9p: Check errno validity

While working on a modified server I had the Linux clients crash
a few times. This led me to find the following:

Some error codes are directly extracted from the server replies.
A malformed server reply could contain an invalid error code, with a
very large value. If this value is then passed to ERR_PTR() it will
not be properly detected as an error code by IS_ERR() and as a result
the kernel will dereference an invalid pointer.

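This patch avoids this by replacing any error code outside the valid
range (positive, or below -MAX_ERRNO) with -EPROTO before it is passed
to ERR_PTR().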

Signed-off-by: Simon Derr <simon.derr@bull.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 net/9p/client.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/9p/client.c b/net/9p/client.c
index 8260f132b32e..34d417670935 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -76,6 +76,20 @@ inline int p9_is_proto_dotu(struct p9_client *clnt)
 }
 EXPORT_SYMBOL(p9_is_proto_dotu);
 
+/*
+ * Some error codes are taken directly from the server replies,
+ * make sure they are valid.
+ */
+static int safe_errno(int err)
+{
+	if ((err > 0) || (err < -MAX_ERRNO)) {
+		p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err);
+		return -EPROTO;
+	}
+	return err;
+}
+
+
 /* Interpret mount option for protocol version */
 static int get_protocol_version(char *s)
 {
@@ -782,7 +796,7 @@ again:
 		return req;
 reterr:
 	p9_free_req(c, req);
-	return ERR_PTR(err);
+	return ERR_PTR(safe_errno(err));
 }
 
 /**
@@ -865,7 +879,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
 		return req;
 reterr:
 	p9_free_req(c, req);
-	return ERR_PTR(err);
+	return ERR_PTR(safe_errno(err));
 }
 
 static struct p9_fid *p9_fid_create(struct p9_client *clnt)
-- 
cgit v1.2.3


From eccf50c129686de11358093839749c83f6cae5db Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 15 Aug 2012 18:07:43 -0400
Subject: nfsd: remove unused listener-removal interfaces

You can use nfsd/portlist to give nfsd additional sockets to listen on.
In theory you can also remove listening sockets this way.  But nobody's
ever done that as far as I can tell.

Also this was partially broken in 2.6.25, by
a217813f9067b785241cb7f31956e51d2071703a "knfsd: Support adding
transports by writing portlist file".

(Note that we decide whether to take the "delfd" case by checking for a
digit--but what's actually expected in that case is something made by
svc_one_sock_name(), which won't begin with a digit.)

So, let's just rip out this stuff.

Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsctl.c               | 78 ------------------------------------------
 include/linux/sunrpc/svcsock.h |  3 --
 net/sunrpc/svcsock.c           | 51 ---------------------------
 3 files changed, 132 deletions(-)

(limited to 'net')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index e41a08ffbe0a..dab350dfc376 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -682,25 +682,6 @@ static ssize_t __write_ports_addfd(char *buf)
 	return err;
 }
 
-/*
- * A '-' followed by the 'name' of a socket means we close the socket.
- */
-static ssize_t __write_ports_delfd(char *buf)
-{
-	char *toclose;
-	int len = 0;
-
-	toclose = kstrdup(buf + 1, GFP_KERNEL);
-	if (toclose == NULL)
-		return -ENOMEM;
-
-	if (nfsd_serv != NULL)
-		len = svc_sock_names(nfsd_serv, buf,
-					SIMPLE_TRANSACTION_LIMIT, toclose);
-	kfree(toclose);
-	return len;
-}
-
 /*
  * A transport listener is added by writing it's transport name and
  * a port number.
@@ -746,31 +727,6 @@ out_err:
 	return err;
 }
 
-/*
- * A transport listener is removed by writing a "-", it's transport
- * name, and it's port number.
- */
-static ssize_t __write_ports_delxprt(char *buf)
-{
-	struct svc_xprt *xprt;
-	char transport[16];
-	int port;
-
-	if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
-		return -EINVAL;
-
-	if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
-		return -EINVAL;
-
-	xprt = svc_find_xprt(nfsd_serv, transport, &init_net, AF_UNSPEC, port);
-	if (xprt == NULL)
-		return -ENOTCONN;
-
-	svc_close_xprt(xprt);
-	svc_xprt_put(xprt);
-	return 0;
-}
-
 static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 {
 	if (size == 0)
@@ -779,15 +735,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 	if (isdigit(buf[0]))
 		return __write_ports_addfd(buf);
 
-	if (buf[0] == '-' && isdigit(buf[1]))
-		return __write_ports_delfd(buf);
-
 	if (isalpha(buf[0]))
 		return __write_ports_addxprt(buf);
 
-	if (buf[0] == '-' && isalpha(buf[1]))
-		return __write_ports_delxprt(buf);
-
 	return -EINVAL;
 }
 
@@ -825,21 +775,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
  * OR
  *
  * Input:
- *			buf:		C string containing a "-" followed
- *					by an integer value representing a
- *					previously passed in socket file
- *					descriptor
- *			size:		non-zero length of C string in @buf
- * Output:
- *	On success:	NFS service no longer listens on that socket;
- *			passed-in buffer filled with a '\n'-terminated C
- *			string containing a unique name of the listener;
- *			return code is the size in bytes of the string
- *	On error:	return code is a negative errno value
- *
- * OR
- *
- * Input:
  *			buf:		C string containing a transport
  *					name and an unsigned integer value
  *					representing the port to listen on,
@@ -848,19 +783,6 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
  * Output:
  *	On success:	returns zero; NFS service is started
  *	On error:	return code is a negative errno value
- *
- * OR
- *
- * Input:
- *			buf:		C string containing a "-" followed
- *					by a transport name and an unsigned
- *					integer value representing the port
- *					to listen on, separated by whitespace
- *			size:		non-zero length of C string in @buf
- * Output:
- *	On success:	returns zero; NFS service no longer listens
- *			on that transport
- *	On error:	return code is a negative errno value
  */
 static ssize_t write_ports(struct file *file, char *buf, size_t size)
 {
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index cb4ac69e1f33..92ad02f0dcc0 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -39,9 +39,6 @@ int		svc_recv(struct svc_rqst *, long);
 int		svc_send(struct svc_rqst *);
 void		svc_drop(struct svc_rqst *);
 void		svc_sock_update_bufs(struct svc_serv *serv);
-int		svc_sock_names(struct svc_serv *serv, char *buf,
-					const size_t buflen,
-					const char *toclose);
 int		svc_addsock(struct svc_serv *serv, const int fd,
 					char *name_return, const size_t len);
 void		svc_init_xprt_sock(void);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 7aee54c3fe46..03827cef1fa7 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -305,57 +305,6 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
 	return len;
 }
 
-/**
- * svc_sock_names - construct a list of listener names in a string
- * @serv: pointer to RPC service
- * @buf: pointer to a buffer to fill in with socket names
- * @buflen: size of the buffer to be filled
- * @toclose: pointer to '\0'-terminated C string containing the name
- *		of a listener to be closed
- *
- * Fills in @buf with a '\n'-separated list of names of listener
- * sockets.  If @toclose is not NULL, the socket named by @toclose
- * is closed, and is not included in the output list.
- *
- * Returns positive length of the socket name string, or a negative
- * errno value on error.
- */
-int svc_sock_names(struct svc_serv *serv, char *buf, const size_t buflen,
-		   const char *toclose)
-{
-	struct svc_sock *svsk, *closesk = NULL;
-	int len = 0;
-
-	if (!serv)
-		return 0;
-
-	spin_lock_bh(&serv->sv_lock);
-	list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
-		int onelen = svc_one_sock_name(svsk, buf + len, buflen - len);
-		if (onelen < 0) {
-			len = onelen;
-			break;
-		}
-		if (toclose && strcmp(toclose, buf + len) == 0) {
-			closesk = svsk;
-			svc_xprt_get(&closesk->sk_xprt);
-		} else
-			len += onelen;
-	}
-	spin_unlock_bh(&serv->sv_lock);
-
-	if (closesk) {
-		/* Should unregister with portmap, but you cannot
-		 * unregister just one protocol...
-		 */
-		svc_close_xprt(&closesk->sk_xprt);
-		svc_xprt_put(&closesk->sk_xprt);
-	} else if (toclose)
-		return -ENOENT;
-	return len;
-}
-EXPORT_SYMBOL_GPL(svc_sock_names);
-
 /*
  * Check input queue length
  */
-- 
cgit v1.2.3


From 0462194d358c2e040282d4d1a4fd1aab84417e42 Mon Sep 17 00:00:00 2001
From: Simon Derr <simon.derr@bull.net>
Date: Mon, 17 Sep 2012 15:16:28 +0200
Subject: 9P: Fix race in p9_read_work()

Race scenario between p9_read_work() and p9_poll_mux()

Data arrive, Rworksched is set, p9_read_work() is called.

thread A                                thread B

                                        p9_read_work()
                                                .
                                        reads data
                                                .
                                        checks if new data ready. No.
                                                .
                                        gets preempted
                                                .
More data arrive, p9_poll_mux() is called.      .
                                                .
                                                .
p9_poll_mux()                                   .
                                                .
if (!test_and_set_bit(Rworksched,               .
                      &m->wsched)) {            .
  schedule_work(&m->rq);                        .
}                                               .
                                                .
-> does not schedule work because               .
   Rworksched is set                            .
                                                .
                                        clear_bit(Rworksched, &m->wsched);
                                        return;

No work has been scheduled, and yet data are waiting.

Currently p9_read_work() checks if there is data to read,
and if not, it clears Rworksched.

I think it should clear Rworksched first, and then check if there is data to read.

Signed-off-by: Simon Derr <simon.derr@bull.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 net/9p/trans_fd.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 6449bae15702..de1bbad0c7de 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -316,8 +316,7 @@ static void p9_read_work(struct work_struct *work)
 						m->rsize - m->rpos);
 	p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
 	if (err == -EAGAIN) {
-		clear_bit(Rworksched, &m->wsched);
-		return;
+		goto end_clear;
 	}
 
 	if (err <= 0)
@@ -379,19 +378,20 @@ static void p9_read_work(struct work_struct *work)
 		m->req = NULL;
 	}
 
+end_clear:
+	clear_bit(Rworksched, &m->wsched);
+
 	if (!list_empty(&m->req_list)) {
 		if (test_and_clear_bit(Rpending, &m->wsched))
 			n = POLLIN;
 		else
 			n = p9_fd_poll(m->client, NULL);
 
-		if (n & POLLIN) {
+		if ((n & POLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) {
 			p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m);
 			schedule_work(&m->rq);
-		} else
-			clear_bit(Rworksched, &m->wsched);
-	} else
-		clear_bit(Rworksched, &m->wsched);
+		}
+	}
 
 	return;
 error:
-- 
cgit v1.2.3


From 1957b3a86f8eb5ceab32e3aae99e2822258aa530 Mon Sep 17 00:00:00 2001
From: Simon Derr <simon.derr@bull.net>
Date: Mon, 17 Sep 2012 15:16:29 +0200
Subject: 9P: fix test at the end of p9_write_work()

At the end of p9_write_work() we want to test if there is still data to send.
This means:
- either the current request still has data to send (wsize != 0)
- or there are requests in the unsent queue

Signed-off-by: Simon Derr <simon.derr@bull.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 net/9p/trans_fd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index de1bbad0c7de..7088a94b2601 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work)
 	if (m->wpos == m->wsize)
 		m->wpos = m->wsize = 0;
 
-	if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) {
+	if (m->wsize || !list_empty(&m->unsent_req_list)) {
 		if (test_and_clear_bit(Wpending, &m->wsched))
 			n = POLLOUT;
 		else
-- 
cgit v1.2.3


From 584a8c13d58423462680907d4cc40d9929c9030a Mon Sep 17 00:00:00 2001
From: Simon Derr <simon.derr@bull.net>
Date: Mon, 17 Sep 2012 15:16:30 +0200
Subject: 9P: Fix race in p9_write_work()

See previous commit about p9_read_work() for details.

This fixes a similar race between p9_write_work() and p9_poll_mux()

Signed-off-by: Simon Derr <simon.derr@bull.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 net/9p/trans_fd.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 7088a94b2601..b2c308fffb8a 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -476,10 +476,9 @@ static void p9_write_work(struct work_struct *work)
 	clear_bit(Wpending, &m->wsched);
 	err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos);
 	p9_debug(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err);
-	if (err == -EAGAIN) {
-		clear_bit(Wworksched, &m->wsched);
-		return;
-	}
+	if (err == -EAGAIN)
+		goto end_clear;
+
 
 	if (err < 0)
 		goto error;
@@ -492,19 +491,21 @@ static void p9_write_work(struct work_struct *work)
 	if (m->wpos == m->wsize)
 		m->wpos = m->wsize = 0;
 
+end_clear:
+	clear_bit(Wworksched, &m->wsched);
+
 	if (m->wsize || !list_empty(&m->unsent_req_list)) {
 		if (test_and_clear_bit(Wpending, &m->wsched))
 			n = POLLOUT;
 		else
 			n = p9_fd_poll(m->client, NULL);
 
-		if (n & POLLOUT) {
+		if ((n & POLLOUT) &&
+		   !test_and_set_bit(Wworksched, &m->wsched)) {
 			p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m);
 			schedule_work(&m->wq);
-		} else
-			clear_bit(Wworksched, &m->wsched);
-	} else
-		clear_bit(Wworksched, &m->wsched);
+		}
+	}
 
 	return;
 
-- 
cgit v1.2.3


From a519fc7a70d1a918574bb826cc6905b87b482eb9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 12 Sep 2012 16:49:15 -0400
Subject: SUNRPC: Ensure that the TCP socket is closed when in CLOSE_WAIT

Instead of doing a shutdown() call, we need to do an actual close().
Ditto if/when the server is sending us junk RPC headers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Tested-by: Simon Kirby <sim@hostway.ca>
Cc: stable@vger.kernel.org
---
 net/sunrpc/xprtsock.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index a35b8e52e551..d1988cf8bf33 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1025,6 +1025,16 @@ static void xs_udp_data_ready(struct sock *sk, int len)
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
+/*
+ * Helper function to force a TCP close if the server is sending
+ * junk and/or it has put us in CLOSE_WAIT
+ */
+static void xs_tcp_force_close(struct rpc_xprt *xprt)
+{
+	set_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
+	xprt_force_disconnect(xprt);
+}
+
 static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1051,7 +1061,7 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_rea
 	/* Sanity check of the record length */
 	if (unlikely(transport->tcp_reclen < 8)) {
 		dprintk("RPC:       invalid TCP record fragment length\n");
-		xprt_force_disconnect(xprt);
+		xs_tcp_force_close(xprt);
 		return;
 	}
 	dprintk("RPC:       reading TCP record fragment of length %d\n",
@@ -1132,7 +1142,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
 		break;
 	default:
 		dprintk("RPC:       invalid request message type\n");
-		xprt_force_disconnect(&transport->xprt);
+		xs_tcp_force_close(&transport->xprt);
 	}
 	xs_tcp_check_fraghdr(transport);
 }
@@ -1455,6 +1465,8 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt)
 static void xs_sock_mark_closed(struct rpc_xprt *xprt)
 {
 	smp_mb__before_clear_bit();
+	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+	clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
 	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
 	clear_bit(XPRT_CLOSING, &xprt->state);
 	smp_mb__after_clear_bit();
@@ -1512,8 +1524,8 @@ static void xs_tcp_state_change(struct sock *sk)
 		break;
 	case TCP_CLOSE_WAIT:
 		/* The server initiated a shutdown of the socket */
-		xprt_force_disconnect(xprt);
 		xprt->connect_cookie++;
+		xs_tcp_force_close(xprt);
 	case TCP_CLOSING:
 		/*
 		 * If the server closed down the connection, make sure that
@@ -2199,8 +2211,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 		/* We're probably in TIME_WAIT. Get rid of existing socket,
 		 * and retry
 		 */
-		set_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
-		xprt_force_disconnect(xprt);
+		xs_tcp_force_close(xprt);
 		break;
 	case -ECONNREFUSED:
 	case -ECONNRESET:
-- 
cgit v1.2.3


From 84e28a307e376f271505af65a7b7e212dd6f61f4 Mon Sep 17 00:00:00 2001
From: Bryan Schumaker <bjschuma@netapp.com>
Date: Mon, 24 Sep 2012 13:39:01 -0400
Subject: SUNRPC: Set alloc_slot for backchannel tcp ops

f39c1bfb5a03e2d255451bff05be0d7255298fa4 (SUNRPC: Fix a UDP transport
regression) introduced the "alloc_slot" function for xprt operations,
but never created one for the backchannel operations.  This patch fixes
a NULL pointer dereference when mounting with NFSv4.1.

Call Trace:
 [<ffffffffa0207957>] ? xprt_reserve+0x47/0x50 [sunrpc]
 [<ffffffffa02023a4>] call_reserve+0x34/0x60 [sunrpc]
 [<ffffffffa020e280>] __rpc_execute+0x90/0x400 [sunrpc]
 [<ffffffffa020e61a>] rpc_async_schedule+0x2a/0x40 [sunrpc]
 [<ffffffff81073589>] process_one_work+0x139/0x500
 [<ffffffff81070e70>] ? alloc_worker+0x70/0x70
 [<ffffffffa020e5f0>] ? __rpc_execute+0x400/0x400 [sunrpc]
 [<ffffffff81073d1e>] worker_thread+0x15e/0x460
 [<ffffffff8145c839>] ? preempt_schedule+0x49/0x70
 [<ffffffff81073bc0>] ? rescuer_thread+0x230/0x230
 [<ffffffff81079603>] kthread+0x93/0xa0
 [<ffffffff81465d04>] kernel_thread_helper+0x4/0x10
 [<ffffffff81079570>] ? kthread_freezable_should_stop+0x70/0x70
 [<ffffffff81465d00>] ? gs_change+0x13/0x13

Signed-off-by: Bryan Schumaker <bjschuma@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprtsock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index d1988cf8bf33..97f8918169ed 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2539,6 +2539,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 static struct rpc_xprt_ops bc_tcp_ops = {
 	.reserve_xprt		= xprt_reserve_xprt,
 	.release_xprt		= xprt_release_xprt,
+	.alloc_slot		= xprt_alloc_slot,
 	.rpcbind		= xs_local_rpcbind,
 	.buf_alloc		= bc_malloc,
 	.buf_free		= bc_free,
-- 
cgit v1.2.3


From 8a9a8b8332b92b13316cf49685b5dc5257cfe115 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 1 Aug 2012 14:32:13 -0400
Subject: SUNRPC: Fix the return value of xdr_align_pages()

The callers of xdr_align_pages() expect it to return the number of bytes
of actual XDR data remaining in the pages.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xdr.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 0afba1b4b656..fbbd1c475b43 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -742,6 +742,8 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 	/* Truncate page data and move it into the tail */
 	if (buf->page_len > len)
 		xdr_shrink_pagelen(buf, buf->page_len - len);
+	else
+		len = buf->page_len;
 	xdr->nwords = XDR_QUADLEN(buf->len - cur);
 	return len;
 }
-- 
cgit v1.2.3


From 28407630513b1a86133db0ef8b39fabad6c494af Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 17 Aug 2012 23:54:15 -0400
Subject: take descriptor handling from sock_alloc_file() to callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/socket.c | 62 +++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 40 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index edc3c4af9085..a14ec19164b6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -346,22 +346,15 @@ static struct file_system_type sock_fs_type = {
  *	but we take care of internal coherence yet.
  */
 
-static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
+static struct file *sock_alloc_file(struct socket *sock, int flags)
 {
 	struct qstr name = { .name = "" };
 	struct path path;
 	struct file *file;
-	int fd;
-
-	fd = get_unused_fd_flags(flags);
-	if (unlikely(fd < 0))
-		return fd;
 
 	path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);
-	if (unlikely(!path.dentry)) {
-		put_unused_fd(fd);
-		return -ENOMEM;
-	}
+	if (unlikely(!path.dentry))
+		return ERR_PTR(-ENOMEM);
 	path.mnt = mntget(sock_mnt);
 
 	d_instantiate(path.dentry, SOCK_INODE(sock));
@@ -373,28 +366,31 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
 		/* drop dentry, keep inode */
 		ihold(path.dentry->d_inode);
 		path_put(&path);
-		put_unused_fd(fd);
-		return -ENFILE;
+		return ERR_PTR(-ENFILE);
 	}
 
 	sock->file = file;
 	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
 	file->f_pos = 0;
 	file->private_data = sock;
-
-	*f = file;
-	return fd;
+	return file;
 }
 
 int sock_map_fd(struct socket *sock, int flags)
 {
 	struct file *newfile;
-	int fd = sock_alloc_file(sock, &newfile, flags);
+	int fd = get_unused_fd_flags(flags);
+	if (unlikely(fd < 0))
+		return fd;
 
-	if (likely(fd >= 0))
+	newfile = sock_alloc_file(sock, flags);
+	if (likely(!IS_ERR(newfile))) {
 		fd_install(fd, newfile);
+		return fd;
+	}
 
-	return fd;
+	put_unused_fd(fd);
+	return PTR_ERR(newfile);
 }
 EXPORT_SYMBOL(sock_map_fd);
 
@@ -1394,17 +1390,32 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
 	if (err < 0)
 		goto out_release_both;
 
-	fd1 = sock_alloc_file(sock1, &newfile1, flags);
+	fd1 = get_unused_fd_flags(flags);
 	if (unlikely(fd1 < 0)) {
 		err = fd1;
 		goto out_release_both;
 	}
-
-	fd2 = sock_alloc_file(sock2, &newfile2, flags);
+	fd2 = get_unused_fd_flags(flags);
 	if (unlikely(fd2 < 0)) {
 		err = fd2;
+		put_unused_fd(fd1);
+		goto out_release_both;
+	}
+
+	newfile1 = sock_alloc_file(sock1, flags);
+	if (unlikely(IS_ERR(newfile1))) {
+		err = PTR_ERR(newfile1);
+		put_unused_fd(fd1);
+		put_unused_fd(fd2);
+		goto out_release_both;
+	}
+
+	newfile2 = sock_alloc_file(sock2, flags);
+	if (IS_ERR(newfile2)) {
+		err = PTR_ERR(newfile2);
 		fput(newfile1);
 		put_unused_fd(fd1);
+		put_unused_fd(fd2);
 		sock_release(sock2);
 		goto out;
 	}
@@ -1536,12 +1547,19 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
 	 */
 	__module_get(newsock->ops->owner);
 
-	newfd = sock_alloc_file(newsock, &newfile, flags);
+	newfd = get_unused_fd_flags(flags);
 	if (unlikely(newfd < 0)) {
 		err = newfd;
 		sock_release(newsock);
 		goto out_put;
 	}
+	newfile = sock_alloc_file(newsock, flags);
+	if (unlikely(IS_ERR(newfile))) {
+		err = PTR_ERR(newfile);
+		put_unused_fd(newfd);
+		sock_release(newsock);
+		goto out_put;
+	}
 
 	err = security_socket_accept(sock, newsock);
 	if (err)
-- 
cgit v1.2.3


From 56b31d1c9f1e6a3ad92e7bfe252721e05d92b285 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Aug 2012 00:25:51 -0400
Subject: unexport sock_map_fd(), switch to sock_alloc_file()

Both modular callers of sock_map_fd() had been buggy; sctp one leaks
descriptor and file if copy_to_user() fails, 9p one shouldn't be
exposing file in the descriptor table at all.

Switch both to sock_alloc_file(), export it, unexport sock_map_fd() and
make it static.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/net.h |  3 ++-
 net/9p/trans_fd.c   | 16 +++++++---------
 net/sctp/socket.c   | 25 ++++++++++++++++++++-----
 net/socket.c        |  6 +++---
 4 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/linux/net.h b/include/linux/net.h
index 99276c3dc89a..c8a9708d4d66 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -65,6 +65,7 @@ typedef enum {
 struct poll_table_struct;
 struct pipe_inode_info;
 struct inode;
+struct file;
 struct net;
 
 #define SOCK_ASYNC_NOSPACE	0
@@ -246,7 +247,7 @@ extern int   	     sock_sendmsg(struct socket *sock, struct msghdr *msg,
 				  size_t len);
 extern int	     sock_recvmsg(struct socket *sock, struct msghdr *msg,
 				  size_t size, int flags);
-extern int 	     sock_map_fd(struct socket *sock, int flags);
+extern struct file  *sock_alloc_file(struct socket *sock, int flags);
 extern struct socket *sockfd_lookup(int fd, int *err);
 extern struct socket *sock_from_file(struct file *file, int *err);
 #define		     sockfd_put(sock) fput(sock->file)
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 6449bae15702..8c4e0b538a8a 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -793,30 +793,28 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
 static int p9_socket_open(struct p9_client *client, struct socket *csocket)
 {
 	struct p9_trans_fd *p;
-	int ret, fd;
+	struct file *file;
+	int ret;
 
 	p = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
 
 	csocket->sk->sk_allocation = GFP_NOIO;
-	fd = sock_map_fd(csocket, 0);
-	if (fd < 0) {
+	file = sock_alloc_file(csocket, 0);
+	if (IS_ERR(file)) {
 		pr_err("%s (%d): failed to map fd\n",
 		       __func__, task_pid_nr(current));
 		sock_release(csocket);
 		kfree(p);
-		return fd;
+		return PTR_ERR(file);
 	}
 
-	get_file(csocket->file);
-	get_file(csocket->file);
-	p->wr = p->rd = csocket->file;
+	get_file(file);
+	p->wr = p->rd = file;
 	client->trans = p;
 	client->status = Connected;
 
-	sys_close(fd);	/* still racy */
-
 	p->rd->f_flags |= O_NONBLOCK;
 
 	p->conn = p9_conn_create(client);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5e259817a7f3..fb5931ca50d0 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -70,6 +70,7 @@
 #include <linux/init.h>
 #include <linux/crypto.h>
 #include <linux/slab.h>
+#include <linux/file.h>
 
 #include <net/ip.h>
 #include <net/icmp.h>
@@ -4276,6 +4277,7 @@ static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval
 {
 	sctp_peeloff_arg_t peeloff;
 	struct socket *newsock;
+	struct file *newfile;
 	int retval = 0;
 
 	if (len < sizeof(sctp_peeloff_arg_t))
@@ -4289,22 +4291,35 @@ static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval
 		goto out;
 
 	/* Map the socket to an unused fd that can be returned to the user.  */
-	retval = sock_map_fd(newsock, 0);
+	retval = get_unused_fd();
 	if (retval < 0) {
 		sock_release(newsock);
 		goto out;
 	}
 
+	newfile = sock_alloc_file(newsock, 0);
+	if (unlikely(IS_ERR(newfile))) {
+		put_unused_fd(retval);
+		sock_release(newsock);
+		return PTR_ERR(newfile);
+	}
+
 	SCTP_DEBUG_PRINTK("%s: sk: %p newsk: %p sd: %d\n",
 			  __func__, sk, newsock->sk, retval);
 
 	/* Return the fd mapped to the new socket.  */
+	if (put_user(len, optlen)) {
+		fput(newfile);
+		put_unused_fd(retval);
+		return -EFAULT;
+	}
 	peeloff.sd = retval;
-	if (put_user(len, optlen))
+	if (copy_to_user(optval, &peeloff, len)) {
+		fput(newfile);
+		put_unused_fd(retval);
 		return -EFAULT;
-	if (copy_to_user(optval, &peeloff, len))
-		retval = -EFAULT;
-
+	}
+	fd_install(retval, newfile);
 out:
 	return retval;
 }
diff --git a/net/socket.c b/net/socket.c
index a14ec19164b6..38a14311f3a6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -346,7 +346,7 @@ static struct file_system_type sock_fs_type = {
  *	but we take care of internal coherence yet.
  */
 
-static struct file *sock_alloc_file(struct socket *sock, int flags)
+struct file *sock_alloc_file(struct socket *sock, int flags)
 {
 	struct qstr name = { .name = "" };
 	struct path path;
@@ -375,8 +375,9 @@ static struct file *sock_alloc_file(struct socket *sock, int flags)
 	file->private_data = sock;
 	return file;
 }
+EXPORT_SYMBOL(sock_alloc_file);
 
-int sock_map_fd(struct socket *sock, int flags)
+static int sock_map_fd(struct socket *sock, int flags)
 {
 	struct file *newfile;
 	int fd = get_unused_fd_flags(flags);
@@ -392,7 +393,6 @@ int sock_map_fd(struct socket *sock, int flags)
 	put_unused_fd(fd);
 	return PTR_ERR(newfile);
 }
-EXPORT_SYMBOL(sock_map_fd);
 
 struct socket *sock_from_file(struct file *file, int *err)
 {
-- 
cgit v1.2.3


From c3c073f808b22dfae15ef8412b6f7b998644139a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Aug 2012 22:32:06 -0400
Subject: new helper: iterate_fd()

iterates through the opened files in given descriptor table,
calling a supplied function; we stop once non-zero is returned.
Callback gets struct file *, descriptor number and const void *
argument passed to iterator.  It is called with files->file_lock
held, so it is not allowed to block.

tty_io, netprio_cgroup and selinux flush_unauthorized_files()
converted to its use.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/tty/tty_io.c      | 36 +++++++++++-------------------
 fs/file.c                 | 21 +++++++++++++++++
 include/linux/fdtable.h   |  3 +++
 net/core/netprio_cgroup.c | 38 ++++++++++---------------------
 security/selinux/hooks.c  | 57 ++++++++++++++++++-----------------------------
 5 files changed, 71 insertions(+), 84 deletions(-)

(limited to 'net')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index b425c79675ad..71d95cfbabec 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -2791,6 +2791,13 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd,
 }
 #endif
 
+static int this_tty(const void *t, struct file *file, unsigned fd)
+{
+	if (likely(file->f_op->read != tty_read))
+		return 0;
+	return file_tty(file) != t ? 0 : fd + 1;
+}
+	
 /*
  * This implements the "Secure Attention Key" ---  the idea is to
  * prevent trojan horses by killing all processes associated with this
@@ -2818,8 +2825,6 @@ void __do_SAK(struct tty_struct *tty)
 	struct task_struct *g, *p;
 	struct pid *session;
 	int		i;
-	struct file	*filp;
-	struct fdtable *fdt;
 
 	if (!tty)
 		return;
@@ -2849,27 +2854,12 @@ void __do_SAK(struct tty_struct *tty)
 			continue;
 		}
 		task_lock(p);
-		if (p->files) {
-			/*
-			 * We don't take a ref to the file, so we must
-			 * hold ->file_lock instead.
-			 */
-			spin_lock(&p->files->file_lock);
-			fdt = files_fdtable(p->files);
-			for (i = 0; i < fdt->max_fds; i++) {
-				filp = fcheck_files(p->files, i);
-				if (!filp)
-					continue;
-				if (filp->f_op->read == tty_read &&
-				    file_tty(filp) == tty) {
-					printk(KERN_NOTICE "SAK: killed process %d"
-					    " (%s): fd#%d opened to the tty\n",
-					    task_pid_nr(p), p->comm, i);
-					force_sig(SIGKILL, p);
-					break;
-				}
-			}
-			spin_unlock(&p->files->file_lock);
+		i = iterate_fd(p->files, 0, this_tty, tty);
+		if (i != 0) {
+			printk(KERN_NOTICE "SAK: killed process %d"
+			    " (%s): fd#%d opened to the tty\n",
+				    task_pid_nr(p), p->comm, i - 1);
+			force_sig(SIGKILL, p);
 		}
 		task_unlock(p);
 	} while_each_thread(g, p);
diff --git a/fs/file.c b/fs/file.c
index 967bd0dadbe5..e6e418122587 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -979,3 +979,24 @@ int f_dupfd(unsigned int from, struct file *file, unsigned flags)
 	}
 	return err;
 }
+
+int iterate_fd(struct files_struct *files, unsigned n,
+		int (*f)(const void *, struct file *, unsigned),
+		const void *p)
+{
+	struct fdtable *fdt;
+	struct file *file;
+	int res = 0;
+	if (!files)
+		return 0;
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	while (!res && n < fdt->max_fds) {
+		file = rcu_dereference_check_fdtable(files, fdt->fd[n++]);
+		if (file)
+			res = f(p, file, n);
+	}
+	spin_unlock(&files->file_lock);
+	return res;
+}
+EXPORT_SYMBOL(iterate_fd);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index de2b71caa0f0..fb7dacae0522 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,6 +98,9 @@ void reset_files_struct(struct files_struct *);
 int unshare_files(struct files_struct **);
 struct files_struct *dup_fd(struct files_struct *, int *);
 void do_close_on_exec(struct files_struct *);
+int iterate_fd(struct files_struct *, unsigned,
+		int (*)(const void *, struct file *, unsigned),
+		const void *);
 
 extern int __alloc_fd(struct files_struct *files,
 		      unsigned start, unsigned end, unsigned flags);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index c75e3f9d060f..5ffd084c6a83 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -272,38 +272,24 @@ out_free_devname:
 	return ret;
 }
 
+static int update_netprio(const void *v, struct file *file, unsigned n)
+{
+	int err;
+	struct socket *sock = sock_from_file(file, &err);
+	if (sock)
+		sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+	return 0;
+}
+
 void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct task_struct *p;
+	void *v;
 
 	cgroup_taskset_for_each(p, cgrp, tset) {
-		unsigned int fd;
-		struct fdtable *fdt;
-		struct files_struct *files;
-
 		task_lock(p);
-		files = p->files;
-		if (!files) {
-			task_unlock(p);
-			continue;
-		}
-
-		spin_lock(&files->file_lock);
-		fdt = files_fdtable(files);
-		for (fd = 0; fd < fdt->max_fds; fd++) {
-			struct file *file;
-			struct socket *sock;
-			int err;
-
-			file = fcheck_files(files, fd);
-			if (!file)
-				continue;
-
-			sock = sock_from_file(file, &err);
-			if (sock)
-				sock_update_netprioidx(sock->sk, p);
-		}
-		spin_unlock(&files->file_lock);
+		v = (void *)(unsigned long)task_netprioidx(p);
+		iterate_fd(p->files, 0, update_netprio, v);
 		task_unlock(p);
 	}
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 00b50113642d..4dfbcea10eb7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2088,15 +2088,19 @@ static int selinux_bprm_secureexec(struct linux_binprm *bprm)
 	return (atsecure || cap_bprm_secureexec(bprm));
 }
 
+static int match_file(const void *p, struct file *file, unsigned fd)
+{
+	return file_has_perm(p, file, file_to_av(file)) ? fd + 1 : 0;
+}
+
 /* Derived from fs/exec.c:flush_old_files. */
 static inline void flush_unauthorized_files(const struct cred *cred,
 					    struct files_struct *files)
 {
 	struct file *file, *devnull = NULL;
 	struct tty_struct *tty;
-	struct fdtable *fdt;
-	long j = -1;
 	int drop_tty = 0;
+	unsigned n;
 
 	tty = get_current_tty();
 	if (tty) {
@@ -2123,41 +2127,24 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 		no_tty();
 
 	/* Revalidate access to inherited open files. */
-	spin_lock(&files->file_lock);
-	for (;;) {
-		unsigned long set, i;
-		j++;
-		i = j * BITS_PER_LONG;
-		fdt = files_fdtable(files);
-		if (i >= fdt->max_fds)
-			break;
-		set = fdt->open_fds[j];
-		if (!set)
-			continue;
-		spin_unlock(&files->file_lock);
-		for ( ; set ; i++, set >>= 1) {
-			if (!(set & 1))
-				continue;
-			file = fget(i);
-			if (!file)
-				continue;
-			if (file_has_perm(cred, file, file_to_av(file))) {
-				if (devnull) {
-					get_file(devnull);
-				} else {
-					devnull = dentry_open(&selinux_null,
-								O_RDWR, cred);
-					if (IS_ERR(devnull))
-						devnull = NULL;
-				}
-				replace_fd(i, devnull, 0);
-			}
-			fput(file);
-		}
-		spin_lock(&files->file_lock);
+	n = iterate_fd(files, 0, match_file, cred);
+	if (!n) /* none found? */
+		return;
 
+	devnull = dentry_open(&selinux_null, O_RDWR, cred);
+	if (!IS_ERR(devnull)) {
+		/* replace all the matching ones with this */
+		do {
+			get_file(devnull);
+			replace_fd(n - 1, devnull, 0);
+		} while ((n = iterate_fd(files, n, match_file, cred)) != 0);
+		fput(devnull);
+	} else {
+		/* just close all the matching ones */
+		do {
+			replace_fd(n - 1, NULL, 0);
+		} while ((n = iterate_fd(files, n, match_file, cred)) != 0);
 	}
-	spin_unlock(&files->file_lock);
 }
 
 /*
-- 
cgit v1.2.3


From cb0942b81249798e15c3f04eee2946ef543e8115 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 14:48:26 -0400
Subject: make get_file() return its argument

simplifies a bunch of callers...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/perfmon.c         | 4 +---
 drivers/base/dma-buf.c             | 3 +--
 drivers/staging/omapdrm/omap_gem.c | 3 +--
 drivers/tty/tty_io.c               | 9 +++------
 fs/autofs4/waitq.c                 | 3 +--
 fs/fuse/dev.c                      | 3 +--
 fs/nfsd/nfs4state.c                | 3 +--
 fs/proc/base.c                     | 3 +--
 fs/select.c                        | 3 +--
 include/linux/fs.h                 | 6 +++++-
 mm/fremap.c                        | 3 +--
 mm/mmap.c                          | 3 +--
 mm/nommu.c                         | 6 ++----
 net/compat.c                       | 3 +--
 net/core/scm.c                     | 3 +--
 security/selinux/hooks.c           | 3 +--
 16 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 79826c13b8b6..ff5d4e4c3733 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2306,7 +2306,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	 * partially initialize the vma for the sampling buffer
 	 */
 	vma->vm_mm	     = mm;
-	vma->vm_file	     = filp;
+	vma->vm_file	     = get_file(filp);
 	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED;
 	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
 
@@ -2345,8 +2345,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 		goto error;
 	}
 
-	get_file(filp);
-
 	/*
 	 * now insert the vma in the vm list for the process, must be
 	 * done with mmap lock held
diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c
index c30f3e1d0efc..460e22dee36d 100644
--- a/drivers/base/dma-buf.c
+++ b/drivers/base/dma-buf.c
@@ -460,8 +460,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
 	if (vma->vm_file)
 		fput(vma->vm_file);
 
-	vma->vm_file = dmabuf->file;
-	get_file(vma->vm_file);
+	vma->vm_file = get_file(dmabuf->file);
 
 	vma->vm_pgoff = pgoff;
 
diff --git a/drivers/staging/omapdrm/omap_gem.c b/drivers/staging/omapdrm/omap_gem.c
index 3a0d035a9e03..2a6bb7f9ee68 100644
--- a/drivers/staging/omapdrm/omap_gem.c
+++ b/drivers/staging/omapdrm/omap_gem.c
@@ -566,9 +566,8 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj,
 		 * in particular in the case of mmap'd dmabufs)
 		 */
 		fput(vma->vm_file);
-		get_file(obj->filp);
 		vma->vm_pgoff = 0;
-		vma->vm_file  = obj->filp;
+		vma->vm_file  = get_file(obj->filp);
 
 		vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	}
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 71d95cfbabec..c7561f29d894 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1163,10 +1163,8 @@ ssize_t redirected_tty_write(struct file *file, const char __user *buf,
 	struct file *p = NULL;
 
 	spin_lock(&redirect_lock);
-	if (redirect) {
-		get_file(redirect);
-		p = redirect;
-	}
+	if (redirect)
+		p = get_file(redirect);
 	spin_unlock(&redirect_lock);
 
 	if (p) {
@@ -2246,8 +2244,7 @@ static int tioccons(struct file *file)
 		spin_unlock(&redirect_lock);
 		return -EBUSY;
 	}
-	get_file(file);
-	redirect = file;
+	redirect = get_file(file);
 	spin_unlock(&redirect_lock);
 	return 0;
 }
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index da8876d38a7b..dce436e595c1 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -175,8 +175,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		return;
 	}
 
-	pipe = sbi->pipe;
-	get_file(pipe);
+	pipe = get_file(sbi->pipe);
 
 	mutex_unlock(&sbi->wq_mutex);
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f4246cfc8d87..8c23fa7a91e6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -148,8 +148,7 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
 		if (ff->reserved_req) {
 			req = ff->reserved_req;
 			ff->reserved_req = NULL;
-			get_file(file);
-			req->stolen_file = file;
+			req->stolen_file = get_file(file);
 		}
 		spin_unlock(&fc->lock);
 	} while (!req);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cc894eda385a..48a1bad37334 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2837,8 +2837,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
 		return -ENOMEM;
 	}
 	fp->fi_lease = fl;
-	fp->fi_deleg_file = fl->fl_file;
-	get_file(fp->fi_deleg_file);
+	fp->fi_deleg_file = get_file(fl->fl_file);
 	atomic_set(&fp->fi_delegees, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b55c3bb298e3..f1e8438d21b5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1979,8 +1979,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				if (++pos <= filp->f_pos)
 					continue;
 
-				get_file(vma->vm_file);
-				info.file = vma->vm_file;
+				info.file = get_file(vma->vm_file);
 				info.len = snprintf(info.name,
 						sizeof(info.name), "%lx-%lx",
 						vma->vm_start, vma->vm_end);
diff --git a/fs/select.c b/fs/select.c
index db14c781335e..ffdd16d6e691 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -220,8 +220,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	struct poll_table_entry *entry = poll_get_entry(pwq);
 	if (!entry)
 		return;
-	get_file(filp);
-	entry->filp = filp;
+	entry->filp = get_file(filp);
 	entry->wait_address = wait_address;
 	entry->key = p->_key;
 	init_waitqueue_func_entry(&entry->wait, pollwake);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aa110476a95b..de1db1c12080 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1074,7 +1074,11 @@ struct file_handle {
 	unsigned char f_handle[0];
 };
 
-#define get_file(x)	atomic_long_inc(&(x)->f_count)
+static inline struct file *get_file(struct file *f)
+{
+	atomic_long_inc(&f->f_count);
+	return f;
+}
 #define fput_atomic(x)	atomic_long_add_unless(&(x)->f_count, -1, 1)
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
diff --git a/mm/fremap.c b/mm/fremap.c
index 9ed4fd432467..048659c0c03d 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -195,10 +195,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		 */
 		if (mapping_cap_account_dirty(mapping)) {
 			unsigned long addr;
-			struct file *file = vma->vm_file;
+			struct file *file = get_file(vma->vm_file);
 
 			flags &= MAP_NONBLOCK;
-			get_file(file);
 			addr = mmap_region(file, start, size,
 					flags, vma->vm_flags, pgoff);
 			fput(file);
diff --git a/mm/mmap.c b/mm/mmap.c
index ae18a48e7e4e..872441e81914 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1301,8 +1301,7 @@ munmap_back:
 				goto free_vma;
 			correct_wcount = 1;
 		}
-		vma->vm_file = file;
-		get_file(file);
+		vma->vm_file = get_file(file);
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
diff --git a/mm/nommu.c b/mm/nommu.c
index d4b0c10872de..dee2ff89fd58 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1282,10 +1282,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
-		region->vm_file = file;
-		get_file(file);
-		vma->vm_file = file;
-		get_file(file);
+		region->vm_file = get_file(file);
+		vma->vm_file = get_file(file);
 		if (vm_flags & VM_EXECUTABLE) {
 			added_exe_file_vma(current->mm);
 			vma->vm_mm = current->mm;
diff --git a/net/compat.c b/net/compat.c
index 74ed1d7a84a2..79ae88485001 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -301,8 +301,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
 			break;
 		}
 		/* Bump the usage count and install the file. */
-		get_file(fp[i]);
-		fd_install(new_fd, fp[i]);
+		fd_install(new_fd, get_file(fp[i]));
 	}
 
 	if (i > 0) {
diff --git a/net/core/scm.c b/net/core/scm.c
index 040cebeed45b..b0098d259233 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -281,11 +281,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 			break;
 		}
 		/* Bump the usage count and install the file. */
-		get_file(fp[i]);
 		sock = sock_from_file(fp[i], &err);
 		if (sock)
 			sock_update_netprioidx(sock->sk, current);
-		fd_install(new_fd, fp[i]);
+		fd_install(new_fd, get_file(fp[i]));
 	}
 
 	if (i > 0)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4dfbcea10eb7..651d8456611a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2135,8 +2135,7 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 	if (!IS_ERR(devnull)) {
 		/* replace all the matching ones with this */
 		do {
-			get_file(devnull);
-			replace_fd(n - 1, devnull, 0);
+			replace_fd(n - 1, get_file(devnull), 0);
 		} while ((n = iterate_fd(files, n, match_file, cred)) != 0);
 		fput(devnull);
 	} else {
-- 
cgit v1.2.3


From a11a2bf4de5679fa0b63474c7d39bea2dac7d061 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 2 Aug 2012 13:21:43 -0400
Subject: SUNRPC: Optimise away unnecessary data moves in xdr_align_pages

We only have to call xdr_shrink_pagelen() if the remaining RPC
message does not fit in the page buffer length that we supplied
to xdr_align_pages().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xdr.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index fbbd1c475b43..08f50afd5f2a 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -730,21 +730,24 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 
 	if (xdr->nwords == 0)
 		return 0;
-	if (nwords > xdr->nwords) {
-		nwords = xdr->nwords;
-		len = nwords << 2;
-	}
 	/* Realign pages to current pointer position */
 	iov  = buf->head;
-	if (iov->iov_len > cur)
+	if (iov->iov_len > cur) {
 		xdr_shrink_bufhead(buf, iov->iov_len - cur);
+		xdr->nwords = XDR_QUADLEN(buf->len - cur);
+	}
 
-	/* Truncate page data and move it into the tail */
-	if (buf->page_len > len)
-		xdr_shrink_pagelen(buf, buf->page_len - len);
-	else
+	if (nwords > xdr->nwords) {
+		nwords = xdr->nwords;
+		len = nwords << 2;
+	}
+	if (buf->page_len <= len)
 		len = buf->page_len;
-	xdr->nwords = XDR_QUADLEN(buf->len - cur);
+	else if (nwords < xdr->nwords) {
+		/* Truncate page data and move it into the tail */
+		xdr_shrink_pagelen(buf, buf->page_len - len);
+		xdr->nwords = XDR_QUADLEN(buf->len - cur);
+	}
 	return len;
 }
 
-- 
cgit v1.2.3


From d19751e7b9bd8a01d00372325439589886674f79 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 11 Sep 2012 17:21:25 -0400
Subject: SUNRPC: Get rid of the redundant xprt->shutdown bit field

It is only set after everyone has dereferenced the transport,
and serves no useful purpose: setting it is racy, so all the
socket code, etc still needs to be able to cope with the cases
where they miss reading it.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xprt.h     |  3 +--
 net/sunrpc/xprt.c               |  8 ++------
 net/sunrpc/xprtrdma/transport.c | 22 ++++++++--------------
 net/sunrpc/xprtsock.c           | 18 ------------------
 4 files changed, 11 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index bf8c49ff7530..951cb9b7d02b 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -173,8 +173,7 @@ struct rpc_xprt {
 	unsigned int		min_reqs;	/* min number of slots */
 	atomic_t		num_reqs;	/* total slots */
 	unsigned long		state;		/* transport state */
-	unsigned char		shutdown   : 1,	/* being shut down */
-				resvport   : 1; /* use a reserved port */
+	unsigned char		resvport   : 1; /* use a reserved port */
 	unsigned int		swapper;	/* we're swapping over this
 						   transport */
 	unsigned int		bind_index;	/* bind function index */
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 5d7f61d7559c..bd462a532acf 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -231,7 +231,7 @@ EXPORT_SYMBOL_GPL(xprt_reserve_xprt);
 static void xprt_clear_locked(struct rpc_xprt *xprt)
 {
 	xprt->snd_task = NULL;
-	if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state) || xprt->shutdown) {
+	if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) {
 		smp_mb__before_clear_bit();
 		clear_bit(XPRT_LOCKED, &xprt->state);
 		smp_mb__after_clear_bit();
@@ -504,9 +504,6 @@ EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space);
  */
 void xprt_write_space(struct rpc_xprt *xprt)
 {
-	if (unlikely(xprt->shutdown))
-		return;
-
 	spin_lock_bh(&xprt->transport_lock);
 	if (xprt->snd_task) {
 		dprintk("RPC:       write space: waking waiting task on "
@@ -679,7 +676,7 @@ xprt_init_autodisconnect(unsigned long data)
 	struct rpc_xprt *xprt = (struct rpc_xprt *)data;
 
 	spin_lock(&xprt->transport_lock);
-	if (!list_empty(&xprt->recv) || xprt->shutdown)
+	if (!list_empty(&xprt->recv))
 		goto out_abort;
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
 		goto out_abort;
@@ -1262,7 +1259,6 @@ out:
 static void xprt_destroy(struct rpc_xprt *xprt)
 {
 	dprintk("RPC:       destroying transport %p\n", xprt);
-	xprt->shutdown = 1;
 	del_timer_sync(&xprt->timer);
 
 	rpc_destroy_wait_queue(&xprt->binding);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 5d9202dc7cb1..c9aa7a35f3bf 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -199,21 +199,15 @@ xprt_rdma_connect_worker(struct work_struct *work)
 	struct rpc_xprt *xprt = &r_xprt->xprt;
 	int rc = 0;
 
-	if (!xprt->shutdown) {
-		current->flags |= PF_FSTRANS;
-		xprt_clear_connected(xprt);
-
-		dprintk("RPC:       %s: %sconnect\n", __func__,
-				r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
-		rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
-		if (rc)
-			goto out;
-	}
-	goto out_clear;
+	current->flags |= PF_FSTRANS;
+	xprt_clear_connected(xprt);
+
+	dprintk("RPC:       %s: %sconnect\n", __func__,
+			r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
+	rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+	if (rc)
+		xprt_wake_pending_tasks(xprt, rc);
 
-out:
-	xprt_wake_pending_tasks(xprt, rc);
-out_clear:
 	dprintk("RPC:       %s: exit\n", __func__);
 	xprt_clear_connecting(xprt);
 	current->flags &= ~PF_FSTRANS;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 97f8918169ed..aaaadfbe36e9 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -917,9 +917,6 @@ static void xs_local_data_ready(struct sock *sk, int len)
 	if (skb == NULL)
 		goto out;
 
-	if (xprt->shutdown)
-		goto dropit;
-
 	repsize = skb->len - sizeof(rpc_fraghdr);
 	if (repsize < 4) {
 		dprintk("RPC:       impossible RPC reply size %d\n", repsize);
@@ -981,9 +978,6 @@ static void xs_udp_data_ready(struct sock *sk, int len)
 	if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
 		goto out;
 
-	if (xprt->shutdown)
-		goto dropit;
-
 	repsize = skb->len - sizeof(struct udphdr);
 	if (repsize < 4) {
 		dprintk("RPC:       impossible RPC reply size %d!\n", repsize);
@@ -1412,9 +1406,6 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes)
 	read_lock_bh(&sk->sk_callback_lock);
 	if (!(xprt = xprt_from_sock(sk)))
 		goto out;
-	if (xprt->shutdown)
-		goto out;
-
 	/* Any data means we had a useful conversation, so
 	 * the we don't need to delay the next reconnect
 	 */
@@ -1901,9 +1892,6 @@ static void xs_local_setup_socket(struct work_struct *work)
 	struct socket *sock;
 	int status = -EIO;
 
-	if (xprt->shutdown)
-		goto out;
-
 	current->flags |= PF_FSTRANS;
 
 	clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
@@ -2020,9 +2008,6 @@ static void xs_udp_setup_socket(struct work_struct *work)
 	struct socket *sock = transport->sock;
 	int status = -EIO;
 
-	if (xprt->shutdown)
-		goto out;
-
 	current->flags |= PF_FSTRANS;
 
 	/* Start by resetting any existing state */
@@ -2168,9 +2153,6 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 	struct rpc_xprt *xprt = &transport->xprt;
 	int status = -EIO;
 
-	if (xprt->shutdown)
-		goto out;
-
 	current->flags |= PF_FSTRANS;
 
 	if (!sock) {
-- 
cgit v1.2.3


From 9b96ce71974127af0304514d310abe596426c112 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 28 Sep 2012 20:24:16 -0400
Subject: SUNRPC: Limit the rpciod workqueue concurrency

We shouldn't need more than 1 worker thread per cpu, since rpciod
is designed to run without sleeping in most cases.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 128494ec9a64..6357fcb00c7e 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1022,7 +1022,7 @@ static int rpciod_start(void)
 	 * Create the rpciod thread and wait for it to start.
 	 */
 	dprintk("RPC:       creating workqueue rpciod\n");
-	wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
+	wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 1);
 	rpciod_workqueue = wq;
 	return rpciod_workqueue != NULL;
 }
-- 
cgit v1.2.3


From 290e33593d76d1cebf873da50e036559c4820af9 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Fri, 17 Aug 2012 09:47:49 -0700
Subject: libceph: remove unused monc->have_fsid

This is unused; use monc->client->have_fsid.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 include/linux/ceph/mon_client.h | 1 -
 net/ceph/mon_client.c           | 1 -
 2 files changed, 2 deletions(-)

(limited to 'net')

diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 2113e3850a4e..1dc508aeb73d 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -71,7 +71,6 @@ struct ceph_mon_client {
 	int cur_mon;                       /* last monitor i contacted */
 	unsigned long sub_sent, sub_renew_after;
 	struct ceph_connection con;
-	bool have_fsid;
 
 	/* pending generic requests */
 	struct rb_root generic_request_tree;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 900ea0f043fc..e98f6070b5ae 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -769,7 +769,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
 	}
 	monc->monmap->num_mon = num_mon;
-	monc->have_fsid = false;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7698f2f5e0d7b7a062213fa970b7c4e121adf38e Mon Sep 17 00:00:00 2001
From: Iulius Curt <iulius.curt@gmail.com>
Date: Thu, 23 Aug 2012 15:14:29 +0300
Subject: libceph: Fix sparse warning

Make ceph_monc_do_poolop() static to remove the following sparse warning:
 * net/ceph/mon_client.c:616:5: warning: symbol 'ceph_monc_do_poolop' was not
   declared. Should it be static?
Also drops the 'ceph_monc_' prefix, now being a private function.

Signed-off-by: Iulius Curt <icurt@ixiacom.com>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/mon_client.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index e98f6070b5ae..812eb3b46c1f 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -637,7 +637,7 @@ bad:
 /*
  * Do a synchronous pool op.
  */
-int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+static int do_poolop(struct ceph_mon_client *monc, u32 op,
 			u32 pool, u64 snapid,
 			char *buf, int len)
 {
@@ -687,7 +687,7 @@ out:
 int ceph_monc_create_snapid(struct ceph_mon_client *monc,
 			    u32 pool, u64 *snapid)
 {
-	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
+	return do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
 				   pool, 0, (char *)snapid, sizeof(*snapid));
 
 }
@@ -696,7 +696,7 @@ EXPORT_SYMBOL(ceph_monc_create_snapid);
 int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
 			    u32 pool, u64 snapid)
 {
-	return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
+	return do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
 				   pool, snapid, 0, 0);
 
 }
-- 
cgit v1.2.3


From cc4829e5967de577794b25dfcd1a65e509d171ed Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Wed, 5 Sep 2012 14:34:32 +0800
Subject: ceph: use list_move_tail instead of list_del/list_add_tail

Using list_move_tail() instead of list_del() + list_add_tail().

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Sage Weil <sage@inktank.com>
---
 net/ceph/pagelist.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 665cd23020ff..92866bebb65f 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -1,4 +1,3 @@
-
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/pagemap.h>
@@ -134,8 +133,8 @@ int ceph_pagelist_truncate(struct ceph_pagelist *pl,
 	ceph_pagelist_unmap_tail(pl);
 	while (pl->head.prev != c->page_lru) {
 		page = list_entry(pl->head.prev, struct page, lru);
-		list_del(&page->lru);                /* remove from pagelist */
-		list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
+		/* move from pagelist to reserve */
+		list_move_tail(&page->lru, &pl->free_list);
 		++pl->num_pages_free;
 	}
 	pl->room = c->room;
-- 
cgit v1.2.3


From d63b77f4c552cc3a20506871046ab0fcbc332609 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 24 Sep 2012 20:59:48 -0700
Subject: libceph: check for invalid mapping

If we encounter an invalid (e.g., zeroed) mapping, return an error
and avoid a divide by zero.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 include/linux/ceph/osd_client.h |  2 +-
 include/linux/ceph/osdmap.h     |  6 +++---
 net/ceph/osd_client.c           | 32 ++++++++++++++++++++------------
 net/ceph/osdmap.c               | 18 ++++++++++++++++--
 4 files changed, 40 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cedfb1a8434a..d9b880e977e6 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -207,7 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
-extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
 			u64 snapid,
 			u64 off, u64 *plen, u64 *bno,
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 311ef8d6aa9e..e88a620b9f8a 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -109,9 +109,9 @@ extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
 /* calculate mapping of a file extent to an object */
-extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-					  u64 off, u64 *plen,
-					  u64 *bno, u64 *oxoff, u64 *oxlen);
+extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+					 u64 off, u64 *plen,
+					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
 /* calculate mapping of object to a placement group */
 extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 42119c05e82c..f7b56e23988c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -52,7 +52,7 @@ static int op_has_extent(int op)
 		op == CEPH_OSD_OP_WRITE);
 }
 
-void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 			struct ceph_file_layout *layout,
 			u64 snapid,
 			u64 off, u64 *plen, u64 *bno,
@@ -62,12 +62,15 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
 	u64 orig_len = *plen;
 	u64 objoff, objlen;    /* extent in object */
+	int r;
 
 	reqhead->snapid = cpu_to_le64(snapid);
 
 	/* object extent? */
-	ceph_calc_file_object_mapping(layout, off, plen, bno,
-				      &objoff, &objlen);
+	r = ceph_calc_file_object_mapping(layout, off, plen, bno,
+					  &objoff, &objlen);
+	if (r < 0)
+		return r;
 	if (*plen < orig_len)
 		dout(" skipping last %llu, final file extent %llu~%llu\n",
 		     orig_len - *plen, off, *plen);
@@ -83,7 +86,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 
 	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
 	     *bno, objoff, objlen, req->r_num_pages);
-
+	return 0;
 }
 EXPORT_SYMBOL(ceph_calc_raw_layout);
 
@@ -112,20 +115,25 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
  *
  * fill osd op in request message.
  */
-static void calc_layout(struct ceph_osd_client *osdc,
-			struct ceph_vino vino,
-			struct ceph_file_layout *layout,
-			u64 off, u64 *plen,
-			struct ceph_osd_request *req,
-			struct ceph_osd_req_op *op)
+static int calc_layout(struct ceph_osd_client *osdc,
+		       struct ceph_vino vino,
+		       struct ceph_file_layout *layout,
+		       u64 off, u64 *plen,
+		       struct ceph_osd_request *req,
+		       struct ceph_osd_req_op *op)
 {
 	u64 bno;
+	int r;
 
-	ceph_calc_raw_layout(osdc, layout, vino.snap, off,
-			     plen, &bno, req, op);
+	r = ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+				 plen, &bno, req, op);
+	if (r < 0)
+		return r;
 
 	snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
 	req->r_oid_len = strlen(req->r_oid);
+
+	return r;
 }
 
 /*
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 3124b71a8883..5433fb0eb3c6 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -984,7 +984,7 @@ bad:
  * for now, we write only a single su, until we can
  * pass a stride back to the caller.
  */
-void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 				   u64 off, u64 *plen,
 				   u64 *ono,
 				   u64 *oxoff, u64 *oxlen)
@@ -998,11 +998,17 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
 	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
 	     osize, su);
+	if (su == 0 || sc == 0)
+		goto invalid;
 	su_per_object = osize / su;
+	if (su_per_object == 0)
+		goto invalid;
 	dout("osize %u / su %u = su_per_object %u\n", osize, su,
 	     su_per_object);
 
-	BUG_ON((su & ~PAGE_MASK) != 0);
+	if ((su & ~PAGE_MASK) != 0)
+		goto invalid;
+
 	/* bl = *off / su; */
 	t = off;
 	do_div(t, su);
@@ -1030,6 +1036,14 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 	*plen = *oxlen;
 
 	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+	return 0;
+
+invalid:
+	dout(" invalid layout\n");
+	*ono = 0;
+	*oxoff = 0;
+	*oxlen = 0;
+	return -EINVAL;
 }
 EXPORT_SYMBOL(ceph_calc_file_object_mapping);
 
-- 
cgit v1.2.3


From 6816282dab3a72efe8c0d182c1bc2960d87f4322 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Mon, 24 Sep 2012 21:01:02 -0700
Subject: ceph: propagate layout error on osd request creation

If we are creating an osd request and get an invalid layout, return
an EINVAL to the caller.  We switch up the return to have an error
code instead of NULL implying -ENOMEM.

Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
---
 fs/ceph/addr.c        |  8 ++++----
 fs/ceph/file.c        |  4 ++--
 net/ceph/osd_client.c | 15 +++++++++------
 3 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 452e71a1b753..4469b63c9b7b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -308,8 +308,8 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 				    NULL, 0,
 				    ci->i_truncate_seq, ci->i_truncate_size,
 				    NULL, false, 1, 0);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	/* build page vector */
 	nr_pages = len >> PAGE_CACHE_SHIFT;
@@ -832,8 +832,8 @@ get_more_pages:
 					    ci->i_truncate_size,
 					    &inode->i_mtime, true, 1, 0);
 
-				if (!req) {
-					rc = -ENOMEM;
+				if (IS_ERR(req)) {
+					rc = PTR_ERR(req);
 					unlock_page(page);
 					break;
 				}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ecebbc09bfc7..5840d2aaed15 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -536,8 +536,8 @@ more:
 				    do_sync,
 				    ci->i_truncate_seq, ci->i_truncate_size,
 				    &mtime, false, 2, page_align);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	if (file->f_flags & O_DIRECT) {
 		pages = ceph_get_direct_page_vector(data, num_pages, false);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index f7b56e23988c..ccbdfbba9e53 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -464,6 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 {
 	struct ceph_osd_req_op ops[3];
 	struct ceph_osd_request *req;
+	int r;
 
 	ops[0].op = opcode;
 	ops[0].extent.truncate_seq = truncate_seq;
@@ -482,10 +483,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					 use_mempool,
 					 GFP_NOFS, NULL, NULL);
 	if (!req)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	/* calculate max write size */
-	calc_layout(osdc, vino, layout, off, plen, req, ops);
+	r = calc_layout(osdc, vino, layout, off, plen, req, ops);
+	if (r < 0)
+		return ERR_PTR(r);
 	req->r_file_layout = *layout;  /* keep a copy */
 
 	/* in case it differs from natural (file) alignment that
@@ -1928,8 +1931,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
 				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 				    NULL, 0, truncate_seq, truncate_size, NULL,
 				    false, 1, page_align);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	/* it may be a short read due to an object boundary */
 	req->r_pages = pages;
@@ -1971,8 +1974,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 				    snapc, do_sync,
 				    truncate_seq, truncate_size, mtime,
 				    nofail, 1, page_align);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	/* it may be a short write due to an object boundary */
 	req->r_pages = pages;
-- 
cgit v1.2.3


From d8af9bc16c9e9506c10581111f897be04486218f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Sep 2012 17:23:34 -0400
Subject: SUNRPC: Clean up dprintk messages in rpc_pipe.c

Clean up: The blank space in front of the message must be spaces.
Tabs show up on the console as a graphical character.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/rpc_pipe.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 21fde99e5c56..80f5dd23417d 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1119,8 +1119,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 	if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL))
 		return -ENOMEM;
-	dprintk("RPC:	sending pipefs MOUNT notification for net %p%s\n", net,
-								NET_NAME(net));
+	dprintk("RPC:       sending pipefs MOUNT notification for net %p%s\n",
+		net, NET_NAME(net));
 	sn->pipefs_sb = sb;
 	err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
 					   RPC_PIPEFS_MOUNT,
@@ -1155,8 +1155,8 @@ static void rpc_kill_sb(struct super_block *sb)
 	sn->pipefs_sb = NULL;
 	mutex_unlock(&sn->pipefs_sb_lock);
 	put_net(net);
-	dprintk("RPC:	sending pipefs UMOUNT notification for net %p%s\n", net,
-								NET_NAME(net));
+	dprintk("RPC:       sending pipefs UMOUNT notification for net %p%s\n",
+		net, NET_NAME(net));
 	blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
 					   RPC_PIPEFS_UMOUNT,
 					   sb);
-- 
cgit v1.2.3


From 632f0d0503accb8ab749a1165af99d344579c37b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Sep 2012 17:23:43 -0400
Subject: SUNRPC: Use __func__ in dprintk() in auth_gss.c

Clean up: Some function names have changed, but debugging messages
were never updated.  Automate the construction of the function name
in debugging messages.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth_gss/auth_gss.c | 58 +++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 34c522021004..909dc0c31aab 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -239,7 +239,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
 	}
 	return q;
 err:
-	dprintk("RPC:       gss_fill_context returning %ld\n", -PTR_ERR(p));
+	dprintk("RPC:       %s returning %ld\n", __func__, -PTR_ERR(p));
 	return p;
 }
 
@@ -301,10 +301,10 @@ __gss_find_upcall(struct rpc_pipe *pipe, uid_t uid)
 		if (pos->uid != uid)
 			continue;
 		atomic_inc(&pos->count);
-		dprintk("RPC:       gss_find_upcall found msg %p\n", pos);
+		dprintk("RPC:       %s found msg %p\n", __func__, pos);
 		return pos;
 	}
-	dprintk("RPC:       gss_find_upcall found nothing\n");
+	dprintk("RPC:       %s found nothing\n", __func__);
 	return NULL;
 }
 
@@ -507,8 +507,8 @@ gss_refresh_upcall(struct rpc_task *task)
 	struct rpc_pipe *pipe;
 	int err = 0;
 
-	dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid,
-								cred->cr_uid);
+	dprintk("RPC: %5u %s for uid %u\n",
+		task->tk_pid, __func__, cred->cr_uid);
 	gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred);
 	if (PTR_ERR(gss_msg) == -EAGAIN) {
 		/* XXX: warning on the first, under the assumption we
@@ -539,8 +539,8 @@ gss_refresh_upcall(struct rpc_task *task)
 	spin_unlock(&pipe->lock);
 	gss_release_msg(gss_msg);
 out:
-	dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n",
-			task->tk_pid, cred->cr_uid, err);
+	dprintk("RPC: %5u %s for uid %u result %d\n",
+		task->tk_pid, __func__, cred->cr_uid, err);
 	return err;
 }
 
@@ -553,7 +553,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
 	DEFINE_WAIT(wait);
 	int err = 0;
 
-	dprintk("RPC:       gss_upcall for uid %u\n", cred->cr_uid);
+	dprintk("RPC:       %s for uid %u\n", __func__, cred->cr_uid);
 retry:
 	gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred);
 	if (PTR_ERR(gss_msg) == -EAGAIN) {
@@ -594,8 +594,8 @@ out_intr:
 	finish_wait(&gss_msg->waitqueue, &wait);
 	gss_release_msg(gss_msg);
 out:
-	dprintk("RPC:       gss_create_upcall for uid %u result %d\n",
-			cred->cr_uid, err);
+	dprintk("RPC:       %s for uid %u result %d\n",
+		__func__, cred->cr_uid, err);
 	return err;
 }
 
@@ -681,7 +681,7 @@ err_put_ctx:
 err:
 	kfree(buf);
 out:
-	dprintk("RPC:       gss_pipe_downcall returning %Zd\n", err);
+	dprintk("RPC:       %s returning %Zd\n", __func__, err);
 	return err;
 }
 
@@ -747,8 +747,8 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg)
 	struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg);
 
 	if (msg->errno < 0) {
-		dprintk("RPC:       gss_pipe_destroy_msg releasing msg %p\n",
-				gss_msg);
+		dprintk("RPC:       %s releasing msg %p\n",
+			__func__, gss_msg);
 		atomic_inc(&gss_msg->count);
 		gss_unhash_msg(gss_msg);
 		if (msg->errno == -ETIMEDOUT)
@@ -976,7 +976,7 @@ gss_destroying_context(struct rpc_cred *cred)
 static void
 gss_do_free_ctx(struct gss_cl_ctx *ctx)
 {
-	dprintk("RPC:       gss_free_ctx\n");
+	dprintk("RPC:       %s\n", __func__);
 
 	gss_delete_sec_context(&ctx->gc_gss_ctx);
 	kfree(ctx->gc_wire_ctx.data);
@@ -999,7 +999,7 @@ gss_free_ctx(struct gss_cl_ctx *ctx)
 static void
 gss_free_cred(struct gss_cred *gss_cred)
 {
-	dprintk("RPC:       gss_free_cred %p\n", gss_cred);
+	dprintk("RPC:       %s cred=%p\n", __func__, gss_cred);
 	kfree(gss_cred);
 }
 
@@ -1049,8 +1049,8 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	struct gss_cred	*cred = NULL;
 	int err = -ENOMEM;
 
-	dprintk("RPC:       gss_create_cred for uid %d, flavor %d\n",
-		acred->uid, auth->au_flavor);
+	dprintk("RPC:       %s for uid %d, flavor %d\n",
+		__func__, acred->uid, auth->au_flavor);
 
 	if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS)))
 		goto out_err;
@@ -1069,7 +1069,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	return &cred->gc_base;
 
 out_err:
-	dprintk("RPC:       gss_create_cred failed with error %d\n", err);
+	dprintk("RPC:       %s failed with error %d\n", __func__, err);
 	return ERR_PTR(err);
 }
 
@@ -1127,7 +1127,7 @@ gss_marshal(struct rpc_task *task, __be32 *p)
 	struct kvec	iov;
 	struct xdr_buf	verf_buf;
 
-	dprintk("RPC: %5u gss_marshal\n", task->tk_pid);
+	dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
 
 	*p++ = htonl(RPC_AUTH_GSS);
 	cred_len = p++;
@@ -1253,7 +1253,7 @@ gss_validate(struct rpc_task *task, __be32 *p)
 	u32		flav,len;
 	u32		maj_stat;
 
-	dprintk("RPC: %5u gss_validate\n", task->tk_pid);
+	dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
 
 	flav = ntohl(*p++);
 	if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE)
@@ -1271,20 +1271,20 @@ gss_validate(struct rpc_task *task, __be32 *p)
 	if (maj_stat == GSS_S_CONTEXT_EXPIRED)
 		clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
 	if (maj_stat) {
-		dprintk("RPC: %5u gss_validate: gss_verify_mic returned "
-				"error 0x%08x\n", task->tk_pid, maj_stat);
+		dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n",
+			task->tk_pid, __func__, maj_stat);
 		goto out_bad;
 	}
 	/* We leave it to unwrap to calculate au_rslack. For now we just
 	 * calculate the length of the verifier: */
 	cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2;
 	gss_put_ctx(ctx);
-	dprintk("RPC: %5u gss_validate: gss_verify_mic succeeded.\n",
-			task->tk_pid);
+	dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n",
+			task->tk_pid, __func__);
 	return p + XDR_QUADLEN(len);
 out_bad:
 	gss_put_ctx(ctx);
-	dprintk("RPC: %5u gss_validate failed.\n", task->tk_pid);
+	dprintk("RPC: %5u %s failed.\n", task->tk_pid, __func__);
 	return NULL;
 }
 
@@ -1466,7 +1466,7 @@ gss_wrap_req(struct rpc_task *task,
 	struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred);
 	int             status = -EIO;
 
-	dprintk("RPC: %5u gss_wrap_req\n", task->tk_pid);
+	dprintk("RPC: %5u %s\n", task->tk_pid, __func__);
 	if (ctx->gc_proc != RPC_GSS_PROC_DATA) {
 		/* The spec seems a little ambiguous here, but I think that not
 		 * wrapping context destruction requests makes the most sense.
@@ -1489,7 +1489,7 @@ gss_wrap_req(struct rpc_task *task,
 	}
 out:
 	gss_put_ctx(ctx);
-	dprintk("RPC: %5u gss_wrap_req returning %d\n", task->tk_pid, status);
+	dprintk("RPC: %5u %s returning %d\n", task->tk_pid, __func__, status);
 	return status;
 }
 
@@ -1604,8 +1604,8 @@ out_decode:
 	status = gss_unwrap_req_decode(decode, rqstp, p, obj);
 out:
 	gss_put_ctx(ctx);
-	dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid,
-			status);
+	dprintk("RPC: %5u %s returning %d\n",
+		task->tk_pid, __func__, status);
 	return status;
 }
 
-- 
cgit v1.2.3


From 1b63a75180c6c65c71655c250a4e6b578ba7d1c0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Sep 2012 17:23:52 -0400
Subject: SUNRPC: Refactor rpc_clone_client()

rpc_clone_client() does most of the same tasks as rpc_new_client(),
so there is an opportunity for code re-use.  Create a generic helper
that makes it easy to clone an RPC client while replacing any of the
clnt's parameters.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 83 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index fa48c60aef23..afbeefab6600 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -490,59 +490,62 @@ EXPORT_SYMBOL_GPL(rpc_create);
  * same transport while varying parameters such as the authentication
  * flavour.
  */
-struct rpc_clnt *
-rpc_clone_client(struct rpc_clnt *clnt)
+static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
+					   struct rpc_clnt *clnt)
 {
-	struct rpc_clnt *new;
 	struct rpc_xprt *xprt;
-	int err = -ENOMEM;
+	struct rpc_clnt *new;
+	int err;
 
-	new = kmemdup(clnt, sizeof(*new), GFP_KERNEL);
-	if (!new)
-		goto out_no_clnt;
-	new->cl_parent = clnt;
-	/* Turn off autobind on clones */
-	new->cl_autobind = 0;
-	INIT_LIST_HEAD(&new->cl_tasks);
-	spin_lock_init(&new->cl_lock);
-	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_timeout->to_initval);
-	new->cl_metrics = rpc_alloc_iostats(clnt);
-	if (new->cl_metrics == NULL)
-		goto out_no_stats;
-	if (clnt->cl_principal) {
-		new->cl_principal = kstrdup(clnt->cl_principal, GFP_KERNEL);
-		if (new->cl_principal == NULL)
-			goto out_no_principal;
-	}
+	err = -ENOMEM;
 	rcu_read_lock();
 	xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
 	rcu_read_unlock();
 	if (xprt == NULL)
-		goto out_no_transport;
-	rcu_assign_pointer(new->cl_xprt, xprt);
-	atomic_set(&new->cl_count, 1);
-	err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name);
-	if (err != 0)
-		goto out_no_path;
-	rpc_clnt_set_nodename(new, utsname()->nodename);
-	if (new->cl_auth)
-		atomic_inc(&new->cl_auth->au_count);
+		goto out_err;
+	args->servername = xprt->servername;
+
+	new = rpc_new_client(args, xprt);
+	if (IS_ERR(new)) {
+		err = PTR_ERR(new);
+		goto out_put;
+	}
+
 	atomic_inc(&clnt->cl_count);
-	rpc_register_client(new);
-	rpciod_up();
+	new->cl_parent = clnt;
+
+	/* Turn off autobind on clones */
+	new->cl_autobind = 0;
+	new->cl_softrtry = clnt->cl_softrtry;
+	new->cl_discrtry = clnt->cl_discrtry;
+	new->cl_chatty = clnt->cl_chatty;
 	return new;
-out_no_path:
+
+out_put:
 	xprt_put(xprt);
-out_no_transport:
-	kfree(new->cl_principal);
-out_no_principal:
-	rpc_free_iostats(new->cl_metrics);
-out_no_stats:
-	kfree(new);
-out_no_clnt:
+out_err:
 	dprintk("RPC:       %s: returned error %d\n", __func__, err);
 	return ERR_PTR(err);
 }
+
+/**
+ * rpc_clone_client - Clone an RPC client structure
+ *
+ * @clnt: RPC client whose parameters are copied
+ *
+ * Returns a fresh RPC client or an ERR_PTR.
+ */
+struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt)
+{
+	struct rpc_create_args args = {
+		.program	= clnt->cl_program,
+		.prognumber	= clnt->cl_prog,
+		.version	= clnt->cl_vers,
+		.authflavor	= clnt->cl_auth->au_flavor,
+		.client_name	= clnt->cl_principal,
+	};
+	return __rpc_clone_client(&args, clnt);
+}
 EXPORT_SYMBOL_GPL(rpc_clone_client);
 
 /*
-- 
cgit v1.2.3


From ba9b584c1dc37851d9c6ca6d0d2ccba55d9aad04 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 14 Sep 2012 17:24:02 -0400
Subject: SUNRPC: Introduce rpc_clone_client_set_auth()

An ULP is supposed to be able to replace a GSS rpc_auth object with
another GSS rpc_auth object using rpcauth_create().  However,
rpcauth_create() in 3.5 reliably fails with -EEXIST in this case.
This is because when gss_create() attempts to create the upcall pipes,
sometimes they are already there.  For example if a pipe FS mount
event occurs, or a previous GSS flavor was in use for this rpc_clnt.

It turns out that's not the only problem here.  While working on a
fix for the above problem, we noticed that replacing an rpc_clnt's
rpc_auth is not safe, since dereferencing the cl_auth field is not
protected in any way.

So we're deprecating the ability of rpcauth_create() to switch an
rpc_clnt's security flavor during normal operation.  Instead, let's
add a fresh API that clones an rpc_clnt and gives the clone a new
flavor before it's used.

This makes immediate use of the new __rpc_clone_client() helper.

This can be used in a similar fashion to rpcauth_create() when a
client is hunting for the correct security flavor.  Instead of
replacing an rpc_clnt's security flavor in a loop, the ULP replaces
the whole rpc_clnt.

To fix the -EEXIST problem, any ULP logic that relies on replacing
an rpc_clnt's rpc_auth with rpcauth_create() must be changed to use
this API instead.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c             | 13 ++-----------
 fs/nfs/nfs4namespace.c      | 14 +-------------
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 22 ++++++++++++++++++++++
 4 files changed, 27 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 99694442b93f..143149db3440 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -668,7 +668,8 @@ int nfs_init_server_rpcclient(struct nfs_server *server,
 {
 	struct nfs_client *clp = server->nfs_client;
 
-	server->client = rpc_clone_client(clp->cl_rpcclient);
+	server->client = rpc_clone_client_set_auth(clp->cl_rpcclient,
+							pseudoflavour);
 	if (IS_ERR(server->client)) {
 		dprintk("%s: couldn't create rpc_client!\n", __func__);
 		return PTR_ERR(server->client);
@@ -678,16 +679,6 @@ int nfs_init_server_rpcclient(struct nfs_server *server,
 			timeo,
 			sizeof(server->client->cl_timeout_default));
 	server->client->cl_timeout = &server->client->cl_timeout_default;
-
-	if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) {
-		struct rpc_auth *auth;
-
-		auth = rpcauth_create(pseudoflavour, server->client);
-		if (IS_ERR(auth)) {
-			dprintk("%s: couldn't create credcache!\n", __func__);
-			return PTR_ERR(auth);
-		}
-	}
 	server->client->cl_softrtry = 0;
 	if (server->flags & NFS_MOUNT_SOFT)
 		server->client->cl_softrtry = 1;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 4fdeb1b7042e..79fbb61ce202 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -192,25 +192,13 @@ out:
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode,
 					struct qstr *name)
 {
-	struct rpc_clnt *clone;
-	struct rpc_auth *auth;
 	rpc_authflavor_t flavor;
 
 	flavor = nfs4_negotiate_security(inode, name);
 	if ((int)flavor < 0)
 		return ERR_PTR((int)flavor);
 
-	clone = rpc_clone_client(clnt);
-	if (IS_ERR(clone))
-		return clone;
-
-	auth = rpcauth_create(flavor, clone);
-	if (IS_ERR(auth)) {
-		rpc_shutdown_client(clone);
-		clone = ERR_PTR(-EIO);
-	}
-
-	return clone;
+	return rpc_clone_client_set_auth(clnt, flavor);
 }
 
 static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 523547ecfee2..34206b84d8da 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -130,6 +130,8 @@ struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
 				const struct rpc_program *, u32);
 void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
+struct rpc_clnt *rpc_clone_client_set_auth(struct rpc_clnt *,
+				rpc_authflavor_t);
 void		rpc_shutdown_client(struct rpc_clnt *);
 void		rpc_release_client(struct rpc_clnt *);
 void		rpc_task_release_client(struct rpc_task *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index afbeefab6600..cdc7564b4512 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -548,6 +548,28 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt)
 }
 EXPORT_SYMBOL_GPL(rpc_clone_client);
 
+/**
+ * rpc_clone_client_set_auth - Clone an RPC client structure and set its auth
+ *
+ * @clnt: RPC client whose parameters are copied
+ * @flavor: security flavor for new client
+ *
+ * Returns a fresh RPC client or an ERR_PTR.
+ */
+struct rpc_clnt *
+rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
+{
+	struct rpc_create_args args = {
+		.program	= clnt->cl_program,
+		.prognumber	= clnt->cl_prog,
+		.version	= clnt->cl_vers,
+		.authflavor	= flavor,
+		.client_name	= clnt->cl_principal,
+	};
+	return __rpc_clone_client(&args, clnt);
+}
+EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth);
+
 /*
  * Kill all tasks for the given client.
  * XXX: kill their descendants as well?
-- 
cgit v1.2.3


From 6299b669b1340b9f7de2bc2bd565921a1494e7f7 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 4 Oct 2012 17:12:08 -0700
Subject: sections: fix section conflicts in net/can

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/can/af_can.c | 2 +-
 net/can/bcm.c    | 2 +-
 net/can/gw.c     | 2 +-
 net/can/raw.c    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/can/af_can.c b/net/can/af_can.c
index 821022a7214f..ddac1ee2ed20 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -63,7 +63,7 @@
 
 #include "af_can.h"
 
-static __initdata const char banner[] = KERN_INFO
+static __initconst const char banner[] = KERN_INFO
 	"can: controller area network core (" CAN_VERSION_STRING ")\n";
 
 MODULE_DESCRIPTION("Controller Area Network PF_CAN core");
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 151b7730c12c..6f747582718e 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -77,7 +77,7 @@
 		     (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
 
 #define CAN_BCM_VERSION CAN_VERSION
-static __initdata const char banner[] = KERN_INFO
+static __initconst const char banner[] = KERN_INFO
 	"can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n";
 
 MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
diff --git a/net/can/gw.c b/net/can/gw.c
index 127879c55fb6..1f5c9785a262 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -58,7 +58,7 @@
 #include <net/sock.h>
 
 #define CAN_GW_VERSION "20101209"
-static __initdata const char banner[] =
+static __initconst const char banner[] =
 	KERN_INFO "can: netlink gateway (rev " CAN_GW_VERSION ")\n";
 
 MODULE_DESCRIPTION("PF_CAN netlink gateway");
diff --git a/net/can/raw.c b/net/can/raw.c
index 3e9c89356a93..5b0e3e330d97 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -55,7 +55,7 @@
 #include <net/net_namespace.h>
 
 #define CAN_RAW_VERSION CAN_VERSION
-static __initdata const char banner[] =
+static __initconst const char banner[] =
 	KERN_INFO "can: raw protocol (rev " CAN_RAW_VERSION ")\n";
 
 MODULE_DESCRIPTION("PF_CAN raw protocol");
-- 
cgit v1.2.3


From 04a6f82cf01aeef9fb058b2fca0ef1fe0a09c2fa Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 4 Oct 2012 17:12:11 -0700
Subject: sections: fix section conflicts in net

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/net_namespace.h | 2 ++
 net/decnet/dn_rules.c       | 2 +-
 net/ipv4/fib_rules.c        | 2 +-
 net/ipv4/ipmr.c             | 2 +-
 net/ipv6/addrlabel.c        | 2 +-
 net/ipv6/fib6_rules.c       | 2 +-
 net/ipv6/ip6mr.c            | 2 +-
 7 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 4faf6612ecac..95e646641184 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -257,10 +257,12 @@ static inline struct net *read_pnet(struct net * const *pnet)
 #define __net_init
 #define __net_exit
 #define __net_initdata
+#define __net_initconst
 #else
 #define __net_init	__init
 #define __net_exit	__exit_refok
 #define __net_initdata	__initdata
+#define __net_initconst	__initconst
 #endif
 
 struct pernet_operations {
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index e65f2c856e06..faf7cc3483fe 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -220,7 +220,7 @@ static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
 	dn_rt_cache_flush(-1);
 }
 
-static const struct fib_rules_ops __net_initdata dn_fib_rules_ops_template = {
+static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = {
 	.family		= AF_DECnet,
 	.rule_size	= sizeof(struct dn_fib_rule),
 	.addr_size	= sizeof(u16),
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 274309d3aded..26aa65d1fce4 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -262,7 +262,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
 	rt_cache_flush(ops->fro_net);
 }
 
-static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
+static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.family		= AF_INET,
 	.rule_size	= sizeof(struct fib4_rule),
 	.addr_size	= sizeof(u32),
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 1daa95c2a0ba..6168c4dc58b1 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -221,7 +221,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 	return 0;
 }
 
-static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
+static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
 	.family		= RTNL_FAMILY_IPMR,
 	.rule_size	= sizeof(struct ipmr_rule),
 	.addr_size	= sizeof(u32),
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 4be23da32b89..ff76eecfd622 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -79,7 +79,7 @@ struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl)
 
 #define IPV6_ADDR_LABEL_DEFAULT	0xffffffffUL
 
-static const __net_initdata struct ip6addrlbl_init_table
+static const __net_initconst struct ip6addrlbl_init_table
 {
 	const struct in6_addr *prefix;
 	int prefixlen;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 0ff1cfd55bc4..d9fb9110f607 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -238,7 +238,7 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
 	       + nla_total_size(16); /* src */
 }
 
-static const struct fib_rules_ops __net_initdata fib6_rules_ops_template = {
+static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
 	.family			= AF_INET6,
 	.rule_size		= sizeof(struct fib6_rule),
 	.addr_size		= sizeof(struct in6_addr),
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 08ea3f0b6e55..f7c7c6319720 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -205,7 +205,7 @@ static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
 	return 0;
 }
 
-static const struct fib_rules_ops __net_initdata ip6mr_rules_ops_template = {
+static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
 	.family		= RTNL_FAMILY_IP6MR,
 	.rule_size	= sizeof(struct ip6mr_rule),
 	.addr_size	= sizeof(struct in6_addr),
-- 
cgit v1.2.3


From 6dc878a8ca39e93f70c42f3dd7260bde10c1e0f1 Mon Sep 17 00:00:00 2001
From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Thu, 4 Oct 2012 20:15:48 +0000
Subject: netlink: add reference of module in netlink_dump_start

I get a panic when I use ss -a and rmmod inet_diag at the
same time.

It's because netlink_dump uses inet_diag_dump which belongs to module
inet_diag.

I searched the code and found that many modules have the same
problem.  We need to add a reference to the module that cb->dump
belongs to.

Thanks for all the help from Stephen, Jan, Eric, Steffen and Pablo.

Changes from v3:
change netlink_dump_start to inline, as suggested by Pablo and
Eric.

Changes from v2:
delete netlink_dump_done, and call module_put in netlink_dump
and netlink_sock_destruct.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h  | 20 ++++++++++++++++----
 net/netlink/af_netlink.c | 29 +++++++++++++++++++++--------
 2 files changed, 37 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index f80c56ac4d82..6d3af05c107c 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -245,6 +245,8 @@ struct netlink_callback {
 					struct netlink_callback *cb);
 	int			(*done)(struct netlink_callback *cb);
 	void			*data;
+	/* the module that the dump function belongs to */
+	struct module		*module;
 	u16			family;
 	u16			min_dump_alloc;
 	unsigned int		prev_seq, seq;
@@ -262,14 +264,24 @@ __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int fla
 
 struct netlink_dump_control {
 	int (*dump)(struct sk_buff *skb, struct netlink_callback *);
-	int (*done)(struct netlink_callback*);
+	int (*done)(struct netlink_callback *);
 	void *data;
+	struct module *module;
 	u16 min_dump_alloc;
 };
 
-extern int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
-			      const struct nlmsghdr *nlh,
-			      struct netlink_dump_control *control);
+extern int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+				const struct nlmsghdr *nlh,
+				struct netlink_dump_control *control);
+static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+				     const struct nlmsghdr *nlh,
+				     struct netlink_dump_control *control)
+{
+	if (!control->module)
+		control->module = THIS_MODULE;
+
+	return __netlink_dump_start(ssk, skb, nlh, control);
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0f2e3ad69c47..01e944a017a4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -169,6 +169,8 @@ static void netlink_sock_destruct(struct sock *sk)
 	if (nlk->cb) {
 		if (nlk->cb->done)
 			nlk->cb->done(nlk->cb);
+
+		module_put(nlk->cb->module);
 		netlink_destroy_callback(nlk->cb);
 	}
 
@@ -1758,6 +1760,7 @@ static int netlink_dump(struct sock *sk)
 	nlk->cb = NULL;
 	mutex_unlock(nlk->cb_mutex);
 
+	module_put(cb->module);
 	netlink_consume_callback(cb);
 	return 0;
 
@@ -1767,9 +1770,9 @@ errout_skb:
 	return err;
 }
 
-int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
-		       const struct nlmsghdr *nlh,
-		       struct netlink_dump_control *control)
+int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+			 const struct nlmsghdr *nlh,
+			 struct netlink_dump_control *control)
 {
 	struct netlink_callback *cb;
 	struct sock *sk;
@@ -1784,6 +1787,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	cb->done = control->done;
 	cb->nlh = nlh;
 	cb->data = control->data;
+	cb->module = control->module;
 	cb->min_dump_alloc = control->min_dump_alloc;
 	atomic_inc(&skb->users);
 	cb->skb = skb;
@@ -1794,19 +1798,28 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 		return -ECONNREFUSED;
 	}
 	nlk = nlk_sk(sk);
-	/* A dump is in progress... */
+
 	mutex_lock(nlk->cb_mutex);
+	/* A dump is in progress... */
 	if (nlk->cb) {
 		mutex_unlock(nlk->cb_mutex);
 		netlink_destroy_callback(cb);
-		sock_put(sk);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto out;
 	}
+	/* add reference of module which cb->dump belongs to */
+	if (!try_module_get(cb->module)) {
+		mutex_unlock(nlk->cb_mutex);
+		netlink_destroy_callback(cb);
+		ret = -EPROTONOSUPPORT;
+		goto out;
+	}
+
 	nlk->cb = cb;
 	mutex_unlock(nlk->cb_mutex);
 
 	ret = netlink_dump(sk);
-
+out:
 	sock_put(sk);
 
 	if (ret)
@@ -1817,7 +1830,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	 */
 	return -EINTR;
 }
-EXPORT_SYMBOL(netlink_dump_start);
+EXPORT_SYMBOL(__netlink_dump_start);
 
 void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
 {
-- 
cgit v1.2.3


From acb600def2110b1310466c0e485c0d26299898ae Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 5 Oct 2012 06:23:55 +0000
Subject: net: remove skb recycling

Over time, skb recycling infrastructure got little interest and
many bugs. Generic rx path skb allocation is now using page
fragments for efficient GRO / TCP coalescing, and recycling
a tx skb for rx path is not worth the pain.

The last identified bug is that fat skbs can be recycled
and can end up using high-order pages after a few iterations.

With help from Maxime Bizon, who pointed out that commit
87151b8689d (net: allow pskb_expand_head() to get maximum tailroom)
introduced this regression for recycled skbs.

Instead of fixing this bug, let's remove skb recycling.

Drivers wanting really hot skbs should use build_skb() anyway,
to allocate/populate sk_buff right before netif_receive_skb()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Maxime Bizon <mbizon@freebox.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/calxeda/xgmac.c              | 19 +--------
 drivers/net/ethernet/freescale/gianfar.c          | 27 ++-----------
 drivers/net/ethernet/freescale/gianfar.h          |  2 -
 drivers/net/ethernet/freescale/ucc_geth.c         | 29 +++-----------
 drivers/net/ethernet/freescale/ucc_geth.h         |  2 -
 drivers/net/ethernet/marvell/mv643xx_eth.c        | 18 +--------
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      |  1 -
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 20 +---------
 include/linux/skbuff.h                            | 24 ------------
 net/core/skbuff.c                                 | 47 -----------------------
 10 files changed, 16 insertions(+), 173 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c
index 2b4b4f529ab4..16814b34d4b6 100644
--- a/drivers/net/ethernet/calxeda/xgmac.c
+++ b/drivers/net/ethernet/calxeda/xgmac.c
@@ -375,7 +375,6 @@ struct xgmac_priv {
 	unsigned int tx_tail;
 
 	void __iomem *base;
-	struct sk_buff_head rx_recycle;
 	unsigned int dma_buf_sz;
 	dma_addr_t dma_rx_phy;
 	dma_addr_t dma_tx_phy;
@@ -672,9 +671,7 @@ static void xgmac_rx_refill(struct xgmac_priv *priv)
 		p = priv->dma_rx + entry;
 
 		if (priv->rx_skbuff[entry] == NULL) {
-			skb = __skb_dequeue(&priv->rx_recycle);
-			if (skb == NULL)
-				skb = netdev_alloc_skb(priv->dev, priv->dma_buf_sz);
+			skb = netdev_alloc_skb(priv->dev, priv->dma_buf_sz);
 			if (unlikely(skb == NULL))
 				break;
 
@@ -887,17 +884,7 @@ static void xgmac_tx_complete(struct xgmac_priv *priv)
 				       desc_get_buf_len(p), DMA_TO_DEVICE);
 		}
 
-		/*
-		 * If there's room in the queue (limit it to size)
-		 * we add this skb back into the pool,
-		 * if it's the right size.
-		 */
-		if ((skb_queue_len(&priv->rx_recycle) <
-			DMA_RX_RING_SZ) &&
-			skb_recycle_check(skb, priv->dma_buf_sz))
-			__skb_queue_head(&priv->rx_recycle, skb);
-		else
-			dev_kfree_skb(skb);
+		dev_kfree_skb(skb);
 	}
 
 	if (dma_ring_space(priv->tx_head, priv->tx_tail, DMA_TX_RING_SZ) >
@@ -1016,7 +1003,6 @@ static int xgmac_open(struct net_device *dev)
 			dev->dev_addr);
 	}
 
-	skb_queue_head_init(&priv->rx_recycle);
 	memset(&priv->xstats, 0, sizeof(struct xgmac_extra_stats));
 
 	/* Initialize the XGMAC and descriptors */
@@ -1053,7 +1039,6 @@ static int xgmac_stop(struct net_device *dev)
 		napi_disable(&priv->napi);
 
 	writel(0, priv->base + XGMAC_DMA_INTR_ENA);
-	skb_queue_purge(&priv->rx_recycle);
 
 	/* Disable the MAC core */
 	xgmac_mac_disable(priv->base);
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index a1b52ec3b930..1d03dcdd5e56 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -1765,7 +1765,6 @@ static void free_skb_resources(struct gfar_private *priv)
 			  sizeof(struct rxbd8) * priv->total_rx_ring_size,
 			  priv->tx_queue[0]->tx_bd_base,
 			  priv->tx_queue[0]->tx_bd_dma_base);
-	skb_queue_purge(&priv->rx_recycle);
 }
 
 void gfar_start(struct net_device *dev)
@@ -1943,8 +1942,6 @@ static int gfar_enet_open(struct net_device *dev)
 
 	enable_napi(priv);
 
-	skb_queue_head_init(&priv->rx_recycle);
-
 	/* Initialize a bunch of registers */
 	init_registers(dev);
 
@@ -2533,16 +2530,7 @@ static int gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 
 		bytes_sent += skb->len;
 
-		/* If there's room in the queue (limit it to rx_buffer_size)
-		 * we add this skb back into the pool, if it's the right size
-		 */
-		if (skb_queue_len(&priv->rx_recycle) < rx_queue->rx_ring_size &&
-		    skb_recycle_check(skb, priv->rx_buffer_size +
-				      RXBUF_ALIGNMENT)) {
-			gfar_align_skb(skb);
-			skb_queue_head(&priv->rx_recycle, skb);
-		} else
-			dev_kfree_skb_any(skb);
+		dev_kfree_skb_any(skb);
 
 		tx_queue->tx_skbuff[skb_dirtytx] = NULL;
 
@@ -2608,7 +2596,7 @@ static void gfar_new_rxbdp(struct gfar_priv_rx_q *rx_queue, struct rxbd8 *bdp,
 static struct sk_buff *gfar_alloc_skb(struct net_device *dev)
 {
 	struct gfar_private *priv = netdev_priv(dev);
-	struct sk_buff *skb = NULL;
+	struct sk_buff *skb;
 
 	skb = netdev_alloc_skb(dev, priv->rx_buffer_size + RXBUF_ALIGNMENT);
 	if (!skb)
@@ -2621,14 +2609,7 @@ static struct sk_buff *gfar_alloc_skb(struct net_device *dev)
 
 struct sk_buff *gfar_new_skb(struct net_device *dev)
 {
-	struct gfar_private *priv = netdev_priv(dev);
-	struct sk_buff *skb = NULL;
-
-	skb = skb_dequeue(&priv->rx_recycle);
-	if (!skb)
-		skb = gfar_alloc_skb(dev);
-
-	return skb;
+	return gfar_alloc_skb(dev);
 }
 
 static inline void count_errors(unsigned short status, struct net_device *dev)
@@ -2787,7 +2768,7 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit)
 			if (unlikely(!newskb))
 				newskb = skb;
 			else if (skb)
-				skb_queue_head(&priv->rx_recycle, skb);
+				dev_kfree_skb(skb);
 		} else {
 			/* Increment the number of packets */
 			rx_queue->stats.rx_packets++;
diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h
index 4141ef2ddafc..22eabc13ca99 100644
--- a/drivers/net/ethernet/freescale/gianfar.h
+++ b/drivers/net/ethernet/freescale/gianfar.h
@@ -1080,8 +1080,6 @@ struct gfar_private {
 
 	u32 cur_filer_idx;
 
-	struct sk_buff_head rx_recycle;
-
 	/* RX queue filer rule set*/
 	struct ethtool_rx_list rx_list;
 	struct mutex rx_queue_access;
diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c
index 164288439220..dfa0aaaab009 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.c
+++ b/drivers/net/ethernet/freescale/ucc_geth.c
@@ -209,14 +209,12 @@ static struct list_head *dequeue(struct list_head *lh)
 static struct sk_buff *get_new_skb(struct ucc_geth_private *ugeth,
 		u8 __iomem *bd)
 {
-	struct sk_buff *skb = NULL;
+	struct sk_buff *skb;
 
-	skb = __skb_dequeue(&ugeth->rx_recycle);
+	skb = netdev_alloc_skb(ugeth->ndev,
+			       ugeth->ug_info->uf_info.max_rx_buf_length +
+			       UCC_GETH_RX_DATA_BUF_ALIGNMENT);
 	if (!skb)
-		skb = netdev_alloc_skb(ugeth->ndev,
-				      ugeth->ug_info->uf_info.max_rx_buf_length +
-				      UCC_GETH_RX_DATA_BUF_ALIGNMENT);
-	if (skb == NULL)
 		return NULL;
 
 	/* We need the data buffer to be aligned properly.  We will reserve
@@ -2020,8 +2018,6 @@ static void ucc_geth_memclean(struct ucc_geth_private *ugeth)
 		iounmap(ugeth->ug_regs);
 		ugeth->ug_regs = NULL;
 	}
-
-	skb_queue_purge(&ugeth->rx_recycle);
 }
 
 static void ucc_geth_set_multi(struct net_device *dev)
@@ -2230,8 +2226,6 @@ static int ucc_struct_init(struct ucc_geth_private *ugeth)
 		return -ENOMEM;
 	}
 
-	skb_queue_head_init(&ugeth->rx_recycle);
-
 	return 0;
 }
 
@@ -3274,12 +3268,7 @@ static int ucc_geth_rx(struct ucc_geth_private *ugeth, u8 rxQ, int rx_work_limit
 			if (netif_msg_rx_err(ugeth))
 				ugeth_err("%s, %d: ERROR!!! skb - 0x%08x",
 					   __func__, __LINE__, (u32) skb);
-			if (skb) {
-				skb->data = skb->head + NET_SKB_PAD;
-				skb->len = 0;
-				skb_reset_tail_pointer(skb);
-				__skb_queue_head(&ugeth->rx_recycle, skb);
-			}
+			dev_free_skb(skb);
 
 			ugeth->rx_skbuff[rxQ][ugeth->skb_currx[rxQ]] = NULL;
 			dev->stats.rx_dropped++;
@@ -3349,13 +3338,7 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ)
 
 		dev->stats.tx_packets++;
 
-		if (skb_queue_len(&ugeth->rx_recycle) < RX_BD_RING_LEN &&
-			     skb_recycle_check(skb,
-				    ugeth->ug_info->uf_info.max_rx_buf_length +
-				    UCC_GETH_RX_DATA_BUF_ALIGNMENT))
-			__skb_queue_head(&ugeth->rx_recycle, skb);
-		else
-			dev_kfree_skb(skb);
+		dev_kfree_skb(skb);
 
 		ugeth->tx_skbuff[txQ][ugeth->skb_dirtytx[txQ]] = NULL;
 		ugeth->skb_dirtytx[txQ] =
diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h
index f71b3e7b12de..75f337163ce3 100644
--- a/drivers/net/ethernet/freescale/ucc_geth.h
+++ b/drivers/net/ethernet/freescale/ucc_geth.h
@@ -1214,8 +1214,6 @@ struct ucc_geth_private {
 	/* index of the first skb which hasn't been transmitted yet. */
 	u16 skb_dirtytx[NUM_TX_QUEUES];
 
-	struct sk_buff_head rx_recycle;
-
 	struct ugeth_mii_info *mii_info;
 	struct phy_device *phydev;
 	phy_interface_t phy_interface;
diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 087b9e0669f1..84c13263c514 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -412,7 +412,6 @@ struct mv643xx_eth_private {
 	u8 work_rx_refill;
 
 	int skb_size;
-	struct sk_buff_head rx_recycle;
 
 	/*
 	 * RX state.
@@ -673,9 +672,7 @@ static int rxq_refill(struct rx_queue *rxq, int budget)
 		struct rx_desc *rx_desc;
 		int size;
 
-		skb = __skb_dequeue(&mp->rx_recycle);
-		if (skb == NULL)
-			skb = netdev_alloc_skb(mp->dev, mp->skb_size);
+		skb = netdev_alloc_skb(mp->dev, mp->skb_size);
 
 		if (skb == NULL) {
 			mp->oom = 1;
@@ -989,14 +986,7 @@ static int txq_reclaim(struct tx_queue *txq, int budget, int force)
 				       desc->byte_cnt, DMA_TO_DEVICE);
 		}
 
-		if (skb != NULL) {
-			if (skb_queue_len(&mp->rx_recycle) <
-					mp->rx_ring_size &&
-			    skb_recycle_check(skb, mp->skb_size))
-				__skb_queue_head(&mp->rx_recycle, skb);
-			else
-				dev_kfree_skb(skb);
-		}
+		dev_kfree_skb(skb);
 	}
 
 	__netif_tx_unlock(nq);
@@ -2349,8 +2339,6 @@ static int mv643xx_eth_open(struct net_device *dev)
 
 	napi_enable(&mp->napi);
 
-	skb_queue_head_init(&mp->rx_recycle);
-
 	mp->int_mask = INT_EXT;
 
 	for (i = 0; i < mp->rxq_count; i++) {
@@ -2445,8 +2433,6 @@ static int mv643xx_eth_stop(struct net_device *dev)
 	mib_counters_update(mp);
 	del_timer_sync(&mp->mib_counters_timer);
 
-	skb_queue_purge(&mp->rx_recycle);
-
 	for (i = 0; i < mp->rxq_count; i++)
 		rxq_deinit(mp->rxq + i);
 	for (i = 0; i < mp->txq_count; i++)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index e872e1da3137..7d51a65ab099 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -50,7 +50,6 @@ struct stmmac_priv {
 	unsigned int dirty_rx;
 	struct sk_buff **rx_skbuff;
 	dma_addr_t *rx_skbuff_dma;
-	struct sk_buff_head rx_recycle;
 
 	struct net_device *dev;
 	dma_addr_t dma_rx_phy;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3be88331d17a..c6cdbc4eb05e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -747,18 +747,7 @@ static void stmmac_tx(struct stmmac_priv *priv)
 		priv->hw->ring->clean_desc3(p);
 
 		if (likely(skb != NULL)) {
-			/*
-			 * If there's room in the queue (limit it to size)
-			 * we add this skb back into the pool,
-			 * if it's the right size.
-			 */
-			if ((skb_queue_len(&priv->rx_recycle) <
-				priv->dma_rx_size) &&
-				skb_recycle_check(skb, priv->dma_buf_sz))
-				__skb_queue_head(&priv->rx_recycle, skb);
-			else
-				dev_kfree_skb(skb);
-
+			dev_kfree_skb(skb);
 			priv->tx_skbuff[entry] = NULL;
 		}
 
@@ -1169,7 +1158,6 @@ static int stmmac_open(struct net_device *dev)
 	priv->eee_enabled = stmmac_eee_init(priv);
 
 	napi_enable(&priv->napi);
-	skb_queue_head_init(&priv->rx_recycle);
 	netif_start_queue(dev);
 
 	return 0;
@@ -1222,7 +1210,6 @@ static int stmmac_release(struct net_device *dev)
 		kfree(priv->tm);
 #endif
 	napi_disable(&priv->napi);
-	skb_queue_purge(&priv->rx_recycle);
 
 	/* Free the IRQ lines */
 	free_irq(dev->irq, dev);
@@ -1388,10 +1375,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv)
 		if (likely(priv->rx_skbuff[entry] == NULL)) {
 			struct sk_buff *skb;
 
-			skb = __skb_dequeue(&priv->rx_recycle);
-			if (skb == NULL)
-				skb = netdev_alloc_skb_ip_align(priv->dev,
-								bfsize);
+			skb = netdev_alloc_skb_ip_align(priv->dev, bfsize);
 
 			if (unlikely(skb == NULL))
 				break;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b33a3a1f205e..6a2c34e6d962 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -589,9 +589,6 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
 }
 
-extern void skb_recycle(struct sk_buff *skb);
-extern bool skb_recycle_check(struct sk_buff *skb, int skb_size);
-
 extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
 extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
 extern struct sk_buff *skb_clone(struct sk_buff *skb,
@@ -2645,27 +2642,6 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb)
 
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 
-static inline bool skb_is_recycleable(const struct sk_buff *skb, int skb_size)
-{
-	if (irqs_disabled())
-		return false;
-
-	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)
-		return false;
-
-	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
-		return false;
-
-	skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD);
-	if (skb_end_offset(skb) < skb_size)
-		return false;
-
-	if (skb_shared(skb) || skb_cloned(skb))
-		return false;
-
-	return true;
-}
-
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
  * @skb: skb to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cdc28598f4ef..6e04b1fa11f2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -655,53 +655,6 @@ void consume_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(consume_skb);
 
-/**
- * 	skb_recycle - clean up an skb for reuse
- * 	@skb: buffer
- *
- * 	Recycles the skb to be reused as a receive buffer. This
- * 	function does any necessary reference count dropping, and
- * 	cleans up the skbuff as if it just came from __alloc_skb().
- */
-void skb_recycle(struct sk_buff *skb)
-{
-	struct skb_shared_info *shinfo;
-
-	skb_release_head_state(skb);
-
-	shinfo = skb_shinfo(skb);
-	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
-	atomic_set(&shinfo->dataref, 1);
-
-	memset(skb, 0, offsetof(struct sk_buff, tail));
-	skb->data = skb->head + NET_SKB_PAD;
-	skb_reset_tail_pointer(skb);
-}
-EXPORT_SYMBOL(skb_recycle);
-
-/**
- *	skb_recycle_check - check if skb can be reused for receive
- *	@skb: buffer
- *	@skb_size: minimum receive buffer size
- *
- *	Checks that the skb passed in is not shared or cloned, and
- *	that it is linear and its head portion at least as large as
- *	skb_size so that it can be recycled as a receive buffer.
- *	If these conditions are met, this function does any necessary
- *	reference count dropping and cleans up the skbuff as if it
- *	just came from __alloc_skb().
- */
-bool skb_recycle_check(struct sk_buff *skb, int skb_size)
-{
-	if (!skb_is_recycleable(skb, skb_size))
-		return false;
-
-	skb_recycle(skb);
-
-	return true;
-}
-EXPORT_SYMBOL(skb_recycle_check);
-
 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 {
 	new->tstamp		= old->tstamp;
-- 
cgit v1.2.3


From e1f165032c8bade3a6bdf546f8faf61fda4dd01c Mon Sep 17 00:00:00 2001
From: "ramesh.nagappa@gmail.com" <ramesh.nagappa@gmail.com>
Date: Fri, 5 Oct 2012 19:10:15 +0000
Subject: net: Fix skb_under_panic oops in neigh_resolve_output

The retry loops in neigh_resolve_output() and neigh_connected_output()
call dev_hard_header() without resetting the skb to network_header.
This causes the retry to fail with skb_under_panic. The fix is to
reset the network_header within the retry loop.

Signed-off-by: Ramesh Nagappa <ramesh.nagappa@ericsson.com>
Reviewed-by: Shawn Lu <shawn.lu@ericsson.com>
Reviewed-by: Robert Coulson <robert.coulson@ericsson.com>
Reviewed-by: Billie Alsup <billie.alsup@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index baca771caae2..22571488730a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1301,8 +1301,6 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
 	if (!dst)
 		goto discard;
 
-	__skb_pull(skb, skb_network_offset(skb));
-
 	if (!neigh_event_send(neigh, skb)) {
 		int err;
 		struct net_device *dev = neigh->dev;
@@ -1312,6 +1310,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
 			neigh_hh_init(neigh, dst);
 
 		do {
+			__skb_pull(skb, skb_network_offset(skb));
 			seq = read_seqbegin(&neigh->ha_lock);
 			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
 					      neigh->ha, NULL, skb->len);
@@ -1342,9 +1341,8 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
 	unsigned int seq;
 	int err;
 
-	__skb_pull(skb, skb_network_offset(skb));
-
 	do {
+		__skb_pull(skb, skb_network_offset(skb));
 		seq = read_seqbegin(&neigh->ha_lock);
 		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
 				      neigh->ha, NULL, skb->len);
-- 
cgit v1.2.3


From 51ec04038c113a811b177baa85d293feff9ce995 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 5 Oct 2012 20:43:30 +0000
Subject: ipv6: GRO should be ECN friendly

IPv4 side of the problem was addressed in commit a9e050f4e7f9d
(net: tcp: GRO should be ECN friendly)

This patch does the same, but for IPv6: a Traffic Class mismatch
doesn't mean flows are different, but instead should force a flush
of previous packets.

This patch removes an artificial packet reordering problem.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e22e6d88bac6..f757e3b7cfbf 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -880,22 +880,25 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 	nlen = skb_network_header_len(skb);
 
 	for (p = *head; p; p = p->next) {
-		struct ipv6hdr *iph2;
+		const struct ipv6hdr *iph2;
+		__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
 
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
 
 		iph2 = ipv6_hdr(p);
+		first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ;
 
-		/* All fields must match except length. */
+		/* All fields must match except length and Traffic Class. */
 		if (nlen != skb_network_header_len(p) ||
-		    memcmp(iph, iph2, offsetof(struct ipv6hdr, payload_len)) ||
+		    (first_word & htonl(0xF00FFFFF)) ||
 		    memcmp(&iph->nexthdr, &iph2->nexthdr,
 			   nlen - offsetof(struct ipv6hdr, nexthdr))) {
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}
-
+		/* flush if Traffic Class fields are different */
+		NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
 		NAPI_GRO_CB(p)->flush |= flush;
 	}
 
-- 
cgit v1.2.3


From ca07e43e288956a0ad5e6bd075f7aa1fca3bca00 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 6 Oct 2012 22:28:06 +0000
Subject: net: gro: fix a potential crash in skb_gro_reset_offset

Before accessing the skb's first fragment, we had better make sure
there is one.

This is probably not needed for old kernels, since an ethernet frame
cannot contain only an ethernet header, but the recent GRO addition
to tunnels makes this patch needed.

Also, skb_gro_reset_offset() can be static, which actually allows the
compiler to inline it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 -
 net/core/dev.c            | 14 ++++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 01646aa53b0e..a659fd0ba965 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1663,7 +1663,6 @@ extern int		netpoll_trap(void);
 #endif
 extern int	       skb_gro_receive(struct sk_buff **head,
 				       struct sk_buff *skb);
-extern void	       skb_gro_reset_offset(struct sk_buff *skb);
 
 static inline unsigned int skb_gro_offset(const struct sk_buff *skb)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 1e0a1847c3bb..de2bad717d56 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3631,20 +3631,22 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(napi_skb_finish);
 
-void skb_gro_reset_offset(struct sk_buff *skb)
+static void skb_gro_reset_offset(struct sk_buff *skb)
 {
+	const struct skb_shared_info *pinfo = skb_shinfo(skb);
+	const skb_frag_t *frag0 = &pinfo->frags[0];
+
 	NAPI_GRO_CB(skb)->data_offset = 0;
 	NAPI_GRO_CB(skb)->frag0 = NULL;
 	NAPI_GRO_CB(skb)->frag0_len = 0;
 
 	if (skb->mac_header == skb->tail &&
-	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
-		NAPI_GRO_CB(skb)->frag0 =
-			skb_frag_address(&skb_shinfo(skb)->frags[0]);
-		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
+	    pinfo->nr_frags &&
+	    !PageHighMem(skb_frag_page(frag0))) {
+		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
 	}
 }
-EXPORT_SYMBOL(skb_gro_reset_offset);
 
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From cf7f601c067994f371ba77721d1e45fce61a4569 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 13 Sep 2012 13:06:29 +0100
Subject: KEYS: Add payload preparsing opportunity prior to key instantiate or
 update

Give the key type the opportunity to preparse the payload prior to the
instantiation and update routines being called.  This is done with the
provision of two new key type operations:

	int (*preparse)(struct key_preparsed_payload *prep);
	void (*free_preparse)(struct key_preparsed_payload *prep);

If the first operation is present, then it is called before key creation (in
the add/update case) or before the key semaphore is taken (in the update and
instantiate cases).  The second operation is called to clean up if the first
was called.

preparse() is given the opportunity to fill in the following structure:

	struct key_preparsed_payload {
		char		*description;
		void		*type_data[2];
		void		*payload;
		const void	*data;
		size_t		datalen;
		size_t		quotalen;
	};

Before the preparser is called, the first three fields will have been cleared,
the payload pointer and size will be stored in data and datalen and the default
quota size from the key_type struct will be stored into quotalen.

The preparser may parse the payload in any way it likes and may store data in
the type_data[] and payload fields for use by the instantiate() and update()
ops.

The preparser may also propose a description for the key by attaching it as a
string to the description field.  This can be used by passing a NULL or ""
description to the add_key() system call or the key_create_or_update()
function.  This cannot work with request_key() as that requires the description
to tell the upcall about the key to be created.

This, for example permits keys that store PGP public keys to generate their own
name from the user ID and public key fingerprint in the key.

The instantiate() and update() operations are then modified to look like this:

	int (*instantiate)(struct key *key, struct key_preparsed_payload *prep);
	int (*update)(struct key *key, struct key_preparsed_payload *prep);

and the new payload data is passed in *prep, whether or not it was preparsed.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 Documentation/security/keys.txt          |  50 +++++++++++++-
 fs/cifs/cifs_spnego.c                    |   6 +-
 fs/cifs/cifsacl.c                        |   8 +--
 include/keys/user-type.h                 |   6 +-
 include/linux/key-type.h                 |  35 +++++++++-
 net/ceph/crypto.c                        |   9 +--
 net/dns_resolver/dns_key.c               |   6 +-
 net/rxrpc/ar-key.c                       |  40 +++++------
 security/keys/encrypted-keys/encrypted.c |  16 +++--
 security/keys/key.c                      | 114 ++++++++++++++++++++++---------
 security/keys/keyctl.c                   |  18 +++--
 security/keys/keyring.c                  |   6 +-
 security/keys/request_key_auth.c         |   8 +--
 security/keys/trusted.c                  |  16 +++--
 security/keys/user_defined.c             |  14 ++--
 15 files changed, 250 insertions(+), 102 deletions(-)

(limited to 'net')

diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
index aa0dbd74b71b..7d9ca92022d8 100644
--- a/Documentation/security/keys.txt
+++ b/Documentation/security/keys.txt
@@ -412,6 +412,10 @@ The main syscalls are:
      to the keyring. In this case, an error will be generated if the process
      does not have permission to write to the keyring.
 
+     If the key type supports it, if the description is NULL or an empty
+     string, the key type will try and generate a description from the content
+     of the payload.
+
      The payload is optional, and the pointer can be NULL if not required by
      the type. The payload is plen in size, and plen can be zero for an empty
      payload.
@@ -1114,12 +1118,53 @@ The structure has a number of fields, some of which are mandatory:
      it should return 0.
 
 
- (*) int (*instantiate)(struct key *key, const void *data, size_t datalen);
+ (*) int (*preparse)(struct key_preparsed_payload *prep);
+
+     This optional method permits the key type to attempt to parse payload
+     before a key is created (add key) or the key semaphore is taken (update or
+     instantiate key).  The structure pointed to by prep looks like:
+
+	struct key_preparsed_payload {
+		char		*description;
+		void		*type_data[2];
+		void		*payload;
+		const void	*data;
+		size_t		datalen;
+		size_t		quotalen;
+	};
+
+     Before calling the method, the caller will fill in data and datalen with
+     the payload blob parameters; quotalen will be filled in with the default
+     quota size from the key type and the rest will be cleared.
+
+     If a description can be proposed from the payload contents, that should be
+     attached as a string to the description field.  This will be used for the
+     key description if the caller of add_key() passes NULL or "".
+
+     The method can attach anything it likes to type_data[] and payload.  These
+     are merely passed along to the instantiate() or update() operations.
+
+     The method should return 0 if successful or a negative error code
+     otherwise.
+
+     
+ (*) void (*free_preparse)(struct key_preparsed_payload *prep);
+
+     This method is only required if the preparse() method is provided,
+     otherwise it is unused.  It cleans up anything attached to the
+     description, type_data and payload fields of the key_preparsed_payload
+     struct as filled in by the preparse() method.
+
+
+ (*) int (*instantiate)(struct key *key, struct key_preparsed_payload *prep);
 
      This method is called to attach a payload to a key during construction.
      The payload attached need not bear any relation to the data passed to this
      function.
 
+     The prep->data and prep->datalen fields will define the original payload
+     blob.  If preparse() was supplied then other fields may be filled in also.
+
      If the amount of data attached to the key differs from the size in
      keytype->def_datalen, then key_payload_reserve() should be called.
 
@@ -1135,6 +1180,9 @@ The structure has a number of fields, some of which are mandatory:
      If this type of key can be updated, then this method should be provided.
      It is called to update a key's payload from the blob of data provided.
 
+     The prep->data and prep->datalen fields will define the original payload
+     blob.  If preparse() was supplied then other fields may be filled in also.
+
      key_payload_reserve() should be called if the data length might change
      before any changes are actually made. Note that if this succeeds, the type
      is committed to changing the key because it's already been altered, so all
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index e622863b292f..086f381d6489 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -31,18 +31,18 @@
 
 /* create a new cifs key */
 static int
-cifs_spnego_key_instantiate(struct key *key, const void *data, size_t datalen)
+cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
 {
 	char *payload;
 	int ret;
 
 	ret = -ENOMEM;
-	payload = kmalloc(datalen, GFP_KERNEL);
+	payload = kmalloc(prep->datalen, GFP_KERNEL);
 	if (!payload)
 		goto error;
 
 	/* attach the data */
-	memcpy(payload, data, datalen);
+	memcpy(payload, prep->data, prep->datalen);
 	key->payload.data = payload;
 	ret = 0;
 
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 05f4dc263a23..f3c60e264ca8 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -167,17 +167,17 @@ static struct shrinker cifs_shrinker = {
 };
 
 static int
-cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen)
+cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
 {
 	char *payload;
 
-	payload = kmalloc(datalen, GFP_KERNEL);
+	payload = kmalloc(prep->datalen, GFP_KERNEL);
 	if (!payload)
 		return -ENOMEM;
 
-	memcpy(payload, data, datalen);
+	memcpy(payload, prep->data, prep->datalen);
 	key->payload.data = payload;
-	key->datalen = datalen;
+	key->datalen = prep->datalen;
 	return 0;
 }
 
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index bc9ec1d7698c..5e452c84f1e6 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -35,8 +35,10 @@ struct user_key_payload {
 extern struct key_type key_type_user;
 extern struct key_type key_type_logon;
 
-extern int user_instantiate(struct key *key, const void *data, size_t datalen);
-extern int user_update(struct key *key, const void *data, size_t datalen);
+struct key_preparsed_payload;
+
+extern int user_instantiate(struct key *key, struct key_preparsed_payload *prep);
+extern int user_update(struct key *key, struct key_preparsed_payload *prep);
 extern int user_match(const struct key *key, const void *criterion);
 extern void user_revoke(struct key *key);
 extern void user_destroy(struct key *key);
diff --git a/include/linux/key-type.h b/include/linux/key-type.h
index f0c651cda7b0..518a53afb9ea 100644
--- a/include/linux/key-type.h
+++ b/include/linux/key-type.h
@@ -26,6 +26,27 @@ struct key_construction {
 	struct key	*authkey;/* authorisation for key being constructed */
 };
 
+/*
+ * Pre-parsed payload, used by key add, update and instantiate.
+ *
+ * This struct will be cleared and data and datalen will be set with the data
+ * and length parameters from the caller and quotalen will be set from
+ * def_datalen from the key type.  Then if the preparse() op is provided by the
+ * key type, that will be called.  Then the struct will be passed to the
+ * instantiate() or the update() op.
+ *
+ * If the preparse() op is given, the free_preparse() op will be called to
+ * clear the contents.
+ */
+struct key_preparsed_payload {
+	char		*description;	/* Proposed key description (or NULL) */
+	void		*type_data[2];	/* Private key-type data */
+	void		*payload;	/* Proposed payload */
+	const void	*data;		/* Raw data */
+	size_t		datalen;	/* Raw datalen */
+	size_t		quotalen;	/* Quota length for proposed payload */
+};
+
 typedef int (*request_key_actor_t)(struct key_construction *key,
 				   const char *op, void *aux);
 
@@ -45,18 +66,28 @@ struct key_type {
 	/* vet a description */
 	int (*vet_description)(const char *description);
 
+	/* Preparse the data blob from userspace that is to be the payload,
+	 * generating a proposed description and payload that will be handed to
+	 * the instantiate() and update() ops.
+	 */
+	int (*preparse)(struct key_preparsed_payload *prep);
+
+	/* Free a preparse data structure.
+	 */
+	void (*free_preparse)(struct key_preparsed_payload *prep);
+
 	/* instantiate a key of this type
 	 * - this method should call key_payload_reserve() to determine if the
 	 *   user's quota will hold the payload
 	 */
-	int (*instantiate)(struct key *key, const void *data, size_t datalen);
+	int (*instantiate)(struct key *key, struct key_preparsed_payload *prep);
 
 	/* update a key of this type (optional)
 	 * - this method should call key_payload_reserve() to recalculate the
 	 *   quota consumption
 	 * - the key must be locked against read when modifying
 	 */
-	int (*update)(struct key *key, const void *data, size_t datalen);
+	int (*update)(struct key *key, struct key_preparsed_payload *prep);
 
 	/* match a key against a description */
 	int (*match)(const struct key *key, const void *desc);
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 9da7fdd3cd8a..af14cb425164 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -423,14 +423,15 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
 	}
 }
 
-int ceph_key_instantiate(struct key *key, const void *data, size_t datalen)
+int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
 {
 	struct ceph_crypto_key *ckey;
+	size_t datalen = prep->datalen;
 	int ret;
 	void *p;
 
 	ret = -EINVAL;
-	if (datalen <= 0 || datalen > 32767 || !data)
+	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		goto err;
 
 	ret = key_payload_reserve(key, datalen);
@@ -443,8 +444,8 @@ int ceph_key_instantiate(struct key *key, const void *data, size_t datalen)
 		goto err;
 
 	/* TODO ceph_crypto_key_decode should really take const input */
-	p = (void *)data;
-	ret = ceph_crypto_key_decode(ckey, &p, (char*)data+datalen);
+	p = (void *)prep->data;
+	ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen);
 	if (ret < 0)
 		goto err_ckey;
 
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index d9507dd05818..859ab8b6ec34 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -59,13 +59,13 @@ const struct cred *dns_resolver_cache;
  *        "ip1,ip2,...#foo=bar"
  */
 static int
-dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen)
+dns_resolver_instantiate(struct key *key, struct key_preparsed_payload *prep)
 {
 	struct user_key_payload *upayload;
 	unsigned long derrno;
 	int ret;
-	size_t result_len = 0;
-	const char *data = _data, *end, *opt;
+	size_t datalen = prep->datalen, result_len = 0;
+	const char *data = prep->data, *end, *opt;
 
 	kenter("%%%d,%s,'%*.*s',%zu",
 	       key->serial, key->description,
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index 8b1f9f49960f..106c5a6b1ab2 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -26,8 +26,8 @@
 #include "ar-internal.h"
 
 static int rxrpc_vet_description_s(const char *);
-static int rxrpc_instantiate(struct key *, const void *, size_t);
-static int rxrpc_instantiate_s(struct key *, const void *, size_t);
+static int rxrpc_instantiate(struct key *, struct key_preparsed_payload *);
+static int rxrpc_instantiate_s(struct key *, struct key_preparsed_payload *);
 static void rxrpc_destroy(struct key *);
 static void rxrpc_destroy_s(struct key *);
 static void rxrpc_describe(const struct key *, struct seq_file *);
@@ -678,7 +678,7 @@ error:
  *
  * if no data is provided, then a no-security key is made
  */
-static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen)
+static int rxrpc_instantiate(struct key *key, struct key_preparsed_payload *prep)
 {
 	const struct rxrpc_key_data_v1 *v1;
 	struct rxrpc_key_token *token, **pp;
@@ -686,26 +686,26 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen)
 	u32 kver;
 	int ret;
 
-	_enter("{%x},,%zu", key_serial(key), datalen);
+	_enter("{%x},,%zu", key_serial(key), prep->datalen);
 
 	/* handle a no-security key */
-	if (!data && datalen == 0)
+	if (!prep->data && prep->datalen == 0)
 		return 0;
 
 	/* determine if the XDR payload format is being used */
-	if (datalen > 7 * 4) {
-		ret = rxrpc_instantiate_xdr(key, data, datalen);
+	if (prep->datalen > 7 * 4) {
+		ret = rxrpc_instantiate_xdr(key, prep->data, prep->datalen);
 		if (ret != -EPROTO)
 			return ret;
 	}
 
 	/* get the key interface version number */
 	ret = -EINVAL;
-	if (datalen <= 4 || !data)
+	if (prep->datalen <= 4 || !prep->data)
 		goto error;
-	memcpy(&kver, data, sizeof(kver));
-	data += sizeof(kver);
-	datalen -= sizeof(kver);
+	memcpy(&kver, prep->data, sizeof(kver));
+	prep->data += sizeof(kver);
+	prep->datalen -= sizeof(kver);
 
 	_debug("KEY I/F VERSION: %u", kver);
 
@@ -715,11 +715,11 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen)
 
 	/* deal with a version 1 key */
 	ret = -EINVAL;
-	if (datalen < sizeof(*v1))
+	if (prep->datalen < sizeof(*v1))
 		goto error;
 
-	v1 = data;
-	if (datalen != sizeof(*v1) + v1->ticket_length)
+	v1 = prep->data;
+	if (prep->datalen != sizeof(*v1) + v1->ticket_length)
 		goto error;
 
 	_debug("SCIX: %u", v1->security_index);
@@ -784,17 +784,17 @@ error:
  * instantiate a server secret key
  * data should be a pointer to the 8-byte secret key
  */
-static int rxrpc_instantiate_s(struct key *key, const void *data,
-			       size_t datalen)
+static int rxrpc_instantiate_s(struct key *key,
+			       struct key_preparsed_payload *prep)
 {
 	struct crypto_blkcipher *ci;
 
-	_enter("{%x},,%zu", key_serial(key), datalen);
+	_enter("{%x},,%zu", key_serial(key), prep->datalen);
 
-	if (datalen != 8)
+	if (prep->datalen != 8)
 		return -EINVAL;
 
-	memcpy(&key->type_data, data, 8);
+	memcpy(&key->type_data, prep->data, 8);
 
 	ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
 	if (IS_ERR(ci)) {
@@ -802,7 +802,7 @@ static int rxrpc_instantiate_s(struct key *key, const void *data,
 		return PTR_ERR(ci);
 	}
 
-	if (crypto_blkcipher_setkey(ci, data, 8) < 0)
+	if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0)
 		BUG();
 
 	key->payload.data = ci;
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index 2d1bb8af7696..9e1e005c7596 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -773,8 +773,8 @@ static int encrypted_init(struct encrypted_key_payload *epayload,
  *
  * On success, return 0. Otherwise return errno.
  */
-static int encrypted_instantiate(struct key *key, const void *data,
-				 size_t datalen)
+static int encrypted_instantiate(struct key *key,
+				 struct key_preparsed_payload *prep)
 {
 	struct encrypted_key_payload *epayload = NULL;
 	char *datablob = NULL;
@@ -782,16 +782,17 @@ static int encrypted_instantiate(struct key *key, const void *data,
 	char *master_desc = NULL;
 	char *decrypted_datalen = NULL;
 	char *hex_encoded_iv = NULL;
+	size_t datalen = prep->datalen;
 	int ret;
 
-	if (datalen <= 0 || datalen > 32767 || !data)
+	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		return -EINVAL;
 
 	datablob = kmalloc(datalen + 1, GFP_KERNEL);
 	if (!datablob)
 		return -ENOMEM;
 	datablob[datalen] = 0;
-	memcpy(datablob, data, datalen);
+	memcpy(datablob, prep->data, datalen);
 	ret = datablob_parse(datablob, &format, &master_desc,
 			     &decrypted_datalen, &hex_encoded_iv);
 	if (ret < 0)
@@ -834,16 +835,17 @@ static void encrypted_rcu_free(struct rcu_head *rcu)
  *
  * On success, return 0. Otherwise return errno.
  */
-static int encrypted_update(struct key *key, const void *data, size_t datalen)
+static int encrypted_update(struct key *key, struct key_preparsed_payload *prep)
 {
 	struct encrypted_key_payload *epayload = key->payload.data;
 	struct encrypted_key_payload *new_epayload;
 	char *buf;
 	char *new_master_desc = NULL;
 	const char *format = NULL;
+	size_t datalen = prep->datalen;
 	int ret = 0;
 
-	if (datalen <= 0 || datalen > 32767 || !data)
+	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		return -EINVAL;
 
 	buf = kmalloc(datalen + 1, GFP_KERNEL);
@@ -851,7 +853,7 @@ static int encrypted_update(struct key *key, const void *data, size_t datalen)
 		return -ENOMEM;
 
 	buf[datalen] = 0;
-	memcpy(buf, data, datalen);
+	memcpy(buf, prep->data, datalen);
 	ret = datablob_parse(buf, &format, &new_master_desc, NULL, NULL);
 	if (ret < 0)
 		goto out;
diff --git a/security/keys/key.c b/security/keys/key.c
index 50d96d4e06f2..1d039af99f50 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -412,8 +412,7 @@ EXPORT_SYMBOL(key_payload_reserve);
  * key_construction_mutex.
  */
 static int __key_instantiate_and_link(struct key *key,
-				      const void *data,
-				      size_t datalen,
+				      struct key_preparsed_payload *prep,
 				      struct key *keyring,
 				      struct key *authkey,
 				      unsigned long *_prealloc)
@@ -431,7 +430,7 @@ static int __key_instantiate_and_link(struct key *key,
 	/* can't instantiate twice */
 	if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) {
 		/* instantiate the key */
-		ret = key->type->instantiate(key, data, datalen);
+		ret = key->type->instantiate(key, prep);
 
 		if (ret == 0) {
 			/* mark the key as being instantiated */
@@ -482,22 +481,37 @@ int key_instantiate_and_link(struct key *key,
 			     struct key *keyring,
 			     struct key *authkey)
 {
+	struct key_preparsed_payload prep;
 	unsigned long prealloc;
 	int ret;
 
+	memset(&prep, 0, sizeof(prep));
+	prep.data = data;
+	prep.datalen = datalen;
+	prep.quotalen = key->type->def_datalen;
+	if (key->type->preparse) {
+		ret = key->type->preparse(&prep);
+		if (ret < 0)
+			goto error;
+	}
+
 	if (keyring) {
 		ret = __key_link_begin(keyring, key->type, key->description,
 				       &prealloc);
 		if (ret < 0)
-			return ret;
+			goto error_free_preparse;
 	}
 
-	ret = __key_instantiate_and_link(key, data, datalen, keyring, authkey,
+	ret = __key_instantiate_and_link(key, &prep, keyring, authkey,
 					 &prealloc);
 
 	if (keyring)
 		__key_link_end(keyring, key->type, prealloc);
 
+error_free_preparse:
+	if (key->type->preparse)
+		key->type->free_preparse(&prep);
+error:
 	return ret;
 }
 
@@ -706,7 +720,7 @@ void key_type_put(struct key_type *ktype)
  * if we get an error.
  */
 static inline key_ref_t __key_update(key_ref_t key_ref,
-				     const void *payload, size_t plen)
+				     struct key_preparsed_payload *prep)
 {
 	struct key *key = key_ref_to_ptr(key_ref);
 	int ret;
@@ -722,7 +736,7 @@ static inline key_ref_t __key_update(key_ref_t key_ref,
 
 	down_write(&key->sem);
 
-	ret = key->type->update(key, payload, plen);
+	ret = key->type->update(key, prep);
 	if (ret == 0)
 		/* updating a negative key instantiates it */
 		clear_bit(KEY_FLAG_NEGATIVE, &key->flags);
@@ -774,6 +788,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			       unsigned long flags)
 {
 	unsigned long prealloc;
+	struct key_preparsed_payload prep;
 	const struct cred *cred = current_cred();
 	struct key_type *ktype;
 	struct key *keyring, *key = NULL;
@@ -789,8 +804,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	}
 
 	key_ref = ERR_PTR(-EINVAL);
-	if (!ktype->match || !ktype->instantiate)
-		goto error_2;
+	if (!ktype->match || !ktype->instantiate ||
+	    (!description && !ktype->preparse))
+		goto error_put_type;
 
 	keyring = key_ref_to_ptr(keyring_ref);
 
@@ -798,18 +814,37 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 
 	key_ref = ERR_PTR(-ENOTDIR);
 	if (keyring->type != &key_type_keyring)
-		goto error_2;
+		goto error_put_type;
+
+	memset(&prep, 0, sizeof(prep));
+	prep.data = payload;
+	prep.datalen = plen;
+	prep.quotalen = ktype->def_datalen;
+	if (ktype->preparse) {
+		ret = ktype->preparse(&prep);
+		if (ret < 0) {
+			key_ref = ERR_PTR(ret);
+			goto error_put_type;
+		}
+		if (!description)
+			description = prep.description;
+		key_ref = ERR_PTR(-EINVAL);
+		if (!description)
+			goto error_free_prep;
+	}
 
 	ret = __key_link_begin(keyring, ktype, description, &prealloc);
-	if (ret < 0)
-		goto error_2;
+	if (ret < 0) {
+		key_ref = ERR_PTR(ret);
+		goto error_free_prep;
+	}
 
 	/* if we're going to allocate a new key, we're going to have
 	 * to modify the keyring */
 	ret = key_permission(keyring_ref, KEY_WRITE);
 	if (ret < 0) {
 		key_ref = ERR_PTR(ret);
-		goto error_3;
+		goto error_link_end;
 	}
 
 	/* if it's possible to update this type of key, search for an existing
@@ -840,25 +875,27 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 			perm, flags);
 	if (IS_ERR(key)) {
 		key_ref = ERR_CAST(key);
-		goto error_3;
+		goto error_link_end;
 	}
 
 	/* instantiate it and link it into the target keyring */
-	ret = __key_instantiate_and_link(key, payload, plen, keyring, NULL,
-					 &prealloc);
+	ret = __key_instantiate_and_link(key, &prep, keyring, NULL, &prealloc);
 	if (ret < 0) {
 		key_put(key);
 		key_ref = ERR_PTR(ret);
-		goto error_3;
+		goto error_link_end;
 	}
 
 	key_ref = make_key_ref(key, is_key_possessed(keyring_ref));
 
- error_3:
+error_link_end:
 	__key_link_end(keyring, ktype, prealloc);
- error_2:
+error_free_prep:
+	if (ktype->preparse)
+		ktype->free_preparse(&prep);
+error_put_type:
 	key_type_put(ktype);
- error:
+error:
 	return key_ref;
 
  found_matching_key:
@@ -866,10 +903,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
 	 * - we can drop the locks first as we have the key pinned
 	 */
 	__key_link_end(keyring, ktype, prealloc);
-	key_type_put(ktype);
 
-	key_ref = __key_update(key_ref, payload, plen);
-	goto error;
+	key_ref = __key_update(key_ref, &prep);
+	goto error_free_prep;
 }
 EXPORT_SYMBOL(key_create_or_update);
 
@@ -888,6 +924,7 @@ EXPORT_SYMBOL(key_create_or_update);
  */
 int key_update(key_ref_t key_ref, const void *payload, size_t plen)
 {
+	struct key_preparsed_payload prep;
 	struct key *key = key_ref_to_ptr(key_ref);
 	int ret;
 
@@ -900,18 +937,31 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen)
 
 	/* attempt to update it if supported */
 	ret = -EOPNOTSUPP;
-	if (key->type->update) {
-		down_write(&key->sem);
-
-		ret = key->type->update(key, payload, plen);
-		if (ret == 0)
-			/* updating a negative key instantiates it */
-			clear_bit(KEY_FLAG_NEGATIVE, &key->flags);
+	if (!key->type->update)
+		goto error;
 
-		up_write(&key->sem);
+	memset(&prep, 0, sizeof(prep));
+	prep.data = payload;
+	prep.datalen = plen;
+	prep.quotalen = key->type->def_datalen;
+	if (key->type->preparse) {
+		ret = key->type->preparse(&prep);
+		if (ret < 0)
+			goto error;
 	}
 
- error:
+	down_write(&key->sem);
+
+	ret = key->type->update(key, &prep);
+	if (ret == 0)
+		/* updating a negative key instantiates it */
+		clear_bit(KEY_FLAG_NEGATIVE, &key->flags);
+
+	up_write(&key->sem);
+
+	if (key->type->preparse)
+		key->type->free_preparse(&prep);
+error:
 	return ret;
 }
 EXPORT_SYMBOL(key_update);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 3364fbf46807..505d40be196c 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -46,6 +46,9 @@ static int key_get_type_from_user(char *type,
  * Extract the description of a new key from userspace and either add it as a
  * new key to the specified keyring or update a matching key in that keyring.
  *
+ * If the description is NULL or an empty string, the key type is asked to
+ * generate one from the payload.
+ *
  * The keyring must be writable so that we can attach the key to it.
  *
  * If successful, the new key's serial number is returned, otherwise an error
@@ -72,10 +75,17 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type,
 	if (ret < 0)
 		goto error;
 
-	description = strndup_user(_description, PAGE_SIZE);
-	if (IS_ERR(description)) {
-		ret = PTR_ERR(description);
-		goto error;
+	description = NULL;
+	if (_description) {
+		description = strndup_user(_description, PAGE_SIZE);
+		if (IS_ERR(description)) {
+			ret = PTR_ERR(description);
+			goto error;
+		}
+		if (!*description) {
+			kfree(description);
+			description = NULL;
+		}
 	}
 
 	/* pull the payload in if one was supplied */
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 81e7852d281d..f04d8cf81f3c 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -66,7 +66,7 @@ static inline unsigned keyring_hash(const char *desc)
  * operations.
  */
 static int keyring_instantiate(struct key *keyring,
-			       const void *data, size_t datalen);
+			       struct key_preparsed_payload *prep);
 static int keyring_match(const struct key *keyring, const void *criterion);
 static void keyring_revoke(struct key *keyring);
 static void keyring_destroy(struct key *keyring);
@@ -121,12 +121,12 @@ static void keyring_publish_name(struct key *keyring)
  * Returns 0 on success, -EINVAL if given any data.
  */
 static int keyring_instantiate(struct key *keyring,
-			       const void *data, size_t datalen)
+			       struct key_preparsed_payload *prep)
 {
 	int ret;
 
 	ret = -EINVAL;
-	if (datalen == 0) {
+	if (prep->datalen == 0) {
 		/* make the keyring available by name if it has one */
 		keyring_publish_name(keyring);
 		ret = 0;
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
index 60d4e3f5e4bb..85730d5a5a59 100644
--- a/security/keys/request_key_auth.c
+++ b/security/keys/request_key_auth.c
@@ -19,7 +19,8 @@
 #include <asm/uaccess.h>
 #include "internal.h"
 
-static int request_key_auth_instantiate(struct key *, const void *, size_t);
+static int request_key_auth_instantiate(struct key *,
+					struct key_preparsed_payload *);
 static void request_key_auth_describe(const struct key *, struct seq_file *);
 static void request_key_auth_revoke(struct key *);
 static void request_key_auth_destroy(struct key *);
@@ -42,10 +43,9 @@ struct key_type key_type_request_key_auth = {
  * Instantiate a request-key authorisation key.
  */
 static int request_key_auth_instantiate(struct key *key,
-					const void *data,
-					size_t datalen)
+					struct key_preparsed_payload *prep)
 {
-	key->payload.data = (struct request_key_auth *) data;
+	key->payload.data = (struct request_key_auth *)prep->data;
 	return 0;
 }
 
diff --git a/security/keys/trusted.c b/security/keys/trusted.c
index 2d5d041f2049..42036c7a0856 100644
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -927,22 +927,23 @@ static struct trusted_key_payload *trusted_payload_alloc(struct key *key)
  *
  * On success, return 0. Otherwise return errno.
  */
-static int trusted_instantiate(struct key *key, const void *data,
-			       size_t datalen)
+static int trusted_instantiate(struct key *key,
+			       struct key_preparsed_payload *prep)
 {
 	struct trusted_key_payload *payload = NULL;
 	struct trusted_key_options *options = NULL;
+	size_t datalen = prep->datalen;
 	char *datablob;
 	int ret = 0;
 	int key_cmd;
 
-	if (datalen <= 0 || datalen > 32767 || !data)
+	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		return -EINVAL;
 
 	datablob = kmalloc(datalen + 1, GFP_KERNEL);
 	if (!datablob)
 		return -ENOMEM;
-	memcpy(datablob, data, datalen);
+	memcpy(datablob, prep->data, datalen);
 	datablob[datalen] = '\0';
 
 	options = trusted_options_alloc();
@@ -1011,17 +1012,18 @@ static void trusted_rcu_free(struct rcu_head *rcu)
 /*
  * trusted_update - reseal an existing key with new PCR values
  */
-static int trusted_update(struct key *key, const void *data, size_t datalen)
+static int trusted_update(struct key *key, struct key_preparsed_payload *prep)
 {
 	struct trusted_key_payload *p = key->payload.data;
 	struct trusted_key_payload *new_p;
 	struct trusted_key_options *new_o;
+	size_t datalen = prep->datalen;
 	char *datablob;
 	int ret = 0;
 
 	if (!p->migratable)
 		return -EPERM;
-	if (datalen <= 0 || datalen > 32767 || !data)
+	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		return -EINVAL;
 
 	datablob = kmalloc(datalen + 1, GFP_KERNEL);
@@ -1038,7 +1040,7 @@ static int trusted_update(struct key *key, const void *data, size_t datalen)
 		goto out;
 	}
 
-	memcpy(datablob, data, datalen);
+	memcpy(datablob, prep->data, datalen);
 	datablob[datalen] = '\0';
 	ret = datablob_parse(datablob, new_p, new_o);
 	if (ret != Opt_update) {
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index c7660a25a3e4..55dc88939185 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -58,13 +58,14 @@ EXPORT_SYMBOL_GPL(key_type_logon);
 /*
  * instantiate a user defined key
  */
-int user_instantiate(struct key *key, const void *data, size_t datalen)
+int user_instantiate(struct key *key, struct key_preparsed_payload *prep)
 {
 	struct user_key_payload *upayload;
+	size_t datalen = prep->datalen;
 	int ret;
 
 	ret = -EINVAL;
-	if (datalen <= 0 || datalen > 32767 || !data)
+	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		goto error;
 
 	ret = key_payload_reserve(key, datalen);
@@ -78,7 +79,7 @@ int user_instantiate(struct key *key, const void *data, size_t datalen)
 
 	/* attach the data */
 	upayload->datalen = datalen;
-	memcpy(upayload->data, data, datalen);
+	memcpy(upayload->data, prep->data, datalen);
 	rcu_assign_keypointer(key, upayload);
 	ret = 0;
 
@@ -92,13 +93,14 @@ EXPORT_SYMBOL_GPL(user_instantiate);
  * update a user defined key
  * - the key's semaphore is write-locked
  */
-int user_update(struct key *key, const void *data, size_t datalen)
+int user_update(struct key *key, struct key_preparsed_payload *prep)
 {
 	struct user_key_payload *upayload, *zap;
+	size_t datalen = prep->datalen;
 	int ret;
 
 	ret = -EINVAL;
-	if (datalen <= 0 || datalen > 32767 || !data)
+	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		goto error;
 
 	/* construct a replacement payload */
@@ -108,7 +110,7 @@ int user_update(struct key *key, const void *data, size_t datalen)
 		goto error;
 
 	upayload->datalen = datalen;
-	memcpy(upayload->data, data, datalen);
+	memcpy(upayload->data, prep->data, datalen);
 
 	/* check the quota and attach the new data */
 	zap = upayload;
-- 
cgit v1.2.3


From d851c12b60471188e15e5c8405b289073e8dd025 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Sun, 7 Oct 2012 22:47:25 +0000
Subject: ipv4: Always invalidate or update the route on pmtu events

Some protocols, like IPsec still cache routes. So we need to invalidate
the old route on pmtu events to avoid the reuse of stale routes.
We also need to update the mtu and expire time of the route if we already
use a nh exception route, otherwise we ignore newly learned pmtu values
after the first expiration.

With this patch we always invalidate or update the route on pmtu events.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ff622069fcef..90ba8358a892 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -904,22 +904,29 @@ out:	kfree_skb(skb);
 	return 0;
 }
 
-static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
+	struct dst_entry *dst = &rt->dst;
 	struct fib_result res;
 
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
+	if (!rt->rt_pmtu) {
+		dst->obsolete = DST_OBSOLETE_KILL;
+	} else {
+		rt->rt_pmtu = mtu;
+		dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
+	}
+
 	rcu_read_lock();
-	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
+	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
 		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 				      jiffies + ip_rt_mtu_expires);
 	}
 	rcu_read_unlock();
-	return mtu;
 }
 
 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -929,14 +936,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 	struct flowi4 fl4;
 
 	ip_rt_build_flow_key(&fl4, sk, skb);
-	mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
-
-	if (!rt->rt_pmtu) {
-		dst->obsolete = DST_OBSOLETE_KILL;
-	} else {
-		rt->rt_pmtu = mtu;
-		rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
-	}
+	__ip_rt_update_pmtu(rt, &fl4, mtu);
 }
 
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
-- 
cgit v1.2.3


From 7f92d334ba19a0d8e96f8f8f092219553367d921 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Sun, 7 Oct 2012 22:48:18 +0000
Subject: ipv4: Don't create nh exception when the device mtu is smaller than
 the reported pmtu

When a local tool like tracepath tries to send packets bigger than
the device mtu, we create a nh exception and set the pmtu to device
mtu. The device mtu does not expire, so check if the device mtu is
smaller than the reported pmtu and don't create a nh exception in
that case.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 90ba8358a892..741df67a81ec 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -909,6 +909,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	struct dst_entry *dst = &rt->dst;
 	struct fib_result res;
 
+	if (dst->dev->mtu < mtu)
+		return;
+
 	if (mtu < ip_rt_min_pmtu)
 		mtu = ip_rt_min_pmtu;
 
-- 
cgit v1.2.3


From ee9a8f7ab2edf801b8b514c310455c94acc232f6 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 8 Oct 2012 00:56:54 +0000
Subject: ipv4: Don't report stale pmtu values to userspace

We report cached pmtu values even if they are already expired.
Change this to not report these values after they are expired
and fix a race in the expire time calculation, as suggested by
Eric Dumazet.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 741df67a81ec..132e0dfee53a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2187,8 +2187,18 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
+	expires = rt->dst.expires;
+	if (expires) {
+		unsigned long now = jiffies;
+
+		if (time_before(now, expires))
+			expires -= now;
+		else
+			expires = 0;
+	}
+
 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
-	if (rt->rt_pmtu)
+	if (rt->rt_pmtu && expires)
 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
 	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
@@ -2198,13 +2208,6 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 		goto nla_put_failure;
 
 	error = rt->dst.error;
-	expires = rt->dst.expires;
-	if (expires) {
-		if (time_before(jiffies, expires))
-			expires -= jiffies;
-		else
-			expires = 0;
-	}
 
 	if (rt_is_input_route(rt)) {
 		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
-- 
cgit v1.2.3


From 2e71a6f8084e7ac87166dd77d99c44190fb844fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 6 Oct 2012 08:08:49 +0000
Subject: net: gro: selective flush of packets

Current GRO can hold packets in gro_list for almost unlimited
time, in case napi->poll() handler consumes its budget over and over.

In this case, napi_complete()/napi_gro_flush() are not called.

Another problem is that gro_list is flushed in a non-friendly way:
We scan the list and complete packets in the reverse order
(youngest packets first, oldest packets last).
This defeats priorities that sender could have cooked.

Since GRO currently only stores TCP packets, we don't really notice the
bug because of retransmits, but this behavior can add unexpected
latencies, particularly on mice flows clamped by elephant flows.

This patch makes sure no packet can stay more than 1 ms in queue, and
only in stress situations.

It also completes packets in the right order to minimize latencies.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Jesse Gross <jesse@nicira.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/skge.c   |  2 +-
 drivers/net/ethernet/realtek/8139cp.c |  2 +-
 include/linux/netdevice.h             | 15 ++++++++------
 net/core/dev.c                        | 38 ++++++++++++++++++++++++++++-------
 4 files changed, 42 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index 3f7dab46626b..9b9c2ac5c4c2 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -3189,7 +3189,7 @@ static int skge_poll(struct napi_struct *napi, int to_do)
 	if (work_done < to_do) {
 		unsigned long flags;
 
-		napi_gro_flush(napi);
+		napi_gro_flush(napi, false);
 		spin_lock_irqsave(&hw->hw_lock, flags);
 		__napi_complete(napi);
 		hw->intr_mask |= napimask[skge->port];
diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c
index 995d0cfc4c06..1c818254b7be 100644
--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@@ -563,7 +563,7 @@ rx_next:
 		if (cpr16(IntrStatus) & cp_rx_intr_mask)
 			goto rx_status_loop;
 
-		napi_gro_flush(napi);
+		napi_gro_flush(napi, false);
 		spin_lock_irqsave(&cp->lock, flags);
 		__napi_complete(napi);
 		cpw16_f(IntrMask, cp_intr_mask);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a659fd0ba965..0a36fff75bd5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1497,19 +1497,22 @@ struct napi_gro_cb {
 	/* This indicates where we are processing relative to skb->data. */
 	int data_offset;
 
-	/* This is non-zero if the packet may be of the same flow. */
-	int same_flow;
-
 	/* This is non-zero if the packet cannot be merged with the new skb. */
 	int flush;
 
 	/* Number of segments aggregated. */
-	int count;
+	u16	count;
+
+	/* This is non-zero if the packet may be of the same flow. */
+	u8	same_flow;
 
 	/* Free the skb? */
-	int free;
+	u8	free;
 #define NAPI_GRO_FREE		  1
 #define NAPI_GRO_FREE_STOLEN_HEAD 2
+
+	/* jiffies when first packet was created/queued */
+	unsigned long age;
 };
 
 #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
@@ -2156,7 +2159,7 @@ extern gro_result_t	dev_gro_receive(struct napi_struct *napi,
 extern gro_result_t	napi_skb_finish(gro_result_t ret, struct sk_buff *skb);
 extern gro_result_t	napi_gro_receive(struct napi_struct *napi,
 					 struct sk_buff *skb);
-extern void		napi_gro_flush(struct napi_struct *napi);
+extern void		napi_gro_flush(struct napi_struct *napi, bool flush_old);
 extern struct sk_buff *	napi_get_frags(struct napi_struct *napi);
 extern gro_result_t	napi_frags_finish(struct napi_struct *napi,
 					  struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index de2bad717d56..d44668f63c88 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3471,17 +3471,31 @@ out:
 	return netif_receive_skb(skb);
 }
 
-inline void napi_gro_flush(struct napi_struct *napi)
+/* napi->gro_list contains packets ordered by age.
+ * youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void napi_gro_flush(struct napi_struct *napi, bool flush_old)
 {
-	struct sk_buff *skb, *next;
+	struct sk_buff *skb, *prev = NULL;
 
-	for (skb = napi->gro_list; skb; skb = next) {
-		next = skb->next;
+	/* scan list and build reverse chain */
+	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
+		skb->prev = prev;
+		prev = skb;
+	}
+
+	for (skb = prev; skb; skb = prev) {
 		skb->next = NULL;
+
+		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+			return;
+
+		prev = skb->prev;
 		napi_gro_complete(skb);
+		napi->gro_count--;
 	}
 
-	napi->gro_count = 0;
 	napi->gro_list = NULL;
 }
 EXPORT_SYMBOL(napi_gro_flush);
@@ -3542,6 +3556,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 
 	napi->gro_count++;
 	NAPI_GRO_CB(skb)->count = 1;
+	NAPI_GRO_CB(skb)->age = jiffies;
 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 	skb->next = napi->gro_list;
 	napi->gro_list = skb;
@@ -3878,7 +3893,7 @@ void napi_complete(struct napi_struct *n)
 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
 		return;
 
-	napi_gro_flush(n);
+	napi_gro_flush(n, false);
 	local_irq_save(flags);
 	__napi_complete(n);
 	local_irq_restore(flags);
@@ -3983,8 +3998,17 @@ static void net_rx_action(struct softirq_action *h)
 				local_irq_enable();
 				napi_complete(n);
 				local_irq_disable();
-			} else
+			} else {
+				if (n->gro_list) {
+					/* flush too old packets
+					 * If HZ < 1000, flush all packets.
+					 */
+					local_irq_enable();
+					napi_gro_flush(n, HZ >= 1000);
+					local_irq_disable();
+				}
 				list_move_tail(&n->poll_list, &sd->poll_list);
+			}
 		}
 
 		netpoll_poll_unlock(have);
-- 
cgit v1.2.3


From 55fabefe3695241e6ccfa0cd4974f3fa497693dc Mon Sep 17 00:00:00 2001
From: Thomas Pedersen <thomas@cozybit.com>
Date: Fri, 5 Oct 2012 17:57:39 -0700
Subject: mac80211: call drv_get_tsf() in sleepable context

The call to drv_get/set_tsf() was put on the workqueue to perform tsf
adjustments since that function might sleep. However it ended up inside
a spinlock, whose critical section must be atomic. Do tsf adjustment
outside the spinlock instead, and get rid of a warning.

Signed-off-by: Thomas Pedersen <thomas@cozybit.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/mesh_sync.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c
index accfa00ffcdf..a16b7b4b1e02 100644
--- a/net/mac80211/mesh_sync.c
+++ b/net/mac80211/mesh_sync.c
@@ -56,7 +56,6 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
 	u64 tsfdelta;
 
 	spin_lock_bh(&ifmsh->sync_offset_lock);
-
 	if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) {
 		msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n",
 			  (long long) ifmsh->sync_offset_clockdrift_max);
@@ -69,11 +68,11 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata)
 		tsfdelta = -beacon_int_fraction;
 		ifmsh->sync_offset_clockdrift_max -= beacon_int_fraction;
 	}
+	spin_unlock_bh(&ifmsh->sync_offset_lock);
 
 	tsf = drv_get_tsf(local, sdata);
 	if (tsf != -1ULL)
 		drv_set_tsf(local, sdata, tsf + tsfdelta);
-	spin_unlock_bh(&ifmsh->sync_offset_lock);
 }
 
 static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
-- 
cgit v1.2.3


From c3e7724b6bc2f25e46c38dbe68f09d71fafeafb8 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Mon, 8 Oct 2012 14:39:33 +0200
Subject: mac80211: use ieee80211_free_txskb to fix possible skb leaks

A few places free skbs using dev_kfree_skb even though they're called
after ieee80211_subif_start_xmit might have cloned it for tracking tx
status. Use ieee80211_free_txskb here to prevent skb leaks.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Cc: stable@vger.kernel.org
Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 net/mac80211/status.c |  4 ++--
 net/mac80211/tx.c     | 22 ++++++++++++----------
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 2ce89732d0f2..3af0cc4130f1 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -34,7 +34,7 @@ void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw,
 		skb_queue_len(&local->skb_queue_unreliable);
 	while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT &&
 	       (skb = skb_dequeue(&local->skb_queue_unreliable))) {
-		dev_kfree_skb_irq(skb);
+		ieee80211_free_txskb(hw, skb);
 		tmp--;
 		I802_DEBUG_INC(local->tx_status_drop);
 	}
@@ -159,7 +159,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
 			   "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n",
 			   skb_queue_len(&sta->tx_filtered[ac]),
 			   !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies);
-	dev_kfree_skb(skb);
+	ieee80211_free_txskb(&local->hw, skb);
 }
 
 static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index e0e0d1d0e830..c9bf83f36657 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -354,7 +354,7 @@ static void purge_old_ps_buffers(struct ieee80211_local *local)
 			total += skb_queue_len(&sta->ps_tx_buf[ac]);
 			if (skb) {
 				purged++;
-				dev_kfree_skb(skb);
+				ieee80211_free_txskb(&local->hw, skb);
 				break;
 			}
 		}
@@ -466,7 +466,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
 			ps_dbg(tx->sdata,
 			       "STA %pM TX buffer for AC %d full - dropping oldest frame\n",
 			       sta->sta.addr, ac);
-			dev_kfree_skb(old);
+			ieee80211_free_txskb(&local->hw, old);
 		} else
 			tx->local->total_ps_buffered++;
 
@@ -1103,7 +1103,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
 		spin_unlock(&tx->sta->lock);
 
 		if (purge_skb)
-			dev_kfree_skb(purge_skb);
+			ieee80211_free_txskb(&tx->local->hw, purge_skb);
 	}
 
 	/* reset session timer */
@@ -1214,7 +1214,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local,
 #ifdef CONFIG_MAC80211_VERBOSE_DEBUG
 		if (WARN_ON_ONCE(q >= local->hw.queues)) {
 			__skb_unlink(skb, skbs);
-			dev_kfree_skb(skb);
+			ieee80211_free_txskb(&local->hw, skb);
 			continue;
 		}
 #endif
@@ -1356,7 +1356,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx)
 	if (unlikely(res == TX_DROP)) {
 		I802_DEBUG_INC(tx->local->tx_handlers_drop);
 		if (tx->skb)
-			dev_kfree_skb(tx->skb);
+			ieee80211_free_txskb(&tx->local->hw, tx->skb);
 		else
 			__skb_queue_purge(&tx->skbs);
 		return -1;
@@ -1393,7 +1393,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
 	res_prepare = ieee80211_tx_prepare(sdata, &tx, skb);
 
 	if (unlikely(res_prepare == TX_DROP)) {
-		dev_kfree_skb(skb);
+		ieee80211_free_txskb(&local->hw, skb);
 		goto out;
 	} else if (unlikely(res_prepare == TX_QUEUED)) {
 		goto out;
@@ -1465,7 +1465,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
 	headroom = max_t(int, 0, headroom);
 
 	if (ieee80211_skb_resize(sdata, skb, headroom, may_encrypt)) {
-		dev_kfree_skb(skb);
+		ieee80211_free_txskb(&local->hw, skb);
 		rcu_read_unlock();
 		return;
 	}
@@ -2050,8 +2050,10 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
 		head_need += IEEE80211_ENCRYPT_HEADROOM;
 		head_need += local->tx_headroom;
 		head_need = max_t(int, 0, head_need);
-		if (ieee80211_skb_resize(sdata, skb, head_need, true))
-			goto fail;
+		if (ieee80211_skb_resize(sdata, skb, head_need, true)) {
+			ieee80211_free_txskb(&local->hw, skb);
+			return NETDEV_TX_OK;
+		}
 	}
 
 	if (encaps_data) {
@@ -2184,7 +2186,7 @@ void ieee80211_tx_pending(unsigned long data)
 			struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
 			if (WARN_ON(!info->control.vif)) {
-				kfree_skb(skb);
+				ieee80211_free_txskb(&local->hw, skb);
 				continue;
 			}
 
-- 
cgit v1.2.3


From 48cc32d38a52d0b68f91a171a8d00531edc6a46e Mon Sep 17 00:00:00 2001
From: Florian Zumbiehl <florz@florz.de>
Date: Sun, 7 Oct 2012 15:51:58 +0000
Subject: vlan: don't deliver frames for unknown vlans to protocols

6a32e4f9dd9219261f8856f817e6655114cfec2f made the vlan code skip marking
vlan-tagged frames for not locally configured vlans as PACKET_OTHERHOST if
there was an rx_handler, as the rx_handler could cause the frame to be received
on a different (virtual) vlan-capable interface where that vlan might be
configured.

As rx_handlers do not necessarily return RX_HANDLER_ANOTHER, this could cause
frames for unknown vlans to be delivered to the protocol stack as if they had
been received untagged.

For example, if an ipv6 router advertisement that's tagged for a locally not
configured vlan is received on an interface with macvlan interfaces attached,
macvlan's rx_handler returns RX_HANDLER_PASS after delivering the frame to the
macvlan interfaces, which caused it to be passed to the protocol stack, leading
to ipv6 addresses for the announced prefix being configured even though those
are completely unusable on the underlying interface.

The fix moves marking as PACKET_OTHERHOST after the rx_handler so the
rx_handler, if there is one, sees the frame unchanged, but afterwards,
before the frame is delivered to the protocol stack, it gets marked whether
there is an rx_handler or not.

Signed-off-by: Florian Zumbiehl <florz@florz.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  8 ++++----
 net/8021q/vlan_core.c   | 10 ++--------
 net/core/dev.c          |  7 +++++--
 3 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index e6ff12dd717b..c0ff748d0aa5 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -80,6 +80,8 @@ static inline int is_vlan_dev(struct net_device *dev)
 }
 
 #define vlan_tx_tag_present(__skb)	((__skb)->vlan_tci & VLAN_TAG_PRESENT)
+#define vlan_tx_nonzero_tag_present(__skb) \
+	(vlan_tx_tag_present(__skb) && ((__skb)->vlan_tci & VLAN_VID_MASK))
 #define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci & ~VLAN_TAG_PRESENT)
 
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
@@ -89,7 +91,7 @@ extern struct net_device *__vlan_find_dev_deep(struct net_device *real_dev,
 extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
 extern u16 vlan_dev_vlan_id(const struct net_device *dev);
 
-extern bool vlan_do_receive(struct sk_buff **skb, bool last_handler);
+extern bool vlan_do_receive(struct sk_buff **skb);
 extern struct sk_buff *vlan_untag(struct sk_buff *skb);
 
 extern int vlan_vid_add(struct net_device *dev, unsigned short vid);
@@ -120,10 +122,8 @@ static inline u16 vlan_dev_vlan_id(const struct net_device *dev)
 	return 0;
 }
 
-static inline bool vlan_do_receive(struct sk_buff **skb, bool last_handler)
+static inline bool vlan_do_receive(struct sk_buff **skb)
 {
-	if (((*skb)->vlan_tci & VLAN_VID_MASK) && last_handler)
-		(*skb)->pkt_type = PACKET_OTHERHOST;
 	return false;
 }
 
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index add69d0fd99d..fbbf1fa00940 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -5,7 +5,7 @@
 #include <linux/export.h>
 #include "vlan.h"
 
-bool vlan_do_receive(struct sk_buff **skbp, bool last_handler)
+bool vlan_do_receive(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;
@@ -13,14 +13,8 @@ bool vlan_do_receive(struct sk_buff **skbp, bool last_handler)
 	struct vlan_pcpu_stats *rx_stats;
 
 	vlan_dev = vlan_find_dev(skb->dev, vlan_id);
-	if (!vlan_dev) {
-		/* Only the last call to vlan_do_receive() should change
-		 * pkt_type to PACKET_OTHERHOST
-		 */
-		if (vlan_id && last_handler)
-			skb->pkt_type = PACKET_OTHERHOST;
+	if (!vlan_dev)
 		return false;
-	}
 
 	skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
 	if (unlikely(!skb))
diff --git a/net/core/dev.c b/net/core/dev.c
index d44668f63c88..09cb3f6dc40c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3300,18 +3300,18 @@ ncls:
 				&& !skb_pfmemalloc_protocol(skb))
 		goto drop;
 
-	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (vlan_tx_tag_present(skb)) {
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 			pt_prev = NULL;
 		}
-		if (vlan_do_receive(&skb, !rx_handler))
+		if (vlan_do_receive(&skb))
 			goto another_round;
 		else if (unlikely(!skb))
 			goto unlock;
 	}
 
+	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
 		if (pt_prev) {
 			ret = deliver_skb(skb, pt_prev, orig_dev);
@@ -3331,6 +3331,9 @@ ncls:
 		}
 	}
 
+	if (vlan_tx_nonzero_tag_present(skb))
+		skb->pkt_type = PACKET_OTHERHOST;
+
 	/* deliver only exact match when indicated */
 	null_or_dev = deliver_exact ? skb->dev : NULL;
 
-- 
cgit v1.2.3


From 863472454ce50d4ef0929c6aa738cc5d64b84679 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 8 Oct 2012 21:38:50 +0200
Subject: ipv6: gro: fix IPV6_GRO_CB(skb)->proto problem

It seems IPV6_GRO_CB(skb)->proto can be destroyed in skb_gro_receive()
if a new skb is allocated (to serve as an anchor for frag_list)

We copy NAPI_GRO_CB() only (not the IPV6 specific part) in :

*NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p);

So we leave IPV6_GRO_CB(nskb)->proto to 0 (fresh skb allocation) instead
of IPPROTO_TCP (6)

As a result, ipv6_gro_complete() isn't able to call ops->gro_complete()
[ tcp6_gro_complete() ]

Fix this by moving proto into NAPI_GRO_CB() and getting rid of
IPV6_GRO_CB

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 net/ipv6/af_inet6.c       | 11 ++---------
 2 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0a36fff75bd5..561c8bc8976d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1513,6 +1513,9 @@ struct napi_gro_cb {
 
 	/* jiffies when first packet was created/queued */
 	unsigned long age;
+
+	/* Used in ipv6_gro_receive() */
+	int	proto;
 };
 
 #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index f757e3b7cfbf..a974247a9ae4 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -822,13 +822,6 @@ out:
 	return segs;
 }
 
-struct ipv6_gro_cb {
-	struct napi_gro_cb napi;
-	int proto;
-};
-
-#define IPV6_GRO_CB(skb) ((struct ipv6_gro_cb *)(skb)->cb)
-
 static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 					 struct sk_buff *skb)
 {
@@ -874,7 +867,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 		iph = ipv6_hdr(skb);
 	}
 
-	IPV6_GRO_CB(skb)->proto = proto;
+	NAPI_GRO_CB(skb)->proto = proto;
 
 	flush--;
 	nlen = skb_network_header_len(skb);
@@ -930,7 +923,7 @@ static int ipv6_gro_complete(struct sk_buff *skb)
 				 sizeof(*iph));
 
 	rcu_read_lock();
-	ops = rcu_dereference(inet6_protos[IPV6_GRO_CB(skb)->proto]);
+	ops = rcu_dereference(inet6_protos[NAPI_GRO_CB(skb)->proto]);
 	if (WARN_ON(!ops || !ops->gro_complete))
 		goto out_unlock;
 
-- 
cgit v1.2.3


From e81da0e113a1b7fc7449ae6213f65f89ccac6d06 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 8 Oct 2012 11:41:15 +0000
Subject: ipv4: fix sending of redirects

After "Cache input routes in fib_info nexthops" (commit
d2d68ba9fe) and "Elide fib_validate_source() completely when possible"
(commit 7a9bc9b81a) we cannot send ICMP redirects. It seems we
should not cache the RTCF_DOREDIRECT flag in nh_rth_input because
the same fib_info can be used for traffic that is not redirected,
e.g. from other input devices or from sources that are not in the same subnet.

	As a result, we have to disable the caching of the RTCF_DOREDIRECT
flag and force source validation for the case when forwarding
traffic to the input device. If traffic comes from a directly connected
source, we allow redirection as was done before both changes.

	Avoid setting RTCF_DOREDIRECT if IN_DEV_TX_REDIRECTS
is disabled; this can avoid source address validation and
help with caching the routes.

	After the change "Adjust semantics of rt->rt_gateway"
(commit f8126f1d51) we should make sure our ICMP_REDIR_HOST messages
contain daddr instead of 0.0.0.0 when the target is directly connected.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c |  3 ++-
 net/ipv4/route.c        | 30 ++++++++++++++++--------------
 2 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 68c93d1bb03a..825c608826de 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -322,7 +322,8 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 {
 	int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
 
-	if (!r && !fib_num_tclassid_users(dev_net(dev))) {
+	if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
+	    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
 		*itag = 0;
 		return 0;
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 132e0dfee53a..b90da1bc2704 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -802,7 +802,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	net = dev_net(rt->dst.dev);
 	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 	if (!peer) {
-		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
+			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 		return;
 	}
 
@@ -827,7 +828,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 	    time_after(jiffies,
 		       (peer->rate_last +
 			(ip_rt_redirect_load << peer->rate_tokens)))) {
-		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
+
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 		peer->rate_last = jiffies;
 		++peer->rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
@@ -835,7 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 		    peer->rate_tokens == ip_rt_redirect_number)
 			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 					     &ip_hdr(skb)->saddr, inet_iif(skb),
-					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
+					     &ip_hdr(skb)->daddr, &gw);
 #endif
 	}
 out_put_peer:
@@ -1442,10 +1445,13 @@ static int __mkroute_input(struct sk_buff *skb,
 		goto cleanup;
 	}
 
-	if (out_dev == in_dev && err &&
+	do_cache = res->fi && !itag;
+	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
-	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
 		flags |= RTCF_DOREDIRECT;
+		do_cache = false;
+	}
 
 	if (skb->protocol != htons(ETH_P_IP)) {
 		/* Not IP (i.e. ARP). Do not create route, if it is
@@ -1462,15 +1468,11 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
-	do_cache = false;
-	if (res->fi) {
-		if (!itag) {
-			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
-			if (rt_cache_valid(rth)) {
-				skb_dst_set_noref(skb, &rth->dst);
-				goto out;
-			}
-			do_cache = true;
+	if (do_cache) {
+		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+		if (rt_cache_valid(rth)) {
+			skb_dst_set_noref(skb, &rth->dst);
+			goto out;
 		}
 	}
 
-- 
cgit v1.2.3


From e0adef0f7456d5d3a3bfe8ea61c7dddf146b40e1 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 8 Oct 2012 11:41:16 +0000
Subject: ipv4: fix forwarding for strict source routes

After the change "Adjust semantics of rt->rt_gateway"
(commit f8126f1d51) rt_gateway can be 0 but ip_forward() compares
it directly with nexthop. What we want here is to check if traffic
is to a directly connected nexthop and to fail if using a gateway.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_forward.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index ab09b126423c..7f35ac26a71a 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb)
 
 	rt = skb_rtable(skb);
 
-	if (opt->is_strictroute && opt->nexthop != rt->rt_gateway)
+	if (opt->is_strictroute && rt->rt_gateway)
 		goto sr_failed;
 
 	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
-- 
cgit v1.2.3


From f8a17175c63fd3e8b573719f7538816f8c96abf4 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 8 Oct 2012 11:41:17 +0000
Subject: ipv4: make sure nh_pcpu_rth_output is always allocated

Avoid checking nh_pcpu_rth_output in fast path,
abort fib_info creation on alloc_percpu failure.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 ++
 net/ipv4/route.c         | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 267753060ffc..71b125cd5db1 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -840,6 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	change_nexthops(fi) {
 		nexthop_nh->nh_parent = fi;
 		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
+		if (!nexthop_nh->nh_pcpu_rth_output)
+			goto failure;
 	} endfor_nexthops(fi)
 
 	if (cfg->fc_mx) {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b90da1bc2704..5b0180f11b20 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1207,8 +1207,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 	if (rt_is_input_route(rt)) {
 		p = (struct rtable **)&nh->nh_rth_input;
 	} else {
-		if (!nh->nh_pcpu_rth_output)
-			goto nocache;
 		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
 	}
 	orig = *p;
@@ -1223,7 +1221,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 		 * unsuccessful at storing this route into the cache
 		 * we really need to set it.
 		 */
-nocache:
 		rt->dst.flags |= DST_NOCACHE;
 		ret = false;
 	}
-- 
cgit v1.2.3


From 155e8336c373d14d87a7f91e356d85ef4b93b8f9 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 8 Oct 2012 11:41:18 +0000
Subject: ipv4: introduce rt_uses_gateway

Add a new flag to remember when a route is via a gateway.
We will use it to allow rt_gateway to contain the address of a
directly connected host for the cases when DST_NOCACHE is
used or when the NH exception caches a per-destination route
without the DST_NOCACHE flag, i.e. when routes are not used for
other destinations. This way we force neighbour
resolution to work with the routed destination, but we
can use a different address in the packet, a feature needed
for IPVS-DR, where the original packet for the virtual IP is routed
via the route to the real IP.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h             |  3 ++-
 net/ipv4/inet_connection_sock.c |  4 ++--
 net/ipv4/ip_forward.c           |  2 +-
 net/ipv4/ip_output.c            |  4 ++--
 net/ipv4/route.c                | 48 ++++++++++++++++++++++-------------------
 net/ipv4/xfrm4_policy.c         |  1 +
 6 files changed, 34 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/net/route.h b/include/net/route.h
index da22243d2760..bc40b633a5c4 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -48,7 +48,8 @@ struct rtable {
 	int			rt_genid;
 	unsigned int		rt_flags;
 	__u16			rt_type;
-	__u16			rt_is_input;
+	__u8			rt_is_input;
+	__u8			rt_uses_gateway;
 
 	int			rt_iif;
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f0c5b9c1a957..d34ce2972c8f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -406,7 +406,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && rt->rt_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 		goto route_err;
 	return &rt->dst;
 
@@ -442,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
 		goto no_route;
-	if (opt && opt->opt.is_strictroute && rt->rt_gateway)
+	if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
 		goto route_err;
 	rcu_read_unlock();
 	return &rt->dst;
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 7f35ac26a71a..694de3b7aebf 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb)
 
 	rt = skb_rtable(skb);
 
-	if (opt->is_strictroute && rt->rt_gateway)
+	if (opt->is_strictroute && rt->rt_uses_gateway)
 		goto sr_failed;
 
 	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 24a29a39e9a8..6537a408a4fb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -193,7 +193,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	}
 
 	rcu_read_lock_bh();
-	nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
+	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
 	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
@@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
 	skb_dst_set_noref(skb, &rt->dst);
 
 packet_routed:
-	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway)
+	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
 		goto no_route;
 
 	/* OK, we know where to send it, allocate and build IP header. */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5b0180f11b20..3a116cb0991a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1126,7 +1126,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	mtu = dst->dev->mtu;
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
-		if (rt->rt_gateway && mtu > 576)
+		if (rt->rt_uses_gateway && mtu > 576)
 			mtu = 576;
 	}
 
@@ -1177,7 +1177,9 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 		if (fnhe->fnhe_gw) {
 			rt->rt_flags |= RTCF_REDIRECTED;
 			rt->rt_gateway = fnhe->fnhe_gw;
-		}
+			rt->rt_uses_gateway = 1;
+		} else if (!rt->rt_gateway)
+			rt->rt_gateway = daddr;
 
 		orig = rcu_dereference(fnhe->fnhe_rth);
 		rcu_assign_pointer(fnhe->fnhe_rth, rt);
@@ -1186,13 +1188,6 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 
 		fnhe->fnhe_stamp = jiffies;
 		ret = true;
-	} else {
-		/* Routes we intend to cache in nexthop exception have
-		 * the DST_NOCACHE bit clear.  However, if we are
-		 * unsuccessful at storing this route into the cache
-		 * we really need to set it.
-		 */
-		rt->dst.flags |= DST_NOCACHE;
 	}
 	spin_unlock_bh(&fnhe_lock);
 
@@ -1215,15 +1210,8 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 	if (prev == orig) {
 		if (orig)
 			rt_free(orig);
-	} else {
-		/* Routes we intend to cache in the FIB nexthop have
-		 * the DST_NOCACHE bit clear.  However, if we are
-		 * unsuccessful at storing this route into the cache
-		 * we really need to set it.
-		 */
-		rt->dst.flags |= DST_NOCACHE;
+	} else
 		ret = false;
-	}
 
 	return ret;
 }
@@ -1284,8 +1272,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 	if (fi) {
 		struct fib_nh *nh = &FIB_RES_NH(*res);
 
-		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
 			rt->rt_gateway = nh->nh_gw;
+			rt->rt_uses_gateway = 1;
+		}
 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
@@ -1294,8 +1284,18 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 			cached = rt_bind_exception(rt, fnhe, daddr);
 		else if (!(rt->dst.flags & DST_NOCACHE))
 			cached = rt_cache_route(nh, rt);
-	}
-	if (unlikely(!cached))
+		if (unlikely(!cached)) {
+			/* Routes we intend to cache in nexthop exception or
+			 * FIB nexthop have the DST_NOCACHE bit clear.
+			 * However, if we are unsuccessful at storing this
+			 * route into the cache we really need to set it.
+			 */
+			rt->dst.flags |= DST_NOCACHE;
+			if (!rt->rt_gateway)
+				rt->rt_gateway = daddr;
+			rt_add_uncached_list(rt);
+		}
+	} else
 		rt_add_uncached_list(rt);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1363,6 +1363,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_iif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
+	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
 	if (our) {
 		rth->dst.input= ip_local_deliver;
@@ -1432,7 +1433,6 @@ static int __mkroute_input(struct sk_buff *skb,
 		return -EINVAL;
 	}
 
-
 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
 				  in_dev->dev, in_dev, &itag);
 	if (err < 0) {
@@ -1488,6 +1488,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_iif 	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
+	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
 
 	rth->dst.input = ip_forward;
@@ -1658,6 +1659,7 @@ local_input:
 	rth->rt_iif	= 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway	= 0;
+	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -1826,6 +1828,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	rth->rt_iif	= orig_oif ? : 0;
 	rth->rt_pmtu	= 0;
 	rth->rt_gateway = 0;
+	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
 
 	RT_CACHE_STAT_INC(out_slow_tot);
@@ -2104,6 +2107,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_gateway = ort->rt_gateway;
+		rt->rt_uses_gateway = ort->rt_uses_gateway;
 
 		INIT_LIST_HEAD(&rt->rt_uncached);
 
@@ -2182,7 +2186,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
 			goto nla_put_failure;
 	}
-	if (rt->rt_gateway &&
+	if (rt->rt_uses_gateway &&
 	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 		goto nla_put_failure;
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 681ea2f413e2..05c5ab8d983c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -91,6 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 					      RTCF_LOCAL);
 	xdst->u.rt.rt_type = rt->rt_type;
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
+	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
 
-- 
cgit v1.2.3


From c92b96553a80c1dbe2ebe128bbe37c8f98f148bf Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 8 Oct 2012 11:41:19 +0000
Subject: ipv4: Add FLOWI_FLAG_KNOWN_NH

Add a flag to request that the output route be
returned with a known rt_gateway, in case we want to use
it as the nexthop for neighbour resolution.

	The returned route can be cached as follows:

- in NH exception: because the cached routes are not shared
	with other destinations
- in FIB NH: when using gateway because all destinations for
	NH share same gateway

	As a last option, to return rt_gateway != 0 we have to
set DST_NOCACHE.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h |  1 +
 net/ipv4/route.c   | 21 +++++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/include/net/flow.h b/include/net/flow.h
index e1dd5082ec7e..628e11b98c58 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -21,6 +21,7 @@ struct flowi_common {
 	__u8	flowic_flags;
 #define FLOWI_FLAG_ANYSRC		0x01
 #define FLOWI_FLAG_CAN_SLEEP		0x02
+#define FLOWI_FLAG_KNOWN_NH		0x04
 	__u32	flowic_secid;
 };
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3a116cb0991a..1a0da8dc8180 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1762,6 +1762,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	struct in_device *in_dev;
 	u16 type = res->type;
 	struct rtable *rth;
+	bool do_cache;
 
 	in_dev = __in_dev_get_rcu(dev_out);
 	if (!in_dev)
@@ -1798,24 +1799,36 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	}
 
 	fnhe = NULL;
+	do_cache = fi != NULL;
 	if (fi) {
 		struct rtable __rcu **prth;
+		struct fib_nh *nh = &FIB_RES_NH(*res);
 
-		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
+		fnhe = find_exception(nh, fl4->daddr);
 		if (fnhe)
 			prth = &fnhe->fnhe_rth;
-		else
-			prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
+		else {
+			if (unlikely(fl4->flowi4_flags &
+				     FLOWI_FLAG_KNOWN_NH &&
+				     !(nh->nh_gw &&
+				       nh->nh_scope == RT_SCOPE_LINK))) {
+				do_cache = false;
+				goto add;
+			}
+			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
+		}
 		rth = rcu_dereference(*prth);
 		if (rt_cache_valid(rth)) {
 			dst_hold(&rth->dst);
 			return rth;
 		}
 	}
+
+add:
 	rth = rt_dst_alloc(dev_out,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
 			   IN_DEV_CONF_GET(in_dev, NOXFRM),
-			   fi);
+			   do_cache);
 	if (!rth)
 		return ERR_PTR(-ENOBUFS);
 
-- 
cgit v1.2.3


From ad4d3ef8b7eb527cca478dc08c02c10936e64115 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 8 Oct 2012 11:41:20 +0000
Subject: ipvs: fix ARP resolving for direct routing mode

After the change "Make neigh lookups directly in output packet path"
(commit a263b30936) IPVS can not reach the real server for DR mode
because we resolve the destination address from IP header, not from
route neighbour. Use the new FLOWI_FLAG_KNOWN_NH flag to request
output routes with known nexthop, so that it has preference
on resolving.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 56f6d5d81a77..cc4c8095681a 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -50,6 +50,7 @@ enum {
 				      * local
 				      */
 	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
+	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
 };
 
 /*
@@ -113,6 +114,8 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
 	fl4.daddr = daddr;
 	fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
 	fl4.flowi4_tos = rtos;
+	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
+			   FLOWI_FLAG_KNOWN_NH : 0;
 
 retry:
 	rt = ip_route_output_key(net, &fl4);
@@ -1061,7 +1064,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
 				      RT_TOS(iph->tos),
 				      IP_VS_RT_MODE_LOCAL |
-					IP_VS_RT_MODE_NON_LOCAL, NULL)))
+				      IP_VS_RT_MODE_NON_LOCAL |
+				      IP_VS_RT_MODE_KNOWN_NH, NULL)))
 		goto tx_error_icmp;
 	if (rt->rt_flags & RTCF_LOCAL) {
 		ip_rt_put(rt);
-- 
cgit v1.2.3


From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Mon, 8 Oct 2012 16:30:32 -0700
Subject: rbtree: empty nodes have no color

Empty nodes have no color.  We can make use of this property to simplify
the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros.  Also,
we can get rid of the rb_init_node function which had been introduced by
commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack
allocated rb nodes") to avoid some issue with the empty node's color not
being initialized.

I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are
doing there, though.  axboe introduced them in commit 10fd48f2376d
("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev").  The way I
see it, the 'empty node' abstraction is only used by rbtree users to
flag nodes that they haven't inserted in any rbtree, so asking the
predecessor or successor of such nodes doesn't make any sense.

One final rb_init_node() caller was recently added in sysctl code to
implement faster sysctl name lookups.  This code doesn't make use of
RB_EMPTY_NODE at all, and from what I could see it only called
rb_init_node() under the mistaken assumption that such initialization was
required before node insertion.

[sfr@canb.auug.org.au: fix net/ceph/osd_client.c build]
Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: John Stultz <john.stultz@linaro.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c      |  4 +---
 include/linux/rbtree.h     | 15 +++++----------
 include/linux/timerqueue.h |  2 +-
 lib/rbtree.c               |  4 ++--
 net/ceph/osd_client.c      |  1 -
 5 files changed, 9 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index dcd56f84db7e..fddc50729632 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -168,10 +168,8 @@ static void init_header(struct ctl_table_header *head,
 	head->node = node;
 	if (node) {
 		struct ctl_table *entry;
-		for (entry = table; entry->procname; entry++, node++) {
-			rb_init_node(&node->node);
+		for (entry = table; entry->procname; entry++, node++)
 			node->header = head;
-		}
 	}
 }
 
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index e6a807720ded..2049087c43b7 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -67,17 +67,12 @@ static inline void rb_set_color(struct rb_node *rb, int color)
 #define RB_ROOT	(struct rb_root) { NULL, }
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
-#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
-#define RB_EMPTY_NODE(node)	(rb_parent(node) == node)
-#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
+#define RB_EMPTY_ROOT(root)  ((root)->rb_node == NULL)
+
+/* 'empty' nodes are nodes that are known not to be inserted in an rbree */
+#define RB_EMPTY_NODE(node)  ((node)->rb_parent_color == (unsigned long)(node))
+#define RB_CLEAR_NODE(node)  ((node)->rb_parent_color = (unsigned long)(node))
 
-static inline void rb_init_node(struct rb_node *rb)
-{
-	rb->rb_parent_color = 0;
-	rb->rb_right = NULL;
-	rb->rb_left = NULL;
-	RB_CLEAR_NODE(rb);
-}
 
 extern void rb_insert_color(struct rb_node *, struct rb_root *);
 extern void rb_erase(struct rb_node *, struct rb_root *);
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index 5088727478fd..a520fd70a59f 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
 
 static inline void timerqueue_init(struct timerqueue_node *node)
 {
-	rb_init_node(&node->node);
+	RB_CLEAR_NODE(&node->node);
 }
 
 static inline void timerqueue_init_head(struct timerqueue_head *head)
diff --git a/lib/rbtree.c b/lib/rbtree.c
index d4175565dc2c..fe43c8c5f527 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -387,7 +387,7 @@ struct rb_node *rb_next(const struct rb_node *node)
 {
 	struct rb_node *parent;
 
-	if (rb_parent(node) == node)
+	if (RB_EMPTY_NODE(node))
 		return NULL;
 
 	/* If we have a right-hand child, go down and then left as far
@@ -416,7 +416,7 @@ struct rb_node *rb_prev(const struct rb_node *node)
 {
 	struct rb_node *parent;
 
-	if (rb_parent(node) == node)
+	if (RB_EMPTY_NODE(node))
 		return NULL;
 
 	/* If we have a left-hand child, go down and then right as far
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ccbdfbba9e53..c1d756cc7448 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -221,7 +221,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
-	rb_init_node(&req->r_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
 	INIT_LIST_HEAD(&req->r_linger_osd);
-- 
cgit v1.2.3


From 5175a5e76bbdf20a614fb47ce7a38f0f39e70226 Mon Sep 17 00:00:00 2001
From: "jeff.liu" <jeff.liu@oracle.com>
Date: Mon, 8 Oct 2012 18:57:27 +0000
Subject: RDS: fix rds-ping spinlock recursion

This is the revised patch for fixing rds-ping spinlock recursion
according to Venkat's suggestions.

The RDS ping/pong over TCP feature has been broken for years (2.6.39 to
3.6.0) since we have to set TCP cork and call kernel_sendmsg() between
ping/pong, both of which need to lock "struct sock *sk". However, this
lock is already held before the rds_tcp_data_ready() callback is
triggered. As a result, we always hit spinlock recursion, which
results in a system panic.

Given that RDS ping is only used to test the connectivity and not for
serious performance measurements, we can queue the pong transmit to
rds_wq as a delayed response.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
CC: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
CC: David S. Miller <davem@davemloft.net>
CC: James Morris <james.l.morris@oracle.com>
Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 96531d4033a2..88eace57dd6b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1122,7 +1122,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
 	rds_stats_inc(s_send_pong);
 
 	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
-		rds_send_xmit(conn);
+		queue_delayed_work(rds_wq, &conn->c_send_w, 0);
 
 	rds_message_put(rm);
 	return 0;
-- 
cgit v1.2.3


From 5aa8b572007c4bca1e6d3dd4c4820f1ae49d6bb2 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 9 Oct 2012 17:48:16 +0000
Subject: pktgen: fix crash when generating IPv6 packets

For IPv6, sizeof(struct ipv6hdr) = 40, thus the following
expression will yield a negative value:

        datalen = pkt_dev->cur_pkt_size - 14 -
                  sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
                  pkt_dev->pkt_overhead;

And the check "if (datalen < sizeof(struct pktgen_hdr))" will not
catch this, as "datalen" is promoted to unsigned, which will cause
a crash later.

This is a quick fix by checking if "datalen" is negative. The following
patch will increase the default value of 'min_pkt_size' for IPv6.

This bug has probably existed for a long time, so Cc -stable too.

Cc: <stable@vger.kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 148e73d2c451..e356b8d52bad 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2927,7 +2927,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 		  sizeof(struct ipv6hdr) - sizeof(struct udphdr) -
 		  pkt_dev->pkt_overhead;
 
-	if (datalen < sizeof(struct pktgen_hdr)) {
+	if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) {
 		datalen = sizeof(struct pktgen_hdr);
 		net_info_ratelimited("increased datalen to %d\n", datalen);
 	}
-- 
cgit v1.2.3


From 68bf9f0b91ed4440951312cf7d4ffa70b9e8cf73 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 9 Oct 2012 17:48:17 +0000
Subject: pktgen: set different default min_pkt_size for different protocols

ETH_ZLEN is too small for IPv6, so this default value is not
suitable.

Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index e356b8d52bad..98ee54963553 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -248,8 +248,8 @@ struct pktgen_dev {
 	int removal_mark;	/* non-zero => the device is marked for
 				 * removal by worker thread */
 
-	int min_pkt_size;	/* = ETH_ZLEN; */
-	int max_pkt_size;	/* = ETH_ZLEN; */
+	int min_pkt_size;
+	int max_pkt_size;
 	int pkt_overhead;	/* overhead for MPLS, VLANs, IPSEC etc */
 	int nfrags;
 	struct page *page;
@@ -2036,10 +2036,14 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 	/* Set up Dest MAC */
 	memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
 
-	/* Set up pkt size */
-	pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
-
 	if (pkt_dev->flags & F_IPV6) {
+		if (pkt_dev->min_pkt_size == 0) {
+			pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr)
+						+ sizeof(struct udphdr)
+						+ sizeof(struct pktgen_hdr)
+						+ pkt_dev->pkt_overhead;
+		}
+
 		/*
 		 * Skip this automatic address setting until locks or functions
 		 * gets exported
@@ -2086,6 +2090,13 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 		}
 #endif
 	} else {
+		if (pkt_dev->min_pkt_size == 0) {
+			pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr)
+						+ sizeof(struct udphdr)
+						+ sizeof(struct pktgen_hdr)
+						+ pkt_dev->pkt_overhead;
+		}
+
 		pkt_dev->saddr_min = 0;
 		pkt_dev->saddr_max = 0;
 		if (strlen(pkt_dev->src_min) == 0) {
@@ -2111,6 +2122,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 		pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
 	}
 	/* Initialize current values. */
+	pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size;
+	if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size)
+		pkt_dev->max_pkt_size = pkt_dev->min_pkt_size;
+
 	pkt_dev->cur_dst_mac_offset = 0;
 	pkt_dev->cur_src_mac_offset = 0;
 	pkt_dev->cur_saddr = pkt_dev->saddr_min;
@@ -3548,8 +3563,6 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	}
 
 	pkt_dev->removal_mark = 0;
-	pkt_dev->min_pkt_size = ETH_ZLEN;
-	pkt_dev->max_pkt_size = ETH_ZLEN;
 	pkt_dev->nfrags = 0;
 	pkt_dev->delay = pg_delay_d;
 	pkt_dev->count = pg_count_d;
-- 
cgit v1.2.3


From 0373a94671be1f5c823dbfc03617418d8effd5ce Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 9 Oct 2012 17:48:18 +0000
Subject: pktgen: display IPv4 address in human-readable format

It is weird to display the IPv4 address in %x format; what's more,
the IPv6 address is already displayed in human-readable format. So
make the IPv4 address human-readable too.

Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 98ee54963553..f9b4637e9f24 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -702,8 +702,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
 				&pkt_dev->cur_in6_saddr,
 				&pkt_dev->cur_in6_daddr);
 	} else
-		seq_printf(seq, "     cur_saddr: 0x%x  cur_daddr: 0x%x\n",
-			   pkt_dev->cur_saddr, pkt_dev->cur_daddr);
+		seq_printf(seq, "     cur_saddr: %pI4  cur_daddr: %pI4\n",
+			   &pkt_dev->cur_saddr, &pkt_dev->cur_daddr);
 
 	seq_printf(seq, "     cur_udp_dst: %d  cur_udp_src: %d\n",
 		   pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src);
-- 
cgit v1.2.3


From 4c139b8ccebaecdfad58eb068d61ef386f1a58ed Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 9 Oct 2012 17:48:19 +0000
Subject: pktgen: enable automatic IPv6 address setting

Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f9b4637e9f24..47fe18e6a8e4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2037,6 +2037,9 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 	memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN);
 
 	if (pkt_dev->flags & F_IPV6) {
+		int i, set = 0, err = 1;
+		struct inet6_dev *idev;
+
 		if (pkt_dev->min_pkt_size == 0) {
 			pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr)
 						+ sizeof(struct udphdr)
@@ -2044,15 +2047,6 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 						+ pkt_dev->pkt_overhead;
 		}
 
-		/*
-		 * Skip this automatic address setting until locks or functions
-		 * gets exported
-		 */
-
-#ifdef NOTNOW
-		int i, set = 0, err = 1;
-		struct inet6_dev *idev;
-
 		for (i = 0; i < IN6_ADDR_HSIZE; i++)
 			if (pkt_dev->cur_in6_saddr.s6_addr[i]) {
 				set = 1;
@@ -2073,9 +2067,8 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 				struct inet6_ifaddr *ifp;
 
 				read_lock_bh(&idev->lock);
-				for (ifp = idev->addr_list; ifp;
-				     ifp = ifp->if_next) {
-					if (ifp->scope == IFA_LINK &&
+				list_for_each_entry(ifp, &idev->addr_list, if_list) {
+					if ((ifp->scope & IFA_LINK) &&
 					    !(ifp->flags & IFA_F_TENTATIVE)) {
 						pkt_dev->cur_in6_saddr = ifp->addr;
 						err = 0;
@@ -2088,7 +2081,6 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
 			if (err)
 				pr_err("ERROR: IPv6 link address not available\n");
 		}
-#endif
 	} else {
 		if (pkt_dev->min_pkt_size == 0) {
 			pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr)
-- 
cgit v1.2.3


From c468fb1375f1b4de851e3e0dbe9d1293d414a160 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 9 Oct 2012 17:48:20 +0000
Subject: pktgen: replace scan_ip6() with in6_pton()

Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 101 +++---------------------------------------------------
 1 file changed, 4 insertions(+), 97 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 47fe18e6a8e4..d1dc14c2aac4 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -449,8 +449,6 @@ static void pktgen_stop_all_threads_ifs(void);
 static void pktgen_stop(struct pktgen_thread *t);
 static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
 
-static unsigned int scan_ip6(const char *s, char ip[16]);
-
 /* Module parameters, defaults. */
 static int pg_count_d __read_mostly = 1000;
 static int pg_delay_d __read_mostly;
@@ -1299,7 +1297,7 @@ static ssize_t pktgen_if_write(struct file *file,
 			return -EFAULT;
 		buf[len] = 0;
 
-		scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
+		in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL);
 		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
 
 		pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
@@ -1322,7 +1320,7 @@ static ssize_t pktgen_if_write(struct file *file,
 			return -EFAULT;
 		buf[len] = 0;
 
-		scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
+		in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL);
 		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
 
 		pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
@@ -1344,7 +1342,7 @@ static ssize_t pktgen_if_write(struct file *file,
 			return -EFAULT;
 		buf[len] = 0;
 
-		scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr);
+		in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL);
 		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr);
 
 		if (debug)
@@ -1365,7 +1363,7 @@ static ssize_t pktgen_if_write(struct file *file,
 			return -EFAULT;
 		buf[len] = 0;
 
-		scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
+		in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL);
 		snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
 
 		pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
@@ -2765,97 +2763,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	return skb;
 }
 
-/*
- * scan_ip6, fmt_ip taken from dietlibc-0.21
- * Author Felix von Leitner <felix-dietlibc@fefe.de>
- *
- * Slightly modified for kernel.
- * Should be candidate for net/ipv4/utils.c
- * --ro
- */
-
-static unsigned int scan_ip6(const char *s, char ip[16])
-{
-	unsigned int i;
-	unsigned int len = 0;
-	unsigned long u;
-	char suffix[16];
-	unsigned int prefixlen = 0;
-	unsigned int suffixlen = 0;
-	__be32 tmp;
-	char *pos;
-
-	for (i = 0; i < 16; i++)
-		ip[i] = 0;
-
-	for (;;) {
-		if (*s == ':') {
-			len++;
-			if (s[1] == ':') {	/* Found "::", skip to part 2 */
-				s += 2;
-				len++;
-				break;
-			}
-			s++;
-		}
-
-		u = simple_strtoul(s, &pos, 16);
-		i = pos - s;
-		if (!i)
-			return 0;
-		if (prefixlen == 12 && s[i] == '.') {
-
-			/* the last 4 bytes may be written as IPv4 address */
-
-			tmp = in_aton(s);
-			memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp));
-			return i + len;
-		}
-		ip[prefixlen++] = (u >> 8);
-		ip[prefixlen++] = (u & 255);
-		s += i;
-		len += i;
-		if (prefixlen == 16)
-			return len;
-	}
-
-/* part 2, after "::" */
-	for (;;) {
-		if (*s == ':') {
-			if (suffixlen == 0)
-				break;
-			s++;
-			len++;
-		} else if (suffixlen != 0)
-			break;
-
-		u = simple_strtol(s, &pos, 16);
-		i = pos - s;
-		if (!i) {
-			if (*s)
-				len--;
-			break;
-		}
-		if (suffixlen + prefixlen <= 12 && s[i] == '.') {
-			tmp = in_aton(s);
-			memcpy((struct in_addr *)(suffix + suffixlen), &tmp,
-			       sizeof(tmp));
-			suffixlen += 4;
-			len += strlen(s);
-			break;
-		}
-		suffix[suffixlen++] = (u >> 8);
-		suffix[suffixlen++] = (u & 255);
-		s += i;
-		len += i;
-		if (prefixlen + suffixlen == 16)
-			break;
-	}
-	for (i = 0; i < suffixlen; i++)
-		ip[16 - suffixlen + i] = suffix[i];
-	return len;
-}
-
 static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 					struct pktgen_dev *pkt_dev)
 {
-- 
cgit v1.2.3


From 6caab7b0544e83e6c160b5e80f5a4a7dd69545c7 Mon Sep 17 00:00:00 2001
From: Sarveshwar Bandi <sarveshwar.bandi@emulex.com>
Date: Wed, 10 Oct 2012 01:15:01 +0000
Subject: bridge: Pull ip header into skb->data before looking into ip header.

If lower layer driver leaves the ip header in the skb fragment, it needs to
be first pulled into skb->data before inspecting ip header length or ip version
number.

Signed-off-by: Sarveshwar Bandi <sarveshwar.bandi@emulex.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netfilter.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 68e8f364bbf8..fe43bc7b063f 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -265,6 +265,9 @@ static int br_parse_ip_options(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	u32 len;
 
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto inhdr_error;
+
 	iph = ip_hdr(skb);
 	opt = &(IPCB(skb)->opt);
 
-- 
cgit v1.2.3


From 68aaed54e7682aef261d5c2cf99e85a9dbf33a84 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Wed, 10 Oct 2012 08:27:25 +0000
Subject: ipv4: fix route mark sparse warning

Sparse complains about RTA_MARK, which should be in host order according
to the include file and its usage in iproute.

net/ipv4/route.c:2223:46: warning: incorrect type in argument 3 (different base types)
net/ipv4/route.c:2223:46:    expected restricted __be32 [usertype] value
net/ipv4/route.c:2223:46:    got unsigned int [unsigned] [usertype] flowic_mark

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1a0da8dc8180..432f4bb77238 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2220,7 +2220,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
 		goto nla_put_failure;
 
 	if (fl4->flowi4_mark &&
-	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
+	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
 		goto nla_put_failure;
 
 	error = rt->dst.error;
-- 
cgit v1.2.3


From 759f42987f98915764bad922ee123acb0eadbe33 Mon Sep 17 00:00:00 2001
From: Simon Derr <simon.derr@bull.net>
Date: Mon, 17 Sep 2012 15:16:31 +0200
Subject: 9P: Fix race between p9_write_work() and p9_fd_request()

Race scenario:

thread A			thread B

p9_write_work()                p9_fd_request()

if (list_empty
  (&m->unsent_req_list))
  ...

                               spin_lock(&client->lock);
                               req->status = REQ_STATUS_UNSENT;
                               list_add_tail(..., &m->unsent_req_list);
                               spin_unlock(&client->lock);
                               ....
                               if (n & POLLOUT &&
                               !test_and_set_bit(Wworksched, &m->wsched)
                               schedule_work(&m->wq);
                               --> not done because Wworksched is set

  clear_bit(Wworksched, &m->wsched);
  return;

--> nobody will take care of sending the new request.

This is not very likely to happen though, because p9_write_work()
being called with an empty unsent_req_list is not frequent.
But this also means that taking the lock earlier will not be costly.

Signed-off-by: Simon Derr <simon.derr@bull.net>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 net/9p/trans_fd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index b2c308fffb8a..0031a8cf145d 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -453,12 +453,13 @@ static void p9_write_work(struct work_struct *work)
 	}
 
 	if (!m->wsize) {
+		spin_lock(&m->client->lock);
 		if (list_empty(&m->unsent_req_list)) {
 			clear_bit(Wworksched, &m->wsched);
+			spin_unlock(&m->client->lock);
 			return;
 		}
 
-		spin_lock(&m->client->lock);
 		req = list_entry(m->unsent_req_list.next, struct p9_req_t,
 			       req_list);
 		req->status = REQ_STATUS_SENT;
-- 
cgit v1.2.3


From 0e24c4fc52b16f0a1102a933f636d2f350c41098 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Oct 2012 06:24:14 +0000
Subject: tcp: sysctl interface leaks 16 bytes of kernel memory

If the rcu_dereference() of tcp_fastopen_ctx ever returns NULL then we
copy 16 bytes of kernel stack into the proc result.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 9205e492dc9d..63d4eccc674d 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -248,6 +248,8 @@ int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
 	ctxt = rcu_dereference(tcp_fastopen_ctx);
 	if (ctxt)
 		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
+	else
+		memset(user_key, 0, sizeof(user_key));
 	rcu_read_unlock();
 
 	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
-- 
cgit v1.2.3


From 4c67525849e0b7f4bd4fab2487ec9e43ea52ef29 Mon Sep 17 00:00:00 2001
From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Fri, 12 Oct 2012 04:34:17 +0000
Subject: tcp: resets are misrouted

After commit e2446eaa ("tcp_v4_send_reset: binding oif to iif in no
sock case"), tcp resets are always lost when routing is asymmetric.
Yes, backing out that patch will result in misrouting of resets for
dead connections which used interface binding when they were alive, but
we actually cannot do anything here.  What's dead is dead, and correct
handling of normal unbound connections is obviously a priority.

Comment to comment:
> This has few benefits:
>   1. tcp_v6_send_reset already did that.

It was done to route resets for IPv6 link local addresses. It was a
mistake to do so for global addresses. The patch fixes this as well.

Actually, the problem appears to be even more serious than guaranteed
loss of resets.  As reported by Sergey Soloviev <sol@eqv.ru>, those
misrouted resets create a lot of arp traffic and a huge number of
unresolved arp entries, bringing NAT firewalls which use asymmetric
routing to their knees.

Signed-off-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
---
 net/ipv4/tcp_ipv4.c | 7 ++++---
 net/ipv6/tcp_ipv6.c | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 75735c9a6a9d..ef998b008a57 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -708,10 +708,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 	/* When socket is gone, all binding information is lost.
-	 * routing might fail in this case. using iif for oif to
-	 * make sure we can deliver it
+	 * routing might fail in this case. No choice here, if we choose to force
+	 * input interface, we will misroute in case of asymmetric route.
 	 */
-	arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
+	if (sk)
+		arg.bound_dev_if = sk->sk_bound_dev_if;
 
 	net = dev_net(skb_dst(skb)->dev);
 	arg.tos = ip_hdr(skb)->tos;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 49c890386ce9..26175bffbaa0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -877,7 +877,8 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
 	__tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
 
 	fl6.flowi6_proto = IPPROTO_TCP;
-	fl6.flowi6_oif = inet6_iif(skb);
+	if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL)
+		fl6.flowi6_oif = inet6_iif(skb);
 	fl6.fl6_dport = t1->dest;
 	fl6.fl6_sport = t1->source;
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
-- 
cgit v1.2.3


From 8437e7610c2d3e06f87f71fb82e10ed4b291812a Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Thu, 11 Oct 2012 12:51:28 +0000
Subject: vti: fix sparse bit endian warnings

Use be32_to_cpu instead of htonl to keep sparse happy.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 978bca4818ae..1831092f999f 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -374,7 +374,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	memset(&fl4, 0, sizeof(fl4));
 	flowi4_init_output(&fl4, tunnel->parms.link,
-			   htonl(tunnel->parms.i_key), RT_TOS(tos),
+			   be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
 			   RT_SCOPE_UNIVERSE,
 			   IPPROTO_IPIP, 0,
 			   dst, tiph->saddr, 0, 0);
@@ -441,7 +441,7 @@ static int vti_tunnel_bind_dev(struct net_device *dev)
 		struct flowi4 fl4;
 		memset(&fl4, 0, sizeof(fl4));
 		flowi4_init_output(&fl4, tunnel->parms.link,
-				   htonl(tunnel->parms.i_key),
+				   be32_to_cpu(tunnel->parms.i_key),
 				   RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
 				   IPPROTO_IPIP, 0,
 				   iph->daddr, iph->saddr, 0, 0);
-- 
cgit v1.2.3


From 28194fcdc150e4d5b418d01db3c29058f60ef32c Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Thu, 11 Oct 2012 21:06:16 +0000
Subject: net: add doc for in6_pton()

It is not easy to use in6_pton() correctly without reading
its definition, so add some doc for it.

Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/utils.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/core/utils.c b/net/core/utils.c
index f5613d569c23..30f3879faecc 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -161,6 +161,18 @@ out:
 }
 EXPORT_SYMBOL(in4_pton);
 
+/**
+ * in6_pton - convert an IPv6 address from literal to binary representation
+ * @src: the start of the IPv6 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[16] array) representation of the IPv6 address
+ * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return one on success, return zero when any error occurs
+ * and @end will point to the end of the parsed string.
+ *
+ */
 int in6_pton(const char *src, int srclen,
 	     u8 *dst,
 	     int delim, const char **end)
-- 
cgit v1.2.3


From 93ac0ef016a1b223d23fbb5e0397cab75a8f7d34 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Thu, 11 Oct 2012 21:06:17 +0000
Subject: net: add doc for in4_pton()

It is not easy to use in4_pton() correctly without reading
its definition, so add some doc for it.

Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Cong Wang <amwang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/utils.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/core/utils.c b/net/core/utils.c
index 30f3879faecc..e3487e461939 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -107,6 +107,18 @@ static inline int xdigit2bin(char c, int delim)
 	return IN6PTON_UNKNOWN;
 }
 
+/**
+ * in4_pton - convert an IPv4 address from literal to binary representation
+ * @src: the start of the IPv4 address string
+ * @srclen: the length of the string, -1 means strlen(src)
+ * @dst: the binary (u8[4] array) representation of the IPv4 address
+ * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter
+ * @end: A pointer to the end of the parsed string will be placed here
+ *
+ * Return one on success, return zero when any error occurs
+ * and @end will point to the end of the parsed string.
+ *
+ */
 int in4_pton(const char *src, int srclen,
 	     u8 *dst,
 	     int delim, const char **end)
-- 
cgit v1.2.3


From 1bbb3095a5912be4b9c90397ef2182a5a328865b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 3 Oct 2012 20:32:17 -0700
Subject: userns: Properly print bluetooth socket uids
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With user namespace support enabled, building bluetooth generated the following warning:
net/bluetooth/af_bluetooth.c: In function ‘bt_seq_show’:
net/bluetooth/af_bluetooth.c:598:7: warning: format ‘%u’ expects argument of type ‘unsigned int’, but argument 7 has type ‘kuid_t’ [-Wformat]

Convert sock_i_uid from a kuid_t to a uid_t before printing, to avoid
this problem.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Cc: Masatake YAMATO <yamato@redhat.com>
Cc: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 net/bluetooth/af_bluetooth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 9d49ee6d7219..ba033f09196e 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -591,7 +591,7 @@ static int bt_seq_show(struct seq_file *seq, void *v)
 			   atomic_read(&sk->sk_refcnt),
 			   sk_rmem_alloc_get(sk),
 			   sk_wmem_alloc_get(sk),
-			   sock_i_uid(sk),
+			   from_kuid(seq_user_ns(seq), sock_i_uid(sk)),
 			   sock_i_ino(sk),
 			   &src_baswapped,
 			   &dst_baswapped,
-- 
cgit v1.2.3


From 55462cf30ad9768fff6a6d36db21879146a39bdf Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@resnulli.us>
Date: Sun, 14 Oct 2012 04:30:56 +0000
Subject: vlan: fix bond/team enslave of vlan challenged slave/port

In vlan_uses_dev(), check the number of vlan devs rather than the existence
of vlan_info. The reason is that vlan id 0 is present by default without a
corresponding vlan dev on it, which prevented enslaving a vlan challenged
dev.

Reported-by: Jon Stanley <jstanley@rmrf.net>
Signed-off-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 2 +-
 net/8021q/vlan_core.c           | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b721902bb6b4..b2530b002125 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1519,7 +1519,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 	/* no need to lock since we're protected by rtnl_lock */
 	if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) {
 		pr_debug("%s: NETIF_F_VLAN_CHALLENGED\n", slave_dev->name);
-		if (bond_vlan_used(bond)) {
+		if (vlan_uses_dev(bond_dev)) {
 			pr_err("%s: Error: cannot enslave VLAN challenged slave %s on VLAN enabled bond %s\n",
 			       bond_dev->name, slave_dev->name, bond_dev->name);
 			return -EPERM;
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index fbbf1fa00940..65e06abe023f 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -366,6 +366,13 @@ EXPORT_SYMBOL(vlan_vids_del_by_dev);
 
 bool vlan_uses_dev(const struct net_device *dev)
 {
-	return rtnl_dereference(dev->vlan_info) ? true : false;
+	struct vlan_info *vlan_info;
+
+	ASSERT_RTNL();
+
+	vlan_info = rtnl_dereference(dev->vlan_info);
+	if (!vlan_info)
+		return false;
+	return vlan_info->grp.nr_vlan_devs ? true : false;
 }
 EXPORT_SYMBOL(vlan_uses_dev);
-- 
cgit v1.2.3


From f6e80abeab928b7c47cc1fbf53df13b4398a2bec Mon Sep 17 00:00:00 2001
From: Zijie Pan <zijie.pan@6wind.com>
Date: Mon, 15 Oct 2012 03:56:39 +0000
Subject: sctp: fix call to SCTP_CMD_PROCESS_SACK in sctp_cmd_interpreter()

Bug introduced by commit edfee0339e681a784ebacec7e8c2dc97dc6d2839
(sctp: check src addr when processing SACK to update transport state)

Signed-off-by: Zijie Pan <zijie.pan@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_sideeffect.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 57f7de839b03..6773d7803627 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1642,8 +1642,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 					asoc->outqueue.outstanding_bytes;
 			sackh.num_gap_ack_blocks = 0;
 			sackh.num_dup_tsns = 0;
+			chunk->subh.sack_hdr = &sackh;
 			sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK,
-					SCTP_SACKH(&sackh));
+					SCTP_CHUNK(chunk));
 			break;
 
 		case SCTP_CMD_DISCARD_PACKET:
-- 
cgit v1.2.3


From 9f0d3c2781baa1102108e16efbe640dd74564a7c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 16 Oct 2012 07:37:27 +0000
Subject: ipv6: addrconf: fix /proc/net/if_inet6

Commit 1d5783030a1 (ipv6/addrconf: speedup /proc/net/if_inet6 filling)
added bugs hiding some devices from if_inet6 and breaking applications.

"ip -6 addr" could still display all IPv6 addresses, while "ifconfig -a"
couldn't.

One way to reproduce the bug is to run the following in a shell:

unshare -n /bin/bash
ifconfig lo up

Afterwards, in the original net namespace, the lo device has disappeared from if_inet6.

Reported-by: Jan Hinnerk Stosch <janhinnerk.stosch@gmail.com>
Tested-by: Jan Hinnerk Stosch <janhinnerk.stosch@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Mihai Maruseac <mihai.maruseac@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d7c56f8a5b4e..0424e4e27414 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3064,14 +3064,15 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
 		struct hlist_node *n;
 		hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket],
 					 addr_lst) {
+			if (!net_eq(dev_net(ifa->idev->dev), net))
+				continue;
 			/* sync with offset */
 			if (p < state->offset) {
 				p++;
 				continue;
 			}
 			state->offset++;
-			if (net_eq(dev_net(ifa->idev->dev), net))
-				return ifa;
+			return ifa;
 		}
 
 		/* prepare for next bucket */
@@ -3089,18 +3090,20 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
 	struct hlist_node *n = &ifa->addr_lst;
 
 	hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst) {
+		if (!net_eq(dev_net(ifa->idev->dev), net))
+			continue;
 		state->offset++;
-		if (net_eq(dev_net(ifa->idev->dev), net))
-			return ifa;
+		return ifa;
 	}
 
 	while (++state->bucket < IN6_ADDR_HSIZE) {
 		state->offset = 0;
 		hlist_for_each_entry_rcu_bh(ifa, n,
 				     &inet6_addr_lst[state->bucket], addr_lst) {
+			if (!net_eq(dev_net(ifa->idev->dev), net))
+				continue;
 			state->offset++;
-			if (net_eq(dev_net(ifa->idev->dev), net))
-				return ifa;
+			return ifa;
 		}
 	}
 
-- 
cgit v1.2.3